# Sentiment Analysis of Amazon Reviews using Discriminative Models Trained on Synthetic Data


<h2>1. Preprocessing Data</h2>

Import Packages

In [None]:
pip install scikit-plot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer,ToktokTokenizer
import re
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,recall_score,precision_score
from sklearn.naive_bayes import MultinomialNB
from string import punctuation
from nltk.stem import WordNetLemmatizer
import os
pd.set_option('display.float_format', lambda x: '%.3f' % x)

from sklearn.model_selection import train_test_split
from sklearn import metrics

import re
import string

from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import SimpleRNN, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import random
import os

<h4>Read the data</h4>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset_path = "/content/drive/MyDrive/NLP Final Project/Womens Clothing E-Commerce Reviews.csv"

In [None]:
data = pd.read_csv(dataset_path)

In [None]:
data = data[['Review Text','Recommended IND']]

<h4>Renaming Columns</h4>

In [None]:
data.rename(columns={'Review Text':'review_text','Recommended IND':'recommended'},inplace=True)

<h4>Checking and Handling Missing Values</h4>

In [None]:
data.isna().sum()

review_text    845
recommended      0
dtype: int64

<div style="text-align:justify">From 2 variable above, review_text have so many missing values. Removing those missing values can lead to lack of information from the data, so instead of remove the data, let's fill missing values with blank space.</div>

In [None]:
data['review_text'] = data['review_text'].fillna(' ')

In [None]:
data.isna().sum()

review_text    0
recommended    0
dtype: int64

In [None]:
data.dtypes

review_text    object
recommended     int64
dtype: object

<h2>Exploratory Data Analysis (EDA)</h2>

<h4>Renaming Target Variable Values</h4>
Instead of using 0 and 1 as values of target variable, we can use more appropiate values like "Not Recommended" and "Recommended".

In [None]:
data.loc[data["recommended"] == 0, "recommended"] = "Not Recommended" # 0 -> Not Recommended
data.loc[data["recommended"] == 1, "recommended"] = "Recommended" # 1 -> Recommended

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<h2>Text Mining</h2>

<div style="text-align:justify">In this text mining process we will exploring and analyzing unstructured text data</div>  

In [None]:
tokenizer=ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

In [None]:
main_text = data['review_text']
target = data['recommended']

<h4>Expanding Contraction</h4>

In [None]:
contractions_dict = {     
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I had",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "iit will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there had",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they had",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [None]:
def expand_contractions(text, contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
            if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
def cons(text):
    text=expand_contractions(text,contractions_dict)
    return text

main_text = main_text.apply(cons)

<h4>To lowercase</h4><br>
Change all uppercase character to be lowercase character. For example "Pretty" to be "pretty" or "BEAUTY" to be "beauty"

In [None]:
#Tolowercase
def to_lower(text):
    return ' '.join([w.lower() for w in word_tokenize(text)])

main_text = main_text.apply(to_lower)

<h4>Remove Special Character and Punctuation</h4><br>
Removing all special character like .?/@# etc

In [None]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
def strip_punctuation(s):
    return ''.join(c for c in s if c not in punctuation)

main_text = main_text.apply(remove_special_characters)
main_text = main_text.apply(strip_punctuation)

<h4>Replace Elongated Words</h4><br>
Replace all elongated words with appropriate words. For example "soooooo" to be "so" or "looooong" to be "long"


In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.corpus import wordnet

def replaceElongated(word):
    repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    repl = r'\1\2\3'
    if wordnet.synsets(word):
        return word
    repl_word = repeat_regexp.sub(repl, word)
    if repl_word != word:      
        return replaceElongated(repl_word)
    else:       
        return repl_word
main_text = main_text.apply(replaceElongated)

<h4>Tokenization</h4><br>
Tokenization is splitting sentences into smaller unit, such as terms or word. 

In [None]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

main_text = main_text.apply(lambda x: tokenizer.tokenize(x))


<h4>Removing Stopwords</h4><br>
Remove stopwords like "is, the, with, etc" since they don't have usefull information

In [None]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopword_list]
    return words

main_text = main_text.apply(lambda x : remove_stopwords(x))

<h4>Stemming</h4><br>
Stemming is the process of reducing a word to its word stem. For example "Consulting" to be "consult"

In [None]:
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')

def stem_update(text_list):
    text_list_new = []
    for word in text_list:
        word = snowball_stemmer.stem(word)
        text_list_new.append(word)
    return text_list_new
main_text = main_text.apply(stem_update)


<h4>Drop Numbers</h4><br>
Remove numbers from text, since numbers doesn't give much importance to get the main words.

In [None]:
def drop_numbers(list_text):
    list_text_new = []
    for i in list_text:
        if not re.search('\d', i):
            list_text_new.append(i)
    return ' '.join(list_text_new)
main_text = main_text.apply(drop_numbers)

In [None]:
df = pd.concat([main_text,target],axis=1)

In [None]:
df

Unnamed: 0,review_text,recommended
0,absolut wonder silki sexi comfort,Recommended
1,love dres preti hapen find store glad bc never...,Recommended
2,high hope dres reali want work initiali order ...,Not Recommended
3,love love love jumpsuit fun flirti fabul everi...,Recommended
4,shirt flater al due adjust front tie perfect l...,Recommended
...,...,...
23481,hapi snag dres great price easi slip flater cu...,Recommended
23482,remind matern cloth soft stretchi shini materi...,Recommended
23483,fit wel top se never would work glad abl tri s...,Not Recommended
23484,bought dres wede sumer cute unfortun fit perfe...,Recommended


<h2>Modelling using Multinomial Naive Bayes</h2>

<h4>Split the data and count the vectorize in each words</h4>

<div style="text-align:justify">The next step is to create a numerical feature vector for each document and split them into train data and test data.</div>

In [None]:
cv=CountVectorizer()

train_data,test_data = train_test_split(df,train_size=0.8,random_state=0)

X_train = cv.fit_transform(train_data['review_text'])
y_train = train_data['recommended']
X_test = cv.transform(test_data['review_text'])
y_test = test_data['recommended']

In [None]:
X_train.shape

(18788, 10578)

In [None]:
X_test.shape

(4698, 10578)

In [None]:
y_train = np.where(y_train == "Recommended", 1, 0)

In [None]:
y_test = np.where(y_test == "Recommended", 1, 0)

<h4>Length of Generated Reviews</h4>
<div style="text-align:justify">Normal distribution using mean and standard deviation from real data</div>

In [None]:
print(np.mean(np.sum(X_train, axis=1)))
print(np.mean(np.sum(X_test, axis=1)))
print(np.std(np.sum(X_train, axis=1)))
print(np.std(np.sum(X_test, axis=1)))

27.198584202682564
27.002979991485738
13.760563291686998
13.678715978231923


In [None]:
np.count_nonzero(np.sum(X_train, axis=1))

18111

<h4>Multinomial Naive Bayes Modelling</h4>
<div style="text-align:justify">Naive Bayes is one of algorithms method based on applying Bayes theorem.Bayes theorem calculates probability P(c|x) where c is the class of the possible outcomes and x is the given instance which has to be classified. Herre below the formula of naive bayes:</div>

![Capture.JPG](attachment:Capture.JPG)

Where:<br>
`P(A|B)` : measure of how often A and B are observed to occure together (posterior probability) <br>
`P(B|A)` : measures of how often B occur in A (likelihood)<br>
`P(A)` : measure of how often A is observed to occur in general (prior probability) <br>
`P(B)` : measure of how often B is observed to occur in general (marginal likelihood)<br>

In [None]:
nb = MultinomialNB()
nb.fit(X_train,y_train)

MultinomialNB()

In [None]:
nb_predict=nb.predict(X_test)



---



Generate synthethic data from NB model

In [None]:
nb.feature_log_prob_.shape

(2, 10578)

In [None]:
nb.feature_log_prob_[0].shape

(10578,)

In [None]:
from scipy.stats import multinomial

Generate training data

In [None]:
np.random.seed(0)
lengths = np.random.normal(loc=27, scale=14, size=18800//2)
train_positive_lengths = np.abs((np.rint(lengths)).astype(int))

In [None]:
np.random.seed(2)
lengths = np.random.normal(loc=27, scale=14, size=18800//2)
train_negative_lengths = np.abs((np.rint(lengths)).astype(int))

In [None]:
X_train_positive_synthetic = [multinomial.rvs(length, np.exp(nb.feature_log_prob_[0])) for length in train_positive_lengths]
X_train_negative_synthetic = [multinomial.rvs(length, np.exp(nb.feature_log_prob_[1])) for length in train_negative_lengths]

In [None]:
X_train_positive_synthetic = np.stack(X_train_positive_synthetic)
X_train_negative_synthetic = np.stack(X_train_negative_synthetic)

In [None]:
X_train_synthetic = np.concatenate((X_train_positive_synthetic, X_train_negative_synthetic))
y_train_synthetic = np.concatenate((np.ones(18800//2), np.zeros(18800//2)))

In [None]:
print(X_train_synthetic.shape)
print(y_train_synthetic.shape)

(18800, 10578)
(18800,)


Generate test data

In [None]:
np.random.seed(1)
lengths = np.random.normal(loc=27, scale=14, size=4700//2)
test_positive_lengths = np.abs((np.rint(lengths)).astype(int))

In [None]:
np.random.seed(3)
lengths = np.random.normal(loc=27, scale=14, size=4700//2)
test_negative_lengths = np.abs((np.rint(lengths)).astype(int))

In [None]:
X_test_positive_synthetic = [multinomial.rvs(length, np.exp(nb.feature_log_prob_[0])) for length in test_positive_lengths]
X_test_negative_synthetic = [multinomial.rvs(length, np.exp(nb.feature_log_prob_[1])) for length in test_negative_lengths]

In [None]:
X_test_positive_synthetic = np.stack(X_test_positive_synthetic)
X_test_negative_synthetic = np.stack(X_test_negative_synthetic)

In [None]:
X_test_synthetic = np.concatenate((X_test_positive_synthetic, X_test_negative_synthetic))
y_test_synthetic = np.concatenate((np.ones(4700//2), np.zeros(4700//2)))

In [None]:
print(X_test_synthetic.shape)
print(y_test_synthetic.shape)

(4700, 10578)
(4700,)


Transform synthetic data to text

In [None]:
X_train_synthethic_text = cv.inverse_transform(X_train_synthetic)
X_test_synthethic_text = cv.inverse_transform(X_test_synthetic)

In [None]:
print(X_train_synthethic_text[0])
print(X_test_synthethic_text[-1])

['botom' 'boxi' 'brand' 'colect' 'color' 'either' 'fan' 'fit' 'hang'
 'heavier' 'like' 'litl' 'love' 'lower' 'make' 'materi' 'might' 'model'
 'motl' 'neck' 'nice' 'normal' 'noth' 'noyt' 'onlin' 'order' 'otk' 'pant'
 'petiti' 'plane' 'price' 'qualiti' 'reali' 'right' 'se' 'serious' 'shirt'
 'side' 'smal' 'super' 'time' 'transpa' 'volumn' 'wast' 'way' 'wear'
 'winterth' 'would' 'xl']
['ade' 'chest' 'god' 'golden' 'lege' 'like' 'love' 'medium' 'open' 'order'
 'precious' 'product' 'reali' 'shape' 'sher' 'stil' 'yelow']




---



**Preparing Synthetic Data For Training on Discriminative Models**

In [None]:
X_Train_Syn = []
for i in X_train_synthethic_text:
  X_Train_Syn.append(' '.join(i))

X_Test_Syn = []
for i in X_test_synthethic_text:
  X_Test_Syn.append(' '.join(i))

X_train_synthetic_text = pd.DataFrame(X_Train_Syn)
X_test_synthetic_text = pd.DataFrame(X_Test_Syn)



In [None]:
y_train_synthetic_results = pd.DataFrame(y_train_synthetic)
y_test_synthetic_results = pd.DataFrame(y_test_synthetic)

In [None]:
# Run this cell to set seeds
np.random.seed(684)
tf.random.set_seed(684)
random.seed(684)
os.environ['PYTHONHASHSEED']=str(684)

In [None]:
# Setting up the evaluation metrics
def roc_auc(predictions,target):
    
    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [None]:
# Rename Columns
X_train_synthetic_text= X_train_synthetic_text.rename(columns={0: "Review Text"})
X_test_synthetic_text= X_test_synthetic_text.rename(columns={0: "Review Text"})

Unnamed: 0,Review Text
0,botom boxi brand colect color either fan fit h...
1,athletichourgla back blend color dres enough e...
2,arm back best bodi bradshaw cloth cute debat d...
3,also asum bag blous boxi compfi complet could ...
4,away awkward back beauti bete bordeaux cheapn ...
...,...
18795,around avail bagylos bit even fit gather god g...
18796,amaz arm beauti bit cami cut cute done flare f...
18797,btw daughter definit fite form got holiday ned...
18798,around backshouldersup cut flower go know ligh...


**Tokenization**

In [None]:
num_words = None   

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train_synthetic_text['Review Text'].tolist() + X_test_synthetic_text['Review Text'].tolist())  

word_index = tokenizer.word_index

X_train_seq = tokenizer.texts_to_sequences(X_train_synthetic_text['Review Text'].tolist())
X_test_seq = tokenizer.texts_to_sequences(X_test_synthetic_text['Review Text'].tolist())

max_len = max([len(x) for x in X_train_seq])

X_train_pad_syn = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad_syn = pad_sequences(X_test_seq, maxlen=max_len)

# Output shapes
print("\nPadded training shape:", X_train_pad_syn.shape)
print("\nPadded test shape:", X_test_pad_syn.shape)


Padded training shape: (18800, 75)

Padded test shape: (4700, 75)


**RNN METHOD ON SYNTHETIC**

In [None]:
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                    50,     
                    input_length=max_len))
model.add(SimpleRNN(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 75, 50)            526200    
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               15100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 541,401
Trainable params: 541,401
Non-trainable params: 0
_________________________________________________________________


In [None]:
batch_size = 512
model.fit(X_train_pad_syn, y_train_synthetic, epochs=5, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7faf91179e80>

In [None]:
scores = model.predict(X_test_pad_syn)
print("AUC: %.2f%%" % (roc_auc(scores,y_test_synthetic_results)))

AUC: 0.94%


**LSTM METHOD WITH SYTHETIC**

In [None]:
model2 = Sequential()
model2.add(Embedding(len(word_index) + 1,
                    50,     # embeds it in a 50-dimensional vector
                    input_length=max_len))

model2.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 75, 50)            526200    
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 586,701
Trainable params: 586,701
Non-trainable params: 0
_________________________________________________________________


In [None]:
batch_size = 512

model2.fit(X_train_pad_syn, y_train_synthetic, epochs=5, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7faf90772040>

In [None]:
scores = model2.predict(X_test_pad_syn)
print("AUC: %.2f%%" % (roc_auc(scores,y_test_synthetic_results)))

AUC: 0.96%
