### Import Packages and Libraries

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import re
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


### Import Data File and Cleaning

In [2]:
# data_file = "SHOPEE_MAYBELLINE_CLEAN_V2.csv"
data_file = "Lazada_sentiment.csv"
data = pd.read_csv(data_file)
data.columns = data.columns.str.strip().str.replace(" ","_")
# data.info()
# data.head()

# data.drop(columns=['Brand','Category','Product_Name','Price','Reviewer','Product_Purchase','Ratings','Date_Of_Review','Response', 'Topic'])
# review_list = data['Review'].tolist()
# polarity_list = data['Polarity'].tolist()

reviews = data['Review']
# polarity = data['Polarity']
# print (reviews)

review_docs = []
for each_reviews in reviews:
    temp = each_reviews.split(" ")
    review_docs.append(temp)
# print (review_docs)

# Make sure all words are in lowercase
reviews_lower = [[each_word.lower() for each_word in each_review] for each_review in review_docs]
# print (reviews_lower)

# Use regular expressions to keep only allphabetical words
reviews_alpha = [[each_word for each_word in each_review if re.search('^[a-z]+$', each_word)] for each_review in reviews_lower]
# print (reviews_alpha)

# Remove stop words
stop_list = stopwords.words('english')
reviews_stop = [[each_word for each_word in each_review if each_word not in stop_list] for each_review in reviews_alpha]
# print (reviews_stop)

# Porter Stemming
stemmer = PorterStemmer()
reviews_stem = [[stemmer.stem(each_word) for each_word in each_review] for each_review in reviews_stop]
# print (reviews_stem)

all_data_cleaned = []
for each_sentence in reviews_stem:
    sentence = ""
    for each_word in each_sentence:
        sentence += each_word + " "
    sentence = sentence[0:-1]
    all_data_cleaned.append(sentence)
# print (all_data_cleaned)

polarity_raw = data['Polarity']
polarity_0_and_1 = []
for each_polarity in polarity_raw:
    if int(each_polarity) == int("0"):
        polarity_0_and_1.append(0.5)
    if int(each_polarity) == int("-1"):
        polarity_0_and_1.append(int(0))
    if int(each_polarity) == int("1"):
        polarity_0_and_1.append(int(1))
# print (polarity)


### Building a Model - Multinomial Naive Bayes 
1. Count Vectorizer
2. TFIDF Vectorizer

In [4]:
print ("1. Count Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = False, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

mnbClf = MultinomialNB()
mnbClf.fit(X_train, y_train)
mnbClf_ypred = mnbClf.predict(X_test)
f1_mnbClf = f1_score(y_test, mnbClf_ypred, average = 'weighted')
accuracy_mnbClf = accuracy_score(y_test, mnbClf_ypred)
print ("F1-score of Multinomial Naive Bayes: ", f1_mnbClf*100)
print ("Accuracy of Multinomial Naive Bayes: ", accuracy_mnbClf*100)

print ("\n2. TFIDF Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

mnbTfidfClf = MultinomialNB()
mnbTfidfClf.fit(X_train, y_train)
mnbTfidfClf_ypred = mnbTfidfClf.predict(X_test)
f1_mnbTfidfClf = f1_score(y_test, mnbTfidfClf_ypred, average='weighted')
accuracy_mnbTfidfClf = accuracy_score(y_test, mnbTfidfClf_ypred)
print ("F1-score of Multinomial Naive Bayes with TFIDF: ", f1_mnbTfidfClf*100)
print ("Accuracy of Multinomial Naive Bayes with TFIDF: ", accuracy_mnbTfidfClf*100)




1. Count Vectorizer
F1-score of Multinomial Naive Bayes:  70.62505296771751
Accuracy of Multinomial Naive Bayes:  72.85245901639344

2. TFIDF Vectorizer
F1-score of Multinomial Naive Bayes with TFIDF:  73.45205689395802
Accuracy of Multinomial Naive Bayes with TFIDF:  74.88524590163934


### Building a Model - Bernoulli Naive Bayes
1. Count Vectorizer
2. TFIDF Vectorizer

In [5]:
print ("1. Count Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = False, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

bnbClf = BernoulliNB()
bnbClf.fit(X_train, y_train)
bnbClf_ypred = bnbClf.predict(X_test)
f1_bnbClf = f1_score(y_test, bnbClf_ypred, average = 'weighted')
accuracy_bnbClf = accuracy_score(y_test, bnbClf_ypred)
print ("F1-score of Bernoulli Naive Bayes: ", f1_bnbClf*100)
print ("Accuracy of Bernoulli Naive Bayes: ", accuracy_bnbClf*100)


print ("\n2. TFIDF Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

bnbTfidfClf = BernoulliNB()
bnbTfidfClf.fit(X_train, y_train)
bnbTfidfClf_ypred = bnbTfidfClf.predict(X_test)
f1_bnbTfidfClf = f1_score(y_test, bnbTfidfClf_ypred, average='weighted')
accuracy_bnbTfidfClf = accuracy_score(y_test, bnbTfidfClf_ypred)
print ("F1-score of Bernoulli Naive Bayes with TFIDF: ", f1_bnbTfidfClf*100)
print ("Accuracy of Bernoulli Naive Bayes with TFIDF: ", accuracy_bnbTfidfClf*100)

1. Count Vectorizer
F1-score of Bernoulli Naive Bayes:  78.1315113690941
Accuracy of Bernoulli Naive Bayes:  78.68852459016394

2. TFIDF Vectorizer
F1-score of Bernoulli Naive Bayes with TFIDF:  78.1315113690941
Accuracy of Bernoulli Naive Bayes with TFIDF:  78.68852459016394


### Build a Model - Logistic Regression
1. Count Vectorizer
2. TFIDF Vectorizer

In [6]:
print ("1. Count Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = False, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

logRegClf = LogisticRegression()
logRegClf.fit(X_train, y_train)
logRegClf_ypred = logRegClf.predict(X_test)
f1_logRegClf = f1_score(y_test, logRegClf_ypred, average = 'weighted')
accuracy_logRegClf = accuracy_score(y_test, logRegClf_ypred)
print ("F1-score of Logistic Regression: ", f1_logRegClf*100)
print ("Accuracy of Logistic Regression: ", accuracy_logRegClf*100)


print ("\n2. TFIDF Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

logRegTfidfClf = LogisticRegression()
logRegTfidfClf.fit(X_train, y_train)
logRegTfidfClf_ypred = logRegTfidfClf.predict(X_test)
f1_logRegTfidfClf = f1_score(y_test, logRegTfidfClf_ypred, average='weighted')
accuracy_logRegTfidfClf = accuracy_score(y_test, logRegTfidfClf_ypred)
print ("F1-score of Logistic Regression with TFIDF: ", f1_logRegTfidfClf*100)
print ("Accuracy of Logistic Regression with TFIDF: ", accuracy_logRegTfidfClf*100)


1. Count Vectorizer
F1-score of Logistic Regression:  79.84358844245423
Accuracy of Logistic Regression:  81.04918032786885

2. TFIDF Vectorizer
F1-score of Logistic Regression with TFIDF:  81.81270330259666
Accuracy of Logistic Regression with TFIDF:  82.68852459016394


### Build a Model - Support Vector Machine
1. Count Vectorizer
2. TFIDF Vectorizer
3. Count Vectorizer with Tuning
4. TFIDF Vectorizer with Tuning

In [7]:
print ("1. Count Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = False, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

svmClf = SVC()
svmClf.fit(X_train, y_train)
svmClf_ypred = svmClf.predict(X_test)
f1_svmClf = f1_score(y_test, svmClf_ypred, average = 'weighted')
accuracy_svmClf = accuracy_score(y_test, svmClf_ypred)
print ("F1-score of SVM: ", f1_svmClf*100)
print ("Accuracy of SVM: ", accuracy_svmClf*100)


print ("\n2. TFIDF Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

svmTfidfClf = LogisticRegression()
svmTfidfClf.fit(X_train, y_train)
svmTfidfClf_ypred = svmTfidfClf.predict(X_test)
f1_svmTfidfClf = f1_score(y_test, svmTfidfClf_ypred, average='weighted')
accuracy_svmTfidfClf = accuracy_score(y_test, svmTfidfClf_ypred)
print ("F1-score of SVM with TFIDF: ", f1_svmTfidfClf*100)
print ("Accuracy of SVM with TFIDF: ", accuracy_svmTfidfClf*100)

print ("\n3. Count Vectorizer with Tuning")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = False, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

# parameters = {'C':[1,2,3,4,5,6,7,8,14], 'gamma':[0.1, 0.01, 0.001, 0.0001], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2,3,4,5]}
parameters = {'C':[1,2,3], 'gamma':[0.1, 0.01], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2]}

svmClfTuned = GridSearchCV(estimator=SVC(), param_grid=parameters)
svmClfTuned.fit(X_train, y_train)
svmClfTuned_ypred = svmClfTuned.predict(X_test)
f1_svmClfTuned = f1_score(y_test, svmClfTuned_ypred, average = 'weighted')
accuracy_svmClfTuned = accuracy_score(y_test, svmClfTuned_ypred)
print ("F1-score of SVM with Tuning: ", f1_svmClfTuned*100)
print ("Accuracy of SVM with Tuning: ", accuracy_svmClfTuned*100)
print(svmClfTuned.best_params_)


print ("\n4. TFIDF Vectorizer with Tuning")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

# parameters = {'C':[1,2,3,4,5,6,7,8,14], 'gamma':[0.1, 0.01, 0.001, 0.0001], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2,3,4,5]}
parameters = {'C':[1,2,3], 'gamma':[0.1, 0.01], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2]}

svmTfidfClfTuned = GridSearchCV(estimator=SVC(), param_grid=parameters)
svmTfidfClfTuned.fit(X_train, y_train)
svmTfidfClfTuned_ypred = svmTfidfClfTuned.predict(X_test)
f1_svmTfidfClfTuned = f1_score(y_test, svmTfidfClfTuned_ypred, average='weighted')
accuracy_svmTfidfClfTuned = accuracy_score(y_test, svmTfidfClfTuned_ypred)
print ("F1-score of SVM with TFIDF with Tuning: ", f1_svmTfidfClfTuned*100)
print ("Accuracy of SVM with TFIDF with Tuning: ", accuracy_svmTfidfClfTuned*100)
print (svmTfidfClfTuned.best_params_)



1. Count Vectorizer
F1-score of SVM:  32.86082463112957
Accuracy of SVM:  49.57377049180327

2. TFIDF Vectorizer
F1-score of SVM with TFIDF:  81.81270330259666
Accuracy of SVM with TFIDF:  82.68852459016394

3. Count Vectorizer with Tuning
F1-score of SVM with Tuning:  86.38192827841739
Accuracy of SVM with Tuning:  86.49180327868852
{'C': 3, 'degree': 1, 'gamma': 0.1, 'kernel': 'linear'}

4. TFIDF Vectorizer with Tuning
F1-score of SVM with TFIDF with Tuning:  86.52612669729935
Accuracy of SVM with TFIDF with Tuning:  86.62295081967213
{'C': 3, 'degree': 1, 'gamma': 0.1, 'kernel': 'linear'}


### Summary of Accuracy

In [8]:
print ("F1-score of Multinomial Naive Bayes: ", f1_mnbClf*100)
print ("Accuracy of Multinomial Naive Bayes: ", accuracy_mnbClf*100)
print ("F1-score of Multinomial Naive Bayes with TFIDF: ", f1_mnbTfidfClf*100)
print ("Accuracy of Multinomial Naive Bayes with TFIDF: ", accuracy_mnbTfidfClf*100)
print ("\n")
print ("F1-score of Bernoulli Naive Bayes: ", f1_bnbClf*100)
print ("Accuracy of Bernoulli Naive Bayes: ", accuracy_bnbClf*100)
print ("F1-score of Bernoulli Naive Bayes with TFIDF: ", f1_bnbTfidfClf*100)
print ("Accuracy of Bernoulli Naive Bayes with TFIDF: ", accuracy_bnbTfidfClf*100)
print ("\n")
print ("F1-score of Logistic Regression: ", f1_logRegClf*100)
print ("Accuracy of Logistic Regression: ", accuracy_logRegClf*100)
print ("F1-score of Logistic Regression with TFIDF: ", f1_logRegTfidfClf*100)
print ("Accuracy of Logistic Regression with TFIDF: ", accuracy_logRegTfidfClf*100)
print ("\n")
print ("F1-score of SVM: ", f1_svmClf*100)
print ("Accuracy of SVM: ", accuracy_svmClf*100)
print ("F1-score of SVM with TFIDF: ", f1_svmTfidfClf*100)
print ("Accuracy of SVM with TFIDF: ", accuracy_svmTfidfClf*100)
print ("\n")
print ("F1-score of SVM with Tuning: ", f1_svmClfTuned*100)
print ("Accuracy of SVM with Tuning: ", accuracy_svmClfTuned*100)
print(svmClfTuned.best_params_)
print ("\n")
print ("F1-score of SVM with TFIDF with Tuning: ", f1_svmTfidfClfTuned*100)
print ("Accuracy of SVM with TFIDF with Tuning: ", accuracy_svmTfidfClfTuned*100)
print (svmTfidfClfTuned.best_params_)


F1-score of Multinomial Naive Bayes:  70.62505296771751
Accuracy of Multinomial Naive Bayes:  72.85245901639344
F1-score of Multinomial Naive Bayes with TFIDF:  73.45205689395802
Accuracy of Multinomial Naive Bayes with TFIDF:  74.88524590163934


F1-score of Bernoulli Naive Bayes:  78.1315113690941
Accuracy of Bernoulli Naive Bayes:  78.68852459016394
F1-score of Bernoulli Naive Bayes with TFIDF:  78.1315113690941
Accuracy of Bernoulli Naive Bayes with TFIDF:  78.68852459016394


F1-score of Logistic Regression:  79.84358844245423
Accuracy of Logistic Regression:  81.04918032786885
F1-score of Logistic Regression with TFIDF:  81.81270330259666
Accuracy of Logistic Regression with TFIDF:  82.68852459016394


F1-score of SVM:  32.86082463112957
Accuracy of SVM:  49.57377049180327
F1-score of SVM with TFIDF:  81.81270330259666
Accuracy of SVM with TFIDF:  82.68852459016394


F1-score of SVM with Tuning:  86.38192827841739
Accuracy of SVM with Tuning:  86.49180327868852
{'C': 3, 'degree':

# ERROR :(
### Build a Model - ANN
1. Count Vectorizer
2. TFIDF Vectorizer

In [180]:
def createModel():
    model = tf.keras.Sequential()
    model.add(layers.Dense(16, input_dim = X_train.shape[1], kernel_initializer='normal', activation='tanh'))
    model.add(layers.Dense(8, activation='tanh'))
    model.add(layers.Dense(4, activation='softmax'))
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.optimizers.Adam(), metrics=['accuracy'])
    return (model)

num_epochs = 10
batch_size = 10

print ("1. Count Vectorizer")

reviews = all_data_cleaned
polarity = polarity_0_and_1
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

tfidfVectorizer = TfidfVectorizer(use_idf = False, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

model = createModel()
annClf = model.fit(X_train, y_train, epochs=num_epochs, 
                   validation_data=(X_test, y_test), 
                   batch_size = batch_size)

annClf_ypred = model.predict(X_test)
annClf_ypred = np.argmax(annClf_ypred)
annClf_ytest = np.argmax(y_test)
# f1_annClf = f1_score(y_test, annClf_ypred, average='weighted')
# accuracy_annClf = f1_score(annClf_ytest, annClf_ypred)
# print ("F1-score of ANN: ", f1_annClf*100)
# print ("Accuracy of ANN: ", accuracy_annClf*100)



1. Count Vectorizer
Train on 349 samples, validate on 117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### CNN with LSTM
1. https://medium.com/@mrunal68/text-sentiments-classification-with-cnn-and-lstm-f92652bc29fd
2. https://medium.com/datadriveninvestor/deep-learning-techniques-for-text-classification-9392ca9492c7

In [None]:
# reviews = all_data_cleaned
# polarity = data['Polarity']
# X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

# tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
# X_train = tfidfVectorizer.fit_transform(X_train).toarray()
# X_test = tfidfVectorizer.transform(X_test).toarray()

# # DNN
# def buildDNNModel(shape, nClasses, dropout=0.5)
#     model = tf.keras.Sequential()
#     node = 512
#     nLayers = 4
    
#     model.add(Dense(node, input_dim=shapte, activation='relu'))
#     model.add(Dropout(dropout))
#     for i in range(0, nLayers):
#         model.add(Dense(node, input_dim=node, activation='relu'))
#         model.add(Dropout(dropout))
#     model.add(Dense(nClasses,activation='softmax'))
    
#     model.compile(loss='sparse_categorical_crossentropy',
#                  optimizer=tf.optimizers.Adam(),
#                  metrics=['accuracy'])
#     return (model)

# model_DNN = buildDNNModel(X_train.shape[1],3)
# model_DNN.fit(X_train, y_train,
#              validation_data = (X_test, y_test),
#              epochs=10,
#              batch_size=128,
#              verbose=2)
# predicted = model_DNN.predict(X_test)
# print (metrics.classification_report(y_test, predicted))
