In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
import re
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime

Using TensorFlow backend.


In [2]:
# Load the labelled review data and turn each review into a cleaned,
# stemmed, space-joined string (all_data_cleaned), one entry per row of `data`.
#
# Alternative inputs used in earlier runs:
# data_file = "SHOPEE_MAYBELLINE_CLEAN_V2.csv"
# data_file = "Lazada_sentiment.csv"
data_file = "Shopee_AllData_Sentiment_v2.csv"
data = pd.read_csv(data_file)
# Normalise headers so columns can be addressed as e.g. data['Product_Name']
data.columns = data.columns.str.strip().str.replace(" ","_")

reviews = data['Review']

# Tokenise on single spaces. A non-string cell (e.g. NaN for a missing review)
# has no .split(); map it to an empty token list so that the row is kept and
# all_data_cleaned stays aligned with the rows of `data`.
review_docs = [
    each_review.split(" ") if isinstance(each_review, str) else []
    for each_review in reviews
]

# Make sure all words are in lowercase
reviews_lower = [[token.lower() for token in tokens] for tokens in review_docs]

# Use a regular expression to keep only alphabetical words
alpha_only = re.compile('^[a-z]+$')
reviews_alpha = [
    [token for token in tokens if alpha_only.search(token)]
    for tokens in reviews_lower
]

# Remove stop words (set lookup instead of scanning the list for every token)
stop_list = stopwords.words('english')
stop_words = set(stop_list)
reviews_stop = [
    [token for token in tokens if token not in stop_words]
    for tokens in reviews_alpha
]

# Porter Stemming
stemmer = PorterStemmer()
reviews_stem = [[stemmer.stem(token) for token in tokens] for tokens in reviews_stop]

# Re-join the stemmed tokens into one string per review; a review with no
# surviving tokens becomes the empty string.
all_data_cleaned = [" ".join(tokens) for tokens in reviews_stem]

# Rescale polarity labels: -1 -> 0, 0 -> 0.5, 1 -> 1.
# Any other value is skipped, exactly as before; if that ever happens this
# list ends up shorter than all_data_cleaned.
POLARITY_TO_TARGET = {0: 0.5, -1: 0, 1: 1}
polarity_raw = data['Polarity']
polarity_0_and_1 = []
for each_polarity in polarity_raw:
    label = int(each_polarity)
    if label in POLARITY_TO_TARGET:
        polarity_0_and_1.append(POLARITY_TO_TARGET[label])

print (len(all_data_cleaned))


12808


In [4]:
# Train an SVM sentiment classifier on TF-IDF features of the cleaned reviews,
# selecting hyper-parameters by grid search, and report the scores on a
# held-out 25% split. The start/end timestamps show how long the search took.
print (datetime.now())
# The features below come from TfidfVectorizer, so label the run accordingly
# (the label previously read "1. Count Vectorizer").
print ("1. TF-IDF Vectorizer")

reviews = all_data_cleaned
polarity = data['Polarity']
X_train, X_test, y_train, y_test = train_test_split(reviews, polarity, test_size=0.25, random_state=42)

# Fit the vocabulary/IDF weights on the training split only, then apply the
# fitted transform to the test split.
tfidfVectorizer = TfidfVectorizer(use_idf = True, min_df = 4, max_df=0.85)
X_train = tfidfVectorizer.fit_transform(X_train)
X_test = tfidfVectorizer.transform(X_test)

# NOTE(review): per the SVC documentation 'gamma' is not used by the linear
# kernel and 'degree' is only used by 'poly', so this grid refits some
# equivalent models. It is left unchanged so the selected parameters stay
# comparable with earlier runs.
parameters = {'C':[1,2,3], 'gamma':[0.1, 0.01], 'kernel':['linear', 'poly', 'rbf'], 'degree': [1,2]}

svmClf = GridSearchCV(estimator = SVC(), param_grid = parameters)
svmClf.fit(X_train, y_train)
svmClf_ypred = svmClf.predict(X_test)
f1_svmClf = f1_score(y_test, svmClf_ypred, average = 'weighted')
accuracy_svmClf = accuracy_score(y_test, svmClf_ypred)
print ("F1-score of SVM: ", f1_svmClf*100)
print ("Accuracy of SVM: ", accuracy_svmClf*100)
print (svmClf.best_params_)

print (datetime.now())

2020-03-17 20:59:32.677003
1. Count Vectorizer
F1-score of SVM:  77.32012381935355
Accuracy of SVM:  77.82635852592131
{'C': 1, 'degree': 1, 'gamma': 0.1, 'kernel': 'poly'}
2020-03-17 21:08:17.625769


In [20]:
# Disabled smoke test: vectorize three hand-written phrases and print the
# classifier's predictions for them.
# NOTE(review): `countVectorizer` is not defined in the cells visible here; the
# fitted vectorizer above is named `tfidfVectorizer` - confirm before re-enabling.
# tester = ["nice product", "bad","okay"]
# testerX = countVectorizer.transform(tester)

# print (testerX)
# x_pred = svmClf.predict(testerX)
# print (x_pred)

  (0, 510)	1
  (0, 604)	1
  (1, 51)	1
  (2, 531)	1
[ 1 -1  0]


In [15]:
# Predict a polarity for every row's 'Review_splitted' text with the fitted
# vectorizer + SVM, count agreements with the labelled polarity, and export the
# source CSV with an extra 'Predicted_Polarity' column.
print (datetime.now())

# Plain-list snapshots of the columns (kept as cell-level names).
platform_list = data['Platform'].tolist()
brand_list = data['Brand'].tolist()
category_list = data['Category'].tolist()
product_name_list = data['Product_Name'].tolist()
price_list = data['Price'].tolist()
reviewer_list = data['Reviewer'].tolist()
review_list = data['Review'].tolist()
review_splitted_list = data['Review_splitted'].tolist()
product_purchase_list = data['Product_Purchase'].tolist()
rating_list = data['Ratings'].tolist()
date_review_list = data['Date_Of_Review'].tolist()
response_list = data['Response'].tolist()
topic_list = data['topic'].tolist()
polarity_list = data['Polarity'].tolist()

reviews = all_data_cleaned

# NOTE(review): 'Review_splitted' is vectorized as-is, whereas the vectorizer
# was fitted on the cleaned/stemmed text in all_data_cleaned - confirm this
# column has already been preprocessed the same way.
#
# Vectorize and predict in a single batch: each row is transformed and
# classified independently, so the result matches the former row-by-row loop
# without paying the per-call overhead once per review.
if review_splitted_list:
    predicted_polarities = svmClf.predict(tfidfVectorizer.transform(review_splitted_list))
else:
    # Nothing to classify; skip the estimator call on an empty batch.
    predicted_polarities = []
predicted_polarity_list = list(predicted_polarities)

# Number of rows where the prediction equals the labelled polarity. The
# comparison is done on scalars; the former int() on a 1-element array is a
# deprecated conversion in recent NumPy.
count = sum(
    int(predicted) == actual
    for predicted, actual in zip(predicted_polarity_list, polarity_list)
)

# Write the export from a freshly read copy so the file keeps the source's
# original headers, but do NOT rebind `data` to that copy: `data` carries the
# normalised headers this notebook relies on (e.g. data['Product_Name']), and
# replacing it could make this cell fail when it is run a second time.
data_file = "Shopee_AllData_Sentiment_v2.csv"
labelled_data = pd.read_csv(data_file)
labelled_data['Predicted_Polarity'] = predicted_polarity_list
labelled_data.to_csv('Shopee_AllData_Sentiment_labelled.csv')

# Keep the predictions available on the in-memory frame as well.
data = data.assign(Predicted_Polarity=predicted_polarity_list)

print (datetime.now())

2020-03-17 21:22:29.681469
2020-03-17 21:22:44.989682
