In [20]:
import pandas as pd
import numpy as np
import sklearn
import math
import re
from pathlib import Path
from sklearn.metrics import accuracy_score

In [2]:
# Import data
df_train = pd.read_csv('train.csv')
df_train

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1
...,...,...,...,...,...
17489,25535,EverQuest II,2012.0,Arguably the single greatest mmorp that exists...,1
17490,25536,EverQuest II,2017.0,"An older game, to be sure, but has its own cha...",1
17491,25537,EverQuest II,2011.0,When I frist started playing Everquest 2 it wa...,1
17492,25538,EverQuest II,,cool game. THe only thing that REALLY PISSES M...,1


In [3]:
df_train = pd.read_csv('train_data.csv')
df_train = df_train[['user_review','user_suggestion']]
df_train

Unnamed: 0,user_review,user_suggestion
0,"Heroes & Generals had potential, but had to me...",0
1,Early Access ReviewHell yeah just came back fr...,1
2,I've played this game for 5 years on their int...,1
3,Do NOT buy anything from them. I stupidly boug...,0
4,Early Access ReviewThis is the future of MMOs....,1
...,...,...
15739,"Early Access ReviewQuite a fun game, the block...",0
15740,Early Access Reviewtoo many people jumped on t...,1
15741,Lets see...â™¦ Half the players play like bots...,0
15742,Early Access ReviewMy most favoured (hated) ga...,1


In [4]:
#change the label
df_train = df_train.rename(columns={'user_suggestion':'label'})

In [5]:
df_train.head()

Unnamed: 0,user_review,label
0,"Heroes & Generals had potential, but had to me...",0
1,Early Access ReviewHell yeah just came back fr...,1
2,I've played this game for 5 years on their int...,1
3,Do NOT buy anything from them. I stupidly boug...,0
4,Early Access ReviewThis is the future of MMOs....,1


In [6]:
df_train['label'].unique()

array([0, 1], dtype=int64)

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15744 entries, 0 to 15743
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_review  15744 non-null  object
 1   label        15744 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 246.1+ KB


In [8]:
# tokenize word from game review
import re

def process_text(content):
    sentence = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','', content['user_review'])
    sentence = re.sub('@[^\s]+','', sentence)
    sentence = sentence.lower().split()
    reformed_sentence = [word for word in sentence]
    reformed_sentence = " ".join(reformed_sentence) 
    sentence = re.sub('&[^\s]+;', '', reformed_sentence)
    sentence = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', sentence)
    sentence = re.sub(' +',' ', sentence)
    #text = re.sub(' [\w] ', ' ', text)
    return sentence.strip()

df_train['user_review'] = df_train.apply(process_text, axis=1)
df_train.head()

Unnamed: 0,user_review,label
0,heroes generals had potential but had to mess ...,0
1,early access reviewhell yeah just came back fr...,1
2,i ve played this game for 5 years on their int...,1
3,do not buy anything from them i stupidly bough...,0
4,early access reviewthis is the future of mmos ...,1


In [9]:
def process_text1(content):
    sentence = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','', content)
    sentence = re.sub('@[^\s]+','', sentence)
    sentence = sentence.lower().split()
    reformed_sentence = [word for word in sentence]
    reformed_sentence = " ".join(reformed_sentence) 
    sentence = re.sub('&[^\s]+;', '', reformed_sentence)
    sentence = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', sentence)
    sentence = re.sub(' +',' ', sentence)
    #text = re.sub(' [\w] ', ' ', text)
    return [sentence.strip()]

In [10]:
process_text1('i liek @dasd,123,hello, world! hello')

['i liek world hello']

In [11]:
df_train['label'].value_counts()

1    8958
0    6786
Name: label, dtype: int64

In [12]:
df_train_X = df_train['user_review']
df_train_y = df_train['label']

In [13]:
#Import testing data
df_test = pd.read_csv('test_data.csv')
df_test=df_test.rename(columns={"user_suggestion":'label'})
df_test = df_test[['user_review','label']]

In [14]:
df_test_X = df_test['user_review']
df_test_y = df_test['label']

In [15]:
# Use different model and create pipeline
# 1.LogisticRegression
# 2.RandomForest
# 3.KNN
# 4.Decision Tree
# 5.SVM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

tvec = TfidfVectorizer()
classifier_lr = LogisticRegression()
classifier_rf = RandomForestClassifier()
classifier_knn = KNeighborsClassifier()
classifier_dt = DecisionTreeClassifier()
classifier_svm = SVC()

pipeline_model_lr = Pipeline([('vectorizer1',tvec),('classifier_lr',classifier_lr)])
pipeline_model_rf = Pipeline([('vectorizer2',tvec),('classifier_rf',classifier_rf)])
pipeline_model_knn = Pipeline([('vectorizer3',tvec),('classifier_knn',classifier_knn)])
pipeline_model_dt = Pipeline([('vectorizer4',tvec),('classifier_dt',classifier_dt)])
pipeline_model_svm = Pipeline([('vectorizer5',tvec),('classifier_svm',classifier_svm)])


pipelines = [pipeline_model_lr,pipeline_model_rf,pipeline_model_knn,pipeline_model_dt,pipeline_model_svm]

In [16]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [17]:
pipe_dict = {0:'Logistic Regression',1:'RandomForest',2:'KNN',3:'Decision Tree',4:'SVM'}

In [18]:
for pipe in pipelines:
    pipe.fit(df_train_X,df_train_y)

In [21]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],accuracy_score(model.predict(df_test_X),df_test_y)))

Logistic Regression Test Accuracy: 0.8668571428571429
RandomForest Test Accuracy: 0.8325714285714285
KNN Test Accuracy: 0.5777142857142857
Decision Tree Test Accuracy: 0.6817142857142857
SVM Test Accuracy: 0.8765714285714286


In [22]:
for i,model in enumerate(pipelines):
    if accuracy_score(model.predict(df_test_X),df_test_y)>best_accuracy:
        best_accuracy = accuracy_score(model.predict(df_test_X),df_test_y)
        best_pipeline = model
        best_classifier = i
print("The best classifier is {},and accuracy is {}".format(pipe_dict[best_classifier],best_accuracy))        

The best classifier is SVM,and accuracy is 0.8765714285714286


In [23]:
best_model = pipelines[[i for i in pipe_dict if pipe_dict[i]==pipe_dict[best_classifier]][0]]

In [24]:
from sklearn.metrics import confusion_matrix
best_prediction =best_model.predict(df_test_X)
confusion_matrix(best_prediction, df_test_y)

array([[621,  97],
       [119, 913]], dtype=int64)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

print("Best_model Accuracy : ", accuracy_score(best_prediction, df_test_y))
print("Best_model Precision : ", precision_score(best_prediction, df_test_y, average = 'weighted'))
print("Best_model Recall : ", recall_score(best_prediction, df_test_y, average = 'weighted'))
print("Best_model F1 score:",f1_score(best_prediction,df_test_y,average = 'weighted'))

Best_model Accuracy :  0.8765714285714286
Best_model Precision :  0.8773856951718337
Best_model Recall :  0.8765714285714286
Best_model F1 score: 0.8768377511413751


In [28]:
import pickle
file = open('sentiment_analysis_best_model.pkl','wb')
pickle.dump(best_model,file)

In [29]:
model = pickle.load(open('sentiment_analysis_best_model.pkl','rb'))

In [30]:
example = 'Do NOT buy anything from them. I stupidly boug'

best_model.predict(process_text1(example))

array([0], dtype=int64)