In [1]:
import pandas as pd
import numpy as np
import sklearn
import math
import re
from pathlib import Path
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score

In [2]:
# Import data
df_train = pd.read_csv('train_canonical.csv')
df_train.shape

(15729, 5)

In [3]:
df_train = pd.read_csv('train_data.csv')
df_train = df_train[['user_review','user_suggestion']]
df_train

Unnamed: 0,user_review,user_suggestion
0,"Heroes & Generals had potential, but had to me...",0
1,Early Access ReviewHell yeah just came back fr...,1
2,I've played this game for 5 years on their int...,1
3,Do NOT buy anything from them. I stupidly boug...,0
4,Early Access ReviewThis is the future of MMOs....,1
...,...,...
15739,"Early Access ReviewQuite a fun game, the block...",0
15740,Early Access Reviewtoo many people jumped on t...,1
15741,Lets see...â™¦ Half the players play like bots...,0
15742,Early Access ReviewMy most favoured (hated) ga...,1


In [4]:
#change the label
df_train = df_train.rename(columns={'user_suggestion':'label'})

In [5]:
df_train.head()

Unnamed: 0,user_review,label
0,"Heroes & Generals had potential, but had to me...",0
1,Early Access ReviewHell yeah just came back fr...,1
2,I've played this game for 5 years on their int...,1
3,Do NOT buy anything from them. I stupidly boug...,0
4,Early Access ReviewThis is the future of MMOs....,1


In [6]:
df_train['label'].unique()

array([0, 1], dtype=int64)

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15744 entries, 0 to 15743
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_review  15744 non-null  object
 1   label        15744 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 246.1+ KB


In [8]:
# tokenize word from game review
import re

def process_text(content):
    sentence = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','', content['user_review'])
    sentence = re.sub('@[^\s]+','', sentence)
    sentence = sentence.lower().split()
    reformed_sentence = [word for word in sentence]
    reformed_sentence = " ".join(reformed_sentence) 
    sentence = re.sub('&[^\s]+;', '', reformed_sentence)
    sentence = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', sentence)
    sentence = re.sub(' +',' ', sentence)
    #text = re.sub(' [\w] ', ' ', text)
    return sentence.strip()

df_train['user_review'] = df_train.apply(process_text, axis=1)
pd.set_option('display.max_colwidth', None)
df_train.head()

Unnamed: 0,user_review,label
0,heroes generals had potential but had to mess it up with greed and apathy this game had so much potential to be a good game in fact it was a good game before these updates messed up everything the updates ruined the german faction to the point that any game with the germans is a near certain loss they don t listen to community requests and go on their own to buff things making them op or nerfing the hell out of items whcih devalues the time put in that endless grind to get any item in this game it took around the 6 hour mark to actually get a bare sniper rifle for my infantry which needs an extra 3 hours to get the essential attachments to be any use in matches moving to gameplay it used to be pretty fun with games having an even match that last a considerable amount of time compared to the current match stomps where you only see paratroopers and people sneaking past the lines to steal the obj while the other team goes for their base i have seen countless times where our or the enemy team after a long battle capturing the objective getting their point capture by those paratroopers that drop in every minute right before the point was secured moving past that infantry destroys tanks which makes them obsolete there s barely any reason to play recon since everything is speeden up from the constant stream of crappy updates and they ban many people who hadn t even done anything there s too much to list and it makes this game incredibly boring as well as tilting i can t find any reason why you would play this game in it s current state unless you want to waste time or money which i might add is completely pay to win since the most op weapons are sitting on top of a hefty grind or a pretty penny this game is horribly executing and since the developer don t care of the popular reception will continually to dig a grave for this game,0
1,early access reviewhell yeah just came back from the store and both this fresh pair of pants now let me see if there is some cool free games on steam oh spooky they say well lets se about that click on start new games 5 min later instant stains on your new 4 pantsand as a bonus when you get to the end of the game you will have enough bricks to build a spooky house of jumpscares yourself 1 1 for making me install a toilet in my bedroom d,1
2,i ve played this game for 5 years on their internet client and it s been great over the years i expect great things from deca and i reccomend this game to anyone who is looking for an great game to play solo or with friends,1
3,do not buy anything from them i stupidly bought the starter pack for 4 99 after i played for a while i did really enjoy it but this morning i wake up go to check my vault and everything is gone no vault no pack waste of money,0
4,early access reviewthis is the future of mmos it s honestly so 7 1 that i could play it for hours without getting bored the later levels lack the same spark of interest that levels 1 1 have but i assume this is because the game is in early access it s worth it to make an account play around for a couple hours and send it on its way until it s better,1


In [9]:
def process_text1(content):
    sentence = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','', content)
    sentence = re.sub('@[^\s]+','', sentence)
    sentence = sentence.lower().split()
    reformed_sentence = [word for word in sentence]
    reformed_sentence = " ".join(reformed_sentence) 
    sentence = re.sub('&[^\s]+;', '', reformed_sentence)
    sentence = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', sentence)
    sentence = re.sub(' +',' ', sentence)
    #text = re.sub(' [\w] ', ' ', text)
    return [sentence.strip()]

In [10]:
process_text1('i liek @dasd,123,hello, world! hello')

['i liek world hello']

In [11]:
df_train['label'].value_counts()

1    8958
0    6786
Name: label, dtype: int64

In [12]:
df_train_X = df_train['user_review']
df_train_y = df_train['label']

In [13]:
# dataset is balanced, we don't need to deal 
df_train_y.sum()/len(df_train_y)

0.5689786585365854

In [14]:
#Import testing data
df_test = pd.read_csv('test_held_out.csv')
df_test=df_test.rename(columns={"user_suggestion":'label'})
df_test = df_test[['user_review','label']]

In [15]:
df_test_X = df_test['user_review']
df_test_y = df_test['label']
df_test_X.shape

(1765,)

In [16]:
df_test_y

0       1
1       1
2       1
3       1
4       1
       ..
1760    1
1761    0
1762    1
1763    1
1764    1
Name: label, Length: 1765, dtype: int64

In [17]:
# Use different model and create pipeline
# 1.LogisticRegression
# 2.RandomForest
# 3.KNN
# 4.Decision Tree
# 5.SVM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

tvec = TfidfVectorizer()
classifier_lr = LogisticRegression()
classifier_rf = RandomForestClassifier()
classifier_knn = KNeighborsClassifier()
classifier_dt = DecisionTreeClassifier()
classifier_svm = SVC()

pipeline_model_lr = Pipeline([('vectorizer1',tvec),('classifier_lr',classifier_lr)])
pipeline_model_rf = Pipeline([('vectorizer2',tvec),('classifier_rf',classifier_rf)])
pipeline_model_knn = Pipeline([('vectorizer3',tvec),('classifier_knn',classifier_knn)])
pipeline_model_dt = Pipeline([('vectorizer4',tvec),('classifier_dt',classifier_dt)])
pipeline_model_svm = Pipeline([('vectorizer5',tvec),('classifier_svm',classifier_svm)])


pipelines = [pipeline_model_lr,pipeline_model_rf,pipeline_model_knn,pipeline_model_dt,pipeline_model_svm]

In [18]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [19]:
pipe_dict = {0:'Logistic Regression',1:'RandomForest',2:'KNN',3:'Decision Tree',4:'SVM'}

In [20]:
for pipe in pipelines:
    pipe.fit(df_train_X,df_train_y)

In [21]:
# Accuracy
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],accuracy_score(model.predict(df_test_X),df_test_y)))

Logistic Regression Test Accuracy: 0.9218130311614731
RandomForest Test Accuracy: 0.9835694050991501
KNN Test Accuracy: 0.39206798866855525
Decision Tree Test Accuracy: 0.9592067988668556
SVM Test Accuracy: 0.9790368271954675


In [22]:
# Precision score
for i,model in enumerate(pipelines):
    print("{} Test Preception Score: {}".format(pipe_dict[i],precision_score(model.predict(df_test_X),df_test_y)))

Logistic Regression Test Preception Score: 0.9225806451612903
RandomForest Test Preception Score: 0.9903225806451613
KNN Test Preception Score: 0.9967741935483871
Decision Tree Test Preception Score: 0.9661290322580646
SVM Test Preception Score: 0.9725806451612903


In [23]:
# Recall score
for i,model in enumerate(pipelines):
    print("{} Test Recall Score: {}".format(pipe_dict[i],recall_score(model.predict(df_test_X),df_test_y)))

Logistic Regression Test Recall Score: 0.8640483383685801
RandomForest Test Recall Score: 0.9638932496075353
KNN Test Recall Score: 0.36589698046181174
Decision Tree Test Recall Score: 0.9215384615384615
SVM Test Recall Score: 0.9678972712680578


In [24]:
# f1 score
for i,model in enumerate(pipelines):
    print("{} Test f1 Score: {}".format(pipe_dict[i],f1_score(model.predict(df_test_X),df_test_y)))

Logistic Regression Test f1 Score: 0.8923556942277692
RandomForest Test f1 Score: 0.9769291964996022
KNN Test f1 Score: 0.5352966652230403
Decision Tree Test f1 Score: 0.9433070866141733
SVM Test f1 Score: 0.9702333065164924


In [25]:
# roc_auc score
for i,model in enumerate(pipelines):
    print("{} Test f1 Score: {}".format(pipe_dict[i],roc_auc_score(model.predict(df_test_X),df_test_y)))

Logistic Regression Test f1 Score: 0.9102653296557316
RandomForest Test f1 Score: 0.9792870503356824
KNN Test f1 Score: 0.6697905954940638
Decision Tree Test f1 Score: 0.9513521904104865
SVM Test f1 Score: 0.9765055533222952


In [26]:
for i,model in enumerate(pipelines):
    if accuracy_score(model.predict(df_test_X),df_test_y)>best_accuracy:
        best_accuracy = accuracy_score(model.predict(df_test_X),df_test_y)
        best_pipeline = model
        best_classifier = i
print("The best classifier is {},and accuracy is {}".format(pipe_dict[best_classifier],best_accuracy))        

The best classifier is RandomForest,and accuracy is 0.9835694050991501


In [27]:
best_model = pipelines[[i for i in pipe_dict if pipe_dict[i]==pipe_dict[best_classifier]][0]]

In [28]:
from sklearn.metrics import confusion_matrix
best_prediction =best_model.predict(df_test_X)
confusion_matrix(best_prediction, df_test_y)

array([[1122,    6],
       [  23,  614]], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

print("Best_model Accuracy : ", accuracy_score(best_prediction, df_test_y))
print("Best_model Precision : ", precision_score(best_prediction, df_test_y, average = 'weighted'))
print("Best_model Recall : ", recall_score(best_prediction, df_test_y, average = 'weighted'))
print("Best_model F1 score:",f1_score(best_prediction,df_test_y,average = 'weighted'))

Best_model Accuracy :  0.9835694050991501
Best_model Precision :  0.9836696705876062
Best_model Recall :  0.9835694050991501
Best_model F1 score: 0.9835197422983617


In [32]:
import pickle
file = open('sentiment_analysis_best_model.pkl','wb')
pickle.dump(best_model,file)

In [33]:
model = pickle.load(open('sentiment_analysis_best_model.pkl','rb'))

In [34]:
example = 'Do NOT buy anything from them. I stupidly boug'

best_model.predict(process_text1(example))

array([0], dtype=int64)