In [44]:
import pandas as pd
import utils
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from math import ceil
from pickle import dump

In [10]:
# Load the reviews dataset through the project-local `utils` helper.
# NOTE(review): the helper's source is not visible here; the cells below treat
# the result as a pandas DataFrame with `review` and `polarity` columns.
raw_df = utils.load_reviews_db()

Database successfully loaded


In [11]:
# Show the raw frame to eyeball its columns before any cleaning.
raw_df

Unnamed: 0_level_0,package_name,review,polarity
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [26]:
# Class balance of the target: how many reviews fall under each polarity label.
raw_df["polarity"].value_counts()

polarity
0    584
1    307
Name: count, dtype: int64

In [13]:
# Build the working frame from raw_df (left untouched) with the review text
# trimmed of surrounding whitespace and lowercased.
df = raw_df.assign(review=raw_df["review"].str.strip().str.lower())

In [23]:
# Features are the review texts, the target is the polarity label.
x = df["review"]
y = df["polarity"]

# Hold out 20% of the reviews for evaluation; the seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Bag-of-words encoding with English stop words removed. The vocabulary is
# learned from the training texts only and then reused on the test texts.
vec_model = CountVectorizer(stop_words="english")
x_train = vec_model.fit_transform(x_train).toarray()
x_test = vec_model.transform(x_test).toarray()


In [27]:
# Baseline: multinomial Naive Bayes with default hyper-parameters.
model = MultinomialNB().fit(x_train, y_train)

# Score the baseline on the held-out reviews.
y_pred = model.predict(x_test)
acc_score = accuracy_score(y_test, y_pred)
rec_score = recall_score(y_test, y_pred)

print(
    f'accuracy:   {acc_score}',
    f'recall:     {rec_score}',
    sep='\n',
)

accuracy:   0.8156424581005587
recall:     0.6037735849056604


In [30]:
# Candidate hyper-parameters for MultinomialNB. Kept under its own name so that
# `grid` only ever refers to the fitted search object (it previously held this
# dict first and was then rebound to the GridSearchCV instance).
param_grid = {
    'alpha': [0.001, 0.1, 0.5, 1, 3],
    'fit_prior': [True, False],
}

# Exhaustive search over param_grid, ranking candidates by balanced accuracy
# computed with cross-validation on the training data.
grid = GridSearchCV(model, param_grid, scoring='balanced_accuracy')
grid.fit(x_train, y_train)

# Evaluate the best refitted estimator on the held-out reviews.
y_pred = grid.best_estimator_.predict(x_test)
params = grid.best_params_

acc_score = accuracy_score(y_test, y_pred)
rec_score = recall_score(y_test, y_pred)

print(params)
print(f'accuracy:   {acc_score}')
print(f'recall:     {rec_score}')

{'alpha': 0.5, 'fit_prior': False}
accuracy:   0.8044692737430168
recall:     0.660377358490566


In [39]:
import os
import os.path as path

# Build a file name that encodes the tuned hyper-parameters, e.g.
# "MultinomialNB_alpha_0.5_fitprior_False.sav" (underscores inside a parameter
# name are dropped so that "_" only separates name/value pairs).
model_name = 'MultinomialNB'
param_string_list = ['_' + param.replace('_','') + '_' + str(value)
                     for param, value in params.items()]
model_name += ''.join(param_string_list) + '.sav'

model_path = path.join('..','models',model_name)

# Make sure the target directory exists; open() would otherwise fail on a
# checkout where ../models has not been created yet.
os.makedirs(path.dirname(model_path), exist_ok=True)

# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() used before was never closed.
with open(model_path, 'wb') as model_file:
    dump(grid.best_estimator_, model_file)



In [None]:
# Alternative model: a decision tree with default settings (seeded so the
# fitted tree is reproducible), trained on the same bag-of-words features.
tree_model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Score the tree on the held-out reviews.
y_pred = tree_model.predict(x_test)
acc_score = accuracy_score(y_test, y_pred)
rec_score = recall_score(y_test, y_pred)

print(
    f'accuracy:   {acc_score}',
    f'recall:     {rec_score}',
    sep='\n',
)



accuracy:   0.7150837988826816
recall:     0.6037735849056604


In [45]:
# Alternative model: logistic regression on the same bag-of-words features.
# Stored under its own name; it was previously assigned to `tree_model`, which
# mislabelled it and overwrote the fitted decision tree from the cell above.
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(x_train,y_train)
y_pred = logreg_model.predict(x_test)

# Score the logistic regression on the held-out reviews.
acc_score = accuracy_score(y_test, y_pred)
rec_score = recall_score(y_test, y_pred)

print(f'accuracy:   {acc_score}')
print(f'recall:     {rec_score}')

accuracy:   0.8324022346368715
recall:     0.8113207547169812
