In [7]:
# Importing libraries
import pandas as pd
import numpy as np
from collections import Counter
import pickle

import nltk
from nltk.probability import FreqDist
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.metrics import ConfusionMatrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# My own functions
from NLP_Functions import find_features, make_matrix, clean_up, tokenize, stem_and_lemmatize, remove_stopwords

In [8]:
# Importing the dataset
comments = pd.read_csv('Datasets/games_comments_cleaned.csv', index_col = 0)

In [9]:
# Creating a 15k sample this time, to see if things are a little better over a 5k sample
df = comments.sample(15000, random_state = 18)  # random_state for reproducibility

# Resetting the index
df.reset_index(drop = True, inplace = True)
df

Unnamed: 0,Title,Platform,Userscore,Comment,Username
0,Super Smash Bros. for Wii U,WiiU,9,"This game is amazing, improves in every aspect...",Zin_49
1,Zero Escape: Zero Time Dilemma,3DS,8,A bunch of bootleg robotic versions of the mai...,Techbane
2,Metro: Last Light,PC,6,I liked the original this is some innovation b...,gstiker5
3,PlanetSide 2,PC,5,Planetside 2 has battles of epic scale and de...,NasseSeta
4,Diablo III,PC,1,Game looks good and overall i like the graphic...,JamesLFranco
...,...,...,...,...,...
14995,Halo: Reach,Xbox360,9,"Halo is far superior to it's competition, alwa...",IcyGames
14996,Uncharted 4: A Thief's End,PlayStation4,10,A Great Game even includes a Crash Bandicoot M...,Daxterman20
14997,Cuphead,XboxOne,10,Easily one of the best games of the year with ...,simsy
14998,The Elder Scrolls III: Morrowind,PC,8,This game would easily be 10 out of 10. I've p...,cimerians


## Creating the 'Target' (label) column

I will be testing 2 different label classifications, one of my own (Target) and one suggested from the NPS system (Target_NPS)

In [10]:
# Creating the 'Target' column
## 9-10 positive
## 8 and below negative
df['Target'] = np.where((df['Userscore'] <= 8), 'Negative', 'Positive')  # Target_3 from previous notebook


# Creating the 'Target_NPS' column
## NPS Scale - https://en.wikipedia.org/wiki/Net_Promoter
## 9-10 = positive
## 7-8 = neutral
## 0-6 = negative
df['Target_NPS'] = np.where((df['Userscore'] <= 6), 'Negative', 'Positive')
df['Target_NPS'] = np.where(((df['Userscore'] >= 7) & (df['Userscore'] <= 8)), 'Neutral', df['Target_NPS'])


# Checking the different proportion of values
## unsure if I should balance these or not
print(df['Target'].value_counts())
df['Target_NPS'].value_counts()

Positive    8724
Negative    6276
Name: Target, dtype: int64


Positive    8724
Negative    3812
Neutral     2464
Name: Target_NPS, dtype: int64

## Processing the 'Comments'

In [11]:
# Creating the 'Comments_Processed' column
df['Comments_Processed'] = df['Comment'].apply(lambda x: stem_and_lemmatize(remove_stopwords(tokenize(clean_up(x)))))
## wonder if there's a more efficient way of doing this. It took a few minutes

In [12]:
df.head()

Unnamed: 0,Title,Platform,Userscore,Comment,Username,Target,Target_NPS,Comments_Processed
0,Super Smash Bros. for Wii U,WiiU,9,"This game is amazing, improves in every aspect...",Zin_49,Positive,Positive,"[game, amaz, improv, everi, aspect, previou, s..."
1,Zero Escape: Zero Time Dilemma,3DS,8,A bunch of bootleg robotic versions of the mai...,Techbane,Negative,Neutral,"[bunch, bootleg, robot, version, main, charact..."
2,Metro: Last Light,PC,6,I liked the original this is some innovation b...,gstiker5,Negative,Negative,"[like, origin, innov, silent, main, charact, e..."
3,PlanetSide 2,PC,5,Planetside 2 has battles of epic scale and de...,NasseSeta,Negative,Negative,"[planetsid, battl, epic, scale, decent, gfx, g..."
4,Diablo III,PC,1,Game looks good and overall i like the graphic...,JamesLFranco,Negative,Negative,"[game, look, good, overal, like, graphic, game..."


## Initializing the basic variables for testing

In [13]:
'''INITIALIZING EVERYTHING TO BEGIN TESTING WITH DIFFERENT MODELS'''
# Creating the bag of words
bow = [word for lst in df['Comments_Processed'] for word in lst]
fdist = FreqDist(bow)

# Getting just the 10k most common words
most_common = fdist.most_common(10000)  # WILL TRY CHANGING THIS A BIT TO SEE THE ACCURACY IMPACTS

'''TARGET'''
# Building the features and making the matrix
matrix = make_matrix(df['Comment'], df['Target'], most_common)

# Defining the size to use for the training and testing
size = int(len(matrix) * 0.20)

# Training with 80% of the data and testing against the remaining 20%
training_set = matrix[size:]
testing_set = matrix[:size]

In [None]:
'''TARGET_NPS'''
# Building the features and making the matrix
matrix_NPS = make_matrix(df['Comment'], df['Target_NPS'], most_common)

# Defining the size to use for the training and testing
size_NPS = int(len(matrix_NPS) * 0.20)

# Training with 80% of the data and testing against the remaining 20%
training_set_NPS = matrix_NPS[size_NPS:]
testing_set_NPS = matrix_NPS[:size_NPS]


'''
# Initializing and training the model
classifier_1 = nltk.NaiveBayesClassifier.train(training_set_1)

# Showing the top 15 most informative features
classifier_1.show_most_informative_features(15)

# Printing the model's accuracy
print('\n', 'Original NLTK NB accuracy (Target):', 
      str(round(nltk.classify.accuracy(classifier_1, testing_set_1) * 100, 2)) + '%')'''

## Multinomial NB Testing

### Testing with 'Target'

In [39]:
# TESTING WITH TARGET
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print('MultinomialNB accuracy (Target):', 
      str(round(nltk.classify.accuracy(MNB_classifier, testing_set) * 100, 2)) + '%')
# default: 73.17% - seems ok? (before with 5k sample it was 68.3% or so)
# alpha = 2: 69.4% - NOT TESTED HERE YET

# I will probably use this one I guess, if nothing better comes along

MultinomialNB accuracy (Target): 73.17%


### Confusion Matrix for 'Target'

In [118]:
# Predictions of the testing_set with the Target column
pred_list_MNB = [MNB_classifier.classify(testing_set[i][0]) for i in range(len(testing_set))]
print(pred_list_MNB.count('Positive'))  # 2025
print(pred_list_MNB.count('Negative'))  # 975

2025
975


In [199]:
# Unpacking the references (Positive or Negative) of the testing set to use for the confusion matrix
ref = [testing_set[i][1] for i in range(len(testing_set))]  # pos: 1748 | neg: 1258

# List of predictions ran above
tagged = pred_list_MNB

# The actual confusion matrix
cm = ConfusionMatrix(ref, tagged)

print(cm)
cm

labels = set('Positive Negative'.split())

true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i, j]
        else:
            false_negatives[i] += cm[i, j]
            false_positives[j] += cm[i, j]

print('TP + TN:', sum(true_positives.values()), true_positives)
print('FP + FN:', sum(false_positives.values()), false_positives)

         |    N    P |
         |    e    o |
         |    g    s |
         |    a    i |
         |    t    t |
         |    i    i |
         |    v    v |
         |    e    e |
---------+-----------+
Negative | <711> 541 |
Positive |  264<1484>|
---------+-----------+
(row = reference; col = test)

TP + TN: 2195 Counter({'Positive': 1484, 'Negative': 711})
FP + FN: 805 Counter({'Positive': 541, 'Negative': 264})


### Testing with 'Target_NPS'

In [40]:
# TESTING WITH TARGET_NPS
MNB_classifier_NPS = SklearnClassifier(MultinomialNB())
MNB_classifier_NPS.train(training_set_NPS)

print('MultinomialNB accuracy (Target_NPS):', 
      str(round(nltk.classify.accuracy(MNB_classifier_NPS, testing_set_NPS) * 100, 2)) + '%')
# default: 67.93%  (before with 5k sample it was 68.3%)
# alpha = 2: 69.4% - NOT TESTED HERE YET

MultinomialNB accuracy (Target_NPS): 67.93%


### Confusion Matrix for 'Target_NPS'

In [117]:
# Predictions of the testing_set_NPS with the Target_NPS column
pred_list_MNB_NPS = [MNB_classifier_NPS.classify(testing_set_NPS[i][0]) for i in range(len(testing_set_NPS))]
print(pred_list_MNB_NPS.count('Positive'))  # 1946
print(pred_list_MNB_NPS.count('Negative'))  # 649
print(pred_list_MNB_NPS.count('Neutral'))  # 405

1946
649
405


In [202]:
# Unpacking the references (Positive, Negative, Neutral) of the testing_set_NPS to use for the confusion matrix
ref_NPS = [testing_set_NPS[i][1] for i in range(len(testing_set_NPS))]  # pos: 1748 | neg: 782 | neut: 470

# List of predictions ran above
tagged = pred_list_MNB_NPS

# The actual confusion matrix
cm = ConfusionMatrix(ref, tagged)

print(cm)
cm

labels = set('Positive Negative Neutral'.split())

true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i, j]
        else:
            false_negatives[i] += cm[i, j]
            false_positives[j] += cm[i, j]

# Since I have 3 labels here the results are seeming a bit strange? Where is that 405 neutral from? Not in the CM
print('TP + TN:', sum(true_positives.values()), true_positives)
print('FP + FN:', sum(false_positives.values()), false_positives)

         |    N         P |
         |    e    N    o |
         |    g    e    s |
         |    a    u    i |
         |    t    t    t |
         |    i    r    i |
         |    v    a    v |
         |    e    l    e |
---------+----------------+
Negative | <545> 203  504 |
 Neutral |    .   <.>   . |
Positive |  104  202<1442>|
---------+----------------+
(row = reference; col = test)

TP + TN: 1987 Counter({'Positive': 1442, 'Negative': 545, 'Neutral': 0})
FP + FN: 1013 Counter({'Positive': 504, 'Neutral': 405, 'Negative': 104})


In [112]:
# Prediction of the infamous sentence the original NLTK NB never got right
bad_sentence = find_features('This game sucks so much. I hate it a lot. This is complete garbage', most_common)

print('MNB_classifier prediction:', MNB_classifier.classify(bad_sentence))
print('MNB_classifier_NPS prediction:', MNB_classifier_NPS.classify(bad_sentence))

MNB_classifier prediction: Negative
MNB_classifier_NPS prediction: Negative


## Logistic Regression Testing

### Testing with 'Target'

In [14]:
# TESTING WITH TARGET
LogisticRegression_classifier = SklearnClassifier(LogisticRegression(solver = 'saga', n_jobs = -1))#, max_iter = 500))
LogisticRegression_classifier.train(training_set)
# saga solver = 78.7% w/ 5k most_common
# saga solver = 79% w/ 10k most_common
# solver = 'saga', n_jobs = -1, max_iter = 500 -> 78%, still didn't converge

print('Logistic Regression accuracy (Target):', 
      str(round(nltk.classify.accuracy(LogisticRegression_classifier, testing_set) * 100, 2)) + '%')



Logistic Regression accuracy (Target): 79.0%


### Confusion Matrix for 'Target'

In [15]:
# Predictions of the testing_set with the Target column
pred_list_LR = [LogisticRegression_classifier.classify(testing_set[i][0]) for i in range(len(testing_set))]
print(pred_list_LR.count('Positive'))  # 1809 w/ remove_stopwords in wrong place | now: 1802
print(pred_list_LR.count('Negative'))  # 1191 w/ remove_stopwords in wrong place | now: 11198

1802
1198


In [16]:
# Unpacking the references (Positive or Negative) of the testing set to use for the confusion matrix
ref = [testing_set[i][1] for i in range(len(testing_set))]  # pos: 1748 | neg: 1258

# List of predictions ran above
tagged = pred_list_LR

# The actual confusion matrix
cm = ConfusionMatrix(ref, tagged)

print(cm)
cm

labels = set('Positive Negative'.split())

true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i, j]
        else:
            false_negatives[i] += cm[i, j]
            false_positives[j] += cm[i, j]

# This seems to be the best model as the accuracy already predicted
print('TP + TN:', sum(true_positives.values()), true_positives)
print('FP + FN:', sum(false_positives.values()), false_positives)

         |    N    P |
         |    e    o |
         |    g    s |
         |    a    i |
         |    t    t |
         |    i    i |
         |    v    v |
         |    e    e |
---------+-----------+
Negative | <910> 342 |
Positive |  288<1460>|
---------+-----------+
(row = reference; col = test)

TP + TN: 2370 Counter({'Positive': 1460, 'Negative': 910})
FP + FN: 630 Counter({'Positive': 342, 'Negative': 288})


### Testing with 'Target_NPS'

In [56]:
# TESTING WITH TARGET_NPS
LogisticRegression_classifier_NPS = SklearnClassifier(LogisticRegression(solver = 'saga'))
LogisticRegression_classifier_NPS.train(training_set_NPS)
# saga solver = 72.9%

print('Logistic Regression accuracy (Target_NPS):', 
      str(round(nltk.classify.accuracy(LogisticRegression_classifier_NPS, testing_set_NPS) * 100, 2)) + '%')



Logistic Regression accuracy (Target_NPS): 72.9%


### Confusion Matrix for 'Target_NPS'

In [115]:
# Predictions of the testing_set_NPS with the Target_NPS column
pred_list_LR_NPS = [LogisticRegression_classifier_NPS.classify(testing_set_NPS[i][0]) 
                    for i in range(len(testing_set_NPS))]
print(pred_list_LR_NPS.count('Positive'))  # 1855
print(pred_list_LR_NPS.count('Negative'))  # 774
print(pred_list_LR_NPS.count('Neutral'))  # 371

1855
774
371


In [203]:
# Unpacking the references (Positive, Negative, Neutral) of the testing_set_NPS to use for the confusion matrix
ref_NPS = [testing_set_NPS[i][1] for i in range(len(testing_set_NPS))]  # pos: 1748 | neg: 782 | neut: 470

# List of predictions ran above
tagged = pred_list_LR_NPS

# The actual confusion matrix
cm = ConfusionMatrix(ref, tagged)

print(cm)
cm

labels = set('Positive Negative Neutral'.split())

true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i, j]
        else:
            false_negatives[i] += cm[i, j]
            false_positives[j] += cm[i, j]

print('TP + TN:', sum(true_positives.values()), true_positives)
print('FP + FN:', sum(false_positives.values()), false_positives)

         |    N         P |
         |    e    N    o |
         |    g    e    s |
         |    a    u    i |
         |    t    t    t |
         |    i    r    i |
         |    v    a    v |
         |    e    l    e |
---------+----------------+
Negative | <664> 214  374 |
 Neutral |    .   <.>   . |
Positive |  110  157<1481>|
---------+----------------+
(row = reference; col = test)

TP + TN: 2145 Counter({'Positive': 1481, 'Negative': 664, 'Neutral': 0})
FP + FN: 855 Counter({'Positive': 374, 'Neutral': 371, 'Negative': 110})


In [120]:
# Prediction of the infamous sentence the original NLTK NB never got right
print('LogisticRegression_classifier prediction:', LogisticRegression_classifier.classify(bad_sentence))
print('LogisticRegression_classifier_NPS prediction:', LogisticRegression_classifier_NPS.classify(bad_sentence))

LogisticRegression_classifier prediction: Negative
LogisticRegression_classifier_NPS prediction: Negative


## Linear SVC Testing

The normal SVC model actually gave me pretty good results in the previous notebook.

However, it takes too long to compute, and it is even advised to use either Linear SVC or SGD in the documentation of SVC for larger datasets. That's why I am not testing it again with this bigger sample, and also why I won't be using it in the final 280k dataset, as it will take forever to process.

In [59]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
# Default - 74.93%

print('LinearSVC_classifier accuracy (Target):', 
      str(round(nltk.classify.accuracy(LinearSVC_classifier, testing_set) * 100, 2)) + '%')



LinearSVC_classifier accuracy (Target): 74.93%


In [60]:
LinearSVC_classifier_NPS = SklearnClassifier(LinearSVC())
LinearSVC_classifier_NPS.train(training_set_NPS)
# Default - 69.43%

print('LinearSVC_classifier_NPS accuracy (Target_NPS):', 
      str(round(nltk.classify.accuracy(LinearSVC_classifier_NPS, testing_set_NPS) * 100, 2)) + '%')

LinearSVC_classifier_NPS accuracy (Target_NPS): 69.43%


In [61]:
# Prediction of the infamous sentence the original NLTK NB never got right
print('LinearSVC_classifier prediction:', LinearSVC_classifier.classify(bad_sentence))
print('LinearSVC_classifier_NPS prediction:', LinearSVC_classifier_NPS.classify(bad_sentence))

LinearSVC_classifier prediction: Negative
LinearSVC_classifier_NPS prediction: Negative


## Function to Predict a Sentence's/Review's Label

In [231]:
# This was improved upon in notebook 6
def predictor(text):
    prediction = find_features(text, most_common)
    return print('Prediction:', LogisticRegression_classifier.classify(prediction))

predictor('This game is amazing!')
predictor('I love this game!')

Prediction: Positive
Prediction: Positive


In [17]:
# Saving the 5k most_common words of this 15k sample DF
save_most_common = open('10kmost_common_15ksample.pickle', 'wb')
pickle.dump(most_common, save_most_common)
save_most_common.close()

In [18]:
# Saving the trained LR algorithm
save_classifier = open('Logistic_Regression_15k.pickle', 'wb')
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

In [19]:
# Exporting this 15k sample DF to use later for plots and everything
df.to_json('Datasets/comments_processed_15ksample.json')

In [155]:
# Testing opening the pickled Logistic Regression Trained Model
#classifier_f = open('naivebayes.pickle', 'rb')
#classifier_og = pickle.load(classifier_f)
#classifier_f.close()

#classifier_og  # this one is the trained algo with the whole comments df