In [50]:
import csv

In [51]:
import nltk

In [52]:
import numpy as np

In [53]:
import pandas as pd

In [54]:
import sklearn.feature_extraction.text

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
from sklearn.naive_bayes import MultinomialNB

In [59]:
from sklearn import metrics

In [60]:
import scipy

In [61]:
import string

In [62]:
from nltk.tokenize import word_tokenize

In [63]:
from nltk.corpus import stopwords

In [64]:
#Opening the file

In [200]:
the_data = pd.read_csv("all_reviews.csv")

In [201]:
# Randomizing the rows in the file

In [244]:
the_data = the_data.reindex(np.random.permutation(the_data.index))

In [245]:
# Total instances in the csv data, pre-sorting into train and test.
good = 0
bad = 0
for item in the_data['good/bad']:
    if item == 'bad':
        bad += 1
    if item == 'good':
        good += 1
print('Good: ' + str(good))
print('Bad: ' + str(bad))
        

Good: 96
Bad: 85


In [246]:
data = []
for index, row in the_data.iterrows():
    sentence = ""
    # extract the review from the original
    review = str(row['review'])
    # split into words
    tokens = word_tokenize(review)
    # convert to lowercase
    tokens = [w.lower() for w in tokens]
    # remove punctuation and abberations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words & join in a sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    sentence = ' '.join(words)
    data.append({'stars': (row['stars']) / 10,
                      'review': sentence,
                      'good/bad': row['good/bad']})
new_frame = pd.DataFrame(data)

In [247]:
# Create a new dataframe with modified star value & a cleaned up review

In [248]:
#Extracting features from text, define target y and data x

In [249]:
X = new_frame['review']
Y = new_frame['good/bad']

In [250]:
#Partitioning the data into test and training set
split = 0.75
split_size = int(len(new_frame)*split)

X_train = X[:split_size]
X_test = X[split_size:]

Y_train = Y[:split_size]
Y_test = Y[split_size:]

In [251]:
vect = CountVectorizer()

In [252]:
X_train_dtm = vect.fit_transform(X_train)

In [253]:
tfidf_transformer = TfidfTransformer()

In [254]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)

In [255]:
# -------------------------------------------------

In [256]:
X_test_dtm = vect.transform(X_test)

In [257]:
X_test_tfidf = tfidf_transformer.transform(X_test_dtm)

In [258]:
#Test data numbers
test_good = 0
test_bad = 0 

for rating in Y_test:
    if rating == 'good':
        test_good += 1
    if rating == 'bad':
        test_bad += 1
print('Good reviews in test data: ' + str(test_good))
print('Bad reviews in test data: ' + str(test_bad))

Good reviews in test data: 26
Bad reviews in test data: 20


In [259]:
# Training the model

In [260]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [261]:
# Evaluating the results

In [262]:
# Accuracy on training set
clf.score(X_train_tfidf, Y_train)

1.0

In [263]:
# Accuracy on testing set
print(clf.score(X_test_tfidf, Y_test))

0.695652173913


In [264]:
Y_pred = clf.predict(X_test_tfidf)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

        bad       0.80      0.40      0.53        20
       good       0.67      0.92      0.77        26

avg / total       0.72      0.70      0.67        46



In [265]:
# False negative
X_test[Y_pred > Y_test]

143    seen trailers expectations movie trying best g...
147    loved fantastic storytelling visuals spectacle...
153    star wars fan movie go fun leave casual star w...
154    lot liked movie things let go first luke time ...
157    rian johnson takes star wars universe injects ...
160    fanboy boy though enjoyed original trilogy qui...
163    genuinely bad space gravity would nt arc energ...
164    extended advert toys especially mogwai looking...
165                                 love much best movie
169    let preface review saying loved film bits leav...
173    loved movie filled enjoyment every second shak...
178                                  really enjoyed film
Name: review, dtype: object

In [266]:
#Messing around to see what we can pull.

In [267]:
# Testing it on new data

In [290]:
test = ['SJW bullshit']

In [291]:
t_test = vect.transform(test)

In [292]:
y_pred = clf.predict(t_test)

In [293]:
print(y_pred)

['bad']


In [294]:
# Predicting quality of unsorted data

In [295]:
#Importing
feature_data = pd.read_csv("test2.csv")

In [296]:
# This is to test the data is importing correctly
feature_good = 0
feature_bad = 0
for item in feature_data['stars']:
    if item >= 25:
        feature_good += 1
    else:
        feature_bad += 1
print('Positive Reviews: ' + str(feature_good))
print('Negative Reviews: ' + str(feature_bad))

Positive Reviews: 279
Negative Reviews: 411


In [297]:
# Cleaning
feature_list = []
for index, row in feature_data.iterrows():
    sentence = ""
    # extract the review from the original
    review = row['review']
    # split into words
    tokens = word_tokenize(review)
    # convert to lowercase
    tokens = [w.lower() for w in tokens]
    # remove punctuation and abberations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words & join in a sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    sentence = ' '.join(words)
    feature_list.append({'stars': (row['stars']) / 10,
                      'review': sentence,
                      'good/bad': row['good/bad']})
feature_frame = pd.DataFrame(feature_list)

In [298]:
feature_counts = vect.transform(feature_frame['review'])

In [299]:
feature_counts

<690x1864 sparse matrix of type '<class 'numpy.int64'>'
	with 20899 stored elements in Compressed Sparse Row format>

In [302]:
feature_test = vect.transform(feature_frame)

In [304]:
feature_counts = vect.transform(feature_frame['review'])

In [305]:
new_y_pred = clf.predict(feature_counts)

In [306]:
feature_good = 0
feature_bad = 0
for i in new_y_pred:
    if i == 'good':
        feature_good += 1
    if i == 'bad':
        feature_bad += 1

In [307]:
print("Bad: " + str(feature_bad) + " Good: " + str(feature_good))

Bad: 129 Good: 561


In [281]:
# -------------------------------------------------------------- #

In [308]:
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

1864

In [309]:
a_token = clf.feature_count_[0, :]
b_token = clf.feature_count_[1, :]
tokens = pd.DataFrame({'token': X_train_tokens, 'bad': a_token, 'good': b_token}).set_index('token')

In [310]:
tokens.head()

Unnamed: 0_level_0,bad,good
token,Unnamed: 1_level_1,Unnamed: 2_level_1
abandons,0.0,0.168332
abarca,0.044934,0.0
able,0.136145,0.091609
abomination,0.0,0.150035
abrams,0.302636,0.318246


In [311]:
tokens['bad'] += 1
tokens['good'] += 1
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,bad,good
token,Unnamed: 1_level_1,Unnamed: 2_level_1
mary,1.0,1.268631
ways,1.128009,1.295002
numbers,1.374464,1.0
sit,1.496876,1.0
noninspiring,1.0,1.155912


In [312]:
tokens['ratio'] = tokens.bad / tokens.good
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,bad,good,ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
mary,1.0,1.268631,0.788251
ways,1.128009,1.295002,0.871048
numbers,1.374464,1.0,1.374464
sit,1.496876,1.0,1.496876
noninspiring,1.0,1.155912,0.865117


In [313]:
tokens.sort_values('ratio', ascending=False)

Unnamed: 0_level_0,bad,good,ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rewarding,1.876367,1.000000,1.876367
la,1.867203,1.000000,1.867203
enjoy,2.001820,1.078418,1.856256
sucked,1.805819,1.000000,1.805819
far,2.465002,1.368791,1.800861
hate,1.799097,1.000000,1.799097
better,3.483028,1.942699,1.792881
worst,3.072307,1.765260,1.740428
phenomenal,1.735597,1.000000,1.735597
history,1.716720,1.000000,1.716720
