In [50]:
import csv

In [51]:
import nltk

In [52]:
import numpy as np

In [53]:
import pandas as pd

In [54]:
import sklearn.feature_extraction.text

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
from sklearn.feature_extraction.text import TfidfTransformer

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
from sklearn.naive_bayes import MultinomialNB

In [59]:
from sklearn import metrics

In [60]:
import scipy

In [61]:
import string

In [62]:
from nltk.tokenize import word_tokenize

In [63]:
from nltk.corpus import stopwords

In [64]:
#Opening the file

In [65]:
the_data = pd.read_csv("all_reviews.csv")

In [85]:
# Randomizing the rows in the file

In [86]:
the_data = the_data.reindex(np.random.permutation(the_data.index))

In [87]:
# Total instances in the csv data, pre-sorting into train and test.
good = 0
bad = 0
for item in the_data['good/bad']:
    if item == 'bad':
        bad += 1
    if item == 'good':
        good += 1
print('Good: ' + str(good))
print('Bad: ' + str(bad))
        

Good: 96
Bad: 85


In [88]:
data = []
for index, row in the_data.iterrows():
    sentence = ""
    # extract the review from the original
    review = str(row['review'])
    # split into words
    tokens = word_tokenize(review)
    # convert to lowercase
    tokens = [w.lower() for w in tokens]
    # remove punctuation and abberations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words & join in a sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    sentence = ' '.join(words)
    data.append({'stars': (row['stars']) / 10,
                      'review': sentence,
                      'good/bad': row['good/bad']})
new_frame = pd.DataFrame(data)

In [89]:
# Create a new dataframe with modified star value & a cleaned up review

In [90]:
#Extracting features from text, define target y and data x

In [72]:
X = new_frame['review']
Y = new_frame['good/bad']

In [73]:
#Partitioning the data into test and training set
split = 0.75
split_size = int(len(new_frame)*split)

X_train = X[:split_size]
X_test = X[split_size:]

Y_train = Y[:split_size]
Y_test = Y[split_size:]

In [103]:
vect = CountVectorizer()

In [104]:
X_train_dtm = vect.fit_transform(X_train)

In [105]:
tfidf_transformer = TfidfTransformer()

In [106]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)

In [107]:
# -------------------------------------------------

In [108]:
X_test_dtm = vect.transform(X_test)

In [114]:
X_test_tfidf = tfidf_transformer.transform(X_test_dtm)

In [110]:
#Test data numbers
test_good = 0
test_bad = 0 

for rating in Y_test:
    if rating == 'good':
        test_good += 1
    if rating == 'bad':
        test_bad += 1
print('Good reviews in test data: ' + str(test_good))
print('Bad reviews in test data: ' + str(test_bad))

Good reviews in test data: 24
Bad reviews in test data: 22


In [81]:
# Training the model

In [111]:
clf = MultinomialNB()
clf.fit(X_train_tfidf, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [112]:
# Evaluating the results

In [113]:
# Accuracy on training set
clf.score(X_train_tfidf, Y_train)

0.97037037037037033

In [115]:
# Accuracy on testing set
print(clf.score(X_test_tfidf, Y_test))

0.608695652174


In [116]:
Y_pred = clf.predict(X_test_tfidf)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

        bad       0.70      0.32      0.44        22
       good       0.58      0.88      0.70        24

avg / total       0.64      0.61      0.57        46



In [153]:
# False negative
X_test[Y_pred > Y_test]

141    loved movie filled enjoyment every second shak...
142    visually best star wars film nt like luke port...
146    never took critics wrong rian blew opportunity...
150    made account write review movie best star wars...
156    one point necessary cut umbilical cord best fi...
157    film piece shit rian johnson simply destroyed ...
158    extremely bad story writing think star wars fa...
159    absolute travesty mark hamillthis borderline k...
160    silver lining coolest lightsaber battle ever r...
162    kicsit még emészteni kell de alapjában jó volt...
168    loved saw friday parents mom seen star wars em...
169    disney killed star wars poor attempt star wars...
172    genuinely bad space gravity would nt arc energ...
173    movie ok nothing special made dislike characte...
179        emotionally powerful scenes star wars history
Name: review, dtype: object

In [117]:
#Messing around to see what we can pull.

In [None]:
# Testing it on new data

In [131]:
test = ['SJW bullshit']

In [132]:
t_test = vect.transform(test)

In [133]:
y_pred = clf.predict(t_test)

In [134]:
print(y_pred)

['bad']


In [None]:
# Predicting quality of unsorted data

In [135]:
#Importing
feature_data = pd.read_csv("test2.csv")

In [136]:
# This is to test the data is importing correctly
feature_good = 0
feature_bad = 0
for item in feature_data['stars']:
    if item >= 25:
        feature_good += 1
    else:
        feature_bad += 1
print('Positive Reviews: ' + str(feature_good))
print('Negative Reviews: ' + str(feature_bad))

Positive Reviews: 279
Negative Reviews: 411


In [137]:
# Cleaning
feature_list = []
for index, row in feature_data.iterrows():
    sentence = ""
    # extract the review from the original
    review = row['review']
    # split into words
    tokens = word_tokenize(review)
    # convert to lowercase
    tokens = [w.lower() for w in tokens]
    # remove punctuation and abberations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words & join in a sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    sentence = ' '.join(words)
    feature_list.append({'stars': (row['stars']) / 10,
                      'review': sentence,
                      'good/bad': row['good/bad']})
feature_frame = pd.DataFrame(feature_list)

In [140]:
feature_counts = vect.transform(feature_frame['review'])

In [144]:
feature_counts

<690x2114 sparse matrix of type '<class 'numpy.int64'>'
	with 21742 stored elements in Compressed Sparse Row format>

In [None]:
feature_counts = count_vect.transform(feature_frame['review'])

In [145]:
feature_test = vect.transform(feature_frame)

In [147]:
new_y_pred = clf.predict(feature_counts)

In [158]:
feature_good = 0
feature_bad = 0
for i in new_y_pred:
    if i == 'good':
        feature_good += 1
    if i == 'bad':
        feature_bad += 1

In [163]:
print("Bad: " + str(feature_bad) + " Good: " + str(feature_good))

Bad: 108 Good: 582
