In [327]:
import csv

In [328]:
import nltk

In [329]:
import numpy as np

In [330]:
import pandas as pd

In [331]:
import sklearn.feature_extraction.text

In [332]:
from sklearn.feature_extraction.text import TfidfTransformer

In [333]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [334]:
from sklearn.naive_bayes import MultinomialNB

In [335]:
from sklearn import metrics

In [336]:
import scipy

In [337]:
import string

In [338]:
from nltk.tokenize import word_tokenize

In [339]:
from nltk.corpus import stopwords

In [340]:
#Opening the file

In [341]:
the_data = pd.read_csv("all_reviews.csv")

In [342]:
# Randomizing the rows in the file

In [343]:
the_data = the_data.reindex(np.random.permutation(the_data.index))

In [344]:
# Total instances in the csv data, pre-sorting into train and test.
good = 0
bad = 0
for item in the_data['good/bad']:
    if item == 'bad':
        bad += 1
    if item == 'good':
        good += 1
print('Good: ' + str(good))
print('Bad: ' + str(bad))
        

Good: 96
Bad: 85


In [345]:
# Create a new dataframe with modified star value & a cleaned up review

In [384]:
data = []
for index, row in the_data.iterrows():
    sentence = ""
    # extract the review from the original
    review = row['review']
    # split into words
    tokens = word_tokenize(review)
    # convert to lowercase
    tokens = [w.lower() for w in tokens]
    # remove punctuation and abberations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words & join in a sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    sentence = ' '.join(words)
    data.append({'stars': (row['stars']) / 10,
                      'review': sentence,
                      'good/bad': row['good/bad']})
new_frame = pd.DataFrame(data)

Unnamed: 0,good/bad,review,stars
0,bad,major disappointment,0.5
1,good,think one best films whole movie although film...,5.0
2,bad,ok film poor star wars lazy writing character ...,1.0
3,bad,great film mark hamill turns performance caree...,5.0
4,good,best star wars film ever new trilogy taking st...,5.0
5,bad,never took critics wrong rian blew opportunity...,2.0
6,good,feel disturbance force movie light saber fight...,1.0
7,good,expected great movie resets star wars universe...,5.0
8,good,absolutely worst star wars movie thought tfa b...,0.5
9,good,movie completely annihilated beloved franchise...,0.5


In [347]:
#Extracting features from text, define target y and data x

In [348]:
vect = TfidfVectorizer()
X = vect.fit_transform(new_frame['review'])
Y = new_frame['good/bad']

In [349]:
#Partitioning the data into test and training set
split = 0.75
split_size = int(len(Y)*split)

X_train = X[:split_size]
X_test = X[split_size:]

Y_train = Y[:split_size]
Y_test = Y[split_size:]

In [350]:
#Test data numbers
test_good = 0
test_bad = 0 

for rating in Y_test:
    if rating == 'good':
        test_good += 1
    if rating == 'bad':
        test_bad += 1
print('Good reviews in test data: ' + str(test_good))
print('Bad reviews in test data: ' + str(test_bad))

Good reviews in test data: 22
Bad reviews in test data: 24


In [351]:
# Training the model

In [352]:
clf = MultinomialNB()
clf.fit(X, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [353]:
# Evaluating the results

In [354]:
# Accuracy on training set
print(clf.score(X_train, Y_train))

0.97037037037


In [355]:
# Accuracy on testing set
print(clf.score(X_test, Y_test))

1.0


In [306]:
Y_test


135     bad
136    good
137     bad
138     bad
139    good
140     bad
141     bad
142     bad
143    good
144     bad
145     bad
146    good
147     bad
148     bad
149     bad
150     bad
151     bad
152     bad
153    good
154     bad
155    good
156     bad
157    good
158     bad
159    good
160     bad
161     bad
162     bad
163    good
164    good
165    good
166    good
167     bad
168    good
169    good
170     bad
171     bad
172     bad
173    good
174    good
175    good
176    good
177     bad
178     bad
179     bad
180     bad
Name: good/bad, dtype: object

In [356]:
Y_pred = clf.predict(X_test)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

        bad       1.00      1.00      1.00        24
       good       1.00      1.00      1.00        22

avg / total       1.00      1.00      1.00        46



In [357]:
Y_pred

array(['good', 'good', 'good', 'bad', 'bad', 'good', 'bad', 'bad', 'good',
       'good', 'bad', 'good', 'bad', 'bad', 'good', 'good', 'good', 'good',
       'good', 'bad', 'good', 'bad', 'good', 'bad', 'bad', 'good', 'bad',
       'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'good', 'bad',
       'bad', 'good', 'good', 'bad', 'good', 'bad', 'bad', 'good', 'bad',
       'bad'],
      dtype='<U4')

In [None]:
#Messing around to see what we can pull.

In [165]:
# Testing it on new data

In [358]:
test = ['This is nothing but a Chick Flick in Space, should be called Rainbow Wars,']

In [359]:
t_test = vect.transform(test)

In [360]:
y_pred = clf.predict(t_test)

Unnamed: 0,stars,good/bad,review
0,5,,I went in with high hopes. I was a huge fan ...
1,5,,Poor story. Some very dodgy CG and way too m...
2,5,,Disney destroyed 40 years worth of classic s...
3,5,,"Devastated at how bad this movie was. Awful,..."
4,5,,It was awful. I feel bad for the pointless c...
5,5,,Absolutely the worst Star Wars movie to date...
6,5,,Had some great moments but zero character de...
7,5,,???? ?? ? ????????? ??????? ????????????? ??...
8,5,,"Such a disappointing plot, storyline, and ch..."
9,5,,"I really wanted to leave a 1,000+ page revie..."


In [186]:
# Predicting quality of unsorted data

In [361]:
#Importing
feature_data = pd.read_csv("test2.csv")

In [375]:
# This is to test the data is importing correctly
feature_good = 0
feature_bad = 0
for item in feature_data['stars']:
    if item >= 25:
        feature_good += 1
    else:
        feature_bad += 1
print('Positive Reviews: ' + str(feature_good))
print('Negative Reviews: ' + str(feature_bad))

Positive Reviews: 279
Negative Reviews: 411


In [393]:
# Cleaning
feature_list = []
for index, row in feature_data.iterrows():
    sentence = ""
    # extract the review from the original
    review = row['review']
    # split into words
    tokens = word_tokenize(review)
    # convert to lowercase
    tokens = [w.lower() for w in tokens]
    # remove punctuation and abberations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words & join in a sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    sentence = ' '.join(words)
    feature_list.append({'stars': (row['stars']) / 10,
                      'review': sentence,
                      'good/bad': row['good/bad']})
feature_frame = pd.DataFrame(feature_list)

In [394]:
feature_test = vect.transform(feature_frame)

In [395]:
new_y_pred = clf.predict(feature_test)

In [397]:
feature_test

<3x2424 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [294]:
new_y_test = feature_frame['good/bad']

KeyError: 'good/bad'