In [2]:
import csv

In [3]:
import nltk

In [4]:
import numpy as np

In [5]:
import pandas as pd

In [6]:
import sklearn.feature_extraction.text

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [109]:
from sklearn import metrics

In [8]:
import scipy

In [9]:
import string

In [7]:
from nltk.tokenize import word_tokenize

In [10]:
from nltk.corpus import stopwords

In [14]:
#Opening the file

In [116]:
the_data = pd.read_csv("all_reviews.csv")

In [117]:
# Randomizing the rows in the file

In [118]:
the_data = the_data.reindex(np.random.permutation(the_data.index))

In [151]:
# Total instances in the csv data, pre-sorting into train and test.
good = 0
bad = 0
for item in the_data['good/bad']:
    if item == 'bad':
        bad += 1
    if item == 'good':
        good += 1
print('Good: ' + str(good))
print('Bad: ' + str(bad))
        

Good: 96
Bad: 85


In [119]:
# Create a new dataframe with modified star value & a cleaned up review

In [120]:
data = []
for index, row in the_data.iterrows():
    sentence = ""
    # extract the review from the original
    review = row['review']
    # split into words
    tokens = word_tokenize(review)
    # convert to lowercase
    tokens = [w.lower() for w in tokens]
    # remove punctuation and abberations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words & join in a sentence
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    sentence = ' '.join(words)
    data.append({'stars': (row['stars']) / 10,
                      'review': sentence,
                      'good/bad': row['good/bad']})
new_frame = pd.DataFrame(data)

In [121]:
#Extracting features from text, define target y and data x

In [122]:
vect = TfidfVectorizer()
X = vect.fit_transform(new_frame['review'])
Y = new_frame['good/bad']

In [123]:
#Partitioning the data into test and training set
split = 0.75
split_size = int(len(Y)*split)

X_train = X[:split_size]
X_test = X[split_size:]

Y_train = Y[:split_size]
Y_test = Y[split_size:]

In [160]:
#Test data numbers
test_good = 0
test_bad = 0 

for rating in Y_test:
    if rating == 'good':
        test_good += 1
    if rating == 'bad':
        test_bad += 1
print('Good reviews in test data: ' + str(test_good))
print('Bad reviews in test data: ' + str(test_bad))

Good reviews in test data: 18
Bad reviews in test data: 28


In [None]:
# Training the model

In [124]:
clf = MultinomialNB()
clf.fit(X, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [125]:
# Evaluating the results

In [126]:
# Accuracy on training set
print(clf.score(X_train, Y_train))

0.977777777778


In [127]:
# Accuracy on testing set
print(clf.score(X_test, Y_test))

0.978260869565


In [128]:
Y_pred = clf.predict(X_test)
print(metrics.classification_report(Y_test, Y_pred))

             precision    recall  f1-score   support

        bad       1.00      0.96      0.98        28
       good       0.95      1.00      0.97        18

avg / total       0.98      0.98      0.98        46



In [None]:
#Messing around to see what we can pull.

In [183]:
X

['abandons',
 'abarca',
 'able',
 'abomination',
 'abrams',
 'absolute',
 'absolutely',
 'abysmal',
 'accomplish',
 'accomplished',
 'according',
 'account',
 'achievement',
 'ackbar',
 'across',
 'acters',
 'acting',
 'action',
 'actionpacked',
 'actor',
 'actors',
 'acts',
 'actual',
 'actually',
 'adage',
 'add',
 'added',
 'addition',
 'additionally',
 'address',
 'admiral',
 'admittedly',
 'advancing',
 'adventure',
 'advert',
 'agenda',
 'agnostic',
 'agonizing',
 'agrivating',
 'ahead',
 'air',
 'aire',
 'airs',
 'al',
 'alapjában',
 'albeit',
 'alien',
 'alienated',
 'aliens',
 'allegiance',
 'allow',
 'allowed',
 'allowing',
 'almost',
 'along',
 'alot',
 'already',
 'alright',
 'also',
 'although',
 'always',
 'amazing',
 'ambitious',
 'amilyn',
 'amount',
 'anakin',
 'angrily',
 'animal',
 'annihilated',
 'annoyed',
 'annoying',
 'another',
 'answer',
 'answered',
 'answering',
 'answers',
 'anxiety',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'apart',
 'ap

In [165]:
# Testing it on new data

In [142]:
test = ['This is nothing but a Chick Flick in Space, should be called Rainbow Wars,']

In [143]:
t_test = vect.transform(test)

In [181]:
y_pred = clf.predict(t_test)