In [73]:
import random
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

## Load Data

In [63]:
import json

file_name = './data/Books_small_10000.json'

reviews =[]
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[10].text

"My only complaint about this book is that it is much too short. I love this author and this series, and I can't wait for the next installment."

## Prep Data

In [76]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews , test_size= 0.33, random_state= 10)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)





In [65]:
print(training[0].sentiment)

POSITIVE


In [91]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

train_y.count(Sentiment.POSITIVE)

444

Bag of words vectorizing

In [105]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)





## Classification

### Linear SVM

In [106]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[10]

clf_svm.predict(test_x_vectors[10])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [107]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[10])

array(['NEGATIVE'], dtype='<U8')

### Logistic regression

In [108]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[10])


array(['POSITIVE'], dtype='<U8')

## Evaluation

In [109]:
# Mean Accuracy

print(clf_svm.score(test_x_vectors,test_y))

print(clf_dec.score(test_x_vectors,test_y))

print(clf_log.score(test_x_vectors,test_y))



0.835
0.6725
0.8325


In [111]:
# F1 Score

from sklearn.metrics import f1_score

print (f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, 
         labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]))

print (f1_score(test_y, clf_dec.predict(test_x_vectors), average = None, 
         labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]))

print (f1_score(test_y, clf_log.predict(test_x_vectors), average = None, 
        labels = [Sentiment.POSITIVE,  Sentiment.NEGATIVE]))

[0.83163265 0.83823529]
[0.65974026 0.68433735]
[0.82506527 0.83932854]


In [103]:
test_set = ['really bad', "stupid", 'good use of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

## Tuning using Grid Search

In [113]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [114]:
print(clf.score(test_x_vectors, test_y))


0.8275


### Saving Model

In [116]:
import pickle

with open('./models/reviewing_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

## Load Model

In [117]:
with open('./models/reviewing_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
    

In [119]:

print(test_x[10])
loaded_clf.predict(test_x_vectors[10])

I just got into this hobby so I am trying to learn all I can.  I noticed this book is very highly reviewed so I pulled the trigger. I will probably donate it to my library when I am done because they don't have a copy.


array(['POSITIVE'], dtype='<U8')