In [3]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)


In [4]:
## load data
import json

file_name = 'Books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
print(reviews[1].sentiment)

NEUTRAL


In [5]:
## Prep data, split data into training and test
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size = 0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [6]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

In [7]:
## bag of words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

In [8]:
## classification
## linear SVM

from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]
clf_svm.predict(test_x_vectors[0])




array(['NEGATIVE'], dtype='<U8')

In [9]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [10]:
## Naive Bayes
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

In [11]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression

clf_log = DecisionTreeClassifier()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [12]:
## Evaluate 
## mean accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))

0.8076923076923077
0.6802884615384616
0.6658653846153846
0.6418269230769231


In [13]:
## f1 scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([0.80582524, 0.80952381])

In [14]:
test_set = ["brilliant", "horrible book", "waste of time", "wonderful, would definitely recommend it"]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

In [20]:
## grid search
from sklearn.model_selection import GridSearchCV
parameters = {'kernel': ('linear','rbf'), 'C': (1,4,8,16,32)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)
print(clf.best_params_)


{'C': 4, 'kernel': 'rbf'}


In [21]:
print(clf.score(test_x_vectors, test_y))

0.8197115384615384


In [None]:
## save model
## import pickle (install lib)
## with open('directiory name/model name.pkl', 'wb') as f:
##      pickle.dump(clf, f)

## load model

## with open('directiory name/model name.pkl', 'rb') as f:
##      loaded_clf = pickle.load(f)
## loaded_clf.predict(test_x_vectors[0])