In [1]:
import random

class Sentiment:
    """Namespace of the string labels used to tag a review's sentiment."""

    # Each constant's value is its own name as a plain string.
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"

class Review:
    """One review: its text, its numeric score, and the label derived from it."""

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and compute ``sentiment`` from the score."""
        self.text = text
        self.score = score
        # Computed once here; it is not refreshed if ``score`` is changed later.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the ``Sentiment`` label that corresponds to ``self.score``.

        A score of 2 or less is negative, a score of exactly 3 is neutral,
        and any other score is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        # Everything else (4 or 5 in the expected rating scale).
        return Sentiment.POSITIVE

class ReviewContainer:
    """Wraps a list of reviews and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        """Keep a reference to ``reviews`` (objects with ``text`` and ``sentiment``)."""
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in the container's current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in the container's current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance to equal numbers of negative and positive reviews, then shuffle.

        Replaces ``self.reviews`` with a new list; reviews whose sentiment is
        neither negative nor positive (i.e. neutral ones) are dropped.
        Shuffling uses the module-level ``random`` generator, so the resulting
        order is only repeatable if the caller seeds ``random`` beforehand.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Cap BOTH classes at the size of the smaller one. Truncating only the
        # positives leaves the data unbalanced whenever negatives outnumber them.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

## Class Data

## Load Data

In [3]:
import json

# Reviews file, read below as one JSON object per line.
file_name = './Data/Sentiment/Books_small_10000.json'

reviews = []

# Pass the encoding explicitly: without it open() uses the platform default,
# which is not UTF-8 everywhere and can fail or garble non-ASCII review text.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # A blank line is not valid JSON and would make json.loads raise.
        if not line.strip():
            continue
        review = json.loads(line)
        # 'reviewText' is the review body and 'overall' the score passed to Review.
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check: show the sentiment derived for one loaded review.
reviews[22].sentiment

'POSITIVE'

## Prep Data

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for testing; the fixed random_state keeps the
# split the same from run to run.
training, testing = train_test_split(reviews, test_size=0.3, random_state=42)

# Wrap each partition so its texts and labels can be extracted together.
train_container, test_container = ReviewContainer(training), ReviewContainer(testing)

In [5]:
# evenly_distribute() shuffles with the global `random` generator. Seed it so
# the row order (and therefore outputs such as the prediction for
# test_x_vectors[0]) is the same on every Restart & Run All.
random.seed(42)

# Balance the training set, then pull out texts and labels in matching order.
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# The test set is balanced the same way, so the scores below are measured on
# an evenly split negative/positive sample rather than the raw distribution.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Number of positive training examples after balancing.
train_y.count(Sentiment.POSITIVE)

461

## Bag of Words vectorization (TF-IDF weighted)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF weighted features: TfidfVectorizer is used here, not the
# CountVectorizer that is imported alongside it.
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training texts and transform them in one call.
train_x_vectors = vectorizer.fit_transform(train_x)

# Transform only (no refit): the test texts are encoded with the vectorizer
# fitted on the training texts above.
test_x_vectors = vectorizer.transform(test_x)

# Display the first training vector as a dense array.
train_x_vectors[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

## CLASSIFICATION MODEL

In [7]:
from sklearn import svm

# Linear-kernel support vector classifier, trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training chain.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test review.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the same features.
# fit() returns the estimator itself, so construction and training chain.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test review.
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## EVALUATION

In [9]:
# score() on the held-out test set, one line per model: SVM first, then
# logistic regression.
print(
    *(model.score(test_x_vectors, test_y) for model in (clf_svm, clf_log)),
    sep='\n',
)

0.8387978142076503
0.8224043715846995


In [10]:
from sklearn.metrics import f1_score

# average=None asks for a separate F1 score per class rather than one averaged
# value. One line per model: SVM first, then logistic regression.
# NOTE(review): the per-class order is presumably sorted label order
# (NEGATIVE, POSITIVE) — confirm, or pass labels=... to make it explicit.
print(
    *(
        f1_score(test_y, model.predict(test_x_vectors), average=None)
        for model in (clf_svm, clf_log)
    ),
    sep='\n',
)

[0.84182306 0.8356546 ]
[0.82573727 0.8189415 ]


## GRID SEARCH TO FIND THE BEST PARAMETERS FOR THE MODEL

In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: every combination of kernel and C is tried.
parameters = dict(
    kernel=('linear', 'rbf'),
    C=(1, 4, 8, 16, 32),
)

# Base estimator whose hyper-parameters the search overrides.
svc = svm.SVC()

# 5-fold cross-validated search over the grid above.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)

# Run the search on the training data; the fitted search object is displayed.
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [12]:
# Score of the fitted grid-search object on the held-out test set.
print(clf.score(X=test_x_vectors, y=test_y))

0.8387978142076503


## TESTING

In [15]:
# Ad-hoc check on a hand-written phrase.
# Training used evenly_distribute(), which keeps only negative and positive
# reviews, so no NEUTRAL examples were seen during fitting.
test =['an average book']

# Encode with the already-fitted vectorizer (transform only, no refit).
new_test = vectorizer.transform(test)

# Predict with the linear SVM trained earlier.
clf_svm.predict(new_test)

array(['POSITIVE'], dtype='<U8')

## SAVING THE MODEL

In [225]:
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so make sure
# ./models exists; exist_ok=True makes re-running this cell harmless.
os.makedirs('./models', exist_ok=True)

# Persist the fitted grid-search object.
# NOTE(review): only `clf` is saved. The fitted `vectorizer` that produces its
# input features is not, so a fresh process would also need that object to use
# this model — consider persisting it alongside.
with open('./models/review_sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [234]:
# Read the pickled classifier back from the path used when saving it.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files you created yourself or otherwise trust.
with open ('./models/review_sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [247]:
# Smoke-test the reloaded classifier on a hand-written phrase.
test =['best book ever']

# Uses the in-memory `vectorizer`: it was not loaded from the pickle, so it
# must still be defined in the running kernel.
new_test = vectorizer.transform(test)

# Predict with the object read back from disk.
loaded_clf.predict(new_test)

array(['POSITIVE'], dtype='<U8')