# ML Algorithm to Judge the Emotional Sentiment of an Amazon Review

In [173]:
import random

# Creating a sentiment class to make the code neater (not necessary; could just return the sentiment strings by itself in get_sentiment method)

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"


# Create a class to make data storing into a list more clear

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)] # Making a new list with positive reviews equal to the number of negative reviews
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [174]:
import json
from pprint import pprint

file_name = './Books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line) # Convert json to a dictionary
        reviews.append(Review(review['reviewText'], review['overall']))

print(reviews[-1].sentiment)

POSITIVE


In [175]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [176]:
# Creating datasets with equal number of elements

train_container.evenly_distribute()
test_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = train_container.get_text()
test_y = train_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))
print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))

436
436
436
436


### Bag of words vectorization

In [185]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count_vect = TfidfVectorizer()
train_x_vectors = count_vect.fit_transform(train_x)

test_x_vectors = count_vect.transform(test_x)

print(train_x[0])
print(train_x_vectors[0])

To put things in perspective, after reading two thirds of this book, I noticed a reference to &#34;present day Yugoslavia&#34;. At that point I realized that it was first published in 1982. After over 30 years this book maintains its freshness and relevance on an amazing subject.The level of detail that McCullough brings about the early part of Teddy Roosevelt's life is astonishing. A child that faced illness and developmental problems, went on to achieve so much in his life. And the secret to his success was a mixture of born grit and ambition coupled with unconditional parental love.What a wonderful story. Roosevelt's life is better than fiction.I would highly recommend this book to anybody interested in American history and politics, as well as a parenting book. We can all learn from how the Roosevelts raised their children in the late 1800s.
  (0, 29)	0.12102015490658889
  (0, 4560)	0.0958798235286412
  (0, 1409)	0.0848876018265445
  (0, 7935)	0.059139339254350616
  (0, 6346)	0.115

### Using different classification models

#### Linear SVM

In [186]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [187]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [188]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

clf_gnb.fit(train_x_vectors.toarray(), train_y)

clf_gnb.predict(test_x_vectors.toarray())

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE',
       'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE',
       'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE',
       'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE',
       'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE',
       'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE',
       'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE',
       'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE',
       'POSITIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATI

#### Logistic Regression

In [189]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Evaluation

#### Mean Accuracy

In [190]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
print(clf_log.score(test_x_vectors, test_y))


0.9885321100917431
1.0
0.9839449541284404
0.9655963302752294


#### F1 Score

In [191]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

[0.98850575 0.         0.98855835]
[1. 0. 1.]
[0.98368298 0.         0.98419865]
[0.96527778 0.         0.96590909]


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


### Tuning our model using Grid Search

In [194]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,6,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)


#### Saving our Model

In [196]:
import pickle

with open('./sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)


#### Loading Model

In [198]:
with open('./sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
    

In [199]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

To put things in perspective, after reading two thirds of this book, I noticed a reference to &#34;present day Yugoslavia&#34;. At that point I realized that it was first published in 1982. After over 30 years this book maintains its freshness and relevance on an amazing subject.The level of detail that McCullough brings about the early part of Teddy Roosevelt's life is astonishing. A child that faced illness and developmental problems, went on to achieve so much in his life. And the secret to his success was a mixture of born grit and ambition coupled with unconditional parental love.What a wonderful story. Roosevelt's life is better than fiction.I would highly recommend this book to anybody interested in American history and politics, as well as a parenting book. We can all learn from how the Roosevelts raised their children in the late 1800s.


array(['POSITIVE'], dtype='<U8')