In [1]:
import numpy as np
import pandas as pd
import json

In [45]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"


class Review:
    def __init__ (self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2: # 1 or 2 
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #if score is 4 or 5
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__ (self,reviews):
        self.reviews = reviews
        
    def get_text(self):
       return [x.text for x in self.reviews]

    def get_sentiment(self):
       return [x.sentiment for x in self.reviews]

        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        #neutral = list(filter(lambda x: x.sentiment == Sentiment.NEUTRAL, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        

In [46]:
file_name = 'Books_small_10000.json'

reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
       
        reviews.append(Review(review['reviewText'], review['overall']))
        
        
reviews[32].score
        

5.0

In [47]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size = 0.33, random_state =42)
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [48]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

In [50]:
print(train_y.count(Sentiment.POSITIVE))
#print(train_y.count(Sentiment.NEUTRAL))
print(train_y.count(Sentiment.NEGATIVE))

436
436


### Bag Of Words

In [51]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

print(train_x_vectors[0].toarray())
train_x_vectors[0]
train_y[15]


[[0. 0. 0. ... 0. 0. 0.]]


'NEGATIVE'

In [53]:
from sklearn import svm
clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vectors, train_y)
test_x[300]
clf_svm.predict(test_x_vectors[300])

array(['NEGATIVE'], dtype='<U8')

In [55]:
from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)
clf_dec.predict(test_x_vectors[300])

array(['POSITIVE'], dtype='<U8')

In [56]:
 from sklearn.naive_bayes import GaussianNB
 gnb_dec = DecisionTreeClassifier()
 gnb_dec.fit(train_x_vectors, train_y)
 gnb_dec.predict(test_x_vectors[300])   

array(['NEGATIVE'], dtype='<U8')

In [57]:
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[300])



array(['NEGATIVE'], dtype='<U8')

### Mean Accuracy 

In [72]:
clf_svm.score(test_x_vectors,test_y)*100

80.76923076923077

In [73]:
clf_dec.score(test_x_vectors,test_y)*100

63.46153846153846

In [74]:
gnb_dec.score(test_x_vectors,test_y)*100

62.980769230769226

In [82]:
clf_log.score(test_x_vectors,test_y)*100

80.28846153846155

In [76]:
train_y.count(Sentiment.POSITIVE)

436

### F1 Scores

In [77]:
from sklearn.metrics import f1_score


print(f1_score(test_y,clf_log.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])*100)
print(f1_score(test_y,clf_svm.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])*100)
print(f1_score(test_y,clf_dec.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])*100)
print(f1_score(test_y,gnb_dec.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])*100)

[80.09708738 80.47619048]
[80.58252427 80.95238095]
[63.63636364 63.28502415]
[62.25490196 63.67924528]


In [85]:
# testing the models using our own texts
test_set = ['I do recommend this book, good read', 'I have had better', 'Do not buy, what a waste']
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

We can see that the best performing models are svm and logistic regression.
The challenge that we have here is the lack of data, 