In [112]:
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
    
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        


class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
           

# Load Data

In [93]:
import json
file_name = './datasets/books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)

        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[5].score

5.0

# Prep Data

In [94]:
import pandas as pd

data = {'Review': [rev.text for rev in reviews],
        'Rating': [rev.score for rev in reviews]}
df = pd.DataFrame(data)
df

Unnamed: 0,Review,Rating
0,"I bought both boxed sets, books 1-5. Really a...",5.0
1,I enjoyed this short book. But it was way way ...,3.0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0
3,I really enjoyed this adventure and look forwa...,4.0
4,It was a decent read.. typical story line. Not...,3.0
...,...,...
9995,The whole series was great! Melody is a fanta...,5.0
9996,I didn't thing that much of this book. I am a...,3.0
9997,It is an emotional TRIP to the past with Trip ...,5.0
9998,This definitely got under my veins whereby I h...,5.0


In [113]:
from sklearn.model_selection import train_test_split

training, test= train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)


In [68]:
len(test)

3300

In [116]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


In [124]:
len(train_y)

872

Bag of words vectorization

In [127]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer() 
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0])

For a novice Kindle owner, this book has proven invaluable.  I return to it for instructions or brush up at least once a week.  Money well spent.  Very clear and so helpful.
  (0, 3177)	2
  (0, 5424)	1
  (0, 4476)	1
  (0, 5638)	1
  (0, 7976)	1
  (0, 991)	1
  (0, 3652)	1
  (0, 6231)	1
  (0, 4228)	1
  (0, 6631)	1
  (0, 8052)	1
  (0, 4277)	1
  (0, 4162)	1
  (0, 5549)	1
  (0, 1119)	1
  (0, 8408)	1
  (0, 611)	1
  (0, 4606)	1
  (0, 5513)	1
  (0, 8653)	1
  (0, 5168)	1
  (0, 8665)	1
  (0, 7383)	1
  (0, 8497)	1
  (0, 1483)	1
  (0, 423)	1
  (0, 7280)	1
  (0, 3728)	1


# Classification

Linear SVM

In [131]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Decision Tree

In [128]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)
 
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

#### Logistic Regresion

In [106]:
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()

clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

In [107]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8124242424242424
0.7648484848484849
0.8409090909090909


In [132]:
# F1 Scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
##f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
# f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

array([0.8028169 , 0.79310345])

In [76]:
train_y.count(Sentiment.POSITIVE)

5611

In [134]:
test_set = ['Great','I throughly enjoyed this, 5 stars', 'do not buy', 'horrible']

new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')