## LOAD DATA

In [84]:
import random

class Sentiment:
    """Namespace of the string labels used for the three sentiment classes."""

    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"

class Review:
    """One review: its text, its numeric score, and the label derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed once here; changing `score` afterwards does not refresh it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the label for `self.score`.

        A score of exactly 3 is NEUTRAL, a score <= 2 is NEGATIVE, and any
        other score is POSITIVE.
        """
        rating = self.score
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.NEGATIVE if rating <= 2 else Sentiment.POSITIVE

class ReviewContainer:
    """Wraps a list of reviews and exposes their texts and labels as parallel lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the container's current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance to equal numbers of NEGATIVE and POSITIVE reviews, then shuffle.

        Replaces `self.reviews` with a new list; the list originally passed in
        is not modified. Reviews with any other label (e.g. NEUTRAL) are dropped.

        Args:
            seed: optional seed for a private random generator, giving a
                repeatable order. When None (the default) the module-level
                `random` generator is used, as before.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes to the size of the smaller one. Trimming only the
        # positives leaves the data unbalanced whenever negatives are the majority.
        per_class = min(len(negative), len(positive))
        balanced = negative[:per_class] + positive[:per_class]

        shuffler = random if seed is None else random.Random(seed)
        shuffler.shuffle(balanced)
        self.reviews = balanced
        

In [85]:
import json

file_name = 'books_small_10000.json'

# The file is read as JSON Lines: one review object per line.
# Decode explicitly as UTF-8; without `encoding`, open() falls back to the
# platform locale, so the same file can load on one machine and fail on another.
# NOTE(review): assumes the dataset is UTF-8 (or plain ASCII) - confirm.
reviews = []
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # skip blank lines instead of crashing in json.loads
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[0].sentiment

'POSITIVE'

## PREPARE DATA

In [86]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state makes the split repeatable.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [87]:
# Peek at the raw text of one training review.
print(train[1].text)

Perhaps one of the funniest, yet saddest stories I have ever read.  I cannot imagine that a family could possibly be so dysfunctional and base; it is beyond my comprehension, yet I was riveted and hoped for the best...did it come, read it and find out.


In [88]:
# Wrap each split so texts and labels can be pulled out as parallel lists.
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

#### Balance the classes, then split texts and sentiments into x and y for both the train and test sets.

In [102]:
# evenly_distribute() shuffles with the module-level `random` generator.
# Seed it so the row order - and therefore every `[0]` example printed further
# down - is the same on each Restart & Run All.
# Re-running only this cell reshuffles the already-shuffled lists; re-create the
# containers first to get the same order again.
random.seed(42)

train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Confirm the training labels are balanced.
print(f"POSITIVE: {train_y.count(Sentiment.POSITIVE)}")
print(f"NEGATIVE: {train_y.count(Sentiment.NEGATIVE)}")

POSITIVE: 436
NEGATIVE: 436


### Bag-of-words vectorization (TF-IDF weighted)

##### Docs:
1. https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html?highlight=bags%20words

2. https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html?highlight=count%20vectorizer#sklearn.feature_extraction.text.CountVectorizer

In [128]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF weighted bag-of-words features.
vectorizer = TfidfVectorizer()


# Fit on the training texts only, then encode them.
train_x_vectors = vectorizer.fit_transform(train_x)  # float TF-IDF weights, not 0/1 flags
# Equivalent two-step form:
# vectorizer.fit(train_x)
# train_x_vectors = vectorizer.transform(train_x)

# Encode the test texts with the already-fitted vectorizer (no re-fitting on test data).
test_x_vectors = vectorizer.transform(test_x)

# Show one raw review followed by its vector as a dense array.
print(train_x[0])
print(train_x_vectors[0].toarray())

I was sent a copy from the publisher through NetGally.Samantha Holt does it again!! I love Samantha's work and love that she is expanding her genera to include completely different elements to her work.I LOVED Eden's Fire. This romantic paranormal story is based around a young widow named Eden. She hasn't had a great life in the last few years and is down on her luck. She prays to the Gods one night for help and to her surprise the God of Fire, Tyondric, standing in her cottage. He is completely breathtaking and helps her start the fire and warm the bath water. She is completely mesmerized by Ty and is sad to learn she will never see him again... Or will she?Tyondric is a God and is bound to help human kind and help save them. Although he might be in for a world of shock when he meets the beautiful Eden. He cannot keep his thoughts away from Eden and she is about to turn his world upside down.Eden's Fire is an amazing journey that will have you laughing, rooting for these two to be tog

# Classification
Choosing the classifier

### Linear SVM

In [144]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the vectorized training texts.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)  # training the model

SVC(kernel='linear')

In [130]:
# Predict the label of the first test review (row 0 of the test matrix).
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Trees

In [131]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, trained on the same features.
# NOTE(review): no random_state is passed, so the fitted tree (and its scores) may vary between runs.
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [132]:
# Predict the label of the first test review (row 0 of the test matrix).
clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes

In [133]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes. The features are converted with .toarray() because
# GaussianNB needs a dense array rather than the sparse matrix the vectorizer returns.
clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

GaussianNB()

In [134]:
# Predict the label of the first test review; densified to match how the model was trained.
clf_gnb.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

### Logistic Regression

In [135]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the same features.
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

LogisticRegression()

In [136]:
# Predict the label of the first test review (row 0 of the test matrix).
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Evaluation

### Mean Accuracy

In [137]:
# Mean accuracy of each classifier on the test set.
# GNB is scored on the dense form of the features, matching how it was trained.
print(f"SVM: {clf_svm.score(test_x_vectors, test_y)}")
print(f"DEC: {clf_dec.score(test_x_vectors, test_y)}")
print(f"GNB: {clf_gnb.score(test_x_vectors.toarray(), test_y)}")
print(f"LOG: {clf_log.score(test_x_vectors, test_y)}")

SVM: 0.8076923076923077
DEC: 0.6225961538461539
GNB: 0.6610576923076923
LOG: 0.8052884615384616


### F1 Score

In [138]:
from sklearn.metrics import f1_score

# Per-class F1 (average=None) reported in this fixed order: POSITIVE, NEUTRAL, NEGATIVE.
f1_labels = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]

# A class with no true and no predicted samples has an undefined F1. With the
# default zero_division="warn" scikit-learn reports 0 and emits a warning per call.
# zero_division=0 reports the same 0 explicitly and silently.
print(f"SVM: {f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=f1_labels, zero_division=0)}")
print(f"DEC: {f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=f1_labels, zero_division=0)}")
print(f"GNB: {f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=f1_labels, zero_division=0)}")
print(f"LOG: {f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=f1_labels, zero_division=0)}")


SVM: [0.80582524 0.         0.80952381]
DEC: [0.62884161 0.         0.61613692]
GNB: [0.65693431 0.         0.66508314]
LOG: [0.80291971 0.         0.80760095]


  _warn_prf(
  _warn_prf(
  _warn_prf(
  _warn_prf(


#### Analyze the data

In [139]:
# Count how many training labels fall into each sentiment class.
print(f"Total: {len(train_y)}")
print(f"Positive: {train_y.count(Sentiment.POSITIVE)}")
print(f"Negative: {train_y.count(Sentiment.NEGATIVE)}")
print(f"Neutral: {train_y.count(Sentiment.NEUTRAL)}")

Total: 872
Positive: 436
Negative: 436
Neutral: 0


## Qualitative Analysis

In [140]:
# Hand-written phrases, encoded with the already-fitted vectorizer.
test_set = ['not great', 'bad book do not buy', 'good']
new_test = vectorizer.transform(test_set)

# Same labels as the evaluation cells ("GNB" was previously misspelled "GBN").
# GNB gets the dense form of the features, matching how it was trained.
print(f"SVM: {clf_svm.predict(new_test)}")
print(f"DEC: {clf_dec.predict(new_test)}")
print(f"GNB: {clf_gnb.predict(new_test.toarray())}")
print(f"LOG: {clf_log.predict(new_test)}")

SVM: ['NEGATIVE' 'NEGATIVE' 'POSITIVE']
DEC: ['POSITIVE' 'NEGATIVE' 'POSITIVE']
GBN: ['NEGATIVE' 'NEGATIVE' 'NEGATIVE']
LOG: ['POSITIVE' 'NEGATIVE' 'POSITIVE']


### Tuning our model (with Grid Search)

In [146]:
from sklearn.model_selection import GridSearchCV

# Search grid: 2 kernels x 5 values of C = 10 candidate settings.
parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

# Evaluate every candidate with 5-fold cross-validation on the training data.
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [148]:
# Accuracy of the tuned SVM (the grid-search result) on the test set.
print(f"SVM: {clf.score(test_x_vectors, test_y)}")

SVM: 0.8197115384615384


## Saving our model

#### Save model

In [149]:
import os
# Show the working directory that the relative './Model/...' paths below resolve against.
os.getcwd()

'C:\\MyFiles\\Projects\\Sentiment Analysis'

In [150]:
import os
import pickle

# Create the target folder first: open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./Model', exist_ok=True)

# NOTE(review): only the fitted grid-search classifier is saved. The fitted
# `vectorizer` is not, so this file alone cannot classify raw text.
with open('./Model/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

#### Load model

In [151]:
# Read the classifier back from the file written by the save step.
# SECURITY: pickle.load can execute arbitrary code; only load pickle files you created or trust.
with open('./Model/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [154]:
# Sanity check: show the first test review and its true label,
# then the reloaded model's prediction for that same review.
print(test_x[0])
print(test_y[0])

loaded_clf.predict(test_x_vectors[0])

Recommended by a friend as &#34;one of the best books I have read.&#34;  I would not go that far but it is a page turner and very touching story.  Delightful read.
POSITIVE


array(['POSITIVE'], dtype='<U8')