In [310]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"
    
class Review:
    def __init__(self, text, score):
        self.text=text
        self.score=score
        self.sentiment=self.get_sentiment()
        
    def get_sentiment(self):
        if self.score<=2:
            return Sentiment.NEGATIVE
        elif self.score==3:
            return Sentiment.NEUTRAL
        else:#score of 4 or 5
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
    
    def get_text(self):
        return[x.text for x in self.reviews]
    
    def get_sentiment(self):
        return[x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x : x.sentiment==Sentiment.NEGATIVE,self.reviews))
        positive = list(filter(lambda x : x.sentiment==Sentiment.POSITIVE,self.reviews))
        positive_shrink = positive[:len(negative)]
        self.reviews = negative + positive_shrink
        random.shuffle(self.reviews)
        #print(negative[0].text)
        #print(len(negative))
        #print(len(positive))
        

In [210]:
import json 

file_name = r'.\Books_small_10000.json'
reviews=[]
with open(file_name) as f:
    
    for line in f:
        review=json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
reviews[30].text

"My Reflections:The well developed characters, the slow steady build all work together to deliver a tidy little package where mystery and history entwine.I loved the idea of a story centering on the decision, Franklyn Roosevelt devised to help people destroyed by the depression of 1935. His idea was to send families to a remote area of Alaska to colonies and grow the Matanuska Valley. The really superb thing about this book is these two authors use real and fictional characters to develop their narrative.Dr Jeremiah Vaughan's life is destroyed by allegations of abuse. When he uses a ground-breaking IV sedation technique with an influential patient, and the patient dies, the authorities are out for blood. This causes his license to be stripped away. Because of this, his intended and her mother want nothing to do with the shame. A has-been doctor is not what a high society woman wants on her arm. Fleeing from the hurt and rejection, from not only his fiance but also his own parents Jerem

# Preparation of Data

In [283]:
from sklearn.model_selection import train_test_split

training , test =train_test_split(reviews, test_size = 0.33, random_state = 42 )

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

cont.evenly_distribute()

len(cont.reviews)

24064

In [284]:
len(training)

6700

In [285]:
len(test)

3300

In [286]:
print(training[0])

<__main__.Review object at 0x172F2FD0>


In [311]:
train_container.evenly_distribute()
train_x=train_container.get_text()
train_y=train_container.get_sentiment()

test_container.evenly_distribute()
test_x=test_container.get_text()
test_y=test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


# Bag of Words

In [312]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#This book is great !
#This book was bad

#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
#TfidfVectorization improves performance by reducing the value of the repeated words in the document
train_x_vectors=vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
#vectorizer.fit(train_x)
#train_x_vectors = vectorizer.transform(train_x)
print(train_x[0])
print(train_x_vectors[0].toarray())

I like to read the reviews before I download a book, I guess now I am going to have to read about the reviewers before before I read the review before I download the book....I think somebody said  &#34;worth the time&#34;....well apparently their time isn't worth much because in my humble opinion, this was definitely one of the biggest wastes of my time yet....thank goodness it was short. I went back to read the reviews just now, and I'm sitting here shaking my head.....what did I miss??? The only reason I didn't give it one star is I don't want to hate anything. I seldom download short stories, the reviews swayed me.....I won't let that happen again...I mean, one nugget of wisdom could have made it worthwhile....the cover was also a tease....I guess the cover was worth the second star.  Sorry
[[0. 0. 0. ... 0. 0. 0.]]


# Classification
#### Linear svm

In [298]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors,train_y)
test_x[0]
clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Decision Tree

In [299]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)
clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Naive Bayes

In [291]:
from sklearn.naive_bayes import GaussianNB
q=train_x_vectors.todense()
p=test_x_vectors.todense()
clf_gau = GaussianNB()#Memory error, should work in 64 bit though
clf_gau.fit(q,train_y)
clf_gau.predict(p[0])
#we need to pass a dense matrix than a sparse matrix

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [300]:
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[0])
#dont worry about the error that pops up, it is a future error


array(['NEGATIVE'], dtype='<U8')

In [313]:
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_gau.score(p,test_y))
print(clf_log.score(test_x_vectors,test_y))
#Throws a ValueError if the dataset is big

0.8076923076923077
0.6370192307692307
0.5480769230769231
0.8052884615384616


In [304]:
#F1 Scores for the classifiers
from sklearn.metrics import f1_score
print(f1_score(test_y,clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_gau.predict(p), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))
print(f1_score(test_y,clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEGATIVE]))
#Throws a ValueError if the dataset is big

[0.80582524 0.80952381]
[0.63788969 0.63614458]
[0.59574468 0.66666667]
[0.80291971 0.80760095]


In [275]:
test_y.count(Sentiment.POSITIVE)

208

In [303]:
test_set=['I thoroughly enjoyed this', '2 stars', 'not great', 'bad book do not buy', 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE'],
      dtype='<U8')

### Increasing the accuracy of the data with Grid Search

In [320]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [321]:
print(clf.score(test_x_vectors, test_y))

0.8052884615384616


### Saving the Model

In [325]:
import pickle

with open(r'C:\Users\admin\Documents\Vilashnee\College\Learning materials\Machine Learning\sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

#### Load Model

In [326]:
with open(r'C:\Users\admin\Documents\Vilashnee\College\Learning materials\Machine Learning\sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [329]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

I so enjoyed the Silo series and had high hopes for the 2 books y mMr Howey' What a let down . Wil look for his books on the sale shelf next time!these were more Tween books than adult.


array(['POSITIVE'], dtype='<U8')