## Data Class ##

In [35]:
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'



class Review:
    
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score ==3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    
    def __init__(self,reviews):
        self.reviews = reviews
    
    def get_text(self):
        return[x.text for x in self.reviews]
    
    def get_sentiment(self):
        return[x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)] 
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
    
    
     


## Load Data ##

In [36]:
import json

file_name = './sklearn-master/data/sentiment/Books_small_10000.json'


with open(file_name) as f:
    
    reviews = []
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
        
    

In [45]:
reviews[45].sentiment


'POSITIVE'

## Prep Data ##

### Splitting the dataset ###

In [37]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size = 0.33, random_state = 42) 

training_cont = ReviewContainer(training)

test_cont = ReviewContainer(test)




In [59]:
training_cont.evenly_distribute()
train_x = training_cont.get_text()
train_y = training_cont.get_sentiment()

test_cont.evenly_distribute()
test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

### Bag of words vectorization ###

In [81]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

In [82]:
print(train_x[0])
print(train_x_vectors[0])



If you love owls this is a great book. There are lots of beautiful pictures of owls in action. There is wonderful information to learn and the author has tons of experience as an expert. Very happy with this!
  (0, 8760)	0.07421499687856961
  (0, 3625)	0.18050129175492904
  (0, 8497)	0.09806400637321677
  (0, 2868)	0.26455692242053824
  (0, 416)	0.09556743631521478
  (0, 569)	0.09374858732299207
  (0, 2861)	0.18211963703882597
  (0, 8078)	0.28063116262372156
  (0, 3652)	0.10617385773663535
  (0, 660)	0.10715885136834707
  (0, 7929)	0.04739299890952387
  (0, 423)	0.05013603771081082
  (0, 4600)	0.17600849453734507
  (0, 8052)	0.05152085724393126
  (0, 4101)	0.18741334946139915
  (0, 8781)	0.17327334429528585
  (0, 205)	0.17745025175969675
  (0, 4034)	0.06504419519402634
  (0, 5868)	0.2034875896645825
  (0, 791)	0.20075243942252327
  (0, 5478)	0.17301860941493563
  (0, 4778)	0.19352449196288005
  (0, 525)	0.09538175000727388
  (0, 7951)	0.20536773531112873
  (0, 991)	0.058249519306755024

## Classification ##

### SVM ###

In [83]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree ###

In [84]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes ###

In [85]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

clf_gnb.fit(train_x_vectors.toarray(), train_y)

clf_gnb.predict(test_x_vectors[0].toarray())



array(['NEGATIVE'], dtype='<U8')

### Logistic Regression ###

In [86]:
from sklearn.linear_model import LogisticRegression

clf_lg = LogisticRegression()

clf_lg.fit(train_x_vectors, train_y)

clf_lg.predict(test_x_vectors[0])





array(['POSITIVE'], dtype='<U8')

## Evaluation ##

### Mean Accuracy ###

In [87]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
print(clf_lg.score(test_x_vectors, test_y))

0.8076923076923077
0.6322115384615384
0.6610576923076923
0.8052884615384616


### F1 score ###

In [88]:
from sklearn.metrics import f1_score

print(f1_score(test_y,clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
# print(f1_score(test_y,clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
# print(f1_score(test_y,clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
# print(f1_score(test_y,clf_lg.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

[0.80582524 0.         0.80952381]


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [69]:
train_y.count('NEGATIVE')

436

### Qualitative evaluation ###

In [89]:
test_set = ['awesome', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning our model (GridSearch) ###

In [93]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)
clf.cv_results_






{'mean_fit_time': array([0.14858041, 0.1511272 , 0.15023031, 0.15259829, 0.15030336,
        0.15245337, 0.14999604, 0.1524929 , 0.15013065, 0.152596  ]),
 'std_fit_time': array([0.01263306, 0.00093014, 0.00107627, 0.00107753, 0.00094869,
        0.0009749 , 0.00101457, 0.00095868, 0.00105039, 0.00107546]),
 'mean_score_time': array([0.03127947, 0.03644471, 0.03081455, 0.03659735, 0.03082266,
        0.03653007, 0.03081894, 0.03669171, 0.03084302, 0.03659673]),
 'std_score_time': array([0.00035962, 0.00018968, 0.00037504, 0.00029284, 0.00026099,
        0.00029947, 0.00034683, 0.00025071, 0.00030068, 0.00018528]),
 'param_C': masked_array(data=[1, 1, 4, 4, 8, 8, 16, 16, 32, 32],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                    'linear', 'rbf', 'linear', 'rbf'],
       

In [98]:
import pandas as pd
df = pd.DataFrame(clf.cv_results_)

In [99]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.83598
1,1,rbf,0.822273
2,4,linear,0.819967
3,4,rbf,0.824539
4,8,linear,0.818818
5,8,rbf,0.824539
6,16,linear,0.818818
7,16,rbf,0.824539
8,32,linear,0.818818
9,32,rbf,0.824539


In [100]:
print(clf.score(test_x_vectors, test_y))

0.8076923076923077


## Save Model ##

In [101]:
import pickle

with open ('./sklearn-master/models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf,f)

## Load Model ##

In [102]:
with open ('./sklearn-master/models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [103]:
print(test_x[10])

loaded_clf.predict(test_x_vectors[10])

This odd and pretentious novel is based on the true case of an innocent man who falsely confessed to a series of homicides. The nation is on edge in the wake of a series of mysterious disappearances. The targets, all older, solitary sorts, vanish and their presumed abductor leaves nary a clue but for a marked playing card. Oda Sotatsu is a young man living a life both unfulfilling and uninteresting. That is until he meets a troublesome couple, the supposedly charismatic Sato Kakuzo and his girlfriend, the alluring,Jito Joo. Clearly disturbed, they play games and place wagers where the loser has to physically harm himself. They attach to Oda, inducing him into  a wager after plying him with alcohol. After losing the game, he signs a detailed confession admitting culpability in the disappearances. Joo delivers the confession to the police and Oda is soon arrested, imprisoned, abused, tried and convicted. He is subsequently sentenced to death by hanging and executed, remaining silent thro

array(['NEGATIVE'], dtype='<U8')