In [1]:
import random
class sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Reviews:
    def __init__(self,text,score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2 :
            return sentiment.NEGATIVE
        elif self.score == 3:
            return sentiment.NEUTRAL
        else:
            return sentiment.POSITIVE
        
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in  self.reviews]
        
    def get_evenly(self):
        negative = list(filter(lambda x: x.sentiment == "NEGATIVE",self.reviews))
        positive = list(filter(lambda x: x.sentiment == "POSITIVE",self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk 
        random.shuffle(self.reviews) #shuffle so that negative and positive are not serially aligned
        

In [2]:
import json

file_name = './Books_small_10000.json'
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Reviews(review['reviewText'],review['overall']))

        
reviews[5].sentiment

'POSITIVE'

In [3]:
#reviews is a list of Objects.


In [4]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(reviews,test_size = 0.33,random_state = 42)

In [5]:
len(train)

6700

In [6]:
#For file small_books
# We want text as an input to our model and the sentiment i.e Positive or negative as result
#train_x = [x.text for x in train]
#train_y = [x.sentiment for x in train]

#test_x = [x.text for x in test]
#test_y = [x.sentiment for x in test]

In [7]:
container = ReviewContainer(train)
container.get_evenly()
len(container.reviews)

872

In [8]:
#Updated for container
from sklearn.model_selection import train_test_split
training,testing = train_test_split(reviews,test_size=0.33,random_state=42)
train_container = ReviewContainer(training)
test_container = ReviewContainer(testing)
len(container.reviews)

872

In [9]:
train_container.get_evenly()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.get_evenly()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(sentiment.NEGATIVE))
print(train_y.count(sentiment.POSITIVE))
print(train_y.count(sentiment.NEUTRAL))

436
436
0


#Bags of Words Vectorizer


In [10]:
#to perform modeling on text we need to turn text into numericl feature vectors and this is done by Bags of Words vectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#This is a good book
#This book is bad
#The Countvectorizer weights each word equally for eg 'This'and 'good' but 'good' is what that defines the sentiment
#hence we use TfidfVectorizer(Term Frequency inverse document frequency)



#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(train_x)

<872x8906 sparse matrix of type '<class 'numpy.float64'>'
	with 53647 stored elements in Compressed Sparse Row format>

In [11]:
#print(train_x[0])
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
#print(train_x_vectors[0].toarray())

#### Classification

Linear SVM

In [12]:
from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(train_x_vectors,train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
test_x[0]
## As we can see the review is positive, lets see what our classifier predicts


"Loved this book.  Hoping my children will read it as well.  Can't wait to start the next one! Highly recommended!"

In [14]:
clf.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decison Tree Classifier

In [15]:
from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [16]:
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(solver='liblinear')
clf_lr.fit(train_x_vectors,train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
clf_lr.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

#### Naive bayes classifier requires data in dense matrix rather than sparse matrix

In [19]:
from sklearn.naive_bayes import GaussianNB
clf_nb = GaussianNB()
clf_nb.fit(train_x_vectors.todense(),train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [20]:
clf_nb.predict(test_x_vectors[0].todense())

array(['NEGATIVE'], dtype='<U8')

#### Evaluation

In [21]:
#Mean Accuracy Score
print("SVC : ",clf.score(test_x_vectors,test_y))
print("Decison Tree Classifier: ",clf_dec.score(test_x_vectors,test_y))
print("Logistic Regression: ",clf_lr.score(test_x_vectors,test_y))
print("Naive_bayes classifier: ",clf_nb.score(test_x_vectors.todense(),test_y))

SVC :  0.8076923076923077
Decison Tree Classifier:  0.6730769230769231
Logistic Regression:  0.8028846153846154
Naive_bayes classifier:  0.6610576923076923


In [22]:
# f1-score 
from sklearn.metrics import f1_score
print("[POSITIVE,NEUTRAL,NEGATIVE]")
print("SVC F1_score: ",f1_score(test_y,clf.predict(test_x_vectors),average=None,labels=[sentiment.POSITIVE,sentiment.NEUTRAL,sentiment.NEGATIVE]))
print("Decision Tree F1_score: ",f1_score(test_y,clf_dec.predict(test_x_vectors),average=None,labels=[sentiment.POSITIVE,sentiment.NEUTRAL,sentiment.NEGATIVE]))
print("Logistic Regression F1 score: ",f1_score(test_y,clf_lr.predict(test_x_vectors),average=None,labels=[sentiment.POSITIVE,sentiment.NEUTRAL,sentiment.NEGATIVE]))
print("Naive_bayes F1 score: ",f1_score(test_y,clf_nb.predict(test_x_vectors.todense()),average=None,labels=[sentiment.POSITIVE,sentiment.NEUTRAL,sentiment.NEGATIVE]))


[POSITIVE,NEUTRAL,NEGATIVE]


  _warn_prf(


SVC F1_score:  [0.80582524 0.         0.80952381]
Decision Tree F1_score:  [0.66169154 0.         0.68372093]
Logistic Regression F1 score:  [0.80097087 0.         0.8047619 ]
Naive_bayes F1 score:  [0.65693431 0.         0.66508314]


  _warn_prf(


In [23]:
train_y.count(sentiment.POSITIVE)
#out of 670 552 are positive hence the data is unbalannced

436

In [24]:
train_y.count(sentiment.NEGATIVE)

436

In [25]:
test_set = ["I really enjoyed the show!!","That was awfull","good"]
new_test = vectorizer.transform(test_set)
clf.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

In [26]:
test_set1 = ['This']
newtest1 = vectorizer.transform(test_set1)
clf.predict(newtest1)
#the word this has no significance inn determining sentiment, words like good, bad determine the sentiment.

array(['NEGATIVE'], dtype='<U8')

#### Tuning our model with GridSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV
#we saw the SVC was predicting good so applying GridSearchCV on it
parameters = {'kernel':('linear','rbf'),'C':(1,3,5,7)}
svc = SVC()
clf_gscv = GridSearchCV(svc,parameters,cv=5)
clf_gscv.fit(train_x_vectors,train_y)








GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 3, 5, 7), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [28]:
print(clf_gscv.score(test_x_vectors,test_y))

0.8197115384615384


#### Saving our model

In [29]:
import pickle
 
with open("./models/sentiment_classifier.pkl",'wb') as f:
    pickle.dump(clf_gscv,f)

In [34]:
pickle.dump(vectorizer,open("./models/model_vectorizer.pickle",'wb'))

#### Load our model

In [31]:
with open("./models/sentiment_classifier.pkl",'rb') as f:
    loaded_clf = pickle.load(f)

In [33]:
ouput =loaded_clf.predict(vectorizer.transform(['This was really good']))
ouput[0]

'POSITIVE'