In [2]:
# Colab-only: opens the browser upload widget; the chosen file is saved into the runtime's working directory.
from google.colab import files
# The return value is kept in `uploaded`, but no later cell shown here reads it.
uploaded = files.upload()

Saving Books_small_10000.json to Books_small_10000.json


**Data Class**

In [127]:
import random
class Sentiment:
    """String labels for the three sentiment classes.

    Each constant's value equals its own name, so labels read the same
    whether they are printed, counted, or returned by a classifier.
    """

    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"
class Review:
    """A single review: its text, its numeric score, and the derived sentiment label.

    Attributes:
        text: The review body.
        score: Numeric rating of the review.
        sentiment: A ``Sentiment`` label derived from ``score`` at construction time.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed once here; it is not refreshed if ``score`` is reassigned later.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Returns:
            ``Sentiment.NEGATIVE`` for scores <= 2, ``Sentiment.NEUTRAL`` for
            scores in (2, 3], and ``Sentiment.POSITIVE`` for anything above 3.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # ``<= 3`` rather than ``== 3``: a fractional score such as 2.5 must not
        # fall through to the positive branch.
        elif self.score <= 3:
            return Sentiment.NEUTRAL
        else:  # score above 3 (4 or 5 on a five-star scale)
            return Sentiment.POSITIVE

class ReviewContainer:
    """Holds a list of review objects and exposes their texts and labels as parallel lists.

    Each contained object is expected to provide ``text`` and ``sentiment`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in the container's current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in the container's current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance to equal numbers of NEGATIVE and POSITIVE reviews, then shuffle.

        Neutral reviews are dropped. Whichever of the two remaining classes is
        larger is truncated to the size of the smaller one. ``self.reviews`` is
        rebound to a new list; the list passed to the constructor is not mutated.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes to the smaller count; truncating only the positives
        # leaves the set unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
        



**Dataset Loading**

In [16]:
import json

# The file is read as JSON Lines: one JSON object per line. Only the review
# text and the overall rating are kept from each record.
file_name = 'Books_small_10000.json'
reviews = []
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, which json.loads would reject
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
# Spot-check the derived label of one parsed review.
reviews[50].sentiment

'NEGATIVE'

In [19]:
# Sanity check: number of reviews parsed from the file.
len(reviews)

10000

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state keeps the split repeatable across runs.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)


**Training**

In [95]:
# Training split: wrap, balance the classes, then pull out texts and labels.
train_container = ReviewContainer(training)
train_container.evenly_distribute()
train_x, train_y = train_container.get_text(), train_container.get_sentiment()

# Test split: same treatment, balanced after the training split.
test_container = ReviewContainer(test)
test_container.evenly_distribute()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Class counts of the balanced training labels, one per line: positive, negative, neutral.
for label in (Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL):
    print(train_y.count(label))

436
436
0


**Vectorization**

In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Exploratory step: learn a vocabulary from the training texts and display the resulting count matrix.
# The matrix is only displayed, not assigned, and `vectorizer` is rebound to a TfidfVectorizer in the next cell.
vectorizer =CountVectorizer()
vectorizer.fit_transform(train_x)

<3300x18643 sparse matrix of type '<class 'numpy.int64'>'
	with 211314 stored elements in Compressed Sparse Row format>

In [47]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fit on the training texts only and keep the resulting training matrix.
train_x_vectors = vectorizer.fit_transform(train_x)

# transform (not fit_transform): the test texts are encoded with the vectorizer already fitted on the training texts.
test_x_vectors = vectorizer.transform(test_x)
 
# Show one training review next to its dense vector representation.
print(train_x[1])
print(train_x_vectors[1].toarray())

The story told here is powerful yet softly delivered with beautiful imagery of orchids delivered at crucial moments in the heroine's life.  What did they mean?  I was hooked and had to keep reading.  Great for romantics who love a nice read!
[[0. 0. 0. ... 0. 0. 0.]]


**Classification**

Linear SVM
 

In [50]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the vectorized training texts.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

# Smoke test: predict the label of the first test review.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

Decision Tree

In [57]:
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator: tree construction involves randomness, so without a fixed
# random_state the fitted tree (and its test score) can differ from run to run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Smoke test: predict the label of the first test review.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

Logistic Regression

In [72]:
from sklearn.linear_model import LogisticRegression

# Build and train in a single expression; fit() hands back the estimator itself.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Smoke test: predict the label of the first test review.
clf_log.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

**Evaluation**

In [78]:
# Mean accuracy on the test vectors, printed in the order: SVM, decision tree, logistic regression.
for fitted_model in (clf_svm, clf_dec, clf_log):
    print(fitted_model.score(test_x_vectors, test_y))

0.8076923076923077
0.6370192307692307
0.8052884615384616


**F1 Score**


In [86]:
from sklearn.metrics import f1_score

# Predict once, then score per class; average=None yields one F1 value per entry in `labels`.
svm_test_predictions = clf_svm.predict(test_x_vectors)
f1_score(
    test_y,
    svm_test_predictions,
    average=None,
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
)


array([0.80582524, 0.80952381])

In [105]:
# Re-check the number of positive labels in the balanced training set.
train_y.count(Sentiment.POSITIVE)

436

In [101]:
# Spot-check the linear SVM on a few hand-written phrases.
test_set = ['very fun', 'Wonderful', 'horrible waste of time']
# transform (not fit): the phrases are encoded with the already-fitted `vectorizer`.
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

**Grid Search**

In [107]:
from sklearn.model_selection import GridSearchCV

# Search space: 2 kernels x 5 values of C = 10 candidate settings.
parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
# cv=5: each candidate setting is evaluated with 5-fold cross-validation on the training vectors.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [108]:
# Score the grid-searched model on the test vectors.
# NOTE(review): the recorded 0.548 is far below the 0.808 recorded for the plain linear SVM earlier in this notebook.
# The cells' execution counts are out of order, so this output may reflect stale kernel state — re-run top to bottom to confirm.
print(clf.score(test_x_vectors,test_y))

0.5480769230769231
