In [1]:
import random
class Sentiment:
    """String constants naming the sentiment classes assigned to reviews."""

    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    """One review: its text, its numeric score and the sentiment derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed once at construction time from the score stored above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label for this review's score.

        A score of 2 or less is NEGATIVE, a score equal to 3 is NEUTRAL,
        and any other score is POSITIVE.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a list of review objects and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the container's current order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self):
        """Rebalance the container in place to equal NEGATIVE and POSITIVE counts.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Whichever
        of the two classes is larger is truncated to the size of the smaller
        one, then the kept reviews are shuffled with the global ``random``
        module (seed it beforehand for a reproducible order).
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: cutting only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

In [2]:
import json
# NOTE(review): absolute, machine-specific path; point this at your own copy of the dataset.
file_name = 'E:\\MyCodes\\KeithG\\sklearn-master\\data\\sentiment\\Books_small_10000.json'
reviews = []
# The file is read as JSON Lines: one JSON object per line.
# Decode explicitly as UTF-8 (the standard JSON encoding; assumes the dataset
# follows it) instead of relying on the platform default, which on Windows is
# a locale code page.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline), which json.loads rejects.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        


In [3]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the reviews for testing; random_state pins the split itself.
training, test = train_test_split(reviews, test_size=.33, random_state=42)
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)
# evenly_distribute() shuffles with the global `random` module, which
# random_state above does not control; seed it so the row order is
# reproducible across Restart & Run All as well.
random.seed(42)
train_container.evenly_distribute()
test_container.evenly_distribute()

In [4]:
# Texts (x) and sentiment labels (y) for each split.
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Class balance of the training labels: positives first, then negatives.
print(
    train_y.count(Sentiment.POSITIVE),
    train_y.count(Sentiment.NEGATIVE),
    sep="\n",
)

436
436


In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Fit the TF-IDF vocabulary/weights on the training texts only, then reuse the
# fitted vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)


###### Classifiers (each stored in a `clf_*` variable) ######

In [6]:
from sklearn import svm
# Linear-kernel support vector classifier trained on the TF-IDF training vectors.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Spot check: predicted label for the first row of the test matrix.
clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

In [7]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree on the TF-IDF training vectors. Tree construction is
# randomised, so fix random_state to get the same tree (and the same score)
# on every run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Spot check: predicted label for the first row of the test matrix.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [8]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes; the sparse TF-IDF matrices are converted to dense
# arrays with .toarray() before being passed in.
clf_gnb = GaussianNB().fit(train_x_vectors.toarray(), train_y)

# Spot check: predicted label for the first row of the test matrix.
clf_gnb.predict(test_x_vectors[0].toarray())


array(['POSITIVE'], dtype='<U8')

In [9]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings on the TF-IDF training vectors.
clf_lr = LogisticRegression().fit(train_x_vectors, train_y)

# Spot check: predicted label for the first row of the test matrix.
clf_lr.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [10]:
# score() of each classifier on the test split, one value per line in the
# order: SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(
    clf_svm.score(test_x_vectors, test_y),
    clf_dec.score(test_x_vectors, test_y),
    clf_gnb.score(test_x_vectors.toarray(), test_y),  # GaussianNB is given dense input
    clf_lr.score(test_x_vectors, test_y),
    sep="\n",
)

0.8076923076923077
0.6538461538461539
0.6610576923076923
0.8052884615384616


In [11]:
from sklearn.metrics import f1_score
# Per-class F1 of the linear SVM on the test split, reported in the order
# given by `labels` (positive first, then negative).
f1_score(
    test_y,
    clf_svm.predict(test_x_vectors),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)

array([0.80582524, 0.80952381])

In [12]:
# Hand-written phrases for a quick qualitative check of the linear SVM.
test_set = [
    "terrible book",
    "good book please  buy",
    "terrific use of time",
]
# New text must go through the already-fitted vectorizer before prediction.
new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

In [13]:
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the SVC: two kernels crossed with five values of C.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': (1, 4, 8, 16, 32),
}
svc = svm.SVC()
# 5-fold cross-validated search over the grid, fitted on the training vectors.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [14]:
from sklearn.metrics import f1_score
# Per-class F1 of the grid-searched model on the test split, reported in the
# order given by `labels` (positive first, then negative).
f1_score(
    test_y,
    clf.predict(test_x_vectors),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)

array([0.82269504, 0.81662592])

In [15]:
# Hand-written phrases for a quick qualitative check of the grid-searched model.
test_set = [
    "terrible book",
    "best book please  buy",
    "terrific use of time",
]
# New text must go through the already-fitted vectorizer before prediction.
new_test = vectorizer.transform(test_set)
clf.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

In [16]:
import pickle
# Persist the grid-searched classifier.
# NOTE(review): absolute, machine-specific location; adjust for your environment.
with open('C:\\Users\\WindowsX\\Desktop\\Jupyter\\models\\sentiment_classifier.pkl','wb') as f:
    pickle.dump(clf,f)

# The classifier only accepts vectors produced by the fitted `vectorizer`, so
# save that next to it; without it the pickled model cannot be applied to raw
# text in a fresh session.
with open('C:\\Users\\WindowsX\\Desktop\\Jupyter\\models\\sentiment_vectorizer.pkl','wb') as f:
    pickle.dump(vectorizer,f)

In [17]:
# Read the classifier back from disk.
# NOTE(review): pickle.load executes code embedded in the file; only load
# pickles you created yourself or otherwise trust.
with open('C:\\Users\\WindowsX\\Desktop\\Jupyter\\models\\sentiment_classifier.pkl', 'rb') as model_file:
    loaded_clf = pickle.load(model_file)

In [18]:
# Round-trip check: the unpickled model is applied to the same `new_test`
# vectors that were passed to the in-memory `clf` earlier.
loaded_clf.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')