### Using enum-style constants

In [1]:
import random
class Sentiment:
    """String labels for the three sentiment classes a review can receive."""

    # Each constant's value is its own name, so labels print readably.
    Negative = 'Negative'
    Neutral = 'Neutral'
    Positive = 'Positive'
    
class Review:
    """A single review: its text, its numeric score and the derived sentiment label."""

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and compute ``sentiment`` from the score."""
        self.text = text
        self.score = score
        # Derived once here; changing ``score`` afterwards does not refresh it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the ``Sentiment`` label that corresponds to ``self.score``.

        A score of 2 or less is Negative, a score equal to 3 is Neutral and
        any other score is Positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.Negative
        if rating == 3:
            return Sentiment.Neutral
        return Sentiment.Positive
class ReviewContainer:
    """Holds a list of reviews and can rebalance it by sentiment."""

    def __init__(self, reviews):
        """Keep a reference to ``reviews`` (objects exposing a ``sentiment`` attribute)."""
        self.reviews = reviews

    def evenly_distribute(self):
        """Replace ``self.reviews`` with a shuffled, class-balanced subset.

        The same number of Negative and Positive reviews is kept (the size of
        the smaller class); every other review, Neutral included, is dropped.
        The list passed to the constructor is not modified: ``self.reviews``
        is rebound to a new list.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.Negative]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.Positive]
        # Truncate BOTH classes to the smaller one. Trimming only the positives
        # leaves the data unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

### Import Json File

In [2]:
import json

# Dataset in JSON-lines form: each non-empty line is parsed as one JSON object.
file_name = './Books_small_10000.json'

reviews = []
# State the encoding explicitly instead of relying on the platform default.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) rather than crash in json.loads.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Spot-check: sentiment label derived for the sixth review.
reviews[5].sentiment
        

'Positive'

### Convert text to numerical data

##### Import sklearn and split the data into training and test sets

In [3]:
import random

from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the remaining 67% are used for training.
# train_test_split returns the (training, test) pair in that order.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# evenly_distribute() shuffles with the global `random` module; seed it so the
# shuffled order (and every index-based lookup on these lists) is reproducible.
random.seed(42)
train_container.evenly_distribute()
test_container.evenly_distribute()

train_x = [x.text for x in train_container.reviews]
train_y = [x.sentiment for x in train_container.reviews]

test_x = [x.text for x in test_container.reviews]
test_y = [x.sentiment for x in test_container.reviews]

# Sanity check: number of Positive and Negative training labels.
print(train_y.count(Sentiment.Positive))
print(train_y.count(Sentiment.Negative))



436
436


#### Bag of words 

In [4]:
# Bag-of-words example: "This book is great" vs "This book was bad"
# gives the counts {This:2, book:2, is:1, was:1, great:1, bad:1}.
# Tip: press Shift+Tab in Jupyter to read more about a function.

# TfidfVectorizer weights some words more heavily than others,
# e.g. "great"/"bad" count for more than "this"/"is".

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the training texts and transform them in one step ...
train_x_vectors = vectorizer.fit_transform(train_x)
# ... then reuse the already-fitted vectorizer for the test texts.
test_x_vectors = vectorizer.transform(test_x)
# Debugging aids (uncomment to inspect):
#print(vectorizer.get_feature_names())
#print(train_x[0])
#print(train_x_vectors[0].toarray())

## ***Fit the model on train_x_vectors and train_y***

### Classification

#### Linear SVM
##### https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#

In [18]:
from sklearn import svm
# Kernel choices: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
clf_svm = svm.SVC(kernel='linear')
# Train the linear-kernel SVM on the training vectors and their labels.
clf_svm.fit(train_x_vectors, train_y)
#test_x[0]



SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [19]:
# Predict labels for five test rows (indices 250-254).
clf_svm.predict(test_x_vectors[250:255])

array(['Negative', 'Negative', 'Negative', 'Positive', 'Negative'],
      dtype='<U8')

#### Decision Tree
##### https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html?highlight=decision%20tree#sklearn.tree.DecisionTreeClassifier

In [6]:
from sklearn.tree import DecisionTreeClassifier
# random_state is pinned (presumably so the fitted tree is repeatable between runs).
clf_tree = DecisionTreeClassifier(random_state = 0)
clf_tree.fit(train_x_vectors,train_y)
# Predict labels for five test rows (indices 250-254).
clf_tree.predict(test_x_vectors[250:255])
#test_x[251]
#test_y[251]-->'Positive'

array(['Negative', 'Positive', 'Negative', 'Positive', 'Negative'],
      dtype='<U8')

#### Naive Bayes (GaussianNB needs dense input, not a sparse matrix)

In [7]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB does not work on the sparse TF-IDF matrix, so densify it first.
# toarray() yields a plain ndarray; todense() yields the legacy numpy.matrix
# type, which recent scikit-learn releases reject.
clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)
# Predict labels for five test rows (indices 250-254).
clf_gnb.predict(test_x_vectors[250:255].toarray())
#test_x[123]

array(['Negative', 'Negative', 'Negative', 'Positive', 'Negative'],
      dtype='<U8')

#### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF training vectors.
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

# Predict labels for five test rows (indices 250-254).
clf_log.predict(test_x_vectors[250:255])
#test_y[252]-->Positive

array(['Negative', 'Negative', 'Negative', 'Positive', 'Negative'],
      dtype='<U8')

In [9]:
# True labels for test rows 250-254, for comparison with the predictions.
test_y[250:255]

['Negative', 'Negative', 'Negative', 'Positive', 'Negative']

### Evaluation (more comprehensively) 

In [10]:
# Score each fitted classifier on the test vectors and labels.
print('SVM score:')
print(clf_svm.score(test_x_vectors, test_y))
print('Decision Tree score:')
print(clf_tree.score(test_x_vectors, test_y))
print('Gaussian Naive Bayes score:')
# GaussianNB needs dense input; toarray() gives an ndarray (todense() gives numpy.matrix).
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
# Label corrected: this model is logistic regression, not Naive Bayes.
print('Logistic regression score:')
print(clf_log.score(test_x_vectors, test_y))

SVM score:
0.8076923076923077
Decision Tree score:
0.6442307692307693
Gaussian Naive Bayes score:
0.6610576923076923
Logistic regression Naive Bayes score:
0.8052884615384616


In [11]:
# F1 scores
from sklearn.metrics import f1_score

# Only Positive and Negative reviews remain after balancing, so Neutral is not scored.
# Swap in another classifier to compare, e.g.:
#f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.Positive, Sentiment.Negative])
#f1_score(test_y, clf_tree.predict(test_x_vectors), average=None, labels=[Sentiment.Positive, Sentiment.Negative])
#f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.Positive, Sentiment.Negative])

# Logistic regression was trained on the sparse vectors, so predict on them
# directly; densifying with todense() only produced a numpy.matrix.
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.Positive, Sentiment.Negative])


array([0.80291971, 0.80760095])

In [12]:
# Class balance of the training labels: Positive count first, then Negative.
for label in (Sentiment.Positive, Sentiment.Negative):
    print(train_y.count(label))
#print(train_y.count(Sentiment.Neutral))

436
436


### Tuning our Model

In [13]:
from sklearn.model_selection import GridSearchCV
# Search grid: two kernels crossed with five values of C.
parameters = {'kernel':('linear','rbf'),'C':(1,4,8,16,32)}
svc = svm.SVC()
# Grid search over `parameters` with cv=5.
clf = GridSearchCV(svc,parameters, cv =5)
clf.fit(train_x_vectors,train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [14]:
# Score of the grid-searched classifier on the test vectors and labels.
print(clf.score(test_x_vectors,test_y))

0.8076923076923077


### Saving Model (Using pickle)

In [15]:
import os
import pickle

# open(..., 'wb') does not create missing folders, so make sure ./models exists.
os.makedirs('./models', exist_ok=True)
# NOTE: only the classifier is saved. Predicting on new raw text also needs the
# fitted `vectorizer`, which is not written to this file.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)


In [16]:
# Read the pickled classifier back from disk into `loaded_clf`.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created or trust.
with open ('./models/sentiment_classifier.pkl','rb') as f:
    loaded_clf=pickle.load(f)

In [17]:
# Show one test review's text, then the reloaded model's prediction for its vector.
print(test_x[123])
loaded_clf.predict(test_x_vectors[123])

I haven't read a urban novel in years. This has a good storyline, can't wait to read the sequel! I like the way how the author went into detail, about what the main characters were thinking. I have a feeling Khalil is going to go bonkers on Donte and a few other people.


array(['Positive'], dtype='<U8')