In [1]:
import random
class Sentiment:
    """Namespace holding the two sentiment labels used throughout the notebook."""

    # The labels are plain strings; each attribute's value equals its own name.
    NEGATIVE, POSITIVE = "NEGATIVE", "POSITIVE"

class Review:
    """A single review: its text, its numeric score, and the label derived from the score."""

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and compute ``sentiment`` once, up front."""
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return ``Sentiment.POSITIVE`` for a score of 3 or more, else ``Sentiment.NEGATIVE``."""
        # A score of exactly 3 counts as positive; there is no neutral class.
        return Sentiment.POSITIVE if self.score >= 3 else Sentiment.NEGATIVE
class ReviewContainer:
    """Holds a list of reviews and exposes their texts and labels as parallel lists."""

    def __init__(self, reviews):
        """Wrap ``reviews``, a list of objects exposing ``text`` and ``sentiment``."""
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the container's current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Downsample to an equal number of negative and positive reviews, then shuffle.

        ``self.reviews`` is replaced by a new list; the list originally passed
        to the constructor is not modified.

        Args:
            seed: Optional seed for a private ``random.Random`` used for the
                shuffle, making the resulting order reproducible. When ``None``
                (the default) the module-level ``random.shuffle`` is used.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Truncate BOTH classes to the size of the smaller one. Truncating only
        # the positives leaves the data unbalanced whenever negatives are the
        # majority class.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]

        shuffler = random if seed is None else random.Random(seed)
        shuffler.shuffle(self.reviews)
        

## LOAD DATA

In [2]:
import json
file_name = "books_small_10000.json"
reviews = []
# Each line of the file is parsed as its own JSON object. The encoding is
# pinned to UTF-8 so the result does not depend on the platform default.
with open(file_name, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))

## PREP DATA

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the reviews for testing; the fixed random_state keeps the split repeatable.
train, test = train_test_split(reviews, random_state=42, test_size=0.33)
train_container, test_container = ReviewContainer(train), ReviewContainer(test)

In [4]:
# evenly_distribute() shuffles with the global `random` generator. Seed it so
# the order of the balanced sets is the same on every Restart & Run All.
random.seed(42)
train_container.evenly_distribute()
test_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Sanity check: the two class counts are expected to match after balancing.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): despite its name, `count_vect` holds a TfidfVectorizer, not a
# CountVectorizer. The name is kept because later cells reference it.
count_vect = TfidfVectorizer()
# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer and never used for fitting.
train_x_vectors = count_vect.fit_transform(train_x)
test_x_vectors = count_vect.transform(test_x)

## CLASSIFICATION

In [6]:
# from sklearn import svm
# clf_svm = svm.SVC(kernel = "linear")
# clf_svm.fit(train_x_vectors, train_y)


In [7]:
# clf_svm.predict(test_x_vectors[0])


## Decision Tree

In [8]:
# from sklearn.tree import DecisionTreeClassifier
# clf_dec = DecisionTreeClassifier()
# clf_dec.fit(train_x_vectors, train_y)
# clf_dec.predict(test_x_vectors[0])

## Naive Bayes

In [9]:
# from sklearn.naive_bayes import GaussianNB
# clf_NB = GaussianNB()
# clf_NB.fit(train_x_vectors.toarray(), train_y)
# clf_NB.predict(test_x_vectors[0].toarray())


## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so construction and training are chained.
clf_lr = LogisticRegression().fit(train_x_vectors, train_y)
# Spot check: predict the label of the first test review.
clf_lr.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

## Evaluation

In [11]:
# Mean accuracy of the logistic-regression model on the test vectors
accuracy = clf_lr.score(test_x_vectors, test_y)
print(accuracy)

0.7740384615384616


In [12]:
#F1 Score
from sklearn.metrics import f1_score
# average=None yields one F1 value per label, in the order given by `labels`.
test_predictions = clf_lr.predict(test_x_vectors)
label_order = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
print(f1_score(test_y, test_predictions, average=None, labels=label_order))


[0.77294686 0.77511962]


## Qualitative Testing

In [13]:
# Hand-written sentences for a quick qualitative check of the trained model.
test_set = [
    "I enjoy this",
    "This is insanley incredible",
    "I do not like it",
    "Its the worst",
    "I slept after reading one page",
]
# Reuse the vectorizer fitted on the training texts; only transform here.
new_test = count_vect.transform(test_set)
clf_lr.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE'],
      dtype='<U8')