In [111]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import collections
from sklearn.model_selection import GridSearchCV
import pickle

In [79]:
f = open('Books_small_10000.json')

reviews = []

for line in f:
    data = json.loads(line)
    review = data["reviewText"]
    score = data["overall"]
    if score <= 2:
        sentiment = "NEGATIVE"
        reviews.append((review, sentiment))
    elif score > 3:
        sentiment = "POSITIVE"
        reviews.append((review, sentiment))

In [82]:
print("Total:", len(reviews))

print("Positive proportion:", (len([line for line in reviews if line[1] == "POSITIVE"])/len(reviews))*100)

print("Negative proportion:", (len([line for line in reviews if line[1] == "NEGATIVE"])/len(reviews))*100)

Total: 9022
Positive proportion: 92.86189315007759
Negative proportion: 7.138106849922411


In [87]:
target_length = len([line for line in reviews if line[1] == "NEGATIVE"])

reviews_negative = [line for line in reviews if line[1] == "NEGATIVE"]
reviews_positive = [line for line in reviews if line[1] == "POSITIVE"]

reviews_new = reviews_negative + reviews_positive[:target_length]

print("Positive proportion:", (len([line for line in reviews_new if line[1] == "POSITIVE"])/len(reviews_new))*100)

print("Negative proportion:", (len([line for line in reviews_new if line[1] == "NEGATIVE"])/len(reviews_new))*100)

Positive proportion: 50.0
Negative proportion: 50.0


In [97]:
## Preprocessing

training, test = train_test_split(reviews_new, test_size = 0.33, random_state=42)

train_x = [line[0] for line in training]
train_y = [line[1] for line in training]

test_x = [line[0] for line in test]
test_y = [line[1] for line in test]

## TfidfVectorizer gives weights to words that are mentioned less -> different to CountVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

In [98]:
## SVM

clf_svm = svm.SVC(kernel = "linear")
clf_svm.fit(train_x_vectors, train_y)
print("Accuracy of SVM:", clf_svm.score(test_x_vectors, test_y))

Accuracy of SVM: 0.8497652582159625


In [99]:
## Decision Tree

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)
print("Accuracy of decision tree:", clf_dec.score(test_x_vectors, test_y))

Accuracy of decision tree: 0.6854460093896714


In [100]:
## Logistic regression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)
print("Accuracy of logistic regression:", clf_log.score(test_x_vectors, test_y))

Accuracy of logistic regression: 0.8333333333333334


In [101]:
model_names = ["SVM", "Decision trees", "Logistic regression"]
y_predicts = [clf_svm.predict(test_x_vectors),
              clf_dec.predict(test_x_vectors),
              clf_log.predict(test_x_vectors)]

for name, y_predict in zip(model_names, y_predicts):
    print(f"{name}:", f1_score(test_y, y_predict, average = None, labels = ["POSITIVE", "NEUTRAL", "NEGATIVE"]))

SVM: [0.84236453 0.         0.85650224]
Decision trees: [0.69124424 0.         0.67942584]
Logistic regression: [0.82808717 0.         0.83826879]


  _warn_prf(
  _warn_prf(
  _warn_prf(


In [72]:
counter = collections.Counter(train_y)

total = sum(counter.values())

for key in counter.keys():
    val = counter[key]
    prop = (val/total)*100
    counter[key] = f'{round(prop,1)}%'
    
counter

Counter({'POSITIVE': '82.4%', 'NEGATIVE': '7.0%', 'NEUTRAL': '10.6%'})

In [109]:
tuple([i for i in range(1,5)])

(1, 2, 3, 4)

In [110]:
parameters = {"kernel": ("linear", "rbf"), "C": tuple([i for i in range(1,5)])}

svc = svm.SVC()

clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 2, 3, 4), 'kernel': ('linear', 'rbf')})

In [113]:
with open("models_test.pkl", "wb") as f:
    pickle.dump(clf_svm, f)

In [114]:
with open ("models_test.pkl", "rb") as f:
    loaded_clf = pickle.load(f)

In [116]:
loaded_clf.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [None]:
## confusion matrix