# Sentiment analysis of an amazon article reviews

In [1]:
import numpy as np
import pandas as pd
import json

file_name = 'books_small_10000.json'

reviews = []

with open(file_name) as f:
    for line in f:
        reviews.append(json.loads(line))

df = pd.DataFrame(reviews)
df = df[["reviewText","overall"]]

df.head()

Unnamed: 0,reviewText,overall
0,"I bought both boxed sets, books 1-5. Really a...",5.0
1,I enjoyed this short book. But it was way way ...,3.0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0
3,I really enjoyed this adventure and look forwa...,4.0
4,It was a decent read.. typical story line. Not...,3.0


## Creation of the labels
We will use the score ("overall") to create our labels "positive" and "negative"

In [2]:
# We remove the lines with score = 3.0 which is neutral
df = df[df["overall"]!=3.0]

# Then is the score is > than 3.0 the sentiment is positive, else it's negative
df["sentiment"] = np.where(df["overall"]>3.0, "POSITIVE", "NEGATIVE")

df.drop(columns="overall", inplace=True)
df.head()

Unnamed: 0,reviewText,sentiment
0,"I bought both boxed sets, books 1-5. Really a...",POSITIVE
2,I love Nicholas Sparks. I&#8217;ve read everyt...,POSITIVE
3,I really enjoyed this adventure and look forwa...,POSITIVE
5,I hoped for Mia to have some peace in this boo...,POSITIVE
6,The book has the fevered intensity of Oliver S...,NEGATIVE


## Balancing the labels

In [3]:
pd.value_counts(df["sentiment"])

POSITIVE    8378
NEGATIVE     644
Name: sentiment, dtype: int64

In [4]:
# The labels are unbalanced, we need to pick as many positive labels as negative ones
df = pd.concat([df[df["sentiment"]=="NEGATIVE"], df[df["sentiment"]=="POSITIVE"].sample(len(df[df["sentiment"]=="NEGATIVE"]))]).sample(frac=1)
pd.value_counts(df["sentiment"])

POSITIVE    644
NEGATIVE    644
Name: sentiment, dtype: int64

## Vectorisation of the text

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(df["reviewText"], df["sentiment"], random_state=0)

vect = TfidfVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb').fit(X_train)
Xtrain_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

## Model creation, fit and evaluation

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Logistic Regression
clf_lr = LogisticRegression(C=100).fit(Xtrain_vect, y_train)

# SVC
clf_svc = SVC().fit(Xtrain_vect, y_train)

# Random Forest
clf_rdf = RandomForestClassifier().fit(Xtrain_vect, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [7]:
from sklearn.metrics import classification_report
print("Score logistic regression :\n",classification_report(y_test, clf_lr.predict(X_test_vect)))
print("Score SVC :\n",classification_report(y_test, clf_svc.predict(X_test_vect)))
print("Score random forest :\n",classification_report(y_test, clf_rdf.predict(X_test_vect)))

Score logistic regression :
               precision    recall  f1-score   support

    NEGATIVE       0.83      0.84      0.83       166
    POSITIVE       0.83      0.82      0.82       156

    accuracy                           0.83       322
   macro avg       0.83      0.83      0.83       322
weighted avg       0.83      0.83      0.83       322

Score SVC :
               precision    recall  f1-score   support

    NEGATIVE       0.84      0.87      0.85       166
    POSITIVE       0.85      0.82      0.84       156

    accuracy                           0.84       322
   macro avg       0.85      0.84      0.84       322
weighted avg       0.85      0.84      0.84       322

Score random forest :
               precision    recall  f1-score   support

    NEGATIVE       0.81      0.86      0.83       166
    POSITIVE       0.84      0.78      0.81       156

    accuracy                           0.82       322
   macro avg       0.82      0.82      0.82       322
weighted 

## Model optimisation of the SVC which is the best model

In [8]:
from sklearn.model_selection import GridSearchCV

grid_values = {"C":[1,2,3,4,5]}
grid_svc = GridSearchCV(clf_svc, param_grid = grid_values).fit(Xtrain_vect, y_train)
print("Best params :",grid_svc.best_params_)

Best params : {'C': 1}


## Saving and testing the model

In [9]:
test = ["I really didn't like the book, waist of time", "Nice book, the story is well pictured"]
test_vect = vect.transform(test)

result = clf_svc.predict(test_vect)
print(result)

['NEGATIVE' 'POSITIVE']


In [10]:
# Saving the model
import pickle

with open('./sentiment_analysis.pkl','wb') as model:
    pickle.dump(clf_svc, model)