## Sentiment Analysis of Amazon Review Data

In [1]:
import json
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, f_classif
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the notebook settings from disk; the "label_two" section of this
# config supplies the rating bins and label names used when labelling reviews.
with open('config.json') as config_file:
    config = json.load(config_file)

In [3]:
# Load the filtered Automotive reviews; the file is semicolon-delimited.
reviews_csv = "./Data_filtered/reviews_Automotive.csv"
data = pd.read_csv(reviews_csv, sep=";")

# Preview the first five rows as the cell output.
data.head(5)

Unnamed: 0,review,rating,product
0,"['needed', 'set', 'jumper', 'cables', 'new', '...",5.0,B00002243X
1,"['long', 'cables', 'work', 'fine', 'truck', 'q...",4.0,B00002243X
2,"['comment', 'much', 'since', 'yet', 'used', 'c...",5.0,B00002243X
3,"['absolutley', 'love', 'amazon', 'price', 'set...",5.0,B00002243X
4,"['purchased', 'twelve', 'feet', 'long', 'cable...",5.0,B00002243X


In [4]:
# Two-class labelling. Intended mapping: ratings 1, 2, 3 -> negativ and
# ratings 4, 5 -> positiv. The actual bin edges and label names are read
# from the "label_two" section of the config.
label_cfg = config["label_two"]
rating_classes = pd.cut(data["rating"], bins=label_cfg["bins"], labels=label_cfg["labels"])

# Store the labels as plain strings instead of a pandas Categorical.
data["label"] = rating_classes.astype(str)
data.head(5)

Unnamed: 0,review,rating,product,label
0,"['needed', 'set', 'jumper', 'cables', 'new', '...",5.0,B00002243X,positiv
1,"['long', 'cables', 'work', 'fine', 'truck', 'q...",4.0,B00002243X,positiv
2,"['comment', 'much', 'since', 'yet', 'used', 'c...",5.0,B00002243X,positiv
3,"['absolutley', 'love', 'amazon', 'price', 'set...",5.0,B00002243X,positiv
4,"['purchased', 'twelve', 'feet', 'long', 'cable...",5.0,B00002243X,positiv


In [5]:
# Hold out 20% of the reviews for testing. The split is stratified on the
# label so both parts keep the same class ratio, and the seed is fixed so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'], data['label'],
    test_size=0.2, random_state=42, stratify=data['label'])

# Exactly one option per step is active. The alternatives stay commented out
# so that no estimator is constructed only to be overwritten on the next line.

#Vectorizer: raw token counts
vect = CountVectorizer()
#vect = TfidfVectorizer()

#feature selection: keep the top 20% of features ranked by ANOVA F-value
#f_select = SelectKBest(f_classif, k=10000)
f_select = SelectPercentile(f_classif, percentile=20)

#Classifier: L1-penalised linear SVM, solved in the primal (dual=False)
#clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False)
clf = LinearSVC(C=1.0, penalty= "l1", max_iter=3000, dual=False)
#clf = LogisticRegression(random_state=42, class_weight=None, max_iter=3000)

#Building the Pipeline: the step names are used later to look up fitted steps
pipeline = Pipeline([('vect', vect),
                     ('f_select', f_select),
                     ('clf', clf)])

In [6]:
from sklearn import metrics

# Fit the complete pipeline (vectorizer -> feature selection -> classifier)
# on the training split.
model = pipeline.fit(X_train, y_train)

#Results step of Pipeline: handles to the fitted steps, looked up by step name
vectorizer, feature_selection, classifier = (
    model.named_steps[step_name] for step_name in ('vect', 'f_select', 'clf')
)

#Accuracy score of Train Data (measured on the same data the model was fitted on)
train_accuracy = model.score(X_train, y_train)
print("accuracy score: " + str(train_accuracy))

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


accuracy score: 0.9441323726950788


In [7]:
from sklearn.metrics import accuracy_score

# `model` is the pipeline that was already fitted on the training split in
# the cell above; fitting it again on the same data would only repeat the
# same work, so it is reused as-is here.

# Predict the held-out reviews once and reuse the result for both the
# accuracy figure and the per-class report.
predictions = model.predict(X_test)

#Accuracy Score of Model prediction (fraction of test reviews labelled correctly)
print("accuracy score: " + str(accuracy_score(y_test, predictions)))

#Report on prediction of model (precision / recall / f1 per class)
print("Classificaton Report:")
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


accuracy score: 0.8788766788766789
Classificaton Report:
             precision    recall  f1-score   support

    negativ       0.54      0.27      0.36       516
    positiv       0.90      0.97      0.93      3579

avg / total       0.86      0.88      0.86      4095



In [8]:
#Test with Manual review Text
manual_reviews = [
    'My thing broke, very bad',
    'My order broke and I am very disappointed',
    'My order works perfect and and I am very glad I bought it',
]

# Classify each hand-written review separately so that every prediction is
# printed on its own line.
for review_text in manual_reviews:
    print(model.predict([review_text]))

['positiv']
['positiv']
['positiv']


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [9]:
#Confusion matrix of model
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predicted class for every held-out review.
labels = model.predict(X_test)

# Take the class order from the fitted classifier and use it for both the
# matrix and the tick labels, so the axis annotations cannot drift out of
# sync with the matrix rows/columns.
class_order = list(model.classes_)
target_names = [str(class_name).capitalize() for class_name in class_order]
mat = confusion_matrix(y_test, labels, labels=class_order)

# confusion_matrix puts the true classes on the rows; the transpose puts the
# predicted classes on the rows (y-axis) and the true classes on the x-axis.
fig, ax = plt.subplots()
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=target_names, yticklabels=target_names, ax=ax)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')
plt.show()

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


<Figure size 640x480 with 1 Axes>

In [10]:
from sklearn.model_selection import cross_val_score,  KFold

#kfold: plain 5-fold splitter; defined here but NOT passed to cross_val_score below
kfold = KFold(n_splits=5)

#StratifiedKFold
#an integer cv on a classifier means stratified k-fold by default, so cv=5
#below runs a stratified 5-fold cross-validation on the training split
scores = cross_val_score(pipeline, X_train, y_train, cv=5)

# Report the per-fold accuracies followed by their mean.
for template, value in (("Cross-validation scores: {}", scores),
                        ("Mean accuracy: {}", scores.mean())):
    print(template.format(value))

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


Cross-validation scores: [0.87976808 0.88278388 0.8778626  0.87908397 0.88641221]
Mean accuracy: 0.8811821483942651


In [11]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation on the training split; besides the test score it
# also reports fit/score timings and, because of return_train_score=True,
# the score on each fold's training part.
res = cross_validate(pipeline, X_train, y_train, cv=5, return_train_score=True)

#make DataFrame of result for better Visualisation (one row per fold)
res_df = pd.DataFrame(data=res)
display(res_df)

#Print Mean of all Scores (column-wise average over the folds)
fold_means = res_df.mean()
print("Mean times and scores:\n", fold_means)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


Unnamed: 0,fit_time,score_time,test_score,train_score
0,1.57537,0.204789,0.879768,0.945348
1,1.828108,0.197796,0.882784,0.946344
2,1.743194,0.197794,0.877863,0.949935
3,1.652288,0.1928,0.879084,0.949554
4,1.693247,0.189804,0.886412,0.947111


Mean times and scores:
 fit_time       1.698441
score_time     0.196597
test_score     0.881182
train_score    0.947658
dtype: float64
