In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
data_full=pd.read_json('data/products_100k.json',lines = True)

In [3]:
data_full.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,78,"[1, 1]",5,Conversations with God Book 1 is the single mo...,"08 11, 2004",A3AF8FFZAZYNE5,,Impactful!,1092182400
1,116,"[5, 5]",4,Interesting Grisham tale of a lawyer that take...,"04 27, 2002",AH2L9G3DQHHAJ,chris,Show me the money!,1019865600
2,116,"[0, 0]",1,The thumbnail is a shirt. The product shown i...,"03 24, 2014",A2IIIDRK3PRRZY,Helene,Listing is all screwed up,1395619200
3,868,"[10, 10]",4,I'll be honest. I work for a large online reta...,"09 11, 2002",A1TADCM7YWPQ8M,Joel@AWS,Not a Bad Translation,1031702400
4,13714,"[0, 0]",4,It had all the songs I wanted but I had ordere...,"10 31, 2013",AWGH7V0BDOJKB,Barbara Marshall,Not the large print,1383177600


In [4]:
data=data_full[['reviewText', 'overall']]

In [5]:
data.head()

Unnamed: 0,reviewText,overall
0,Conversations with God Book 1 is the single mo...,5
1,Interesting Grisham tale of a lawyer that take...,4
2,The thumbnail is a shirt. The product shown i...,1
3,I'll be honest. I work for a large online reta...,4
4,It had all the songs I wanted but I had ordere...,4


In [6]:
ser_rating=data['overall']

In [7]:
data.describe()

Unnamed: 0,overall
count,100000.0
mean,4.27604
std,1.137759
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [8]:
data=data.replace([1,2,3,4,5],['Negative','Negative','Neutral','Positive','Positive'])

In [9]:
data.head()

Unnamed: 0,reviewText,overall
0,Conversations with God Book 1 is the single mo...,Positive
1,Interesting Grisham tale of a lawyer that take...,Positive
2,The thumbnail is a shirt. The product shown i...,Negative
3,I'll be honest. I work for a large online reta...,Positive
4,It had all the songs I wanted but I had ordere...,Positive


In [10]:
data['Ratings']=ser_rating

In [11]:
data.head()

Unnamed: 0,reviewText,overall,Ratings
0,Conversations with God Book 1 is the single mo...,Positive,5
1,Interesting Grisham tale of a lawyer that take...,Positive,4
2,The thumbnail is a shirt. The product shown i...,Negative,1
3,I'll be honest. I work for a large online reta...,Positive,4
4,It had all the songs I wanted but I had ordere...,Positive,4


In [14]:
data=data.rename({'overall':'Sentiment'},axis=1)
data.head()

Unnamed: 0,reviewText,Sentiment,Ratings
0,Conversations with God Book 1 is the single mo...,Positive,5
1,Interesting Grisham tale of a lawyer that take...,Positive,4
2,The thumbnail is a shirt. The product shown i...,Negative,1
3,I'll be honest. I work for a large online reta...,Positive,4
4,It had all the songs I wanted but I had ordere...,Positive,4


In [15]:
data.groupby('Sentiment').count()
#Dataset is unbalanced

Unnamed: 0_level_0,reviewText,Ratings
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
Negative,10221,10221
Neutral,8601,8601
Positive,81178,81178


In [16]:
#Seeing positive comments
#pd.set_option('display.max_colwidth', -1)
mask=data['Sentiment']=='Positive'
data[mask]['reviewText'].iloc[:5]

0    Conversations with God Book 1 is the single mo...
1    Interesting Grisham tale of a lawyer that take...
3    I'll be honest. I work for a large online reta...
4    It had all the songs I wanted but I had ordere...
5    We have many of the old, old issue. But the nu...
Name: reviewText, dtype: object

In [17]:
#Seeing Negative comments
mask=data['Sentiment']=='Negative'
data[mask]['reviewText'].iloc[:5]

2     The thumbnail is a shirt.  The product shown i...
16    This is a very cheaply made product.  I would ...
28    I bought this for my 3 yr old daughter when I ...
31    Its as cheaply made as the price. I got it for...
34    the tutu color was very nice. the only issue w...
Name: reviewText, dtype: object

In [18]:
mask=data['Sentiment']=='Neutral'
data[mask]['reviewText'].iloc[:5]

12    One review advised this book was large print, ...
51    The waistband was not sewed properly. I had to...
55    tutus are amazing. i love them and so does my ...
65    It is a fine tutu, true to its price. My littl...
68    I was expecting a fuller tutu, but it still wo...
Name: reviewText, dtype: object

In [22]:
documents=data['reviewText']
y_target=data['Sentiment']

In [23]:
X_train,X_test,y_train,y_test=train_test_split(documents,y_target,test_size=0.20)

In [24]:
model_NB=Pipeline([('vect',TfidfVectorizer()),
               ('clf',MultinomialNB())])
model_DT=Pipeline([('vect',TfidfVectorizer()),
                ('tcl',DecisionTreeClassifier())])
model_GB=Pipeline([('vect',TfidfVectorizer()),
                ('gbcl',GradientBoostingClassifier())])
model_LR=Pipeline([('vect',TfidfVectorizer()),
                ('lrcl',LogisticRegression())])
model_RF=Pipeline([('vect',TfidfVectorizer()),
                ('rfcl',LogisticRegression())])

In [25]:
models=[model_NB,model_DT,model_RF,model_GB,model_LR]
for model in models:
    model.fit(X_train,y_train)
    predict=model.predict(X_test)
    print('For model'+ str(model)+ 'Accuracy achieved is'+str(accuracy_score(y_test,predict)))
    print('Classification report:'+ str(metrics.classification_report(y_test,predict)))

For modelPipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])Accuracy achieved is0.81055
Classification report:              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00      2063
     Neutral       0.00      0.00      0.00      1721
    Positive       0.81      1.00      0.90     16216

   micro avg       0.81      0.81      0.81     20000
   macro avg       0.27      0.33      0.30     20000
weighted avg       0.66      0.81      0.73     20000

For modelPipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
       



For modelPipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])Accuracy achieved is0.8637
Classification report:              precision    recall  f1-score   support

    Negative       0.73      0.53      0.62      2063
     Neutral       0.49      0.14      0.21      1721
    Positive       0.88      0.98      0.93     16216

   micro avg       0.86      0.86      0.86     20000
   macro avg       0.70      0.55      0.59     20000
weighted avg       0.83      0.86      0.84     20000

For modelPipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        