In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

In [2]:
dataset = pd.read_json('Cell_Phones_and_Accessories_5.json', lines=True)
dataset.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [3]:
dataset.shape

(194439, 9)

In [4]:
# No.of Unique products
print(f"No.of Unique products: {len(dataset['asin'].unique())}")

# No.of Unique reviewers
print(f"No.of reviewers: {len(dataset['reviewerID'].unique())}")

No.of Unique products: 10429
No.of reviewers: 27879


In [5]:
# To balance data, lets take only one review from one reviewer
unique_reviewers_dataset = dataset.drop_duplicates(subset = 'reviewerID')

In [6]:
unique_reviewers_dataset.shape 

(27879, 9)

In [7]:
unique_reviewers_dataset.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [8]:
# Here columns helpful, unixReviewTime and reviewTime are unnecesary data as they do not describe anything about the product
# So, removing them we get
proper_dataset = unique_reviewers_dataset.drop(columns = ['helpful', 'unixReviewTime', 'reviewTime'])

In [9]:
proper_dataset.shape

(27879, 6)

In [10]:
proper_dataset.isnull().sum()

reviewerID        0
asin              0
reviewerName    477
reviewText        0
overall           0
summary           0
dtype: int64

In [11]:
proper_dataset.fillna('Unknown', inplace = True)

In [12]:
proper_dataset.isnull().sum()

reviewerID      0
asin            0
reviewerName    0
reviewText      0
overall         0
summary         0
dtype: int64

In [13]:
proper_dataset.head()

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,summary
0,A30TL5EWN6DFXT,120401325X,christina,They look good and stick good! I just don't li...,4,Looks Good
1,ASY55RVNIL0UD,120401325X,emily l.,These stickers work like the review says they ...,5,Really great product.
2,A2TMXE2AFO7ONB,120401325X,Erica,These are awesome and make my phone look so st...,5,LOVE LOVE LOVE
3,AWJ0WZQYMYFQ4,120401325X,JM,Item arrived in great time and was in perfect ...,4,Cute!
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s


In [14]:
# Taking all the reviews into a list
reviews = proper_dataset['reviewText'].to_list()

In [15]:
from nltk.corpus import stopwords

In [16]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
for review in reviews:
    
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(review)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    review = ' '.join(filtered_words)

In [18]:
reviews[1:6]

['These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)',
 'These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!',
 "Item arrived in great time and was in perfect condition. However, I ordered these buttons because they were a great deal and included a FREE screen protector. I never received one. Though its not a big deal, it would've been nice to get it since they claim it comes with one.",
 'awesome! stays on, and looks great. can be used on multiple apple products.  especially having nails, it helps to have an elevated key.',
 'These make using the home button easy. My daughter and I both like them.  I would purchase them again. Well worth the price.']

#### We are only removing stopwords from the data as there are not many impurities in the taken dataset

## Sentimental Analysis using textblob

In [20]:
from textblob import TextBlob

In [21]:
sentiment_score = []
sentiment_subjectivity=[]

In [22]:
for review in reviews:
    testimonial = TextBlob(review)
    sentiment_score.append(testimonial.sentiment.polarity)
    sentiment_subjectivity.append(testimonial.sentiment.subjectivity)

In [23]:
proper_dataset.shape

(27879, 6)

In [24]:
proper_dataset['sentiment_score'] = sentiment_score
proper_dataset['subjectivity_score'] = sentiment_subjectivity

In [25]:
proper_dataset.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,summary,sentiment_score,subjectivity_score
0,A30TL5EWN6DFXT,120401325X,christina,They look good and stick good! I just don't li...,4,Looks Good,0.391667,0.666667
1,ASY55RVNIL0UD,120401325X,emily l.,These stickers work like the review says they ...,5,Really great product.,0.533333,0.854167
2,A2TMXE2AFO7ONB,120401325X,Erica,These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,0.573828,0.95


## Unique Words count

In [26]:
proper_dataset['Unique_words'] = proper_dataset['reviewText'].str.lower().str.split().apply(set).apply(len)

## Word count | Positive and Negative word count

In [27]:
# Getting positive and negative words count
pos_words_count = []
neg_words_count = []
words_count = []

In [28]:
for review in reviews:
    words = nltk.word_tokenize(review)
    neg = 0
    pos = 0
    wor = 0
    for word in words:
        testimonial = TextBlob(word)
        score = testimonial.sentiment.polarity
        wor += 1
        if score < 0:
            neg += 1
        elif score > 0:
            pos += 1
    words_count.append(wor)
    pos_words_count.append(pos)
    neg_words_count.append(neg)

In [30]:
proper_dataset['pos_count'] = pos_words_count
proper_dataset['neg_count'] = neg_words_count
proper_dataset['words_count'] = words_count

In [31]:
proper_dataset.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,summary,sentiment_score,subjectivity_score,Unique_words,pos_count,neg_count,words_count
0,A30TL5EWN6DFXT,120401325X,christina,They look good and stick good! I just don't li...,4,Looks Good,0.391667,0.666667,29,2,1,41
1,ASY55RVNIL0UD,120401325X,emily l.,These stickers work like the review says they ...,5,Really great product.,0.533333,0.854167,26,3,0,36
2,A2TMXE2AFO7ONB,120401325X,Erica,These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,0.573828,0.95,30,4,0,40


## POS Tagging

In [32]:
# Since nltk library has too many categories, let us create our own POS dataset with all the values mapped

In [33]:
array_Noun = []
array_Adj = []
array_Verb = []
array_Adv = []
array_Pro = []
array_Pre = []
array_Con = []
array_Art = []
array_Nega = []
array_Aux = []

articles = ['a', 'an', 'the']
negations = ['no', 'not', 'none', 'nobody', 'nothing', 'neither', 'nowhere', 'never', 'hardly', 'barely', 'scarcely']
auxilliary = ['am', 'is', 'are', 'was', 'were', 'be', 'being', 'been', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'can', 'could', 'do', 'does', 'did', 'have', 'having', 'has', 'had']

In [34]:
review_text = proper_dataset.reviewText
import re
from collections import Counter
from nltk import sent_tokenize
from nltk import pos_tag

In [35]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [36]:
for text in review_text:
    filter=re.sub('[^\w\s]', '', text)
    conver_lower=filter.lower()
    Tinput = conver_lower.split(" ")
    
    for i in range(0, len(Tinput)):
        Tinput[i] = "".join(Tinput[i])
    UniqW = Counter(Tinput)
    s = " ".join(UniqW.keys())
    
    tokenized = sent_tokenize(s)
    
    for i in tokenized:
        wordsList = nltk.word_tokenize(i)
        #wordsList = [w for w in wordsList if not w in stop_words]
        
        Art = 0
        Nega = 0
        Aux = 0
        for word in wordsList:
            if word in articles:
                Art += 1
            elif word in negations:
                Nega += 1
            elif word in auxilliary:
                Aux += 1
                
        tagged = nltk.pos_tag(wordsList)
        counts = Counter(tag for word,tag in tagged)

        N = sum([counts[i] for i in counts.keys() if 'NN' in i])
        Adj = sum([counts[i] for i in counts.keys() if 'JJ' in i])
        Verb = sum([counts[i] for i in counts.keys() if 'VB' in i])
        Adv = sum([counts[i] for i in counts.keys() if 'RB' in i])
        Pro = sum([counts[i] for i in counts.keys() if (('PRP' in i) or ('PRP$' in i) or ('WP' in i) or ('WP$' in i))])
        Pre = sum([counts[i] for i in counts.keys() if 'IN' in i])
        Con = sum([counts[i] for i in counts.keys() if 'CC' in i])

        array_Noun.append(N)
        array_Adj.append(Adj)
        array_Verb.append(Verb)
        array_Adv.append(Adv)
        array_Pro.append(Pro)
        array_Pre.append(Pre)
        array_Con.append(Con)
        array_Art.append(Art)
        array_Nega.append(Nega)
        array_Aux.append(Aux)
print('Completed')

Completed


In [37]:
POS = ['Noun_Count', 'Adj_Count', 'Verb_Count', 'Adv_Count', 'Pro_Count', 'Pre_Count', 'Con_Count', 'Art_Count', 'Nega_Count', 'Aux_Count']
Values = [array_Noun, array_Adj, array_Verb, array_Adv, array_Pro, array_Pre, array_Con, array_Art, array_Nega, array_Aux]
i = 0
for x in POS:
    proper_dataset[x] = pd.Series(Values[i])
    proper_dataset[x] = proper_dataset[x].fillna(0)
    proper_dataset[x] = proper_dataset[x].astype(float)
    i += 1

## Authenticity

In [38]:
proper_dataset = proper_dataset.assign(Authenticity = lambda x: (x.Pro_Count + x.Unique_words - x.neg_count) / x.words_count)

In [39]:
proper_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
proper_dataset.dropna(inplace = True)

## Labelling the data ---> Fake review: 1; normal review: 0

In [40]:
def label(Auth, N, Adj, V, Av, S, Sub, W):
    score = 0
    if Auth >= 0.49:
        score += 2
    if (N + Adj) >= (V + Av):
        score += 1
    if -0.5 <= S <= 0.5:
        score += 1
    if Sub <= 0.5:
        score += 2
    if W > 75:
        score += 3
    if score >= 5:
        return 1
    else:
        return 0

In [41]:
proper_dataset['review_type'] = proper_dataset.apply(lambda x : label(
    x['Authenticity'],
    x['Noun_Count'],
    x['Adj_Count'],
    x['Verb_Count'],
    x['Adv_Count'],
    x['sentiment_score'],
    x['subjectivity_score'],
    x['words_count']), axis = 1)

In [42]:
proper_dataset['review_type'].value_counts()

review_type
1    16171
0    11688
Name: count, dtype: int64

In [44]:
proper_dataset.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,summary,sentiment_score,subjectivity_score,Unique_words,pos_count,...,Verb_Count,Adv_Count,Pro_Count,Pre_Count,Con_Count,Art_Count,Nega_Count,Aux_Count,Authenticity,review_type
0,A30TL5EWN6DFXT,120401325X,christina,They look good and stick good! I just don't li...,4,Looks Good,0.391667,0.666667,29,2,...,7.0,4.0,2.0,2.0,1.0,2.0,0.0,1.0,0.731707,0
1,ASY55RVNIL0UD,120401325X,emily l.,These stickers work like the review says they ...,5,Really great product.,0.533333,0.854167,26,3,...,6.0,0.0,3.0,3.0,1.0,1.0,0.0,3.0,0.805556,0
2,A2TMXE2AFO7ONB,120401325X,Erica,These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,0.573828,0.95,30,4,...,6.0,3.0,3.0,4.0,1.0,1.0,0.0,4.0,0.825,0


# Model Training

#### Dropping unnecessary columns

In [45]:
proper_dataset.columns

Index(['reviewerID', 'asin', 'reviewerName', 'reviewText', 'overall',
       'summary', 'sentiment_score', 'subjectivity_score', 'Unique_words',
       'pos_count', 'neg_count', 'words_count', 'Noun_Count', 'Adj_Count',
       'Verb_Count', 'Adv_Count', 'Pro_Count', 'Pre_Count', 'Con_Count',
       'Art_Count', 'Nega_Count', 'Aux_Count', 'Authenticity', 'review_type'],
      dtype='object')

In [46]:
proper_dataset.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,summary,sentiment_score,subjectivity_score,Unique_words,pos_count,...,Verb_Count,Adv_Count,Pro_Count,Pre_Count,Con_Count,Art_Count,Nega_Count,Aux_Count,Authenticity,review_type
0,A30TL5EWN6DFXT,120401325X,christina,They look good and stick good! I just don't li...,4,Looks Good,0.391667,0.666667,29,2,...,7.0,4.0,2.0,2.0,1.0,2.0,0.0,1.0,0.731707,0
1,ASY55RVNIL0UD,120401325X,emily l.,These stickers work like the review says they ...,5,Really great product.,0.533333,0.854167,26,3,...,6.0,0.0,3.0,3.0,1.0,1.0,0.0,3.0,0.805556,0
2,A2TMXE2AFO7ONB,120401325X,Erica,These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,0.573828,0.95,30,4,...,6.0,3.0,3.0,4.0,1.0,1.0,0.0,4.0,0.825,0


In [47]:
proper_dataset.rename(columns  = {'overall' : 'rating'}, inplace = True)

In [48]:
proper_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27859 entries, 0 to 194388
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   reviewerID          27859 non-null  object 
 1   asin                27859 non-null  object 
 2   reviewerName        27859 non-null  object 
 3   reviewText          27859 non-null  object 
 4   rating              27859 non-null  int64  
 5   summary             27859 non-null  object 
 6   sentiment_score     27859 non-null  float64
 7   subjectivity_score  27859 non-null  float64
 8   Unique_words        27859 non-null  int64  
 9   pos_count           27859 non-null  int64  
 10  neg_count           27859 non-null  int64  
 11  words_count         27859 non-null  int64  
 12  Noun_Count          27859 non-null  float64
 13  Adj_Count           27859 non-null  float64
 14  Verb_Count          27859 non-null  float64
 15  Adv_Count           27859 non-null  float64
 16  Pro_Coun

In [49]:
proper_dataset.isnull().sum()

reviewerID            0
asin                  0
reviewerName          0
reviewText            0
rating                0
summary               0
sentiment_score       0
subjectivity_score    0
Unique_words          0
pos_count             0
neg_count             0
words_count           0
Noun_Count            0
Adj_Count             0
Verb_Count            0
Adv_Count             0
Pro_Count             0
Pre_Count             0
Con_Count             0
Art_Count             0
Nega_Count            0
Aux_Count             0
Authenticity          0
review_type           0
dtype: int64

In [50]:
# Collecting all the numerical data from the columns excluding the output and unnecessary data
dataset = proper_dataset[['rating', 'sentiment_score', 'subjectivity_score', 'words_count', 'Noun_Count', 'Adj_Count', 'Verb_Count', 'Adv_Count', 'Authenticity']]

In [51]:
dataset.head(3)

Unnamed: 0,rating,sentiment_score,subjectivity_score,words_count,Noun_Count,Adj_Count,Verb_Count,Adv_Count,Authenticity
0,4,0.391667,0.666667,41,3.0,5.0,7.0,4.0,0.731707
1,5,0.533333,0.854167,36,6.0,3.0,6.0,0.0,0.805556
2,5,0.573828,0.95,40,5.0,3.0,6.0,3.0,0.825


In [52]:
# Scaling these values using minmaxScaler
from sklearn.preprocessing import MinMaxScaler

In [53]:
minmax = MinMaxScaler()

In [54]:
dataset= pd.DataFrame(minmax.fit_transform(dataset), columns = dataset.columns)

In [55]:
dataset.shape

(27859, 9)

## Train Test Split

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
x = dataset
y = proper_dataset['review_type']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

## Hyper Parameter tuning using Adaboost

In [59]:
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "n_estimators": [1, 2]}

In [60]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

In [63]:
dtc = DecisionTreeClassifier(random_state = 11, max_features = "log2", class_weight = "balanced", max_depth = None)
abc = AdaBoostClassifier(base_estimator = dtc)

grid_search1 = GridSearchCV(
    estimator = abc,
    param_grid = param_grid,
    scoring = 'roc_auc')
print('Running Grid Search for given model.')
grid_search1.fit(X_train, y_train)
print('Grid Search Completed.')

Running Grid Search for given model.




Grid Search Completed.




## Random Forest

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [65]:
param_grid = {
'bootstrap': [True],
'max_depth': [80, 90, 100, 110],
'max_features': [2,3],
'min_samples_leaf': [2,3,4],
'min_samples_split': [2, 5, 10],
'n_estimators': [100, 200, 300, 1000] }

In [66]:
grid_search2 = GridSearchCV(
    estimator = RandomForestClassifier(random_state = 42),
    param_grid = param_grid,
    cv=StratifiedKFold(n_splits=2),
    scoring='roc_auc',
    n_jobs = -1)

print('Running Grid Search for given model.')
grid_search2.fit(X_train, y_train)
print('Grid Search Completed.')

Running Grid Search for given model.
Grid Search Completed.


## Logistic Regression

In [67]:
from sklearn.linear_model import LogisticRegression

In [68]:
grid_search3 = GridSearchCV(
    estimator = LogisticRegression(max_iter = 1000),
    param_grid={'C': [10**i for i in range(-5,5)], 'class_weight': [None, 'balanced']},
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc' )

print('Running Grid Search for given model.')
grid_search3.fit(X_train, y_train)
print('Grid Search Completed.')

Running Grid Search for given model.
Grid Search Completed.


In [82]:
clf1 = grid_search1.best_estimator_
clf2 = grid_search2.best_estimator_
clf3 = grid_search3.best_estimator_

In [83]:
probas = clf1.predict(X_test)

In [84]:
from sklearn.metrics import roc_auc_score

In [85]:
print('ROC_AUC Score', roc_auc_score(y_test, probas))

ROC_AUC Score 0.9905221951392713


## Determining Best Scores

In [73]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

In [74]:
scores = {'accuracy':make_scorer(accuracy_score), 
           'precision':make_scorer(precision_score),
           'recall':make_scorer(recall_score), 
           'f1_score':make_scorer(f1_score)}

## Making a dataframe of scores

In [75]:
def evaluate(X, y, folds):
    aB = cross_validate(clf1, X, y, cv=folds, scoring=scores)
    rF = cross_validate(clf2, X, y, cv=folds, scoring=scores)
    lR = cross_validate(clf3, X, y, cv=folds, scoring=scores)
    
    scores_table = pd.DataFrame({'AdaBoost':[aB['test_accuracy'].mean(),
                                                               aB['test_precision'].mean(),
                                                               aB['test_recall'].mean(),
                                                               aB['test_f1_score'].mean()],

                                'Random Forest':[rF['test_accuracy'].mean(),
                                                               rF['test_precision'].mean(),
                                                               rF['test_recall'].mean(),
                                                               rF['test_f1_score'].mean()],

                                'Logistic Regression':[lR['test_accuracy'].mean(),
                                                               lR['test_precision'].mean(),
                                                               lR['test_recall'].mean(),
                                                               lR['test_f1_score'].mean()]},
                                index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    
    scores_table['Best Score'] = scores_table.idxmax(axis = 1)
    
    return(scores_table)

In [76]:
evaluate(x, y, 3)



Unnamed: 0,AdaBoost,Random Forest,Logistic Regression,Best Score
Accuracy,0.993001,0.996231,0.917154,Random Forest
Precision,0.994619,0.993638,0.923374,AdaBoost
Recall,0.993322,0.999938,0.935377,Random Forest
F1 Score,0.993969,0.996771,0.929072,Random Forest


In [77]:
clf = AdaBoostClassifier(random_state = 42)
x_tr = X_train
x_te = X_test
train_feature_list = [x_tr[0:10000],x_tr[0:20000],x_tr]
train_target_list = [y_train[0:10000], y_train[0:20000], y_train]
for a, b in zip(train_feature_list, train_target_list):
    clf.fit(a, b)
clf1.predict(X_test)

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

## Saving the model

In [78]:
import joblib

In [79]:
filename = 'Adaboost.pkl'
joblib.dump(clf1, filename)

['Adaboost.pkl']