In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
 

In [2]:
#! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz

## Read Data

In [3]:
#import os
#os.listdir('./Dataset')

In [4]:
#reviews_raw = pd.read_csv('./Dataset/amazon_reviews_us_Kitchen_v1_00.tsv', sep = '\t',error_bad_lines=False)
#reviews_raw.head()

In [None]:
reviews_raw = pd.read_csv('https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz', sep = '\t',error_bad_lines=False)
reviews_raw.shape

## Keep Reviews and Ratings

In [6]:
reviews_raw.shape

(4874890, 15)

In [7]:
reviews = reviews_raw[['review_body','star_rating']]

In [8]:
print(reviews['star_rating'].value_counts())

5.0    3124759
4.0     731733
1.0     426900
3.0     349547
2.0     241948
Name: star_rating, dtype: int64


In [9]:
#reviews[:5]
print("Total Reviews: ",len(reviews))
reviews.info(verbose = True,show_counts = True)
print(reviews.isnull().sum())
#create a new copy and remove null values from it
reviews_cpy = reviews.dropna()

Total Reviews:  4874890
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4874890 entries, 0 to 4874889
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   review_body  4874644 non-null  object 
 1   star_rating  4874887 non-null  float64
dtypes: float64(1), object(1)
memory usage: 74.4+ MB
review_body    246
star_rating      3
dtype: int64


In [10]:
reviews_cpy.shape

(4874644, 2)

In [11]:
# Get 3 random rows
reviews_cpy.sample(3)

Unnamed: 0,review_body,star_rating
1068298,use it all the time,5.0
3115897,This simple wire cutter helps level any cake u...,4.0
4152788,This nutcracker does everything it advertises....,5.0


In [12]:
reviews_cpy['star_rating'].value_counts()

5.0    3124595
4.0     731701
1.0     426870
3.0     349539
2.0     241939
Name: star_rating, dtype: int64

In [13]:
pd.options.mode.chained_assignment = None

# Labelling Reviews:
## The reviews with rating 4,5 are labelled to be 1 and 1,2 are labelled as 0. Discard the reviews with rating 3'

In [14]:
reviews_cpy.loc[:,'label'] = np.where(reviews_cpy['star_rating'] <= 2,0,1)
reviews = reviews_cpy[reviews_cpy['star_rating'] != 3.0 ]
print(reviews.head())
print(reviews['label'].value_counts())
print(reviews['label'].value_counts().sum())

                                         review_body  star_rating  label
0                Beautiful.  Looks great on counter.          5.0      1
1  I personally have 5 days sets and have also bo...          5.0      1
2  Fabulous and worth every penny. Used for clean...          5.0      1
3  A must if you love garlic on tomato marinara s...          5.0      1
4  Worth every penny! Buy one now and be a pizza ...          5.0      1
1    3856296
0     668809
Name: label, dtype: int64
4525105


In [15]:
# 0 is negative sentiment classes 
# 1 is positive sentiment classes
#Class 3 or dropped reviews count here
print('Class 3 or dropped  reviews count, Class 1 or positive reviews count, Class 0 or negative reviews count  : {}, {}, {}'.format(len(reviews_cpy['label']) - len(reviews['star_rating']),reviews['label'].value_counts()[1],reviews['label'].value_counts()[0]))

print('Class 3 or dropped  reviews count : ',len(reviews_cpy['label']) - len(reviews['star_rating']))
print('Class 0 or negative reviews count : ',reviews['label'].value_counts()[0])
print('Class 1 or positive reviews count : ',reviews['label'].value_counts()[1])

Class 3 or dropped  reviews count, Class 1 or positive reviews count, Class 0 or negative reviews count  : 349539, 3856296, 668809
Class 3 or dropped  reviews count :  349539
Class 0 or negative reviews count :  668809
Class 1 or positive reviews count :  3856296


 ## We select 200000 reviews randomly with 100,000 positive and 100,000 negative reviews.



In [16]:
reviews_pos = reviews_cpy[reviews_cpy['label'] == 1].sample(100000,random_state = 101)
reviews_neg = reviews_cpy[reviews_cpy['label'] == 0].sample(100000,random_state = 101)
#print(reviews_pos.describe())
#print(reviews_neg.describe())

In [17]:
dataset = pd.concat([reviews_pos,reviews_neg],ignore_index = True)
dataset = dataset.sample(frac = 1,random_state= 101).reset_index(drop = True)
print(len(dataset))
#dataset[50:100]

200000


In [18]:
from sklearn.model_selection import train_test_split

# Data Cleaning

## Convert the all reviews into the lower case.

In [19]:
#Char length of reviews before data cleaning
avg_before_cleaning = dataset['review_body'].str.len().sum()/len(dataset['review_body'])
print('Average character length of reviews before data cleaning:',avg_before_cleaning)

Average character length of reviews before data cleaning: 325.3601


In [20]:
dataset['review_body'] = dataset['review_body'].str.lower()
print(dataset.loc[:5,'review_body'])

0    got these on a kindle fire special for six dol...
1    only thing i can say about this product is gre...
2    got this for my sister, and she loved it!!!  i...
3     definitely keeps my son's lunch hot for 4 hours!
4    ragalta countertop thermo electric hot & cold ...
5    purchased two for a weekly men's prayer breakf...
Name: review_body, dtype: object


## remove the HTML and URLs from the reviews

In [21]:
# print all rows which have <> braces and could be html tags 
len([x for x in dataset['review_body'][dataset['review_body'].str.contains("<.*?>")]])

25802

In [None]:
# remove html
dataset['review_body'] = dataset['review_body'].apply(lambda x : BeautifulSoup(x,'html.parser').get_text())

In [23]:
#check if any tags remain
len([x for x in dataset['review_body'][dataset['review_body'].str.contains("<.*?>")]])
#only remaining results are lone < or > markers spread around no html tags left

40

In [24]:
# For URL removal
def remove_url(text):
    url = re.compile(r'(https?://\S+|www\.\S+)|(\S+\.com\S+)')
    return url.sub(r'',text)
dataset['review_body'] = dataset['review_body'].apply(remove_url)
#check if any url still remain
len([x for x in dataset['review_body'][dataset['review_body'].str.contains("http\S+")]])

0

## remove non-alphabetical characters

In [25]:
#remove every non alphabet chracter except apostrophe for contractions
dataset['review_body'] = dataset['review_body'].replace("[^a-z ']",'',regex=True)

## perform contractions on the reviews.

In [None]:
import sys
!{sys.executable} -m pip install contractions

In [27]:
# should be done before non alphabet characters are removed
import contractions
def contractionfunction(s):
    return contractions.fix(s)


In [28]:
#Check for contractions
len([x for x in dataset['review_body'][dataset['review_body'].str.contains("[a-z]+'[a-z]+")]])

84331

In [29]:
dataset['review_body'] = dataset['review_body'].apply(contractionfunction)

In [30]:
# After contractions are removed; delete all the apostrophe as non alphabet chars are to be removed
dataset['review_body'] = dataset['review_body'].replace("[^a-z ]",'',regex=True)
#check if any more apostrophe words remain
[print(x) for x in dataset['review_body'][dataset['review_body'].str.contains("[a-z]+'[a-z]+")]]

[]

In [31]:
#search for any non alphabetical 
[print(x) for x in dataset['review_body'][dataset['review_body'].str.contains("[^a-z ]")]]

[]

## Remove the extra spaces between the words

In [32]:
dataset['review_body_without_extra_spaces'] = dataset['review_body'].replace('\s+', ' ', regex=True)

In [33]:
#check if multiple spaces still exist
[print(x) for x in dataset['review_body'][dataset['review_body_without_extra_spaces'].str.contains('\s{2,}')]]

[]

In [34]:
#Char length of reviews after data cleaning
avg_after_cleaning = dataset['review_body'].str.len().sum()/len(dataset['review_body'])
print('Average character length of reviews after data cleaning:',avg_after_cleaning)

Average character length of reviews after data cleaning: 312.50965


In [35]:
print('Average character length before and after data cleaning : {}, {}'.format(avg_before_cleaning,avg_after_cleaning))

Average character length before and after data cleaning : 325.3601, 312.50965


# Pre-processing

## remove the stop words 

In [36]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [37]:
print('Average character length of reviews before data preprocessing:',avg_after_cleaning)

Average character length of reviews before data preprocessing: 312.50965


In [38]:
# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
dataset['review_body_without_stopwords'] =  dataset['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

## perform lemmatization  

In [39]:
from nltk.stem import WordNetLemmatizer

In [40]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [41]:
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

In [42]:
dataset['review_body_lemmatized'] =  dataset['review_body_without_stopwords'].apply(lemmatize_text)

In [43]:
dataset.sample(4)

Unnamed: 0,review_body,star_rating,label,review_body_without_extra_spaces,review_body_without_stopwords,review_body_lemmatized
28265,does not fit or work in a keurig model when m...,1.0,0,does not fit or work in a keurig model when ma...,fit work keurig model making purchase neither ...,fit work keurig model making purchase neither ...
62925,love them great quality,5.0,1,love them great quality,love great quality,love great quality
48039,just what i expected size is a little big for ...,5.0,1,just what i expected size is a little big for ...,expected size little big works hoping get pain...,expected size little big work hoping get paint...
125817,it is sharp but i do not think that its claim ...,2.0,0,it is sharp but i do not think that its claim ...,sharp think claim nothing stick true cheese st...,sharp think claim nothing stick true cheese st...


In [44]:
#Char length of reviews after data preprocessing
avg_after_preprocessing = dataset['review_body_lemmatized'].str.len().sum()/len(dataset['review_body_lemmatized'])
print('Average character length of reviews after data preprocessing:',avg_after_preprocessing)

Average character length of reviews after data preprocessing: 191.44271


In [45]:
print('Average character length before and after data preprocessing : {}, {}'.format(avg_after_cleaning,avg_after_preprocessing))

Average character length before and after data preprocessing : 312.50965, 191.44271


In [46]:
#Split the data into test and train set
X_train,X_test,y_train,y_test = train_test_split(dataset['review_body_lemmatized'],dataset['label'],test_size = 0.2,random_state = 101)

# TF-IDF Feature Extraction

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [48]:
tfidf_vetorizer = TfidfVectorizer()

In [49]:
#train vectorizer over train data and transform test data for later
tfidf_train_data = tfidf_vetorizer.fit_transform(X_train)
tfidf_test_data = tfidf_vetorizer.transform(X_test)


In [50]:
print('Train Data Shape:',tfidf_train_data.shape)
print('Test Data Shape:',tfidf_test_data.shape)

Train Data Shape: (160000, 106917)
Test Data Shape: (40000, 106917)


In [51]:
# True values should be first argument and predicted values are second
def print_metrics(y_actual,y_pred):
    print("Accuracy Score: ",metrics.accuracy_score(y_actual,y_pred))
    print("Precision Score: ",metrics.precision_score(y_actual, y_pred))
    print("Recall Score: ",metrics.recall_score(y_actual, y_pred))
    print("F1 Score :",metrics.f1_score(y_actual, y_pred))
    
def print_metrics_comma(y_actual,y_pred):
    print("Accuracy Score, Precision Score, Recall Score, F1 Score: {}, {}, {}, {} ".format(metrics.accuracy_score(y_actual,y_pred),metrics.precision_score(y_actual, y_pred),metrics.recall_score(y_actual, y_pred),metrics.f1_score(y_actual, y_pred)))

def print_metrics_comma_det(y_actual_train,y_pred_train,y_actual_test,y_pred_test):
    print("Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: {}, {}, {}, {}, {}, {}, {}, {} ".format(metrics.accuracy_score(y_actual_train,y_pred_train),metrics.precision_score(y_actual_train, y_pred_train),metrics.recall_score(y_actual_train, y_pred_train),metrics.f1_score(y_actual_train, y_pred_train),metrics.accuracy_score(y_actual_test,y_pred_test),metrics.precision_score(y_actual_test, y_pred_test),metrics.recall_score(y_actual_test, y_pred_test),metrics.f1_score(y_actual_test, y_pred_test)))


# Perceptron

In [52]:
from sklearn.linear_model import Perceptron
from sklearn import metrics

In [53]:
def perceptron_model(tfidf_train_data,y_train,tfidf_test_data,y_test):
    perceptron_model = Perceptron()
    perceptron_model.fit(tfidf_train_data,y_train)
    print_metrics_comma_det(y_train,perceptron_model.predict(tfidf_train_data),y_test,perceptron_model.predict(tfidf_test_data))
    print("Perceptron Metrics for Train Set")
    print_metrics(y_train,perceptron_model.predict(tfidf_train_data))
    print("Perceptron Metrics for Test Set")
    print_metrics(y_test,perceptron_model.predict(tfidf_test_data))

In [54]:
perceptron_model(tfidf_train_data,y_train,tfidf_test_data,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.9010875, 0.9249195992535635, 0.8731493396968978, 0.8982891810948728, 0.82455, 0.8513873052071456, 0.78553178698462, 0.8171348168221376 
Perceptron Metrics for Train Set
Accuracy Score:  0.9010875
Precision Score:  0.9249195992535635
Recall Score:  0.8731493396968978
F1 Score : 0.8982891810948728
Perceptron Metrics for Test Set
Accuracy Score:  0.82455
Precision Score:  0.8513873052071456
Recall Score:  0.78553178698462
F1 Score : 0.8171348168221376


# SVM

In [55]:
from sklearn.svm import LinearSVC

In [56]:
tfidf_train_data.shape

(160000, 106917)

In [57]:
def linearSVCModel(tfidf_train_data,y_train,tfidf_test_data,y_test):
    svc_classifier = LinearSVC()
    svc_classifier.fit(tfidf_train_data,y_train)
    print_metrics_comma_det(y_train,svc_classifier.predict(tfidf_train_data),y_test,svc_classifier.predict(tfidf_test_data))
    print("SVM Metrics for Train Set")
    print_metrics(y_train,svc_classifier.predict(tfidf_train_data))
    print("SVM Metrics for Test Set")
    print_metrics(y_test,svc_classifier.predict(tfidf_test_data))
    


In [58]:
linearSVCModel(tfidf_train_data,y_train,tfidf_test_data,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.92510625, 0.9294729402261712, 0.9201014505428604, 0.9247634534849408, 0.872425, 0.8768387947651415, 0.8659886779219478, 0.8713799621928167 
SVM Metrics for Train Set
Accuracy Score:  0.92510625
Precision Score:  0.9294729402261712
Recall Score:  0.9201014505428604
F1 Score : 0.9247634534849408
SVM Metrics for Test Set
Accuracy Score:  0.872425
Precision Score:  0.8768387947651415
Recall Score:  0.8659886779219478
F1 Score : 0.8713799621928167


# Logistic Regression

In [59]:
from sklearn.linear_model import LogisticRegression

In [60]:
def logitClassifierModel(tfidf_train_data,y_train,tfidf_test_data,y_test):
    logit_classifier = LogisticRegression(class_weight = 'balanced',max_iter = 2000)
    logit_classifier.fit(tfidf_train_data,y_train)
    print_metrics_comma_det(y_train,logit_classifier.predict(tfidf_train_data),y_test,logit_classifier.predict(tfidf_test_data))
    print("Logistic Regression Metrics for Train Set")
    print_metrics(y_train,logit_classifier.predict(tfidf_train_data))
    print("Logistic Regression Metrics for Test Set")
    print_metrics(y_test,logit_classifier.predict(tfidf_test_data))
    

In [61]:
logitClassifierModel(tfidf_train_data,y_train,tfidf_test_data,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.89536875, 0.9014230993632836, 0.8879421282124964, 0.8946318313706485, 0.87645, 0.8838623932934622, 0.8662391663744301, 0.8749620483756704 
Logistic Regression Metrics for Train Set
Accuracy Score:  0.89536875
Precision Score:  0.9014230993632836
Recall Score:  0.8879421282124964
F1 Score : 0.8946318313706485
Logistic Regression Metrics for Test Set
Accuracy Score:  0.87645
Precision Score:  0.8838623932934622
Recall Score:  0.8662391663744301
F1 Score : 0.8749620483756704


# Naive Bayes

In [62]:
from sklearn.naive_bayes import MultinomialNB

In [63]:
def multinomialNBClassifierModel(tfidf_train_data,y_train,tfidf_test_data,y_test):
    multinomialNBClassifier = MultinomialNB()
    multinomialNBClassifier.fit(tfidf_train_data,y_train)
    print_metrics_comma_det(y_train,multinomialNBClassifier.predict(tfidf_train_data),y_test,multinomialNBClassifier.predict(tfidf_test_data))
    print("Multinomial Naieve Bayes Metrics for Train Set")
    print_metrics(y_train,multinomialNBClassifier.predict(tfidf_train_data))
    print("Multinomial Naieve Bayes Metrics for Test Set")
    print_metrics(y_test,multinomialNBClassifier.predict(tfidf_test_data))
    

In [64]:
multinomialNBClassifierModel(tfidf_train_data,y_train,tfidf_test_data,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.87338125, 0.8851704852967707, 0.8582191181798873, 0.8714864787713855, 0.85085, 0.8677282043197225, 0.8272130654776815, 0.8469864067709669 
Multinomial Naieve Bayes Metrics for Train Set
Accuracy Score:  0.87338125
Precision Score:  0.8851704852967707
Recall Score:  0.8582191181798873
F1 Score : 0.8714864787713855
Multinomial Naieve Bayes Metrics for Test Set
Accuracy Score:  0.85085
Precision Score:  0.8677282043197225
Recall Score:  0.8272130654776815
F1 Score : 0.8469864067709669


## Additional Tests
1. Include Review Title into the review body and then preprocess the data

In [65]:
reviews_incl_heading = reviews_raw[['review_body','star_rating']]

In [66]:
reviews_incl_heading['review_body'] = reviews_raw['review_headline'] + " " + reviews_raw['review_body']

In [67]:
reviews_incl_heading.head()

Unnamed: 0,review_body,star_rating
0,Beautiful. Looks great on counter Beautiful. ...,5.0
1,Awesome & Self-ness I personally have 5 days s...,5.0
2,Fabulous and worth every penny Fabulous and wo...,5.0
3,Five Stars A must if you love garlic on tomato...,5.0
4,Better than sex Worth every penny! Buy one now...,5.0


In [68]:
def clean(dataset):
    dataset = dataset.dropna()
    dataset.loc[:,'label'] = np.where(dataset['star_rating'] <= 2,0,1)
    dataset = dataset[dataset['star_rating'] != 3.0 ]
    reviews_pos = dataset[dataset['label'] == 1].sample(100000,random_state = 101)
    reviews_neg = dataset[dataset['label'] == 0].sample(100000,random_state = 101)
    dataset = pd.concat([reviews_pos,reviews_neg],ignore_index = True)
    dataset = dataset.sample(frac = 1,random_state= 101).reset_index(drop = True)
    print(len(dataset))
    print(dataset.head())
    dataset['review_body'] = dataset['review_body'].str.lower()
    dataset['review_body'] = dataset['review_body'].apply(remove_url)
    dataset['review_body'] = dataset['review_body'].apply(lambda x : BeautifulSoup(x,'html.parser').get_text())
    dataset['review_body'] = dataset['review_body'].replace("[^a-z ']",'',regex=True)
    dataset['review_body'] = dataset['review_body'].apply(contractionfunction)
    dataset['review_body'] = dataset['review_body'].replace("[^a-z ]",'',regex=True)
    dataset['review_body'] = dataset['review_body'].replace('\s+', ' ', regex=True)
    print('Average character length of reviews after data cleaning:',dataset['review_body'].str.len().sum()/len(dataset['review_body']))
    return dataset

In [69]:
reviews_incl_heading = clean(reviews_incl_heading)

200000
                                         review_body  star_rating  label
0  Beauty and Durability rolled into one. I thoug...          5.0      1
1  Great napkin holder. Very pretty napkin holder...          5.0      1
2  Love It It was exactly what I was looking for....          5.0      1
3  Perfect Size & Shape I've waxed rhapsodic abou...          5.0      1
4  Great Tool My husband decided to start juicing...          5.0      1
Average character length of reviews after data cleaning: 330.664615


In [70]:
def preprocess(dataset):
    dataset['review_body_without_stopwords'] =  dataset['review_body'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    dataset['review_body_lemmatized'] =  dataset['review_body_without_stopwords'].apply(lemmatize_text)
    print('Average character length of reviews after data preprocessing:',dataset['review_body_lemmatized'].str.len().sum()/len(dataset['review_body_lemmatized']))
    return dataset

In [71]:
reviews_incl_heading = preprocess(reviews_incl_heading)

Average character length of reviews after data preprocessing: 207.432255


In [72]:
X_train,X_test,y_train,y_test = train_test_split(reviews_incl_heading['review_body_lemmatized'],reviews_incl_heading['label'],test_size = 0.2,random_state = 101)

In [73]:
def vectorize(X_train,X_test):
    tfidf_vetorizer = TfidfVectorizer()
    tfidf_train_data = tfidf_vetorizer.fit_transform(X_train)
    tfidf_test_data = tfidf_vetorizer.transform(X_test)
    return tfidf_train_data,tfidf_test_data


In [74]:
rev_train,rev_test = vectorize(X_train,X_test)

In [75]:
perceptron_model(rev_train,y_train,rev_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.9461375, 0.9439009534227504, 0.9487125026549557, 0.9463006118913798, 0.8949, 0.8940235058764692, 0.895546315314864, 0.8947842626889578 
Perceptron Metrics for Train Set
Accuracy Score:  0.9461375
Precision Score:  0.9439009534227504
Recall Score:  0.9487125026549557
F1 Score : 0.9463006118913798
Perceptron Metrics for Test Set
Accuracy Score:  0.8949
Precision Score:  0.8940235058764692
Recall Score:  0.895546315314864
F1 Score : 0.8947842626889578


In [76]:
linearSVCModel(rev_train,y_train,rev_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.9603375, 0.9610942173167649, 0.9595572158572696, 0.9603251015942483, 0.925675, 0.9264912633058847, 0.9244025850408296, 0.9254457456679288 
SVM Metrics for Train Set
Accuracy Score:  0.9603375
Precision Score:  0.9610942173167649
Recall Score:  0.9595572158572696
F1 Score : 0.9603251015942483
SVM Metrics for Test Set
Accuracy Score:  0.925675
Precision Score:  0.9264912633058847
Recall Score:  0.9244025850408296
F1 Score : 0.9254457456679288


In [77]:
logitClassifierModel(rev_train,y_train,rev_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.93736875, 0.9405088456602159, 0.93386973850248, 0.937177534119476, 0.92865, 0.9316679283371183, 0.9248534642552978, 0.9282481898632341 
Logistic Regression Metrics for Train Set
Accuracy Score:  0.93736875
Precision Score:  0.9405088456602159
Recall Score:  0.93386973850248
F1 Score : 0.937177534119476
Logistic Regression Metrics for Test Set
Accuracy Score:  0.92865
Precision Score:  0.9316679283371183
Recall Score:  0.9248534642552978
F1 Score : 0.9282481898632341


In [78]:
multinomialNBClassifierModel(rev_train,y_train,rev_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.9081375, 0.9189653624693828, 0.8953135346518573, 0.9069852801579568, 0.89335, 0.9051158948944298, 0.8783628074745754, 0.8915386962269907 
Multinomial Naieve Bayes Metrics for Train Set
Accuracy Score:  0.9081375
Precision Score:  0.9189653624693828
Recall Score:  0.8953135346518573
F1 Score : 0.9069852801579568
Multinomial Naieve Bayes Metrics for Test Set
Accuracy Score:  0.89335
Precision Score:  0.9051158948944298
Recall Score:  0.8783628074745754
F1 Score : 0.8915386962269907


## Additional Tests 
2. Remove features/terms that only occur 1 time or at most 2 times

In [79]:
reviews_reduced_terms = reviews_raw[['review_body','star_rating']]
reviews_reduced_terms['review_body'] = reviews_raw['review_headline'] + " " + reviews_raw['review_body']

In [80]:
reviews_reduced_terms = clean(reviews_reduced_terms)

200000
                                         review_body  star_rating  label
0  Beauty and Durability rolled into one. I thoug...          5.0      1
1  Great napkin holder. Very pretty napkin holder...          5.0      1
2  Love It It was exactly what I was looking for....          5.0      1
3  Perfect Size & Shape I've waxed rhapsodic abou...          5.0      1
4  Great Tool My husband decided to start juicing...          5.0      1
Average character length of reviews after data cleaning: 330.664615


In [81]:
def vectorize2(X_train,X_test):
    tfidf_vetorizer = TfidfVectorizer(min_df = 2)
    tfidf_train_data = tfidf_vetorizer.fit_transform(X_train)
    tfidf_test_data = tfidf_vetorizer.transform(X_test)
    return tfidf_train_data,tfidf_test_data


In [82]:
reviews_reduced_terms = preprocess(reviews_reduced_terms)

Average character length of reviews after data preprocessing: 207.432255


In [83]:
X_train,X_test,y_train,y_test = train_test_split(reviews_reduced_terms['review_body_lemmatized'],reviews_reduced_terms['label'],test_size = 0.2,random_state = 101)

In [84]:
red_train,red_test = vectorize2(X_train,X_test)

In [85]:
red_train.shape

(160000, 35746)

In [86]:
perceptron_model(red_train,y_train,red_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.9348125, 0.9258525125720368, 0.945401616711853, 0.9355249493101231, 0.897875, 0.8906111603188662, 0.9067181002955764, 0.8985924583571234 
Perceptron Metrics for Train Set
Accuracy Score:  0.9348125
Precision Score:  0.9258525125720368
Recall Score:  0.945401616711853
F1 Score : 0.9355249493101231
Perceptron Metrics for Test Set
Accuracy Score:  0.897875
Precision Score:  0.8906111603188662
Recall Score:  0.9067181002955764
F1 Score : 0.8985924583571234


In [87]:
linearSVCModel(red_train,y_train,red_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.95314375, 0.9540231323853394, 0.9522232911455666, 0.953122362077698, 0.925625, 0.9266981511254019, 0.9240519012073544, 0.9253731343283582 
SVM Metrics for Train Set
Accuracy Score:  0.95314375
Precision Score:  0.9540231323853394
Recall Score:  0.9522232911455666
F1 Score : 0.953122362077698
SVM Metrics for Test Set
Accuracy Score:  0.925625
Precision Score:  0.9266981511254019
Recall Score:  0.9240519012073544
F1 Score : 0.9253731343283582


In [88]:
logitClassifierModel(red_train,y_train,red_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.93619375, 0.9390498465871938, 0.9330076587663514, 0.9360190018989366, 0.9289, 0.9324440402203021, 0.9245027804218225, 0.928456429865164 
Logistic Regression Metrics for Train Set
Accuracy Score:  0.93619375
Precision Score:  0.9390498465871938
Recall Score:  0.9330076587663514
F1 Score : 0.9360190018989366
Logistic Regression Metrics for Test Set
Accuracy Score:  0.9289
Precision Score:  0.9324440402203021
Recall Score:  0.9245027804218225
F1 Score : 0.928456429865164


In [89]:
multinomialNBClassifierModel(red_train,y_train,red_test,y_test)

Accuracy Score, Precision Score, Recall Score, F1 Score Training Set followed by Test Set: 0.90353125, 0.9069389503388848, 0.8994490186034308, 0.903178456500875, 0.894475, 0.897394465764492, 0.8903361555032313, 0.8938513768389287 
Multinomial Naieve Bayes Metrics for Train Set
Accuracy Score:  0.90353125
Precision Score:  0.9069389503388848
Recall Score:  0.8994490186034308
F1 Score : 0.903178456500875
Multinomial Naieve Bayes Metrics for Test Set
Accuracy Score:  0.894475
Precision Score:  0.897394465764492
Recall Score:  0.8903361555032313
F1 Score : 0.8938513768389287
