In [171]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [172]:
df=pd.read_csv('Womens Clothing E-Commerce Reviews.csv')
df.head(40)

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Positive Feedback Count,Division Name,Department Name,Category,Recommended IND
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,0,Initmates,Intimate,Intimates,1
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,4,General,Dresses,Dresses,1
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,General,Dresses,Dresses,0
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,0,General Petite,Bottoms,Pants,1
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,6,General,Tops,Blouses,1
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,4,General,Dresses,Dresses,0
6,6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,General Petite,Tops,Knits,1
7,7,858,39,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...",4,4,General Petite,Tops,Knits,1
8,8,1077,24,Flattering,I love this dress. i usually get an xs but it ...,5,0,General,Dresses,Dresses,1
9,9,1077,34,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...",5,0,General,Dresses,Dresses,1


In [173]:
''' Preprocessing:
a. Find any null values are present or not, If present remove those data.
b. Remove the data that have less than 5 reviews.
c. Clean the data and remove the special characters and replace the 
contractions with its expansion. Convert the uppercase character to lower 
case. Also, remove the punctuations.
'''
df=df.drop(columns='Unnamed: 0')

In [174]:
print(df.isna().sum())

Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Positive Feedback Count       0
Division Name                14
Department Name              14
Category                     14
Recommended IND               0
dtype: int64


In [175]:
df=df.dropna()

In [176]:
n=df[['Clothing ID']]
df=df[n.replace(n.apply(pd.Series.value_counts)).gt(5).all(1)]

In [177]:
contractions = {
"a'ight":"alright",
"ain't":"are not",
"amn't":"am not",
"aren't":"are not",
"can't":"cannot",
"'cause": "because",
"could've":"could have",
"couldn't":"could not",
"couldn't've":"could not have",
"daren't":"dare not",
"daresn't":"dare not",
"dasn't":"dare not",
"didn't":"did not",
"doesn't":"does not",
"don't":"do not",
"everybody's":"everybody is",
"everyone's":"everyone is",
"giv'n":"given",
"gonna":"going to",
"gon't":"go not", 
"gotta":"got to",
"hadn't":"had not",
"had've":"had have",
"hasn't":"has not",
"haven't":"have not",
"he'd":"he had", 
"he'll":"he will",
"he's":"he is",
"here's":"here is",
"how'd":"how did",
"how'll":"how will",
"how're":"how are",
"how's":"how is",
"I'd":"I had",
"I'd've":"I would have",
"I'd'nt":"I would not",
"I'd'nt've":"I would not have",
"I'll":"I will",
"I'm":"I am",
"I've":"I have",
"isn't":"is not",
"it'd":"it would",
"it'll":"it will",
"it's":"it is",
"let's":"let us",
"ma'am":"madam",
"mayn't":"may not",
"may've":"may have",
"mightn't":"might not",
"might've":"might have",
"mustn't":"must not",
"mustn't've":"must not have",
"must've":"must have",
"needn't":"need not",
"needn't've":"need not have",
"o'clock":"of the clock",
"oughtn't":"ought not",
"oughtn't've":"ought not have",
"shan't":"shall not",
"she'd":"she would",
"she'll":"she will",
"she's":"she is",
"should've":"should have",
"shouldn't":"should not",
"shouldn't've":"should not have",
"somebody's":"somebody is",
"someone's":"someone is",
"something's":"something is",
"so're":"so are",
"so’s":"so is",
"so’ve":"so have",
"that'll":"that will",
"that're":"that are",
"that's":"that is",
"that'd":"that would",
"there'd":"there would",
"there'll":"there will",
"there're":"there are",
"there's":"there is",
"these're":"these are",
"these've":"these have",
"they'd":"they would",
"they'll":"they will",
"they're":"they are",
"they've":"they have",
"this's":"this is",
"those're":"those are",
"those've":"those have",
"to've":"to have",
"wasn't":"was not",
"we'd":"we would",
"we'd've":"we would have",
"we'll":"we will",
"we're":"we are",
"we've":"we have",
"weren't":"were not",
"what'd":"what did",
"what'll":"what will",
"what're":"what are",
"what's":"what is",
"what've":"what have",
"when's":"when is",
"where'd":"where did",
"where'll":"where will",
"where're":"where are",
"where's":"where is",
"where've":"where have",
"which'd":"which would",
"which'll":"which will",
"which're":"which are",
"which's":"which is",
"which've":"which have",
"who'd":"who would",
"who'd've":"who would have",
"who'll":"who will",
"who're":"who are",
"who's":"who is",
"who've":"who have",
"why'd":"why did",
"why're":"why are",
"why's":"why is",
"won't":"will not",
"would've":"would have",
"wouldn't":"would not",
"wouldn't've":"would not have",
"y'at":"you at",
"yes’m":"yes madam",
"you'd":"you would",
"you'll":"you will",
"you're":"you are",
"you've":"you have"}


In [178]:
def cont_to_exp(x):
    if type(x) is str:
        x=x.lower()
        x = x.replace('\\','')
        x=x.replace('.','')
        x=x.replace(',','')
        x=x.replace('-','')
        x=x.replace('!','')
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x

In [179]:
df['Review Text'] = df['Review Text'].apply(lambda x:cont_to_exp(x))
df['Review Text']

2        i had such high hopes for this dress and reall...
3        i love love love this jumpsuit it is fun flirt...
5        i love tracy reese dresses but this one is not...
6        i aded this in my basket at hte last mintue to...
7        i ordered this in carbon for store pick up and...
                               ...                        
23481    i was very happy to snag this dress at such a ...
23482    it reminds me of maternity clothes soft stretc...
23483    this fit well but the top was very see through...
23484    i bought this dress for a wedding i have this ...
23485    this dress in a lovely platinum is feminine an...
Name: Review Text, Length: 18206, dtype: object

In [180]:
df.drop(columns=["Clothing ID"])

Unnamed: 0,Age,Title,Review Text,Rating,Positive Feedback Count,Division Name,Department Name,Category,Recommended IND
2,60,Some major design flaws,i had such high hopes for this dress and reall...,3,0,General,Dresses,Dresses,0
3,50,My favorite buy!,i love love love this jumpsuit it is fun flirt...,5,0,General Petite,Bottoms,Pants,1
5,49,Not for the very petite,i love tracy reese dresses but this one is not...,2,4,General,Dresses,Dresses,0
6,39,Cagrcoal shimmer fun,i aded this in my basket at hte last mintue to...,5,1,General Petite,Tops,Knits,1
7,39,"Shimmer, surprisingly goes with lots",i ordered this in carbon for store pick up and...,4,4,General Petite,Tops,Knits,1
...,...,...,...,...,...,...,...,...,...
23481,34,Great dress for many occasions,i was very happy to snag this dress at such a ...,5,0,General Petite,Dresses,Dresses,1
23482,48,Wish it was made of cotton,it reminds me of maternity clothes soft stretc...,3,0,General Petite,Tops,Knits,1
23483,31,"Cute, but see through",this fit well but the top was very see through...,3,1,General Petite,Dresses,Dresses,0
23484,28,"Very cute dress, perfect for summer parties an...",i bought this dress for a wedding i have this ...,3,2,General,Dresses,Dresses,1


In [181]:
'''Separate the columns into dependent and independent variables (or features and 
labels). Then you split those variables into train and test sets (80:20).'''
x = df["Review Text"]
y = df["Recommended IND"].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [182]:
''' Apply the Naïve Bayes Classification Algorithm on Sentiment category to 
predict if item is recommended'''
vec = CountVectorizer(stop_words='english')
x = vec.fit_transform(x).toarray()
x_test = vec.transform(x_test).toarray()

In [183]:
model = MultinomialNB()
model.fit(x, y)

MultinomialNB()

In [184]:
model.score(x_test, y_test)

0.9214717188358045

In [185]:
#Tabulate accuracy in terms of precision, recall and F1 score
y_pred=model.predict(x_test)
c_m=confusion_matrix(y_test,y_pred)
print(c_m)


[[ 489  168]
 [ 118 2867]]


In [186]:
precision= c_m[1][1]/(c_m[1][1]+c_m[0][1])
print(precision)

0.9446457990115321


In [187]:
recall=c_m[1][1]/(c_m[1][1]+c_m[1][0])
print(recall)

0.9604690117252931


In [188]:
f1_score=2*((precision*recall)/(precision+recall))
print(f1_score)

0.9524916943521594
