In [1]:
import numpy as np
import pandas as pd

In [2]:
import re
import nltk
import random
import string

In [3]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

In [4]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [5]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from warnings import filterwarnings
filterwarnings("ignore")

In [8]:
df = pd.read_csv('customer_reviews.csv', index_col=0)
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23486 entries, 0 to 23485
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Clothing ID              23486 non-null  int64 
 1   Age                      23486 non-null  int64 
 2   Title                    19676 non-null  object
 3   Review Text              22641 non-null  object
 4   Rating                   23486 non-null  int64 
 5   Recommended IND          23486 non-null  int64 
 6   Positive Feedback Count  23486 non-null  int64 
 7   Division Name            23472 non-null  object
 8   Department Name          23472 non-null  object
 9   Class Name               23472 non-null  object
dtypes: int64(5), object(5)
memory usage: 2.0+ MB


### Approach-01

Using NLTK's Naive Bayes Classifier.

In [10]:
# getting the necessary data

data = df[['Review Text','Rating']]
data.head()

Unnamed: 0,Review Text,Rating
0,Absolutely wonderful - silky and sexy and comf...,4
1,Love this dress! it's sooo pretty. i happene...,5
2,I had such high hopes for this dress and reall...,3
3,"I love, love, love this jumpsuit. it's fun, fl...",5
4,This shirt is very flattering to all due to th...,5


In [11]:
data.isnull().sum()

Review Text    845
Rating           0
dtype: int64

In [12]:
# here revies are missing, they are the primary field in nlp

data.dropna(inplace=True)

In [13]:
data.isnull().sum()

Review Text    0
Rating         0
dtype: int64

In [14]:
# resetting the index

data.reset_index(drop=True, inplace=True)

In [15]:
# renaming the columns
data.columns = ['review','rating']

In [16]:
data['rating'].describe()

count    22641.000000
mean         4.183561
std          1.115762
min          1.000000
25%          4.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: rating, dtype: float64

In [17]:
# checking the proportion of rating 3 and below
len(data[data['rating']<=3])/len(data)

0.22936266065986485

In [18]:
# let's create the label field
sentiment = []

for i in range(0,len(data)):
    if data['rating'][i] > 3:
        sentiment.append('positive')
    else:
        sentiment.append('negative')

In [19]:
# attaching it with the data
data['label'] = sentiment

In [20]:
# removing the rating column
data.drop('rating', axis=1, inplace=True)

In [21]:
# # lower case conversion
# data['review'] = data['review'].str.lower()
# print(data.head())

In [22]:
# replacing some common words
data['review'] = data['review'].str.replace("won't",'will not').str.replace("can't",'can not')
data['review'] = data['review'].str.replace("'s",' is').str.replace("'ve",' have').str.replace("i'm",'i am')
data['review'] = data['review'].str.replace("n't",' not').str.replace("'ll",' will').str.replace("'re",' are')
# data['review'] = data['review'].str.replace("won't",'will not').str.replace("can't",'can not')

In [23]:
print(data.head())

                                              review     label
0  Absolutely wonderful - silky and sexy and comf...  positive
1  Love this dress!  it is sooo pretty.  i happen...  positive
2  I had such high hopes for this dress and reall...  negative
3  I love, love, love this jumpsuit. it is fun, f...  positive
4  This shirt is very flattering to all due to th...  positive


In [24]:
# removing some special characters and numbers

#\w: Returns a match where the string contains any word characters 
     #(characters from a to Z, digits from 0-9, and the underscore _ character)
#\s: Returns a match where the string contains a white space character.
#[^]: Returns a match for any character EXCEPT what is written after it.

data['review'] = data['review'].str.replace('[^\w\s]','').str.replace('[1-9]','')
print(data.head())

                                              review     label
0  Absolutely wonderful  silky and sexy and comfo...  positive
1  Love this dress  it is sooo pretty  i happened...  positive
2  I had such high hopes for this dress and reall...  negative
3  I love love love this jumpsuit it is fun flirt...  positive
4  This shirt is very flattering to all due to th...  positive


In [25]:
# # lower case conversion
data['review'] = data['review'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
print(data.head())

                                              review     label
0  absolutely wonderful silky and sexy and comfor...  positive
1  love this dress it is sooo pretty i happened t...  positive
2  i had such high hopes for this dress and reall...  negative
3  i love love love this jumpsuit it is fun flirt...  positive
4  this shirt is very flattering to all due to th...  positive


In [26]:
# shuffling the data
data = data.sample(frac=1, random_state=0).reset_index().drop("index",axis=1)

In [27]:
# computing word frequency
all_words = [x for x in pd.Series(' '.join(data['review']).split())]

In [28]:
# most common words with its frequency (including stopwords)
nltk.FreqDist(all_words).most_common(10)

[('the', 75907),
 ('i', 65625),
 ('it', 48903),
 ('and', 48839),
 ('a', 42897),
 ('is', 38948),
 ('this', 25682),
 ('to', 24440),
 ('in', 20590),
 ('not', 17879)]

In [29]:
# we have huge amount of stop words and they need to be removed
stopwords = nltk.corpus.stopwords.words('english') +list(string.punctuation)
all_clean_words = [word for word in all_words if word not in stopwords]

In [30]:
# most common words with its frequency (without stopwords)
nltk.FreqDist(all_clean_words).most_common(10)

[('dress', 10467),
 ('love', 8916),
 ('size', 8691),
 ('top', 7327),
 ('fit', 7246),
 ('like', 6992),
 ('wear', 6414),
 ('great', 6076),
 ('would', 5424),
 ('fabric', 4760)]

In [31]:
# deciding the word features (taling most common 2500 words)
word_features = [item[0] for item in nltk.FreqDist(all_clean_words).most_common(2500)]
word_features

['dress',
 'love',
 'size',
 'top',
 'fit',
 'like',
 'wear',
 'great',
 'would',
 'fabric',
 'color',
 'small',
 'look',
 'really',
 'ordered',
 'little',
 'perfect',
 'one',
 'flattering',
 'soft',
 'well',
 'back',
 'comfortable',
 'cute',
 'nice',
 'bought',
 'beautiful',
 'bit',
 'looks',
 'fits',
 '0',
 'large',
 'material',
 'much',
 'length',
 'also',
 'shirt',
 'sweater',
 'got',
 'long',
 'could',
 'jeans',
 'colors',
 'petite',
 'waist',
 'quality',
 'medium',
 'pretty',
 'think',
 'even',
 'retailer',
 'xs',
 'work',
 'tried',
 'usually',
 'skirt',
 'store',
 'good',
 'get',
 'pants',
 'way',
 'short',
 'big',
 'made',
 'cut',
 'still',
 'right',
 'black',
 'super',
 'runs',
 'true',
 'online',
 'style',
 'lbs',
 'see',
 'sleeves',
 'purchased',
 'wearing',
 'design',
 'however',
 'feel',
 'summer',
 'white',
 'enough',
 'tight',
 'perfectly',
 'model',
 'go',
 'definitely',
 'looked',
 'front',
 'though',
 'sale',
 'price',
 'loved',
 'person',
 'better',
 'blue',
 'first'

In [32]:
# making copy of the data
data_01 = data.copy()

In [33]:
# frame features and label
frame = data.copy()
frame.columns = ['feature','label']

frame['feature'] = frame.apply(lambda row: nltk.word_tokenize(str(row['feature'])), axis=1)
print(frame.head())

                                             feature     label
0  [beautiful, and, very, versatile, dress, extre...  positive
1  [i, do, not, have, anything, like, it, in, my,...  positive
2  [this, is, a, cute, dress, but, the, buttons, ...  negative
3  [i, bought, this, shirt, at, the, store, and, ...  positive
4  [i, love, the, paisley, print, very, soft, and...  positive


In [34]:
# deciding the document features
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [35]:
# creating the featuresets

featuresets = [(document_features(feature), label) for index, (feature, label) in frame.iterrows()]

In [36]:
# checking a featureset of 5th review
featuresets[5]

({'contains(dress)': False,
  'contains(love)': False,
  'contains(size)': False,
  'contains(top)': False,
  'contains(fit)': False,
  'contains(like)': True,
  'contains(wear)': False,
  'contains(great)': False,
  'contains(would)': False,
  'contains(fabric)': True,
  'contains(color)': False,
  'contains(small)': False,
  'contains(look)': True,
  'contains(really)': False,
  'contains(ordered)': False,
  'contains(little)': True,
  'contains(perfect)': False,
  'contains(one)': True,
  'contains(flattering)': False,
  'contains(soft)': False,
  'contains(well)': False,
  'contains(back)': False,
  'contains(comfortable)': False,
  'contains(cute)': False,
  'contains(nice)': False,
  'contains(bought)': False,
  'contains(beautiful)': False,
  'contains(bit)': True,
  'contains(looks)': False,
  'contains(fits)': False,
  'contains(0)': False,
  'contains(large)': False,
  'contains(material)': False,
  'contains(much)': False,
  'contains(length)': False,
  'contains(also)': Tru

In [37]:
# training the naive bayes classifier

tr_data = featuresets[0:int(len(featuresets)*0.7)] 
ts_data = featuresets[int(len(featuresets)*0.7):]

classifier = nltk.NaiveBayesClassifier.train(tr_data)

In [38]:
# model validation

score = nltk.classify.accuracy(classifier, ts_data)
print('Model Accuracy: {}'.format(round(score,4)))

Model Accuracy: 0.8669


------------

### Approach-02 

Using ML Naive Bayes Classifier.

**TF-IDF Vectorizer**

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
print(data.head())

                                              review     label
0  beautiful and very versatile dress extremely f...  positive
1  i do not have anything like it in my closet re...  positive
2  this is a cute dress but the buttons are tiny ...  negative
3  i bought this shirt at the store and after goi...  positive
4  i love the paisley print very soft and comfy f...  positive


In [41]:
# we need features and labels
features = data['review'].values
label = data['label']

In [42]:
# using tf-idf vectorizer
vectorizer = TfidfVectorizer(max_features=5000, min_df=0.05, max_df=0.8)
vectorized_features = vectorizer.fit_transform(features).toarray()

In [43]:
pd.DataFrame(vectorized_features, columns=vectorizer.get_feature_names()).head(10).iloc[:10]

Unnamed: 0,about,all,also,am,an,are,as,at,back,be,...,what,when,which,white,will,with,work,would,xs,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.127398,0.196076,0.152183,0.0,0.0,0.0,0.153007,...,0.0,0.0,0.0,0.0,0.179583,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183706,0.0,0.0,...,0.0,0.0,0.21699,0.0,0.0,0.127784,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.134522,0.13778,0.0,0.0,0.0,0.108069,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.143851,0.086483,0.0,0.0,0.112832,0.240953,0.0,0.0,...,0.0,0.0,0.142305,0.0,0.0,0.251406,0.0,0.0,0.15892,0.0
7,0.0,0.0,0.0,0.0,0.0,0.185015,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.150081,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.189967,0.0,0.0,0.335611,0.0,0.0,0.0,0.159345
9,0.0,0.0,0.0,0.125642,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [45]:
# train, test splits
x_train, x_test, y_train, y_test = train_test_split(vectorized_features, label, test_size=0.3, random_state=0)

In [46]:
model = GaussianNB().fit(x_train,y_train)

tr_score = model.score(x_train,y_train)
ts_score = model.score(x_test,y_test)

print('Train Score: {}'.format(round(tr_score,4)))
print('Test Score : {}'.format(round(ts_score,4)))

Train Score: 0.7599
Test Score : 0.7522


----------