In [1]:
#data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd

#To avoid warnings
import warnings                    
warnings.filterwarnings("ignore")

#FOR NLP
import string
from nltk.corpus import stopwords 

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer #SnowballStemmer is a stemming algorithm
from nltk.tokenize import RegexpTokenizer

#FOR SPLITTING DATASET INTO TRAIN AND TEST SETS
from sklearn.model_selection import train_test_split

#TO HANDLE IMBALANCE 
from imblearn.over_sampling import SMOTE

#MACHINE LEARNING MODELS
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC  
from sklearn.neighbors import KNeighborsClassifier

#TO calculate F1 score of the model's accuracy on dataset
from sklearn.metrics import f1_score




In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/hp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data=pd.read_csv("/home/hp/ML PROJECT/PROJECT2-NLP/archive/Womens Clothing E-Commerce Reviews.csv")

# EDA

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [5]:
data.tail()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
23481,23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses
23482,23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits
23483,23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses
23484,23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses
23485,23485,1104,52,Please make more like this one!,This dress in a lovely platinum is feminine an...,5,1,22,General Petite,Dresses,Dresses


In [6]:
data.shape

(23486, 11)

In [7]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
count,23486.0,23486.0,23486.0,23486.0,23486.0,23486.0
mean,11742.5,918.118709,43.198544,4.196032,0.822362,2.535936
std,6779.968547,203.29898,12.279544,1.110031,0.382216,5.702202
min,0.0,0.0,18.0,1.0,0.0,0.0
25%,5871.25,861.0,34.0,4.0,1.0,0.0
50%,11742.5,936.0,41.0,5.0,1.0,1.0
75%,17613.75,1078.0,52.0,5.0,1.0,3.0
max,23485.0,1205.0,99.0,5.0,1.0,122.0


In [8]:
data.dtypes

Unnamed: 0                  int64
Clothing ID                 int64
Age                         int64
Title                      object
Review Text                object
Rating                      int64
Recommended IND             int64
Positive Feedback Count     int64
Division Name              object
Department Name            object
Class Name                 object
dtype: object

# Checking For Missing Values

In [9]:
data.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

# Selecting Required Fields

In [10]:
text_df = data[['Title', 'Review Text', 'Recommended IND']]
text_df.head()


Unnamed: 0,Title,Review Text,Recommended IND
0,,Absolutely wonderful - silky and sexy and comf...,1
1,,Love this dress! it's sooo pretty. i happene...,1
2,Some major design flaws,I had such high hopes for this dress and reall...,0
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",1
4,Flattering shirt,This shirt is very flattering to all due to th...,1


In [11]:
text_df['Review'] = text_df['Title'] + ' ' + text_df['Review Text']
text_df = text_df.drop(labels=['Title','Review Text'] , axis=1)
text_df.head()

Unnamed: 0,Recommended IND,Review
0,1,
1,1,
2,0,Some major design flaws I had such high hopes ...
3,1,"My favorite buy! I love, love, love this jumps..."
4,1,Flattering shirt This shirt is very flattering...


# Dropping Rows With No Reviews

In [12]:
text_df.Review.isna().sum()

3811

In [13]:
row_index = text_df[text_df.Review.isnull()].index.values
text_df = text_df.drop(row_index, axis = 0)
text_df.shape

(19675, 2)

In [14]:

text_df.head(10)

Unnamed: 0,Recommended IND,Review
2,0,Some major design flaws I had such high hopes ...
3,1,"My favorite buy! I love, love, love this jumps..."
4,1,Flattering shirt This shirt is very flattering...
5,0,Not for the very petite I love tracy reese dre...
6,1,Cagrcoal shimmer fun I aded this in my basket ...
7,1,"Shimmer, surprisingly goes with lots I ordered..."
8,1,Flattering I love this dress. i usually get an...
9,1,"Such a fun dress! I'm 5""5' and 125 lbs. i orde..."
10,0,Dress looks like it's made of cheap material D...
12,1,Perfect!!! More and more i find myself reliant...


In [15]:
Review=text_df.Review

In [16]:
Review.head(10)

2     Some major design flaws I had such high hopes ...
3     My favorite buy! I love, love, love this jumps...
4     Flattering shirt This shirt is very flattering...
5     Not for the very petite I love tracy reese dre...
6     Cagrcoal shimmer fun I aded this in my basket ...
7     Shimmer, surprisingly goes with lots I ordered...
8     Flattering I love this dress. i usually get an...
9     Such a fun dress! I'm 5"5' and 125 lbs. i orde...
10    Dress looks like it's made of cheap material D...
12    Perfect!!! More and more i find myself reliant...
Name: Review, dtype: object

# NATURAL LANGUAGE PROCESSING

# Tokenization

In [17]:
tk = RegexpTokenizer(pattern=r'[a-zA-Z\']+') 
Review = Review.apply(lambda x: tk.tokenize(x)).apply(lambda x: ' '.join(x))

In [18]:
Review.head(10)

2     Some major design flaws I had such high hopes ...
3     My favorite buy I love love love this jumpsuit...
4     Flattering shirt This shirt is very flattering...
5     Not for the very petite I love tracy reese dre...
6     Cagrcoal shimmer fun I aded this in my basket ...
7     Shimmer surprisingly goes with lots I ordered ...
8     Flattering I love this dress i usually get an ...
9     Such a fun dress I'm ' and lbs i ordered the s...
10    Dress looks like it's made of cheap material D...
12    Perfect More and more i find myself reliant on...
Name: Review, dtype: object

# Removing Punctuations

In [19]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [20]:
def punctuation_removal(char):
    clean_list = [i for i in char if i not in string.punctuation]
    clean_str = ''.join(clean_list)
    return clean_str


In [21]:
Review = Review.apply(punctuation_removal)
Review.head(10)

2     Some major design flaws I had such high hopes ...
3     My favorite buy I love love love this jumpsuit...
4     Flattering shirt This shirt is very flattering...
5     Not for the very petite I love tracy reese dre...
6     Cagrcoal shimmer fun I aded this in my basket ...
7     Shimmer surprisingly goes with lots I ordered ...
8     Flattering I love this dress i usually get an ...
9     Such a fun dress Im  and lbs i ordered the s p...
10    Dress looks like its made of cheap material Dr...
12    Perfect More and more i find myself reliant on...
Name: Review, dtype: object

# Removing Stop Words

In [22]:
stop= stopwords.words('english')

In [23]:
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [24]:
Review= Review.apply(lambda x: [i.lower() for i in word_tokenize(x) if i.lower() not in stop]).apply(lambda x: ' '.join(x))

In [25]:
Review

2        major design flaws high hopes dress really wan...
3        favorite buy love love love jumpsuit fun flirt...
4        flattering shirt shirt flattering due adjustab...
5        petite love tracy reese dresses one petite fee...
6        cagrcoal shimmer fun aded basket hte last mint...
                               ...                        
23481    great dress many occasions happy snag dress gr...
23482    wish made cotton reminds maternity clothes sof...
23483    cute see fit well top see never would worked i...
23484    cute dress perfect summer parties bought dress...
23485    please make like one dress lovely platinum fem...
Name: Review, Length: 19675, dtype: object

# Stemming

In [26]:

stemmer = SnowballStemmer('english') 
Review = Review.apply(lambda x: [stemmer.stem(i.lower()) for i in tk.tokenize(x)]).apply(lambda x: ' '.join(x))

In [27]:
Review

2        major design flaw high hope dress realli want ...
3        favorit buy love love love jumpsuit fun flirti...
4        flatter shirt shirt flatter due adjust front t...
5        petit love traci rees dress one petit feet tal...
6        cagrcoal shimmer fun ade basket hte last mintu...
                               ...                        
23481    great dress mani occas happi snag dress great ...
23482    wish made cotton remind matern cloth soft stre...
23483    cute see fit well top see never would work im ...
23484    cute dress perfect summer parti bought dress w...
23485    pleas make like one dress love platinum femini...
Name: Review, Length: 19675, dtype: object

# TFIDF Vectorization

In [28]:
vec = TfidfVectorizer(stop_words = stop)
data_vec = vec.fit_transform(Review)

In [55]:
data_vec

<19675x9241 sparse matrix of type '<class 'numpy.float64'>'
	with 549627 stored elements in Compressed Sparse Row format>

In [58]:
data_vec.shape

(19675, 9241)

# SPLITTING INTO TRAIN AND TEST DATASETS

In [29]:
y = text_df['Recommended IND']
y = y.values

In [30]:
X_train, X_test, y_train, y_test = train_test_split(data_vec, y, test_size = 0.2, stratify = y,random_state=35)

# PERFORMING SMOTE TO HANDLE IMBALANCE IN THE TARGET VARIABLE

In [31]:
text_df['Recommended IND'].value_counts()

1    16100
0     3575
Name: Recommended IND, dtype: int64

In [32]:

smote = SMOTE()
X_res, y_res = smote.fit_resample(X_train, y_train)

# LOGISTIC REGRESSION

In [33]:
model = LogisticRegression()
model.fit(X_res, y_res)
y_pred = model.predict(X_test)

In [34]:
f1_log=f1_score(y_test, y_pred)
f1_log

0.9343182913120284

In [35]:
y_pred

array([1, 1, 0, ..., 1, 1, 1])

In [36]:
y_test
   

array([1, 1, 1, ..., 1, 1, 1])

# RANDOM FOREST CLASSIFIER

In [37]:
model2 = RandomForestClassifier()
model2.fit(X_res, y_res)
y_pred2 = model2.predict(X_test)

In [38]:
f1_RF=f1_score(y_test, y_pred2)
f1_RF

0.9293082524271845

In [39]:
y_pred2

array([1, 1, 0, ..., 1, 1, 1])

In [40]:
y_test

array([1, 1, 1, ..., 1, 1, 1])

# XGB CLASSIFIER

In [41]:
model3 = XGBClassifier()
model3.fit(X_res, y_res)
y_pred3 = model3.predict(X_test)

In [42]:
f1_xgb=f1_score(y_test, y_pred3)
f1_xgb

0.9307751343054489

In [43]:
y_pred3

array([1, 1, 0, ..., 1, 1, 1])

In [44]:
y_test

array([1, 1, 1, ..., 1, 1, 1])

# SUPPORT VECTOR CLASSIFIER

In [45]:
model4= SVC()
model4.fit(X_res,y_res)
y_pred4 = model4.predict(X_test)

In [46]:
f1_svm=f1_score(y_test, y_pred4)
f1_svm

0.9456752655538695

In [47]:
y_pred4

array([1, 1, 0, ..., 1, 1, 1])

In [48]:
y_test

array([1, 1, 1, ..., 1, 1, 1])

# K-NEAREST NEIGHBOUR¶


In [49]:
n_classifier = KNeighborsClassifier(n_neighbors=2)

In [50]:
n_classifier.fit(X_train,y_train)
y_pred5=n_classifier.predict(X_test)

In [51]:
f1_knn=f1_score(y_test, y_pred5)
f1_knn

0.871677808576553

In [52]:
y_pred5

array([1, 1, 1, ..., 1, 1, 1])

In [53]:
y_test

array([1, 1, 1, ..., 1, 1, 1])

# Dictonary to compare F1 score given by each model

In [56]:
f1score_dict   ={'f1_score for Logistic Regression':f1_log,'f1_score for Random Forest Classifier':f1_RF,'f1_score for XGB Classifier':f1_xgb,'f1_score for Support Vector Classifier':f1_svm,
                'f1_score for K-Nearest Neighbour':f1_knn}
f1score_dict

{'f1_score for Logistic Regression': 0.9343182913120284,
 'f1_score for Random Forest Classifier': 0.9293082524271845,
 'f1_score for XGB Classifier': 0.9307751343054489,
 'f1_score for Support Vector Classifier': 0.9456752655538695,
 'f1_score for K-Nearest Neighbour': 0.871677808576553}

# Support Vector Classifier Gave the highest f1 score 