In [429]:
import pandas as pd
import numpy as np

In [430]:
data = pd.read_csv(r'..\datasets\Womens Clothing E-Commerce Reviews.csv')

In [431]:
data.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

Keeping only required columns

In [432]:
keep = ['Age', 'Title', 'Review Text', 'Rating','Recommended IND', 'Positive Feedback Count','Department Name', 'Class Name']
data = data[keep]

In [433]:
data.head()

Unnamed: 0,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Department Name,Class Name
0,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Intimate,Intimates
1,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,Dresses,Dresses
2,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,Dresses,Dresses
3,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,Bottoms,Pants
4,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,Tops,Blouses


<h3>Analyzing correlations among key features to establish criteria for accurate sentiment labeling</h3>

In [434]:
data['Positive Feedback Count'].corr(data['Rating'])

np.float64(-0.06496050712064393)

<h4>this indicates there is no direct correlation between them. so analyzing values of this feature seperately</h4>

In [435]:
print(data['Positive Feedback Count'].describe())

count    23486.000000
mean         2.535936
std          5.702202
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max        122.000000
Name: Positive Feedback Count, dtype: float64


<h4>similarly checking correlation between recommended ind and rating</h4>

In [436]:
data['Recommended IND'].corr(data['Rating'])

np.float64(0.792336287928579)

<h4> this incidates a proper correlation between them. Now based on collected information, we create and label column sentiment for supervised learning</h4>

In [437]:
def map_sentiment(row):
    rating = row["Rating"]
    recommended_ind = row["Recommended IND"]
    positive_count = row["Positive Feedback Count"]
    
    if (rating >= 4 and recommended_ind >=1) or positive_count>=3: 
        #since postive feedback is used as solo indicator , I took the value closer to mean for more safety
        return "Positive"
    elif rating <= 2 and recommended_ind == 0:
        return "Negative"
    else:
        return "Neutral"

In [438]:
data['Sentiment'] = data.apply(map_sentiment,axis=1)

In [439]:
data['Sentiment'].value_counts()

Sentiment
Positive    19765
Neutral      2215
Negative     1506
Name: count, dtype: int64

<h4>handling missing values</h4>

In [440]:
data.isna().sum()

Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Department Name              14
Class Name                   14
Sentiment                     0
dtype: int64

In [441]:
data.dropna(subset=['Title', 'Review Text'], how='all')

Unnamed: 0,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Department Name,Class Name,Sentiment
0,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Intimate,Intimates,Positive
1,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,Dresses,Dresses,Positive
2,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,Dresses,Dresses,Neutral
3,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,Bottoms,Pants,Positive
4,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,Tops,Blouses,Positive
...,...,...,...,...,...,...,...,...,...
23481,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,Dresses,Dresses,Positive
23482,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,Tops,Knits,Neutral
23483,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,Dresses,Dresses,Neutral
23484,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,Dresses,Dresses,Neutral


In [442]:
data.isna().sum()

Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Department Name              14
Class Name                   14
Sentiment                     0
dtype: int64

In [443]:
m = data[['Title', 'Review Text']].isna().all(axis=1)

In [444]:
data.drop(index=data[m].index, inplace=True)

In [445]:
data.loc[:,'Full Review'] = data[['Title','Review Text']].fillna('').astype(str).agg(' '.join,axis=1).str.strip()

In [446]:
data.drop(['Title', 'Review Text'], axis=1, inplace=True)

In [447]:
data[data['Full Review'].str.strip() == ""].shape[0]

0

In [448]:
data.isna().sum()

Age                         0
Rating                      0
Recommended IND             0
Positive Feedback Count     0
Department Name            13
Class Name                 13
Sentiment                   0
Full Review                 0
dtype: int64

In [449]:
data.fillna({'Department Name' :'Unknown'}, inplace=True)
data.fillna({'Class Name':'Unknown'}, inplace=True)
data.isna().sum()

Age                        0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Department Name            0
Class Name                 0
Sentiment                  0
Full Review                0
dtype: int64

<h3>check for duplicates</h3>

In [450]:
data.duplicated().sum()

np.int64(2)

In [451]:
data.drop_duplicates()

Unnamed: 0,Age,Rating,Recommended IND,Positive Feedback Count,Department Name,Class Name,Sentiment,Full Review
0,33,4,1,0,Intimate,Intimates,Positive,Absolutely wonderful - silky and sexy and comf...
1,34,5,1,4,Dresses,Dresses,Positive,Love this dress! it's sooo pretty. i happene...
2,60,3,0,0,Dresses,Dresses,Neutral,Some major design flaws I had such high hopes ...
3,50,5,1,0,Bottoms,Pants,Positive,"My favorite buy! I love, love, love this jumps..."
4,47,5,1,6,Tops,Blouses,Positive,Flattering shirt This shirt is very flattering...
...,...,...,...,...,...,...,...,...
23481,34,5,1,0,Dresses,Dresses,Positive,Great dress for many occasions I was very happ...
23482,48,3,1,0,Tops,Knits,Neutral,Wish it was made of cotton It reminds me of ma...
23483,31,3,0,1,Dresses,Dresses,Neutral,"Cute, but see through This fit well, but the t..."
23484,28,3,1,2,Dresses,Dresses,Neutral,"Very cute dress, perfect for summer parties an..."


<h3>data Cleaning</h3>

In [452]:
import re

In [453]:
def clean(text):
    text = re.sub(r'<.*?>','',text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'\d+','',text)
    return text.lower()

In [454]:
data["Full Review"]=data["Full Review"].apply(clean)

<h3>Tokenization</h3>

In [455]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [456]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [457]:
data["Review Tokenize"] = data["Full Review"].apply(word_tokenize)

<h3>remove stopwords</h3>

In [458]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [459]:
stop_words= set(stopwords.words('english'))

In [460]:
def remove_stopwords(text):
    wrd=[wrd for wrd in text if wrd not in stop_words]
    return wrd

In [461]:
data['review_sw_removed']=data["Review Tokenize"].apply(remove_stopwords)

In [462]:
data.head()

Unnamed: 0,Age,Rating,Recommended IND,Positive Feedback Count,Department Name,Class Name,Sentiment,Full Review,Review Tokenize,review_sw_removed
0,33,4,1,0,Intimate,Intimates,Positive,absolutely wonderful silky and sexy and comfo...,"[absolutely, wonderful, silky, and, sexy, and,...","[absolutely, wonderful, silky, sexy, comfortable]"
1,34,5,1,4,Dresses,Dresses,Positive,love this dress its sooo pretty i happened t...,"[love, this, dress, its, sooo, pretty, i, happ...","[love, dress, sooo, pretty, happened, find, st..."
2,60,3,0,0,Dresses,Dresses,Neutral,some major design flaws i had such high hopes ...,"[some, major, design, flaws, i, had, such, hig...","[major, design, flaws, high, hopes, dress, rea..."
3,50,5,1,0,Bottoms,Pants,Positive,my favorite buy i love love love this jumpsuit...,"[my, favorite, buy, i, love, love, love, this,...","[favorite, buy, love, love, love, jumpsuit, fu..."
4,47,5,1,6,Tops,Blouses,Positive,flattering shirt this shirt is very flattering...,"[flattering, shirt, this, shirt, is, very, fla...","[flattering, shirt, shirt, flattering, due, ad..."


<h3>Stemming and Lemmination</h3>

In [463]:
from nltk.stem import WordNetLemmatizer

In [464]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [465]:
lem=WordNetLemmatizer()

In [466]:
def lemmatize_word(text):
    ret = [lem.lemmatize(word) for word in text]
    return ret

In [467]:
data['review_lemmatize']=data['review_sw_removed'].apply(lemmatize_word)

In [468]:
data['final_review']=data["review_lemmatize"].apply(lambda x: ' '.join(x))

<h3>Encoding labels of Sentiment</h3>

In [469]:
enc={'Negative':0,'Neutral':1,'Positive':2}

In [470]:
data["Sentiment"]=data["Sentiment"].map(enc)

<h3>Exporting dataset</h3>

In [480]:
data.to_pickle("../models/data_processed.pkl")