In [178]:
import pandas as pd
import numpy as np

In [179]:
data = pd.read_csv(r'..\datasets\Womens Clothing E-Commerce Reviews.csv')

In [180]:
data.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

Keeping only required columns

In [181]:
keep = ['Title', 'Review Text', 'Rating','Recommended IND', 'Positive Feedback Count']
data = data[keep]

In [182]:
data.head()

Unnamed: 0,Title,Review Text,Rating,Recommended IND,Positive Feedback Count
0,,Absolutely wonderful - silky and sexy and comf...,4,1,0
1,,Love this dress! it's sooo pretty. i happene...,5,1,4
2,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0
4,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6


<h3>Analyzing correlations among key features to establish criteria for accurate sentiment labeling</h3>

In [183]:
data['Positive Feedback Count'].corr(data['Rating'])

np.float64(-0.06496050712064393)

<h4>this indicates there is no direct correlation between them. so analyzing values of this feature seperately</h4>

In [184]:
print(data['Positive Feedback Count'].describe())

count    23486.000000
mean         2.535936
std          5.702202
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max        122.000000
Name: Positive Feedback Count, dtype: float64


<h4>similarly checking correlation between recommended ind and rating</h4>

In [185]:
data['Recommended IND'].corr(data['Rating'])

np.float64(0.792336287928579)

<h4> this incidates a proper correlation between them. Now based on collected information, we create and label column sentiment for supervised learning</h4>

In [186]:
def map_sentiment(row):
    rating = row["Rating"]
    recommended_ind = row["Recommended IND"]
    positive_count = row["Positive Feedback Count"]
    
    if (rating >= 4 and recommended_ind >=1) or positive_count>=3: 
        #since postive feedback is used as solo indicator , I took the value closer to mean for more safety
        return "Positive"
    elif rating <= 2 and recommended_ind == 0:
        return "Negative"
    else:
        return "Neutral"

In [187]:
data['Sentiment'] = data.apply(map_sentiment,axis=1)

In [188]:
data['Sentiment'].value_counts()

Sentiment
Positive    19765
Neutral      2215
Negative     1506
Name: count, dtype: int64

<h4>handling missing values</h4>

In [189]:
data.isna().sum()

Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Sentiment                     0
dtype: int64

In [190]:
data.dropna(subset=['Title', 'Review Text'], how='all')

Unnamed: 0,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Sentiment
0,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Positive
1,,Love this dress! it's sooo pretty. i happene...,5,1,4,Positive
2,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,Neutral
3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,Positive
4,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,Positive
...,...,...,...,...,...,...
23481,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,Positive
23482,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,Neutral
23483,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,Neutral
23484,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,Neutral


In [191]:
data.isna().sum()

Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Sentiment                     0
dtype: int64

In [192]:
m = data[['Title', 'Review Text']].isna().all(axis=1)

In [193]:
data.drop(index=data[m].index, inplace=True)

In [194]:
data.loc[:,'Full Review'] = data[['Title','Review Text']].fillna('').astype(str).agg(' '.join,axis=1).str.strip()

In [195]:
data.drop(['Title', 'Review Text','Rating','Recommended IND', 'Positive Feedback Count'], axis=1, inplace=True)

In [196]:
data[data['Full Review'].str.strip() == ""].shape[0]

0

In [197]:
data.isna().sum()

Sentiment      0
Full Review    0
dtype: int64

<h3>check for duplicates</h3>

In [198]:
data.duplicated().sum()

np.int64(5)

In [199]:
data.drop_duplicates()

Unnamed: 0,Sentiment,Full Review
0,Positive,Absolutely wonderful - silky and sexy and comf...
1,Positive,Love this dress! it's sooo pretty. i happene...
2,Neutral,Some major design flaws I had such high hopes ...
3,Positive,"My favorite buy! I love, love, love this jumps..."
4,Positive,Flattering shirt This shirt is very flattering...
...,...,...
23481,Positive,Great dress for many occasions I was very happ...
23482,Neutral,Wish it was made of cotton It reminds me of ma...
23483,Neutral,"Cute, but see through This fit well, but the t..."
23484,Neutral,"Very cute dress, perfect for summer parties an..."


In [200]:
data = data[['Full Review','Sentiment']]
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,Full Review,Sentiment
0,Absolutely wonderful - silky and sexy and comf...,Positive
1,Love this dress! it's sooo pretty. i happene...,Positive
2,Some major design flaws I had such high hopes ...,Neutral
3,"My favorite buy! I love, love, love this jumps...",Positive
4,Flattering shirt This shirt is very flattering...,Positive


In [201]:
enc={'Negative':0,'Neutral':1,'Positive':2}

In [202]:
data["Sentiment"]=data["Sentiment"].map(enc)

In [203]:
data.head()

Unnamed: 0,Full Review,Sentiment
0,Absolutely wonderful - silky and sexy and comf...,2
1,Love this dress! it's sooo pretty. i happene...,2
2,Some major design flaws I had such high hopes ...,1
3,"My favorite buy! I love, love, love this jumps...",2
4,Flattering shirt This shirt is very flattering...,2


In [208]:
data.to_pickle("../models/data_processed.pkl")