# Libs

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
import pickle

# Download NLTK tokenizer data

In [0]:
# word_tokenize needs the Punkt sentence models. Newer NLTK releases look them
# up under 'punkt_tab', older ones under 'punkt', so request both; an id the
# installed NLTK does not know is reported and skipped rather than raised.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Load Data

In [0]:
# Load the whole customer review table and convert it to a pandas DataFrame
# NOTE(review): `spark` is not defined in this notebook — presumably the session injected by the runtime; confirm
data = spark.sql("select * from default.customer_review_dataset").toPandas()

In [0]:
# Checking head of data
data.head()

In [0]:
# Checking shape of data
data.shape

## We have more than 2 lakh (200,000) rows

In [0]:
# Replacement dictionary built up during EDA:
# canonical word -> list of misspellings / short forms to rewrite to it.
replace_dict = {}


def add_to_replace_dict(word, short_list):
    """Register ``short_list`` as variants of the canonical ``word``.

    Parameters
    ----------
    word : str
        Canonical spelling used as the key in ``replace_dict``.
    short_list : iterable of str, or str
        Variants to add. A single string is treated as one variant.

    The stored value is a de-duplicated, sorted list, so repeated calls are
    idempotent and the result does not depend on set iteration order.

    Raises
    ------
    TypeError
        If ``short_list`` is not iterable; ``replace_dict`` is left untouched.
    """
    if isinstance(short_list, str):
        # A bare string would otherwise be split into single characters.
        short_list = [short_list]
    # Merge into a scratch set first so a bad argument cannot leave a
    # half-initialised entry behind.
    variants = set(replace_dict.get(word, ()))
    variants.update(short_list)
    replace_dict[word] = sorted(variants)

In [0]:
# Flat lookup (variant -> canonical word); it is filled further down, once replace_dict is complete
new_dict={}

In [0]:
def replace_shortwords(replacement_dict, text):
    """Rewrite every token of ``text`` that is a key of ``replacement_dict``.

    Parameters
    ----------
    replacement_dict : dict
        Flat mapping ``variant -> replacement string``.
    text : str
        Raw text; it is split with ``word_tokenize``.

    Returns
    -------
    str
        The tokens, with matches replaced, joined by single spaces.
    """
    # One dictionary lookup per token. The previous version built a pandas
    # Series and ran ``replace`` once per dictionary key for every row.
    return " ".join(
        replacement_dict.get(token, token) for token in word_tokenize(text)
    )

In [0]:
# Checking Nulls
data.isna().sum()

## There are 11 null records in Summary, let's remove those for now

In [0]:
# Pass `subset` as a list: a bare column label is only accepted by newer pandas releases
data = data.dropna(subset=['Summary'])

In [0]:
# Checking Nulls
data.isna().sum()

In [0]:
# Checking dtype
data.info()

## We can see that Rate and Price are stored as object
## Let's check which non-numeric values are in the Rate and Price columns

In [0]:
# Non Numeric Values in Rate
data[pd.to_numeric(data['Rate'], errors='coerce').isna()]['Rate'].unique()

## There are 3 non numeric records in Rate, Let's check how many records are impacted

In [0]:
data[data['Rate'].isin(['Pigeon Favourite Electric Kettle??????(1.5 L, Silver, Black)',
       'Bajaj DX 2 L/W Dry Iron',
       'Nova Plus Amaze NI 10 1100 W Dry Iron?Ã\x83Â¿?Ã\x83Â¿(Grey & Turquoise)'])]

## There are only 3 records. Let's drop them

In [0]:
# Keep rows whose Rate parses as a number (or is missing). This drops the
# product-name strings found above without hard-coding their mis-encoded text.
rate_as_number = pd.to_numeric(data['Rate'], errors='coerce')
data = data[rate_as_number.notna() | data['Rate'].isna()]

In [0]:
# Now let's convert Rate to numeric
# assign() returns a new frame, so no SettingWithCopyWarning is raised on the filtered `data`
data = data.assign(Rate=pd.to_numeric(data['Rate']))

In [0]:
# Sentiment classification needs neither the product price nor the product name, so drop both.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
data = data.drop(columns=['product_price', 'product_name'], errors='ignore')

In [0]:
data

In [0]:
# Now let's see if Rate and Sentiment columns are more or less provide same information
data.groupby('Sentiment')['Rate'].mean()

In [0]:
# Also check Unique
# Now let's see if Rate and Sentiment columns are more or less provide same information
data.groupby('Sentiment')['Rate'].unique()

## Looks like, our assumption of these two columns providing similar information failed, let's look for specific cases

In [0]:
display(data[(data['Sentiment']=='negative')&(data['Rate']>2)])

## We can see that Review and Rating are not in line with Sentiment. Sentiment seems to be derived from the Summary column

In [0]:
display(data[(data['Sentiment']=='neutral')&((data['Rate']>3) | (data['Rate']<3))])

## We can see that Review and Rating are not in line with Sentiment. Sentiment seems to be derived from the Summary column

In [0]:
display(data[(data['Sentiment']=='positive')&((data['Rate']<3))])

## We can see that Review and Rating are not in line with Sentiment. Sentiment seems to be derived from the Summary column

In [0]:
# Character counts of Review and Summary, used below to inspect very short texts
data = data.assign(
    len_review=data['Review'].map(len),
    len_summary=data['Summary'].map(len),
)

In [0]:
# Unique Review values shorter than 4 characters
data[data['len_review']<4]['Review'].unique()

## We can see nan
## Let's remove nan

In [0]:
data['Review']=data['Review'].replace('nan',np.nan)

In [0]:
# Unique Review values shorter than 5 characters
# (len_review was computed before the 'nan' strings were replaced)
data[data['len_review']<5]['Review'].unique()

## We can see nan and emoticons such as ':)'
## Let's map ':)' to 'good' in the replace dict

In [0]:
add_to_replace_dict('good',[':)'])

In [0]:
# Unique Review values that are exactly 6 characters long
data[data['len_review']==6]['Review'].unique()

## We can see variants such as 'classy', 'superb' and 'awsome'
## Let's map them to their canonical words in the replace dict

In [0]:
add_to_replace_dict('good',['classy'])
add_to_replace_dict('super',['superb'])
add_to_replace_dict('awesome',['awsome'])

In [0]:
# Sorted unique Summary values that are exactly 5 characters long
sorted(data[data['len_summary']==5]['Summary'].unique())

## We don't see any nan or special character. Let's keep it for now

In [0]:

# Variants registered per canonical word (presumably taken from the 5-character summaries listed above).
# NOTE(review): 'gd gd' is registered under both 'good' and 'nice'; once the dictionary
# is inverted to variant -> canonical, only one of the two mappings can survive.
# NOTE(review): variants containing a space ('gd gd', 'gud 1', 'n ice') presumably never
# match, because replace_shortwords compares individual word_tokenize tokens — confirm intent.
add_to_replace_dict('awesome',['osmmm','owsam','awsum','ossam','asome','aswam','ausam','awesm','awsme','awsom'])
add_to_replace_dict('',['ffggp'])
add_to_replace_dict('super',['supar','supab'])
add_to_replace_dict('good',['goood','guddd','gd gd','gud 1'])
add_to_replace_dict('nice',['niccc','n ice','gd gd'])
add_to_replace_dict('great',['grate'])
add_to_replace_dict('very good',['vgood'])

In [0]:
# Sorted unique Summary values that are exactly 6 characters long
sorted(data[data['len_summary']==6]['Summary'].unique())

## We don't see any nan or special character. Let's keep it for now

In [0]:

add_to_replace_dict('awesome',['assowm','aosome','aswame','ausome','awaome','awasom','awasum','awesm','awesom','awosam','awssmm','owsome'])
add_to_replace_dict('average',['avarge','averag','avrege'])
add_to_replace_dict('super',['supper','supprb','supreb','supurb','suuper'])
add_to_replace_dict('good',['classy','gd job','gd one','gooddd','gooodd','gooood'])
add_to_replace_dict('bad',['badddd','ghatia'])
add_to_replace_dict('great',['grate','greate'])
add_to_replace_dict('just ok',['jst ok','jus ok','jzz ok'])
add_to_replace_dict('love',['luv'])
add_to_replace_dict('like',['lyk'])
add_to_replace_dict('favourite',['fav'])
add_to_replace_dict('nice',['niceee','nycccc','nyc','nyss'])
add_to_replace_dict('good',['gud','gd','sandar'])
add_to_replace_dict('perfect',['perfct'])
add_to_replace_dict('wow',['woooow'])
add_to_replace_dict('excellent',['xclent'])

In [0]:
# Sorted unique Summary values that are exactly 7 characters long
sorted(data[data['len_summary']==7]['Summary'].unique())

## We don't see any nan or special character. Let's keep it for now

In [0]:
add_to_replace_dict('amazing',['amasing','ameging','amezing','amizing',])
add_to_replace_dict('average',['avarage','avarege','averege','avrrage','everage','evarage'])
add_to_replace_dict('best',['bestttt'])
add_to_replace_dict('perfect',['farfect'])
add_to_replace_dict('fabulous',['fablous','febulas'])
add_to_replace_dict('nice',['nic'])
add_to_replace_dict('excellent',['xcelent','exalant','exalent','exelent','exllant','exxlent'])

In [0]:
# Sorted unique Summary values that are exactly 8 characters long
sorted(data[data['len_summary']==8]['Summary'].unique())

## We don't see any nan or special character. Let's keep it for now

In [0]:
add_to_replace_dict('wonderful',['wondrful'])

In [0]:
replace_dict

In [0]:
# Save the replace dict as a pickle file so it can be reused in the transformation notebook
with open('/dbfs/FileStore/replace_dict.pkl','wb') as file:
    pickle.dump(replace_dict,file)

In [0]:
# Invert canonical -> [variants] into the flat variant -> canonical lookup
# that replace_shortwords expects. A variant listed under several canonical
# words keeps the mapping that is visited last.
new_replace_dict = {
    variant: canonical
    for canonical, variants in replace_dict.items()
    for variant in variants
}
# The loop used to fill `new_dict` while `new_replace_dict` stayed empty, so
# the cleaning step replaced nothing. `new_dict` is still updated so anything
# reading it keeps working.
new_dict.update(new_replace_dict)

In [0]:
# Checking Null Records
data.isna().sum()

## There is no Review for 24662 records; let's keep those records rather than drop them, and replace the nulls with a blank ''

In [0]:
# Replacing Null records in Review with blank
data['Review']=data['Review'].fillna('')

## Let's now create combined column from Review and Summary 

In [0]:
pip install TextBlob

In [0]:
# Combined column
data['combined']=data['Review']+' '+data['Summary']

In [0]:
data.head()

In [0]:
data['combined_cleanded']=data['combined'].apply(lambda x:replace_shortwords(new_replace_dict,x))

In [0]:
# Score the combined text with TextBlob so that rows whose label contradicts the text
# (e.g. sentiment is negative but polarity is > 0.8) can be filtered out
from textblob import TextBlob


def polarity(text):
    """Return the TextBlob sentiment polarity of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


# NOTE(review): this scores the raw 'combined' column; 'combined_cleanded' is not used here — confirm intent
data['polarity_score_combined'] = data['combined'].map(polarity)

In [0]:
# Select the score column before aggregating: a frame-wide mean() also hits the
# text columns, which pandas >= 2.0 rejects with a TypeError
data.groupby('Sentiment')[['polarity_score_combined']].mean()

In [0]:
# Select the score column before aggregating: a frame-wide median() also hits the
# text columns, which pandas >= 2.0 rejects with a TypeError
data.groupby('Sentiment')[['polarity_score_combined']].median()

In [0]:
# Highest polarity per label; selecting the column first avoids aggregating every column just to keep one
data.groupby('Sentiment')[['polarity_score_combined']].max()

In [0]:
# Lowest polarity per label; selecting the column first avoids aggregating every column just to keep one
data.groupby('Sentiment')[['polarity_score_combined']].min()

In [0]:
# When Positive, but polarity < 0
data[(data['Sentiment']=='positive')&(data['polarity_score_combined']<0)]

In [0]:
# Remove these rows because these are adding noise
data = data[~((data['Sentiment']=='positive')&(data['polarity_score_combined']<0))]

In [0]:
# When Negative, but polarity >0.3
data[(data['Sentiment']=='negative')&(data['polarity_score_combined']>0.3)]

In [0]:
# Remove these rows because these are adding noise
data = data[~((data['Sentiment']=='negative')&(data['polarity_score_combined']>0.3))]

In [0]:
# When Neutral, but polarity >0.7
data[(data['Sentiment']=='neutral')&(data['polarity_score_combined']>0.7)]

In [0]:
# Remove these rows because these are adding noise
data=data[~((data['Sentiment']=='neutral')&(data['polarity_score_combined']>0.7))]

In [0]:
# When Neutral, but polarity <-0.5
data[(data['Sentiment']=='neutral')&(data['polarity_score_combined']<-0.5)]

In [0]:
# Remove these rows because these are adding noise
data=data[~((data['Sentiment']=='neutral')&(data['polarity_score_combined']<-0.5))]

In [0]:
data.shape
# (205052, 6)
# Records reduced by 4342

In [0]:
# Mean polarity per label after filtering. The column is selected first because a
# frame-wide mean() also hits the text columns, which pandas >= 2.0 rejects with a TypeError
data.groupby('Sentiment')[['polarity_score_combined']].mean()

In [0]:
# Median polarity per label after filtering. The column is selected first because a
# frame-wide median() also hits the text columns, which pandas >= 2.0 rejects with a TypeError
data.groupby('Sentiment')[['polarity_score_combined']].median()

In [0]:
# Lowest polarity per label after filtering; selecting the column first avoids aggregating every column
data.groupby('Sentiment')[['polarity_score_combined']].min()

In [0]:
# Highest polarity per label after filtering; selecting the column first avoids aggregating every column
data.groupby('Sentiment')[['polarity_score_combined']].max()

In [0]:
# Checking when sentiment = 'positive' and Rating <3
data[(data['Sentiment']=='positive')&(data['Rate']<3)]

In [0]:
# Remove these records, these are adding noise
data=data[~((data['Sentiment']=='positive')&(data['Rate']<3))]

In [0]:
# Checking when sentiment = 'negative' and Rating >3
data[((data['Sentiment']=='negative')&(data['Rate']>3))]

In [0]:
# Remove these records, these are adding noise
data=data[~((data['Sentiment']=='negative')&(data['Rate']>3))]

In [0]:
# Checking when sentiment = 'neutral' and Rating > 4 or Rating < 2
data[((data['Sentiment']=='neutral')&((data['Rate']>4)|(data['Rate']<2)))]

In [0]:
# Remove these records, these are adding noise
data=data[~((data['Sentiment']=='neutral')&((data['Rate']>4)|(data['Rate']<2)))]

In [0]:
# Finally, shape of data
data.shape
# (205052, 6), reduced by 9137
# NOTE(review): the same shape, (205052, 6), is recorded after the polarity filters even though
# more rows are removed in between — one of the two recorded shapes is presumably stale; re-run to confirm

In [0]:
def sentiment(x):
    """Label a polarity score by its sign.

    Scores below zero give 'negative', a score of exactly zero gives
    'neutral', and every other value gives 'positive'.
    """
    if x < 0:
        return 'negative'
    return 'neutral' if x == 0 else 'positive'
# Label derived from the sign of the TextBlob score, for comparison with the original Sentiment
data['polarity'] = data['polarity_score_combined'].map(sentiment)

In [0]:
# Row counts per label: the dataset's own Sentiment vs. the label derived from polarity
original_counts = (
    data.groupby('Sentiment')['Rate']
    .count()
    .reset_index()
    .rename(columns={'Rate': 'Original'})
)
calculated_counts = (
    data.groupby('polarity')['Rate']
    .count()
    .reset_index()
    .rename(columns={'Rate': 'Calculated'})
)
(
    pd.merge(original_counts, calculated_counts, left_on='Sentiment', right_on='polarity')
    .drop('polarity', axis=1)
    .plot.bar(x='Sentiment')
)

In [0]:
# Bar chart of the number of rows per original Sentiment label
data.value_counts('Sentiment').reset_index().plot.bar(x='Sentiment')