In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Snowpark is available too: fetch the currently active session so that
# later cells can reach Snowflake tables through `session`.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
-- Preview every column and row of the NPS_SCORE table.
SELECT *
FROM NPS_SCORE.NPS_SCORE.NPS_SCORE;

In [None]:
# Reference the NPS_SCORE table through the Snowpark session and display it.
df=session.table("NPS_SCORE.NPS_SCORE.NPS_SCORE")
df

In [None]:
# Convert the Snowpark table object to a pandas DataFrame; every later cell works on `dataset`.
dataset=df.to_pandas()

DESCRIBING THE DATASET

In [None]:
# Summary statistics of the dataset, used to spot missing and invalid values.
dataset.describe()

From the summary above we can conclude that null values are present in AGE and INCOME, and that the NUM_PURCHASES column contains some negative values, which cannot be valid for a purchase count.

In [None]:
# Column dtypes and non-null counts per column.
dataset.info()

In [None]:
# Impute the missing values in AGE, INCOME, REGISTRATION_DATE and LAST_PURCHASE_DATE.
#
# The filled column is assigned back to `dataset` instead of calling
# fillna(..., inplace=True) on `dataset[col]`: that chained in-place form is
# deprecated and leaves `dataset` untouched under pandas Copy-on-Write.
#
# NOTE(review): 64 and 60608.54 are hard-coded fill values kept from the
# original analysis -- presumably a typical AGE / INCOME; confirm how they
# were derived.
dataset['AGE'] = dataset['AGE'].fillna(64)
dataset['INCOME'] = dataset['INCOME'].fillna(60608.54)

# Dates: carry the previous row's value forward. Series.ffill() replaces the
# deprecated fillna(method='ffill') spelling. A missing value in the very
# first row has no predecessor and therefore stays missing.
dataset['REGISTRATION_DATE'] = dataset['REGISTRATION_DATE'].ffill()
dataset['LAST_PURCHASE_DATE'] = dataset['LAST_PURCHASE_DATE'].ffill()

In [None]:
# Remaining missing values per column after the imputation above.
dataset.isna().sum()

In [None]:
# NUM_PURCHASES cannot be negative: count the rows that violate this
# before any clean-up is applied.
c = int((dataset['NUM_PURCHASES'] < 0).sum())
print(c)
# 306 negative values were recorded when this was originally run.

# Inspect the distribution (square-root rule for the number of bins)
# and the largest value before treating the outliers.
sns.histplot(x=dataset['NUM_PURCHASES'], bins=int(np.sqrt(len(dataset))))
print(max(dataset['NUM_PURCHASES']))

In [None]:
# Box plot of NUM_PURCHASES for each NPS category (before capping).
sns.boxplot(data=dataset, x='NUM_PURCHASES', y='NPS_CATEGORY')

From the plot above, NUM_PURCHASES has a lot of outliers, possibly because of the negative values present in it.

In [None]:
# First and third quartile of NUM_PURCHASES and the inter-quartile range;
# the outlier limits are derived from these values.
p_25, p_75 = (dataset['NUM_PURCHASES'].quantile(q) for q in (0.25, 0.75))
iqr = p_75 - p_25

In [None]:
# Outlier limits at 1.5 * IQR beyond the quartiles; values outside the
# limits are replaced by the limit itself.
lower_limit = p_25 - 1.5*iqr
upper_limit = 1.5*iqr + p_75
print("UPPER LIMIT=",upper_limit)
print("LOWER_LIMIT",lower_limit)
dataset['NUM_PURCHASES'] = np.select(
    [dataset['NUM_PURCHASES'] > upper_limit, dataset['NUM_PURCHASES'] < lower_limit],
    [upper_limit, lower_limit],
    default=dataset['NUM_PURCHASES'],
)

In [None]:
# Same box plot again, now on the capped NUM_PURCHASES values.
sns.boxplot(data=dataset, x='NUM_PURCHASES', y='NPS_CATEGORY')

ENCODING THE CATEGORIES TO NUMERICAL COLUMNS

In [None]:
from sklearn import preprocessing

# Integer-encode the categorical columns.
#
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`,
# so the category <-> code mapping of every column stays available
# (a single shared encoder only remembers the last column it was fitted on).
categorical_columns = [
    'PRODUCT_NAME',
    'PRODUCT_CATEGORY',
    'PURCHASE_TYPE',
    'SUPPORT_CHANNEL',
    'GENDER',
    'ACQUISITION_CHANNEL',
]
label_encoders = {}
for column in categorical_columns:
    label_encoder = preprocessing.LabelEncoder()
    dataset[column] = label_encoder.fit_transform(dataset[column])
    label_encoders[column] = label_encoder
# `label_encoder` is left bound to the ACQUISITION_CHANNEL encoder, exactly
# as before, so code that reads `label_encoder.classes_` keeps working.

In [None]:
# Print the category -> integer code mapping held by `label_encoder`.
# The encoder was last fitted on ACQUISITION_CHANNEL, so this is the
# mapping of that column only.
for code, category in enumerate(label_encoder.classes_):
    print(f"{category} -> {code}")

In [None]:
# Number of rows per (encoded) GENDER value, split by NPS category.
sns.countplot(data=dataset, x='GENDER', hue='NPS_CATEGORY')

In [None]:
# Number of rows per (encoded) ACQUISITION_CHANNEL value, split by NPS category.
sns.countplot(data=dataset, x='ACQUISITION_CHANNEL', hue='NPS_CATEGORY')

CONVERTING THE COMMENTS COLUMN INTO A POSITIVE, NEUTRAL OR NEGATIVE REVIEW CATEGORY BY PERFORMING SENTIMENT ANALYSIS

In [None]:
# Text pre-processing: normalise the free-text comments before sentiment scoring
import re
import string
strings=dataset['COMMENTS']  # the comments to clean
l=[]  # collects the cleaned comments, in row order
def text_clean(text):
    """Normalise one comment for sentiment analysis.

    The text is lower-cased, anything enclosed in square brackets is removed,
    ASCII punctuation is stripped, and every token containing a digit is
    dropped. Whitespace is left untouched, so removed tokens may leave
    consecutive spaces behind.

    Args:
        text: The raw comment. A non-string value (for example ``None`` or a
            NaN from a missing comment) is treated as an empty comment.

    Returns:
        The cleaned comment as a ``str``.
    """
    # Missing comments reach this function as None/NaN; return an empty
    # string instead of failing on .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Raw strings keep the regex escapes (\[ \w \d) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Clean every comment in row order and write the cleaned text back to COMMENTS.
l.extend(text_clean(comment) for comment in strings)
dataset['COMMENTS'] = l
dataset['COMMENTS']

In [None]:
from textblob import TextBlob

# Replace each cleaned comment with the sentiment polarity TextBlob assigns to it.
Comment = [TextBlob(text).sentiment.polarity for text in dataset['COMMENTS']]
dataset['COMMENTS'] = Comment
dataset['COMMENTS']

In [None]:
# Comment polarity against the NPS score given by the same respondent.
sns.scatterplot(data=dataset, x='COMMENTS', y='NPS_SCORE')

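Respondents with a sentiment polarity above 0 are assigned as detractors; the note for those with a polarity below 0 was left unfinished. TODO: complete this sentence and verify the assignment against the scatter plot above, since a positive polarity would normally indicate a favourable comment.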

In [None]:
# Remove LOCATION and the two date columns from the working frame (in place).
dataset.drop(
    columns=['LOCATION', 'REGISTRATION_DATE', 'LAST_PURCHASE_DATE'],
    inplace=True,
)

In [None]:
# Number of rows per REFERRAL_ACTIVITY value, split by NPS category.
sns.countplot(data=dataset, x='REFERRAL_ACTIVITY', hue='NPS_CATEGORY')

In [None]:
# Frequency of each distinct RESPONDENT value.
dataset['RESPONDENT'].value_counts()

In [None]:
# Frequency of each distinct REFERRAL_ACTIVITY value.
dataset['REFERRAL_ACTIVITY'].value_counts()

In [None]:
# Box plot of DISCOUNTS_RECEIVED for each NPS category (before capping).
sns.boxplot(data=dataset, x='DISCOUNTS_RECEIVED', y='NPS_CATEGORY')

In [None]:
# Cap DISCOUNTS_RECEIVED at 1.5 * IQR beyond its quartiles.
lower_25 = dataset['DISCOUNTS_RECEIVED'].quantile(q=0.25)
upper_75 = dataset['DISCOUNTS_RECEIVED'].quantile(q=0.75)
iqr = upper_75 - lower_25
lower_limit1 = lower_25 - iqr*1.5
upper_limit1 = upper_75 + iqr*1.5
# The first matching condition wins; rows inside the limits keep their value.
dataset['DISCOUNTS_RECEIVED'] = np.select(
    [dataset['DISCOUNTS_RECEIVED'] > upper_limit1, dataset['DISCOUNTS_RECEIVED'] < lower_limit1],
    [upper_limit1, lower_limit1],
    default=dataset['DISCOUNTS_RECEIVED'],
)

In [None]:
# Same box plot again, now on the capped DISCOUNTS_RECEIVED values.
sns.boxplot(data=dataset, x='DISCOUNTS_RECEIVED', y='NPS_CATEGORY')

In [None]:
# Box plot of INCOME for each NPS category (before capping).
sns.boxplot(data=dataset, x='INCOME', y='NPS_CATEGORY')

In [None]:
# Cap INCOME outliers at 1.5 * IQR beyond the quartiles.
lower_INCOME=dataset['INCOME'].quantile(0.25)
upper_INCOME=dataset['INCOME'].quantile(0.75)
iqr1=upper_INCOME-lower_INCOME
lower_limit2=lower_INCOME-iqr1*1.5
upper_limit2=upper_INCOME+iqr1*1.5
# Compare against the outlier limits, not the quartiles themselves: capping
# at lower_INCOME / upper_INCOME would squash every value outside the middle
# 50% of the data onto Q1 / Q3, and would differ from how every other column
# in this notebook is capped.
dataset['INCOME']=np.where(dataset['INCOME']>upper_limit2,upper_limit2,
                            np.where(dataset['INCOME']<lower_limit2,lower_limit2,
                                           dataset['INCOME'])
                                 )

In [None]:
# Same box plot again, now on the capped INCOME values.
sns.boxplot(data=dataset, x='INCOME', y='NPS_CATEGORY')

In [None]:
# Box plot of RESPONSE_TIME for each NPS category.
sns.boxplot(data=dataset, x='RESPONSE_TIME', y='NPS_CATEGORY')

In [None]:
# Remove the RESPONSE_TIME column from the working frame (in place).
dataset.drop(columns='RESPONSE_TIME', inplace=True)

In [None]:
# Box plot of INTERACTION_FREQUENCY for each NPS category (before capping).
sns.boxplot(data=dataset, x='INTERACTION_FREQUENCY', y='NPS_CATEGORY')

In [None]:
# Cap INTERACTION_FREQUENCY at 1.5 * IQR beyond its quartiles.
lower_IN = dataset['INTERACTION_FREQUENCY'].quantile(q=0.25)
upper_IN = dataset['INTERACTION_FREQUENCY'].quantile(q=0.75)
iqr1 = upper_IN - lower_IN
lower_li = lower_IN - iqr1*1.5
upper_li = upper_IN + iqr1*1.5
# The first matching condition wins; rows inside the limits keep their value.
dataset['INTERACTION_FREQUENCY'] = np.select(
    [dataset['INTERACTION_FREQUENCY'] > upper_li, dataset['INTERACTION_FREQUENCY'] < lower_li],
    [upper_li, lower_li],
    default=dataset['INTERACTION_FREQUENCY'],
)
# Show the lower limit that was applied.
lower_li

In [None]:
# Same box plot again, now on the capped INTERACTION_FREQUENCY values.
sns.boxplot(data=dataset, x='INTERACTION_FREQUENCY', y='NPS_CATEGORY')

In [None]:
# Box plot of CUSTOMER_SERVICE_INTERACTIONS for each NPS category (before capping).
sns.boxplot(data=dataset, x='CUSTOMER_SERVICE_INTERACTIONS', y='NPS_CATEGORY')

In [None]:
# Cap CUSTOMER_SERVICE_INTERACTIONS at 1.5 * IQR beyond its quartiles.
# lower_cs / upper_cs first hold the quartiles and are then widened in
# place into the outlier limits.
lower_cs = dataset['CUSTOMER_SERVICE_INTERACTIONS'].quantile(q=0.25)
upper_cs = dataset['CUSTOMER_SERVICE_INTERACTIONS'].quantile(q=0.75)
iqr1 = upper_cs - lower_cs
lower_cs = lower_cs - iqr1*1.5
upper_cs = upper_cs + iqr1*1.5
dataset['CUSTOMER_SERVICE_INTERACTIONS'] = np.select(
    [dataset['CUSTOMER_SERVICE_INTERACTIONS'] > upper_cs,
     dataset['CUSTOMER_SERVICE_INTERACTIONS'] < lower_cs],
    [upper_cs, lower_cs],
    default=dataset['CUSTOMER_SERVICE_INTERACTIONS'],
)
# Show the lower limit that was applied.
lower_cs

In [None]:
# Same box plot again, now on the capped CUSTOMER_SERVICE_INTERACTIONS values.
sns.boxplot(data=dataset, x='CUSTOMER_SERVICE_INTERACTIONS', y='NPS_CATEGORY')

In [None]:
# Box plot of USAGE_FREQUENCY for each NPS category (before capping).
sns.boxplot(data=dataset, x='USAGE_FREQUENCY', y='NPS_CATEGORY')

In [None]:
# Cap USAGE_FREQUENCY at 1.5 * IQR beyond its quartiles.
l = dataset['USAGE_FREQUENCY'].quantile(q=0.25)
u = dataset['USAGE_FREQUENCY'].quantile(q=0.75)
iqr1 = u - l
low = l - iqr1*1.5
up = u + iqr1*1.5
# The first matching condition wins; rows inside the limits keep their value.
dataset['USAGE_FREQUENCY'] = np.select(
    [dataset['USAGE_FREQUENCY'] > up, dataset['USAGE_FREQUENCY'] < low],
    [up, low],
    default=dataset['USAGE_FREQUENCY'],
)
# Show the lower limit that was applied.
low

In [None]:
# Same box plot again, now on the capped USAGE_FREQUENCY values.
sns.boxplot(data=dataset, x='USAGE_FREQUENCY', y='NPS_CATEGORY')

In [None]:
# Box plot of AVG_PURCHASE_VALUE for each NPS category (before capping).
sns.boxplot(data=dataset, x='AVG_PURCHASE_VALUE', y='NPS_CATEGORY')

In [None]:
# Cap AVG_PURCHASE_VALUE at 1.5 * IQR beyond its quartiles.
l = dataset['AVG_PURCHASE_VALUE'].quantile(q=0.25)
u = dataset['AVG_PURCHASE_VALUE'].quantile(q=0.75)
iqr1 = u - l
low = l - iqr1*1.5
up = u + iqr1*1.5
# The first matching condition wins; rows inside the limits keep their value.
dataset['AVG_PURCHASE_VALUE'] = np.select(
    [dataset['AVG_PURCHASE_VALUE'] > up, dataset['AVG_PURCHASE_VALUE'] < low],
    [up, low],
    default=dataset['AVG_PURCHASE_VALUE'],
)
# Show the lower limit that was applied.
low

In [None]:
# Same box plot again, now on the capped AVG_PURCHASE_VALUE values.
sns.boxplot(data=dataset, x='AVG_PURCHASE_VALUE', y='NPS_CATEGORY')

In [None]:
# Number of rows per (encoded) ACQUISITION_CHANNEL value, split by NPS category.
sns.countplot(data=dataset, x='ACQUISITION_CHANNEL', hue='NPS_CATEGORY')

In [None]:
# Box plot of ONLINE_ACTIVITY for each NPS category (before capping).
sns.boxplot(data=dataset, x='ONLINE_ACTIVITY', y='NPS_CATEGORY')

In [None]:
# Cap ONLINE_ACTIVITY at 1.5 * IQR beyond its quartiles.
l = dataset['ONLINE_ACTIVITY'].quantile(q=0.25)
u = dataset['ONLINE_ACTIVITY'].quantile(q=0.75)
iqr1 = u - l
low = l - iqr1*1.5
up = u + iqr1*1.5
# The first matching condition wins; rows inside the limits keep their value.
dataset['ONLINE_ACTIVITY'] = np.select(
    [dataset['ONLINE_ACTIVITY'] > up, dataset['ONLINE_ACTIVITY'] < low],
    [up, low],
    default=dataset['ONLINE_ACTIVITY'],
)
# Show the lower limit that was applied.
low

In [None]:
# Same box plot again, now on the capped ONLINE_ACTIVITY values.
sns.boxplot(data=dataset, x='ONLINE_ACTIVITY', y='NPS_CATEGORY')

In [None]:
# Number of rows per MARKETING_ENGAGEMENT value, split by NPS category.
sns.countplot(data=dataset, x='MARKETING_ENGAGEMENT', hue='NPS_CATEGORY')

In [None]:
from sklearn.preprocessing import RobustScaler

# Build the feature matrix and scale it.
#
# The original cell read `x=da` (an undefined name, apparently a truncated
# line) and instantiated StandardScaler, which is never imported; both raise
# NameError on a fresh kernel. The features are taken to be every numeric
# column of `dataset` except the target NPS_CATEGORY, and the scaler is the
# one this cell actually imports.
# NOTE(review): confirm the intended feature set. Identifier-like columns, or
# NPS_SCORE if the category is derived from it, may need to be excluded.
x = dataset.drop(columns=['NPS_CATEGORY']).select_dtypes(include='number')
scaler = RobustScaler()
scaled_x = scaler.fit_transform(x)

In [None]:
# Wrap the scaled feature matrix in a DataFrame and preview its first rows.
x_scale = pd.DataFrame(data=scaled_x)
x_scale.head(n=5)

In [None]:
# Target variable: the NPS_CATEGORY column, used as `y` in the feature-selection step.
y=dataset['NPS_CATEGORY']

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score the features in `x` against the target `y` with the chi-squared test.
# NOTE(review): scikit-learn's chi2 expects non-negative feature values --
# confirm that `x` contains none (e.g. sentiment polarity or a negative
# lower cap) before relying on this cell.
ordered_rank = SelectKBest(score_func=chi2)
ordered_feature = ordered_rank.fit(x, y)
ordered_feature