In [1]:
# Libraries for data manipulation (numpy, pandas) and visualisation (matplotlib, seaborn)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Use seaborn's 'whitegrid' theme for every plot drawn in this notebook
sns.set_theme(style='whitegrid')

# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / feature_engine.
import warnings
warnings.filterwarnings('ignore')

# Discretiser used further down to bin the numeric features into equal-frequency intervals
from feature_engine.discretisation import EqualFrequencyDiscretiser


In [2]:
# Read the raw e-commerce data from the CSV file in the working directory.
# The preview below shows an 'Unnamed: 0' column, i.e. the file carries a
# previously saved row index as an ordinary column.
df = pd.read_csv(filepath_or_buffer='eCommerce.csv')

# Display the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,ID,n_clicks,n_visits,amount_spent,amount_discount,days_since_registration,profile_information
0,2085,643,142,300.0,228.432503,132,411
1,742,527,95,743.832508,60.882831,86,184
2,750,367,49,305.668886,72.961801,334,329
3,1224,466,30,291.191193,101.90317,131,148
4,2210,715,169,703.136878,506.416735,114,160


In the previous notebook we saw that the dataset contains some outliers but no missing data, so here we only need to take care of the outliers.

In [3]:
# Outlier handling: helper that computes the cut-off bounds for one feature.

def outlier_detect(var, k=3):
    """Return the outlier cut-offs of ``var`` using the k-sigma rule.

    The bounds are ``mean + k*std`` and ``mean - k*std``; values outside
    that interval are treated as outliers by the callers in this notebook.

    Parameters
    ----------
    var : pandas.Series
        Numeric column to analyse (any object exposing ``mean()`` and
        ``std()`` works).
    k : float, default 3
        Number of standard deviations away from the mean at which the
        bounds are placed. The default reproduces the original 3-sigma rule.

    Returns
    -------
    tuple
        ``(upper_bound, lower_bound)`` -- note that the upper bound comes first.
    """
    # Compute each statistic once instead of once per bound.
    centre = var.mean()
    spread = k * var.std()
    return centre + spread, centre - spread

In [4]:
# Report the 3-sigma bounds and the number of outlying values for each feature

cols = ['amount_spent', 'n_clicks', 'n_visits', 'amount_discount',
       'days_since_registration', 'profile_information']
for col in cols:
    upper, lower = outlier_detect(df[col])

    # Boolean mask of the values lying outside [lower, upper]. Summing the mask
    # counts the flagged rows directly; the previous
    # `df[mask].value_counts().sum()` built a full frequency table first and
    # skips rows that contain a NaN in any column.
    is_outlier = (df[col] > upper) | (df[col] < lower)

    print('Column Name: {}'.format(col))
    print('Upper Bound: {}'.format(upper))
    print('Lower Bound: {}'.format(lower))
    print('Number of outliers: {}'.format(is_outlier.sum()))
    print('======================================')


Column Name: amount_spent
Upper Bound: 4948.081163645362
Lower Bound: -2057.899672819025
Number of outliers: 23
Column Name: n_clicks
Upper Bound: 967.9222694739619
Lower Bound: -150.56226947396198
Number of outliers: 12
Column Name: n_visits
Upper Bound: 211.0746685163116
Lower Bound: -22.12346851631159
Number of outliers: 10
Column Name: amount_discount
Upper Bound: 1849.9405396138761
Lower Bound: -1072.9232657057103
Number of outliers: 18
Column Name: days_since_registration
Upper Bound: 498.3834525192299
Lower Bound: -96.4362525192299
Number of outliers: 3
Column Name: profile_information
Upper Bound: 501.45803798659585
Lower Bound: -99.37723798659582
Number of outliers: 3


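Across the six features there are 69 outlying values in total (counted per column, so a row that is extreme in several columns is counted more than once). We can either remove these data points ('trimming') or discretise the features. Since this is a small dataset, we'll use discretisation.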

In [5]:
# Handle the outliers by discretisation: every listed feature is split into
# q = 10 equal-frequency intervals with feature_engine's EqualFrequencyDiscretiser.

features_to_bin = [
    'n_clicks',
    'n_visits',
    'amount_spent',
    'amount_discount',
    'days_since_registration',
    'profile_information',
]
df_dis = EqualFrequencyDiscretiser(q=10, variables=features_to_bin)

# Learn the interval limits from the data and apply them in one step.
# NOTE: this rebinds `df`, so the raw values are no longer available afterwards.
df = df_dis.fit_transform(df)

In [6]:
# Re-run the 3-sigma check on the discretised features.
# NOTE(review): the columns now hold bin codes rather than raw values, so this
# mainly confirms that no bin code lies outside the 3-sigma interval.

cols = ['amount_spent', 'n_clicks', 'n_visits', 'amount_discount',
       'days_since_registration', 'profile_information']
for col in cols:
    upper, lower = outlier_detect(df[col])

    # Count the flagged rows straight from the boolean mask; the previous
    # `df[mask].value_counts().sum()` built a full frequency table first and
    # skips rows that contain a NaN in any column.
    is_outlier = (df[col] > upper) | (df[col] < lower)

    print('Column Name: {}'.format(col))
    print('Upper Bound: {}'.format(upper))
    print('Lower Bound: {}'.format(lower))
    print('Number of outliers: {}'.format(is_outlier.sum()))
    print('======================================')


Column Name: amount_spent
Upper Bound: 13.11856785578404
Lower Bound: -4.11856785578404
Number of outliers: 0
Column Name: n_clicks
Upper Bound: 13.110548861188853
Lower Bound: -4.127348861188853
Number of outliers: 0
Column Name: n_visits
Upper Bound: 13.113181911843498
Lower Bound: -4.192381911843498
Number of outliers: 0
Column Name: amount_discount
Upper Bound: 13.11856785578404
Lower Bound: -4.11856785578404
Number of outliers: 0
Column Name: days_since_registration
Upper Bound: 13.102704502623212
Lower Bound: -4.12510450262321
Number of outliers: 0
Column Name: profile_information
Upper Bound: 13.106021742780488
Lower Bound: -4.126821742780488
Number of outliers: 0


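No, no outliers are flagged any more. Note that no rows were removed: discretisation replaced every value with the bin it falls into, so formerly extreme values now simply sit in the lowest or highest bin.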

This ends the feature engineering section. Now let's save the cleaned dataset as the final dataset; a later notebook will use it for machine learning.

In [7]:
# Persist the discretised dataset for the modelling notebook.
# index=False keeps the DataFrame's row index out of the CSV.
df.to_csv(path_or_buf='final_df.csv', index=False)