In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scipy.stats as stats
from scipy.stats import chi2_contingency

In [2]:
# Load the cleaned dataset from the Excel workbook and preview the first rows.
df = pd.read_excel('cleaned_data.xlsx')
df.head()

Unnamed: 0,CustomerID,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount,Churn
0,50001,4,Mobile,3,6,DC,Female,3,3,Laptop,2,Single,9,1,11,1,1,5,159.93,1
1,50002,10,Mobile,1,8,UPI,Male,3,4,Mobile,3,Single,7,1,15,0,1,0,120.9,1
2,50003,10,Mobile,1,30,DC,Male,2,4,Mobile,3,Single,6,1,14,0,1,3,120.28,1
3,50004,0,Mobile,3,15,DC,Male,2,4,Laptop,5,Single,8,0,23,0,1,3,134.07,1
4,50005,0,Mobile,1,12,CC,Male,3,3,Mobile,5,Single,3,0,11,1,1,3,129.6,1


# Outlier Handling
Most of the outliers we observed during exploration of the data seem to represent the natural variation in the population (of the respective variable). Therefore, we leave them as they are, except for a few variables: Tenure, DaySinceLastOrder, CashbackAmount and NumberOfAddress.

We handle them one by one.

### Tenure

According to the percentile distribution of 'Tenure', we observe that customers with Tenure > 21 have not churned, and that the 99th percentile value is 31, which means the extreme values lie beyond that. So let's cap at the value 31, i.e., all values above 31 will be treated as 31.

In [5]:
# Cap Tenure at its 99th percentile: every value above the cap is replaced by the cap.
# Series.clip is the vectorized equivalent of the element-wise lambda; NaN values are
# left untouched. Note: unlike the lambda, clip may keep an integer dtype when the cap
# is a whole number.
tenure_cap = df['Tenure'].quantile(0.99)
df['Tenure'] = df['Tenure'].clip(upper=tenure_cap)

### DaySinceLastOrder

Similarly for DaySinceLastOrder, according to the percentile distribution, we see that the values after DaySinceLastOrder=18 are extreme values [30, 31, 46]. The 99th percentile value is 15, so we shall cap at this value.

In [7]:
# Cap DaySinceLastOrder at its 99th percentile: values above the cap are replaced by it.
# Series.clip is the vectorized equivalent of the element-wise lambda; NaN values are
# left untouched. Note: unlike the lambda, clip may keep an integer dtype when the cap
# is a whole number.
days_since_last_order_cap = df['DaySinceLastOrder'].quantile(0.99)
df['DaySinceLastOrder'] = df['DaySinceLastOrder'].clip(upper=days_since_last_order_cap)
     

### CashbackAmount
Here we see that there are outliers both above the upper bound and below the lower bound. We therefore cap the values at the 1st and 99th percentiles.

In [9]:
# Cap CashbackAmount on both sides: values below the 1st percentile are raised to it and
# values above the 99th percentile are lowered to it. Series.clip does this in one
# vectorized call instead of a nested conditional lambda; NaN values are left untouched.
cashback_low, cashback_high = df['CashbackAmount'].quantile([0.01, 0.99]).values
df['CashbackAmount'] = df['CashbackAmount'].clip(lower=cashback_low, upper=cashback_high)
     

# Feature selection using Statistical Test

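We perform the following statistical tests (Welch's t-test for the numerical variables and the chi-square test of independence for the categorical variables) to select the features that contribute most to our model.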

#### Numerical Variables

In [11]:
def num_stats(num_col, data=None):
    """Run Welch's t-test on a numerical column, split by churn status.

    H0: there is no relation between ``num_col`` and the target, i.e. the mean of
    ``num_col`` is the same for the ``Churn == 0`` and ``Churn == 1`` groups.
    Prints the p-value and whether H0 is rejected at the 0.05 significance level.

    Parameters
    ----------
    num_col : str
        Name of the numerical column to test.
    data : pandas.DataFrame, optional
        Frame holding ``num_col`` and a ``Churn`` column. When omitted, the
        notebook-level ``df`` is used, so existing ``num_stats(col)`` calls
        keep working.

    Returns
    -------
    None
        The result is reported via ``print`` only.
    """
    if data is None:
        data = df

    # Select each group with a single .loc call (no chained indexing) and drop
    # missing values: a NaN would otherwise propagate into a NaN p-value, which
    # would silently fall into the "do not reject" branch below.
    group_0 = data.loc[data['Churn'] == 0, num_col].dropna()
    group_1 = data.loc[data['Churn'] == 1, num_col].dropna()

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    _, p_value = stats.ttest_ind(group_0, group_1, equal_var=False)

    print('P-value : ', p_value)
    if p_value < 0.05:
        print('Reject null hypothesis')
    else:
        print('Do not reject null hypotheis')
     

In [13]:
# Numerical features to be checked against the target with the t-test.
num_col_list = [
    'Tenure',
    'WarehouseToHome',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
    'CashbackAmount',
    'HourSpendOnApp',
    'NumberOfDeviceRegistered',
    'NumberOfAddress',
]
     

In [15]:
# Report the t-test outcome for every numerical feature, one delimited section each.
separator = '---------------------------------------------'
for feature in num_col_list:
  print('Column Name : ', feature)
  num_stats(feature)
  print(separator)
     

Column Name :  Tenure
P-value :  2.0082775953677156e-202
Reject null hypothesis
---------------------------------------------
Column Name :  WarehouseToHome
P-value :  6.733649545254792e-08
Reject null hypothesis
---------------------------------------------
Column Name :  OrderAmountHikeFromlastYear
P-value :  0.4237779148876236
Do not reject null hypotheis
---------------------------------------------
Column Name :  CouponUsed
P-value :  0.9159384616337447
Do not reject null hypotheis
---------------------------------------------
Column Name :  OrderCount
P-value :  0.06430615371620459
Do not reject null hypotheis
---------------------------------------------
Column Name :  DaySinceLastOrder
P-value :  3.1021436368381747e-39
Reject null hypothesis
---------------------------------------------
Column Name :  CashbackAmount
P-value :  3.03484589898161e-43
Reject null hypothesis
---------------------------------------------
Column Name :  HourSpendOnApp
P-value :  0.14300103213870458
Do

In [17]:
def chisq_test(cat_col, df):
    """Run a chi-square test of independence between ``cat_col`` and ``Churn``.

    Builds the Churn x ``cat_col`` contingency table from ``df``, then prints the
    p-value followed by the decision on H0 at the 0.05 significance level.
    Returns None.
    """
    contingency = pd.crosstab(index=df['Churn'], columns=df[cat_col])
    # chi2_contingency returns (statistic, p-value, dof, expected); only the
    # p-value (position 1) is needed here.
    p_value = chi2_contingency(contingency)[1]

    print('P-Value :', p_value)
    # A p-value below 0.05 means we reject H0.
    verdict = 'Reject null hypothesis' if p_value < 0.05 else 'Do not reject null hypotheis'
    print(verdict)

In [19]:
# Categorical features to be checked against the target with the chi-square test.
cat_col_list = [
    'PreferredLoginDevice',
    'CityTier',
    'PreferredPaymentMode',
    'PreferedOrderCat',
    'SatisfactionScore',
    'MaritalStatus',
    'Gender',
    'Complain',
]

# Report the chi-square outcome for every categorical feature, one delimited section each.
separator = '---------------------------------------------'
for feature in cat_col_list:
  print('Column Name : ', feature)
  chisq_test(feature, df)
  print(separator)
     

Column Name :  PreferredLoginDevice
P-Value : 0.0001477040239947965
Reject null hypothesis
---------------------------------------------
Column Name :  CityTier
P-Value : 1.2612000812079956e-09
Reject null hypothesis
---------------------------------------------
Column Name :  PreferredPaymentMode
P-Value : 1.4978570960706217e-10
Reject null hypothesis
---------------------------------------------
Column Name :  PreferedOrderCat
P-Value : 3.11924340428766e-61
Reject null hypothesis
---------------------------------------------
Column Name :  SatisfactionScore
P-Value : 2.4233349782737515e-14
Reject null hypothesis
---------------------------------------------
Column Name :  MaritalStatus
P-Value : 1.073011277910542e-41
Reject null hypothesis
---------------------------------------------
Column Name :  Gender
P-Value : 0.030820940334890086
Reject null hypothesis
---------------------------------------------
Column Name :  Complain
P-Value : 2.6644609654641377e-78
Reject null hypothesis


In [21]:
# Drop the CustomerID column and the numerical features for which the t-test above
# did not reject H0.
# Reassigning instead of inplace=True, together with errors='ignore', keeps this cell
# safe to re-run: a second run no longer raises KeyError for already-removed columns.
# `columns=` already implies the column axis, so the redundant axis=1 is omitted.
cols_to_drop = ['CustomerID', 'HourSpendOnApp', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount']
df = df.drop(columns=cols_to_drop, errors='ignore')
df.head()

Unnamed: 0,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,DaySinceLastOrder,CashbackAmount,Churn
0,4.0,Mobile,3,6,DC,Female,3,Laptop,2,Single,9,1,5.0,159.93,1
1,10.0,Mobile,1,8,UPI,Male,4,Mobile,3,Single,7,1,0.0,120.9,1
2,10.0,Mobile,1,30,DC,Male,4,Mobile,3,Single,6,1,3.0,120.28,1
3,0.0,Mobile,3,15,DC,Male,4,Laptop,5,Single,8,0,3.0,134.07,1
4,0.0,Mobile,1,12,CC,Male,3,Mobile,5,Single,3,0,3.0,129.6,1


In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Nominal columns: expanded into dummy indicator columns (first level dropped).
one_hot_cols = ['PreferredLoginDevice', 'PreferredPaymentMode', 'PreferedOrderCat', 'MaritalStatus', 'Gender', 'Complain']

# Ordinal columns: replaced by integer codes from a per-column LabelEncoder.
label_cols = ['CityTier', 'SatisfactionScore']

# One-hot encode the nominal variables into a new frame; `df` itself is not modified.
df_encoded = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

# Fit one encoder per ordinal column and keep each fitted encoder, keyed by column
# name, in `label_encoders`.
label_encoders = {}
for col in label_cols:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col])
    label_encoders[col] = encoder

# Persist the encoded frame (without the index) to an Excel file.
df_encoded.to_excel('modelready.xlsx', index=False)

# Confirmation message for the reader.
print("DataFrame has been saved to 'modelready.xlsx'")


DataFrame has been saved to 'modelready.xlsx'
