# Feature Engineering

Feature engineering is the art of creating new features from the available ones in order to increase model performance. In this case, feature engineering is guided by the hypothesis testing results, with the aim of capturing complex relationships between the features that most influence customer churn.

# Preparing the environment

In [1]:
import pandas as pd
import sys
# Extend the module search path before importing the project-local `utils`
# package (presumably it lives under this relative directory — confirm).
sys.path.append('../ecommerce_customer_churn_prevention')
from utils import paths

# Importing the data

In [2]:
# Read the ETL output into a DataFrame and preview its first rows.
# The file location is resolved by the project's `paths` helper.
interim_csv_path = paths.data_interim_dir('df_etl_processed.csv')
df = pd.read_csv(interim_csv_path)
df.head()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,159.93
1,50002,1,,Mobile Phone,1,8.0,UPI,Male,3.0,4,Mobile Phone,3,Single,7,1,15.0,0.0,1.0,0.0,120.9
2,50003,1,,Mobile Phone,1,30.0,Debit Card,Male,2.0,4,Mobile Phone,3,Single,6,1,14.0,0.0,1.0,3.0,120.28
3,50004,1,0.0,Mobile Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134.07
4,50005,1,0.0,Mobile Phone,1,12.0,Credit Card,Male,,3,Mobile Phone,5,Single,3,0,11.0,1.0,1.0,3.0,129.6


In [3]:
# Cast the columns that the data dictionary treats as categorical
# to the pandas 'category' dtype, then print the frame summary.

cat_features = ['PreferredLoginDevice', 'CityTier', 'PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'MaritalStatus', 'Complain']

for cat_col in cat_features:
    # Each column is converted on its own, so it gets its own set of categories.
    df[cat_col] = df[cat_col].astype('category')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   CustomerID                   5630 non-null   int64   
 1   Churn                        5630 non-null   int64   
 2   Tenure                       5366 non-null   float64 
 3   PreferredLoginDevice         5630 non-null   category
 4   CityTier                     5630 non-null   category
 5   WarehouseToHome              5379 non-null   float64 
 6   PreferredPaymentMode         5630 non-null   category
 7   Gender                       5630 non-null   category
 8   HourSpendOnApp               5375 non-null   float64 
 9   NumberOfDeviceRegistered     5630 non-null   int64   
 10  PreferedOrderCat             5630 non-null   category
 11  SatisfactionScore            5630 non-null   int64   
 12  MaritalStatus                5630 non-null   category
 13  Num

In [4]:
# Interactions between categorical features
#
# Each new column is named '<left>_<right>' and holds the two source labels
# (as strings) joined by an underscore. The list order fixes the order in
# which the columns are appended to the frame.

categorical_pairs = [
    ('PreferredLoginDevice', 'CityTier'),
    ('PreferredLoginDevice', 'PreferredPaymentMode'),
    ('CityTier', 'PreferredPaymentMode'),
    ('Gender', 'PreferedOrderCat'),
    ('Gender', 'MaritalStatus'),
    ('Gender', 'Complain'),
    ('PreferedOrderCat', 'MaritalStatus'),
    ('Complain', 'MaritalStatus'),
    ('Complain', 'PreferedOrderCat'),
]

for left, right in categorical_pairs:
    df[f'{left}_{right}'] = df[left].astype(str) + '_' + df[right].astype(str)

In [9]:
# Interactions between numerical features

# Shared denominator for every '*_Tenure_Ratio' column; the +1 keeps it
# non-zero for rows where Tenure is 0.
tenure_plus_one = df['Tenure'] + 1

# Per-tenure ratios, appended in this order.
tenure_ratio_sources = ['OrderCount', 'CouponUsed', 'WarehouseToHome', 'HourSpendOnApp', 'SatisfactionScore']
for numerator in tenure_ratio_sources:
    df[f'{numerator}_Tenure_Ratio'] = df[numerator] / tenure_plus_one

# Element-wise products, named '<left>_<right>'.
product_pairs = [
    ('HourSpendOnApp', 'NumberOfDeviceRegistered'),
    ('SatisfactionScore', 'NumberOfDeviceRegistered'),
    ('SatisfactionScore', 'OrderCount'),
    ('SatisfactionScore', 'CouponUsed'),
    ('SatisfactionScore', 'CashbackAmount'),
]
for left, right in product_pairs:
    df[f'{left}_{right}'] = df[left] * df[right]

# Cashback ratios come last so the column order matches the original layout.
df['CashbackAmount_Tenure_Ratio'] = df['CashbackAmount'] / tenure_plus_one
coupons_plus_one = df['CouponUsed'] + 1
df['CashbackAmount_CouponUsed_Ratio'] = df['CashbackAmount'] / coupons_plus_one

In [10]:
# Write the frame, including the newly added feature columns, to CSV.
# index=False leaves the row index out of the file.

processed_csv_path = paths.data_processed_dir('df_processed.csv')
df.to_csv(processed_csv_path, index=False)