In [3]:
# we will derive new features from the existing featues to gain more insights and uncover patterns (Comment by VedantK)
import os
import pandas as pd 
current_dir = os.getcwd()
data_path = os.path.join(current_dir, '..', 'data', 'raw', 'processed_data.csv')
data_model = pd.read_csv(data_path)

#### Engineering Advanced Features

In [4]:
# Engagement score
data_model['Engagement'] = data_model['NumOfProducts'] * data_model['IsActiveMember']

# Salary-to-balance ratio
data_model['Balance_to_Salary'] = data_model['Balance'] / (data_model['EstimatedSalary'] + 1) 

In [5]:
# Age feature : How long a customer has been with the bank relative to their age
data_model['Tenure_per_Age'] = data_model['Tenure'] / (data_model['Age'] + 1)

In [6]:
#Balance Features : Customers with very low or very high balances might behave differently
data_model['High_Balance'] = (data_model['Balance'] > data_model['Balance'].median()).astype(int)
data_model['Low_Balance'] = (data_model['Balance'] < data_model['Balance'].quantile(0.25)).astype(int)

In [7]:
#Churn Risk Score Features - Combine complaints and satisfaction to predict churn tendencies (High complaints + low satisfaction → higher risk)
data_model['Risk_Score'] = data_model['Complain'] * (1 - data_model['Satisfaction Score']/100)

In [8]:
# Gradient boosting or tree models can benefit from ratio and product features:

data_model['Age_Balance'] = data_model['Age'] * data_model['Balance']
data_model['Tenure_Balance'] = data_model['Tenure'] * data_model['Balance']

In [9]:
# Clustering based customer segmentation, we will use k means clustering to group "similar customers" together and assign the cluseters a label and use these lables as features
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

cluster_features = ['CreditScore','Balance','EstimatedSalary','Satisfaction Score','Point Earned']
scaler = StandardScaler()
X_cluster = scaler.fit_transform(data_model[cluster_features])

kmeans = KMeans(n_clusters=5, random_state=42)
data_model['CustomerCluster'] = kmeans.fit_predict(X_cluster)

In [10]:
data_model.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,...,Geography_Spain,Engagement,Balance_to_Salary,Tenure_per_Age,High_Balance,Low_Balance,Risk_Score,Age_Balance,Tenure_Balance,CustomerCluster
0,619,0,42,2,0.0,1,1,1,101348.88,1,...,False,1,0.0,0.046512,0,0,0.98,0.0,0.0,4
1,608,0,41,1,83807.86,1,0,1,112542.58,0,...,True,1,0.74467,0.02381,0,0,0.97,3436122.26,83807.86,3
2,502,0,42,8,159660.8,3,1,0,113931.57,1,...,False,0,1.401362,0.186047,1,0,0.97,6705753.6,1277286.4,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,...,False,0,0.0,0.025,0,0,0.0,0.0,0.0,3
4,850,0,43,2,125510.82,1,1,1,79084.1,0,...,True,1,1.587035,0.045455,1,0,0.0,5396965.26,251021.64,3


In [14]:
data_path = os.path.join(current_dir, '..', 'data', 'processed', 'engineered_data.csv')

In [15]:
data_model.to_csv(data_path, index=False)  

In [13]:
data_model.columns

Index(['CreditScore', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
       'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'Complain',
       'Satisfaction Score', 'Point Earned', 'Card_Type_Encoded',
       'Geography_France', 'Geography_Germany', 'Geography_Spain',
       'Engagement', 'Balance_to_Salary', 'Tenure_per_Age', 'High_Balance',
       'Low_Balance', 'Risk_Score', 'Age_Balance', 'Tenure_Balance',
       'CustomerCluster'],
      dtype='object')

In [None]:
# Identifying Leaky Features
corr = data_model.corr(numeric_only=True)["Exited"].sort_values(ascending=False)
print(corr)

Exited                1.000000
Complain              0.995693
Risk_Score            0.995581
Age                   0.285296
Age_Balance           0.205806
Geography_Germany     0.173313
Balance               0.118577
High_Balance          0.114194
Tenure_Balance        0.087883
Balance_to_Salary     0.025950
Card_Type_Encoded     0.016949
EstimatedSalary       0.012490
Point Earned         -0.004628
Satisfaction Score   -0.005849
HasCrCard            -0.006976
Tenure               -0.013656
CreditScore          -0.026771
NumOfProducts        -0.047611
Geography_Spain      -0.052800
CustomerCluster      -0.077990
Geography_France     -0.104688
Gender               -0.106267
Tenure_per_Age       -0.119383
Engagement           -0.138103
IsActiveMember       -0.156356
Low_Balance                NaN
Name: Exited, dtype: float64
