In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math

In [3]:
# Input pickle for this notebook (file name suggests the feature-engineering
# stage output — confirm against the upstream notebook).
# NOTE(review): unpickling can execute arbitrary code; only load files you
# produced yourself.
fe_pickle_path = r"C:\Users\Almog\Desktop\Data Science\Projects\Bank Customer Churn\Pickle files\FE_BCC.pkl"
df = pd.read_pickle(fe_pickle_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,...,inactive_long_tenure,single_product,multi_product,active_single_product,has_balance,high_balance,salary_to_balance_ratio,young_customer,senior_customer,senior_inactive
0,619,1,2,42,2,0.0,1,1,1,101348.88,...,0,1,0,1,0,0,101348.88,0,0,0
1,608,3,2,41,1,83807.86,1,0,1,112542.58,...,0,1,0,1,1,0,1.342848,0,0,0
2,502,1,2,42,8,159660.8,3,1,0,113931.57,...,1,0,1,0,1,1,0.713581,0,0,0
3,699,1,2,39,1,0.0,2,0,0,93826.63,...,0,0,0,0,0,0,93826.63,0,0,0
4,850,3,2,43,2,125510.82,1,1,1,79084.1,...,0,1,0,1,1,1,0.630093,0,0,0


In [5]:
# Print each column's name, non-null count and dtype to sanity-check the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   CreditScore                 10000 non-null  int64  
 1   Geography                   10000 non-null  int64  
 2   Gender                      10000 non-null  int64  
 3   Age                         10000 non-null  int64  
 4   Tenure                      10000 non-null  int64  
 5   Balance                     10000 non-null  float64
 6   NumOfProducts               10000 non-null  int64  
 7   HasCrCard                   10000 non-null  int32  
 8   IsActiveMember              10000 non-null  int32  
 9   EstimatedSalary             10000 non-null  float64
 10  Exited                      10000 non-null  int32  
 11  Complain                    10000 non-null  int32  
 12  Satisfaction Score          10000 non-null  int64  
 13  Card Type                   1000

#### Feature Selection

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

# Seed shared by every stochastic estimator below so that a fresh
# "Restart & Run All" reproduces the same selection table.
RANDOM_STATE = 42

# Features and target
X = df.drop(columns='Exited')
y = df['Exited']

# Scale features for the linear models (Logistic, Ridge); the tree ensembles
# further down are fitted on the unscaled frame.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Logistic Regression with L1 penalty (Lasso equivalent for classification).
# C is the inverse of the regularisation strength. The saga solver shuffles
# the data, hence the explicit seed.
lasso_clf = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1/0.01,
    max_iter=10000,
    random_state=RANDOM_STATE,
).fit(X_scaled, y)
lasso_selected = (np.abs(lasso_clf.coef_)[0] > 0).astype(int)

# Ridge Classifier (L2 penalty).
# NOTE(review): an L2 penalty shrinks coefficients but does not drive them
# exactly to zero, so this `> 0` test is expected to mark (almost) every
# feature as selected; the Ridge column therefore adds little discrimination
# to `Sum`. Kept as-is so the table matches the discussion below — consider a
# magnitude threshold if real pruning is wanted.
ridge_clf = RidgeClassifier(alpha=0.01, max_iter=10000).fit(X_scaled, y)
ridge_selected = (np.abs(ridge_clf.coef_)[0] > 0).astype(int)

# Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=RANDOM_STATE).fit(X, y)
gb_selected = (gb_clf.feature_importances_ > 0).astype(int)

# Random Forest Classifier (bootstrap sampling and feature sub-sampling are
# random, so unseeded runs can flip near-zero importances).
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE).fit(X, y)
rf_selected = (rf_clf.feature_importances_ > 0).astype(int)

# XGBoost Classifier
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE).fit(X, y)
xgb_selected = (xgb_clf.feature_importances_ > 0).astype(int)

# One 0/1 vote column per model; `Sum` counts how many models kept the feature.
model_columns = ['Lasso', 'GradientBoost', 'RandomForest', 'Ridge', 'XGBoost']

# Build, score, sort and re-index in a single chained expression so that
# re-running the cell never mutates an already displayed frame in place.
selection_df = (
    pd.DataFrame({
        'Feature': X.columns,
        'Lasso': lasso_selected,
        'GradientBoost': gb_selected,
        'RandomForest': rf_selected,
        'Ridge': ridge_selected,
        'XGBoost': xgb_selected
    })
    .assign(Sum=lambda votes: votes[model_columns].sum(axis=1))
    .sort_values('Sum', ascending=False)
    .reset_index(drop=True)
)

selection_df


Unnamed: 0,Feature,Lasso,GradientBoost,RandomForest,Ridge,XGBoost,Sum
0,CreditScore,1,1,1,1,1,5
1,EstimatedSalary,1,1,1,1,1,5
2,salary_to_balance_ratio,1,1,1,1,1,5
3,inactive_long_tenure,1,1,1,1,1,5
4,low_credit_and_complains,1,1,1,1,1,5
5,Point Earned,1,1,1,1,1,5
6,Card Type,1,1,1,1,1,5
7,Satisfaction Score,1,1,1,1,1,5
8,Complain,1,1,1,1,1,5
9,senior_inactive,1,1,1,1,1,5


### Feature Selection Decision Rationale

Although several engineered features were selected by fewer models (a lower `Sum` in the table above), all such features were intentionally retained at this stage.

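This dataset contains a relatively small number of observations, which can cause feature importance methods to be less discriminative and overly permissive. The selection rule used above (a non-zero coefficient or importance) is also lenient by construction: an L2 penalty shrinks coefficients without setting them exactly to zero, so the Ridge column marks essentially every feature as selected. Additionally, all lower-scoring features were engineered based on domain intuition (customer behavior, satisfaction, tenure patterns) rather than raw noise.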

Dropping these features prematurely could remove valuable signals that:
- activate only under class imbalance
- influence recall rather than accuracy
- contribute through interactions rather than standalone importance

For this reason, feature selection is treated as an iterative process. Final pruning will be considered only after baseline modeling, threshold tuning, and recall-focused evaluation provide empirical evidence that certain features do not contribute meaningfully to model performance.


In [17]:
# Output pickle for the next stage. In the cells visible here `df` is not
# modified after loading, so the frame is saved with every feature retained.
fs_pickle_path = r"C:\Users\Almog\Desktop\Data Science\Projects\Bank Customer Churn\Pickle files\FS_BCC.pkl"
df.to_pickle(fs_pickle_path)