In [1]:
import pandas as pd
import numpy as np
import warnings

In [2]:
# Load the pickled DataFrame (presumably the output of the EDA stage, judging
# by the file name) and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files you created yourself. The absolute, user-specific path also ties this
# notebook to a single machine.
eda_pickle_path = r"C:\Users\Almog\Desktop\Data Science\Projects\Bank Customer Churn\Pickle files\EDA_BCC.pkl"
df = pd.read_pickle(eda_pickle_path)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,619,France,Female,42,2,0.0,1,True,True,101348.88,True,True,2,DIAMOND,464
1,608,Spain,Female,41,1,83807.86,1,False,True,112542.58,False,True,3,DIAMOND,456
2,502,France,Female,42,8,159660.8,3,True,False,113931.57,True,True,3,DIAMOND,377
3,699,France,Female,39,1,0.0,2,False,False,93826.63,False,False,5,GOLD,350
4,850,Spain,Female,43,2,125510.82,1,True,True,79084.1,False,False,5,GOLD,425


In [3]:
# Show each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CreditScore         10000 non-null  int64  
 1   Geography           10000 non-null  string 
 2   Gender              10000 non-null  string 
 3   Age                 10000 non-null  int64  
 4   Tenure              10000 non-null  int64  
 5   Balance             10000 non-null  float64
 6   NumOfProducts       10000 non-null  int64  
 7   HasCrCard           10000 non-null  bool   
 8   IsActiveMember      10000 non-null  bool   
 9   EstimatedSalary     10000 non-null  float64
 10  Exited              10000 non-null  bool   
 11  Complain            10000 non-null  bool   
 12  Satisfaction Score  10000 non-null  int64  
 13  Card Type           10000 non-null  string 
 14  Point Earned        10000 non-null  int64  
dtypes: bool(4), float64(2), int64(6), string(3)
memory usa

#### Feature Engineering 

##### Encoding Categorical Features

In [6]:
def _encode_labels(labels, codes):
    """Trim whitespace from `labels` and translate each label to its integer code.

    Parameters
    ----------
    labels : pandas.Series of strings
        The categorical column to encode.
    codes : dict
        Maps every expected label to its integer code.

    Returns
    -------
    pandas.Series
        The encoded column, aligned with `labels`.

    Raises
    ------
    ValueError
        If a non-missing label has no entry in `codes`. A bare `Series.map`
        would quietly turn such a label into NaN (and the column would stop
        being integer-typed), hiding the problem until much later.

    Values that are already missing are passed through as missing, exactly as
    a bare `.map` would leave them.
    """
    trimmed = labels.str.strip()
    encoded = trimmed.map(codes)
    unknown = trimmed[encoded.isna() & trimmed.notna()]
    if len(unknown) > 0:
        raise ValueError(
            f"{labels.name}: no code defined for {sorted(set(unknown.tolist()))}"
        )
    return encoded


# NOTE: this cell overwrites the string columns of `df` in place, so it cannot
# be re-run on the already-encoded frame (`.str` only works on string data);
# reload `df` first if the cell has to be executed again.

# Nominal features: the integer codes are arbitrary identifiers.
# NOTE(review): plain integer codes give these categories an artificial order
# (e.g. France < Germany < Spain); consider one-hot encoding if the downstream
# model is linear or distance-based.
df['Geography'] = _encode_labels(
    df['Geography'], {'France': 1, 'Germany': 2, 'Spain': 3}
)
df['Gender'] = _encode_labels(df['Gender'], {'Male': 1, 'Female': 2})

# Ordinal feature: a larger code is treated as a higher card tier.
# NOTE(review): confirm that DIAMOND really ranks below PLATINUM.
df['Card Type'] = _encode_labels(
    df['Card Type'], {'SILVER': 0, 'GOLD': 1, 'DIAMOND': 2, 'PLATINUM': 3}
)

# Boolean flags become 0/1 integers.
binary_cols = ['HasCrCard', 'IsActiveMember', 'Exited', 'Complain']
for col in binary_cols:
    df[col] = df[col].astype(bool).astype(int)



In [7]:
# Preview the first rows to check that the categorical and boolean columns now hold integer codes.
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,619,1,2,42,2,0.0,1,1,1,101348.88,1,1,2,2,464
1,608,3,2,41,1,83807.86,1,0,1,112542.58,0,1,3,2,456
2,502,1,2,42,8,159660.8,3,1,0,113931.57,1,1,3,2,377
3,699,1,2,39,1,0.0,2,0,0,93826.63,0,0,5,1,350
4,850,3,2,43,2,125510.82,1,1,1,79084.1,0,0,5,1,425


In [13]:
# Re-check dtypes and non-null counts: after encoding, every column should be numeric with no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CreditScore         10000 non-null  int64  
 1   Geography           10000 non-null  int64  
 2   Gender              10000 non-null  int64  
 3   Age                 10000 non-null  int64  
 4   Tenure              10000 non-null  int64  
 5   Balance             10000 non-null  float64
 6   NumOfProducts       10000 non-null  int64  
 7   HasCrCard           10000 non-null  int32  
 8   IsActiveMember      10000 non-null  int32  
 9   EstimatedSalary     10000 non-null  float64
 10  Exited              10000 non-null  int32  
 11  Complain            10000 non-null  int32  
 12  Satisfaction Score  10000 non-null  int64  
 13  Card Type           10000 non-null  int64  
 14  Point Earned        10000 non-null  int64  
dtypes: float64(2), int32(4), int64(9)
memory usage: 1015.8

In [19]:
def _add_churn_features(frame):
    """Return a copy of `frame` extended with the engineered indicator columns.

    Every flag is a boolean condition cast to 0/1 with `.astype(int)`;
    `salary_to_balance_ratio` is the only continuous feature. `frame` itself
    is left untouched.

    NOTE(review):
    - `complainer` is `Complain == 1` cast to int, so it repeats `Complain`
      whenever that column already holds 0/1.
    - `multi_product` means three or more products; customers with exactly two
      products are neither `single_product` nor `multi_product`.
    - The `+ 1` in the ratio's denominator avoids division by zero, which
      leaves zero-balance customers with a ratio equal to their salary, far
      above everyone else's. Confirm that this scale is intended.
    - The `high_balance` cut-off is the median of whatever frame is passed in.
    """
    # Conditions that feed more than one feature are evaluated once.
    is_active = frame['IsActiveMember'] == 1
    is_inactive = frame['IsActiveMember'] == 0
    complained = frame['Complain'] == 1
    weak_credit = frame['CreditScore'] < 600
    long_stay = frame['Tenure'] >= 7
    one_product = frame['NumOfProducts'] == 1
    is_senior = frame['Age'] >= 55

    satisfaction = frame['Satisfaction Score']
    balance = frame['Balance']

    return frame.assign(
        # 1. Simple behavior flags
        active_low_satisfaction=(is_active & satisfaction.isin([1, 2])).astype(int),
        inactive_high_satisfaction=(is_inactive & (satisfaction >= 4)).astype(int),
        complainer=complained.astype(int),
        # 2. Credit & risk signals
        low_credit_score=weak_credit.astype(int),
        high_credit_score=(frame['CreditScore'] >= 750).astype(int),
        low_credit_and_complains=(weak_credit & complained).astype(int),
        # 3. Tenure & loyalty
        long_tenure=long_stay.astype(int),
        short_tenure=(frame['Tenure'] <= 2).astype(int),
        inactive_long_tenure=(is_inactive & long_stay).astype(int),
        # 4. Product engagement
        single_product=one_product.astype(int),
        multi_product=(frame['NumOfProducts'] >= 3).astype(int),
        active_single_product=(is_active & one_product).astype(int),
        # 5. Money behavior
        has_balance=(balance > 0).astype(int),
        high_balance=(balance > balance.median()).astype(int),
        salary_to_balance_ratio=frame['EstimatedSalary'] / (balance + 1),
        # 6. Age-based risk
        young_customer=(frame['Age'] < 30).astype(int),
        senior_customer=is_senior.astype(int),
        senior_inactive=(is_senior & is_inactive).astype(int),
    )


df_fe = _add_churn_features(df)

df_fe.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,...,inactive_long_tenure,single_product,multi_product,active_single_product,has_balance,high_balance,salary_to_balance_ratio,young_customer,senior_customer,senior_inactive
0,619,1,2,42,2,0.0,1,1,1,101348.88,...,0,1,0,1,0,0,101348.88,0,0,0
1,608,3,2,41,1,83807.86,1,0,1,112542.58,...,0,1,0,1,1,0,1.342848,0,0,0
2,502,1,2,42,8,159660.8,3,1,0,113931.57,...,1,0,1,0,1,1,0.713581,0,0,0
3,699,1,2,39,1,0.0,2,0,0,93826.63,...,0,0,0,0,0,0,93826.63,0,0,0
4,850,3,2,43,2,125510.82,1,1,1,79084.1,...,0,1,0,1,1,1,0.630093,0,0,0


In [23]:
# List the columns, dtypes and non-null counts of the frame that now includes the engineered features.
df_fe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   CreditScore                 10000 non-null  int64  
 1   Geography                   10000 non-null  int64  
 2   Gender                      10000 non-null  int64  
 3   Age                         10000 non-null  int64  
 4   Tenure                      10000 non-null  int64  
 5   Balance                     10000 non-null  float64
 6   NumOfProducts               10000 non-null  int64  
 7   HasCrCard                   10000 non-null  int32  
 8   IsActiveMember              10000 non-null  int32  
 9   EstimatedSalary             10000 non-null  float64
 10  Exited                      10000 non-null  int32  
 11  Complain                    10000 non-null  int32  
 12  Satisfaction Score          10000 non-null  int64  
 13  Card Type                   1000

In [25]:
# Persist the feature-engineered frame as a pickle file.
# NOTE(review): the absolute, user-specific path only exists on one machine;
# a path relative to a configurable data directory would make this portable.
fe_pickle_path = r"C:\Users\Almog\Desktop\Data Science\Projects\Bank Customer Churn\Pickle files\FE_BCC.pkl"
df_fe.to_pickle(fe_pickle_path)