# Binary Classification with Bank Churn

## Set-Up

### Imports

In [1]:
# Data Wrangling
import pandas as pd
import numpy  as np

# Data visualisation
from matplotlib import pyplot as plt
import seaborn as sns
from yellowbrick.classifier import ConfusionMatrix

# Machine Learning
## Data splitting
from sklearn.model_selection import train_test_split, StratifiedKFold
## Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder, LabelEncoder, OrdinalEncoder
## Dimensionality
from sklearn.decomposition import PCA, FactorAnalysis
## Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
## Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Utils
## Paths
from pyhere import here
## Datetime
import datetime
## Data
from collections import defaultdict

### File paths

In [2]:
# Input CSVs, located with pyhere's here().
raw_data = here("Data", "Raw", "binary_classification_with_bank_churn.csv")
test_data = here("Data", "Test", "binary_classification_with_bank_churn.csv")

# The submission file name carries a minute-resolution timestamp (YYYYmmddHHMM).
out_suffix = format(datetime.datetime.now(), "%Y%m%d%H%M")
out_file = here("Submissions", f"binary_classification_with_bank_churn_{out_suffix}.csv")

## Data

### Intake

In [3]:
# Load the raw training data; the first CSV column is used as the index.
df = pd.read_csv(raw_data, index_col = 0)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the cell output.
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165034 entries, 0 to 165033
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   CustomerId       165034 non-null  int64  
 1   Surname          165034 non-null  object 
 2   CreditScore      165034 non-null  int64  
 3   Geography        165034 non-null  object 
 4   Gender           165034 non-null  object 
 5   Age              165034 non-null  float64
 6   Tenure           165034 non-null  int64  
 7   Balance          165034 non-null  float64
 8   NumOfProducts    165034 non-null  int64  
 9   HasCrCard        165034 non-null  float64
 10  IsActiveMember   165034 non-null  float64
 11  EstimatedSalary  165034 non-null  float64
 12  Exited           165034 non-null  int64  
dtypes: float64(5), int64(5), object(3)
memory usage: 17.6+ MB


None

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


### Initial data inspection

#### Missing data

In [4]:
# Number of missing values in each column.
df.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

#### Numerical features

In [5]:
# Columns treated as numeric in the summary statistics below.
num_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]
display(df.loc[:, num_cols].describe())

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary
count,165034.0,165034.0,165034.0,165034.0,165034.0,165034.0
mean,656.454373,38.125888,5.020353,55478.086689,1.554455,112574.822734
std,80.10334,8.867205,2.806159,62817.663278,0.547154,50292.865585
min,350.0,18.0,0.0,0.0,1.0,11.58
25%,597.0,32.0,3.0,0.0,1.0,74637.57
50%,659.0,37.0,5.0,0.0,2.0,117948.0
75%,710.0,42.0,7.0,119939.5175,2.0,155152.4675
max,850.0,92.0,10.0,250898.09,4.0,199992.48


#### Categorical features

In [6]:
# Frequency table for the categorical columns: each feature contributes a
# "--name--" header row (NaN count) followed by one row per observed level.
count_dct = defaultdict(list)

cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Exited']
for col in cat_cols:
    level_counts = df[col].value_counts()

    count_dct["Feature"].append(f"--{col}--")
    count_dct["Feature"].extend(level_counts.index.values)

    count_dct["Count"].append(np.nan)
    count_dct["Count"].extend(level_counts.values)

    # Per-feature total, repeated for every level so the percentage is a plain column ratio.
    count_dct["Totals"].append(np.nan)
    count_dct["Totals"].extend([level_counts.values.sum()] * len(level_counts.values))

count_df = pd.DataFrame.from_dict(count_dct)
# Label-based, vectorised arithmetic: the previous row-wise apply indexed each row
# with integer keys (row[1] / row[2]), which relies on deprecated positional
# fallback for label-indexed Series. Header rows stay NaN (NaN / NaN).
count_df["Percentage"] = count_df["Count"] / count_df["Totals"] * 100
display(count_df.drop(columns = ["Totals"]))
del count_df

Unnamed: 0,Feature,Count,Percentage
0,--Geography--,,
1,France,94215.0,57.088236
2,Spain,36213.0,21.942751
3,Germany,34606.0,20.969012
4,--Gender--,,
5,Male,93150.0,56.442915
6,Female,71884.0,43.557085
7,--HasCrCard--,,
8,1.0,124428.0,75.395373
9,0.0,40606.0,24.604627


### Data Preparation

In [20]:
# Integer-encode the two string-valued categoricals in place.
# The fitted encoders are kept (rather than thrown away) so the identical
# category -> integer mapping can later be applied to the test set with
# .transform() or reversed with .inverse_transform().
# NOTE: re-running this cell on already-encoded data refits the encoders on the
# integer codes; reload df first if the original labels are needed.
label_encoders = {name: LabelEncoder() for name in ("Geography", "Gender")}
df["Geography"] = label_encoders["Geography"].fit_transform(df["Geography"])
df["Gender"] = label_encoders["Gender"].fit_transform(df["Gender"])

#### Train-validation split

In [24]:
# Target vector, then the feature matrix: everything except the target itself
# and the CustomerId / Surname columns.
y = df["Exited"]
X = df.drop(columns = ["Exited", "CustomerId", "Surname"])

# Hold-out split currently disabled:
#X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.2, random_state = 13, shuffle = True, stratify = y)

In [8]:
#print(f"Proportion of positive classes for training: {y_train.mean():.2}")
#print(f"Proportion of positive classes for validation: {y_val.mean():.2}")

#### Feature engineering

In [9]:
num_cols = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]
cat_cols = ["Geography", "Gender", 'HasCrCard', 'IsActiveMember']


def _make_scaled_preproc():
    """Build the one-hot -> power-transform -> standard-scale preprocessing.

    KNN and SVM use the same recipe, so it is built here once per caller
    instead of being copy-pasted.

    Returns:
        tuple: (step0, step1, pipeline) where step0 and step1 are the two
        ColumnTransformers and pipeline chains them as "step0" -> "step1".
    """
    step0 = ColumnTransformer(
        transformers = [
            ("dummies", OneHotEncoder(drop = None), cat_cols),
            ("transform", PowerTransformer(), num_cols)
        ], remainder = "passthrough"
    )
    # step0 outputs the one-hot columns first, then the six numeric columns.
    # Positions 9-14 assume 9 one-hot columns in front of them
    # (3 Geography + 2 Gender + 2 HasCrCard + presumably 2 IsActiveMember);
    # they must be revisited if the set of category levels changes.
    # NOTE(review): PowerTransformer standardises its output by default, so this
    # scaler may be redundant - confirm before removing.
    step1 = ColumnTransformer(
        transformers = [
            ("scale", StandardScaler(), [9,10,11,12,13,14])
        ], remainder = "passthrough"
    )
    pipeline = Pipeline(
        steps = [
            ("step0", step0),
            ("step1", step1)
        ])
    return step0, step1, pipeline


#---- Logistic regression ------
# Variant 0: one-hot encode the categoricals, leave the numeric columns untouched.
lr_preproc0 = ColumnTransformer(
    transformers = [
        ("dummies",  OneHotEncoder(), cat_cols)
    ], remainder = 'passthrough'
)

# Variant 1: as above, plus a power transform of the numeric columns.
lr_preproc1 = ColumnTransformer(
    transformers = [
        ("dummies",  OneHotEncoder(), cat_cols),
        ("transform", PowerTransformer(), num_cols)
    ], remainder = 'passthrough'
)

#---- K-Nearest Neighbour -------
knn_step0, knn_step1, knn_preproc = _make_scaled_preproc()

#---- Naive Bayes-----------------
# OrdinalEncoder is the feature-side encoder: LabelEncoder is meant for the
# target, accepts a single 1-D array only, and raises a TypeError when a
# ColumnTransformer calls its fit_transform with both X and y.
nb_preproc = ColumnTransformer(
    transformers = [
        ("encode", OrdinalEncoder(), ["Geography", "Gender"])
    ], remainder = "passthrough"
)

#----- SVM -----------------------
svm_step0, svm_step1, svm_preproc = _make_scaled_preproc()

#---- Random Forest and GBM -------------
# Same reasoning as for Naive Bayes: OrdinalEncoder instead of LabelEncoder.
tree_preproc = ColumnTransformer(
    transformers = [
        ("encode", OrdinalEncoder(), ["Geography", "Gender"])
    ], remainder = "passthrough"
)

### Grid search setup

#### Stratified K-fold Cross-validation

In [10]:
# Five shuffled, class-stratified folds; the fixed seed keeps fold membership repeatable.
kfold_cv = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state = 13,
)

#### Processing Pipeline

In [11]:
# Three-stage skeleton. Every stage starts as a "passthrough" placeholder so a
# parameter grid can substitute a concrete step under the same stage name.
pipe = Pipeline(steps = [
    (stage, "passthrough") for stage in ("features", "dim_redux", "clf")
])

#### Param Grid

In [21]:
# One grid per model family; keys address the pipeline stages "features",
# "dim_redux" and "clf". Stages a grid does not mention stay "passthrough".
# Stochastic estimators are seeded with the same value as the CV splitter (13)
# so that repeated searches give the same scores.
param_grids = [{
    # Logistic regression (saga is a stochastic solver, hence the seed).
    "features"   : [lr_preproc0, lr_preproc1],
    "dim_redux": [PCA(random_state = 13)],
    "dim_redux__n_components": [None,1,2,3,4,5],
    "clf": [LogisticRegression(solver = "saga", random_state = 13)],
    "clf__penalty": ["l1", "l2"],
    "clf__C": np.linspace(0.1,1,10)
}, {
    # K-nearest neighbours.
    "features" : [knn_preproc],
    "dim_redux": [PCA(random_state = 13)],
    "dim_redux__n_components": [None,1,2,3],
    "clf": [KNeighborsClassifier()],
    "clf__n_neighbors" : [5,7,10],
    "clf__weights" : ["uniform", "distance"],
    "clf__p": [1,2]
}, {
    # Gaussian naive Bayes (no tunable parameters searched).
    "features": [nb_preproc],
    "clf": [GaussianNB()]
},{
    # Support vector machine.
    "features": [svm_preproc],
    "dim_redux": [PCA(random_state = 13)],
    "dim_redux__n_components": [None,1,2,3,4,5],
    "clf": [SVC()],
    "clf__C": np.linspace(0.1,1,10),
    "clf__kernel": ["linear", "poly", "rbf"],
    "clf__tol" : [0.001, 0.0005, 0.0001]
},{
    # Random forest (bootstrap and feature sampling are random, hence the seed).
    "clf": [RandomForestClassifier(random_state = 13)],
    "clf__n_estimators": [100,200,300,400,500,600,700,800,900,1000],
    "clf__criterion": ["gini", "entropy"],
    "clf__max_depth": [1,2,4,8,16,32],
    "clf__max_features": ["sqrt", "log2"]
}, {
    # Gradient boosting: 2 * 3 * 3 * 5 * 3 = 270 candidates per CV fold.
    "clf": [GradientBoostingClassifier(random_state = 13)],
    "clf__loss": ["log_loss", "exponential"],
    "clf__learning_rate": [0.1, 0.01, 0.001],
    "clf__n_estimators" : [100,500,1000],
    "clf__max_depth": [1,2,4,8,16],
    "clf__min_impurity_decrease": [0, 0.01, 0.1]  
}]

In [22]:
# Search only the last grid in the list (the GradientBoostingClassifier one).
param_grid = param_grids[-1]

In [25]:
# Exhaustive search over the selected grid, scored by ROC AUC on the stratified
# folds, using 30 parallel jobs.
grid = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    scoring = "roc_auc",
    cv = kfold_cv,
    n_jobs = 30,
)
grid.fit(X, y)

# Best parameter combination and its mean cross-validated score.
display(grid.best_params_, grid.best_score_)



KeyboardInterrupt: 