# Binary Classification with Bank Churn

## Setup

### Imports

In [1]:
# Data Wrangling
import numpy as np
import pandas as pd

# Machine Learning
## Pre-processing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# PyTorch
## Torch
import torch
import torch.nn as nn
import torch.optim as optim
## Utils
from torch.utils.data import TensorDataset, DataLoader

# Utils
## File paths
from pyhere import here
## Date and times
import datetime
## Progress bar
from tqdm import tqdm

### Functions

In [2]:
def make_train_step_fn(model, loss_fn, optimiser):
    """Build a closure that performs one optimisation step on a mini-batch.

    Parameters
    ----------
    model : the network to train.
    loss_fn : callable taking (predictions, targets) and returning a scalar tensor.
    optimiser : optimiser already bound to the parameters of ``model``.

    Returns
    -------
    A function ``train_step(X, y)`` that returns the batch loss as a Python float.
    """
    def train_step(X,y):
        """Forward pass, backward pass and parameter update for a single batch."""
        # Training mode so that layers such as dropout are active
        model.train()
        batch_loss = loss_fn(model(X), y)
        batch_loss.backward()
        optimiser.step()
        # Clear the gradients so they do not accumulate into the next batch
        optimiser.zero_grad()
        return batch_loss.item()

    return train_step

def make_val_step_fn(model, loss_fn):
    """Build a closure that evaluates the loss on one batch without training.

    Parameters
    ----------
    model : the network to evaluate.
    loss_fn : callable taking (predictions, targets) and returning a scalar tensor.

    Returns
    -------
    A function ``val_step(X, y)`` that returns the batch loss as a Python float.
    """
    def val_step(X,y):
        """Compute the loss for a single batch in evaluation mode."""
        # Evaluation mode so that layers such as dropout are disabled
        model.eval()
        # No parameters are updated here, so skip building the autograd graph
        with torch.no_grad():
            yhat = model(X)
            loss = loss_fn(yhat, y)
        return loss.item()
    return val_step

In [3]:
def do_epoch(data_loader, step_fn = None, target_device = None):
    """Run one pass over ``data_loader`` and return the mean batch loss.

    Parameters
    ----------
    data_loader : iterable yielding ``(X_batch, y_batch)`` tensor pairs.
    step_fn : callable ``(X, y) -> float`` applied to every batch. Defaults to
        the notebook-level ``train_step``; pass a validation step to score a
        loader without training.
    target_device : device each batch is moved to before ``step_fn`` is called.
        Defaults to the notebook-level ``device``.

    Returns
    -------
    The result of ``np.mean`` over the per-batch losses (``nan`` if the loader
    yields no batches).
    """
    # The globals are looked up only when no override is supplied, so
    # calling do_epoch(data_loader) behaves as before.
    step_fn = train_step if step_fn is None else step_fn
    target_device = device if target_device is None else target_device

    epoch_loss = []

    for X_batch, y_batch in data_loader:
        X_batch, y_batch = X_batch.to(target_device), y_batch.to(target_device)
        epoch_loss.append(step_fn(X_batch, y_batch))

    return np.mean(epoch_loss)

### Device

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### Paths

In [5]:
# Locations of the training CSV (read into `df`) and the test CSV (read into
# `test_df`), built with pyhere's `here`.
# NOTE(review): presumably resolved relative to the project root - confirm.
raw_data = here("Data", "Raw", "binary_classification_with_bank_churn.csv")
test_data = here("Data", "Test", "binary_classification_with_bank_churn.csv")

## Data

### Data Intake

In [6]:
# Load the training data
df = pd.read_csv(raw_data)
# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


None

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


### Data Preparation

In [7]:
# Pre-processing pipeline
# Columns are selected by name throughout, so the layout does not depend on
# how many dummy columns the one-hot encoder happens to produce.
numeric_cols = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary"]
categorical_cols = ["Geography", "Gender"]

# Power-transform, then standardise, the numeric columns.
scaler = StandardScaler()
numeric_pipe = Pipeline(
    steps = [
        ("transform", PowerTransformer()),
        ("scale", scaler)
    ])

# Output column order: numeric block, then the dummies, then the remaining
# columns passed through untouched. This is the same order the earlier
# two-stage (positional) layout produced.
feat_eng = ColumnTransformer(
    transformers =
    [
        ("transform", numeric_pipe, numeric_cols),
        ("dummy", OneHotEncoder(), categorical_cols)
    ],
    remainder = "passthrough",
    # Always return a dense array; the result is handed to torch.as_tensor
    sparse_threshold = 0
)

pipe = Pipeline(
    steps = [
        ("feats", feat_eng)
    ])

def prepare_data(data_frame):
    """Split a churn data frame into model features and target.

    The identifier columns ``id``, ``CustomerId`` and ``Surname`` are dropped
    from the features. The ``Exited`` column is treated as the target when it
    is present; frames without it (e.g. unlabelled data) are accepted too.

    Parameters
    ----------
    data_frame : pandas.DataFrame containing the identifier columns and,
        optionally, ``Exited``.

    Returns
    -------
    X : pandas.DataFrame of the remaining feature columns.
    y : array of shape (n_rows, 1) holding ``Exited``, or ``None`` when the
        frame has no ``Exited`` column.

    Raises
    ------
    KeyError : if any of the identifier columns is missing.
    """
    has_target = "Exited" in data_frame.columns

    to_drop = ["id", "CustomerId", "Surname"]
    if has_target:
        to_drop.append("Exited")
    X = data_frame.drop(columns = to_drop)

    if not has_target:
        return X, None

    y = data_frame.loc[:, "Exited"]
    # Column vector, matching the (n, 1) output of the network
    return X, y.values.reshape(-1,1)

In [8]:
# Split the training frame into features and target, then fit the
# pre-processing pipeline on the features and transform them in one call.
features, y = prepare_data(df)
X = pipe.fit_transform(features)

### Torch TensorDataset

In [9]:
# Wrap the processed features and the targets as float32 tensors
X_tensor = torch.as_tensor(X, dtype = torch.float32)
y_tensor = torch.as_tensor(y, dtype = torch.float32)
train_dataset = TensorDataset(X_tensor, y_tensor)

## Model

### Hyperparameters

In [10]:
# Seed torch's global RNG so that weight initialisation, dropout masks and
# DataLoader shuffling are repeatable on Restart & Run All.
seed = 42
torch.manual_seed(seed)

lr = 0.001           # learning rate passed to the optimiser
batch_size = 32      # rows per mini-batch
n_epoch = 500        # full passes over the training data
optim_fn = optim.SGD # optimiser class, instantiated with the model parameters

### Model Configuration

In [11]:
def _build_mlp(n_inputs, hidden_widths = (610, 305, 150, 75, 25, 7), p_drop = 0.25):
    """Stack Linear -> PReLU -> Dropout blocks followed by a one-unit output layer.

    Layers are created in order from input to output, one block per entry in
    ``hidden_widths``; the final Linear layer emits a single raw logit.
    """
    layers = []
    fan_in = n_inputs
    for width in hidden_widths:
        layers.append(nn.Linear(in_features = fan_in, out_features = width, bias = True))
        layers.append(nn.PReLU())
        layers.append(nn.Dropout(p_drop))
        fan_in = width
    # Output layer: no activation, the loss below works on logits
    layers.append(nn.Linear(in_features = fan_in, out_features = 1, bias = True))
    return nn.Sequential(*layers)

# Input width follows the number of columns in the processed feature matrix
model = _build_mlp(X.shape[1])

# Binary cross-entropy applied to raw logits
loss_fn = nn.BCEWithLogitsLoss()
optimiser = optim_fn(model.parameters(), lr = lr)

## Model Training

### Data Loaders

In [12]:
# Serve the training set in mini-batches of `batch_size` rows, with shuffling enabled.
data_loader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)

In [13]:
# Move the network onto the selected device before training starts
model.to(device)

# Bind model, loss and optimiser into step closures; only train_step is
# exercised by the loop below (do_epoch looks it up by name).
train_step = make_train_step_fn(model, loss_fn, optimiser)
val_step = make_val_step_fn(model, loss_fn)

# Mean training loss of every epoch, in order
train_losses = []

for _ in tqdm(range(n_epoch)):
    train_losses.append(do_epoch(data_loader))

100%|█████████████████████████████████████████| 500/500 [27:27<00:00,  3.30s/it]


## Predictions

In [14]:
# Load the test data and preview the first rows
test_df = pd.read_csv(test_data)
display(test_df.head())

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [15]:
# Keep the row identifiers; they become the "id" column of the submission.
ids = test_df["id"]

In [16]:
# Apply the already-fitted pre-processing to the test features
X_test = test_df.drop(columns = ["id", "CustomerId", "Surname"])
X_test = pipe.transform(X_test)

# Evaluation mode disables dropout; no_grad avoids building an autograd graph
# for a forward pass that is never back-propagated.
model.eval()
with torch.no_grad():
    yhat = model(torch.as_tensor(X_test).float().to(device)).cpu()
# The network outputs logits; the sigmoid maps them to probabilities
yprops = torch.sigmoid(yhat).numpy()

In [17]:
# Sanity check: one predicted probability per test row, as a single column.
yprops.shape

(110023, 1)

In [18]:
# Pair every test id with its predicted probability, flattened to one dimension
submission = pd.DataFrame(
    {
        "id": ids.values,
        "Exited": yprops.ravel()
    }
)

submission.head()

Unnamed: 0,id,Exited
0,165034,0.020087
1,165035,0.89536
2,165036,0.021614
3,165037,0.203092
4,165038,0.332537


In [19]:
# Stamp the file name with the current time (to the minute)
timestamp = datetime.datetime.now()
out_suffix = timestamp.strftime("%Y%m%d%H%M")
out_file = here("Submissions", f"binary_classification_with_bank_churn_dl_{out_suffix}.csv")

# Write without the index column
submission.to_csv(out_file, index = False)