In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Load the data

In [2]:
# List the files in the competition data directory so the available CSVs are visible.
os.listdir('../data/playground_series_s4e1')

['.DS_Store',
 'test.csv',
 'Churn_Modelling.csv',
 'train.csv',
 'sample_submission.csv']

In [3]:
# Build the three CSV paths from one shared directory prefix.
test_url, train_url, origin_url = (
    os.path.join('../data/playground_series_s4e1', file_name)
    for file_name in ('test.csv', 'train.csv', 'Churn_Modelling.csv')
)

In [4]:
# Read each CSV into its own DataFrame, in the order train, test, origin.
train_df, test_df, origin_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_url, test_url, origin_url)
)

In [5]:
# Display the (rows, columns) shape of each loaded frame as a quick sanity check.
train_df.shape, test_df.shape, origin_df.shape

((165034, 14), (110023, 13), (10002, 14))

# Preprocessing

In [6]:
# Remove the row identifier and normalise the column names to lower case.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# 'id' column is already gone and a plain drop would raise KeyError.
train_df = train_df.drop(columns='id', errors='ignore')
train_df.columns = train_df.columns.str.lower()

# Last expression of the cell: show the resulting column dtypes.
train_df.dtypes

customerid           int64
surname             object
creditscore          int64
geography           object
gender              object
age                float64
tenure               int64
balance            float64
numofproducts        int64
hascrcard          float64
isactivemember     float64
estimatedsalary    float64
exited               int64
dtype: object

In [7]:
def impute_target(df, col, target='exited'):
    """Replace a column with the mean of the target for each of its values.

    Despite the name, this is mean target encoding rather than imputation:
    every distinct value of ``col`` is mapped to the average of ``target``
    over the rows of ``df`` that share that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``col`` and ``target``. It is not modified.
    col : str
        Name of the column to encode. It is removed from the result.
    target : str, default 'exited'
        Name of the column whose per-group mean becomes the encoding.

    Returns
    -------
    pandas.DataFrame
        New frame without ``col`` and with a trailing column named
        ``col + '_target'`` holding the per-group mean of ``target``.

    Notes
    -----
    The mean for each row includes that row's own target value, so the
    encoding leaks the label when it is computed on the same rows that are
    later used for validation.
    """
    encoded_name = col + '_target'

    # One row per distinct value of `col`, carrying the mean of the target.
    group_means = (
        df.groupby(col)[target]
        .mean()
        .rename(encoded_name)
        .reset_index()
    )

    # validate='m:1' asserts each value of `col` is unique on the right side.
    encoded = pd.merge(df, group_means,
                       on=col, how='left',
                       validate='m:1')

    return encoded.drop(columns=col)

In [8]:
# Target-encode every object-typed column plus the integer 'customerid' key.
# NOTE(review): the encodings are averaged over all of train_df, including
# rows that are later held out by the cross-validation folds, so the CV
# scores further down may be optimistic — consider fold-wise encoding.
cols = [*train_df.select_dtypes('object').columns, 'customerid']
for col in cols:
    train_df = impute_target(train_df, col)

# Peek at the first row to confirm the new *_target columns.
train_df.head(1)

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,surname_target,geography_target,gender_target,customerid_target
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,0.318008,0.165282,0.159055,0.30303


# Baseline Models

In [9]:
# Separate the feature columns from the 'exited' label.
X_train, y_train = train_df.drop(columns='exited'), train_df['exited']

In [10]:
# Fit a min-max scaler on the training features, then apply it to them.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [16]:
# Baseline: logistic regression scored by ROC AUC on 5 shuffled, stratified folds.
model = LogisticRegression()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X_train_scaled,
    y=y_train,
    scoring='roc_auc',
    cv=skf,
)

# Report each fold, then the mean across folds.
for i, score in enumerate(scores):
    print(f'The score of fold {i+1} is: {score}.')

print(f'The average scores is {scores.mean()}.')

The score of fold 1 is: 0.873010912366314.
The score of fold 2 is: 0.8745956799428332.
The score of fold 3 is: 0.8771877620022361.
The score of fold 4 is: 0.8791906600477422.
The score of fold 5 is: 0.8786230027583497.
The average scores is 0.8765216034234949.


In [17]:
%%time
# Baseline: CatBoost with library defaults (training log silenced), scored by
# ROC AUC on 5 shuffled, stratified folds evaluated in parallel.
model = CatBoostClassifier(verbose=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X_train_scaled,
    y=y_train,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
)

# Report each fold, then the mean across folds.
for i, score in enumerate(scores):
    print(f'The score of fold {i+1} is: {score}.')

print(f'The average scores is {scores.mean()}.')

The score of fold 1 is: 0.9259894151921912.
The score of fold 2 is: 0.9269409013411741.
The score of fold 3 is: 0.92804601788734.
The score of fold 4 is: 0.9297142492776472.
The score of fold 5 is: 0.9262258252621384.
The average scores is 0.9273832817920982.
CPU times: user 33 ms, sys: 11 ms, total: 44 ms
Wall time: 19.2 s


In [18]:
%%time
# Baseline: XGBoost with library defaults, scored by ROC AUC on 5 shuffled,
# stratified folds evaluated in parallel.
skf = StratifiedKFold(n_splits=5,
                      shuffle=True,
                      random_state=42)

# XGBClassifier's logging switch is `verbosity` (0 = silent). The previous
# `verbose=0` is not a constructor argument of the scikit-learn wrapper, so it
# was passed through as an unrecognised booster parameter instead of
# controlling the log level.
model = XGBClassifier(verbosity=0)
scores = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='roc_auc',
                        n_jobs=-1)

# Report each fold, then the mean across folds.
for i, score in enumerate(scores):
    print(f'The score of fold {i+1} is: {score}.')

print(f'The average scores is {scores.mean()}.')

The score of fold 1 is: 0.9250976749618663.
The score of fold 2 is: 0.9261685181436335.
The score of fold 3 is: 0.9265160497285003.
The score of fold 4 is: 0.9277679967619491.
The score of fold 5 is: 0.9243027344559891.
The average scores is 0.9259705948103877.
CPU times: user 26.4 ms, sys: 8.79 ms, total: 35.2 ms
Wall time: 7.92 s


In [19]:
%%time
# Baseline: LightGBM with library defaults, scored by ROC AUC on 5 shuffled,
# stratified folds evaluated in parallel.
model = LGBMClassifier(verbose=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=model,
    X=X_train_scaled,
    y=y_train,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
)

# Report each fold, then the mean across folds.
for i, score in enumerate(scores):
    print(f'The score of fold {i+1} is: {score}.')

print(f'The average scores is {scores.mean()}.')

The score of fold 1 is: 0.9261224947760769.
The score of fold 2 is: 0.9271216687159156.
The score of fold 3 is: 0.9280227242144901.
The score of fold 4 is: 0.9292433328398353.
The score of fold 5 is: 0.9268270683243353.
The average scores is 0.9274674577741306.
CPU times: user 28.5 ms, sys: 7.79 ms, total: 36.3 ms
Wall time: 1.25 s


# LGBM Hyperparameter Tuning