In [10]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import optuna
from sklearn.metrics import roc_auc_score

### Load the data

In [11]:
def load_data(dataset):
    train_url = os.path.join('../data/playground_series_s4e1', 
                         'train.csv')
    test_url = os.path.join('../data/playground_series_s4e1', 
                            'test.csv')
    origin_url = os.path.join('../data/playground_series_s4e1', 
                              'Churn_Modelling.csv')
    if dataset == 'train':
        df = pd.read_csv(train_url)
    elif dataset == 'test':
        df = pd.read_csv(test_url)
    elif dataset == 'origin':
        df = pd.read_csv(origin_url)
    else:
        raise ValueError(f'{dataset} is not a supported dataset')
    
    return df

### Target Imputation

In [12]:
def impute_target(df, col):
    # calcualted the mean exited rate by specified columns
    df_target = df.groupby(col).agg({'exited': 'mean'})
    df_target = df_target.reset_index()
    df_target = df_target.rename(columns={'exited': col+'_target'})
    
    df = pd.merge(df, df_target, on=col, how='left', 
                  validate='m:1')
    df = df.drop(columns=[col])
    
    return df

### Baseline Models
Accuracy: roc_auc
1. logistic regression
2. catboost classifier
3. xgboost classifier
4. lightgbm classifier

In [13]:
def calculate_score(df, model='lr', scaler='minmax'):
    
    X_train = df.drop(columns=['exited'])
    y_train = df['exited']
    
    # stadardization
    if scaler == 'minmax':
        scaler = MinMaxScaler()
    elif scaler == 'standard':
        scaler = StandardScaler()
    else:
        raise ValueError(f'{scaler} is not supported')
    X_train_scaled = scaler.fit_transform(X_train)
    
    # cross validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    if model == 'lr':
        model = LogisticRegression()
    elif model == 'cat':
        model = CatBoostClassifier(verbose=0)
    elif model == 'xgb':
        model = XGBClassifier(verbosity=0)
    elif model == 'lgb': 
        model = LGBMClassifier(verbose=-1)
    else:
        model = model
    scores = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='roc_auc')
    
    return scores

### Generated Dataset
The highest roc auc score of **0.9000** is achieved by lightgbm classifier with minmax scaler.

In [15]:
%%time
# load the data
train_df = load_data('train')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.drop(columns=['id', 'customerid'])
#train_df = train_df.drop(columns=['id'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
#cols = ['customerid', 'surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)
    
# calculate the roc scores
for model in ['lr', 'cat', 'xgb', 'lgb']:
    for scaler in ['minmax', 'standard']: 
        scores = calculate_score(train_df, model=model, scaler=scaler)
        print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8353.
The average score of lr with standard is  0.8353.
The average score of cat with minmax is  0.8996.
The average score of cat with standard is  0.8995.
The average score of xgb with minmax is  0.8981.
The average score of xgb with standard is  0.8982.
The average score of lgb with minmax is  0.9000.
The average score of lgb with standard is  0.8998.
CPU times: total: 30min 50s
Wall time: 2min 26s


### Original Dataset
The highest auc score of **0.9341** is achieved by catboost classifier with minmax/standard scalers.

In [16]:
%%time
# load the data
train_df = load_data('origin')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.rename(columns={'rownumber': 'id'})

# drop the rows with missing values and useless columns
train_df = train_df.dropna(how='any', axis=0)
train_df = train_df.drop(columns=['id', 'customerid'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
#cols = ['customerid', 'surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)
    
# calculate the roc scores
for model in ['lr', 'cat', 'xgb', 'lgb']:
    for scaler in ['minmax', 'standard']: 
        scores = calculate_score(train_df, model=model, scaler=scaler)
        print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8841.
The average score of lr with standard is  0.8843.
The average score of cat with minmax is  0.9341.
The average score of cat with standard is  0.9341.
The average score of xgb with minmax is  0.9244.
The average score of xgb with standard is  0.9244.
The average score of lgb with minmax is  0.9310.
The average score of lgb with standard is  0.9316.
CPU times: total: 5min 6s
Wall time: 28.7 s


### Original + Generated Dataset
The highest auc score of **0.9024** is achieved by lightgbm classifier with minmax scaler.

In [28]:
%%time
# load the generated dataset
train_df = load_data('train')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.drop(columns=['id', 'customerid'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)

# add the data type feature
train_df['type'] = 'generated'

# load the original dataset
origin_df = load_data('origin')
origin_df.columns = origin_df.columns.str.lower()
origin_df = origin_df.rename(columns={'rownumber': 'id'})
origin_df = origin_df.dropna(how='any', axis=0)
origin_df = origin_df.drop(columns=['id', 'customerid'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    origin_df = impute_target(origin_df, col)

# add the data type feature
origin_df['type'] = 'origin'

# concat the data
train_df = pd.concat([train_df, origin_df], ignore_index=True)

# impute the dataset
cols = ['type']
for col in cols:
    train_df = impute_target(train_df, col)
    
# calculate the roc scores
for model in ['lr', 'cat', 'xgb', 'lgb']:
    for scaler in ['minmax', 'standard']: 
        scores = calculate_score(train_df, model=model, scaler=scaler)
        print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8371.
The average score of lr with standard is  0.8371.
The average score of cat with minmax is  0.9022.
The average score of cat with standard is  0.9021.
The average score of xgb with minmax is  0.9007.
The average score of xgb with standard is  0.9007.
The average score of lgb with minmax is  0.9024.
The average score of lgb with standard is  0.9022.
CPU times: total: 33min 24s
Wall time: 2min 39s


### Hyperparameter Tuning (How much can we push the limit?)
The highest auc score of **0.9035** is achieved with 20 trials.

In [41]:
def objective(trial, X, y):
    
    param_grid = {
        
        # tree structure
        'max_depth': trial.suggest_int('max_depth', 3, 12, step=1),
        'num_leaves': trial.suggest_int('max_leaves', 20, 3000, step=20), 

        # better accuracy
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.9, step=0.01),
        'n_estimators': trial.suggest_categorical('n_estimators', [10000]),

        # combat overfitting
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.99, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 0.99, log=True),
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1]), 
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L1 regularization
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L2 regularization

        'random_state': trial.suggest_categorical('random_state', [42]), 
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]), 
        #'early_stopping_rounds': trial.suggest_categorical('early_stopping_rounds', [100]), 
        'metric': trial.suggest_categorical('metric', ['auc']), 
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model = LGBMClassifier(objective='binary', **param_grid)
        model.fit(
            X_train, 
            y_train, 
            eval_set=[(X_test, y_test)], 
            early_stopping_rounds=100, 
            verbose=0, 
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[idx] = roc_auc_score(y_test, y_preds)
    
    return np.mean(cv_scores)

In [43]:
%%time
X_train = train_df.drop(columns=['exited'])
y_train = train_df['exited']

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

study = optuna.create_study(direction='maximize', study_name='XGB Classifier')
func = lambda trial: objective(trial, X_train_scaled, y_train)
study.optimize(func, n_trials=200)

[32m[I 2024-01-04 16:08:42,028][0m A new study created in memory with name: XGB Classifier[0m
[32m[I 2024-01-04 16:08:48,560][0m Trial 0 finished with value: 0.9024773273096953 and parameters: {'max_depth': 6, 'max_leaves': 2540, 'learning_rate': 0.06999999999999999, 'n_estimators': 10000, 'colsample_bytree': 0.6437130071370235, 'subsample': 0.25528170320581667, 'subsample_freq': 1, 'reg_alpha': 5.0, 'reg_lambda': 0.1, 'random_state': 42, 'n_jobs': -1, 'metric': 'auc'}. Best is trial 0 with value: 0.9024773273096953.[0m
[32m[I 2024-01-04 16:08:52,660][0m Trial 1 finished with value: 0.9015269490537049 and parameters: {'max_depth': 3, 'max_leaves': 1140, 'learning_rate': 0.7000000000000001, 'n_estimators': 10000, 'colsample_bytree': 0.2281238278461871, 'subsample': 0.592886197706159, 'subsample_freq': 1, 'reg_alpha': 5.0, 'reg_lambda': 50.0, 'random_state': 42, 'n_jobs': -1, 'metric': 'auc'}. Best is trial 0 with value: 0.9024773273096953.[0m
[32m[I 2024-01-04 16:08:55,360][0

CPU times: total: 1d 18min 48s
Wall time: 1h 36min 43s


In [65]:
model = LGBMClassifier(objective='binary', **study.best_params)
scores = calculate_score(train_df, model=model)
print(f'The average score of the optimized model is {scores.mean(): .4f}.')

ValueError: could not convert string to float: 'Okwudilichukwu'

### Prediction

In [44]:
def impute_test(test_df, train_df, col):
    # calcualted the mean exited rate by specified columns
    df_target = train_df.groupby(col).agg({'exited': 'mean'})
    df_target = df_target.reset_index()
    df_target = df_target.rename(columns={'exited': col+'_target'})
    
    df = pd.merge(test_df, df_target, on=col, how='left', 
                  validate='m:1')
    df = df.drop(columns=[col])
    
    return df

In [55]:
train_df = load_data('train')
test_df = load_data('test')
train_df.columns = train_df.columns.str.lower()
test_df.columns = test_df.columns.str.lower()

test_df.index = test_df.id
test_df = test_df.drop(columns=['id', 'customerid'])
test_df.head()

Unnamed: 0_level_0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
165034,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
165035,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
165036,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
165037,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
165038,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [56]:
# impute the test dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    test_df = impute_test(test_df, train_df, col)
    
test_df.head()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,estimatedsalary,surname_target,geography_target,gender_target,hascrcard_target,isactivemember_target
0,586,23.0,2,0.0,2,160976.75,0.188082,0.165282,0.279687,0.227429,0.125345
1,683,46.0,2,0.0,1,72549.27,0.266667,0.165282,0.279687,0.206433,0.297086
2,656,34.0,7,0.0,2,138882.09,0.175633,0.165282,0.279687,0.206433,0.297086
3,681,36.0,8,0.0,1,113931.57,0.187215,0.165282,0.159055,0.206433,0.297086
4,752,38.0,10,121263.62,1,139431.0,0.272727,0.378952,0.159055,0.206433,0.297086


In [58]:
test_df.isna().sum()

creditscore                0
age                        0
tenure                     0
balance                    0
numofproducts              0
estimatedsalary            0
surname_target           171
geography_target           0
gender_target              0
hascrcard_target           0
isactivemember_target      0
dtype: int64

In [60]:
surname_mean = test_df.surname_target.mean()
test_df = test_df.fillna(surname_mean)
test_df.head()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,estimatedsalary,surname_target,geography_target,gender_target,hascrcard_target,isactivemember_target
0,586,23.0,2,0.0,2,160976.75,0.188082,0.165282,0.279687,0.227429,0.125345
1,683,46.0,2,0.0,1,72549.27,0.266667,0.165282,0.279687,0.206433,0.297086
2,656,34.0,7,0.0,2,138882.09,0.175633,0.165282,0.279687,0.206433,0.297086
3,681,36.0,8,0.0,1,113931.57,0.187215,0.165282,0.159055,0.206433,0.297086
4,752,38.0,10,121263.62,1,139431.0,0.272727,0.378952,0.159055,0.206433,0.297086


In [63]:
model = LGBMClassifier(objective='binary', **study.best_params)
model.fit(
test_proba = model.predict_proba(test_df)

NotFittedError: Estimator not fitted, call `fit` before exploiting the model.