In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import lightgbm as lgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

### Load the data

In [2]:
def load_data(dataset, data_dir='../data/playground_series_s4e1'):
    """Read one of the project's CSV files into a DataFrame.

    Parameters
    ----------
    dataset : str
        Which file to read: 'train' (train.csv), 'test' (test.csv) or
        'origin' (Churn_Modelling.csv).
    data_dir : str, optional
        Directory that contains the CSV files. The default is the
        relative path this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the selected CSV file.

    Raises
    ------
    ValueError
        If `dataset` is not one of the supported names.
    """
    filenames = {
        'train': 'train.csv',
        'test': 'test.csv',
        'origin': 'Churn_Modelling.csv',
    }
    # Validate the name before touching the filesystem so a typo fails
    # fast; non-string input is rejected the same way as an unknown name.
    filename = filenames.get(dataset) if isinstance(dataset, str) else None
    if filename is None:
        raise ValueError(f'{dataset} is not a supported dataset')

    return pd.read_csv(os.path.join(data_dir, filename))

### Target Imputation (mean target encoding)

In [3]:
def impute_target(df, col, target='exited'):
    """Replace `col` with the mean of `target` observed for each of its values.

    A new column named ``col + '_target'`` holds, for every row, the mean
    of `target` over all rows of `df` that share the row's value of `col`.
    The original `col` is dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both `col` and `target`.
    col : str
        Name of the column to encode.
    target : str, optional
        Name of the column whose mean is computed. Defaults to 'exited'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without `col` and with
        the encoded column appended last. The result comes out of a
        merge, so it does not keep the index of `df`.

    Raises
    ------
    KeyError
        If `col` or `target` is not a column of `df`.

    Notes
    -----
    The means are computed from every row of `df`, including the row
    being encoded. If `df` is cross-validated afterwards, the encoded
    feature has already seen the held-out targets.
    """
    missing = [name for name in (col, target) if name not in df.columns]
    if missing:
        raise KeyError(f'{missing} not found in the DataFrame columns')

    encoded_name = col + '_target'

    # Calculate the mean target rate for each distinct value of `col`.
    rates = df.groupby(col)[target].mean().reset_index(name=encoded_name)

    # m:1 validation asserts the lookup table has one row per key.
    encoded = pd.merge(df, rates, on=col, how='left', validate='m:1')

    return encoded.drop(columns=[col])

### Baseline Models
Evaluation metric: ROC AUC (`roc_auc`), averaged over 5 stratified folds.
1. logistic regression
2. catboost classifier
3. xgboost classifier
4. lightgbm classifier

In [4]:
def calculate_score(df, model='lr', scaler='minmax'):
    """Cross-validate a classifier on `df` and return the per-fold ROC AUC.

    The scaler and the classifier are chained in a pipeline, so the
    scaler is fitted on the training part of each fold only. Fitting it
    once on all rows before cross-validation would let statistics of the
    held-out rows leak into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column 'exited'.
    model : str or estimator, optional
        'lr' (LogisticRegression), 'cat' (CatBoostClassifier),
        'xgb' (XGBClassifier) or 'lgb' (LGBMClassifier). Any non-string
        value is used as the estimator unchanged.
    scaler : str, optional
        'minmax' (MinMaxScaler) or 'standard' (StandardScaler).

    Returns
    -------
    numpy.ndarray
        One ROC AUC score per fold of a shuffled 5-fold stratified split.

    Raises
    ------
    ValueError
        If `scaler` or a string `model` is not one of the supported names.
    """
    # Validate both selectors up front so a typo fails immediately with a
    # clear message instead of deep inside cross_val_score.
    if scaler not in ('minmax', 'standard'):
        raise ValueError(f'{scaler} is not supported')
    if isinstance(model, str) and model not in ('lr', 'cat', 'xgb', 'lgb'):
        raise ValueError(f'{model} is not a supported model')

    features = df.drop(columns=['exited'])
    target = df['exited']

    # standardization step
    if scaler == 'minmax':
        scaling_step = MinMaxScaler()
    else:
        scaling_step = StandardScaler()

    # classifier step
    if not isinstance(model, str):
        estimator = model  # caller supplied a ready-made estimator
    elif model == 'lr':
        estimator = LogisticRegression()
    elif model == 'cat':
        estimator = CatBoostClassifier(verbose=0)
    elif model == 'xgb':
        estimator = XGBClassifier(verbosity=0)
    else:
        estimator = LGBMClassifier(verbose=-1)

    # cross validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    pipeline = make_pipeline(scaling_step, estimator)
    scores = cross_val_score(pipeline, features, target, cv=skf, scoring='roc_auc')

    return scores

In [5]:
%%time
# load the data
train_df = load_data('train')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.drop(columns=['id', 'customerid'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)
    
# calculate the roc scores
for model in ['lr', 'cat', 'xgb', 'lgb']:
    for scaler in ['minmax', 'standard']: 
        scores = calculate_score(train_df, model=model, scaler=scaler)
        print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8353.
The average score of lr with standard is  0.8353.
The average score of cat with minmax is  0.8996.
The average score of cat with standard is  0.8996.
The average score of xgb with minmax is  0.8981.
The average score of xgb with standard is  0.8982.
The average score of lgb with minmax is  0.9000.
The average score of lgb with standard is  0.8998.
CPU times: user 8min 37s, sys: 3min 3s, total: 11min 40s
Wall time: 1min 54s


### Hyperparameter Tuning

In [6]:
import optuna
from sklearn.metrics import roc_auc_score
from optuna.integration import LightGBMPruningCallback

In [11]:
def objective(trial, X, y):
    """Optuna objective: mean 5-fold ROC AUC of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X : array-like or pandas.DataFrame
        Feature matrix; rows are selected by position.
    y : array-like or pandas.Series
        Binary target aligned with `X`; rows are selected by position.

    Returns
    -------
    float
        Mean ROC AUC over the validation folds.

    Notes
    -----
    Fixed settings (n_estimators, subsample_freq, random_state, n_jobs)
    are sampled from single-element categorical lists so that they show
    up in ``study.best_params`` together with the tuned values.

    Early stopping watches the same fold that is then scored, so the
    returned value is an optimistic estimate.
    """
    params = {

        # tree structure
        'max_depth': trial.suggest_int('max_depth', 3, 12, step=1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 3000, step=20),

        # better accuracy
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.9, step=0.01),
        'n_estimators': trial.suggest_categorical('n_estimators', [10000]),

        # combat overfitting
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.99, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 0.99, log=True),
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L1 regularization
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L2 regularization

        'random_state': trial.suggest_categorical('random_state', [42]),
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = []
    for train_idx, valid_idx in cv.split(X, y):
        X_fit, X_valid = _take_rows(X, train_idx), _take_rows(X, valid_idx)
        y_fit, y_valid = _take_rows(y, train_idx), _take_rows(y, valid_idx)

        model = LGBMClassifier(objective='binary', **params)
        model.fit(
            X_fit,
            y_fit,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            # The `early_stopping_rounds` / `verbose` keywords of fit()
            # are deprecated in LightGBM 3.3 and removed in 4.0; the
            # callback is the supported way to stop early and stay quiet.
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
        )
        valid_proba = model.predict_proba(X_valid)[:, 1]
        fold_scores.append(roc_auc_score(y_valid, valid_proba))

    return np.mean(fold_scores)


def _take_rows(data, rows):
    """Select `rows` by position from a pandas object or an array-like."""
    # Plain `series[int_array]` is label-based and only matches positions
    # when the index happens to be 0..n-1; `.iloc` is always positional.
    return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]

In [12]:
%%time
X_train = train_df.drop(columns=['exited'])
y_train = train_df['exited']

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

study = optuna.create_study(direction='maximize', study_name='LGBM Classifier')
func = lambda trial: objective(trial, X_train_scaled, y_train)
study.optimize(func, n_trials=200)

[I 2024-01-03 22:10:53,982] A new study created in memory with name: LGBM Classifier
[I 2024-01-03 22:11:13,938] Trial 0 finished with value: 0.8948741104894102 and parameters: {'max_depth': 4, 'num_leaves': 320, 'learning_rate': 0.36000000000000004, 'n_estimators': 10000, 'colsample_bytree': 0.2736428411314035, 'subsample': 0.339670720846338, 'subsample_freq': 1, 'reg_alpha': 100.0, 'reg_lambda': 5.0, 'random_state': 42, 'n_jobs': -1}. Best is trial 0 with value: 0.8948741104894102.
[I 2024-01-03 22:11:19,096] Trial 1 finished with value: 0.8954721492716899 and parameters: {'max_depth': 9, 'num_leaves': 1000, 'learning_rate': 0.78, 'n_estimators': 10000, 'colsample_bytree': 0.39660619746192743, 'subsample': 0.25039092405773317, 'subsample_freq': 1, 'reg_alpha': 1.0, 'reg_lambda': 100.0, 'random_state': 42, 'n_jobs': -1}. Best is trial 1 with value: 0.8954721492716899.
[I 2024-01-03 22:11:49,994] Trial 2 finished with value: 0.8977494533456637 and parameters: {'max_depth': 11, 'num_lea

CPU times: user 13h 53min 48s, sys: 4h 18min 19s, total: 18h 12min 7s
Wall time: 4h 19min 25s


In [None]:
# Report the best objective value recorded by the study, i.e. the highest
# mean fold ROC AUC returned by `objective` (the study maximizes it).
# NOTE(review): the separator after "Best value" is a full-width colon
# (U+FF1A), not an ASCII ':' -- confirm this is intended.
print(f'\tBest value： {study.best_value:.5f}')

	Best value： 0.90095


In [None]:
# Rebuild the classifier from the best trial's parameters and score it
# with the same cross-validation helper used for the baselines.
# NOTE(review): best_params carries n_estimators=10000 and this refit has
# no early stopping, unlike the tuning objective -- the two scores are
# therefore not directly comparable.
best_params = study.best_params
model = LGBMClassifier(objective='binary', **best_params)
scores = calculate_score(train_df, model=model)
scores.mean()

0.898161330457971