In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from xgboost import XGBClassifier

### Load the data

In [2]:
def load_data(dataset):
    """Read one of the playground-series-s4e1 CSV files into a DataFrame.

    Parameters
    ----------
    dataset : str
        Which file to read: 'train' (train.csv), 'test' (test.csv) or
        'origin' (Churn_Modelling.csv).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the selected file.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the three supported names.
    """
    data_dir = '../data/playground-series-s4e1'
    known_files = (
        ('train', 'train.csv'),
        ('test', 'test.csv'),
        ('origin', 'Churn_Modelling.csv'),
    )

    # Return as soon as the requested name matches one of the known files.
    for name, filename in known_files:
        if dataset == name:
            return pd.read_csv(os.path.join(data_dir, filename))

    raise ValueError(f'{dataset} is not a supported dataset')

### Target Encoding (`impute_target`)

In [10]:
def impute_target(df, col, target='exited'):
    """Replace a column with the mean of the target within each of its values.

    For every distinct value of ``col`` the mean of ``target`` is computed
    over all rows of ``df`` (including the row being encoded). That mean is
    attached as ``col + '_target'`` and the original ``col`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``col`` and ``target``. It is not modified.
    col : str
        Name of the column to encode.
    target : str, default 'exited'
        Name of the column whose per-group mean is used as the encoding.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``col`` and with ``col + '_target'`` appended as
        the last column.
    """
    encoded_col = col + '_target'

    # One row per distinct value of `col`, holding the mean target rate.
    target_means = (
        df.groupby(col)[target]
        .mean()
        .rename(encoded_col)
        .reset_index()
    )

    # validate='m:1' guards against duplicated keys on the lookup side.
    encoded = pd.merge(df, target_means, on=col, how='left',
                       validate='m:1')

    return encoded.drop(columns=[col])

### Baseline Models
Evaluation metric: ROC AUC (`roc_auc`)
1. logistic regression
2. catboost classifier
3. xgboost classifier
4. lightgbm classifier

In [61]:
def calculate_score(df, model='lr', scaler='minmax'):
    """Cross-validate a classifier on ``df`` and return the per-fold ROC AUC.

    The scaler and the estimator are chained in a pipeline, so the scaler is
    re-fitted on the training part of every fold and never sees the rows the
    fold is scored on.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. The 'exited' column is the target; every other column
        is used as a feature.
    model : str or estimator, default 'lr'
        'lr' (LogisticRegression), 'cat' (CatBoostClassifier),
        'xgb' (XGBClassifier) or 'lgb' (LGBMClassifier), each built with the
        library defaults and logging silenced where the library allows it.
        Any non-string value is used as the estimator as-is.
    scaler : str, default 'minmax'
        'minmax' (MinMaxScaler) or 'standard' (StandardScaler).

    Returns
    -------
    scores
        The result of ``cross_val_score``: one ROC AUC value per fold of a
        shuffled 5-fold stratified split (random_state=42).

    Raises
    ------
    ValueError
        If ``scaler`` is not a supported name, or ``model`` is a string that
        is not a supported name.
    """
    # Fail fast on typos instead of surfacing an obscure error from inside
    # scikit-learn later on.
    if scaler not in ('minmax', 'standard'):
        raise ValueError(f'{scaler} is not supported')
    if isinstance(model, str) and model not in ('lr', 'cat', 'xgb', 'lgb'):
        raise ValueError(f'{model} is not supported')

    features = df.drop(columns=['exited'])
    target = df['exited']

    # standardization step
    if scaler == 'minmax':
        scaling_step = MinMaxScaler()
    else:
        scaling_step = StandardScaler()

    # estimator
    if not isinstance(model, str):
        estimator = model
    elif model == 'lr':
        estimator = LogisticRegression()
    elif model == 'cat':
        estimator = CatBoostClassifier(verbose=0)
    elif model == 'xgb':
        estimator = XGBClassifier(verbosity=0)
    else:
        estimator = LGBMClassifier(verbose=-1)

    # cross validation: scaling happens inside each fold via the pipeline
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    pipeline = make_pipeline(scaling_step, estimator)
    scores = cross_val_score(pipeline, features, target, cv=skf,
                             scoring='roc_auc')

    return scores

In [27]:
%%time
# Load the training split, lowercase the column headers and discard the
# identifier columns.
train_df = (
    load_data('train')
    .rename(columns=str.lower)
    .drop(columns=['id', 'customerid'])
)

# Swap each of these columns for its per-value mean of the target.
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)

# Score every baseline model under both scalers and report the mean ROC AUC.
for model, scaler in [(m, s)
                      for m in ['lr', 'cat', 'xgb', 'lgb']
                      for s in ['minmax', 'standard']]:
    scores = calculate_score(train_df, model=model, scaler=scaler)
    print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8353.
The average score of lr with standard is  0.8353.
The average score of cat with minmax is  0.8996.
The average score of cat with standard is  0.8995.
The average score of xgb with minmax is  0.8981.
The average score of xgb with standard is  0.8982.
The average score of lgb with minmax is  0.9000.
The average score of lgb with standard is  0.8998.
CPU times: total: 31min 43s
Wall time: 2min 34s


### Hyperparameter Tuning

In [30]:
import optuna
from sklearn.metrics import roc_auc_score
from optuna.integration import LightGBMPruningCallback

In [90]:
def _take_rows(data, idx):
    """Select rows by position from a pandas object or an indexable array."""
    return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]


def objective(trial, X, y):
    """Optuna objective: mean 5-fold ROC AUC of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the hyperparameters below. The fixed settings
        (n_estimators, subsample_freq, random_state, n_jobs) are registered
        as single-choice categoricals so they are recorded with the trial's
        parameters.
    X : array-like or pandas.DataFrame
        Feature matrix; rows are selected by position.
    y : array-like or pandas.Series
        Binary target aligned with ``X``; rows are selected by position.

    Returns
    -------
    float
        Mean ROC AUC over the folds of a shuffled stratified 5-fold split
        (random_state=42).

    Notes
    -----
    Each fold's held-out part is used both as the early-stopping evaluation
    set and as the data the fold's ROC AUC is computed on.
    """
    param_grid = {

        # tree structure
        'max_depth': trial.suggest_int('max_depth', 3, 12, step=1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 3000, step=20),

        # better accuracy
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.01),
        'n_estimators': trial.suggest_categorical('n_estimators', [4000]),

        # combat overfitting
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.99, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 0.99, log=True),
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1]),
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L1 regularization
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L2 regularization

        'random_state': trial.suggest_categorical('random_state', [42]),
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]),
    }

    # Single source of truth for the fold count (splitter and score buffer).
    n_splits = 5
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    cv_scores = np.empty(n_splits)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positions, so select by position rather than by
        # label; this keeps pandas inputs with a non-default index correct.
        X_train, X_test = _take_rows(X, train_idx), _take_rows(X, test_idx)
        y_train, y_test = _take_rows(y, train_idx), _take_rows(y, test_idx)

        model = LGBMClassifier(objective='binary', **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric='auc',
            # Early stopping and log silencing are passed as callbacks; the
            # `early_stopping_rounds` / `verbose` fit arguments were removed
            # in LightGBM 4.0.
            callbacks=[
                early_stopping(stopping_rounds=100, verbose=False),
                log_evaluation(period=0),
            ],
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[idx] = roc_auc_score(y_test, y_preds)

    return np.mean(cv_scores)

In [91]:
%%time
# Split the encoded training frame into features and target.
X_train = train_df.drop(columns=['exited'])
y_train = train_df['exited']

# Rescale the features to the [0, 1] range before tuning.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across runs (TPE is Optuna's default sampler; only the seed is new here).
study = optuna.create_study(
    direction='maximize',
    study_name='LGBM Classifier',
    sampler=optuna.samplers.TPESampler(seed=42),
)


def func(trial):
    """Bind the scaled training data to the Optuna objective."""
    return objective(trial, X_train_scaled, y_train)


study.optimize(func, n_trials=20)

[32m[I 2024-01-03 17:37:22,272][0m A new study created in memory with name: LGBM Classifier[0m
[32m[I 2024-01-03 17:38:21,341][0m Trial 0 finished with value: 0.8979829346517529 and parameters: {'max_depth': 8, 'num_leaves': 2300, 'learning_rate': 0.02, 'n_estimators': 4000, 'colsample_bytree': 0.406160792434119, 'subsample': 0.8105160683574074, 'subsample_freq': 1, 'reg_alpha': 100.0, 'reg_lambda': 50.0, 'random_state': 42, 'n_jobs': -1}. Best is trial 0 with value: 0.8979829346517529.[0m
[32m[I 2024-01-03 17:38:26,748][0m Trial 1 finished with value: 0.8991263574798459 and parameters: {'max_depth': 3, 'num_leaves': 800, 'learning_rate': 0.27, 'n_estimators': 4000, 'colsample_bytree': 0.6980550343877667, 'subsample': 0.31426771328283737, 'subsample_freq': 1, 'reg_alpha': 0.5, 'reg_lambda': 100.0, 'random_state': 42, 'n_jobs': -1}. Best is trial 1 with value: 0.8991263574798459.[0m
[32m[I 2024-01-03 17:38:31,405][0m Trial 2 finished with value: 0.8997282279039382 and paramet

CPU times: total: 1h 12min 26s
Wall time: 4min 47s


In [92]:
# Report the best objective value found by the study, i.e. the highest mean
# cross-validated ROC AUC returned by `objective` over all trials.
print(f'\tBest value： {study.best_value:.5f}')

	Best value： 0.90068


In [None]:
# Rebuild LightGBM with the best hyperparameters from the study and score it
# with the same cross-validation helper used for the baseline models.
# NOTE(review): best_params includes n_estimators=4000 and calculate_score
# applies no early stopping, so each fold trains all 4000 rounds here, unlike
# the tuning objective, which stopped early. Confirm this is intended.
model = LGBMClassifier(objective='binary', **study.best_params)
scores = calculate_score(train_df, model)
# Mean ROC AUC across the folds, displayed as the cell output.
scores.mean()