In [22]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import optuna
from sklearn.metrics import roc_auc_score

### Load the data

In [2]:
def load_data(dataset):
    train_url = os.path.join('../data/playground_series_s4e1', 
                         'train.csv')
    test_url = os.path.join('../data/playground_series_s4e1', 
                            'test.csv')
    origin_url = os.path.join('../data/playground_series_s4e1', 
                              'Churn_Modelling.csv')
    if dataset == 'train':
        df = pd.read_csv(train_url)
    elif dataset == 'test':
        df = pd.read_csv(test_url)
    elif dataset == 'origin':
        df = pd.read_csv(origin_url)
    else:
        raise ValueError(f'{dataset} is not a supported dataset')
    
    return df

### Target Imputation

In [3]:
def impute_target(df, col):
    # calcualted the mean exited rate by specified columns
    df_target = df.groupby(col).agg({'exited': 'mean'})
    df_target = df_target.reset_index()
    df_target = df_target.rename(columns={'exited': col+'_target'})
    
    df = pd.merge(df, df_target, on=col, how='left', 
                  validate='m:1')
    df = df.drop(columns=[col])
    
    return df

### Baseline Models
Accuracy: roc_auc
1. logistic regression
2. catboost classifier
3. xgboost classifier
4. lightgbm classifier

In [4]:
def calculate_score(df, model='lr', scaler='minmax'):
    
    X_train = df.drop(columns=['exited'])
    y_train = df['exited']
    
    # stadardization
    if scaler == 'minmax':
        scaler = MinMaxScaler()
    elif scaler == 'standard':
        scaler = StandardScaler()
    else:
        raise ValueError(f'{scaler} is not supported')
    X_train_scaled = scaler.fit_transform(X_train)
    
    # cross validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    if model == 'lr':
        model = LogisticRegression()
    elif model == 'cat':
        model = CatBoostClassifier(verbose=0)
    elif model == 'xgb':
        model = XGBClassifier(verbosity=0)
    elif model == 'lgb': 
        model = LGBMClassifier(verbose=-1)
    else:
        model = model
    scores = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='roc_auc')
    
    return scores

### Generated Dataset
The highest roc auc score of **0.9000** is achieved by lightgbm classifier with minmax scaler.

In [5]:
%%time
# load the data
train_df = load_data('train')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.drop(columns=['id', 'customerid'])
#train_df = train_df.drop(columns=['id'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
#cols = ['customerid', 'surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)
    
# calculate the roc scores
for model in ['lr', 'cat', 'xgb', 'lgb']:
    for scaler in ['minmax', 'standard']: 
        scores = calculate_score(train_df, model=model, scaler=scaler)
        print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8353.
The average score of lr with standard is  0.8353.
The average score of cat with minmax is  0.8996.
The average score of cat with standard is  0.8995.
The average score of xgb with minmax is  0.8981.
The average score of xgb with standard is  0.8982.
The average score of lgb with minmax is  0.9000.
The average score of lgb with standard is  0.8998.
CPU times: total: 30min 43s
Wall time: 2min 31s


### Original Dataset
The highest auc score of **0.9341** is achieved by catboost classifier with minmax/standard scalers.

In [6]:
%%time
# load the data
train_df = load_data('origin')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.rename(columns={'rownumber': 'id'})

# drop the rows with missing values and useless columns
train_df = train_df.dropna(how='any', axis=0)
train_df = train_df.drop(columns=['id', 'customerid'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
#cols = ['customerid', 'surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)
    
# calculate the roc scores
for model in ['lr', 'cat', 'xgb', 'lgb']:
    for scaler in ['minmax', 'standard']: 
        scores = calculate_score(train_df, model=model, scaler=scaler)
        print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8841.
The average score of lr with standard is  0.8843.
The average score of cat with minmax is  0.9341.
The average score of cat with standard is  0.9341.
The average score of xgb with minmax is  0.9244.
The average score of xgb with standard is  0.9244.
The average score of lgb with minmax is  0.9310.
The average score of lgb with standard is  0.9316.
CPU times: total: 6min 39s
Wall time: 36.6 s


### Original + Generated Dataset
The highest auc score of **0.8982** is achieved by lightgbm classifier with minmax scaler.

In [44]:
%%time
# load the generated dataset
train_df = load_data('train')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.drop(columns=['id', 'customerid'])

# load the original dataset
origin_df = load_data('origin')
origin_df.columns = origin_df.columns.str.lower()
origin_df = origin_df.rename(columns={'rownumber': 'id'})
origin_df = origin_df.dropna(how='any', axis=0)
origin_df = origin_df.drop(columns=['id', 'customerid'])

# concat the data
train_df = pd.concat([train_df, origin_df], ignore_index=True)

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)
    
# calculate the roc scores
for model in ['lr', 'cat', 'xgb', 'lgb']:
    for scaler in ['minmax', 'standard']: 
        scores = calculate_score(train_df, model=model, scaler=scaler)
        print(f'The average score of {model} with {scaler} is {scores.mean(): .4f}.')

The average score of lr with minmax is  0.8312.
The average score of lr with standard is  0.8312.
The average score of cat with minmax is  0.8977.
The average score of cat with standard is  0.8976.
The average score of xgb with minmax is  0.8962.
The average score of xgb with standard is  0.8963.
The average score of lgb with minmax is  0.8982.
The average score of lgb with standard is  0.8981.
CPU times: total: 31min 47s
Wall time: 3min 58s


### Hyperparameter Tuning
The highest auc score of **0.8998** is achieved with 20 trials.

In [47]:
def objective(trial, X, y):
    
    param_grid = {
        
        # tree structure
        'max_depth': trial.suggest_int('max_depth', 3, 12, step=1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 3000, step=20), 

        # better accuracy
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.9, step=0.01),
        'n_estimators': trial.suggest_categorical('n_estimators', [10000]),

        # combat overfitting
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.99, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 0.99, log=True),
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1]), 
        'reg_alpha': trial.suggest_categorical('reg_alpha', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L1 regularization
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]), # L2 regularization

        'random_state': trial.suggest_categorical('random_state', [42]), 
        'n_jobs': trial.suggest_categorical('n_jobs', [-1]), 
        #'early_stopping_rounds': trial.suggest_categorical('early_stopping_rounds', [100]), 
        'metric': trial.suggest_categorical('metric', ['auc']), 
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model = LGBMClassifier(objective='binary', **param_grid)
        model.fit(
            X_train, 
            y_train, 
            eval_set=[(X_test, y_test)], 
            early_stopping_rounds=100, 
            verbose=0, 
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[idx] = roc_auc_score(y_test, y_preds)
    
    return np.mean(cv_scores)

In [50]:
%%time
# load the data
train_df = load_data('train')
train_df.columns = train_df.columns.str.lower()
train_df = train_df.drop(columns=['id', 'customerid'])
#train_df = train_df.drop(columns=['id'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
#cols = ['customerid', 'surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)

X_train = train_df.drop(columns=['exited'])
y_train = train_df['exited']

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', study_name='XGB Classifier', sampler=sampler)
func = lambda trial: objective(trial, X_train_scaled, y_train)
study.optimize(func, n_trials=20)

[32m[I 2024-01-05 14:40:41,805][0m A new study created in memory with name: XGB Classifier[0m
[32m[I 2024-01-05 14:40:55,074][0m Trial 0 finished with value: 0.8954198479781935 and parameters: {'max_depth': 6, 'num_leaves': 2860, 'learning_rate': 0.66, 'n_estimators': 10000, 'colsample_bytree': 0.5210282738918683, 'subsample': 0.25668551069752277, 'subsample_freq': 1, 'reg_alpha': 100.0, 'reg_lambda': 0.1, 'random_state': 42, 'n_jobs': -1, 'metric': 'auc'}. Best is trial 0 with value: 0.8954198479781935.[0m
[32m[I 2024-01-05 14:41:01,354][0m Trial 1 finished with value: 0.8982810969500473 and parameters: {'max_depth': 5, 'num_leaves': 1840, 'learning_rate': 0.13, 'n_estimators': 10000, 'colsample_bytree': 0.3191208553648477, 'subsample': 0.3593410534932508, 'subsample_freq': 1, 'reg_alpha': 0.5, 'reg_lambda': 5.0, 'random_state': 42, 'n_jobs': -1, 'metric': 'auc'}. Best is trial 1 with value: 0.8982810969500473.[0m
[32m[I 2024-01-05 14:41:06,946][0m Trial 2 finished with val

CPU times: total: 1h 54min 31s
Wall time: 7min 42s


In [51]:
%%time
# load the data
train_df = load_data('train')
train_df.columns = train_df.columns.str.lower()

# drop the rows with missing values and useless columns
train_df = train_df.dropna(how='any', axis=0)
train_df = train_df.drop(columns=['id', 'customerid'])

# impute the dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)

# calculate the roc scores
model = LGBMClassifier(objective='binary', **study.best_params)
scores = calculate_score(train_df, model=model)
print(f'The average score of the optimized model is {scores.mean(): .4f}.')

The average score of the optimized model is  0.8893.
CPU times: total: 4h 2min 6s
Wall time: 16min 37s


### Prediction

In [52]:
def impute_test(test_df, train_df, col):
    # calcualted the mean exited rate by specified columns
    df_target = train_df.groupby(col).agg({'exited': 'mean'})
    df_target = df_target.reset_index()
    df_target = df_target.rename(columns={'exited': col+'_target'})
    
    df = pd.merge(test_df, df_target, on=col, how='left', 
                  validate='m:1')
    df = df.drop(columns=[col])
    
    return df

In [53]:
# laod the training and testing dataset
train_df = load_data('train')
test_df = load_data('test')

train_df.columns = train_df.columns.str.lower()
test_df.columns = test_df.columns.str.lower()

test_df.index = test_df.id

train_df = train_df.drop(columns=['id', 'customerid'])
test_df = test_df.drop(columns=['id', 'customerid'])

# impute the testing dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    test_df = impute_test(test_df, train_df, col)

# impute the training dataset
cols = ['surname', 'geography', 'gender', 'hascrcard', 'isactivemember']
for col in cols:
    train_df = impute_target(train_df, col)

# fill the missing values of surname targets
surname_mean = test_df.surname_target.mean()
test_df = test_df.fillna(surname_mean)

print(f'The shape of the training dataset is {train_df.shape}.')
print(f'The shape of the testing dataset is {test_df.shape}.')

The shape of the training dataset is (165034, 12).
The shape of the testing dataset is (110023, 11).


In [54]:
X_train_val = train_df.drop(columns=['exited'])
y_train_val = train_df['exited']

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, 
                                                    test_size=0.2, stratify=y_train_val, 
                                                    random_state=42)

model = LGBMClassifier(objective='binary', **study.best_params)
model.fit(X_train, y_train, 
          eval_set=[(X_val, y_val)], 
          verbose=0)
test_proba = model.predict_proba(test_df)

In [55]:
test_df = load_data('test')
test_df['Exited'] = test_proba[:, 1]
test_df = test_df[['id', 'Exited']]
test_df.head()

Unnamed: 0,id,Exited
0,165034,0.011312
1,165035,0.68889
2,165036,0.010176
3,165037,0.136124
4,165038,0.26451


In [59]:
test_df.to_csv('../data/playground_series_s4e1/submission.csv', 
              index=False)

In [60]:
test_df.shape

(110023, 2)