In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


In [2]:

NUM_FOLD = 5

train = pd.read_csv('../dataset/train.csv', index_col='id')
test = pd.read_csv('../dataset/test.csv', index_col='id')
original = pd.read_csv('../dataset/original.csv')
sample_submission = pd.read_csv('../dataset/sample_submission.csv')


cat_cols = list(test.select_dtypes(include=['object']).columns)

for df in [train, test, original]:
    for col in cat_cols:  
        df[col] = df[col].astype('str').astype('category') 


X = train.drop(['loan_status'], axis=1)
y = train['loan_status']
X_original = original.drop(['loan_status'], axis=1)
y_original = original['loan_status']


In [7]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# params_xgb = {
#     'enable_categorical': True,
#     'random_state': 1,
#     'n_estimators': 10000,
#     'learning_rate': 0.01,
#     'colsample_bytree': 0.6,
#     'reg_lambda': 0.01,
#     'max_depth': 4,
#     'max_bin': 5000,
#     'subsample': 0.95,
#     'reg_alpha': 0.1,
# }


def objective(trials):
 
    n_estimators = trials.suggest_int('n_estimators', 1000, 10000)
    learning_rate = trials.suggest_loguniform('learning_rate', 0.01, 0.1)
    colsample_bytree = trials.suggest_uniform('colsample_bytree', 0.1, 1.0)
    reg_lambda = trials.suggest_loguniform('reg_lambda', 0.01, 1.0)
    max_depth = trials.suggest_int('max_depth', 3, 10)
    max_bin = trials.suggest_int('max_bin', 1000, 10000)
    subsample = trials.suggest_uniform('subsample', 0.1, 1.0)
    reg_alpha = trials.suggest_loguniform('reg_alpha', 0.01, 1.0)

    model = XGBClassifier(
                            enable_categorical=True,
                            random_state=42,
                            n_estimators=n_estimators,
                            learning_rate=learning_rate,
                            colsample_bytree=colsample_bytree,
                            reg_lambda=reg_lambda,
                            max_depth=max_depth,
                            max_bin=max_bin,
                            subsample=subsample,
                            reg_alpha=reg_alpha,
                            use_label_encoder=False,
                            early_stopping_rounds=100, 
                            verbosity=0
                            )
    
    skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=42)
    scores = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        y_pred = model.predict_proba(X_valid)[:, 1]
        score = roc_auc_score(y_valid, y_pred)
        scores.append(score)
    
    return np.mean(scores)

In [8]:
import optuna

study = optuna.create_study(direction='maximize', study_name='xgb_loan')
study.optimize(objective, n_trials=50)

[I 2024-10-13 11:22:43,175] A new study created in memory with name: xgb_loan
[I 2024-10-13 11:23:15,252] Trial 0 finished with value: 0.9607642811603159 and parameters: {'n_estimators': 8886, 'learning_rate': 0.022409275352942896, 'colsample_bytree': 0.5544891463369508, 'reg_lambda': 0.0866815870952636, 'max_depth': 3, 'max_bin': 4477, 'subsample': 0.8104745435206383, 'reg_alpha': 0.12140135860750263}. Best is trial 0 with value: 0.9607642811603159.
[I 2024-10-13 11:23:20,749] Trial 1 finished with value: 0.9563236664160495 and parameters: {'n_estimators': 7313, 'learning_rate': 0.07838986960572583, 'colsample_bytree': 0.9926236441266069, 'reg_lambda': 0.08478114719564188, 'max_depth': 4, 'max_bin': 4858, 'subsample': 0.2870862873166744, 'reg_alpha': 0.13662148672109942}. Best is trial 0 with value: 0.9607642811603159.
[I 2024-10-13 11:23:25,565] Trial 2 finished with value: 0.9571530008875776 and parameters: {'n_estimators': 7035, 'learning_rate': 0.09115023756532271, 'colsample_bytr

In [9]:
study.best_params, study.best_value

({'n_estimators': 7978,
  'learning_rate': 0.012320163771771564,
  'colsample_bytree': 0.6206861124243899,
  'reg_lambda': 0.13082515371562026,
  'max_depth': 4,
  'max_bin': 6691,
  'subsample': 0.8968362560875737,
  'reg_alpha': 0.9728255967146223},
 0.9612009860386594)

In [12]:
skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=42)
val_scores = []
test_preds_model = []

for Fold, (train_index, val_index) in enumerate(skf.split(X, y)):

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    X_train = pd.concat([X_train, X_original], axis=0)
    y_train = pd.concat([y_train, y_original]) 

    model = XGBClassifier(
            enable_categorical=True,
            random_state=42,
            use_label_encoder=False,
            # early_stopping_rounds=100, 
            **study.best_params,
            verbosity=0
        )
    model.fit(X_train, y_train)

    y_pred = model.predict_proba(X_val)[:, 1]

    roc_auc_score_ = roc_auc_score(y_val, y_pred)

    print(f'Fold {Fold}: roc_auc_score= {roc_auc_score_:.5f}')

    val_scores.append(roc_auc_score_)

    test_preds_model.append(model.predict_proba(test)[:, 1])

test_preds_model = sum(test_preds_model)/len(test_preds_model)

print(f'mean validation roc_auc_score = {np.mean(val_scores):.5f}')
print(f'std validation roc_auc_score = {np.std(val_scores):.5f}')


Fold 0: roc_auc_score= 0.96095
Fold 1: roc_auc_score= 0.96997
Fold 2: roc_auc_score= 0.96441
Fold 3: roc_auc_score= 0.96515
Fold 4: roc_auc_score= 0.96558
mean validation roc_auc_score = 0.96521
std validation roc_auc_score = 0.00288


In [13]:
submission = pd.DataFrame({'id': sample_submission['id'], 'loan_status': test_preds_model})

In [14]:
submission.to_csv('submission_xgb.csv', index=False)

In [15]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour, plot_parallel_coordinate

plot_optimization_history(study).show()
plot_param_importances(study).show()
plot_slice(study).show()
plot_contour(study).show()
plot_parallel_coordinate(study).show()
