In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [20]:


# Number of stratified CV folds used while tuning.
NUM_FOLD = 3

train = pd.read_csv('../dataset/train.csv', index_col='id')
test = pd.read_csv('../dataset/test.csv', index_col='id')

# Every feature column (numeric ones included) is handed to CatBoost as a
# categorical feature; the column list is taken from the test frame, which
# has no 'loan_status' target column.
cat_cols = test.columns.values
print(cat_cols)

for df in (train, test):
    for col in cat_cols:
        values = df[col]
        if values.dtype == 'float64':
            # Round floats to 2 decimals before turning them into string labels.
            values = values.round(decimals=2)
        df[col] = values.astype('str').astype('category')

y = train['loan_status']
X = train.drop(columns=['loan_status'])

# Keep only a stratified 20% of the rows for the hyperparameter search
# (the 80% "test" part returned by the split is discarded).
X, _, y, _ = train_test_split(X, y, test_size=0.8, random_state=42, stratify=y)



['person_age' 'person_income' 'person_home_ownership' 'person_emp_length'
 'loan_intent' 'loan_grade' 'loan_amnt' 'loan_int_rate'
 'loan_percent_income' 'cb_person_default_on_file'
 'cb_person_cred_hist_length']


In [21]:
# Count how often each distinct row (full feature combination) occurs in the
# tuning subsample X.
X.value_counts()

person_age  person_income  person_home_ownership  person_emp_length  loan_intent        loan_grade  loan_amnt  loan_int_rate  loan_percent_income  cb_person_default_on_file  cb_person_cred_hist_length
21          109000         OWN                    5.0                VENTURE            B           15000      11.48          0.14                 N                          3                             1
28          60000          MORTGAGE               3.0                HOMEIMPROVEMENT    A           15000      7.49           0.25                 N                          8                             1
                                                                     PERSONAL           B           6000       11.48          0.1                  N                          10                            1
                                                  4.0                DEBTCONSOLIDATION  A           4800       7.9            0.08                 N                          10     

In [22]:
def objective_func(trial):
    """Optuna objective: mean ROC AUC of CatBoost over stratified CV folds.

    Draws one CatBoost configuration from the "test 1" search space, fits it on
    each of the ``NUM_FOLD`` stratified folds of the module-level ``X`` / ``y``
    (all columns in ``cat_cols`` are passed as categorical features) and scores
    the held-out fold with ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameter values.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores.
    """
    # Search space "test 1" (the active one).
    #
    # Previously tried alternatives, kept for reference (low, high):
    #   parameter            test 2         test 3
    #   iterations           600, 1500      800, 1500
    #   depth                1, 8           2, 6
    #   learning_rate        0.01, 0.1      0.05, 0.1
    #   l2_leaf_reg          6, 15          5, 10
    #   border_count         190, 300       210, 350
    #   random_strength      6, 12          4, 10
    #   bagging_temperature  0.5, 1.5       1, 2
    #   scale_pos_weight     0, 8           2, 8
    params = {
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'depth': trial.suggest_int('depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0, 10),
        'border_count': trial.suggest_int('border_count', 32, 256),
        'random_strength': trial.suggest_float('random_strength', 0.5, 8),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1.5),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
        'verbose': False,
        'task_type': 'CPU',
    }

    def fold_auc(fit_rows, holdout_rows):
        """Fit on one fold's training rows and return ROC AUC on its held-out rows."""
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        clf = CatBoostClassifier(**params, cat_features=cat_cols)
        # NOTE: the held-out fold is used for early stopping and is then also
        # the fold that gets scored, so the AUC may be slightly optimistic.
        clf.fit(
            X_fit,
            y_fit,
            eval_set=[(X_holdout, y_holdout)],
            early_stopping_rounds=100,
            verbose=False,
        )
        positive_proba = clf.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, positive_proba)

    splitter = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=42)
    fold_scores = [
        fold_auc(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X, y)
    ]
    return np.mean(fold_scores)

In [23]:
import optuna

# Seed the sampler so the hyperparameter search is reproducible after a kernel
# restart. TPESampler is Optuna's default sampler, so the search algorithm
# itself is unchanged; only its randomness is pinned.
sampler = optuna.samplers.TPESampler(seed=42)

# Maximise the mean CV ROC AUC returned by objective_func over 100 trials.
study = optuna.create_study(
    direction='maximize',
    study_name="Catboost all columns Category",
    sampler=sampler,
)
study.optimize(objective_func, n_trials=100)

[I 2024-10-15 12:18:46,861] A new study created in memory with name: Catboost all columns Category
[I 2024-10-15 12:18:53,634] Trial 0 finished with value: 0.9514978324474469 and parameters: {'iterations': 521, 'depth': 9, 'learning_rate': 0.0782336490137188, 'l2_leaf_reg': 5.234033707272165, 'border_count': 232, 'random_strength': 2.1608017371977004, 'bagging_temperature': 1.0902045782493959, 'scale_pos_weight': 8.340148324714708}. Best is trial 0 with value: 0.9514978324474469.
[I 2024-10-15 12:18:59,648] Trial 1 finished with value: 0.9526205010319512 and parameters: {'iterations': 612, 'depth': 3, 'learning_rate': 0.06131160202159489, 'l2_leaf_reg': 7.1984460677101385, 'border_count': 94, 'random_strength': 0.6078407544176783, 'bagging_temperature': 0.4578211370962242, 'scale_pos_weight': 7.602523970125969}. Best is trial 1 with value: 0.9526205010319512.
[I 2024-10-15 12:19:04,684] Trial 2 finished with value: 0.9520003292715365 and parameters: {'iterations': 224, 'depth': 8, 'lea

In [24]:
# Best hyperparameter set found by the study and its objective value
# (the mean CV ROC AUC returned by objective_func).
study.best_params, study.best_value

({'iterations': 891,
  'depth': 7,
  'learning_rate': 0.0448563477253477,
  'l2_leaf_reg': 8.309884320742215,
  'border_count': 161,
  'random_strength': 5.297381835241815,
  'bagging_temperature': 0.10498408144882451,
  'scale_pos_weight': 2.3753101495123747},
 0.9550143699850512)

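# Final training

Refit CatBoost with the best hyperparameters found above, using 5-fold stratified CV on the full training set, with the original dataset appended to each training fold.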

In [28]:

# Number of stratified CV folds used for the final fit.
NUM_FOLD = 5

# Reload the raw frames so the final fit starts from the full, un-subsampled data.
train = pd.read_csv('../dataset/train.csv', index_col='id')
test = pd.read_csv('../dataset/test.csv', index_col='id')
original = pd.read_csv('../dataset/original.csv')

# All feature columns (taken from the test frame, which has no target column)
# are treated as categorical features.
cat_cols = test.columns.values

# NOTE(review): unlike the tuning cell, float columns are NOT rounded to
# 2 decimals before the string conversion here -- confirm this is intentional.
for frame in (train, test, original):
    for name in cat_cols:
        as_text = frame[name].astype('str')
        frame[name] = as_text.astype('category')

# Split the features from the 'loan_status' target, for both the competition
# training set and the original dataset.
y = train['loan_status']
X = train.drop(columns=['loan_status'])
y_original = original['loan_status']
X_original = original.drop(columns=['loan_status'])





In [29]:

# Per-fold validation ROC AUC and per-fold test-set probabilities.
val_scores = []
fold_test_preds = []

skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=42)


# test 1 -- best hyperparameters reported by the Optuna study (study.best_params)
params = {'iterations': 891,
  'depth': 7,
  'learning_rate': 0.0448563477253477,
  'l2_leaf_reg': 8.309884320742215,
  'border_count': 161,
  'random_strength': 5.297381835241815,
  'bagging_temperature': 0.10498408144882451,
  'scale_pos_weight': 2.3753101495123747}

for Fold, (train_index, val_index) in enumerate(skf.split(X, y)):

    # skf.split yields POSITIONAL indices, so X and y must both be sliced with
    # .iloc. Plain y[train_index] is a label lookup on the 'id' index and only
    # selects the right rows when the ids happen to be exactly 0..n-1.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Append the original dataset to the training part only, so each fold is
    # validated purely on competition rows.
    X_train = pd.concat([X_train, X_original], axis=0)
    y_train = pd.concat([y_train, y_original])

    model = CatBoostClassifier(**params, cat_features=cat_cols)
    # The validation fold is used for early stopping and is also the fold that
    # gets scored below.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    y_pred = model.predict_proba(X_val)[:, 1]

    roc_auc_score_ = roc_auc_score(y_val, y_pred)

    print(f'Fold {Fold}: roc_auc_score= {roc_auc_score_:.5f}')

    val_scores.append(roc_auc_score_)

    # Positive-class probability on the test set from this fold's model.
    fold_test_preds.append(model.predict_proba(test)[:, 1])

# Final test prediction: element-wise mean of the per-fold probabilities.
test_preds_model = np.mean(fold_test_preds, axis=0)

print(f'mean validation roc_auc_score = {np.mean(val_scores):.5f}')
print(f'std validation roc_auc_score = {np.std(val_scores):.5f}')


Fold 0: roc_auc_score= 0.96436
Fold 1: roc_auc_score= 0.97157
Fold 2: roc_auc_score= 0.96897
Fold 3: roc_auc_score= 0.97142
Fold 4: roc_auc_score= 0.96759
mean validation roc_auc_score = 0.96878
std validation roc_auc_score = 0.00268
