In [14]:
# !pip install optuna

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [2]:


# Number of stratified cross-validation folds used when scoring a trial.
NUM_FOLD = 5

# The train/test files are read with their 'id' column as the index; the
# original dataset and the sample submission keep the default index.
train = pd.read_csv('../dataset/train.csv', index_col='id')
test = pd.read_csv('../dataset/test.csv', index_col='id')
original = pd.read_csv('../dataset/original.csv')
sample_submission = pd.read_csv('../dataset/sample_submission.csv')

# Categorical features are the object-dtype columns of the test frame.
cat_cols = test.select_dtypes(include=['object']).columns.tolist()

# Cast every categorical column to string first and then to pandas'
# 'category' dtype, in place, on each of the three feature frames.
for df in (train, test, original):
    for col in cat_cols:
        as_text = df[col].astype('str')
        df[col] = as_text.astype('category')

# Separate the features from the 'loan_status' target for both datasets.
X = train.drop(columns=['loan_status'])
y = train['loan_status']
X_original = original.drop(columns=['loan_status'])
y_original = original['loan_status']


In [19]:
# from sklearn.model_selection import train_test_split
# # from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier


# # lgb_params = {
# #     'objective': 'binary',
# #     'metric': 'auc',
# #     'random_state': 42,
# #     'n_estimators': 1000,
# #     'learning_rate': 0.05,
# #     'max_depth': 6,
# #     'num_leaves': 31,
# #     'subsample': 0.8,
# #     'colsample_bytree': 0.8,
# # }
# # cat_params = {
# #     'loss_function': 'Logloss',
# #     'eval_metric': 'AUC',
# #     'random_seed': 42,
# #     'iterations': 1000,
# #     'learning_rate': 0.05,
# #     'depth': 6,
# #     'verbose': False,

# # }


# def objective(trials):
#     loss_func = trials.suggest_categorical('loss_function', ['Logloss', 'CrossEntropy'])
#     learning_rate = trials.suggest_uniform('learning_rate', 0.01, 0.1)
#     depth = trials.suggest_int('depth', 4, 10)
#     iterations = trials.suggest_int('iterations', 100, 1400)
#     eval_metric = trials.suggest_categorical('eval_metric', ['AUC', 'Accuracy'])

#     model = CatBoostClassifier(
#         random_seed=42,
#         loss_function=loss_func,
#         learning_rate=learning_rate,
#         depth=depth,
#         iterations=iterations,
#         eval_metric=eval_metric,
#         verbose=False,
#     )



#     skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=42)
#     scores = []
#     for train_index, valid_index in skf.split(X, y):
#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
#         model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
#         y_pred = model.predict_proba(X_valid)[:, 1]
#         score = roc_auc_score(y_valid, y_pred)
#         scores.append(score)

#     return np.mean(scores)

In [3]:
from sklearn.model_selection import train_test_split
# from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# lgb_params = {
#     'objective': 'binary',
#     'metric': 'auc',
#     'random_state': 42,
#     'n_estimators': 1000,
#     'learning_rate': 0.05,
#     'max_depth': 6,
#     'num_leaves': 31,
#     'subsample': 0.8,
#     'colsample_bytree': 0.8,
# }
# cat_params = {
#     'loss_function': 'Logloss',
#     'eval_metric': 'AUC',
#     'random_seed': 42,
#     'iterations': 1000,
#     'learning_rate': 0.05,
#     'depth': 6,
#     'verbose': False,

# }


def objective(trials):
    """Optuna objective: mean cross-validated ROC AUC of a CatBoost classifier.

    Samples one CatBoost configuration from the trial, evaluates it with
    ``NUM_FOLD``-fold stratified cross-validation on the notebook-level ``X``
    and ``y``, and returns the average validation ROC AUC.

    Parameters
    ----------
    trials : optuna.trial.Trial
        Trial handle passed in by ``study.optimize``; used only to sample
        hyperparameters. The parameter name is kept for compatibility.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores (the study maximizes this).
    """
    seed = 42  # shared by the CatBoost model and the fold splitter

    # Hyperparameters are sampled in the same order as before so that the
    # parameter names/ranges recorded by the study are unchanged.
    # `suggest_float` replaces the deprecated `suggest_uniform`; with no
    # `log`/`step` arguments it samples uniformly from the same interval.
    params = {
        'random_seed': seed,
        'loss_function': trials.suggest_categorical('loss_function', ['Logloss', 'CrossEntropy']),
        'learning_rate': trials.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trials.suggest_int('depth', 4, 10),
        'iterations': trials.suggest_int('iterations', 100, 1400),
        'eval_metric': trials.suggest_categorical('eval_metric', ['AUC', 'Accuracy']),
        'verbose': False,
    }

    skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=seed)
    scores = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # A fresh estimator per fold makes it explicit that no fitted state
        # is carried over from one fold to the next.
        model = CatBoostClassifier(**params)

        # Explicitly specify categorical features for CatBoost.
        # NOTE(review): the fold used for scoring is also passed as eval_set;
        # if CatBoost selects its best iteration on it, the AUC below may be
        # optimistic -- confirm this is intended.
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], cat_features=cat_cols)

        y_pred = model.predict_proba(X_valid)[:, 1]
        scores.append(roc_auc_score(y_valid, y_pred))

    return np.mean(scores)

In [4]:
import optuna

# Create an in-memory study that maximizes the value returned by `objective`.
# The sampler is seeded so that repeated runs propose the same sequence of
# hyperparameter candidates; TPESampler is Optuna's default sampler, so only
# the seeding is new.
# NOTE(review): the study name says 'lgbm' although `objective` tunes a
# CatBoost model; the name is left unchanged here.
study = optuna.create_study(
    direction='maximize',
    study_name='lgbm_loan',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Each trial runs a full cross-validation inside `objective`, so 100 trials
# is a long-running call.
study.optimize(objective, n_trials=100)

[I 2024-10-13 12:22:17,592] A new study created in memory with name: lgbm_loan
[I 2024-10-13 12:23:28,403] Trial 0 finished with value: 0.9482638183221985 and parameters: {'loss_function': 'CrossEntropy', 'learning_rate': 0.03743587139314545, 'depth': 9, 'iterations': 710, 'eval_metric': 'Accuracy'}. Best is trial 0 with value: 0.9482638183221985.
[I 2024-10-13 12:24:13,245] Trial 1 finished with value: 0.956138721661955 and parameters: {'loss_function': 'CrossEntropy', 'learning_rate': 0.08955224063660172, 'depth': 4, 'iterations': 854, 'eval_metric': 'AUC'}. Best is trial 1 with value: 0.956138721661955.
[I 2024-10-13 12:25:25,327] Trial 2 finished with value: 0.9495743020371427 and parameters: {'loss_function': 'Logloss', 'learning_rate': 0.08508374161944439, 'depth': 10, 'iterations': 496, 'eval_metric': 'AUC'}. Best is trial 1 with value: 0.956138721661955.
[I 2024-10-13 12:25:42,696] Trial 3 finished with value: 0.9382980058374155 and parameters: {'loss_function': 'CrossEntropy',

KeyboardInterrupt: 

In [5]:
# Display the hyperparameters of the best trial recorded in `study`.
study.best_params

{'loss_function': 'CrossEntropy',
 'learning_rate': 0.08881827847947872,
 'depth': 5,
 'iterations': 1243,
 'eval_metric': 'AUC'}

In [6]:
# Display the best objective value (mean cross-validated ROC AUC) in `study`.
study.best_value

0.9578240596052154