In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import mlflow
import mlflow.sklearn

# 데이터 로딩
data = pd.read_csv('../data/dataset.csv')

In [5]:
# 예측 타겟과 특성 분리
X = data.drop('Default', axis=1)
y = data['Default']

In [6]:
# 범주형과 수치형 컬럼 구분
categorical_cols = ['State', 'BankState', 'NewExist', 'UrbanRural', 'RealEstate']
numerical_cols = ['DisbursementGross', 'GrAppv', 'daysterm']


In [7]:
# 데이터 전처리 파이프라인 구성
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

In [9]:
# 데이터 전처리 실행
X_processed = preprocessor.fit_transform(X)

In [10]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# 하이퍼파라미터 탐색 공간 설정
space = {
    'max_depth': hp.choice('max_depth', range(3, 10)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators': hp.choice('n_estimators', range(50, 200)),
    'gamma': hp.uniform('gamma', 0, 5)
}


In [11]:
# 최적화를 위한 목적 함수 정의
def objective(params):
    # MLflow에 실험 이름 설정
    mlflow.set_experiment("assignment1")

    # 각 하이퍼파라미터 조합별 실험 시작
    with mlflow.start_run(nested=True):
        # XGBoost 모델 초기화 및 훈련
        model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, **params)
        model.fit(X_train, y_train)

        # 예측 확률 및 ROC-AUC 계산
        probs = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, probs)

        # MLflow에 파라미터 및 메트릭 로깅
        mlflow.log_params(params)
        mlflow.log_metric("roc_auc", auc)

        # 목적 함수 결과 반환 (손실 최소화)
        return {'loss': -auc, 'status': STATUS_OK}


In [12]:
# Hyperopt를 사용한 최적화 실행
trials = Trials()
best_params = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=trials
)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

2025/10/10 14:13:22 INFO mlflow.tracking.fluent: Experiment with name 'assignment1' does not exist. Creating a new experiment.



  7%|▋         | 2/30 [00:00<00:05,  4.78trial/s, best loss: -0.9742599742599742]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 13%|█▎        | 4/30 [00:00<00:03,  6.98trial/s, best loss: -0.9752871002871004]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 23%|██▎       | 7/30 [00:01<00:03,  7.66trial/s, best loss: -0.9757697257697258]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 30%|███       | 9/30 [00:01<00:02,  8.25trial/s, best loss: -0.9757697257697258]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 40%|████      | 12/30 [00:01<00:01,  9.52trial/s, best loss: -0.9802618552618552]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 47%|████▋     | 14/30 [00:01<00:01,  9.36trial/s, best loss: -0.9802618552618552]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 53%|█████▎    | 16/30 [00:01<00:01,  8.75trial/s, best loss: -0.9802618552618552]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 67%|██████▋   | 20/30 [00:02<00:00, 10.29trial/s, best loss: -0.9802618552618552]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 73%|███████▎  | 22/30 [00:02<00:00, 10.26trial/s, best loss: -0.9802618552618552]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 80%|████████  | 24/30 [00:02<00:00, 10.21trial/s, best loss: -0.9802618552618552]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 93%|█████████▎| 28/30 [00:03<00:00,  9.87trial/s, best loss: -0.9802618552618552]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



100%|██████████| 30/30 [00:03<00:00,  8.91trial/s, best loss: -0.9802618552618552]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

