In [None]:
!pip install imbalanced-learn

In [None]:
# 필요한 라이브러리와 모듈 임포트
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from tqdm import tqdm
import time

# Build a heavily imbalanced binary dataset (class weights 99% / 1%).
X, y = make_classification(
    n_classes=2, class_sep=1.5, weights=[.99, .01],
    n_informative=3, n_redundant=1, flip_y=0,
    n_features=20, n_clusters_per_class=1, n_samples=2000, random_state=10
)

# Split into train and test sets. Stratify on y: with only ~1% positives an
# unstratified split can leave the test set with very few (or zero) positive
# samples, which makes ROC AUC on the test set unstable or undefined.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Oversample the minority class with SMOTE, then under-sample with Tomek
# Links. Only the training split is resampled; the test split keeps its
# original class distribution.
smote = SMOTE(random_state=0)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
tomek = TomekLinks()
X_train_res, y_train_res = tomek.fit_resample(X_smote, y_smote)

# Candidate models paired with the hyper-parameter search space for each.
# Every entry is (unfitted estimator, parameter distributions dict).
clfs = [
    (
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']},
    ),
    (
        MLPClassifier(random_state=0),
        {
            'batch_size': [32, 64, 128],
            'learning_rate': ['constant', 'adaptive'],
            'activation': ['tanh', 'relu'],
            'solver': ['sgd', 'adam'],
        },
    ),
    (
        # The default 'lbfgs' solver rejects penalty='l1', so every sampled
        # l1 candidate would fail to fit. 'liblinear' supports both l1 and l2.
        LogisticRegression(solver='liblinear', random_state=0),
        {'C': np.arange(0.1, 1.1, 0.1), 'penalty': ['l1', 'l2']},
    ),
    (
        RandomForestClassifier(random_state=0),
        {'n_estimators': [100, 200, 300], 'max_depth': [3, 4, 5]},
    ),
    (
        GradientBoostingClassifier(random_state=0),
        {'n_estimators': [100, 200, 300], 'learning_rate': [1, 0.1, 0.01]},
    ),
    (
        XGBClassifier(tree_method='hist', random_state=0),
        {
            'min_child_weight': range(0, 121, 20),
            'learning_rate': np.arange(0.1, 0.6, 0.1),
            'subsample': np.arange(0.5, 1.0, 0.1),
        },
    ),
    (
        # LightGBM only applies `subsample` when subsample_freq > 0; without
        # it the `subsample` values searched below would have no effect.
        LGBMClassifier(subsample_freq=1, random_state=0),
        {
            'min_child_weight': range(0, 121, 20),
            'learning_rate': np.arange(0.1, 0.6, 0.1),
            'subsample': np.arange(0.5, 1.0, 0.1),
        },
    ),
]

# Collected results: one (model name, fitted search object, test ROC AUC)
# tuple per candidate model.
clfs_tuned = []

# Run a randomized hyper-parameter search for every candidate model.
# NOTE(review): the training data was resampled before being handed to the
# search, so the internal 3-fold CV scores are computed on resampled folds and
# may be optimistic; the held-out test score below is the one to compare.
for clf, param_grid in tqdm(clfs):
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    rand_search = RandomizedSearchCV(
        clf, param_grid, n_iter=5, scoring='roc_auc',
        cv=3, random_state=0, n_jobs=-1,
    )
    rand_search.fit(X_train_res, y_train_res)
    clf_name = type(clf).__name__
    # Scored with the search's `scoring` metric (ROC AUC) on the untouched
    # test split, using the refit best estimator.
    clf_score = rand_search.score(X_test, y_test)
    elapsed = time.perf_counter() - start
    # tqdm.write prints the line without corrupting the active progress bar.
    tqdm.write(f'{clf_name:30} {clf_score:30f} {elapsed:.1f}초')
    clfs_tuned.append((clf_name, rand_search, clf_score))
