In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import time

# 데이터 불러오기
df = pd.read_excel('C:/Users/NT551/Desktop/데이터 추출(연습)/high_diamond_ranked_10min5.xlsx')

# 타겟 변수와 피처 변수 설정
y = df['blueWins']
X = df.drop(columns=['blueWins', 'gameId'])

# 데이터 전처리: 피처 스케일링
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 학습 데이터와 검증 데이터로 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 하이퍼파라미터 설정
params_list = [
    {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.1, 'reg_lambda': 10, 'reg_alpha': 1},

    # 추가적인 하이퍼파라미터 조합
]

best_f1 = 0
best_params = None
best_model = None

for params in params_list:
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        **params
    )
    
    model.fit(X_train, y_train, early_stopping_rounds=20, eval_set=[(X_valid, y_valid)], verbose=False)
    
    y_pred = model.predict(X_valid)
    y_pred_proba = model.predict_proba(X_valid)[:, 1]
    
    f1 = f1_score(y_valid, y_pred)
    if f1 > best_f1:
        best_f1 = f1
        best_params = params
        best_model = model

print(f'Best parameters: {best_params}')
print(f'Best model F1 Score: {best_f1:.4f}')

# 최적 모델로 예측
y_pred_best = best_model.predict(X_valid)
y_pred_proba = best_model.predict_proba(X_valid)[:, 1]

# 최적 모델 평가
accuracy_best = accuracy_score(y_valid, y_pred_best)
roc_auc_best = roc_auc_score(y_valid, y_pred_proba)

print(f'Best model accuracy: {accuracy_best:.4f}')
print(f'Best model ROC AUC: {roc_auc_best:.4f}')

# 혼동 행렬 시각화
conf_mat = confusion_matrix(y_valid, y_pred_best)
plt.figure(figsize=(7, 7))
sns.heatmap(conf_mat, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues')
plt.ylabel("Actual class")
plt.xlabel("Predicted class")
plt.title("Confusion Matrix")
plt.show()

# 학습 과정 시각화
evals_result = best_model.evals_result()  # XGBClassifier의 evals_result 호출

epochs = len(evals_result['validation_0']['logloss'])
x_axis = range(0, epochs)

# 로그 손실 시각화
plt.figure(figsize=(14, 7))
plt.plot(x_axis, evals_result['validation_0']['logloss'], label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Logloss')
plt.title('XGBoost Logloss over Epochs')
plt.legend()
plt.grid(True)
plt.show()




Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 400, 'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0.1, 'reg_lambda': 10, 'reg_alpha': 1}
Best model F1 Score: 0.4000
Best model accuracy: 0.6250
Best model ROC AUC: 0.8667


NameError: name 'confusion_matrix' is not defined