## 0. 패키지 임포트

In [5]:
# 임포트
# 데이터 전처리 및 기본 라이브러리
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler


# 머신러닝 모델 임포트
from sklearn.tree import DecisionTreeClassifier  # 결정 트리
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier   # 랜덤 포레스트 , 그라디언트 부스팅
from xgboost import XGBClassifier  # XGBoost
from lightgbm import LGBMClassifier  # LightGBM
from sklearn.linear_model import LogisticRegression  # 로지스틱 회귀
from sklearn.neighbors import KNeighborsClassifier  # KNN

# 평가 지표
from sklearn.metrics import accuracy_score, f1_score, r2_score, confusion_matrix, classification_report

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

## 1. 하이퍼 파라미터를 찾기 위한 데이터 셋 불러오기

In [None]:
# 1. 데이터 로드
final_merged_data = pd.read_csv('./data/merged_data_cleaned.csv')

# 2. Feature Set 정의 (원본 데이터 유지)
sum_clicked_features = ['highest_education', 'imd_band', 'log_sum_click', 'log_studied_credits', 'scaled_score']
mean_clicked_features = ['highest_education', 'imd_band', 'log_mean_click', 'log_studied_credits', 'scaled_score']

sum_x = final_merged_data[sum_clicked_features]
mean_x = final_merged_data[mean_clicked_features]
y = final_merged_data['final_result']

# 3. 학습 데이터와 테스트 데이터 분리 (Train 90%, Test 10%)
sum_x_train, sum_x_test, y_train, y_test = train_test_split(sum_x, y, test_size=0.1, random_state=42, stratify=y)
mean_x_train, mean_x_test, y_train, y_test = train_test_split(mean_x, y, test_size=0.1, random_state=42, stratify=y)

# 4. 스케일링 (RobustScaler 사용)
scaler = RobustScaler()
sum_x_train_scaled = scaler.fit_transform(sum_x_train).astype(np.float64)
sum_x_test_scaled = scaler.transform(sum_x_test).astype(np.float64)

mean_x_train_scaled = scaler.fit_transform(mean_x_train).astype(np.float64)
mean_x_test_scaled = scaler.transform(mean_x_test).astype(np.float64)

## 2. 랜덤포레스트 파라미터

In [None]:
# 9. 하이퍼파라미터 튜닝 (GridSearchCV 적용)
param_grid = {
    'n_estimators': [300],  # 트리 개수
    'max_depth': [5, 10, 20],  # 트리 깊이
    'min_samples_split': [2, 5],  # 내부 노드 분할 최소 샘플 수
    'min_samples_leaf': [1, 2]  # 리프 노드 최소 샘플 수
}

grid_search_sum = GridSearchCV(RandomForestClassifier(random_state=42),
                               param_grid,
                               cv=3,  # 3-Fold Cross Validation
                               scoring='accuracy',
                               n_jobs=-1)
grid_search_sum.fit(sum_x_train_scaled, y_train)

grid_search_mean = GridSearchCV(RandomForestClassifier(random_state=42),
                                param_grid,
                                cv=3,  # 3-Fold Cross Validation
                                scoring='accuracy',
                                n_jobs=-1)
grid_search_mean.fit(mean_x_train_scaled, y_train)

# 10. 최적의 하이퍼파라미터 및 성능 확인
best_sum_model = grid_search_sum.best_estimator_
best_mean_model = grid_search_mean.best_estimator_

best_params_sum = grid_search_sum.best_params_
best_params_mean = grid_search_mean.best_params_

print("\n=== Sum Click 기반 최적화 Random Forest 성능 ===")
print(f"Best Parameters: {best_params_sum}")

print("\n=== Mean Click 기반 최적화 Random Forest 성능 ===")
print(f"Best Parameters: {best_params_mean}")

# 11. 최적 모델 평가 (Sum Click)
y_pred_best_sum = best_sum_model.predict(sum_x_test_scaled)
acc_best_sum = accuracy_score(y_test, y_pred_best_sum) * 100
f1_best_sum = f1_score(y_test, y_pred_best_sum, average='weighted') * 100
r2_best_sum = r2_score(y_test, y_pred_best_sum) * 100

print(f"Best Accuracy (Sum Click): {acc_best_sum:.4f}")
print(f"Best F1 Score (Sum Click): {f1_best_sum:.4f}")
print(f"Best R2 Score (Sum Click): {r2_best_sum:.4f}")

# 12. 최적 모델 평가 (Mean Click)
y_pred_best_mean = best_mean_model.predict(mean_x_test_scaled)
acc_best_mean = accuracy_score(y_test, y_pred_best_mean) * 100
f1_best_mean = f1_score(y_test, y_pred_best_mean, average='weighted') * 100
r2_best_mean = r2_score(y_test, y_pred_best_mean) * 100

print(f"Best Accuracy (Mean Click): {acc_best_mean:.4f}")
print(f"Best F1 Score (Mean Click): {f1_best_mean:.4f}")
print(f"Best R2 Score (Mean Click): {r2_best_mean:.4f}")

# 13. Feature Importance 시각화 (Sum Click)
plt.figure(figsize=(10, 5))
feature_importances_sum = pd.Series(best_sum_model.feature_importances_, index=sum_clicked_features).sort_values(ascending=False)
sns.barplot(x=feature_importances_sum, y=feature_importances_sum.index, palette='viridis')
plt.title("Feature Importance - Sum Click Based Random Forest")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

# 14. Feature Importance 시각화 (Mean Click)
plt.figure(figsize=(10, 5))
feature_importances_mean = pd.Series(best_mean_model.feature_importances_, index=mean_clicked_features).sort_values(ascending=False)
sns.barplot(x=feature_importances_mean, y=feature_importances_mean.index, palette='viridis')
plt.title("Feature Importance - Mean Click Based Random Forest")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()



=== Sum Click 기반 최적화 Random Forest 성능 ===
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}

=== Mean Click 기반 최적화 Random Forest 성능 ===
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}

## 3. 그리디언트 부스팅 파라미터

In [None]:
# 9. 하이퍼파라미터 튜닝 (GridSearchCV 적용)
param_grid = {
    'n_estimators': [100, 150],  # 트리 개수
    'learning_rate': [0.1, 0.15],  # 학습률
    'max_depth': [10],  # 트리 깊이
    'min_samples_split': [2, 5],  # 내부 노드 분할 최소 샘플 수
    'min_samples_leaf': [1, 2]  # 리프 노드 최소 샘플 수
}

grid_search_sum = GridSearchCV(GradientBoostingClassifier(random_state=42),
                               param_grid,
                               cv=3,  # ✅ 3-Fold Cross Validation
                               scoring='accuracy',
                               n_jobs=-1)
grid_search_sum.fit(sum_x_train_scaled, y_train)

grid_search_mean = GridSearchCV(GradientBoostingClassifier(random_state=42),
                                param_grid,
                                cv=3,  # ✅ 3-Fold Cross Validation
                                scoring='accuracy',
                                n_jobs=-1)
grid_search_mean.fit(mean_x_train_scaled, y_train)

# 10. 최적의 하이퍼파라미터 및 성능 확인
best_sum_model = grid_search_sum.best_estimator_
best_mean_model = grid_search_mean.best_estimator_

best_params_sum = grid_search_sum.best_params_
best_params_mean = grid_search_mean.best_params_

print("\n=== Sum Click 기반 최적화 Gradient Boosting 성능 ===")
print(f"Best Parameters: {best_params_sum}")

print("\n=== Mean Click 기반 최적화 Gradient Boosting 성능 ===")
print(f"Best Parameters: {best_params_mean}")

# 11. 최적 모델 평가 (Sum Click)
y_pred_best_sum = best_sum_model.predict(sum_x_test_scaled)
acc_best_sum = accuracy_score(y_test, y_pred_best_sum) * 100
f1_best_sum = f1_score(y_test, y_pred_best_sum, average='weighted') * 100
r2_best_sum = r2_score(y_test, y_pred_best_sum) * 100

print(f"Best Accuracy (Sum Click): {acc_best_sum:.4f}")
print(f"Best F1 Score (Sum Click): {f1_best_sum:.4f}")
print(f"Best R2 Score (Sum Click): {r2_best_sum:.4f}")

# 12. 최적 모델 평가 (Mean Click)
y_pred_best_mean = best_mean_model.predict(mean_x_test_scaled)
acc_best_mean = accuracy_score(y_test, y_pred_best_mean) * 100
f1_best_mean = f1_score(y_test, y_pred_best_mean, average='weighted') * 100
r2_best_mean = r2_score(y_test, y_pred_best_mean) * 100

print(f"Best Accuracy (Mean Click): {acc_best_mean:.4f}")
print(f"Best F1 Score (Mean Click): {f1_best_mean:.4f}")
print(f"Best R2 Score (Mean Click): {r2_best_mean:.4f}")

# 13. Feature Importance 시각화 (Sum Click)
plt.figure(figsize=(10, 5))
feature_importances_sum = pd.Series(best_sum_model.feature_importances_, index=sum_clicked_features).sort_values(ascending=False)
sns.barplot(x=feature_importances_sum, y=feature_importances_sum.index, palette='viridis')
plt.title("Feature Importance - Sum Click Based GBM")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

# 14. Feature Importance 시각화 (Mean Click)
plt.figure(figsize=(10, 5))
feature_importances_mean = pd.Series(best_mean_model.feature_importances_, index=mean_clicked_features).sort_values(ascending=False)
sns.barplot(x=feature_importances_mean, y=feature_importances_mean.index, palette='viridis')
plt.title("Feature Importance - Mean Click Based GBM")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

=== Sum Click 기반 최적화 Gradient Boosting 성능 ===
Best Parameters: {'subsample': 0.9, 'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 9, 'learning_rate': 0.15}

=== Mean Click 기반 최적화 Gradient Boosting 성능 ===
Best Parameters: {'subsample': 0.8, 'n_estimators': 600, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': 9, 'learning_rate': 0.1}

## 4. 결정 트리 파라미터

In [None]:

# 7. 하이퍼파라미터 튜닝 (GridSearchCV)
param_grid = {
    'max_depth': [3, 5, 10, 20, None],  # 트리 깊이
    'min_samples_split': [2, 5, 10],  # 내부 노드 분할 최소 샘플 수
    'min_samples_leaf': [1, 2, 4],  # 리프 노드 최소 샘플 수
    'criterion': ['gini', 'entropy']  # 분할 기준
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# 8. 최적의 하이퍼파라미터 및 성능 확인
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\n=== 하이퍼파라미터 최적화 후 Decision Tree 성능 ===")
print(f"Best Parameters: {best_params}")

# 9. 최적 모델 평가
y_pred_best = best_model.predict(X_test_scaled)
acc_best = accuracy_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best, average='weighted')

print(f"Best Accuracy: {acc_best:.4f}")
print(f"Best F1 Score: {f1_best:.4f}")

# 10. 혼동행렬 시각화
plt.figure(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred_best), annot=True, fmt='d', cmap='Blues', xticklabels=['Not Withdrawn', 'Withdrawn'], yticklabels=['Not Withdrawn', 'Withdrawn'])
plt.title("Confusion Matrix - Decision Tree (Optimized)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# 11. Feature Importance 시각화
feature_importances = pd.Series(best_model.feature_importances_, index=data.drop(columns=['final_result','id_student','id_assessment','code_module']).columns).sort_values(ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x=feature_importances, y=feature_importances.index, palette='viridis')
plt.title("Feature Importance - Decision Tree")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

=== 하이퍼파라미터 최적화 후 Decision Tree 성능 ===
Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Accuracy: 0.8924
Best F1 Score: 0.8939

## 5. LightGBM 파라미터

In [None]:
# 7. 하이퍼파라미터 튜닝 (RandomizedSearchCV 적용)
param_dist = {
    'n_estimators': [100, 200, 500],  # 트리 개수
    'learning_rate': [0.01, 0.05, 0.1, 0.15],  # 학습률
    'max_depth': [3, 5, 7, 10],  # 트리 깊이
    'num_leaves': [20, 31, 50, 100],  # 리프 노드 개수
    'min_child_samples': [10, 20, 30, 50],  # 리프 노드 최소 샘플 수
    'subsample': [0.7, 0.8, 0.9, 1.0],  # 샘플링 비율
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],  # 트리별 특성 샘플링 비율
    'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 정규화
    'reg_lambda': [0, 0.1, 0.5, 1.0]  # L2 정규화
}

# RandomizedSearchCV 실행
random_search_sum = RandomizedSearchCV(LGBMClassifier(random_state=42), 
                                       param_distributions=param_dist, 
                                       n_iter=20,  # 랜덤 탐색 횟수 (너무 크면 시간 오래 걸림)
                                       cv=3, 
                                       scoring='accuracy', 
                                       n_jobs=-1, 
                                       verbose=1)
random_search_sum.fit(sum_x_train_scaled, y_train)

random_search_mean = RandomizedSearchCV(LGBMClassifier(random_state=42), 
                                        param_distributions=param_dist, 
                                        n_iter=20,  
                                        cv=3, 
                                        scoring='accuracy', 
                                        n_jobs=-1, 
                                        verbose=1)
random_search_mean.fit(mean_x_train_scaled, y_train)

# 8. 최적 하이퍼파라미터 확인
best_sum_model = random_search_sum.best_estimator_
best_mean_model = random_search_mean.best_estimator_

print("\n=== Sum Click 기반 최적화 LightGBM 성능 ===")
print(f"Best Parameters: {random_search_sum.best_params_}")

print("\n=== Mean Click 기반 최적화 LightGBM 성능 ===")
print(f"Best Parameters: {random_search_mean.best_params_}")

# 9. 최적 모델 성능 평가
y_pred_best_sum = best_sum_model.predict(sum_x_test_scaled)
y_pred_best_mean = best_mean_model.predict(mean_x_test_scaled)

evaluate_model("Best Sum Click 기반 LightGBM", y_test, y_pred_best_sum)
evaluate_model("Best Mean Click 기반 LightGBM", y_test, y_pred_best_mean)

# 10. Feature Importance 시각화
plt.figure(figsize=(10, 5))
feature_importances_sum = pd.Series(best_sum_model.feature_importances_, index=sum_clicked_features).sort_values(ascending=False)
sns.barplot(x=feature_importances_sum, y=feature_importances_sum.index, palette='viridis')
plt.title("Feature Importance - Sum Click Based LightGBM")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()

plt.figure(figsize=(10, 5))
feature_importances_mean = pd.Series(best_mean_model.feature_importances_, index=mean_clicked_features).sort_values(ascending=False)
sns.barplot(x=feature_importances_mean, y=feature_importances_mean.index, palette='viridis')
plt.title("Feature Importance - Mean Click Based LightGBM")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()


=== Sum Click 기반 최적화 LightGBM 성능 ===
Best Parameters: {'subsample': 0.7, 'reg_lambda': 0.1, 'reg_alpha': 0.5, 'num_leaves': 31, 'n_estimators': 500, 'min_child_samples': 30, 'max_depth': 5, 'learning_rate': 0.15, 'colsample_bytree': 1.0}

=== Mean Click 기반 최적화 LightGBM 성능 ===
Best Parameters: {'subsample': 0.7, 'reg_lambda': 1.0, 'reg_alpha': 0.1, 'num_leaves': 100, 'n_estimators': 200, 'min_child_samples': 20, 'max_depth': 7, 'learning_rate': 0.15, 'colsample_bytree': 0.8}


## 6. 로지스틱 회귀 파라미터

In [None]:

# 7. 하이퍼파라미터 튜닝 (GridSearchCV)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # 규제 강도 (작을수록 강한 규제)
    'solver': ['liblinear', 'lbfgs']  # 최적화 알고리즘
}

grid_search = GridSearchCV(LogisticRegression(random_state=42, max_iter=1000), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# 8. 최적의 하이퍼파라미터 및 성능 확인
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\n=== 하이퍼파라미터 최적화 후 Logistic Regression 성능 ===")
print(f"Best Parameters: {best_params}")

# 9. 최적 모델 평가
y_pred_best = best_model.predict(X_test)
acc_best = accuracy_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best, average='weighted')

print(f"Best Accuracy: {acc_best:.4f}")
print(f"Best F1 Score: {f1_best:.4f}")


=== 하이퍼파라미터 최적화 후 Logistic Regression 성능 ===
Best Parameters: {'C': 0.01, 'solver': 'liblinear'}
Best Accuracy: 0.8671
Best F1 Score: 0.8065