In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame(pd.read_csv('../data/before_scaling.csv'))
df.head(2)

Unnamed: 0,school_GP,school_MS,fromCity,sex,age,famMore3,liveTogether,Medu,Fedu,noParent,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,1,0,1,0,18,1,0,4,4,0,...,4,3,4,1,1,3,6,5,6,6
1,1,0,1,0,17,1,1,1,1,0,...,5,3,3,1,1,3,4,5,5,6


In [3]:
X = df.drop(columns=['G1', 'G2', 'G3'])
y = ((df['G1'] + df['G2'] + df['G3']) / 3).round()
y

0       6.0
1       5.0
2       8.0
3      15.0
4       9.0
       ... 
352     9.0
353    15.0
354     8.0
355    11.0
356     9.0
Length: 357, dtype: float64

In [4]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=20)

In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# 3. 모델 생성 및 학습
gbm = GradientBoostingRegressor(
    n_estimators=100,      # 트리 개수
    learning_rate=0.03,    # 학습률
    max_depth= 1,           # 개별 트리 깊이
    random_state=42
)
gbm.fit(X_train, y_train)

# 4. 예측
y_pred = gbm.predict(X_test)

# 5. 평가
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R² Score: {r2:.3f}")



MAE: 1.838
RMSE: 2.368
R² Score: -0.050


# hyperOpt이랑 optuna 써보기

In [6]:
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import numpy as np

X = df.drop(columns=['G1', 'G2', 'G3'])
y = ((df['G1'] + df['G2'] + df['G3']) / 3).round()
\

# RMSE 스코어러 정의
mse = make_scorer(lambda y_true, y_pred: mean_squared_error(y_true, y_pred))

# === Optuna 목적 함수 ===
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
    }

    model = xgb.XGBRegressor(**params)

    # 교차검증 (5-fold)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kf, scoring=mse)

    return scores.mean()  # Optuna는 score 최소화를 목표로 (RMSE 낮을수록 좋음)

# === Optuna 실행 ===
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=500)  # trial 수는 상황에 맞게 조정

# print("Best trial:", study.best_trial.params)
# print("Best RMSE:", study.best_value)
print(study.best_value)
print(study.best_params)

[I 2025-09-27 10:57:32,094] A new study created in memory with name: no-name-76f14193-29aa-4989-ac82-6a146a9fe552
[I 2025-09-27 10:57:32,897] Trial 0 finished with value: 9.026335912812982 and parameters: {'n_estimators': 798, 'max_depth': 7, 'learning_rate': 0.18812827849896518, 'subsample': 0.7561732597135126, 'colsample_bytree': 0.727604258281399, 'gamma': 0.601390368214173, 'reg_alpha': 0.15248341766597207, 'reg_lambda': 0.9189057363604279, 'min_child_weight': 1}. Best is trial 0 with value: 9.026335912812982.
[I 2025-09-27 10:57:33,156] Trial 1 finished with value: 9.475135467823714 and parameters: {'n_estimators': 181, 'max_depth': 4, 'learning_rate': 0.13693993537883534, 'subsample': 0.8713040456337351, 'colsample_bytree': 0.8261374969002144, 'gamma': 2.3870018923375325, 'reg_alpha': 0.6406847152894906, 'reg_lambda': 0.36918125143138647, 'min_child_weight': 9}. Best is trial 0 with value: 9.026335912812982.
[I 2025-09-27 10:57:33,454] Trial 2 finished with value: 11.511595243324

8.006748594332375
{'n_estimators': 119, 'max_depth': 4, 'learning_rate': 0.025468606404335348, 'subsample': 0.5070544287818559, 'colsample_bytree': 0.5595401394252664, 'gamma': 4.781560819789, 'reg_alpha': 0.7448694257732423, 'reg_lambda': 0.4575875275500041, 'min_child_weight': 2}


In [8]:
import logging
from datetime import datetime
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# === 로깅 설정 ===
logging.basicConfig(
    filename='xgb_optuna_log.txt',  # 로그 파일 이름
    level=logging.INFO,
    format='%(asctime)s - %(message)s',  # 시간 포함
    datefmt='%Y-%m-%d %H:%M:%S'
)

# 최적 하이퍼파라미터 불러오기
best_params = study.best_params
xgb_optuna = xgb.XGBRegressor(**best_params)

# 학습
xgb_optuna.fit(X_train, y_train)
optuna_pred = xgb_optuna.predict(X_test)

# 평가 지표 계산
mae = mean_absolute_error(y_test, optuna_pred)
mse = mean_squared_error(y_test, optuna_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, optuna_pred)

# 출력
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R² Score: {r2:.3f}")

# 로그 저장
logging.info("=== XGBoost Optuna Evaluation ===")
logging.info(f"Best Params: {best_params}")
logging.info(f"MAE: {mae:.3f}")
logging.info(f"RMSE: {rmse:.3f}")
logging.info(f"R² Score: {r2:.3f}")


MAE: 1.805
RMSE: 2.290
R² Score: 0.018


In [1]:
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import pandas as pd

#------------------------------------------------------
# 1. 데이터 전처리 개선
def preprocess_data(df):
    """데이터 전처리 함수"""
    df_processed = df.copy()
    
    # 범주형 변수 인코딩
    categorical_columns = df_processed.select_dtypes(include=['object']).columns
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le
    
    return df_processed, label_encoders

# 데이터 전처리
df_processed, encoders = preprocess_data(df)

# === 2. 타겟 변수 개선 ===
# 반올림 대신 원래 평균값 사용 (더 많은 정보 보존)
X = df_processed.drop(columns=['G1', 'G2', 'G3'])
y = (df_processed['G1'] + df_processed['G2'] + df_processed['G3']) / 3  # 반올림 제거

# === 3. 데이터 분할 및 스케일링 ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 수치형 데이터 스케일링 (XGBoost는 스케일링이 필수는 아니지만 도움이 될 수 있음)
scaler = StandardScaler()
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

if len(numeric_columns) > 0:
    X_train_scaled[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
    X_test_scaled[numeric_columns] = scaler.transform(X_test[numeric_columns])

# === 4. 개선된 스코어 정의 ===
def rmse_score(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse_score, greater_is_better=False)

# === 5. 개선된 Optuna 목적 함수 ===
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),  # 범위 확장
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),  # 범위 확장
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
        "n_jobs": -1  # 병렬 처리
    }

    model = xgb.XGBRegressor(**params)

    # 교차검증 (5-fold)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train_scaled, y_train, cv=kf, scoring=rmse_scorer)

    return -scores.mean()  # RMSE는 낮을수록 좋으므로 음수로 반환

# === 6. Optuna 실행 ===
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1000)  # 처음에는 적은 수로 테스트

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)

# === 7. 최적 모델로 성능 평가 ===
best_model = xgb.XGBRegressor(**study.best_params)
best_model.fit(X_train_scaled, y_train)

# 예측 및 평가
y_pred_train = best_model.predict(X_train_scaled)
y_pred_test = best_model.predict(X_test_scaled)

print("\n=== 성능 평가 ===")
print(f"Train RMSE: {rmse_score(y_train, y_pred_train):.4f}")
print(f"Test RMSE: {rmse_score(y_test, y_pred_test):.4f}")
print(f"Train R²: {r2_score(y_train, y_pred_train):.4f}")
print(f"Test R²: {r2_score(y_test, y_pred_test):.4f}")

# === 8. 피처 중요도 확인 ===
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n=== 피처 중요도 Top 10 ===")
print(feature_importance.head(10))

print("\n=== 피처 중요도 Bottom 10 ===")
print(feature_importance.tail(10))

# === 9. 추가 개선 방안 ===
print("\n=== 추가 개선 제안 ===")
print("1. 피처 엔지니어링: 새로운 파생 변수 생성")
print("2. 앙상블 모델: Random Forest, LightGBM과 결합")
print("3. 이상치 제거: IQR 방법으로 이상치 탐지 및 제거")
print("4. 피처 선택: 중요도가 낮은 피처 제거")

KeyboardInterrupt: 

In [None]:
import optuna
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

#-------------------------------------------------------
# 1. 데이터 전처리
#-------------------------------------------------------
def preprocess_data(df):
    df_processed = df.copy()
    categorical_columns = df_processed.select_dtypes(include=['object']).columns
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le
    
    return df_processed, label_encoders

df_processed, encoders = preprocess_data(df)

# 타겟 변수
X = df_processed.drop(columns=['G1', 'G2', 'G3'])
y = (df_processed['G1'] + df_processed['G2'] + df_processed['G3']) / 3

# 피처 선택
selector = SelectKBest(score_func=f_regression, k=min(15, X.shape[1]))
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42
)

# 스케일링 (선형 모델용)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"훈련 데이터 크기: {X_train.shape}")
print(f"테스트 데이터 크기: {X_test.shape}")
print(f"선택된 피처: {list(selected_features)}")

#-------------------------------------------------------
# 2. 개별 모델 정의 및 최적화
#-------------------------------------------------------
class EnsembleOptimizer:
    def __init__(self, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.X_train_scaled = X_train_scaled
        self.X_test_scaled = X_test_scaled
        self.models = {}
        self.predictions_train = {}
        self.predictions_test = {}
        
    def optimize_xgboost(self, n_trials=50):
        """XGBoost 최적화"""
        def objective(trial):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "max_depth": trial.suggest_int("max_depth", 3, 8),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "subsample": trial.suggest_float("subsample", 0.7, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
                "reg_alpha": trial.suggest_float("reg_alpha", 0, 3),
                "reg_lambda": trial.suggest_float("reg_lambda", 0, 3),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
                "random_state": 42
            }
            
            model = xgb.XGBRegressor(**params)
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            scores = cross_val_score(model, self.X_train, self.y_train, cv=kf, 
                                   scoring='neg_root_mean_squared_error')
            return -scores.mean()
        
        study = optuna.create_study(direction="minimize", study_name="xgboost")
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        
        best_xgb = xgb.XGBRegressor(**study.best_params)
        return best_xgb, study.best_value
    
    def optimize_lightgbm(self, n_trials=50):
        """LightGBM 최적화"""
        def objective(trial):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "max_depth": trial.suggest_int("max_depth", 3, 8),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "subsample": trial.suggest_float("subsample", 0.7, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
                "reg_alpha": trial.suggest_float("reg_alpha", 0, 3),
                "reg_lambda": trial.suggest_float("reg_lambda", 0, 3),
                "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10),
                "random_state": 42,
                "verbose": -1
            }
            
            model = lgb.LGBMRegressor(**params)
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            scores = cross_val_score(model, self.X_train, self.y_train, cv=kf, 
                                   scoring='neg_root_mean_squared_error')
            return -scores.mean()
        
        study = optuna.create_study(direction="minimize", study_name="lightgbm")
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        
        best_lgb = lgb.LGBMRegressor(**study.best_params)
        return best_lgb, study.best_value
    
    def get_base_models(self):
        """기본 모델들 정의"""
        models = {
            'rf': RandomForestRegressor(n_estimators=200, max_depth=6, min_samples_split=5, 
                                      min_samples_leaf=2, random_state=42),
            'extra_trees': ExtraTreesRegressor(n_estimators=200, max_depth=6, min_samples_split=5,
                                             min_samples_leaf=2, random_state=42),
            'gbm': GradientBoostingRegressor(n_estimators=200, max_depth=4, learning_rate=0.1,
                                           min_samples_split=5, min_samples_leaf=2, random_state=42),
            'ridge': Ridge(alpha=1.0),
            'lasso': Lasso(alpha=0.1, random_state=42),
            'elastic': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42),
        }
        return models
    
    def train_all_models(self):
        """모든 모델 훈련"""
        print("=== 개별 모델 최적화 및 훈련 ===")
        
        # XGBoost 최적화
        print("XGBoost 최적화 중...")
        best_xgb, xgb_score = self.optimize_xgboost()
        self.models['xgb'] = best_xgb
        print(f"XGBoost 최적 CV RMSE: {xgb_score:.4f}")
        
        # LightGBM 최적화
        print("LightGBM 최적화 중...")
        best_lgb, lgb_score = self.optimize_lightgbm()
        self.models['lgb'] = best_lgb
        print(f"LightGBM 최적 CV RMSE: {lgb_score:.4f}")
        
        # 기본 모델들
        base_models = self.get_base_models()
        self.models.update(base_models)
        
        # 모든 모델 훈련 및 예측
        print("\n=== 개별 모델 성능 ===")
        for name, model in self.models.items():
            if name in ['ridge', 'lasso', 'elastic']:
                # 선형 모델은 스케일된 데이터 사용
                model.fit(self.X_train_scaled, self.y_train)
                train_pred = model.predict(self.X_train_scaled)
                test_pred = model.predict(self.X_test_scaled)
            else:
                # 트리 기반 모델은 원본 데이터 사용
                model.fit(self.X_train, self.y_train)
                train_pred = model.predict(self.X_train)
                test_pred = model.predict(self.X_test)
            
            self.predictions_train[name] = train_pred
            self.predictions_test[name] = test_pred
            
            train_r2 = r2_score(self.y_train, train_pred)
            test_r2 = r2_score(self.y_test, test_pred)
            test_rmse = np.sqrt(mean_squared_error(self.y_test, test_pred))
            
            print(f"{name:12} - Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}, "
                  f"Test RMSE: {test_rmse:.4f}")
    
    def optimize_ensemble_weights(self):
        """
        앙상블 가중치 최적화
        """
        print("\n=== 앙상블 가중치 최적화 ===")
        
        # 예측값들을 행렬로 변환
        train_preds = np.column_stack([self.predictions_train[name] for name in self.models.keys()])
        test_preds = np.column_stack([self.predictions_test[name] for name in self.models.keys()])
        
        # 가중치 최적화 함수
        def objective(weights):
            weights = weights / np.sum(weights)  # 정규화
            ensemble_pred = np.dot(train_preds, weights)
            return mean_squared_error(self.y_train, ensemble_pred)
        
        # 제약 조건: 가중치 합은 1, 모든 가중치는 양수
        constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
        bounds = [(0, 1) for _ in range(len(self.models))]
        
        # 초기 가중치 (균등)
        initial_weights = np.ones(len(self.models)) / len(self.models)
        
        # 최적화 실행
        result = minimize(objective, initial_weights, method='SLSQP', 
                         bounds=bounds, constraints=constraints)
        
        optimal_weights = result.x / np.sum(result.x)  # 정규화
        
        print("최적 가중치:")
        for name, weight in zip(self.models.keys(), optimal_weights):
            print(f"{name:12}: {weight:.4f}")
        
        return optimal_weights, train_preds, test_preds
    
    def evaluate_ensemble(self, weights, train_preds, test_preds):
        """
        앙상블 모델 평가
        
        """
        # 가중 평균 예측
        ensemble_train_pred = np.dot(train_preds, weights)
        ensemble_test_pred = np.dot(test_preds, weights)
        
        # 성능 계산
        train_r2 = r2_score(self.y_train, ensemble_train_pred)
        test_r2 = r2_score(self.y_test, ensemble_test_pred)
        train_rmse = np.sqrt(mean_squared_error(self.y_train, ensemble_train_pred))
        test_rmse = np.sqrt(mean_squared_error(self.y_test, ensemble_test_pred))
        
        print(f"\n=== 최종 앙상블 성능 ===")
        print(f"Train R²: {train_r2:.4f}")
        print(f"Test R²: {test_r2:.4f}")
        print(f"Train RMSE: {train_rmse:.4f}")
        print(f"Test RMSE: {test_rmse:.4f}")
        print(f"과적합 정도 (R² 차이): {train_r2 - test_r2:.4f}")
        
        return ensemble_train_pred, ensemble_test_pred

#-------------------------------------------------------
# 3. 앙상블 실행
#-------------------------------------------------------
optimizer = EnsembleOptimizer(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled)

# 모든 모델 훈련
optimizer.train_all_models()

# 앙상블 가중치 최적화
optimal_weights, train_preds, test_preds = optimizer.optimize_ensemble_weights()

# 최종 앙상블 평가
ensemble_train_pred, ensemble_test_pred = optimizer.evaluate_ensemble(
    optimal_weights, train_preds, test_preds)

#-------------------------------------------------------
# 4. 추가 앙상블 기법들
#-------------------------------------------------------
print("\n=== 다른 앙상블 방법들과 비교 ===")

# 1. 단순 평균 앙상블
simple_avg_train = np.mean(train_preds, axis=1)
simple_avg_test = np.mean(test_preds, axis=1)
simple_avg_r2 = r2_score(y_test, simple_avg_test)
print(f"단순 평균 앙상블 Test R²: {simple_avg_r2:.4f}")

# 2. 중간값 앙상블
median_train = np.median(train_preds, axis=1)
median_test = np.median(test_preds, axis=1)
median_r2 = r2_score(y_test, median_test)
print(f"중간값 앙상블 Test R²: {median_r2:.4f}")

# 3. 상위 3개 모델만 사용한 앙상블
model_performance = {}
for i, name in enumerate(optimizer.models.keys()):
    test_r2 = r2_score(y_test, test_preds[:, i])
    model_performance[name] = test_r2

top3_models = sorted(model_performance.items(), key=lambda x: x[1], reverse=True)[:3]
top3_indices = [list(optimizer.models.keys()).index(name) for name, _ in top3_models]
top3_preds = test_preds[:, top3_indices]
top3_avg = np.mean(top3_preds, axis=1)
top3_r2 = r2_score(y_test, top3_avg)

print(f"상위 3개 모델 앙상블 Test R²: {top3_r2:.4f}")
print(f"상위 3개 모델: {[name for name, _ in top3_models]}")

#-------------------------------------------------------
#  5. 최종 권장사항
#------------------------------------------------------- 
print(f"\n=== 최종 결과 요약 ===")
print(f"가중 평균 앙상블 Test R²: {r2_score(y_test, ensemble_test_pred):.4f}")
print(f"단순 평균 앙상블 Test R²: {simple_avg_r2:.4f}")
print(f"상위 3개 모델 앙상블 Test R²: {top3_r2:.4f}")

best_method = max([
    ("가중 평균", r2_score(y_test, ensemble_test_pred)),
    ("단순 평균", simple_avg_r2),
    ("상위 3개", top3_r2)
], key=lambda x: x[1])

print(f"\n최고 성능: {best_method[0]} 앙상블 (R² = {best_method[1]:.4f})")

#-------------------------------------------------------
# 6. 모델 저장을 위한 클래스
#-------------------------------------------------------
class FinalEnsembleModel:
    def __init__(self, models, weights, scaler, selector, selected_features):
        self.models = models
        self.weights = weights
        self.scaler = scaler
        self.selector = selector
        self.selected_features = selected_features
    
    def predict(self, X):
        # 피처 선택
        X_selected = self.selector.transform(X)
        
        predictions = []
        for name, model in self.models.items():
            if name in ['ridge', 'lasso', 'elastic']:
                X_scaled = self.scaler.transform(X_selected)
                pred = model.predict(X_scaled)
            else:
                pred = model.predict(X_selected)
            predictions.append(pred)
        
        # 가중 평균
        predictions = np.column_stack(predictions)
        return np.dot(predictions, self.weights)

# 최종 모델 생성
final_ensemble = FinalEnsembleModel(
    optimizer.models, optimal_weights, scaler, selector, selected_features
)

print(f"\n앙상블 모델 준비완료")
print(f"사용된 모델 수: {len(optimizer.models)}")
print(f"선택된 피처 수: {len(selected_features)}")

[I 2025-09-27 11:24:41,716] A new study created in memory with name: xgboost


훈련 데이터 크기: (249, 15)
테스트 데이터 크기: (108, 15)
선택된 피처: ['fromCity', 'sex', 'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'schoolsup', 'higher', 'internet', 'goout', 'Dalc', 'Walc', 'absences']
=== 개별 모델 최적화 및 훈련 ===
XGBoost 최적화 중...


[I 2025-09-27 11:24:42,137] Trial 0 finished with value: 3.1758317166228545 and parameters: {'n_estimators': 258, 'max_depth': 5, 'learning_rate': 0.07384665932076019, 'subsample': 0.928175875393128, 'colsample_bytree': 0.7758040422924897, 'reg_alpha': 2.962338160185446, 'reg_lambda': 2.1875734857549523, 'min_child_weight': 1}. Best is trial 0 with value: 3.1758317166228545.
[I 2025-09-27 11:24:42,834] Trial 1 finished with value: 3.2406763196465276 and parameters: {'n_estimators': 376, 'max_depth': 8, 'learning_rate': 0.050069903870827374, 'subsample': 0.9481122201801737, 'colsample_bytree': 0.8663580261322481, 'reg_alpha': 1.207759475559058, 'reg_lambda': 0.49845585417375615, 'min_child_weight': 6}. Best is trial 0 with value: 3.1758317166228545.
[I 2025-09-27 11:24:43,325] Trial 2 finished with value: 3.3304446108875867 and parameters: {'n_estimators': 486, 'max_depth': 3, 'learning_rate': 0.06452994548703436, 'subsample': 0.9428243599075931, 'colsample_bytree': 0.8809178944753319, 

XGBoost 최적 CV RMSE: 3.0236
LightGBM 최적화 중...


[I 2025-09-27 11:24:51,080] Trial 0 finished with value: 3.114023195131196 and parameters: {'n_estimators': 160, 'max_depth': 6, 'learning_rate': 0.1369034030170087, 'subsample': 0.7897445363285214, 'colsample_bytree': 0.8474292594359101, 'reg_alpha': 0.7552850370902942, 'reg_lambda': 2.6915395638148794, 'min_child_weight': 6.988259650029541}. Best is trial 0 with value: 3.114023195131196.
[I 2025-09-27 11:24:51,192] Trial 1 finished with value: 2.9744882394450407 and parameters: {'n_estimators': 163, 'max_depth': 6, 'learning_rate': 0.0916692215428149, 'subsample': 0.7336882250388455, 'colsample_bytree': 0.9936511451209492, 'reg_alpha': 2.3504264084092674, 'reg_lambda': 2.485858795938624, 'min_child_weight': 9.354186693367135}. Best is trial 1 with value: 2.9744882394450407.
[I 2025-09-27 11:24:51,357] Trial 2 finished with value: 2.883964799209346 and parameters: {'n_estimators': 323, 'max_depth': 3, 'learning_rate': 0.0447464890949277, 'subsample': 0.906642843208181, 'colsample_bytr

LightGBM 최적 CV RMSE: 2.8041

=== 개별 모델 성능 ===
xgb          - Train R²: 0.8098, Test R²: 0.1246, Test RMSE: 2.8601
lgb          - Train R²: 0.3700, Test R²: 0.1716, Test RMSE: 2.7824
rf           - Train R²: 0.5351, Test R²: 0.2008, Test RMSE: 2.7328
extra_trees  - Train R²: 0.4810, Test R²: 0.2139, Test RMSE: 2.7103
gbm          - Train R²: 0.8850, Test R²: 0.0348, Test RMSE: 3.0033
ridge        - Train R²: 0.2347, Test R²: 0.2564, Test RMSE: 2.6360
lasso        - Train R²: 0.2257, Test R²: 0.2388, Test RMSE: 2.6670
elastic      - Train R²: 0.2308, Test R²: 0.2502, Test RMSE: 2.6470

=== 앙상블 가중치 최적화 ===
최적 가중치:
xgb         : 0.0000
lgb         : 0.0000
rf          : 0.0000
extra_trees : 0.0000
gbm         : 1.0000
ridge       : 0.0000
lasso       : 0.0000
elastic     : 0.0000

=== 최종 앙상블 성능 ===
Train R²: 0.8850
Test R²: 0.0348
Train RMSE: 1.0634
Test RMSE: 3.0033
과적합 정도 (R² 차이): 0.8502

=== 다른 앙상블 방법들과 비교 ===
단순 평균 앙상블 Test R²: 0.2314
중간값 앙상블 Test R²: 0.2358
상위 3개 모델 앙상블 Test R²: 0.250

In [15]:
import optuna
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import joblib
import warnings
warnings.filterwarnings('ignore')

# === 1. 데이터 전처리 ===
def preprocess_data(df):
    df_processed = df.copy()
    categorical_columns = df_processed.select_dtypes(include=['object']).columns
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le
    
    return df_processed, label_encoders

df_processed, encoders = preprocess_data(df)

# 타겟 변수
X = df_processed.drop(columns=['G1', 'G2', 'G3'])
y = (df_processed['G1'] + df_processed['G2'] + df_processed['G3']) / 3

# 피처 선택
selector = SelectKBest(score_func=f_regression, k=min(15, X.shape[1]))
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42
)

# 스케일링 (선형 모델용)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"훈련 데이터 크기: {X_train.shape}")
print(f"테스트 데이터 크기: {X_test.shape}")
print(f"선택된 피처: {list(selected_features)}")

# === 2. 개별 모델 정의 및 최적화 ===

class EnsembleOptimizer:
    def __init__(self, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.X_train_scaled = X_train_scaled
        self.X_test_scaled = X_test_scaled
        self.models = {}
        self.predictions_train = {}
        self.predictions_test = {}
        
    def optimize_xgboost(self, n_trials=50):
        """XGBoost 최적화"""
        def objective(trial):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "max_depth": trial.suggest_int("max_depth", 3, 8),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "subsample": trial.suggest_float("subsample", 0.7, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
                "reg_alpha": trial.suggest_float("reg_alpha", 0, 3),
                "reg_lambda": trial.suggest_float("reg_lambda", 0, 3),
                "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
                "random_state": 42
            }
            
            model = xgb.XGBRegressor(**params)
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            scores = cross_val_score(model, self.X_train, self.y_train, cv=kf, 
                                   scoring='neg_root_mean_squared_error')
            return -scores.mean()
        
        study = optuna.create_study(direction="minimize", study_name="xgboost")
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        
        best_xgb = xgb.XGBRegressor(**study.best_params)
        return best_xgb, study.best_value
    
    def optimize_lightgbm(self, n_trials=50):
        """LightGBM 최적화"""
        def objective(trial):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 500),
                "max_depth": trial.suggest_int("max_depth", 3, 8),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
                "subsample": trial.suggest_float("subsample", 0.7, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
                "reg_alpha": trial.suggest_float("reg_alpha", 0, 3),
                "reg_lambda": trial.suggest_float("reg_lambda", 0, 3),
                "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10),
                "random_state": 42,
                "verbose": -1
            }
            
            model = lgb.LGBMRegressor(**params)
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            scores = cross_val_score(model, self.X_train, self.y_train, cv=kf, 
                                   scoring='neg_root_mean_squared_error')
            return -scores.mean()
        
        study = optuna.create_study(direction="minimize", study_name="lightgbm")
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        
        best_lgb = lgb.LGBMRegressor(**study.best_params)
        return best_lgb, study.best_value
    
    def get_base_models(self):
        """기본 모델들 정의"""
        models = {
            'rf': RandomForestRegressor(n_estimators=200, max_depth=6, min_samples_split=5, 
                                      min_samples_leaf=2, random_state=42),
            'extra_trees': ExtraTreesRegressor(n_estimators=200, max_depth=6, min_samples_split=5,
                                             min_samples_leaf=2, random_state=42),
            'gbm': GradientBoostingRegressor(n_estimators=200, max_depth=4, learning_rate=0.1,
                                           min_samples_split=5, min_samples_leaf=2, random_state=42),
            'ridge': Ridge(alpha=1.0),
            'lasso': Lasso(alpha=0.1, random_state=42),
            'elastic': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42),
        }
        return models
    
    def train_all_models(self):
        """모든 모델 훈련"""
        print("=== 개별 모델 최적화 및 훈련 ===")
        
        # XGBoost 최적화
        print("XGBoost 최적화 중...")
        best_xgb, xgb_score = self.optimize_xgboost()
        self.models['xgb'] = best_xgb
        print(f"XGBoost 최적 CV RMSE: {xgb_score:.4f}")
        
        # LightGBM 최적화
        print("LightGBM 최적화 중...")
        best_lgb, lgb_score = self.optimize_lightgbm()
        self.models['lgb'] = best_lgb
        print(f"LightGBM 최적 CV RMSE: {lgb_score:.4f}")
        
        # 기본 모델들
        base_models = self.get_base_models()
        self.models.update(base_models)
        
        # 모든 모델 훈련 및 예측
        print("\n=== 개별 모델 성능 ===")
        for name, model in self.models.items():
            if name in ['ridge', 'lasso', 'elastic']:
                # 선형 모델은 스케일된 데이터 사용
                model.fit(self.X_train_scaled, self.y_train)
                train_pred = model.predict(self.X_train_scaled)
                test_pred = model.predict(self.X_test_scaled)
            else:
                # 트리 기반 모델은 원본 데이터 사용
                model.fit(self.X_train, self.y_train)
                train_pred = model.predict(self.X_train)
                test_pred = model.predict(self.X_test)
            
            self.predictions_train[name] = train_pred
            self.predictions_test[name] = test_pred
            
            train_r2 = r2_score(self.y_train, train_pred)
            test_r2 = r2_score(self.y_test, test_pred)
            test_rmse = np.sqrt(mean_squared_error(self.y_test, test_pred))
            
            print(f"{name:12} - Train R²: {train_r2:.4f}, Test R²: {test_r2:.4f}, "
                  f"Test RMSE: {test_rmse:.4f}")
    
    def optimize_ensemble_weights(self):
        """앙상블 가중치 최적화"""
        print("\n=== 앙상블 가중치 최적화 ===")
        
        # 예측값들을 행렬로 변환
        train_preds = np.column_stack([self.predictions_train[name] for name in self.models.keys()])
        test_preds = np.column_stack([self.predictions_test[name] for name in self.models.keys()])
        
        # 가중치 최적화 함수
        def objective(weights):
            weights = weights / np.sum(weights)  # 정규화
            ensemble_pred = np.dot(train_preds, weights)
            return mean_squared_error(self.y_train, ensemble_pred)
        
        # 제약 조건: 가중치 합은 1, 모든 가중치는 양수
        constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
        bounds = [(0, 1) for _ in range(len(self.models))]
        
        # 초기 가중치 (균등)
        initial_weights = np.ones(len(self.models)) / len(self.models)
        
        # 최적화 실행
        result = minimize(objective, initial_weights, method='SLSQP', 
                         bounds=bounds, constraints=constraints)
        
        optimal_weights = result.x / np.sum(result.x)  # 정규화
        
        print("최적 가중치:")
        for name, weight in zip(self.models.keys(), optimal_weights):
            print(f"{name:12}: {weight:.4f}")
        
        return optimal_weights, train_preds, test_preds
    
    def evaluate_ensemble(self, weights, train_preds, test_preds):
        """앙상블 모델 평가"""
        # 가중 평균 예측
        ensemble_train_pred = np.dot(train_preds, weights)
        ensemble_test_pred = np.dot(test_preds, weights)
        
        # 성능 계산
        train_r2 = r2_score(self.y_train, ensemble_train_pred)
        test_r2 = r2_score(self.y_test, ensemble_test_pred)
        train_rmse = np.sqrt(mean_squared_error(self.y_train, ensemble_train_pred))
        test_rmse = np.sqrt(mean_squared_error(self.y_test, ensemble_test_pred))
        
        print(f"\n=== 최종 앙상블 성능 ===")
        print(f"Train R²: {train_r2:.4f}")
        print(f"Test R²: {test_r2:.4f}")
        print(f"Train RMSE: {train_rmse:.4f}")
        print(f"Test RMSE: {test_rmse:.4f}")
        print(f"과적합 정도 (R² 차이): {train_r2 - test_r2:.4f}")
        
        return ensemble_train_pred, ensemble_test_pred

# === 3. 앙상블 실행 ===
optimizer = EnsembleOptimizer(X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled)

# 모든 모델 훈련
optimizer.train_all_models()

# 앙상블 가중치 최적화
optimal_weights, train_preds, test_preds = optimizer.optimize_ensemble_weights()

# 최종 앙상블 평가
ensemble_train_pred, ensemble_test_pred = optimizer.evaluate_ensemble(
    optimal_weights, train_preds, test_preds)

# === 4. 추가 앙상블 기법들 ===
print("\n=== 다른 앙상블 방법들과 비교 ===")

# 1. 단순 평균 앙상블
simple_avg_train = np.mean(train_preds, axis=1)
simple_avg_test = np.mean(test_preds, axis=1)
simple_avg_r2 = r2_score(y_test, simple_avg_test)
print(f"단순 평균 앙상블 Test R²: {simple_avg_r2:.4f}")

# 2. 중간값 앙상블
median_train = np.median(train_preds, axis=1)
median_test = np.median(test_preds, axis=1)
median_r2 = r2_score(y_test, median_test)
print(f"중간값 앙상블 Test R²: {median_r2:.4f}")

# 3. 상위 3개 모델만 사용한 앙상블
model_performance = {}
for i, name in enumerate(optimizer.models.keys()):
    test_r2 = r2_score(y_test, test_preds[:, i])
    model_performance[name] = test_r2

top3_models = sorted(model_performance.items(), key=lambda x: x[1], reverse=True)[:3]
top3_indices = [list(optimizer.models.keys()).index(name) for name, _ in top3_models]
top3_preds = test_preds[:, top3_indices]
top3_avg = np.mean(top3_preds, axis=1)
top3_r2 = r2_score(y_test, top3_avg)

print(f"상위 3개 모델 앙상블 Test R²: {top3_r2:.4f}")
print(f"상위 3개 모델: {[name for name, _ in top3_models]}")

# === 5. 최종 권장사항 ===
print(f"\n=== 최종 결과 요약 ===")
print(f"가중 평균 앙상블 Test R²: {r2_score(y_test, ensemble_test_pred):.4f}")
print(f"단순 평균 앙상블 Test R²: {simple_avg_r2:.4f}")
print(f"상위 3개 모델 앙상블 Test R²: {top3_r2:.4f}")

best_method = max([
    ("가중 평균", r2_score(y_test, ensemble_test_pred)),
    ("단순 평균", simple_avg_r2),
    ("상위 3개", top3_r2)
], key=lambda x: x[1])

print(f"\n최고 성능: {best_method[0]} 앙상블 (R² = {best_method[1]:.4f})")

# === 6. 모델 저장을 위한 클래스 ===
class FinalEnsembleModel:
    def __init__(self, models, weights, scaler, selector, selected_features):
        self.models = models
        self.weights = weights
        self.scaler = scaler
        self.selector = selector
        self.selected_features = selected_features
    
    def predict(self, X):
        # 피처 선택
        X_selected = self.selector.transform(X)
        
        predictions = []
        for name, model in self.models.items():
            if name in ['ridge', 'lasso', 'elastic']:
                X_scaled = self.scaler.transform(X_selected)
                pred = model.predict(X_scaled)
            else:
                pred = model.predict(X_selected)
            predictions.append(pred)
        
        # 가중 평균
        predictions = np.column_stack(predictions)
        return np.dot(predictions, self.weights)

# 최종 모델 생성
final_ensemble = FinalEnsembleModel(
    optimizer.models, optimal_weights, scaler, selector, selected_features
)

print(f"\n앙상블 모델이 준비되었습니다!")
print(f"사용된 모델 수: {len(optimizer.models)}")
print(f"선택된 피처 수: {len(selected_features)}")

# === 7. 모델 저장 (joblib) ===
import joblib
from datetime import datetime

# 현재 시간으로 파일명 생성
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_filename = f"ensemble_model_{timestamp}.pkl"

# 모델 저장
print(f"\n=== 모델 저장 중... ===")
joblib.dump(final_ensemble, model_filename)
print(f"앙상블 모델이 '{model_filename}'으로 저장되었습니다.")

# 저장된 모델 정보
model_info = {
    'timestamp': timestamp,
    'num_models': len(optimizer.models),
    'model_names': list(optimizer.models.keys()),
    'optimal_weights': optimal_weights.tolist(),
    'selected_features': list(selected_features),
    'test_r2_score': r2_score(y_test, ensemble_test_pred),
    'test_rmse': np.sqrt(mean_squared_error(y_test, ensemble_test_pred)),
    'train_r2_score': r2_score(y_train, ensemble_train_pred),
    'overfitting_gap': r2_score(y_train, ensemble_train_pred) - r2_score(y_test, ensemble_test_pred)
}

# 모델 정보도 함께 저장
info_filename = f"ensemble_info_{timestamp}.pkl"
joblib.dump(model_info, info_filename)
print(f"모델 정보가 '{info_filename}'으로 저장되었습니다.")

# === 8. 모델 불러오기 테스트 ===
print(f"\n=== 저장된 모델 불러오기 테스트 ===")

# 모델 불러오기
loaded_ensemble = joblib.load(model_filename)
loaded_info = joblib.load(info_filename)

# 예측 테스트
test_predictions = loaded_ensemble.predict(X[selector.get_support()])[:len(y_test)]
loaded_r2 = r2_score(y_test, test_predictions)

print(f"원본 모델 Test R²: {model_info['test_r2_score']:.4f}")
print(f"불러온 모델 Test R²: {loaded_r2:.4f}")
print(f"예측 일치 여부: {'✅ 성공' if abs(loaded_r2 - model_info['test_r2_score']) < 0.001 else '❌ 실패'}")

print(f"\n=== 저장된 모델 정보 ===")
for key, value in loaded_info.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    elif isinstance(value, list) and len(value) > 10:
        print(f"{key}: {len(value)}개 항목")
    else:
        print(f"{key}: {value}")

# === 9. 새로운 데이터 예측 함수 ===
def predict_new_data(model_path, new_data):
    """
    저장된 모델로 새로운 데이터 예측
    
    Parameters:
    model_path (str): 저장된 모델 파일 경로
    new_data (pd.DataFrame): 예측할 새 데이터 (원본 피처 형태)
    
    Returns:
    np.array: 예측 결과
    """
    # 모델 불러오기
    loaded_model = joblib.load(model_path)
    
    # 예측
    predictions = loaded_model.predict(new_data)
    
    return predictions

# 사용 예시 출력
print(f"\n=== 사용 방법 ===")
print(f"""
# 저장된 모델 불러오기
import joblib
model = joblib.load('{model_filename}')

# 새로운 데이터 예측 (원본 DataFrame 형태로 입력)
# new_data는 원본 df와 같은 구조여야 함 (G1, G2, G3 제외)
predictions = model.predict(new_data)

# 또는 함수 사용
predictions = predict_new_data('{model_filename}', new_data)
""")

print(f"모델 저장 완료! 파일명: {model_filename}")

[I 2025-09-28 04:23:44,693] A new study created in memory with name: xgboost


훈련 데이터 크기: (249, 15)
테스트 데이터 크기: (108, 15)
선택된 피처: ['fromCity', 'sex', 'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'schoolsup', 'higher', 'internet', 'goout', 'Dalc', 'Walc', 'absences']
=== 개별 모델 최적화 및 훈련 ===
XGBoost 최적화 중...


[I 2025-09-28 04:23:45,311] Trial 0 finished with value: 3.197135344380475 and parameters: {'n_estimators': 453, 'max_depth': 7, 'learning_rate': 0.06237475202463447, 'subsample': 0.7795094995090551, 'colsample_bytree': 0.7496013944691565, 'reg_alpha': 2.957772330569822, 'reg_lambda': 0.8940205676871422, 'min_child_weight': 5}. Best is trial 0 with value: 3.197135344380475.
[I 2025-09-28 04:23:45,572] Trial 1 finished with value: 2.8901497173388435 and parameters: {'n_estimators': 218, 'max_depth': 4, 'learning_rate': 0.010142961349314045, 'subsample': 0.9718729996770088, 'colsample_bytree': 0.8296874377833925, 'reg_alpha': 0.3427160275667447, 'reg_lambda': 2.4042900516732923, 'min_child_weight': 5}. Best is trial 1 with value: 2.8901497173388435.
[I 2025-09-28 04:23:45,776] Trial 2 finished with value: 3.273335192595354 and parameters: {'n_estimators': 203, 'max_depth': 3, 'learning_rate': 0.1571475005038777, 'subsample': 0.7120949091050813, 'colsample_bytree': 0.8147976602582994, 're

XGBoost 최적 CV RMSE: 2.8633
LightGBM 최적화 중...


[I 2025-09-28 04:23:59,022] Trial 0 finished with value: 3.2071529870872837 and parameters: {'n_estimators': 434, 'max_depth': 6, 'learning_rate': 0.07332515795084033, 'subsample': 0.7889882919829766, 'colsample_bytree': 0.7848960563609356, 'reg_alpha': 0.759962586022233, 'reg_lambda': 0.5666491301851241, 'min_child_weight': 7.315206054933406}. Best is trial 0 with value: 3.2071529870872837.
[I 2025-09-28 04:23:59,298] Trial 1 finished with value: 2.9104177842492396 and parameters: {'n_estimators': 459, 'max_depth': 8, 'learning_rate': 0.0260259964169941, 'subsample': 0.773130834933514, 'colsample_bytree': 0.7024884618173208, 'reg_alpha': 1.24455529906404, 'reg_lambda': 1.5560523928246306, 'min_child_weight': 8.586715947709152}. Best is trial 1 with value: 2.9104177842492396.
[I 2025-09-28 04:23:59,499] Trial 2 finished with value: 2.9286666265224213 and parameters: {'n_estimators': 329, 'max_depth': 4, 'learning_rate': 0.04005896848688231, 'subsample': 0.8024634570474082, 'colsample_b

LightGBM 최적 CV RMSE: 2.8355

=== 개별 모델 성능 ===
xgb          - Train R²: 0.4104, Test R²: 0.2240, Test RMSE: 2.6928
lgb          - Train R²: 0.4828, Test R²: 0.1692, Test RMSE: 2.7863
rf           - Train R²: 0.5351, Test R²: 0.2008, Test RMSE: 2.7328
extra_trees  - Train R²: 0.4810, Test R²: 0.2139, Test RMSE: 2.7103
gbm          - Train R²: 0.8850, Test R²: 0.0348, Test RMSE: 3.0033
ridge        - Train R²: 0.2347, Test R²: 0.2564, Test RMSE: 2.6360
lasso        - Train R²: 0.2257, Test R²: 0.2388, Test RMSE: 2.6670
elastic      - Train R²: 0.2308, Test R²: 0.2502, Test RMSE: 2.6470

=== 앙상블 가중치 최적화 ===
최적 가중치:
xgb         : 0.0000
lgb         : 0.0000
rf          : 0.0000
extra_trees : 0.0000
gbm         : 1.0000
ridge       : 0.0000
lasso       : 0.0000
elastic     : 0.0000

=== 최종 앙상블 성능 ===
Train R²: 0.8850
Test R²: 0.0348
Train RMSE: 1.0634
Test RMSE: 3.0033
과적합 정도 (R² 차이): 0.8502

=== 다른 앙상블 방법들과 비교 ===
단순 평균 앙상블 Test R²: 0.2384
중간값 앙상블 Test R²: 0.2405
상위 3개 모델 앙상블 Test R²: 0.250

ValueError: Item wrong length 28 instead of 357.

In [16]:
# 최종 앙상블 평가
ensemble_train_pred, ensemble_test_pred = optimizer.evaluate_ensemble(
    optimal_weights, train_preds, test_preds)


=== 최종 앙상블 성능 ===
Train R²: 0.8850
Test R²: 0.0348
Train RMSE: 1.0634
Test RMSE: 3.0033
과적합 정도 (R² 차이): 0.8502
