In [5]:
# Import scikit-learn and the supporting libraries (nothing is installed here)
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import os
from itertools import product

# Data location, built with os.path so separators are OS-appropriate.
# BASE_DIR is the directory two levels above the current working directory.
# NOTE(review): per the original note, the notebook is expected to run from
# inside the ml_code folder tree — confirm the working directory before running.
_parent_dir = os.path.dirname(os.getcwd())
BASE_DIR = os.path.dirname(_parent_dir)
DATA_DIR = os.path.join(BASE_DIR, 'data', 'ML')

# Echo both resolved paths (one per line) so a wrong working directory is
# spotted immediately.
print(BASE_DIR, DATA_DIR, sep="\n")
print("scikit-learn 및 관련 라이브러리 임포트 완료!")

c:\final_git\SKN12-FINAL-5TEAM
c:\final_git\SKN12-FINAL-5TEAM\data\ML
scikit-learn 및 관련 라이브러리 임포트 완료!


### 준비

In [2]:
# Load the train / validation / test splits from their .npz archives.
#
# np.load on a .npz returns an NpzFile that keeps the underlying file open.
# Each archive is therefore opened in a `with` block so its handle is released
# as soon as the "X" and "y" arrays have been read into memory.
# NOTE: `train`, `val` and `test` are closed once their block ends; use the
# extracted arrays / DataFrames below instead of indexing them again.
with np.load(os.path.join(DATA_DIR, "train_set.npz")) as train:
    X_train = train["X"]
    y_train = train["y"]

# Generate generic feature column names: f0, f1, ..., f{n_features - 1}
n_features = X_train.shape[1]
feature_columns = [f"f{i}" for i in range(n_features)]

# Build the training DataFrame
df_train = pd.DataFrame(X_train, columns=feature_columns)
df_train["score"] = y_train  # append the dependent variable

with np.load(os.path.join(DATA_DIR, "val_set.npz")) as val:
    X_val = val["X"]
    y_val = val["y"]
df_val = pd.DataFrame(X_val, columns=feature_columns)
df_val["score"] = y_val  # append the dependent variable

with np.load(os.path.join(DATA_DIR, "test_set.npz")) as test:
    X_test = test["X"]
    y_test = test["y"]
df_test = pd.DataFrame(X_test, columns=feature_columns)
df_test["score"] = y_test  # append the dependent variable

# Last expression of the cell: displays the first rows of the training frame
df_train.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1527,f1528,f1529,f1530,f1531,f1532,f1533,f1534,f1535,score
0,-0.454308,-0.191012,0.079864,-0.534569,-0.068804,0.01766,0.22431,-0.269141,-0.361705,0.053733,...,0.491733,0.35919,-0.123756,-0.515413,0.105428,0.175339,0.07215,-0.036318,-0.250155,48.7933
1,0.006021,-0.706709,-0.360626,-0.347866,0.355289,0.699073,-0.213558,0.278139,-0.362823,0.273964,...,0.746505,0.164486,-0.271899,0.004447,0.757929,-0.417573,0.060278,-0.676335,-0.297556,48.3933
2,-0.197276,-0.294467,0.414493,-0.373786,0.317888,0.066508,-0.189156,0.22883,-0.40263,0.703705,...,0.317407,0.20478,-0.016202,-0.265335,0.574012,-0.27486,-0.171743,-0.043548,-0.318174,51.4933
3,0.208798,-0.105023,-0.051948,-0.191882,0.193627,-0.091749,0.241087,0.238545,0.159051,0.491462,...,0.609287,0.674161,-0.049371,0.001968,0.019721,-0.094569,-0.111176,-0.318199,-0.48802,43.98
4,-0.112501,-0.304849,-0.043498,-0.297551,0.484919,-0.212221,0.086554,-0.089514,-0.218532,0.533597,...,1.009515,0.507447,0.081017,0.218226,0.8622,-0.71089,0.140288,-0.419778,0.308251,47.0967


### RandomForest 모델 정의

In [7]:
# Split the training and validation frames into features and target
X_train_features = df_train.drop('score', axis=1)
y_train_target = df_train['score']
X_val_features = df_val.drop('score', axis=1)
y_val_target = df_val['score']

# Train a RandomForest with default hyperparameters (fixed seed, all cores)
print("RandomForest 기본 파라미터로 학습...")
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Fit on the training split
rf_model.fit(X_train_features, y_train_target)

# Evaluate on the validation split.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it raises a
# TypeError on current releases. For a single target the two are identical.
rf_val_pred = rf_model.predict(X_val_features)
rf_val_rmse = np.sqrt(mean_squared_error(y_val_target, rf_val_pred))
rf_val_r2 = r2_score(y_val_target, rf_val_pred)

print(f"RF Validation RMSE: {rf_val_rmse:.4f}")
print(f"RF Validation R²: {rf_val_r2:.4f}")

# Alias kept so later cells can use a uniform `best_*_model` name
best_rf_model = rf_model

RandomForest 기본 파라미터로 학습...
RF Validation RMSE: 6.0485
RF Validation R²: 0.2974




### GradientBoosting 모델 정의

In [8]:
# Train a GradientBoosting regressor with default hyperparameters (fixed seed)
print("GradientBoosting 기본 파라미터로 학습...")
gb_model = GradientBoostingRegressor(random_state=42)

# Fit on the training split
gb_model.fit(X_train_features, y_train_target)

# Evaluate on the validation split.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it raises a
# TypeError on current releases. For a single target the two are identical.
gb_val_pred = gb_model.predict(X_val_features)
gb_val_rmse = np.sqrt(mean_squared_error(y_val_target, gb_val_pred))
gb_val_r2 = r2_score(y_val_target, gb_val_pred)

print(f"GB Validation RMSE: {gb_val_rmse:.4f}")
print(f"GB Validation R²: {gb_val_r2:.4f}")

# Alias kept so later cells can use a uniform `best_*_model` name
best_gb_model = gb_model

GradientBoosting 기본 파라미터로 학습...


GB Validation RMSE: 5.8446
GB Validation R²: 0.3440




### 평가

In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# Split the held-out test frame into features and target
X_test_features = df_test.drop('score', axis=1)
y_test_target = df_test['score']

# Compare both models on the test split
print("=== 테스트 셋 최종 평가 ===")

# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so it raises a
# TypeError on current releases. For a single target the two are identical.

# RandomForest test evaluation
rf_test_pred = best_rf_model.predict(X_test_features)
rf_test_rmse = np.sqrt(mean_squared_error(y_test_target, rf_test_pred))
rf_test_r2 = r2_score(y_test_target, rf_test_pred)

# GradientBoosting test evaluation
gb_test_pred = best_gb_model.predict(X_test_features)
gb_test_rmse = np.sqrt(mean_squared_error(y_test_target, gb_test_pred))
gb_test_r2 = r2_score(y_test_target, gb_test_pred)

# Collect the metrics (rounded to 4 decimals) into a comparison table
results_df = pd.DataFrame({
    '모델': ['RandomForest', 'GradientBoosting'],
    'RMSE': [round(rf_test_rmse, 4), round(gb_test_rmse, 4)],
    'R²': [round(rf_test_r2, 4), round(gb_test_r2, 4)]
})

print("테스트 셋 성능 비교:")
# Last expression of the cell: renders the comparison table
results_df

=== 테스트 셋 최종 평가 ===
테스트 셋 성능 비교:




Unnamed: 0,모델,RMSE,R²
0,RandomForest,5.4197,0.3201
1,GradientBoosting,5.3435,0.3391
