In [4]:
import pandas as pd
from tqdm import tqdm
from autogluon.tabular import TabularDataset, TabularPredictor
import json
import numpy as np
import os

# Data locations, built with os.path.join on top of the current working
# directory so every path is absolute.
BASE_DIR = os.getcwd()
print(BASE_DIR)

# Both the raw JSON and the ML-ready arrays live under <BASE_DIR>/data.
_data_root = os.path.join(BASE_DIR, 'data')
INPUT_FILE_PATH = os.path.join(_data_root, 'preprocessed_data.json')
DATA_DIR = os.path.join(_data_root, 'ML')

# Create the ML directory up front; a pre-existing directory is not an error.
os.makedirs(DATA_DIR, exist_ok=True)

c:\final_git\SKN12-FINAL-5TEAM


## json 파일을 데이터프레임으로 변환

## ML에 입력할 수 있도록 독립 / 종속 변수 추가

In [9]:
def load_split(split_name, feature_columns=None):
    """Load one ``<split_name>_set.npz`` archive from DATA_DIR.

    Parameters
    ----------
    split_name : str
        Prefix of the archive file name, e.g. ``"train"`` for ``train_set.npz``.
    feature_columns : list of str, optional
        Column names for the feature matrix. When omitted, names ``f0..f{n-1}``
        are generated from the width of ``X``.

    Returns
    -------
    tuple
        ``(X, y, frame)`` where ``X`` and ``y`` are the arrays stored under the
        ``"X"`` and ``"y"`` keys and ``frame`` is a DataFrame of the features
        with the target appended as the ``score`` column.
    """
    archive_path = os.path.join(DATA_DIR, f"{split_name}_set.npz")
    # np.load returns a lazy NpzFile for .npz archives; the context manager
    # closes the underlying file once both arrays have been read into memory.
    with np.load(archive_path) as archive:
        X, y = archive["X"], archive["y"]

    if feature_columns is None:
        feature_columns = [f"f{i}" for i in range(X.shape[1])]

    frame = pd.DataFrame(X, columns=feature_columns)
    frame["score"] = y  # dependent variable (regression target)
    return X, y, frame


# Training split: it also defines the feature column names reused below.
X_train, y_train, df_train = load_split("train")
n_features = X_train.shape[1]
feature_columns = [f"f{i}" for i in range(n_features)]

# Validation and test splits share the training column names, so a width
# mismatch between splits surfaces immediately as a pandas error.
X_val, y_val, df_val = load_split("val", feature_columns)
X_test, y_test, df_test = load_split("test", feature_columns)

# Wrap each frame for AutoGluon.
train_data = TabularDataset(df_train)
val_data = TabularDataset(df_val)
test_data = TabularDataset(df_test)

In [10]:
# Preview the first rows of the training frame (feature columns f0..fN plus the `score` target).
train_data.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1527,f1528,f1529,f1530,f1531,f1532,f1533,f1534,f1535,score
0,-0.454308,-0.191012,0.079864,-0.534569,-0.068804,0.01766,0.22431,-0.269141,-0.361705,0.053733,...,0.491733,0.35919,-0.123756,-0.515413,0.105428,0.175339,0.07215,-0.036318,-0.250155,48.7933
1,0.006021,-0.706709,-0.360626,-0.347866,0.355289,0.699073,-0.213558,0.278139,-0.362823,0.273964,...,0.746505,0.164486,-0.271899,0.004447,0.757929,-0.417573,0.060278,-0.676335,-0.297556,48.3933
2,-0.197276,-0.294467,0.414493,-0.373786,0.317888,0.066508,-0.189156,0.22883,-0.40263,0.703705,...,0.317407,0.20478,-0.016202,-0.265335,0.574012,-0.27486,-0.171743,-0.043548,-0.318174,51.4933
3,0.208798,-0.105023,-0.051948,-0.191882,0.193627,-0.091749,0.241087,0.238545,0.159051,0.491462,...,0.609287,0.674161,-0.049371,0.001968,0.019721,-0.094569,-0.111176,-0.318199,-0.48802,43.98
4,-0.112501,-0.304849,-0.043498,-0.297551,0.484919,-0.212221,0.086554,-0.089514,-0.218532,0.533597,...,1.009515,0.507447,0.081017,0.218226,0.8622,-0.71089,0.140288,-0.419778,0.308251,47.0967


## AutoML 설정

In [12]:
from autogluon.tabular import TabularPredictor

# Predictor configuration: regress the `score` column, optimise RMSE, and
# persist every fitted model under automl/.
predictor_options = dict(
    label='score',
    path='automl/',
    problem_type='regression',
    eval_metric='rmse',
)

# Training configuration: the validation split is passed explicitly as tuning
# data and the whole fit is capped at 600 seconds.
fit_options = dict(
    train_data=train_data,
    tuning_data=val_data,
    time_limit=600,
)

predictor = TabularPredictor(**predictor_options).fit(**fit_options)

# Evaluate on the held-out test split; the result maps metric name -> value.
results = predictor.evaluate(test_data)

# Print one "metric: value" line per reported metric.
for metric in results:
    value = results[metric]
    print(f"{metric}: {value}")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       12.44 GB / 31.64 GB (39.3%)
Disk Space Avail:   256.49 GB / 476.00 GB (53.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with very fast inference speed.
	presets='me

KeyboardInterrupt: 

In [29]:
# Score every trained model on the held-out test split.
# `display=False` returns the table without printing it; it replaces the older
# `silent=True` keyword, which AutoGluon 1.x appears to keep only for
# backward compatibility (TODO confirm against the installed version).
# NOTE(review): this table is scored on test_data, so using it to pick models
# means the reported test scores are no longer an unbiased estimate.
leaderboard = predictor.leaderboard(test_data, display=False)
leaderboard

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,-6.695962,-6.228672,root_mean_squared_error,0.039003,0.028,30.880543,0.039003,0.028,30.880543,1,True,3
1,WeightedEnsemble_L2,-6.695962,-6.228672,root_mean_squared_error,0.060009,0.028,30.896077,0.021006,0.0,0.015533,2,True,6
2,LightGBM,-6.808247,-6.343531,root_mean_squared_error,0.042011,0.026005,36.285259,0.042011,0.026005,36.285259,1,True,4
3,RandomForestMSE,-7.336781,-6.70586,root_mean_squared_error,0.344626,0.194681,620.611826,0.344626,0.194681,620.611826,1,True,5
4,KNeighborsDist,-8.150046,-7.541428,root_mean_squared_error,1.24431,1.029261,0.450217,1.24431,1.029261,0.450217,1,True,2
5,KNeighborsUnif,-8.179447,-7.577029,root_mean_squared_error,1.135977,0.984943,0.516378,1.135977,0.984943,0.516378,1,True,1


In [32]:
# Reduce the leaderboard to the model name and its test score; .copy() keeps
# the result independent of the full leaderboard frame.
columns_to_show = ['model', 'score_test']
leaderboard_simple = leaderboard.loc[:, columns_to_show].copy()
leaderboard_simple

Unnamed: 0,model,score_test
0,LightGBMXT,-6.695962
1,WeightedEnsemble_L2,-6.695962
2,LightGBM,-6.808247
3,RandomForestMSE,-7.336781
4,KNeighborsDist,-8.150046
5,KNeighborsUnif,-8.179447


In [30]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_error, r2_score # r2_score 임포트 추가

# Directory the fitted predictor was saved to during training.
model_path = 'automl/'

# Restore the predictor from disk.
predictor = TabularPredictor.load(model_path)

# Predicted scores for the test split (continuous regression outputs, not
# class labels).
y_pred = predictor.predict(test_data)

# Coefficient of determination of the predictions against the true scores.
overall_r2 = r2_score(y_true=test_data['score'], y_pred=y_pred)
print(overall_r2)

0.4538397177818283


In [33]:
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Predict scores for the test split with the AutoGluon predictor.
y_pred = predictor.predict(test_data)

# Ground-truth scores (the same values as y_test).
y_true = test_data['score']

# RMSE and R^2 on the test split.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# instead; this works on every scikit-learn version.
rmse = mean_squared_error(y_true, y_pred) ** 0.5
r2 = r2_score(y_true, y_pred)

# Collect both metrics in a small frame for a readable table.
df_metrics = pd.DataFrame({
    "지표": ["RMSE", "R² Score"],
    "값": [round(rmse, 3), round(r2, 3)]
})

import IPython.display as dsp
dsp.display(df_metrics)




Unnamed: 0,지표,값
0,RMSE,6.696
1,R² Score,0.454


In [38]:
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Names of the first three leaderboard rows (fewer if fewer models exist).
top_3_models = leaderboard['model'].head(3).tolist()

# The ground truth does not change between models, so read it once.
y_true = test_data['score']

# One record per model; turned into a DataFrame in a single step below.
results = []

for model_name in top_3_models:
    # Predict with this specific model rather than the predictor's default.
    y_pred = predictor.predict(test_data, model=model_name)

    # RMSE and R^2. `squared=False` was deprecated in scikit-learn 1.4 and
    # removed in 1.6, so the root is taken explicitly for portability.
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)

    results.append({
        '모델명': model_name,
        'RMSE': round(rmse, 4),
        'R² Score': round(r2, 4)
    })

# Render the comparison as a table.
df_results = pd.DataFrame(results)
print("상위 3개 모델 성능 비교:")
df_results

상위 3개 모델 성능 비교:




Unnamed: 0,모델명,RMSE,R² Score
0,WeightedEnsemble_L2,6.696,0.4538
1,LightGBMXT,6.696,0.4538
2,LightGBM,6.8082,0.4354
