In [14]:
import pandas as pd
from tqdm import tqdm
from autogluon.tabular import TabularDataset, TabularPredictor
import json
import numpy as np

In [15]:
# Read a JSON file from disk
def load_json_data(file_path):
    """Open ``file_path`` as UTF-8 text and return the parsed JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def json_to_dataframe(json_data):
    """Convert a sequence of JSON records into a DataFrame with a fixed schema.

    Args:
        json_data: Iterable of dict-like records (each must support ``.get``).

    Returns:
        A DataFrame with one row per record and the columns ``question``,
        ``answer``, ``normalized_score``, ``summary_ratio_score``, ``emotion``
        and ``score``, in that order. Keys missing from a record fall back to
        ``''`` for the two text fields and ``0`` for the numeric fields; keys
        not listed here are ignored.
    """
    # Column name -> fallback value used when a record lacks that key.
    defaults = {
        'question': '',
        'answer': '',
        'normalized_score': 0,
        'summary_ratio_score': 0,
        'emotion': 0,
        'score': 0,
    }
    rows = [
        {key: item.get(key, fallback) for key, fallback in defaults.items()}
        for item in json_data
    ]
    # Passing `columns` keeps the schema stable for empty input; without it an
    # empty list yields a frame with no columns and df["score"] raises KeyError.
    return pd.DataFrame(rows, columns=list(defaults))

# Load the three JSON splits (train / validation / test) ...
train_data_json, val_data_json, test_data_json = (
    load_json_data(json_path)
    for json_path in (
        'new_train_qa_normalized.json',
        'new_val_qa_normalized.json',
        'new_test_qa_normalized.json',
    )
)

# ... and turn each of them into a DataFrame
df_train_json, df_val_json, df_test_json = map(
    json_to_dataframe,
    (train_data_json, val_data_json, test_data_json),
)

# Report the (rows, columns) shape of every split
print(f"Train DataFrame: {df_train_json.shape}")
print(f"Validation DataFrame: {df_val_json.shape}")
print(f"Test DataFrame: {df_test_json.shape}")

# Preview the first rows of the training frame (rendered as the cell output)
print("\nTrain DataFrame 정보:")
df_train_json.head()


Train DataFrame: (5257, 6)
Validation DataFrame: (657, 6)
Test DataFrame: (658, 6)

Train DataFrame 정보:


Unnamed: 0,question,answer,normalized_score,summary_ratio_score,emotion,score
0,일을 하면서 가장 어려울 것으로 예상하고 계시는 것은 무엇인가요 또한 그런 어려운 ...,가장 어려울 것으로 예상되는 점은 역시나 사람입니다. 사람만큼 어려운 게 있을까 싶...,0.4779,1.0,0.0,0.4926
1,커뮤니케이션을 잘 할 수 있는 나만의 스킬이 있다면 한번 소개해 주실 수 있으십니까...,커뮤니케이션 스킬 중에 하나는 타인의 이야기를 잘 듣는 경청의 자세가 우선되어야 한...,0.7518,1.0,0.0,0.5839
2,협업을 하다 보면 여러 동료들과 소통을 해야 되는데 비아이티 동료와 효과적으로 의사...,네 대화 주제를 어느 방향으로 이끌어가느냐에 달려 있다고 생각합니다. 우선 아이티 ...,0.4949,1.0,0.0,0.4983
3,이제까지 살면서 새로운 환경에서 아예 몰랐던 일을 해 본 경험이 있다면 경험에 대해...,전혀 몰랐던 일을 맡아서 했던 경험 있기는 있던 것 같습니다. 뭐 사진을 찍을 때도...,0.6406,1.0,0.0,0.5469
4,만약 함께 일하는 동료 가운데 이직을 생각하는 동료를 만나게 된다면 뭐라고 말씀하시...,제가 회사 생활을 하면서 제 동료가 만약에 이직을 원한다면 일단은 어떤 결론을 내기...,0.4295,1.0,0.0,0.4765


In [23]:
# Multiplier applied to the JSON "score" column to form the regression target.
SCORE_SCALE = 100


def _build_split_frame(features, labels, columns):
    """Combine a feature matrix and its labels into one modelling frame.

    Args:
        features: 2-D array with one row per sample.
        labels: pandas Series of raw scores, one per row of ``features``.
        columns: Names for the feature columns.

    Returns:
        DataFrame holding the feature columns plus a ``"score"`` column equal
        to ``labels * SCORE_SCALE``.

    Raises:
        ValueError: If ``features`` and ``labels`` have different row counts.
    """
    # Series assignment aligns on the index, so a length mismatch would
    # silently produce NaN / dropped labels instead of failing. Fail loudly.
    if len(features) != len(labels):
        raise ValueError(
            f"Row count mismatch: {len(features)} feature rows vs {len(labels)} labels"
        )
    frame = pd.DataFrame(features, columns=columns)
    frame["score"] = labels * SCORE_SCALE  # add the dependent variable
    return frame


# Load the npz feature matrices. The context manager closes each archive once
# its "X" array has been read, so no file handle is leaked.
with np.load("train_set.npz") as train:
    X_train = train["X"]
y_train = df_train_json["score"]

# Generate feature column names f0, f1, ... from the training matrix width
n_features = X_train.shape[1]
feature_columns = [f"f{i}" for i in range(n_features)]

df_train = _build_split_frame(X_train, y_train, feature_columns)

with np.load("val_set.npz") as val:
    X_val = val["X"]
y_val = df_val_json["score"]
df_val = _build_split_frame(X_val, y_val, feature_columns)

with np.load("test_set.npz") as test:
    X_test = test["X"]
y_test = df_test_json["score"]
df_test = _build_split_frame(X_test, y_test, feature_columns)

# Wrap each split for AutoGluon
train_data = TabularDataset(df_train)
val_data = TabularDataset(df_val)
test_data = TabularDataset(df_test)

In [28]:
from autogluon.tabular import TabularPredictor

# Configure a regression predictor for the 'score' column, scored by RMSE;
# artifacts are written under 'automl/'.
predictor = TabularPredictor(
    eval_metric='rmse',
    problem_type='regression',
    path='automl/',
    label='score',
)

# Train with an explicit validation split and a time_limit of 600.
predictor.fit(
    time_limit=600,
    tuning_data=val_data,
    train_data=train_data,
)

# Evaluate on the held-out test split; the result maps metric name -> value
results = predictor.evaluate(test_data)

# Print every reported metric
for metric, value in results.items():
    print(f"{metric}: {value}")

Verbosity: 2 (Standard Logging)
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.10.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       15.45 GB / 31.64 GB (48.8%)
Disk Space Avail:   258.14 GB / 476.00 GB (54.2%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with very fa

[1000]	valid_set's rmse: 6.23123


	-6.2287	 = Validation score   (-root_mean_squared_error)
	30.88s	 = Training   runtime
	0.03s	 = Validation runtime
	30.88s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: LightGBM ... Training model for up to 562.03s of the 562.03s of remaining time.
Fitting model: LightGBM ... Training model for up to 562.03s of the 562.03s of remaining time.
	-6.3435	 = Validation score   (-root_mean_squared_error)
	36.29s	 = Training   runtime
	0.03s	 = Validation runtime
	-6.3435	 = Validation score   (-root_mean_squared_error)
	36.29s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestMSE ... Training model for up to 525.66s of the 525.66s of remaining time.
Fitting model: RandomForestMSE ... Training model for up to 525.66s of the 525.66s of remaining time.
	-6.7059	 = Validation score   (-root_mean_squared_error)
	620.61s	 = Training   runtime
	0.19s	 = Validation runtime
	-6.7059	 = Validation score   (-root_mean_squared_error)
	620.61s	 = Training

root_mean_squared_error: -6.695962333417998
mean_squared_error: -44.835911570552604
mean_absolute_error: -4.521185811051481
r2: 0.4538397177818283
pearsonr: 0.6825826298349335
median_absolute_error: -3.1224677276611317


In [29]:
# Score every trained model on the test split and show the resulting table.
leaderboard = predictor.leaderboard(data=test_data, silent=True)
leaderboard

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,-6.695962,-6.228672,root_mean_squared_error,0.039003,0.028,30.880543,0.039003,0.028,30.880543,1,True,3
1,WeightedEnsemble_L2,-6.695962,-6.228672,root_mean_squared_error,0.060009,0.028,30.896077,0.021006,0.0,0.015533,2,True,6
2,LightGBM,-6.808247,-6.343531,root_mean_squared_error,0.042011,0.026005,36.285259,0.042011,0.026005,36.285259,1,True,4
3,RandomForestMSE,-7.336781,-6.70586,root_mean_squared_error,0.344626,0.194681,620.611826,0.344626,0.194681,620.611826,1,True,5
4,KNeighborsDist,-8.150046,-7.541428,root_mean_squared_error,1.24431,1.029261,0.450217,1.24431,1.029261,0.450217,1,True,2
5,KNeighborsUnif,-8.179447,-7.577029,root_mean_squared_error,1.135977,0.984943,0.516378,1.135977,0.984943,0.516378,1,True,1


In [32]:
# Reduce the leaderboard to just the model name and its test score.
leaderboard_simple = leaderboard.loc[:, ['model', 'score_test']].copy()
leaderboard_simple

Unnamed: 0,model,score_test
0,LightGBMXT,-6.695962
1,WeightedEnsemble_L2,-6.695962
2,LightGBM,-6.808247
3,RandomForestMSE,-7.336781
4,KNeighborsDist,-8.150046
5,KNeighborsUnif,-8.179447


In [35]:
# Name of the model to inspect (other runs may use names such as LightGBMXT_BAG_L1)
model_name = "LightGBMXT"

# Reload the saved predictor from disk
predictor = TabularPredictor.load("automl/")
# NOTE(review): this rebinds `leaderboard`, which an earlier cell computed with
# test_data; here it is requested without any data argument.
leaderboard = predictor.leaderboard(silent=True)

# Look up the stored hyperparameters of the chosen model
info = predictor.info()
model_info = info['model_info']
params = model_info[model_name]['hyperparameters']

print(f"📌 {model_name} 하이퍼파라미터:")
for k, v in params.items():
    print(f" - {k}: {v}")


📌 LightGBMXT 하이퍼파라미터:
 - learning_rate: 0.05
 - extra_trees: True


In [30]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_absolute_error, r2_score  # r2_score is needed for the R² below

# Directory the fitted predictor was saved to
model_path = 'automl/'

# Restore the predictor from disk
predictor = TabularPredictor.load(model_path)

# Predicted scores for the test split (regression outputs, not class labels)
y_pred = predictor.predict(test_data)

# R² between the true test targets and the predictions
overall_r2 = r2_score(y_true=test_data['score'], y_pred=y_pred)
print(overall_r2)

0.4538397177818283


In [33]:
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import IPython.display as dsp

# Predict with the AutoGluon predictor loaded earlier
y_pred = predictor.predict(test_data)

# Ground-truth targets, on the same scale the model was trained on
y_true = test_data['score']

# RMSE and R². The RMSE is taken as sqrt(MSE) rather than via
# `mean_squared_error(..., squared=False)`: that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

# Present both metrics as a small table
df_metrics = pd.DataFrame({
    "지표": ["RMSE", "R² Score"],
    "값": [round(rmse, 3), round(r2, 3)]
})

dsp.display(df_metrics)




Unnamed: 0,지표,값
0,RMSE,6.696
1,R² Score,0.454


In [38]:
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Names of the first three models listed in the current `leaderboard`
top_3_models = leaderboard['model'].head(3).tolist()

# The ground truth is the same for every model, so read it once outside the loop
y_true = test_data['score']

# One record (dict) per model; converted to a DataFrame once at the end
results = []

for model_name in top_3_models:
    # Predict with this specific model
    y_pred = predictor.predict(test_data, model=model_name)

    # RMSE and R². The RMSE is taken as sqrt(MSE) rather than via
    # `mean_squared_error(..., squared=False)`: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    results.append({
        '모델명': model_name,
        'RMSE': round(rmse, 4),
        'R² Score': round(r2, 4)
    })

# Build the comparison table and show it as the cell output
df_results = pd.DataFrame(results)
print("상위 3개 모델 성능 비교:")
df_results

상위 3개 모델 성능 비교:




Unnamed: 0,모델명,RMSE,R² Score
0,WeightedEnsemble_L2,6.696,0.4538
1,LightGBMXT,6.696,0.4538
2,LightGBM,6.8082,0.4354
