In [1]:
!pip install h2o
# 한글 폰트 사용을 위한 라이브러리입니다.
!apt-get install -y fonts-nanum

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20180306-3).
0 upgraded, 0 newly installed, 0 to remove and 26 not upgraded.


In [2]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the NanumGothic TrueType file with Matplotlib under the family name
# 'NanumBarunGothic' (an alias chosen here), then make it the default plotting font.
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf',  # location of the .ttf file
    name='NanumBarunGothic',                                    # family name to register it under
)
# Put the entry at the front of Matplotlib's list of known TrueType fonts.
fm.fontManager.ttflist.insert(0, fe)

# Default font size and family for every figure.
plt.rcParams['font.size'] = 10
plt.rcParams['font.family'] = 'NanumBarunGothic'
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [3]:
import h2o
from h2o.automl import H2OAutoML

# Read the combined dataset; a single CSV holds both the training and the test rows.
total_path = "../total-data_ver2.csv"
dt_total = pd.read_csv(total_path)

# Split on the `is_test` flag: rows with 0 go to the training set,
# rows with 1 go to the test set.
df_train = dt_total.loc[dt_total['is_test'].eq(0)]
df_test = dt_total.loc[dt_total['is_test'].eq(1)]

In [4]:
# Date-derived features plus the aggregated training target.
#
# Both frames are re-derived from `dt_total` on every run so that this cell is
# idempotent. Previously it rebound `df_train` from itself, so running it a second time
# computed `price` on the already de-duplicated frame and silently replaced every group
# mean with a single row's `target`.
_group_keys = ['아파트명', '계약월']


def _add_contract_features(df):
    """Return a copy of ``df`` with ``year`` and ``계약월`` derived from ``계약년월``.

    ``year``  -- first four characters of ``계약년월`` (as a string), cast to int.
    ``계약월`` -- ``(계약년월 % 100) // 5``. Assuming ``계약년월`` is a YYYYMM integer
                (TODO confirm), this buckets the month into 0 (Jan-Apr), 1 (May-Sep)
                and 2 (Oct-Dec).
    """
    return df.assign(
        year=lambda d: d['계약년월'].astype('str').str[:4].astype(int),
        계약월=lambda d: (d['계약년월'] % 100) // 5,
    )


# Training rows: keep contracts from 2021 onwards, then use the mean `target` of each
# (apartment name, month bucket) group as the regression target `price`, keeping one
# row per group.
# NOTE(review): the mean is taken across all years and all other column values within a
# group, while the surviving row's remaining features come from the group's first row
# only — confirm this aggregation is intended.
df_train = (
    _add_contract_features(dt_total[dt_total['is_test'] == 0])
    .query('year > 2020')
    .assign(price=lambda d: d.groupby(_group_keys)['target'].transform('mean'))
    .drop_duplicates(subset=_group_keys)
)

# Test rows: same derived features, no year filter and no aggregation.
df_test = _add_contract_features(dt_total[dt_total['is_test'] == 1])

In [5]:
import pandas as pd

# One-hot encode the categorical columns of the training frame.
df_train_encoded = pd.get_dummies(df_train)

# One-hot encode the test frame and give it exactly the training frame's columns, in the
# same order. Columns that exist only in the training frame (dummy levels never seen in
# the test data, and training-only columns such as `price`) are created and filled with
# 0; columns that exist only in the test frame are dropped.
#
# `fill_value` applies only to these newly created columns. Values that were already
# missing in the test data stay NaN, whereas the previous blanket `fillna(0)` also
# overwrote genuinely missing feature values with 0.
df_test_encoded = pd.get_dummies(df_test).reindex(
    columns=df_train_encoded.columns, fill_value=0
)

In [7]:
# Connect to a running local H2O instance, or start one if none is running.
h2o.init()

# Predictor columns and the response column used by AutoML.
x = ['전용면적', 'x']  # only the features we want the models to use
y = "price"

# Convert only the columns AutoML actually reads (the predictors, plus the response for
# training). The full one-hot frames are not uploaded: every column outside `x` and `y`
# is ignored by training and prediction, so converting them only costs parse time and
# cluster memory.
h2o_train = h2o.H2OFrame(df_train_encoded[x + [y]])
h2o_test = h2o.H2OFrame(df_test_encoded[x])

# AutoML run limited to 10 models, with a fixed seed.
aml = H2OAutoML(max_models = 10, seed = 1)

# Train on the training frame.
aml.train(x = x, y = y, training_frame = h2o_train)

# Predict on the test frame (the saved output shows the stacked ensemble doing this).
preds = aml.predict(h2o_test)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,3 mins 47 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,1 month and 2 days
H2O_cluster_name:,H2O_from_python_root_uv3kqh
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,14.83 Gb
H2O_cluster_total_cores:,10
H2O_cluster_allowed_cores:,10


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [8]:
# Leftover debugging line, kept disabled.
# NOTE(review): no `pred` is defined in the visible cells (the prediction frame is named
# `preds`) — correct the name before re-enabling, or delete this cell.
# print(pred.columns)

In [10]:
# Leaderboard of the models produced by the AutoML object `aml`.
lb = aml.leaderboard
# Display all `lb.nrows` rows of the leaderboard.
lb.head(rows=lb.nrows)

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_BestOfFamily_1_AutoML_1_20240123_104514,40023.9,1601910000.0,23940.0,,1601910000.0
StackedEnsemble_AllModels_1_AutoML_1_20240123_104514,40024.9,1601990000.0,23946.6,,1601990000.0
XRT_1_AutoML_1_20240123_104514,43085.2,1856340000.0,24856.9,0.37088,1856340000.0
DRF_1_AutoML_1_20240123_104514,43342.2,1878540000.0,24747.5,0.370745,1878540000.0
XGBoost_1_AutoML_1_20240123_104514,44082.1,1943240000.0,27147.4,,1943240000.0
XGBoost_2_AutoML_1_20240123_104514,45898.4,2106670000.0,28635.8,,2106670000.0
XGBoost_3_AutoML_1_20240123_104514,50296.1,2529700000.0,31898.7,,2529700000.0
GBM_4_AutoML_1_20240123_104514,50879.0,2588670000.0,32500.5,,2588670000.0
GBM_3_AutoML_1_20240123_104514,52872.5,2795500000.0,33727.5,,2795500000.0
GBM_2_AutoML_1_20240123_104514,53449.7,2856870000.0,34098.3,,2856870000.0


In [11]:
# Rebuild the leaderboard with all optional columns requested; the displayed result
# adds training_time_ms, predict_time_per_row_ms and algo. This rebinds `lb`.
lb = h2o.automl.get_leaderboard(aml, extra_columns = 'ALL')

In [12]:
# Display the current leaderboard frame.
lb

model_id,rmse,mse,mae,rmsle,mean_residual_deviance,training_time_ms,predict_time_per_row_ms,algo
StackedEnsemble_BestOfFamily_1_AutoML_1_20240123_104514,40023.9,1601910000.0,23940.0,,1601910000.0,21768,0.07626,StackedEnsemble
StackedEnsemble_AllModels_1_AutoML_1_20240123_104514,40024.9,1601990000.0,23946.6,,1601990000.0,67600,0.038144,StackedEnsemble
XRT_1_AutoML_1_20240123_104514,43085.2,1856340000.0,24856.9,0.37088,1856340000.0,1553,0.009842,DRF
DRF_1_AutoML_1_20240123_104514,43342.2,1878540000.0,24747.5,0.370745,1878540000.0,1224,0.009703,DRF
XGBoost_1_AutoML_1_20240123_104514,44082.1,1943240000.0,27147.4,,1943240000.0,1379,0.009733,XGBoost
XGBoost_2_AutoML_1_20240123_104514,45898.4,2106670000.0,28635.8,,2106670000.0,948,0.008646,XGBoost
XGBoost_3_AutoML_1_20240123_104514,50296.1,2529700000.0,31898.7,,2529700000.0,626,0.000783,XGBoost
GBM_4_AutoML_1_20240123_104514,50879.0,2588670000.0,32500.5,,2588670000.0,572,0.019436,GBM
GBM_3_AutoML_1_20240123_104514,52872.5,2795500000.0,33727.5,,2795500000.0,493,0.018763,GBM
GBM_2_AutoML_1_20240123_104514,53449.7,2856870000.0,34098.3,,2856870000.0,483,0.019868,GBM


In [9]:
# Convert the H2O prediction frame to a pandas DataFrame.
preds_df = preds.as_data_frame()

# Save the DataFrame as a CSV file, without the index column.
preds_df.to_csv('predictions.csv', index=False)

