# 🏠 부동산 실거래가 예측 대회 - KKH - MODEL
> - 학습, 예측, 평가를 진행한다.
> - kimkihong / helpotcreator@gmail.com / Upstage AI Lab 3기
> - 2024.07.16.화 ~ 2024.07.19.금 19:00

In [1]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(fname=r'font/NanumGothic.otf', name='NanumBarunGothic')
fm.fontManager.ttflist.insert(0, fe)
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'})
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import re
import math

# Model
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, CatBoost
from category_encoders import TargetEncoder, OneHotEncoder, MEstimateEncoder, OrdinalEncoder, CatBoostEncoder

# Optuna
from optuna import create_study
from optuna.integration import OptunaSearchCV
from optuna.samplers import TPESampler
import optuna

import eli5
from eli5.sklearn import PermutationImportance

# 모든 열을 표시하도록 설정
pd.set_option('display.max_columns', None)

In [2]:
train = pd.read_csv('data/kkh_train.csv', encoding='utf-8')
test = pd.read_csv('data/kkh_test.csv', encoding='utf-8')
loan = pd.read_csv('data/kkh_loan.csv', encoding='utf-8') # 서울시 가계대출규모

In [3]:
train['세대별주차대수'] = train['주차대수'] / train['k-전체세대수']
test['세대별주차대수'] = test['주차대수'] / test['k-전체세대수']

In [4]:
# '계약년월'과 '년월'이 일치하는 경우 '서울시_가계대출' 정보를 train에 병합
train = train.merge(loan, how='left', left_on='계약년월', right_on='년월')
# '년월' 열은 필요 없으므로 삭제
train.drop(columns=['년월'], inplace=True)

test = test.merge(loan, how='left', left_on='계약년월', right_on='년월')
test.drop(columns=['년월'], inplace=True)

In [5]:
# '동+아파트명' 기준으로 그룹화하여 'target' 피처의 평균 가격 계산
mean_target_per_group = train.groupby('동+아파트명')['target'].mean()

# 평균 가격이 200,000 이상인 그룹을 리스트로 생성
high_price_groups = mean_target_per_group[mean_target_per_group >= 200000].index.tolist()

# train 데이터에 'top_apt' 피처 추가
train['top_apt'] = train['동+아파트명'].apply(lambda x: 1 if x in high_price_groups else 0).astype('category')

# test 데이터에 'top_apt' 피처 추가
test['top_apt'] = test['동+아파트명'].apply(lambda x: 1 if x in high_price_groups else 0).astype('category')

In [6]:
train['국민평수'] = train['전용면적(㎡)'].apply(lambda x: 1 if x <= 90 else 0)
test['국민평수'] = test['전용면적(㎡)'].apply(lambda x: 1 if x <= 90 else 0)

In [7]:
# selected_feature_list = ['아파트명', '전용면적(㎡)', '계약년월', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'target', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
selected_feature_list = ['아파트명', '전용면적(㎡)', '서울시_가계대출', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'target', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
train = train[selected_feature_list]

In [8]:
train.to_csv('data/kkh_train_final.csv', index=False)

In [9]:
def impute_missing_values(df):
    # 수치형 피처와 범주형 피처 구분
    numeric_features = df.select_dtypes(include=[np.number]).columns
    categorical_features = df.select_dtypes(include=[object]).columns
    
    # 수치형 피처의 결측치를 평균값으로 대체
    # for feature in numeric_features:
    #     mean_value = df[feature].mean()
    #     df[feature].fillna(mean_value, inplace=True)
    
    # 수치형 피처의 결측치를 -999으로 대체
    for feature in numeric_features:
        df[feature].fillna(-999, inplace=True)
    
    # 범주형 피처의 결측치를 'Missing'으로 대체
    for feature in categorical_features:
        df[feature].fillna('Missing', inplace=True)
    
    return df

impute_missing_values(train)

Unnamed: 0,아파트명,전용면적(㎡),서울시_가계대출,층,건축년도,구,동,도로명,세대별주차대수,좌표X,좌표Y,target,top_apt,국민평수,1번째_가까운_역_이름,1번째_가까운_역_호선,1번째_가까운_역_거리,5분이하_역_개수,5분초과_10분이하_역_개수
0,개포6차우성,79.97,282438.7,3,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,124000,0,1,구룡,분당선,1187.672025,0,0
1,개포6차우성,79.97,282438.7,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,123500,0,1,구룡,분당선,1187.672025,0,0
2,개포6차우성,54.98,282438.7,5,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,91500,0,1,구룡,분당선,1187.672025,0,0
3,개포6차우성,79.97,283669.8,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,130000,0,1,구룡,분당선,1187.672025,0,0
4,개포6차우성,79.97,283669.8,2,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,117000,0,1,구룡,분당선,1187.672025,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,갈현현대,59.94,-999.0,11,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118818,갈현현대,59.94,-999.0,10,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118819,갈현현대,84.83,-999.0,20,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,28000,0,1,구산,6호선,1048.367837,0,0
1118820,갈현현대,84.83,-999.0,8,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,29000,0,1,구산,6호선,1048.367837,0,0


In [10]:
print((train.isnull().mean() * 100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
서울시_가계대출           0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
target             0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [11]:
train = train[~train['층'].isin([-2, 65])]

In [12]:
# train = train[train['계약년월'] >= 201801]

In [13]:
SEED = 1053682552

# TARGET 정의
TARGET = 'target'

# NUMERIC_COLS 계산: TARGET을 제외한 수치형 컬럼 리스트
NUMERIC_COLS = [col for col in train.select_dtypes(include=[float, int]).columns if col != TARGET]

# CAT_COLS 계산: NUMERIC_COLS를 제외한 나머지 컬럼 리스트
CAT_COLS = [col for col in train.columns if col not in NUMERIC_COLS + [TARGET]]

print(f'Target         --> {TARGET}')
print(f'Numeric Cols   --> {NUMERIC_COLS}')
print(f'Categoric Cols --> {CAT_COLS}')

Target         --> target
Numeric Cols   --> ['전용면적(㎡)', '서울시_가계대출', '층', '건축년도', '세대별주차대수', '좌표X', '좌표Y', '국민평수', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
Categoric Cols --> ['아파트명', '구', '동', '도로명', 'top_apt', '1번째_가까운_역_이름', '1번째_가까운_역_호선']


In [14]:
# # 데이터 준비
# X = train.drop(columns=['target'])
# y = train['target']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# # 범주형 열에 대한 레이블 인코딩
# def label_encode_columns(X, cols):
#     for col in cols:
#         le = LabelEncoder()
#         X[col] = le.fit_transform(X[col])
#     return X

# # 전처리 및 모델 파이프라인 설정
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), NUMERIC_COLS),
#         ('cat', FunctionTransformer(label_encode_columns, kw_args={'cols': CAT_COLS}), CAT_COLS)
#     ], remainder='passthrough')

# # 모델 정의, 학습, 예측, 평가
# results = {}
# kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

# model = RandomForestRegressor(random_state=SEED)
# pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
# cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
# rmse = np.sqrt(-np.mean(cv_scores))
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# r2 = r2_score(y_test, y_pred)
# results['RandomForest'] = {'R2 Score': r2, 'RMSE': rmse}

# # 결과 출력
# for model_name, metrics in results.items():
#     print(f"@@ model: {model_name}  /  R2: {metrics['R2 Score']:.4f}  /  RMSE: {metrics['RMSE']:.4f}")

In [15]:

# 데이터 준비
X = train.drop(columns=['target'])
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# 범주형 열에 대한 레이블 인코딩 함수 정의
def label_encode_columns(X, cols):
    for col in cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
    return X

# 전처리 및 모델 파이프라인 설정
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_COLS),
        ('cat', FunctionTransformer(label_encode_columns, kw_args={'cols': CAT_COLS}), CAT_COLS)
    ], remainder='passthrough')

kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

# Optuna 하이퍼파라미터 튜닝 목적 함수
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=SEED
    )
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-np.mean(cv_scores))
    return rmse

# Optuna 스터디 생성 및 최적의 하이퍼파라미터 탐색
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)

# 최적의 하이퍼파라미터 출력
print(f"Best trial: {study.best_trial.params}")

# 최적의 하이퍼파라미터로 모델 학습 및 평가
best_params = study.best_trial.params
model = RandomForestRegressor(**best_params, random_state=SEED)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(-np.mean(cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')))

results = {'RandomForest': {'R2 Score': r2, 'RMSE': rmse}}

# 결과 출력
for model_name, metrics in results.items():
    print(f"@@ model: {model_name}  /  R2: {metrics['R2 Score']:.4f}  /  RMSE: {metrics['RMSE']:.4f}")

[I 2024-07-18 16:52:43,649] A new study created in memory with name: no-name-81ec8045-2c41-4a2c-aa71-d2ba3b423629
[I 2024-07-18 17:12:17,946] Trial 0 finished with value: 8692.909397755162 and parameters: {'n_estimators': 114, 'max_depth': 20, 'min_samples_split': 19, 'min_samples_leaf': 10}. Best is trial 0 with value: 8692.909397755162.
[I 2024-07-18 17:23:01,823] Trial 1 finished with value: 7598.748728624904 and parameters: {'n_estimators': 54, 'max_depth': 24, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 1 with value: 7598.748728624904.
[W 2024-07-18 17:44:32,248] Trial 2 failed with parameters: {'n_estimators': 285, 'max_depth': 17, 'min_samples_split': 13, 'min_samples_leaf': 6} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\kkhzz\AppData\Local\pypoetry\Cache\virtualenvs\helpotcreator-zEEQU_7F-py3.10\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  

KeyboardInterrupt: 

In [None]:
import joblib
# 모델 저장
model_filename = f'random_forest_pipeline_rmse_{rmse:.4f}.pkl'
joblib.dump(pipeline, model_filename)
print(f"모델이 파일로 저장되었습니다: {model_filename}")

모델이 파일로 저장되었습니다: random_forest_pipeline_rmse_7307.5846.pkl


In [44]:
selected_feature_list = ['아파트명', '전용면적(㎡)', '서울시_가계대출', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
test = test[selected_feature_list]

impute_missing_values(test)

print((test.isnull().mean() * 100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
서울시_가계대출           0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [48]:
import joblib
import pandas as pd

# 모델 로드
model_filename = 'random_forest_pipeline_rmse_{:.4f}.pkl'.format(rmse)
pipeline = joblib.load(model_filename)

final_predictions = pipeline.predict(test)
test['target'] = final_predictions
output_filename = f'final_predictions_rmse_{rmse:.4f}.csv'
test.to_csv(output_filename, index=False)