# 🏠 부동산 실거래가 예측 대회 - KKH - MODEL
> - 학습, 예측, 평가를 진행한다.
> - kimkihong / helpotcreator@gmail.com / Upstage AI Lab 3기
> - 2024.07.16.화 ~ 2024.07.19.금 19:00

In [7]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the bundled font file with matplotlib under the family name
# 'NanumBarunGothic' (presumably so Korean text renders in plots).
# NOTE(review): the file is NanumGothic.otf but it is registered as
# 'NanumBarunGothic' — confirm the name mismatch is intentional.
fe = fm.FontEntry(fname=r'font/NanumGothic.otf', name='NanumBarunGothic')
# Insert the entry at the front of the font manager's list.
fm.fontManager.ttflist.insert(0, fe)
# Use that family (size 10) as the default for subsequent figures.
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'})
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import re
import math

# Model
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, CatBoost
from category_encoders import TargetEncoder, OneHotEncoder, MEstimateEncoder, OrdinalEncoder, CatBoostEncoder

# Optuna
from optuna import create_study
from optuna.integration import OptunaSearchCV
from optuna.samplers import TPESampler
import optuna

import eli5
from eli5.sklearn import PermutationImportance

# Show every column when a DataFrame is displayed (no column limit).
pd.set_option('display.max_columns', None)

In [8]:
# Load the train set, the test set and the auxiliary loan table, in that order.
train, test, loan = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        'data/kkh_train.csv',
        'data/kkh_test.csv',
        'data/kkh_loan.csv',  # Seoul household loan volume
    )
)

In [9]:
# Parking spaces per household: '주차대수' divided element-wise by 'k-전체세대수'.
train['세대별주차대수'] = train['주차대수'].div(train['k-전체세대수'])
test['세대별주차대수'] = test['주차대수'].div(test['k-전체세대수'])

In [None]:
# Attach the loan table to each transaction: rows whose '계약년월' equals the
# loan table's '년월' receive that month's loan columns (left join, so
# transactions without a matching month are kept with missing loan values).
#
# validate='many_to_one' makes pandas raise MergeError if '년월' is duplicated
# in `loan`; without it a duplicated month would silently multiply the
# matching rows and change the row count of train/test.
train = train.merge(
    loan, how='left', left_on='계약년월', right_on='년월', validate='many_to_one'
)
# The join key from the loan side is redundant after the merge.
train = train.drop(columns=['년월'])

test = test.merge(
    loan, how='left', left_on='계약년월', right_on='년월', validate='many_to_one'
)
test = test.drop(columns=['년월'])

In [10]:
# Mean 'target' per '동+아파트명' group, computed from the training data.
# NOTE(review): the means use the target of every training row, including rows
# that the later train_test_split puts in the hold-out set, so hold-out scores
# may be optimistic — confirm whether that is acceptable.
mean_target_per_group = train.groupby('동+아파트명')['target'].mean()

# Groups whose mean target is at least 200,000.
high_price_groups = mean_target_per_group[mean_target_per_group >= 200000].index.tolist()

# 'top_apt' is 1 for rows whose group is in high_price_groups and 0 otherwise,
# stored as a categorical. Series.isin performs a hashed membership test in one
# vectorised pass instead of scanning the Python list once per row.
train['top_apt'] = train['동+아파트명'].isin(high_price_groups).astype('int64').astype('category')

# Same flag for the test set, using the groups derived from train.
test['top_apt'] = test['동+아파트명'].isin(high_price_groups).astype('int64').astype('category')

In [11]:
# '국민평수' is 1 when '전용면적(㎡)' is at most 90, otherwise 0.
train['국민평수'] = train['전용면적(㎡)'].le(90).astype('int64')
test['국민평수'] = test['전용면적(㎡)'].le(90).astype('int64')

In [14]:
# Columns kept for modelling: the features plus the 'target' label.
# An earlier variant of this list used '계약년월' where '서울시_가계대출' appears now.
# NOTE(review): '계약년월' is therefore no longer in `train` after this cell; any
# later step that still reads it (e.g. a contract-date filter) must account for that.
selected_feature_list = [
    '아파트명',
    '전용면적(㎡)',
    '서울시_가계대출',
    '층',
    '건축년도',
    '구',
    '동',
    '도로명',
    '세대별주차대수',
    '좌표X',
    '좌표Y',
    'target',
    'top_apt',
    '국민평수',
    '1번째_가까운_역_이름',
    '1번째_가까운_역_호선',
    '1번째_가까운_역_거리',
    '5분이하_역_개수',
    '5분초과_10분이하_역_개수',
]
train = train.loc[:, selected_feature_list]

In [15]:
# Write the selected training columns to disk, without the index column.
train.to_csv('data/kkh_train_final.csv', index=False)

In [16]:
def impute_missing_values(df):
    """Fill missing values in ``df`` in place and return the same frame.

    Numeric columns (``np.number`` dtypes) get the sentinel ``-999``;
    object-dtype columns get the string ``'Missing'``. Columns of any other
    dtype are left untouched.

    Args:
        df: DataFrame to fill. It is modified in place.

    Returns:
        The same ``df`` object, to allow chaining or notebook display.
    """
    # Split columns by dtype.
    numeric_features = df.select_dtypes(include=[np.number]).columns
    categorical_features = df.select_dtypes(include=[object]).columns

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: that form mutates a temporary
    # Series and leaves ``df`` unchanged when pandas Copy-on-Write is active.
    for feature in numeric_features:
        df[feature] = df[feature].fillna(-999)

    for feature in categorical_features:
        df[feature] = df[feature].fillna('Missing')

    return df

# Run the missing-value fill on `train`; as the last expression of the cell,
# the returned frame is what the notebook displays below.
impute_missing_values(train)

Unnamed: 0,아파트명,전용면적(㎡),계약년월,층,건축년도,구,동,도로명,세대별주차대수,좌표X,좌표Y,target,top_apt,국민평수,1번째_가까운_역_이름,1번째_가까운_역_호선,1번째_가까운_역_거리,5분이하_역_개수,5분초과_10분이하_역_개수
0,개포6차우성,79.97,201712,3,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,124000,0,1,구룡,분당선,1187.672025,0,0
1,개포6차우성,79.97,201712,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,123500,0,1,구룡,분당선,1187.672025,0,0
2,개포6차우성,54.98,201712,5,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,91500,0,1,구룡,분당선,1187.672025,0,0
3,개포6차우성,79.97,201801,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,130000,0,1,구룡,분당선,1187.672025,0,0
4,개포6차우성,79.97,201801,2,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,117000,0,1,구룡,분당선,1187.672025,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,갈현현대,59.94,200707,11,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118818,갈현현대,59.94,200708,10,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118819,갈현현대,84.83,200708,20,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,28000,0,1,구산,6호선,1048.367837,0,0
1118820,갈현현대,84.83,200709,8,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,29000,0,1,구산,6호선,1048.367837,0,0


In [17]:
# Print, per column, the percentage of rows that are still null.
print(train.isnull().mean().mul(100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
계약년월               0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
target             0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [18]:
# Remove the rows whose floor ('층') is -2 or 65.
train = train.loc[~train['층'].isin([-2, 65])]

In [19]:
# Keep only contracts whose '계약년월' is 201801 or later.
if '계약년월' in train.columns:
    contract_month = train['계약년월']
else:
    # The current feature selection does not keep '계약년월', so indexing
    # `train` with it would raise KeyError. Recover the column from the source
    # CSV and align it on the row labels.
    # NOTE(review): this assumes the labels of `train` are still the row
    # positions of data/kkh_train.csv (i.e. the loan merge added no rows and
    # nothing reset the index since loading) — confirm.
    source_month = pd.read_csv(
        'data/kkh_train.csv', usecols=['계약년월'], encoding='utf-8'
    )['계약년월']
    if not train.index.isin(source_month.index).all():
        raise KeyError(
            "'계약년월' is missing from train and its rows cannot be "
            "re-aligned with data/kkh_train.csv"
        )
    contract_month = source_month.reindex(train.index)
train = train[contract_month >= 201801]

In [20]:
# Random seed reused by the split, the CV folds and the models.
SEED = 1053682552

# Name of the label column.
TARGET = 'target'

# Numeric feature columns: every float/int column of `train` except the label.
NUMERIC_COLS = (
    train.select_dtypes(include=[float, int])
    .columns
    .drop(TARGET, errors='ignore')
    .tolist()
)

# Categorical feature columns: whatever is neither numeric nor the label.
CAT_COLS = [
    col for col in train.columns
    if col != TARGET and col not in NUMERIC_COLS
]

print(f'Target         --> {TARGET}')
print(f'Numeric Cols   --> {NUMERIC_COLS}')
print(f'Categoric Cols --> {CAT_COLS}')

Target         --> target
Numeric Cols   --> ['전용면적(㎡)', '계약년월', '층', '건축년도', '세대별주차대수', '좌표X', '좌표Y', '국민평수', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
Categoric Cols --> ['아파트명', '구', '동', '도로명', 'top_apt', '1번째_가까운_역_이름', '1번째_가까운_역_호선']


In [22]:
# # 데이터 준비
# X = train.drop(columns=['target'])
# y = train['target']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# # 범주형 열에 대한 레이블 인코딩
# def label_encode_columns(X, cols):
#     for col in cols:
#         le = LabelEncoder()
#         X[col] = le.fit_transform(X[col])
#     return X

# # 전처리 및 모델 파이프라인 설정
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), NUMERIC_COLS),
#         ('cat', FunctionTransformer(label_encode_columns, kw_args={'cols': CAT_COLS}), CAT_COLS)
#     ], remainder='passthrough')

# # 옵티마이저를 위한 하이퍼파라미터 공간 설정
# # def objective_xgb(trial):
# #     model = XGBRegressor(
# #         n_estimators=trial.suggest_int('n_estimators', 300, 800),
# #         learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
# #         max_depth=trial.suggest_int('max_depth', 6, 20),
# #         alpha=trial.suggest_loguniform('alpha', 1e-4, 1e2),
# #         random_state=SEED
# #     )
# #     pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
# #     kf = KFold(n_splits=3, shuffle=True, random_state=SEED)
# #     cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
# #     return -np.mean(cv_scores)
# def objective_xgb(trial):
#     model = XGBRegressor(
#         n_estimators=trial.suggest_int('n_estimators', 300, 1000),
#         learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
#         max_depth=trial.suggest_int('max_depth', 5, 24),
#         min_child_weight=trial.suggest_loguniform('min_child_weight', 1e-3, 1e2),
#         subsample=trial.suggest_uniform('subsample', 0.5, 1.0),
#         colsample_bytree=trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
#         gamma=trial.suggest_loguniform('gamma', 1e-4, 1e1),
#         alpha=trial.suggest_loguniform('alpha', 1e-4, 1e2),
#         random_state=SEED
#     )
#     pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
#     kf = KFold(n_splits=2, shuffle=True, random_state=SEED)
#     cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
#     return -np.mean(cv_scores)

# def objective_cat(trial):
#     model = CatBoostRegressor(
#         iterations=trial.suggest_int('iterations', 300, 800),
#         learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
#         depth=trial.suggest_int('depth', 6, 20),
#         random_state=SEED,
#         verbose=0
#     )
#     pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
#     kf = KFold(n_splits=3, shuffle=True, random_state=SEED)
#     cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
#     return -np.mean(cv_scores)

# # 옵티마이저 설정 및 최적화
# def optimize_model(objective_function, n_trials):
#     study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=SEED))
#     study.optimize(objective_function, n_trials=n_trials)
#     return study.best_params

# # 각 모델의 하이퍼파라미터 최적화
# n_trials = 10

# xgb_best_params = optimize_model(objective_xgb, n_trials)
# print("@@ XGBoost best params:", xgb_best_params)

# # cat_best_params = optimize_model(objective_cat, n_trials)
# # print("@@ CatBoost best params:", cat_best_params)

# # 모델 정의, 학습, 예측, 평가
# results = {}
# kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

# for model_name, model_class, best_params in [
#     ('XGBoost', XGBRegressor, xgb_best_params),
#     # ('CatBoost', CatBoostRegressor, cat_best_params)
# ]:
#     model = model_class(**best_params)
#     pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
#     cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
#     rmse = np.sqrt(-np.mean(cv_scores))
#     pipeline.fit(X_train, y_train)
#     y_pred = pipeline.predict(X_test)
#     r2 = r2_score(y_test, y_pred)
#     results[model_name] = {'R2 Score': r2, 'RMSE': rmse}

# # 결과 출력
# for model_name, metrics in results.items():
#     print(f"@@ model: {model_name}  /  R2: {metrics['R2 Score']:.4f}  /  RMSE: {metrics['RMSE']:.4f}")

[I 2024-07-18 12:59:15,576] A new study created in memory with name: no-name-cd58740c-8904-422e-8ad4-7f121dd2e580
[I 2024-07-18 13:00:20,648] Trial 0 finished with value: 2254905188.3463016 and parameters: {'n_estimators': 828, 'learning_rate': 0.0003798782534911095, 'max_depth': 13, 'min_child_weight': 0.8246544242693233, 'subsample': 0.9927612315491489, 'colsample_bytree': 0.8526943846115274, 'gamma': 0.8904275349929992, 'alpha': 0.5057309506977974}. Best is trial 0 with value: 2254905188.3463016.
[I 2024-07-18 13:00:28,515] Trial 1 finished with value: 1303550592.6085694 and parameters: {'n_estimators': 928, 'learning_rate': 0.0011874581171232713, 'max_depth': 5, 'min_child_weight': 3.838365619480914, 'subsample': 0.5795192745561082, 'colsample_bytree': 0.9808270454430227, 'gamma': 0.3956089060022762, 'alpha': 0.0005019860496167036}. Best is trial 1 with value: 1303550592.6085694.
[I 2024-07-18 13:00:41,176] Trial 2 finished with value: 107839029.10775214 and parameters: {'n_estimat

@@ XGBoost best params: {'n_estimators': 565, 'learning_rate': 0.04223731676206321, 'max_depth': 17, 'min_child_weight': 0.0050395281289352944, 'subsample': 0.6748193915025716, 'colsample_bytree': 0.575666250505291, 'gamma': 0.00025338839768089717, 'alpha': 0.06392606729577881}
@@ model: XGBoost  /  R2: 0.9639  /  RMSE: 11059.5517


In [None]:
# Separate the label from the features, then hold out 20% of the rows.
y = train['target']
X = train.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Label encoding for categorical columns
def label_encode_columns(X, cols):
    """Replace each column in ``cols`` of ``X`` with integer labels.

    A new LabelEncoder is fitted on every call, so the integer given to a
    category depends only on the values present in that call's ``X[col]``.

    Args:
        X: DataFrame holding the columns; it is modified in place.
        cols: iterable of column names to encode.

    Returns:
        The same ``X`` with the listed columns overwritten by their codes.
    """
    for name in cols:
        X[name] = LabelEncoder().fit_transform(X[name])
    return X

# Preprocessing step of the model pipeline: standardise the numeric columns and
# integer-code the categorical columns.
#
# The categorical step is a fitted OrdinalEncoder rather than a
# FunctionTransformer around label_encode_columns: that helper refits a
# LabelEncoder on whatever frame it is handed, so the same category could be
# given different codes when fitting and when predicting. OrdinalEncoder learns
# the category -> code mapping once in fit() and reuses it in transform();
# categories not seen during fit are encoded as -1.
#
# Imported under an alias because `OrdinalEncoder` in this file refers to the
# category_encoders class of the same name.
from sklearn.preprocessing import OrdinalEncoder as SklearnOrdinalEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_COLS),
        (
            'cat',
            SklearnOrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            CAT_COLS,
        ),
    ], remainder='passthrough')

# Model definition, training, prediction and evaluation
results = {}
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

model = RandomForestRegressor(random_state=SEED)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
# 3-fold cross-validation on the training split; the reported RMSE is the
# square root of the mean fold MSE (scores are negated MSE, hence the minus).
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
rmse = np.sqrt(-np.mean(cv_scores))
# Refit on the whole training split and compute R2 on the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
results['RandomForest'] = {'R2 Score': r2, 'RMSE': rmse}

# Print the results. The loop variable is deliberately not named `metrics`:
# that name is bound to the sklearn.metrics module imported at the top of the
# file, and a module-level loop variable would overwrite it.
for model_name, model_scores in results.items():
    print(f"@@ model: {model_name}  /  R2: {model_scores['R2 Score']:.4f}  /  RMSE: {model_scores['RMSE']:.4f}")

In [24]:
# print(xgb_best_params)
# print('======================')
# print(cat_best_params)

{'n_estimators': 565, 'learning_rate': 0.04223731676206321, 'max_depth': 17, 'min_child_weight': 0.0050395281289352944, 'subsample': 0.6748193915025716, 'colsample_bytree': 0.575666250505291, 'gamma': 0.00025338839768089717, 'alpha': 0.06392606729577881}


In [26]:
# Feature columns kept for the test set (same list as for train, minus 'target').
# An earlier variant of this list used '계약년월' where '서울시_가계대출' appears now.
selected_feature_list = [
    '아파트명',
    '전용면적(㎡)',
    '서울시_가계대출',
    '층',
    '건축년도',
    '구',
    '동',
    '도로명',
    '세대별주차대수',
    '좌표X',
    '좌표Y',
    'top_apt',
    '국민평수',
    '1번째_가까운_역_이름',
    '1번째_가까운_역_호선',
    '1번째_가까운_역_거리',
    '5분이하_역_개수',
    '5분초과_10분이하_역_개수',
]
test = test.loc[:, selected_feature_list]

In [27]:
# Run the same missing-value fill on the test set (the helper returns the
# frame it was given), then print the percentage of nulls left per column.
test = impute_missing_values(test)

print(test.isnull().mean().mul(100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
계약년월               0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [28]:
# Final model definition and training
def train_and_predict(model_class, best_params):
    """Fit ``model_class(**best_params)`` on all training data and predict ``test``.

    Reads the module-level ``preprocessor``, ``X``, ``y`` and ``test``.

    Args:
        model_class: estimator class, instantiated as ``model_class(**best_params)``.
        best_params: dict of keyword arguments for the estimator.

    Returns:
        Integer array with one prediction per row of ``test``, rounded to the
        nearest integer.
    """
    model = model_class(**best_params)
    # Clone the preprocessor so fitting here does not refit the instance that
    # is shared with other pipelines built from the same global object.
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model)])
    pipeline.fit(X, y)
    predictions = pipeline.predict(test)
    # Round before casting: a bare astype(int) truncates toward zero.
    return np.rint(predictions).astype(int)

# Predict and save
# The Optuna search cell that assigns xgb_best_params is commented out in this
# notebook, so on a fresh run the name would be undefined. Fall back to the
# best parameters that search logged; a value set by an earlier cell wins.
if 'xgb_best_params' not in globals():
    xgb_best_params = {
        'n_estimators': 565,
        'learning_rate': 0.04223731676206321,
        'max_depth': 17,
        'min_child_weight': 0.0050395281289352944,
        'subsample': 0.6748193915025716,
        'colsample_bytree': 0.575666250505291,
        'gamma': 0.00025338839768089717,
        'alpha': 0.06392606729577881,
    }

print("Predicting using XGBoost...")
xgb_predictions = train_and_predict(XGBRegressor, xgb_best_params)
submission_xgb = pd.DataFrame({
    'target': xgb_predictions
})
# NOTE(review): the file name hard-codes the RMSE printed by the earlier XGBoost
# run (11059.5517); it is not recomputed here — confirm it still applies.
submission_xgb.to_csv('test_xgb_11059.5517.csv', index=False)

# print("Predicting using CatBoost...")
# cat_predictions = train_and_predict(CatBoostRegressor, cat_best_params)
# submission_cat = pd.DataFrame({
#     'target': cat_predictions
# })
# submission_cat.to_csv(f'test_cat_13919.0407.csv', index=False)

Predicting using XGBoost...
