# 🏠 부동산 실거래가 예측 대회 - KKH - MODEL
> - 학습, 예측, 평가를 진행한다.
> - kimkihong / helpotcreator@gmail.com / Upstage AI Lab 3기
> - 2024.07.16.화 ~ 2024.07.19.금 19:00

In [18]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(fname=r'font/NanumGothic.otf', name='NanumBarunGothic')
fm.fontManager.ttflist.insert(0, fe)
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'})
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import re
import math

# Model
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, CatBoost
from category_encoders import TargetEncoder, OneHotEncoder, MEstimateEncoder, OrdinalEncoder, CatBoostEncoder

# Optuna
from optuna import create_study
from optuna.integration import OptunaSearchCV
from optuna.samplers import TPESampler
import optuna

import eli5
from eli5.sklearn import PermutationImportance

# 모든 열을 표시하도록 설정
pd.set_option('display.max_columns', None)

In [19]:
train = pd.read_csv('data/kkh_train.csv', encoding='utf-8')
test = pd.read_csv('data/kkh_test.csv', encoding='utf-8')

In [20]:
train['세대별주차대수'] = train['주차대수'] / train['k-전체세대수']
test['세대별주차대수'] = test['주차대수'] / test['k-전체세대수']

In [21]:
# '동+아파트명' 기준으로 그룹화하여 'target' 피처의 평균 가격 계산
mean_target_per_group = train.groupby('동+아파트명')['target'].mean()

# 평균 가격이 200,000 이상인 그룹을 리스트로 생성
high_price_groups = mean_target_per_group[mean_target_per_group >= 200000].index.tolist()

# train 데이터에 'top_apt' 피처 추가
train['top_apt'] = train['동+아파트명'].apply(lambda x: 1 if x in high_price_groups else 0).astype('category')

# test 데이터에 'top_apt' 피처 추가
test['top_apt'] = test['동+아파트명'].apply(lambda x: 1 if x in high_price_groups else 0).astype('category')

In [22]:
train['국민평수'] = train['전용면적(㎡)'].apply(lambda x: 1 if x <= 90 else 0)
test['국민평수'] = test['전용면적(㎡)'].apply(lambda x: 1 if x <= 90 else 0)

In [23]:
from scipy.spatial import cKDTree

def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # 지구의 반경 (km)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    return distance * 1000  # 미터 단위로 변환

def walking_time(distance):
    return distance / (4000/60)  # 4km/h의 걷는 속도 가정

def add_subway_features(apartment_df, subway_df):
    apartment_coords = apartment_df[['좌표Y', '좌표X']].values
    station_coords = subway_df[['위도', '경도']].values

    # 가장 가까운 3개의 역 찾기
    tree = cKDTree(station_coords)
    distances, indices = tree.query(apartment_coords, k=3)

    for i in range(1):
        apartment_df[f'{i+1}번째_가까운_역_이름'] = subway_df.loc[indices[:, i], '역사명'].values
        apartment_df[f'{i+1}번째_가까운_역_호선'] = subway_df.loc[indices[:, i], '호선'].values
        apartment_df[f'{i+1}번째_가까운_역_거리'] = np.array([haversine_distance(ac[0], ac[1], station_coords[idx][0], station_coords[idx][1]) 
                                                   for ac, idx in zip(apartment_coords, indices[:, i])])
        # apartment_df[f'{i+1}번째_가까운_역_도보시간'] = walking_time(apartment_df[f'{i+1}번째_가까운_역_거리'])

    # 시간대별 역 개수 계산
    def count_stations_in_time_range(min_time, max_time):
        min_dist = min_time * (4000/60)
        max_dist = max_time * (4000/60)
        return np.array([np.sum((min_dist < haversine_distance(c[0], c[1], station_coords[:, 0], station_coords[:, 1])) & 
                                (haversine_distance(c[0], c[1], station_coords[:, 0], station_coords[:, 1]) <= max_dist)) 
                         for c in apartment_coords])

    apartment_df['5분이하_역_개수'] = count_stations_in_time_range(0, 5)
    apartment_df['5분초과_10분이하_역_개수'] = count_stations_in_time_range(5, 10)
    # apartment_df['10분초과_15분이하_역_개수'] = count_stations_in_time_range(10, 15)
    # apartment_df['15분초과_20분이하_역_개수'] = count_stations_in_time_range(15, 20)

    return apartment_df

train = add_subway_features(train, subway)
train['1번째_가까운_역_이름'] = train['1번째_가까운_역_이름'].astype('category')
train['1번째_가까운_역_호선'] = train['1번째_가까운_역_호선'].astype('category')
train['1번째_가까운_역_이름'] = train['1번째_가까운_역_이름'].astype('category')

In [24]:
test = add_subway_features(test, subway)
test['1번째_가까운_역_이름'] = test['1번째_가까운_역_이름'].astype('category')
test['1번째_가까운_역_호선'] = test['1번째_가까운_역_호선'].astype('category')
test['1번째_가까운_역_이름'] = test['1번째_가까운_역_이름'].astype('category')

In [25]:
selected_feature_list = ['아파트명', '전용면적(㎡)', '계약년월', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'target', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
train = train[selected_feature_list]

In [26]:
train.to_csv('data/kkh_train_final.csv', index=False)

In [27]:
def impute_missing_values(df):
    # 수치형 피처와 범주형 피처 구분
    numeric_features = df.select_dtypes(include=[np.number]).columns
    categorical_features = df.select_dtypes(include=[object]).columns
    
    # 수치형 피처의 결측치를 평균값으로 대체
    # for feature in numeric_features:
    #     mean_value = df[feature].mean()
    #     df[feature].fillna(mean_value, inplace=True)
    
    # 수치형 피처의 결측치를 -999으로 대체
    for feature in numeric_features:
        df[feature].fillna(-999, inplace=True)
    
    # 범주형 피처의 결측치를 'Missing'으로 대체
    for feature in categorical_features:
        df[feature].fillna('Missing', inplace=True)
    
    return df

impute_missing_values(train)

Unnamed: 0,아파트명,전용면적(㎡),계약년월,층,건축년도,구,동,도로명,세대별주차대수,좌표X,좌표Y,target,top_apt,국민평수,1번째_가까운_역_이름,1번째_가까운_역_호선,1번째_가까운_역_거리,5분이하_역_개수,5분초과_10분이하_역_개수
0,개포6차우성,79.97,201712,3,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,124000,0,1,구룡,분당선,1187.672025,0,0
1,개포6차우성,79.97,201712,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,123500,0,1,구룡,분당선,1187.672025,0,0
2,개포6차우성,54.98,201712,5,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,91500,0,1,구룡,분당선,1187.672025,0,0
3,개포6차우성,79.97,201801,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,130000,0,1,구룡,분당선,1187.672025,0,0
4,개포6차우성,79.97,201801,2,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,117000,0,1,구룡,분당선,1187.672025,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,갈현현대,59.94,200707,11,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118818,갈현현대,59.94,200708,10,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118819,갈현현대,84.83,200708,20,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,28000,0,1,구산,6호선,1048.367837,0,0
1118820,갈현현대,84.83,200709,8,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,29000,0,1,구산,6호선,1048.367837,0,0


In [28]:
print((train.isnull().mean() * 100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
계약년월               0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
target             0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [29]:
train = train[~train['층'].isin([-2, 65])]

In [30]:
train = train[train['계약년월'] >= 201801]

In [31]:
SEED = 1053682552

# TARGET 정의
TARGET = 'target'

# NUMERIC_COLS 계산: TARGET을 제외한 수치형 컬럼 리스트
NUMERIC_COLS = [col for col in train.select_dtypes(include=[float, int]).columns if col != TARGET]

# CAT_COLS 계산: NUMERIC_COLS를 제외한 나머지 컬럼 리스트
CAT_COLS = [col for col in train.columns if col not in NUMERIC_COLS + [TARGET]]

print(f'Target         --> {TARGET}')
print(f'Numeric Cols   --> {NUMERIC_COLS}')
print(f'Categoric Cols --> {CAT_COLS}')

Target         --> target
Numeric Cols   --> ['전용면적(㎡)', '계약년월', '층', '건축년도', '세대별주차대수', '좌표X', '좌표Y', '국민평수', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
Categoric Cols --> ['아파트명', '구', '동', '도로명', 'top_apt', '1번째_가까운_역_이름', '1번째_가까운_역_호선']


In [32]:
# 데이터 준비
X = train.drop(columns=['target'])
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# 범주형 열에 대한 레이블 인코딩
def label_encode_columns(X, cols):
    for col in cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
    return X

# 전처리 및 모델 파이프라인 설정
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_COLS),
        ('cat', FunctionTransformer(label_encode_columns, kw_args={'cols': CAT_COLS}), CAT_COLS)
    ], remainder='passthrough')

# 옵티마이저를 위한 하이퍼파라미터 공간 설정
def objective_xgb(trial):
    model = XGBRegressor(
        n_estimators=trial.suggest_int('n_estimators', 300, 800),
        learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        max_depth=trial.suggest_int('max_depth', 3, 6),
        alpha=trial.suggest_loguniform('alpha', 1e-4, 1e2),
        random_state=SEED
    )
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    return -np.mean(cv_scores)

def objective_cat(trial):
    model = CatBoostRegressor(
        iterations=trial.suggest_int('iterations', 300, 800),
        learning_rate=trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        depth=trial.suggest_int('depth', 3, 6),
        random_state=SEED,
        verbose=0
    )
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    return -np.mean(cv_scores)

# 옵티마이저 설정 및 최적화
def optimize_model(objective_function, n_trials):
    study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=SEED))
    study.optimize(objective_function, n_trials=n_trials)
    return study.best_params

# 각 모델의 하이퍼파라미터 최적화
n_trials = 7

xgb_best_params = optimize_model(objective_xgb, n_trials)
print("@@ XGBoost best params:", xgb_best_params)

cat_best_params = optimize_model(objective_cat, n_trials)
print("@@ CatBoost best params:", cat_best_params)

# 모델 정의, 학습, 예측, 평가
results = {}
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

for model_name, model_class, best_params in [
    ('XGBoost', XGBRegressor, xgb_best_params),
    ('CatBoost', CatBoostRegressor, cat_best_params)
]:
    model = model_class(**best_params)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-np.mean(cv_scores))
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {'R2 Score': r2, 'RMSE': rmse}

# 결과 출력
for model_name, metrics in results.items():
    print(f"@@ model: {model_name}  /  R2: {metrics['R2 Score']:.4f}  /  RMSE: {metrics['RMSE']:.4f}")

[I 2024-07-17 23:43:29,508] A new study created in memory with name: no-name-877f9ddf-22c2-40e6-9b83-dd0642aaa09b
[I 2024-07-17 23:43:44,785] Trial 0 finished with value: 2930185376.5570974 and parameters: {'n_estimators': 677, 'learning_rate': 0.0003798782534911095, 'max_depth': 4, 'alpha': 0.31588310039776357}. Best is trial 0 with value: 2930185376.5570974.
[I 2024-07-17 23:44:07,541] Trial 1 finished with value: 215164584.68116483 and parameters: {'n_estimators': 793, 'learning_rate': 0.01306671165805956, 'max_depth': 6, 'alpha': 0.5057309506977974}. Best is trial 1 with value: 215164584.68116483.
[I 2024-07-17 23:44:20,309] Trial 2 finished with value: 2021749777.8949318 and parameters: {'n_estimators': 748, 'learning_rate': 0.0011874581171232713, 'max_depth': 3, 'alpha': 1.9997495547370978}. Best is trial 1 with value: 215164584.68116483.
[I 2024-07-17 23:44:29,981] Trial 3 finished with value: 188245425.18207294 and parameters: {'n_estimators': 379, 'learning_rate': 0.0767294831

@@ XGBoost best params: {'n_estimators': 623, 'learning_rate': 0.07901694526374901, 'max_depth': 6, 'alpha': 97.31612494508994}


[I 2024-07-17 23:45:48,288] Trial 0 finished with value: 3000649466.389779 and parameters: {'iterations': 677, 'learning_rate': 0.0003798782534911095, 'depth': 4}. Best is trial 0 with value: 3000649466.389779.
[I 2024-07-17 23:46:28,911] Trial 1 finished with value: 204552492.7957434 and parameters: {'iterations': 592, 'learning_rate': 0.09048308290706156, 'depth': 5}. Best is trial 1 with value: 204552492.7957434.
[I 2024-07-17 23:47:20,655] Trial 2 finished with value: 465065497.1607065 and parameters: {'iterations': 695, 'learning_rate': 0.007111476293272711, 'depth': 6}. Best is trial 1 with value: 204552492.7957434.
[I 2024-07-17 23:47:53,706] Trial 3 finished with value: 3632773163.734256 and parameters: {'iterations': 479, 'learning_rate': 0.0001029942855626323, 'depth': 5}. Best is trial 1 with value: 204552492.7957434.
[I 2024-07-17 23:48:19,205] Trial 4 finished with value: 248658889.64907885 and parameters: {'iterations': 379, 'learning_rate': 0.07672948312461544, 'depth': 

@@ CatBoost best params: {'iterations': 592, 'learning_rate': 0.09048308290706156, 'depth': 5}
0:	learn: 58493.4581485	total: 11.1ms	remaining: 6.54s
1:	learn: 55609.4345105	total: 22.1ms	remaining: 6.52s
2:	learn: 53048.5080491	total: 30.8ms	remaining: 6.05s
3:	learn: 50757.0877587	total: 39.8ms	remaining: 5.85s
4:	learn: 48660.5722871	total: 50.5ms	remaining: 5.93s
5:	learn: 46799.1036061	total: 60.2ms	remaining: 5.88s
6:	learn: 45127.9828875	total: 70.3ms	remaining: 5.87s
7:	learn: 43627.2868172	total: 79.7ms	remaining: 5.82s
8:	learn: 42324.8332367	total: 89.1ms	remaining: 5.77s
9:	learn: 41094.3016636	total: 99.6ms	remaining: 5.79s
10:	learn: 39970.2828819	total: 108ms	remaining: 5.71s
11:	learn: 38743.5387164	total: 119ms	remaining: 5.75s
12:	learn: 37662.1678404	total: 130ms	remaining: 5.77s
13:	learn: 36677.1246170	total: 140ms	remaining: 5.78s
14:	learn: 35789.2534724	total: 149ms	remaining: 5.74s
15:	learn: 35054.7479823	total: 158ms	remaining: 5.7s
16:	learn: 34339.3599097	t

In [33]:
print(xgb_best_params)
print('======================')
print(cat_best_params)

{'n_estimators': 623, 'learning_rate': 0.07901694526374901, 'max_depth': 6, 'alpha': 97.31612494508994}
{'iterations': 592, 'learning_rate': 0.09048308290706156, 'depth': 5}


In [34]:
selected_feature_list = ['아파트명', '전용면적(㎡)', '계약년월', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
test = test[selected_feature_list]

In [35]:
impute_missing_values(test)

print((test.isnull().mean() * 100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
계약년월               0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [36]:
# 최종 모델 정의 및 학습
def train_and_predict(model_class, best_params):
    model = model_class(**best_params)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipeline.fit(X, y)
    predictions = pipeline.predict(test)
    return predictions.astype(int)  # 예측값을 int로 변환

# 예측 및 저장
print("Predicting using XGBoost...")
xgb_predictions = train_and_predict(XGBRegressor, xgb_best_params)
submission_xgb = pd.DataFrame({
    'target': xgb_predictions
})
submission_xgb.to_csv(f'test_xgb_11346.2787.csv', index=False)

print("Predicting using CatBoost...")
cat_predictions = train_and_predict(CatBoostRegressor, cat_best_params)
submission_cat = pd.DataFrame({
    'target': cat_predictions
})
submission_cat.to_csv(f'test_cat_13919.0407.csv', index=False)

Predicting using XGBoost...
Predicting using CatBoost...
0:	learn: 58666.2447197	total: 22ms	remaining: 13s
1:	learn: 55777.4992817	total: 43.8ms	remaining: 12.9s
2:	learn: 53194.1159611	total: 63ms	remaining: 12.4s
3:	learn: 50885.3535172	total: 83.9ms	remaining: 12.3s
4:	learn: 48867.1731807	total: 107ms	remaining: 12.5s
5:	learn: 46911.6612472	total: 130ms	remaining: 12.7s
6:	learn: 45251.0846458	total: 149ms	remaining: 12.5s
7:	learn: 43657.9872697	total: 172ms	remaining: 12.5s
8:	learn: 42218.3261898	total: 196ms	remaining: 12.7s
9:	learn: 40899.2211205	total: 217ms	remaining: 12.6s
10:	learn: 39682.2382856	total: 239ms	remaining: 12.6s
11:	learn: 38567.0632720	total: 260ms	remaining: 12.5s
12:	learn: 37520.3144218	total: 282ms	remaining: 12.5s
13:	learn: 36601.9034451	total: 301ms	remaining: 12.4s
14:	learn: 35710.1149239	total: 321ms	remaining: 12.4s
15:	learn: 34802.9462286	total: 341ms	remaining: 12.3s
16:	learn: 34070.8387426	total: 363ms	remaining: 12.3s
17:	learn: 33351.410