# 🏠 부동산 실거래가 예측 대회 - KKH - MODEL
> - 학습, 예측, 평가를 진행한다.
> - kimkihong / helpotcreator@gmail.com / Upstage AI Lab 3기
> - 2024.07.16.화 ~ 2024.07.19.금 19:00

In [58]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(fname=r'font/NanumGothic.otf', name='NanumBarunGothic')
fm.fontManager.ttflist.insert(0, fe)
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'})
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import re
import math

# Model
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, cross_val_score
from sklearn.metrics import mean_squared_error, roc_auc_score, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, CatBoost
from category_encoders import TargetEncoder, OneHotEncoder, MEstimateEncoder, OrdinalEncoder, CatBoostEncoder

# Optuna
from optuna import create_study
from optuna.integration import OptunaSearchCV
from optuna.samplers import TPESampler
import optuna

import eli5
from eli5.sklearn import PermutationImportance

# 모든 열을 표시하도록 설정
pd.set_option('display.max_columns', None)

In [59]:
train = pd.read_csv('data/kkh_train.csv', encoding='utf-8')
test = pd.read_csv('data/kkh_test.csv', encoding='utf-8')
loan = pd.read_csv('data/kkh_loan.csv', encoding='utf-8') # 서울시 가계대출규모
finance = pd.read_csv('data/yoonjae_finance.csv', encoding='utf-8')
kb = pd.read_csv('data/jaemyung_kb.csv', encoding='utf-8')

In [60]:
train = pd.merge(train, finance, left_on='계약년월', right_on='date', how='left')
train = train.drop(columns=['date'])

train = pd.merge(train, kb, left_on='계약년월', right_on='date', how='left')
train = train.drop(columns=['date'])

In [61]:
train.sample(3)

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,도로명,해제사유발생일,등기신청일자,거래유형,중개사소재지,"k-단지분류(아파트,주상복합등등)",k-전화번호,k-팩스번호,단지소개기존clob,k-세대타입(분양형태),k-관리방식,k-복도유형,k-난방방식,k-전체동수,k-전체세대수,k-건설사(시공사),k-시행사,k-사용검사일-사용승인일,k-연면적,k-주거전용면적,k-관리비부과면적,k-전용면적별세대현황(60㎡이하),k-전용면적별세대현황(60㎡~85㎡이하),k-85㎡~135㎡이하,k-135㎡초과,k-홈페이지,k-등록일자,k-수정일자,고용보험관리번호,경비비관리형태,세대전기계약방법,청소비관리형태,건축면적,주차대수,기타/의무/임대/임의=1/2/3/4,단지승인일,사용허가여부,관리비 업로드,단지신청일,target,구,동,좌표X,좌표Y,동+아파트명,1번째_가까운_역_이름,1번째_가까운_역_호선,1번째_가까운_역_거리,5분이하_역_개수,5분초과_10분이하_역_개수,서울시_주택담보대출,kb부동산지수
318753,서울특별시 구로구 고척동,296,296.0,0.0,대우,84.98,202009,4,14,1999,고척로52길 53,,,-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,70500,구로구,고척동,126.855473,37.503033,고척동 대우,개봉,경인선,1009.09852,0,0,208142.8,93.2
523078,서울특별시 송파구 잠실동,22,22.0,0.0,리센츠,84.99,201610,2,14,2008,올림픽로 135,,,-,-,,24190035.0,24190039.0,4.0,분양,위탁관리,혼합식,지역난방,65.0,5563.0,"우방,삼성,대우,대림","우방,삼성,대우,대림",2008-07-31 00:00:00.0,935053.0,447330.0,581999.0,1113.0,3590.0,860.0,,www.movill.net/05502,,2024-07-11 18:13:07.0,90800835801.0,위탁,단일계약,위탁,33039.07,7451.0,의무,2018-01-29 10:37:55.0,Y,N,2013-03-07 09:46:36.0,120500,송파구,잠실동,127.086025,37.514366,잠실동 리센츠,잠실새내,2호선,298.137618,1,0,164916.1,124.8
552951,서울특별시 강동구 명일동,15,15.0,0.0,삼익그린2차,84.755,201512,8,4,1983,고덕로 210,,,-,-,,234276454.0,24293969.0,,분양,위탁관리,혼합식,중앙난방,18.0,2400.0,삼익그린맨션,삼익건설,1983-12-17 00:00:00.0,221766.0,176253.0,221766.0,765.0,1215.0,420.0,,samik2apt.com,,2024-07-11 19:46:09.0,19971201.0,위탁,단일계약,위탁,18127.0,2300.0,의무,2013-06-23 14:34:38.0,Y,N,2013-03-07 09:46:34.0,55500,강동구,명일동,127.148477,37.553301,명일동 삼익그린2차,명일,5호선,449.416177,0,2,151157.6,62.7


In [40]:
train['세대별주차대수'] = train['주차대수'] / train['k-전체세대수']
test['세대별주차대수'] = test['주차대수'] / test['k-전체세대수']

In [41]:
# '계약년월'과 '년월'이 일치하는 경우 '서울시_가계대출' 정보를 train에 병합
train = train.merge(loan, how='left', left_on='계약년월', right_on='년월')
# '년월' 열은 필요 없으므로 삭제
train.drop(columns=['년월'], inplace=True)

test = test.merge(loan, how='left', left_on='계약년월', right_on='년월')
test.drop(columns=['년월'], inplace=True)

In [42]:
# '동+아파트명' 기준으로 그룹화하여 'target' 피처의 평균 가격 계산
mean_target_per_group = train.groupby('동+아파트명')['target'].mean()

# 평균 가격이 200,000 이상인 그룹을 리스트로 생성
high_price_groups = mean_target_per_group[mean_target_per_group >= 200000].index.tolist()

# train 데이터에 'top_apt' 피처 추가
train['top_apt'] = train['동+아파트명'].apply(lambda x: 1 if x in high_price_groups else 0).astype('category')

# test 데이터에 'top_apt' 피처 추가
test['top_apt'] = test['동+아파트명'].apply(lambda x: 1 if x in high_price_groups else 0).astype('category')

In [43]:
train['국민평수'] = train['전용면적(㎡)'].apply(lambda x: 1 if x <= 90 else 0)
test['국민평수'] = test['전용면적(㎡)'].apply(lambda x: 1 if x <= 90 else 0)

In [None]:
train = train[train['계약년월'] >= 201801]

In [44]:
# selected_feature_list = ['아파트명', '전용면적(㎡)', '계약년월', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'target', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
selected_feature_list = ['아파트명', '전용면적(㎡)', '서울시_가계대출', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'target', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수', '서울시_주택담보대출', 'kb부동산지수']
train = train[selected_feature_list]

In [45]:
train.to_csv('data/kkh_train_final.csv', index=False)

In [46]:
def impute_missing_values(df):
    # 수치형 피처와 범주형 피처 구분
    numeric_features = df.select_dtypes(include=[np.number]).columns
    categorical_features = df.select_dtypes(include=[object]).columns
    
    # 수치형 피처의 결측치를 평균값으로 대체
    # for feature in numeric_features:
    #     mean_value = df[feature].mean()
    #     df[feature].fillna(mean_value, inplace=True)
    
    # 수치형 피처의 결측치를 -999으로 대체
    for feature in numeric_features:
        df[feature].fillna(-999, inplace=True)
    
    # 범주형 피처의 결측치를 'Missing'으로 대체
    for feature in categorical_features:
        df[feature].fillna('Missing', inplace=True)
    
    return df

impute_missing_values(train)

Unnamed: 0,아파트명,전용면적(㎡),계약년월,서울시_가계대출,층,건축년도,구,동,도로명,세대별주차대수,좌표X,좌표Y,target,top_apt,국민평수,1번째_가까운_역_이름,1번째_가까운_역_호선,1번째_가까운_역_거리,5분이하_역_개수,5분초과_10분이하_역_개수
0,개포6차우성,79.97,201712,282438.7,3,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,124000,0,1,구룡,분당선,1187.672025,0,0
1,개포6차우성,79.97,201712,282438.7,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,123500,0,1,구룡,분당선,1187.672025,0,0
2,개포6차우성,54.98,201712,282438.7,5,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,91500,0,1,구룡,분당선,1187.672025,0,0
3,개포6차우성,79.97,201801,283669.8,4,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,130000,0,1,구룡,분당선,1187.672025,0,0
4,개포6차우성,79.97,201801,283669.8,2,1987,강남구,개포동,언주로 3,0.970370,127.056859,37.476276,117000,0,1,구룡,분당선,1187.672025,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,갈현현대,59.94,200707,-999.0,11,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118818,갈현현대,59.94,200708,-999.0,10,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,20000,0,1,구산,6호선,1048.367837,0,0
1118819,갈현현대,84.83,200708,-999.0,20,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,28000,0,1,구산,6호선,1048.367837,0,0
1118820,갈현현대,84.83,200709,-999.0,8,1998,은평구,구산동,서오릉로21길 36,1.000000,126.905543,37.612989,29000,0,1,구산,6호선,1048.367837,0,0


In [47]:
print((train.isnull().mean() * 100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
계약년월               0.0
서울시_가계대출           0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
target             0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [48]:
train = train[~train['층'].isin([-2, 65])]

In [50]:
SEED = 1053682552

# TARGET 정의
TARGET = 'target'

# NUMERIC_COLS 계산: TARGET을 제외한 수치형 컬럼 리스트
NUMERIC_COLS = [col for col in train.select_dtypes(include=[float, int]).columns if col != TARGET]

# CAT_COLS 계산: NUMERIC_COLS를 제외한 나머지 컬럼 리스트
CAT_COLS = [col for col in train.columns if col not in NUMERIC_COLS + [TARGET]]

print(f'Target         --> {TARGET}')
print(f'Numeric Cols   --> {NUMERIC_COLS}')
print(f'Categoric Cols --> {CAT_COLS}')

Target         --> target
Numeric Cols   --> ['전용면적(㎡)', '계약년월', '서울시_가계대출', '층', '건축년도', '세대별주차대수', '좌표X', '좌표Y', '국민평수', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수']
Categoric Cols --> ['아파트명', '구', '동', '도로명', 'top_apt', '1번째_가까운_역_이름', '1번째_가까운_역_호선']


In [51]:
# 데이터 준비
X = train.drop(columns=['target'])
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# 범주형 열에 대한 레이블 인코딩
def label_encode_columns(X, cols):
    for col in cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
    return X

# 전처리 및 모델 파이프라인 설정
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_COLS),
        ('cat', FunctionTransformer(label_encode_columns, kw_args={'cols': CAT_COLS}), CAT_COLS)
    ], remainder='passthrough')

# 모델 정의, 학습, 예측, 평가
results = {}
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

model = RandomForestRegressor(random_state=SEED)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
rmse = np.sqrt(-np.mean(cv_scores))
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
results['RandomForest'] = {'R2 Score': r2, 'RMSE': rmse}

# 결과 출력
for model_name, metrics in results.items():
    print(f"@@ model: {model_name}  /  R2: {metrics['R2 Score']:.4f}  /  RMSE: {metrics['RMSE']:.4f}")

@@ model: RandomForest  /  R2: 0.9640  /  RMSE: 11487.2858


In [52]:

# # 데이터 준비
# X = train.drop(columns=['target'])
# y = train['target']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# # 범주형 열에 대한 레이블 인코딩 함수 정의
# def label_encode_columns(X, cols):
#     for col in cols:
#         le = LabelEncoder()
#         X[col] = le.fit_transform(X[col])
#     return X

# # 전처리 및 모델 파이프라인 설정
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), NUMERIC_COLS),
#         ('cat', FunctionTransformer(label_encode_columns, kw_args={'cols': CAT_COLS}), CAT_COLS)
#     ], remainder='passthrough')

# kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

# # Optuna 하이퍼파라미터 튜닝 목적 함수
# def objective(trial):
#     n_estimators = trial.suggest_int('n_estimators', 50, 500)
#     max_depth = trial.suggest_int('max_depth', 5, 30)
#     min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
#     min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

#     model = RandomForestRegressor(
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_samples_split=min_samples_split,
#         min_samples_leaf=min_samples_leaf,
#         random_state=SEED
#     )
#     pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
#     cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
#     rmse = np.sqrt(-np.mean(cv_scores))
#     return rmse

# # Optuna 스터디 생성 및 최적의 하이퍼파라미터 탐색
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=3)

# # 최적의 하이퍼파라미터 출력
# print(f"Best trial: {study.best_trial.params}")

# # 최적의 하이퍼파라미터로 모델 학습 및 평가
# best_params = study.best_trial.params
# model = RandomForestRegressor(**best_params, random_state=SEED)
# pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)
# r2 = r2_score(y_test, y_pred)
# rmse = np.sqrt(-np.mean(cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')))

# results = {'RandomForest': {'R2 Score': r2, 'RMSE': rmse}}

# # 결과 출력
# for model_name, metrics in results.items():
#     print(f"@@ model: {model_name}  /  R2: {metrics['R2 Score']:.4f}  /  RMSE: {metrics['RMSE']:.4f}")

In [25]:
import joblib
# 모델 저장
model_filename = f'random_forest_pipeline_rmse_{rmse:.4f}.pkl'
joblib.dump(pipeline, model_filename)
print(f"모델이 파일로 저장되었습니다: {model_filename}")

모델이 파일로 저장되었습니다: random_forest_pipeline_rmse_9423.2520.pkl


In [44]:
selected_feature_list = ['아파트명', '전용면적(㎡)', '서울시_가계대출', '층', '건축년도', '구', '동', '도로명', '세대별주차대수', '좌표X', '좌표Y', 'top_apt', '국민평수', '1번째_가까운_역_이름', '1번째_가까운_역_호선', '1번째_가까운_역_거리', '5분이하_역_개수', '5분초과_10분이하_역_개수', '서울시_주택담보대출', 'kb부동산지수']
test = test[selected_feature_list]

impute_missing_values(test)

print((test.isnull().mean() * 100).to_string())

아파트명               0.0
전용면적(㎡)            0.0
서울시_가계대출           0.0
층                  0.0
건축년도               0.0
구                  0.0
동                  0.0
도로명                0.0
세대별주차대수            0.0
좌표X                0.0
좌표Y                0.0
top_apt            0.0
국민평수               0.0
1번째_가까운_역_이름       0.0
1번째_가까운_역_호선       0.0
1번째_가까운_역_거리       0.0
5분이하_역_개수          0.0
5분초과_10분이하_역_개수    0.0


In [48]:
import joblib
import pandas as pd

# 모델 로드
model_filename = 'random_forest_pipeline_rmse_{:.4f}.pkl'.format(rmse)
pipeline = joblib.load(model_filename)

final_predictions = pipeline.predict(test)
test['target'] = final_predictions
output_filename = f'final_predictions_rmse_{rmse:.4f}.csv'
test.to_csv(output_filename, index=False)