# K-Fold를 사용하여 모델 생성

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import gc

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import joblib

# 맥에서 글씨 깨짐 방지
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False



In [2]:
df = pd.read_csv('../data/preprocessed/34_apart_price_mean.csv')

  df = pd.read_csv('../data/preprocessed/34_apart_price_mean.csv')


In [3]:
df.shape

(1128094, 105)

In [4]:
# 일부 feature 제거 - 기준은 내마음대로

df.drop(['해제사유발생일', '등기신청일자', '거래유형', '중개사소재지', 'k-단지분류(아파트,주상복합등등)', 'k-전화번호',
         'k-팩스번호', '단지소개기존clob', 'k-세대타입(분양형태)', 'k-복도유형', 'k-난방방식', 'k-전체동수', 'k-전체세대수',
         'k-사용검사일-사용승인일', 'k-관리비부과면적', 'k-전용면적별세대현황(60이하)', 'k-전용면적별세대현황(60~85이하)',
         'k-85~135이하', 'k-135초과', 'k-홈페이지', 'k-등록일자', 'k-수정일자', '고용보험관리번호', '경비비관리형태',
         '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드', '단지신청일'], axis=1, inplace=True)

In [6]:
df.columns

Index(['시군구', '번지', '본번', '부번', '아파트명', '전용면적', '계약년월', '계약일', '층', '건축년도',
       '도로명', 'k-관리방식', 'k-건설사(시공사)', 'k-시행사', 'k-연면적', 'k-주거전용면적', '세대전기계약방법',
       '청소비관리형태', '건축면적', '주차대수', '좌표X', '좌표Y', 'target', 'is_test', '시군구 번지',
       '좌표X,좌표Y', '가장 가까운 거리', 'index', '역사_ID', '역사명', '호선', '위도', '경도',
       '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '가장 가까운 버스 정류장 index',
       '가장 가까운 버스 정류장 노드 ID', '가장 가까운 버스 정류소번호', '가장 가까운 버스 정류소명',
       '가장 가까운 버스 정류소 타입', '가장 가까운 버스 정류장 X좌표', '가장 가까운 버스 정류장 Y좌표',
       '인근 버스 정류장 개수', '계약년', '계약월', 'GDP', '한국은행 기준금리', '기대 인플레이션', '지가지수',
       '아파트 인허가', '미분양', '거래량', '건설사 랭킹', '구', '동', '구별 지가지수', '공시지가 평균',
       '매수우위지수', '건물나이', '30년이상50년이하', '구매력지수', '거래활발지수', '매매가격 지수 증감률',
       '매매 대비 전세가격 비율', '아파트 카테고리', '지하철 카테고리', '구 카테고리', '건설사 카테고리',
       '가장 가까운 다리와의 거리', '가장 가까운 다리 index', '가장 가까운 다리', '인근 한강다리 개수',
       '인근 다리 개수 1개이상 3개 이하', '학군', '매매가격 지수', '아파트 평균 가격'],
      dtype='object')

In [7]:
# 추가로 제거가 필요한 feature 제거
df.drop(['시군구', '번지', '본번', '부번', '아파트명', '도로명', 'k-관리방식', 'k-건설사(시공사)', 'k-시행사', 'k-연면적', 'k-주거전용면적',
         '세대전기계약방법', '청소비관리형태', '건축면적', '주차대수', '시군구 번지', '좌표X,좌표Y', 'index', '역사_ID', '역사명', '위도', '경도',
         '가장 가까운 버스 정류장 index', '가장 가까운 버스 정류장 노드 ID', '가장 가까운 버스 정류소번호', '가장 가까운 버스 정류소명',
         '가장 가까운 버스 정류소 타입', '가장 가까운 버스 정류장 X좌표', '가장 가까운 버스 정류장 Y좌표', '동', 
         '구', '호선'], axis=1, inplace=True)

In [8]:
df.columns

Index(['전용면적', '계약년월', '계약일', '층', '건축년도', '좌표X', '좌표Y', 'target', 'is_test',
       '가장 가까운 거리', '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', '계약년',
       '계약월', 'GDP', '한국은행 기준금리', '기대 인플레이션', '지가지수', '아파트 인허가', '미분양', '거래량',
       '건설사 랭킹', '구별 지가지수', '공시지가 평균', '매수우위지수', '건물나이', '30년이상50년이하', '구매력지수',
       '거래활발지수', '매매가격 지수 증감률', '매매 대비 전세가격 비율', '아파트 카테고리', '지하철 카테고리',
       '구 카테고리', '건설사 카테고리', '가장 가까운 다리와의 거리', '가장 가까운 다리 index', '가장 가까운 다리',
       '인근 한강다리 개수', '인근 다리 개수 1개이상 3개 이하', '학군', '매매가격 지수', '아파트 평균 가격'],
      dtype='object')

In [9]:
df.shape

(1128094, 44)

In [10]:
# 계약월 변수 생성
df['계약월'] = df['계약년월'].astype(str).str[4:].astype(int)

# 계약년월, 계약일, 건축년도, 계약년 변수 제거
df.drop(['계약년월', '계약일'], axis=1, inplace=True)

df['계약월_sin'] = np.sin(2 * np.pi * df['계약월'] / 12)
df.drop(['계약월'], axis=1, inplace=True)
df

Unnamed: 0,전용면적,층,건축년도,좌표X,좌표Y,target,is_test,가장 가까운 거리,인근 지하철 역 개수,가장 가까운 버스 정류장 거리,...,건설사 카테고리,가장 가까운 다리와의 거리,가장 가까운 다리 index,가장 가까운 다리,인근 한강다리 개수,인근 다리 개수 1개이상 3개 이하,학군,매매가격 지수,아파트 평균 가격,계약월_sin
0,79.97,3,1987,127.05721,37.476763,124000.0,0,1127.738351,2.0,61.89584,...,기타,5581.622466,17.0,청담대교,0.0,0,8.0,67.392545,104403.465347,-2.449294e-16
1,79.97,4,1987,127.05721,37.476763,123500.0,0,1127.738351,2.0,61.89584,...,기타,5581.622466,17.0,청담대교,0.0,0,8.0,67.392545,104403.465347,-2.449294e-16
2,54.98,5,1987,127.05721,37.476763,91500.0,0,1127.738351,2.0,61.89584,...,기타,5581.622466,17.0,청담대교,0.0,0,8.0,67.392545,104403.465347,-2.449294e-16
3,79.97,4,1987,127.05721,37.476763,130000.0,0,1127.738351,2.0,61.89584,...,기타,5581.622466,17.0,청담대교,0.0,0,8.0,69.049924,104403.465347,5.000000e-01
4,79.97,2,1987,127.05721,37.476763,117000.0,0,1127.738351,2.0,61.89584,...,기타,5581.622466,17.0,청담대교,0.0,0,8.0,69.049924,104403.465347,5.000000e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,84.65,13,2014,127.10672,37.618870,,1,732.482898,4.0,59.47567,...,기타,5918.624352,23.0,구리암사대교,0.0,0,1.0,91.623694,40355.229672,-5.000000e-01
1128090,84.62,12,2014,127.10672,37.618870,,1,732.482898,4.0,59.47567,...,기타,5918.624352,23.0,구리암사대교,0.0,0,1.0,91.623694,40355.229672,-5.000000e-01
1128091,101.65,12,2014,127.10672,37.618870,,1,732.482898,4.0,59.47567,...,기타,5918.624352,23.0,구리암사대교,0.0,0,1.0,91.066919,40355.229672,-8.660254e-01
1128092,84.94,18,2014,127.10672,37.618870,,1,732.482898,4.0,59.47567,...,기타,5918.624352,23.0,구리암사대교,0.0,0,1.0,90.861265,40355.229672,-1.000000e+00


In [11]:
df.columns

Index(['전용면적', '층', '건축년도', '좌표X', '좌표Y', 'target', 'is_test', '가장 가까운 거리',
       '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', '계약년', 'GDP',
       '한국은행 기준금리', '기대 인플레이션', '지가지수', '아파트 인허가', '미분양', '거래량', '건설사 랭킹',
       '구별 지가지수', '공시지가 평균', '매수우위지수', '건물나이', '30년이상50년이하', '구매력지수', '거래활발지수',
       '매매가격 지수 증감률', '매매 대비 전세가격 비율', '아파트 카테고리', '지하철 카테고리', '구 카테고리',
       '건설사 카테고리', '가장 가까운 다리와의 거리', '가장 가까운 다리 index', '가장 가까운 다리',
       '인근 한강다리 개수', '인근 다리 개수 1개이상 3개 이하', '학군', '매매가격 지수', '아파트 평균 가격',
       '계약월_sin'],
      dtype='object')

## 스케일링

In [12]:
def scailing(col, scaler_type):
    if scaler_type == 'min_max': scaler = MinMaxScaler()
    else : scaler = StandardScaler()

    return scaler.fit_transform(df[[col]])

cols = ['전용면적', '층', '건축년도', '좌표X', '좌표Y', '가장 가까운 거리',
       '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', 'GDP',
       '한국은행 기준금리', '기대 인플레이션', '지가지수', '아파트 인허가', '미분양', '거래량',
       '구별 지가지수', '공시지가 평균', '매수우위지수', '건물나이', '구매력지수', '거래활발지수',
       '매매가격 지수 증감률', '매매 대비 전세가격 비율', '가장 가까운 다리와의 거리', '매매가격 지수',
       '아파트 평균 가격', '계약월_sin']

for col in cols:
    print(col)
    scaler_type = 'min_max'
    if col == '계약월_sin': scaler_type = 'standard'

    df[[col]] = scailing(col, scaler_type)

전용면적
층
건축년도
좌표X
좌표Y
가장 가까운 거리
인근 지하철 역 개수
가장 가까운 버스 정류장 거리
인근 버스 정류장 개수
GDP
한국은행 기준금리
기대 인플레이션
지가지수
아파트 인허가
미분양
거래량
구별 지가지수
공시지가 평균
매수우위지수
건물나이
구매력지수
거래활발지수
매매가격 지수 증감률
매매 대비 전세가격 비율
가장 가까운 다리와의 거리
매매가격 지수
아파트 평균 가격
계약월_sin


## 호선 one-hot encoding

In [19]:
ohe = OneHotEncoder(sparse=False)
ohe_re = ohe.fit_transform(df[['호선']])



In [20]:
df = pd.concat([df, pd.DataFrame(ohe_re, columns=[col for col in ohe.categories_[0]])], axis=1)

In [21]:
df

Unnamed: 0,전용면적,층,건축년도,좌표X,좌표Y,target,is_test,가장 가까운 거리,호선,인근 지하철 역 개수,...,경춘선,공항철도1호선,과천선,분당선,신림선,신분당선,신분당선(연장2),우이신설선,일산선,중앙선
0,0.168839,0.095890,0.419355,0.678243,0.167126,124000.0,0,0.353721,분당선,0.086957,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.168839,0.109589,0.419355,0.678243,0.167126,123500.0,0,0.353721,분당선,0.086957,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.108520,0.123288,0.419355,0.678243,0.167126,91500.0,0,0.353721,분당선,0.086957,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.168839,0.109589,0.419355,0.678243,0.167126,130000.0,0,0.353721,분당선,0.086957,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.168839,0.082192,0.419355,0.678243,0.167126,117000.0,0,0.353721,분당선,0.086957,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,0.180135,0.232877,0.854839,0.807949,0.728162,,1,0.227756,경춘선,0.173913,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128090,0.180063,0.219178,0.854839,0.807949,0.728162,,1,0.227756,경춘선,0.173913,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128091,0.221168,0.219178,0.854839,0.807949,0.728162,,1,0.227756,경춘선,0.173913,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128092,0.180835,0.301370,0.854839,0.807949,0.728162,,1,0.227756,경춘선,0.173913,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df.drop(['호선'], axis=1, inplace=True)
df

Unnamed: 0,전용면적,층,건축년도,좌표X,좌표Y,target,is_test,가장 가까운 거리,인근 지하철 역 개수,가장 가까운 버스 정류장 거리,...,경춘선,공항철도1호선,과천선,분당선,신림선,신분당선,신분당선(연장2),우이신설선,일산선,중앙선
0,0.168839,0.095890,0.419355,0.678243,0.167126,124000.0,0,0.353721,0.086957,0.101599,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.168839,0.109589,0.419355,0.678243,0.167126,123500.0,0,0.353721,0.086957,0.101599,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.108520,0.123288,0.419355,0.678243,0.167126,91500.0,0,0.353721,0.086957,0.101599,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.168839,0.109589,0.419355,0.678243,0.167126,130000.0,0,0.353721,0.086957,0.101599,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.168839,0.082192,0.419355,0.678243,0.167126,117000.0,0,0.353721,0.086957,0.101599,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,0.180135,0.232877,0.854839,0.807949,0.728162,,1,0.227756,0.173913,0.097533,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128090,0.180063,0.219178,0.854839,0.807949,0.728162,,1,0.227756,0.173913,0.097533,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128091,0.221168,0.219178,0.854839,0.807949,0.728162,,1,0.227756,0.173913,0.097533,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128092,0.180835,0.301370,0.854839,0.807949,0.728162,,1,0.227756,0.173913,0.097533,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df.columns

Index(['전용면적', '층', '건축년도', '좌표X', '좌표Y', 'target', 'is_test', '가장 가까운 거리',
       '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', '계약년', 'GDP',
       '한국은행 기준금리', '기대 인플레이션', '거래량', '구', '구별 지가지수', '공시지가 평균', '매수우위지수',
       '30년이상50년이하', '계약월_sin', '1호선', '2호선', '3호선', '4호선', '5호선', '6호선',
       '7호선', '8호선', '9호선', '9호선(연장)', '경부선', '경원선', '경의중앙선', '경인선', '경춘선',
       '공항철도1호선', '과천선', '분당선', '신림선', '신분당선', '신분당선(연장2)', '우이신설선', '일산선',
       '중앙선'],
      dtype='object')

In [24]:
df.drop(['9호선(연장)', '경부선', '8호선', 
         '경의중앙선', '신림선', '신분당선',
         '신분당선(연장2)', '중앙선', '우이신설선',
         '공항철도1호선', '경인선', '1호선',
         '일산선', '과천선', '경춘선'], axis=1, inplace=True)

In [30]:
df.columns

Index(['전용면적', '층', '건축년도', '좌표X', '좌표Y', 'target', 'is_test', '가장 가까운 거리',
       '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', '계약년', 'GDP',
       '한국은행 기준금리', '기대 인플레이션', '지가지수', '아파트 인허가', '미분양', '거래량', '건설사 랭킹',
       '구별 지가지수', '공시지가 평균', '매수우위지수', '건물나이', '30년이상50년이하', '구매력지수', '거래활발지수',
       '매매가격 지수 증감률', '매매 대비 전세가격 비율', '아파트 카테고리', '지하철 카테고리', '구 카테고리',
       '건설사 카테고리', '가장 가까운 다리와의 거리', '인근 한강다리 개수', '인근 다리 개수 1개이상 3개 이하', '학군',
       '계약월_sin'],
      dtype='object')

# 카테고리 변수 처리
- 30년이상50년이하
- 아파트 카테고리
- 지하철 카테고리
- 구 카테고리
- 건설사 카테고리
- 인근 다리 개수 1개이상 3개 이하
- 학군

In [13]:
df.head()

Unnamed: 0,전용면적,층,건축년도,좌표X,좌표Y,target,is_test,가장 가까운 거리,인근 지하철 역 개수,가장 가까운 버스 정류장 거리,...,건설사 카테고리,가장 가까운 다리와의 거리,가장 가까운 다리 index,가장 가까운 다리,인근 한강다리 개수,인근 다리 개수 1개이상 3개 이하,학군,매매가격 지수,아파트 평균 가격,계약월_sin
0,0.168839,0.09589,0.419355,0.678243,0.167126,124000.0,0,0.353721,0.086957,0.101599,...,기타,0.347392,17.0,청담대교,0.0,0,8.0,0.395699,0.085307,-0.022272
1,0.168839,0.109589,0.419355,0.678243,0.167126,123500.0,0,0.353721,0.086957,0.101599,...,기타,0.347392,17.0,청담대교,0.0,0,8.0,0.395699,0.085307,-0.022272
2,0.10852,0.123288,0.419355,0.678243,0.167126,91500.0,0,0.353721,0.086957,0.101599,...,기타,0.347392,17.0,청담대교,0.0,0,8.0,0.395699,0.085307,-0.022272
3,0.168839,0.109589,0.419355,0.678243,0.167126,130000.0,0,0.353721,0.086957,0.101599,...,기타,0.347392,17.0,청담대교,0.0,0,8.0,0.424091,0.085307,0.6842
4,0.168839,0.082192,0.419355,0.678243,0.167126,117000.0,0,0.353721,0.086957,0.101599,...,기타,0.347392,17.0,청담대교,0.0,0,8.0,0.424091,0.085307,0.6842


In [14]:
df.columns

Index(['전용면적', '층', '건축년도', '좌표X', '좌표Y', 'target', 'is_test', '가장 가까운 거리',
       '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', '계약년', 'GDP',
       '한국은행 기준금리', '기대 인플레이션', '지가지수', '아파트 인허가', '미분양', '거래량', '건설사 랭킹',
       '구별 지가지수', '공시지가 평균', '매수우위지수', '건물나이', '30년이상50년이하', '구매력지수', '거래활발지수',
       '매매가격 지수 증감률', '매매 대비 전세가격 비율', '아파트 카테고리', '지하철 카테고리', '구 카테고리',
       '건설사 카테고리', '가장 가까운 다리와의 거리', '가장 가까운 다리 index', '가장 가까운 다리',
       '인근 한강다리 개수', '인근 다리 개수 1개이상 3개 이하', '학군', '매매가격 지수', '아파트 평균 가격',
       '계약월_sin'],
      dtype='object')

In [15]:
#df['30년이상50년이하'] = df['30년이상50년이하'].astype('category')
df['아파트 카테고리'] = df['아파트 카테고리'].astype('category')
df['지하철 카테고리'] = df['지하철 카테고리'].astype('category')
df['구 카테고리'] = df['구 카테고리'].astype('category')
df['건설사 카테고리'] = df['건설사 카테고리'].astype('category')
df['인근 다리 개수 1개이상 3개 이하'] = df['인근 다리 개수 1개이상 3개 이하'].astype('category')
df['학군'] = df['학군'].astype('category')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128094 entries, 0 to 1128093
Data columns (total 42 columns):
 #   Column               Non-Null Count    Dtype   
---  ------               --------------    -----   
 0   전용면적                 1128094 non-null  float64 
 1   층                    1128094 non-null  float64 
 2   건축년도                 1128094 non-null  float64 
 3   좌표X                  1128094 non-null  float64 
 4   좌표Y                  1128094 non-null  float64 
 5   target               1118822 non-null  float64 
 6   is_test              1128094 non-null  int64   
 7   가장 가까운 거리            1128094 non-null  float64 
 8   인근 지하철 역 개수          1128094 non-null  float64 
 9   가장 가까운 버스 정류장 거리     1128094 non-null  float64 
 10  인근 버스 정류장 개수         1128094 non-null  float64 
 11  계약년                  1128094 non-null  int64   
 12  GDP                  1128094 non-null  float64 
 13  한국은행 기준금리            1128094 non-null  float64 
 14  기대 인플레이션             1128094 non-n

# Dataset 생성

In [17]:
df.columns

Index(['전용면적', '층', '건축년도', '좌표X', '좌표Y', 'target', 'is_test', '가장 가까운 거리',
       '인근 지하철 역 개수', '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', '계약년', 'GDP',
       '한국은행 기준금리', '기대 인플레이션', '지가지수', '아파트 인허가', '미분양', '거래량', '건설사 랭킹',
       '구별 지가지수', '공시지가 평균', '매수우위지수', '건물나이', '30년이상50년이하', '구매력지수', '거래활발지수',
       '매매가격 지수 증감률', '매매 대비 전세가격 비율', '아파트 카테고리', '지하철 카테고리', '구 카테고리',
       '건설사 카테고리', '가장 가까운 다리와의 거리', '가장 가까운 다리 index', '가장 가까운 다리',
       '인근 한강다리 개수', '인근 다리 개수 1개이상 3개 이하', '학군', '매매가격 지수', '아파트 평균 가격',
       '계약월_sin'],
      dtype='object')

In [19]:
train_df = df[df['is_test'] == 0]
test_df = df[df['is_test'] == 1]

In [20]:
print(train_df.shape)
print(test_df.shape)

(1118822, 42)
(9272, 42)


In [21]:
train_df_dropped = train_df.drop(['is_test', '지가지수', '아파트 인허가', '미분양', '건설사 랭킹',
                                  '건물나이', '구매력지수', '거래활발지수', '매매가격 지수 증감률', 
                                  '매매 대비 전세가격 비율', '구 카테고리', '건설사 카테고리', '가장 가까운 다리와의 거리',
                                  '가장 가까운 다리 index', '가장 가까운 다리', '인근 한강다리 개수', 
                                  '인근 다리 개수 1개이상 3개 이하', '학군', '매매가격 지수'], axis=1)

test_X = test_df.drop(['is_test', 'target', '지가지수', '아파트 인허가', '미분양', '건설사 랭킹',
                                '건물나이', '구매력지수', '거래활발지수', '매매가격 지수 증감률', 
                                '매매 대비 전세가격 비율', '구 카테고리', '건설사 카테고리', '가장 가까운 다리와의 거리',
                                '가장 가까운 다리 index', '가장 가까운 다리', '인근 한강다리 개수', 
                                '인근 다리 개수 1개이상 3개 이하', '학군', '매매가격 지수'], axis=1)

# train_data shuffle

In [22]:
train_shuffled = train_df_dropped.sample(frac=1).reset_index(drop=True)
train_shuffled

Unnamed: 0,전용면적,층,건축년도,좌표X,좌표Y,target,가장 가까운 거리,인근 지하철 역 개수,가장 가까운 버스 정류장 거리,인근 버스 정류장 개수,...,기대 인플레이션,거래량,구별 지가지수,공시지가 평균,매수우위지수,30년이상50년이하,아파트 카테고리,지하철 카테고리,아파트 평균 가격,계약월_sin
0,0.088776,0.219178,0.451613,0.719300,0.896280,18500.0,0.022437,0.217391,0.094015,0.371429,...,0.677419,0.583991,0.293672,0.020264,0.381714,0,기타,4호선,0.027015,-1.435218
1,0.160488,0.150685,0.483871,0.631202,0.775464,64400.0,0.395885,0.086957,0.143891,0.628571,...,0.064516,0.405766,0.854176,0.115380,0.638747,1,기타,4호선,0.018165,0.684200
2,0.120179,0.273973,0.564516,0.544050,0.302006,143000.0,0.155359,0.478261,0.169725,0.171429,...,0.741935,0.007848,0.954917,0.528334,0.081841,0,기타,3호선,0.074448,0.684200
3,0.445317,0.273973,0.774194,0.563973,0.288625,214000.0,0.051542,0.521739,0.298160,0.057143,...,0.419355,0.470365,0.169306,0.265519,0.155371,0,기타,7호선,0.158409,1.201374
4,0.133116,0.136986,0.677419,0.791554,0.272049,40000.0,0.094813,0.347826,0.329313,0.085714,...,0.290323,0.739044,0.278311,0.234263,0.549872,0,기타,9호선,0.026739,-1.245919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,0.154236,0.260274,0.500000,0.655530,0.847444,30500.0,0.134635,0.260870,0.115895,0.228571,...,0.483871,0.329891,0.403796,0.017539,0.067136,0,기타,경원선,0.044456,1.201374
1118818,0.180425,0.315068,0.564516,0.204479,0.321105,57500.0,0.182006,0.217391,0.132401,0.085714,...,0.258065,0.692495,0.415871,0.116383,0.418798,0,기타,2호선,0.053773,0.684200
1118819,0.058943,0.123288,0.500000,0.130346,0.536158,12000.0,0.195098,0.086957,0.152299,0.114286,...,0.387097,0.404857,0.141466,0.019699,0.386829,0,기타,9호선,0.017699,-1.245919
1118820,0.180685,0.328767,0.677419,0.173107,0.241891,46800.0,0.132842,0.086957,0.374019,0.085714,...,0.258065,0.842634,0.454524,0.081114,0.641944,0,기타,기타,0.027015,-0.728745


In [23]:
train_X = train_shuffled.drop(['target'], axis=1)
train_y = train_shuffled[['target']]

## K-Fold

In [24]:
kf = KFold(n_splits=10)

In [25]:
train_folds = kf.split(train_X, train_y)
display(train_folds)

<generator object _BaseKFold.split at 0x3181239a0>

In [27]:
params = {
    'n_estimators': 100000,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'metric_freq': 20,
    'device': 'gpu',
    'verbosity': 0
}

fold_models = []

for fold_idx, (train_idx, valid_idx) in enumerate(train_folds):
    print(f'--------------------{fold_idx}번째 학습 시작--------------------')

    train_X_fold = train_X.iloc[train_idx, :]
    train_y_fold = train_y.iloc[train_idx]

    valid_X_fold = train_X.iloc[valid_idx, :]
    valid_y_fold = train_y.iloc[valid_idx]

    model = lgb.LGBMRegressor(n_estimators=100000,
                              metric='rmse', data_sample_strategy='goss')
    
    model.fit(
    train_X_fold, train_y_fold,
    eval_set = [(train_X_fold, train_y_fold), (valid_X_fold, valid_y_fold)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=60),
               lgb.log_evaluation(period=10, show_stdv=True)])
    
    fold_models.append(model)
    #joblib.dump(model, f'./16/kfold_models/model_{fold_idx}.pkl') # 모델 저장시 사용
    print(f'--------------------{fold_idx}번째 학습 종료--------------------')

--------------------0번째 학습 시작--------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015682 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2708
[LightGBM] [Info] Number of data points in the train set: 1006939, number of used features: 22
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 58008.073139
Training until validation scores don't improve for 60 rounds
[10]	training's rmse: 23703.8	valid_1's rmse: 23239.1
[20]	training's rmse: 16361.8	valid_1's rmse: 16042.9
[30]	training's rmse: 13769.8	valid_1's rmse: 13595.2
[40]	training's rmse: 12532.2	valid_1's rmse: 12458.1
[50]	training's rmse: 11813.8	valid_1's rmse: 11778.7
[60]	training's rmse: 11270.9	valid_1's rmse: 11256
[70]	training's rmse: 10834.5	valid_1's rmse: 10850.3
[80]	training's rmse: 10480.1	valid_1's rmse: 10531.5
[90]	training's rm

In [40]:
train_X.columns

Index(['전용면적', '층', '건축년도', '좌표X', '좌표Y', '가장 가까운 거리', '인근 지하철 역 개수',
       '가장 가까운 버스 정류장 거리', '인근 버스 정류장 개수', '계약년', 'GDP', '한국은행 기준금리',
       '기대 인플레이션', '거래량', '구별 지가지수', '공시지가 평균', '매수우위지수', '30년이상50년이하',
       '아파트 카테고리', '지하철 카테고리', '아파트 평균 가격', '계약월_sin'],
      dtype='object')

# 예측

In [29]:
preds_list = []

for model in fold_models:
    preds_list.append(model.predict(test_X))

In [32]:
len(preds_list)

9

In [33]:
pd.DataFrame(preds_list[0], columns=['target'])

Unnamed: 0,target
0,192490.500501
1,291293.098867
2,295827.754121
3,270237.970394
4,216486.515940
...,...
9267,72451.677255
9268,71982.020366
9269,84595.639649
9270,73840.515537


In [34]:
re_df = pd.concat([pd.DataFrame(preds, columns=['target_']) for preds in preds_list], axis=1)
re_df

Unnamed: 0,target_,target_.1,target_.2,target_.3,target_.4,target_.5,target_.6,target_.7,target_.8
0,192490.500501,190856.833926,190848.317260,189066.038790,183985.844529,177544.436047,181005.559274,184049.670656,185401.174382
1,291293.098867,308671.351488,297053.771238,288758.827112,295870.227808,293779.823782,352641.982963,293725.617426,281090.780067
2,295827.754121,306404.331825,336390.663805,336132.092167,311937.695549,316573.281143,303905.442688,318464.730424,315383.507861
3,270237.970394,272277.984196,291347.999665,291130.999763,285805.739403,283516.125585,278314.306962,291305.557648,274942.572007
4,216486.515940,228346.186371,223938.384149,224605.702868,214387.519027,209528.358535,231589.397263,235227.236739,220073.681633
...,...,...,...,...,...,...,...,...,...
9267,72451.677255,73135.140268,74901.210854,73980.966085,70556.922751,74276.452959,72029.746127,72956.538735,75487.445437
9268,71982.020366,72968.610986,74354.902050,72979.643603,70243.925492,74181.839417,71948.892190,72980.006786,74877.783068
9269,84595.639649,82187.405227,85880.229926,83668.067349,83928.163873,82443.859084,79462.477239,82533.159523,90174.065587
9270,73840.515537,74599.880241,75947.714450,73453.732169,72371.705747,78760.257437,74436.678632,76322.169579,78414.966930


In [35]:
re_df['target'] = re_df.mean(axis=1)
re_df

Unnamed: 0,target_,target_.1,target_.2,target_.3,target_.4,target_.5,target_.6,target_.7,target_.8,target
0,192490.500501,190856.833926,190848.317260,189066.038790,183985.844529,177544.436047,181005.559274,184049.670656,185401.174382,186138.708374
1,291293.098867,308671.351488,297053.771238,288758.827112,295870.227808,293779.823782,352641.982963,293725.617426,281090.780067,300320.608972
2,295827.754121,306404.331825,336390.663805,336132.092167,311937.695549,316573.281143,303905.442688,318464.730424,315383.507861,315668.833287
3,270237.970394,272277.984196,291347.999665,291130.999763,285805.739403,283516.125585,278314.306962,291305.557648,274942.572007,282097.695069
4,216486.515940,228346.186371,223938.384149,224605.702868,214387.519027,209528.358535,231589.397263,235227.236739,220073.681633,222686.998058
...,...,...,...,...,...,...,...,...,...,...
9267,72451.677255,73135.140268,74901.210854,73980.966085,70556.922751,74276.452959,72029.746127,72956.538735,75487.445437,73308.455608
9268,71982.020366,72968.610986,74354.902050,72979.643603,70243.925492,74181.839417,71948.892190,72980.006786,74877.783068,72946.402662
9269,84595.639649,82187.405227,85880.229926,83668.067349,83928.163873,82443.859084,79462.477239,82533.159523,90174.065587,83874.785273
9270,73840.515537,74599.880241,75947.714450,73453.732169,72371.705747,78760.257437,74436.678632,76322.169579,78414.966930,75349.735636


In [36]:
re_df = re_df[['target']]
re_df

Unnamed: 0,target
0,186138.708374
1,300320.608972
2,315668.833287
3,282097.695069
4,222686.998058
...,...
9267,73308.455608
9268,72946.402662
9269,83874.785273
9270,75349.735636


In [37]:
re_df = np.round(re_df).astype(int)
re_df

Unnamed: 0,target
0,186139
1,300321
2,315669
3,282098
4,222687
...,...
9267,73308
9268,72946
9269,83875
9270,75350


In [39]:
re_df.to_csv('../submission/34/34_k.csv', index=False)