<a href="https://colab.research.google.com/github/Ryong1998/house_price/blob/main/EDA_file4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 프로젝트 소개2

- 앞에서의 개별 아파트 집값을 예측하는 프로젝트는 결론적으로 성능이 너무 낮게 나옴
- 월별로 '1년뒤 서울 아파트 전체 평당 매매가'를 예측하는 모델을 생성하여 진행하고자 하였으나, 기계학습을 통해 모델을 학습에 사용할 데이터의 양이 너무 적게 생성이 되는 문제가 발생
- 해결책으로 위해서 일별로 '1년뒤 서울 아파트 전체 평당 매매가'를 예측하는 모델을 생성하여 진행
- 개별 아파트를 추천하지는 못하더라도, 아파트 시장의 1년뒤 전망을 통해 현재 아파트를 살 타이밍인지 아닌지를 예측하는 프로젝트를 진행

In [1]:
# 구글 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# apartment_deal 파일 생성

- EDA_file3 에서 생성과정과 동일하지만, 파일경로만 ver_4로 수정해서 실행

In [None]:
import pandas as pd
import numpy as np
import os

# 연도별 아파트 매매 정보들이 들어있는 csv경로 설정 
dir_path = "/content/drive/MyDrive/house_price/original_data/deal_price/Seoul" 
file_list = os.listdir(dir_path)
file_list.sort()
df_list = list()
# 해당 폴더 안에 있는 csv 파일들을 읽어서 리스트 안에 데이터프레임들을 담음
for csv_file in file_list:
    df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))

# 모든 데이터프레임을 하나의 데이터프레임으로 통합
df_default = df_list[0]
for df_csv in df_list[1:]:
    df_default = pd.concat([df_default, df_csv], axis=0)
df_default.reset_index(drop=True, inplace=True) # concat으로 합쳐질 때 인덱스 재설정


# 사용할 컬럼들만 거르고 컬럼명 영어로 치환 - 필요한 컬럼들만 선택
df_default = df_default[['시군구','본번','부번','도로명','단지명','계약년월','계약일','전용면적(㎡)','거래금액(만원)']]
df_default.columns = ['address','main_number','sub_number','road','name','year_month','day','area','deal_price']


# 타입 변경을 통해서 deal_price,year_month, day 타입 변경
df_default["deal_price"] = df_default["deal_price"].str.replace(",", "") # 'deal_price'에서 ','가 들어있는 부분 제거(추후 계산에 사용하기 위해서서)
df = df_default.astype({'year_month':'str','day':'str','deal_price':'int64'}).copy()

# 추후 데이터들 그룹화에 사용하기 위해서 날짜관련 컬럼들들 분리 및 생성
df['year'] = df['year_month'].str[0:4] # '연','월' 합쳐져 있는 컬럼에서 연도만 추출
df['month'] = df['year_month'].str[4:] # '연','월' 합쳐져 있는 컬럼에서 월만 추출
df.loc[df["day"].str.len()==1,"day"]='0'+df.loc[df["day"].str.len()==1,"day"] # '일'이 있는 컬럼에서 해당 '일'이 1일, 2일 처럼 1자리 숫자인 경우 앞에 0을 추가
df['date'] = pd.to_datetime(df['year']+df['month']+df['day']) # 일자들을 합쳐서 date 컬럼 생성
df = df.astype({'year':'int64','month':'int64','day':'int64'}) # 원하는 타입으로 변경경
df = df.drop(['year_month'], axis=1) # 사용 안하는 컬럼들 제거


# 주소 및 도로명들 분리
df["address_0"] = df["address"].str.split(' ',expand=True)[0] # '시' 만 추출해야 하나, 서울만 함으로 일단은 실행 X
df["address_1"] = df["address"].str.split(' ',expand=True)[1] # '구' 만 추출
df["address_2"] = df["address"].str.split(' ',expand=True)[2] # '동' 만 추출
df["road_name"] = df["road"].str.split(' ',expand=True)[0] # '도로명' 만 추출
df["road_number"] = df["road"].str.split(' ',expand=True)[1] # '도로숫자' 만 추출
df= df[['year','month','day','address_0','address_1','address_2','road_name','road_number','area','deal_price','name','main_number','sub_number','date']] # 사용할 컬럼만 선택

df.loc[df['name'] == '서울역센트럴자이(임대)','name']='서울역센트럴자이' # '서울역센트럴자이(임대)' 명칭을을 '서울역센트럴자이'로 수정
df.loc[df['name'] == '서울역센트럴자이','road_name']='만리재로' # 위에서 확인한 '서울역센트럴자이'의 값들로 'road_name' 수정
df.loc[df['name'] == '서울역센트럴자이','road_number']='175' # 위에서 확인한 '서울역센트럴자이'의 값들로 'road_number' 수정


df = df.replace('', np.nan) # ''값만 있는 값들을 null 값들로 수정


# 지번주소 null 값들을 네이버를 통해 검색하여서 정보를 얻고 수정
df.loc[df['name']=='힐스테이트 서초 젠트리스','main_number'] = 557
df.loc[df['name']=='힐스테이트 서초 젠트리스','sub_number'] = 0

# 사용할 컬럼들 선택택과, 컬럼명들 수정
df_deal = df[['date','year','month','day','address_0','address_1','address_2','main_number','sub_number','name','area','deal_price']].copy()
df_deal.columns =['date','year','month','day','address_0','address_1','address_2','address_3','address_4','name','area','deal_price']
df_deal = df_deal[df_deal['year']>=2011] # 전세/월세데이터가 2011년 이후로 있어서 연도 선택

df_deal.to_csv('/content/drive/MyDrive/house_price/after_data/ver_4/apartment_deal.csv',index=False)


# apartment_full_rent, apartment_month_rent 파일 생성

- EDA_file3 에서 생성과정과 동일하지만, 파일경로만 ver_4로 수정해서 실행

In [None]:
import pandas as pd
import os


dir_path = "/content/drive/MyDrive/house_price/original_data/rent_price/Seoul"
file_list = os.listdir(dir_path)
file_list.sort()
df_list = list()

# 해당 폴더 안에 있는 csv 파일들을 읽어서 리스트 안에 데이터프레임들을 담음
for csv_file in file_list:
    df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))

# 모든 데이터프레임을을 통합
df_default = df_list[0]
for df_csv in df_list[1:]:
    df_default = pd.concat([df_default, df_csv], axis=0)
df_default.reset_index(drop=True, inplace=True) # concat으로 합쳐질 때 인덱스 재설정


# 전세 데이터 프레임 생성 - 주석부분은 중간에서 값 확인하는 부분들
df_full_rent = df_default.loc[df_default['전월세구분']=='전세',['시군구','본번','부번','도로명','계약년월','계약일','보증금(만원)','전용면적(㎡)','단지명']].copy()
df_full_rent.columns = ['address','main_number','sub_number','road','year_month','day','full_rent_price','area','name']

df_full_rent = df_full_rent.astype({'full_rent_price':'str','year_month':'str','day':'str','full_rent_price':'str'})
df_full_rent["full_rent_price"] = df_full_rent["full_rent_price"].str.replace(",", "")
df_full_rent.loc[df_full_rent["day"].str.len()==1,"day"]='0'+df_full_rent.loc[df_full_rent["day"].str.len()==1,"day"] # 일이 있는 컬럼에서 1자리 숫자인 경우 앞에 0을 추가성
df_full_rent['year'] = df_full_rent['year_month'].str[0:4] # 연,월 합쳐져 있는 컬럼에서 연도만 추출
df_full_rent['month'] = df_full_rent['year_month'].str[4:] # 연,월 합쳐져 있는 컬럼에서 월만 추출
df_full_rent['date'] = pd.to_datetime(df_full_rent['year']+df_full_rent['month']+df_full_rent['day']) # 일자들을 합쳐서 date 컬럼 생
df_full_rent = df_full_rent.astype({'year':'int64','month':'int64','day':'int64','full_rent_price':'int64'})
df_full_rent = df_full_rent.drop(['year_month'], axis=1) # 사용 안하는 컬럼들 제거


df_full_rent["address_0"] = df_full_rent["address"].str.split(' ',expand=True)[0] # '시' 만 추출해야 하나, 서울만 함으로 일단은 실행 X
df_full_rent["address_1"] = df_full_rent["address"].str.split(' ',expand=True)[1] # '구' 만 추출
df_full_rent["address_2"] = df_full_rent["address"].str.split(' ',expand=True)[2] # '동' 만 추출
df_full_rent["road_name"] = df_full_rent["road"].str.split(' ',expand=True)[0] # '도로명' 만 추출
df_full_rent["road_number"] = df_full_rent["road"].str.split(' ',expand=True)[1] # '도로숫자' 만 추출
df_full_rent= df_full_rent[['year','month','day','address_0','address_1','address_2','main_number','sub_number','road_name','road_number','area',"full_rent_price",'name','date']] # 사용할 컬럼만 선택


df_full_rent = df_full_rent.replace('', None) # ''값만 있는 값들을 null 값들로 수정


df_full_rent.loc[df_full_rent['name']=='힐스테이트 서초 젠트리스','main_number'] = 557
df_full_rent.loc[df_full_rent['name']=='힐스테이트 서초 젠트리스','sub_number'] = 0


df_full_rent = df_full_rent[['date','year','month','day','address_0','address_1','address_2','main_number','sub_number','name','area','full_rent_price']].copy()
df_full_rent.columns =['date','year','month','day','address_0','address_1','address_2','address_3','address_4','name','area','full_rent_price']

# area가 null값인 row들이 다른 주소정보관련 컬럼들을 리스트 화
add_1 = list(df_full_rent.loc[df_full_rent['area'].isnull(),'address_1'])
add_2 = list(df_full_rent.loc[df_full_rent['area'].isnull(),'address_2'])
add_3 = list(df_full_rent.loc[df_full_rent['area'].isnull(),'address_3'])
add_4 = list(df_full_rent.loc[df_full_rent['area'].isnull(),'address_4'])
area_list = list()

# area_list 에 값 추가
for i in range(len(add_1)):
    # 해당 주소에서 거래된 매물들의 '층' 정보가 없을 경우, area null을 처리할 참조 자료가 없음으로 ''으로 처리리
    if (len(df_full_rent.loc[(df_full_rent['address_1'] ==add_1[i]) & 
                     (df_full_rent['address_2'] ==add_2[i]) &
                     (df_full_rent['address_3'] ==add_3[i]) &
                     (df_full_rent['address_4'] ==add_4[i]),
                     'area'].value_counts())) == 0:

        area_list.append('')
    else:
        # 해당 주소에서 가장 많이 거래되었던 층수를 null 값에 채움움
        area_list.append(df_full_rent.loc[(df_full_rent['address_1'] ==add_1[i]) & 
                     (df_full_rent['address_2'] ==add_2[i]) &
                     (df_full_rent['address_3'] ==add_3[i]) &
                     (df_full_rent['address_4'] ==add_4[i]),
                     'area'].value_counts().idxmax())
        
# floor가 null인 값들을 처리, 가장 많이 거래된 '층'의 정보로 결측치 처리리
for i in range(len(add_1)):
    df_full_rent.loc[(df_full_rent['address_1'] ==add_1[i]) & 
                         (df_full_rent['address_2'] ==add_2[i]) &
                         (df_full_rent['address_3'] ==add_3[i]) &
                         (df_full_rent['address_4'] ==add_4[i]),
                         'area']=area_list[i]   

# floor이 ''인 값 제거
df_full_rent=df_full_rent.drop(df_full_rent[df_full_rent['area']==''].index)

df_full_rent.to_csv('/content/drive/MyDrive/house_price/after_data/ver_4/apartment_full_rent.csv', index=False) # 전세 csv 파일 생성성




# 월세 데이터 프레임 생성, 필요한 컬럼들만 필터링
df_month_rent = df_default.loc[df_default['전월세구분']=='월세',['시군구','본번','부번','도로명','계약년월','계약일','보증금(만원)','월세(만원)','전용면적(㎡)','단지명']].copy()
df_month_rent.columns = ['address','main_number','sub_number','road','year_month','day','rent_deposit','month_rent_price','area','name']

df_month_rent["month_rent_price2"] = df_month_rent["month_rent_price"].str.replace(',','')


# 해당 파트를 통해서 우선 type 을 변경한 다음에 진행해야 함
df_month_rent = df_month_rent.astype({'month_rent_price':'str','rent_deposit':'str'})

df_month_rent["rent_deposit"] = df_month_rent["rent_deposit"].str.replace(",", "")
df_month_rent["month_rent_price"] = df_month_rent["month_rent_price"].str.replace(',','')
df_month_rent = df_month_rent.astype({'year_month':'str','day':'str','rent_deposit':'int64','month_rent_price':'int64'})
df_month_rent['year'] = df_month_rent['year_month'].str[0:4] # 연,월 합쳐져 있는 컬럼에서 연도만 추출
df_month_rent['month'] = df_month_rent['year_month'].str[4:] # 연,월 합쳐져 있는 컬럼에서 월만 추출
df_month_rent.loc[df_month_rent["day"].str.len()==1,"day"]='0'+df_month_rent.loc[df_month_rent["day"].str.len()==1,"day"] # 일이 있는 컬럼에서 1자리 숫자인 경우 앞에 0을 추가성
df_month_rent['date'] = pd.to_datetime(df_month_rent['year']+df_month_rent['month']+df_month_rent['day']) # 일자들을 합쳐서 date 컬럼 생
df_month_rent = df_month_rent.astype({'year':'int64','month':'int64','day':'int64'})
df_month_rent = df_month_rent.drop(['year_month'], axis=1) # 사용 안하는 컬럼들 제거

df_month_rent["address_0"] = df_month_rent["address"].str.split(' ',expand=True)[0] # '시' 만 추출해야 하나, 서울만 함으로 일단은 실행 X
df_month_rent["address_1"] = df_month_rent["address"].str.split(' ',expand=True)[1] # '구' 만 추출
df_month_rent["address_2"] = df_month_rent["address"].str.split(' ',expand=True)[2] # '동' 만 추출
df_month_rent["road_name"] = df_month_rent["road"].str.split(' ',expand=True)[0] # '도로명' 만 추출
df_month_rent["road_number"] = df_month_rent["road"].str.split(' ',expand=True)[1] # '도로숫자' 만 추출
df_month_rent= df_month_rent[['year','month','day','address_0','address_1','address_2','main_number','sub_number','road_name','road_number','area',"rent_deposit","month_rent_price",'name','date']] # 사용할 컬럼만 선택


df_month_rent = df_month_rent.replace('', None) # ''값만 있는 값들을 null 값들로 수정



df_month_rent.loc[df_month_rent['name']=='힐스테이트 서초 젠트리스','main_number'] = 557
df_month_rent.loc[df_month_rent['name']=='힐스테이트 서초 젠트리스','sub_number'] = 0

df_month_rent = df_month_rent[['date','year','month','day','address_0','address_1','address_2','main_number','sub_number','name','area','rent_deposit','month_rent_price']]
df_month_rent.columns =['date','year','month','day','address_0','address_1','address_2','address_3','address_4','name','area','rent_deposit','month_rent_price']


add_1 = list(df_month_rent.loc[df_month_rent['area'].isnull(),'address_1'])
add_2 = list(df_month_rent.loc[df_month_rent['area'].isnull(),'address_2'])
add_3 = list(df_month_rent.loc[df_month_rent['area'].isnull(),'address_3'])
add_4 = list(df_month_rent.loc[df_month_rent['area'].isnull(),'address_4'])
area_list = list()
# area_list 에 값 추가
for i in range(len(add_1)):
    # 해당 주소에서 거래된 매물들의 '층' 정보가 없을 경우, area null을 처리할 참조 자료가 없음으로 ''으로 처리리
    if (len(df_month_rent.loc[(df_month_rent['address_1'] ==add_1[i]) & 
                     (df_month_rent['address_2'] ==add_2[i]) &
                     (df_month_rent['address_3'] ==add_3[i]) &
                     (df_month_rent['address_4'] ==add_4[i]),
                     'area'].value_counts())) == 0:

        area_list.append('')
    else:
        # 해당 주소에서 가장 많이 거래되었던 층수를 null 값에 채울거임
        area_list.append(df_month_rent.loc[(df_month_rent['address_1'] ==add_1[i]) & 
                     (df_month_rent['address_2'] ==add_2[i]) &
                     (df_month_rent['address_3'] ==add_3[i]) &
                     (df_month_rent['address_4'] ==add_4[i]),
                     'area'].value_counts().idxmax())

for i in range(len(add_1)):
    df_month_rent.loc[(df_month_rent['address_1'] ==add_1[i]) & 
                         (df_month_rent['address_2'] ==add_2[i]) &
                         (df_month_rent['address_3'] ==add_3[i]) &
                         (df_month_rent['address_4'] ==add_4[i]),
                         'area']=area_list[i]

df_month_rent.to_csv('/content/drive/MyDrive/house_price/after_data/ver_4/apartment_month_rent.csv', index=False)

  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))
  df_list.append(pd.read_csv(dir_path+"/"+csv_file ,skiprows=15,  encoding='cp949'))


# economic_data 파일생성

- EDA_file3 에서 생성한 economic_data_temp 를 불러와서 진행

In [None]:
import pandas as pd
import datetime
df_final = pd.read_csv('/content/drive/MyDrive/house_price/after_data/ver_3/economic_data_temp.csv',encoding='UTF8')

# 금리차 컬럼들을 추가
df_final['korea_10-3_year'] = df_final['korea_10_year'] - df_final['korea_3_year']
df_final['us_10-2_year'] = df_final['us_10_year'] - df_final['us_2_year']
df_final['us_10-3_year_month'] = df_final['us_10_year'] - df_final['us_3_month']

df_apartment_supply = pd.read_csv("/content/drive/MyDrive/house_price/original_data/apartment_supply.txt",  encoding='UTF8',sep="\t")

# 년, 월 컬럼들 생성
# ' ' 을 기준으로 잘라서 컬럼들을 생성
df_apartment_supply['year'] =df_apartment_supply['입주년월'].str.split(' ',expand=True)[0]
df_apartment_supply['month'] =df_apartment_supply['입주년월'].str.split(' ',expand=True)[1]

# 문자열 특정 문자들 수정
# 추후 데이터프레임 계산에 용이하게 문자들을 수정 
df_apartment_supply["year"] = df_apartment_supply["year"].str.replace("년", "")
df_apartment_supply["month"] = df_apartment_supply["month"].str.replace("월", "")
df_apartment_supply["apartment_supply"] = df_apartment_supply["총세대수"].str.replace("세대", "")
df_apartment_supply["apartment_supply"] = df_apartment_supply["apartment_supply"].str.replace(",", "")

# date 컬럼 생성
df_apartment_supply['date'] = pd.to_datetime(df_apartment_supply['year']+'-'+df_apartment_supply['month'], format="%Y-%m")
# 다음 달에 지수가 발표한다고 가정
df_apartment_supply['date_column'] = df_apartment_supply['date'] + datetime.timedelta(days=32)
df_apartment_supply['announcement_year'] = df_apartment_supply['date_column'].dt.year
df_apartment_supply['announcement_month'] = df_apartment_supply['date_column'].dt.month

# 사용할 컬럼만을 거른 후, 타입 변경
df_apartment_supply = df_apartment_supply[['announcement_year','announcement_month','apartment_supply']]
df_apartment_supply = df_apartment_supply.astype({'apartment_supply': 'int64'})

# 연, 월별 분양공급량을 group by를 통해서 구한 후, reset_index를 통해서 다시 컬럼화
df_apartment_supply=df_apartment_supply.groupby(['announcement_year','announcement_month'])['apartment_supply'].agg('sum')
df_apartment_supply = df_apartment_supply.reset_index(['announcement_year','announcement_month'])

df_apartment_unsold = pd.read_excel("/content/drive/MyDrive/house_price/original_data/unsold/서울 미분양 현황.xlsx")
df_apartment_unsold.index = df_apartment_unsold['구분']
df_apartment_unsold=df_apartment_unsold.drop('구분',axis=1)

# T 매소드를 통해서 row와 column을 교환환
df_apartment_unsold=df_apartment_unsold.T

# index가 날짜의 정보를 가지고 있음으로 reset_index를 통해서 날짜 정보를 컬럼으로 생성성
df_apartment_unsold = df_apartment_unsold.reset_index()

# 컬럼명 수정정
df_apartment_unsold.columns=['year_month','unsold_count','ratio']

# year_month 컬럼에서 ' 부분을 제거
df_apartment_unsold["year_month"] = df_apartment_unsold["year_month"].str.replace("'", "")

# 연, 월 컬럼 생성성
df_apartment_unsold['year'] =df_apartment_unsold["year_month"].str.split('.',expand=True)[0]
df_apartment_unsold['month'] =df_apartment_unsold["year_month"].str.split('.',expand=True)[1]

# 연 컬럼 수정 및 사용할 컬럼 선택택
df_apartment_unsold['year'] = '20'+df_apartment_unsold['year']
df_apartment_unsold = df_apartment_unsold[['year','month','unsold_count']]

# 미분양에 대한 정보는 한달이 지나야 결과를 알 수 있다 가정정
df_apartment_unsold['date'] = pd.to_datetime(df_apartment_unsold['year']+'-'+df_apartment_unsold['month'], format="%Y-%m")
df_apartment_unsold['date_column'] = df_apartment_unsold['date'] + datetime.timedelta(days=32)
df_apartment_unsold['announcement_year'] = df_apartment_unsold['date_column'].dt.year
df_apartment_unsold['announcement_month'] = df_apartment_unsold['date_column'].dt.month
df_apartment_unsold = df_apartment_unsold[['announcement_year','announcement_month','unsold_count']]
df_apartment_unsold = df_apartment_unsold.astype({'unsold_count': 'int64'})

# 사용할 연도의의 범위만을 설정 
df_apartment_unsold=df_apartment_unsold[df_apartment_unsold['announcement_year']>=2011]

# 데이터 프레임 병합합
df_apartment_supply_unsold=pd.merge(df_apartment_supply, df_apartment_unsold, on=['announcement_year','announcement_month'], how='inner')

# 미분양 비율을 구함
df_apartment_supply_unsold['unsold_ratio'] = 100*(df_apartment_supply_unsold['unsold_count'] / df_apartment_supply_unsold['apartment_supply'])

# 데이터 병합
df_final=pd.merge(df_final, df_apartment_supply_unsold, left_on=['year','month'], right_on=['announcement_year','announcement_month'], how='left')
df_final = df_final.drop(["announcement_year", "announcement_month"], axis=1)

df_final = df_final.dropna(subset=['apartment_supply']) # 해당 컬럼에 null 값이 있기에 제거

df_final.to_csv('/content/drive/MyDrive/house_price/after_data/ver_4/economic_data.csv',index=False)

# economic_data2 파일 생성 


- EDA_file3에서의 과정과 동일

In [None]:
import pandas as pd
import numpy as np
# 데이터들 불러오기
df_deal = pd.read_csv("/content/drive/MyDrive/house_price/after_data/ver_4/apartment_deal.csv",  encoding='UTF8')
df_month_rent = pd.read_csv("/content/drive/MyDrive/house_price/after_data/ver_4/apartment_month_rent.csv",  encoding='UTF8')
df_full_rent = pd.read_csv("/content/drive/MyDrive/house_price/after_data/ver_4/apartment_full_rent.csv",  encoding='UTF8')
df_economic = pd.read_csv("/content/drive/MyDrive/house_price/after_data/ver_4/economic_data.csv",  encoding='UTF8')

# 서울 아파트 월별 거래량을 group by를 이용하여여 계산
df_count = df_deal.groupby(["year","month"])["name"].agg('count').copy()
df_count = df_count.reset_index(["year","month"]) # index로 있던 컬럼들을 다시 컬럼화
df_count.columns = ["year","month","deal_count"] # 컬럼명들 수정정

df_temp = df_full_rent.groupby(["year","month"])["name"].agg('count').copy()
df_temp = df_temp.reset_index(["year","month"])
df_temp.columns = ["year","month","full_rent_count"]

# 아파트 매매 체결량 데이터프레임과 아파트 전세 체결량 데이터프레임임을 병합합
df_count=pd.merge(df_count,df_temp, left_on=["year","month"], right_on=["year","month"], how="inner")

df_temp = df_month_rent.groupby(["year","month"])["name"].agg('count').copy()
df_temp = df_temp.reset_index(["year","month"])
df_temp.columns = ["year","month","month_rent_count"]

# 아파트 월세 거래량 데이터프레임을 추가하여 병합합
df_count=pd.merge(df_count,df_temp, left_on=["year","month"], right_on=["year","month"], how="inner")

df_count['deal_count'] = df_count['deal_count'].shift(1)
df_count['month_rent_count'] = df_count['month_rent_count'].shift(1)
df_count['full_rent_count'] = df_count['full_rent_count'].shift(1)

# 컬럼명 수정
df_count.columns = ['year','month','last_month_total_deal_count','last_month_total_full_rent_count', 'last_month_total_month_rent_count']

df_count.dropna(axis=0,inplace=True)
df_count.reset_index(inplace=True,drop=True)

- EDA_file3에서와 병합한 결과가 다르기에 중간결과만 확인(date와 day 컬럼이 추가가 됨)

In [None]:
# 거시경제 지표가 모든 날짜들에 대한 정보를 가지고 있음으로, year과 month를 통해서 병합
df_economic=pd.merge(df_economic, df_count, left_on=["year","month"], right_on=["year","month"], how="inner")

df_economic = df_economic.rename(columns={'apartment_supply':  'last_month_total_apartment_supply', 'unsold_count' : 'last_month_total_unsold_count', 
                                          'unsold_ratio' : 'last_month_total_unsold_ratio'})


# 데이터프레임 타입 변경 
df_economic=df_economic.astype({'year': 'int16','month': 'int16',
                    'last_month_total_apartment_supply': 'int32',
                    'last_month_total_unsold_count': 'int32',
                    'last_month_total_deal_count': 'int32',
                    'last_month_total_full_rent_count': 'int32',
                    'last_month_total_month_rent_count': 'int32'})


df_economic.head()

Unnamed: 0,date,year,month,day,apartment_index,kospi_index,korea_rp,korea_3_year,korea_10_year,us_3_month,...,us_10_year,korea_10-3_year,us_10-2_year,us_10-3_year_month,last_month_total_apartment_supply,last_month_total_unsold_count,last_month_total_unsold_ratio,last_month_total_deal_count,last_month_total_full_rent_count,last_month_total_month_rent_count
0,2011-02-01,2011,2,1,93.0,2072.03,2.75,3.97,4.71,0.157,...,3.435,0.74,2.83,3.278,5342,2269,42.474729,7179,12336,2514
1,2011-02-02,2011,2,2,93.0,2072.03,2.75,3.97,4.71,0.157,...,3.479,0.74,2.815,3.322,5342,2269,42.474729,7179,12336,2514
2,2011-02-03,2011,2,3,93.0,2072.03,2.75,3.97,4.71,0.152,...,3.547,0.74,2.835,3.395,5342,2269,42.474729,7179,12336,2514
3,2011-02-04,2011,2,4,93.0,2072.03,2.75,3.97,4.71,0.152,...,3.638,0.74,2.886,3.486,5342,2269,42.474729,7179,12336,2514
4,2011-02-05,2011,2,5,93.0,2072.03,2.75,3.97,4.71,0.152,...,3.638,0.74,2.886,3.486,5342,2269,42.474729,7179,12336,2514


In [None]:
# csv 파일 저장
df_economic.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/economic_data2.pkl')

# final_economic 파일 생성

- 일부분만 EDA_file3와 다르고, 나머지는 다 동일

In [None]:
import pandas as pd
# 데이터 프레임 불러오기기
df_economic = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/economic_data2.pkl')


# 월별 평균 값을 구한 데이터프레임 2개 구함(추후 병합에 사용) 
df_economic_6m_before = df_economic.drop(['date','day','apartment_index'],axis=1).copy()

# EDA_file3와 다른 부분분
# EDA_file3 에서는 월단위여서 mean을 할 필요가 없었으나, 여기서는 일 단위여서 월단위로 묶어서 평균을 할 필요가 있음음
df_economic_6m_before = df_economic_6m_before.groupby(['year','month']).agg('mean').reset_index()


df_economic_12m_before = df_economic.drop(['date','day','apartment_index'],axis=1).copy()
df_economic_12m_before = df_economic_6m_before.groupby(['year','month']).agg('mean').reset_index()

# 6달전 날짜들 구한
df_economic.loc[df_economic['month']<7, '6m_before_year'] = df_economic['year']-1
df_economic.loc[df_economic['month']<7, '6m_before_month'] = 12-(6-df_economic['month'])
df_economic.loc[df_economic['month']>=7, '6m_before_year'] = df_economic['year']
df_economic.loc[df_economic['month']>=7, '6m_before_month'] = df_economic['month']-6

# 12달전 날짜들 구한
df_economic.loc[:, '12m_before_year'] = df_economic['year']-1
df_economic.loc[:, '12m_before_month'] = df_economic['month']

df_economic=df_economic.astype({'6m_before_year': 'int16','6m_before_month': 'int16'})

# 추가할 컬럼들의 컬럼명들을 생성
temp_column_total_list = list()
month_num_list = [6,12] # 1개월,3개월,6개월,12개월 이전 자료들 생성
for i in month_num_list:
    column_list = list()
    column_list.append('year_'+str(i)+'m_before')
    column_list.append('month_'+str(i)+'m_before')
    column_list.append('kospi_index_'+str(i)+'m_before')
    column_list.append('korea_rp_'+str(i)+'m_before')
    column_list.append('korea_3_year_'+str(i)+'m_before')
    column_list.append('korea_10_year_'+str(i)+'m_before')
    column_list.append('us_3_month_'+str(i)+'m_before')
    column_list.append('us_2_year_'+str(i)+'m_before')
    column_list.append('us_10_year_'+str(i)+'m_before')
    column_list.append('korea_10-3_year_'+str(i)+'m_before')
    column_list.append('us_10-2_year_'+str(i)+'m_before')
    column_list.append('us_10-3_year_month_'+str(i)+'m_before')
    column_list.append('last_month_total_apartment_supply_'+str(i)+'m_before')
    column_list.append('last_month_total_unsold_count_'+str(i)+'m_before')
    column_list.append('last_month_total_unsold_ratio_'+str(i)+'m_before')
    column_list.append('last_month_total_deal_count_'+str(i)+'m_before')
    column_list.append('last_month_total_full_rent_count_'+str(i)+'m_before')
    column_list.append('last_month_total_month_rent_count_'+str(i)+'m_before')
    temp_column_total_list.append(column_list)

df_economic_6m_before.columns = temp_column_total_list[0]
df_economic_12m_before.columns = temp_column_total_list[1]

df_economic = pd.merge(df_economic, df_economic_6m_before, left_on=['6m_before_year', '6m_before_month'], right_on=['year_6m_before','month_6m_before'], how='inner')
df_economic = pd.merge(df_economic, df_economic_12m_before, left_on=['12m_before_year', '12m_before_month'], right_on=['year_12m_before','month_12m_before'], how='inner')
df_economic = df_economic.drop(["6m_before_year", "6m_before_month", "12m_before_year", "12m_before_month", "year_6m_before", "month_6m_before","year_12m_before", "month_12m_before"], axis=1)


column_list = list()
column_list.append(['kospi_index', 'korea_rp',
       'korea_3_year', 'korea_10_year', 'us_3_month', 'us_2_year',
       'us_10_year', 'korea_10-3_year', 'us_10-2_year', 'us_10-3_year_month',
       'last_month_total_apartment_supply', 'last_month_total_unsold_count',
       'last_month_total_unsold_ratio', 'last_month_total_deal_count',
       'last_month_total_full_rent_count', 'last_month_total_month_rent_count'])

column_list.append(temp_column_total_list[0][2:])
column_list.append(temp_column_total_list[1][2:])


# 변화정도 = 현재데이터 - 과거데이터 
for i in range(len(column_list[0])):
  df_economic[column_list[1][i]] = df_economic[column_list[0][i]] - df_economic[column_list[1][i]]
  df_economic[column_list[2][i]] = df_economic[column_list[0][i]] - df_economic[column_list[2][i]]

# type 이 floay64 인 컬럼을 float32로 변경, 메모리 사용량을 줄이기 위해서서
df_economic_columns = list(df_economic.columns)
for df_economic_column in df_economic_columns:
    if df_economic[df_economic_column].dtypes =='float64':
        df_economic[df_economic_column]=df_economic[df_economic_column].astype('float32')
    else:
        pass

df_economic.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/final_economic.pkl')

# df_area_deal, df_area_full_rent, df_area_year_rent 파일들 생성

- '아파트 거래'가 체결된 날 이외의 날들은 가장 최근에 체결된 거래가격이 유지된다고 가정

## 필요한 데이터들 불러오기

In [None]:
import pandas as pd
import numpy as np
# 데이터들 불러오기
df_deal = pd.read_csv("/content/drive/MyDrive/house_price/after_data/ver_4/apartment_deal.csv",  encoding='UTF8')
df_month_rent = pd.read_csv("/content/drive/MyDrive/house_price/after_data/ver_4/apartment_month_rent.csv",  encoding='UTF8')
df_full_rent = pd.read_csv("/content/drive/MyDrive/house_price/after_data/ver_4/apartment_full_rent.csv",  encoding='UTF8')

## df_area_deal 파일 생성

### 아파트 월별 매매 피봇 테이블 생성

In [None]:
# 대표 데이터 파악
df_deal.head()

Unnamed: 0,date,year,month,day,address_0,address_1,address_2,address_3,address_4,name,area,deal_price
0,2011-07-09,2011,7,9,서울특별시,강남구,개포동,655.0,2.0,개포2차현대아파트(220),77.75,64000
1,2011-07-28,2011,7,28,서울특별시,강남구,개포동,655.0,2.0,개포2차현대아파트(220),77.75,65500
2,2011-01-19,2011,1,19,서울특별시,강남구,개포동,658.0,1.0,개포6차우성아파트1동~8동,67.28,70500
3,2011-09-02,2011,9,2,서울특별시,강남구,개포동,658.0,1.0,개포6차우성아파트1동~8동,79.97,85000
4,2011-12-17,2011,12,17,서울특별시,강남구,개포동,658.0,1.0,개포6차우성아파트1동~8동,67.28,68000


In [None]:
df_deal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891388 entries, 0 to 891387
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        891388 non-null  object 
 1   year        891388 non-null  int64  
 2   month       891388 non-null  int64  
 3   day         891388 non-null  int64  
 4   address_0   891388 non-null  object 
 5   address_1   891388 non-null  object 
 6   address_2   891388 non-null  object 
 7   address_3   891388 non-null  float64
 8   address_4   891388 non-null  float64
 9   name        891388 non-null  object 
 10  area        891388 non-null  float64
 11  deal_price  891388 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 81.6+ MB


In [None]:
# 면적당 가격 컬럼을 추가
df_deal['area_deal_price'] = df_deal['deal_price'] / df_deal['area']
df_deal.head()

Unnamed: 0,date,year,month,day,address_0,address_1,address_2,address_3,address_4,name,area,deal_price,area_deal_price
0,2011-07-09,2011,7,9,서울특별시,강남구,개포동,655.0,2.0,개포2차현대아파트(220),77.75,64000,823.151125
1,2011-07-28,2011,7,28,서울특별시,강남구,개포동,655.0,2.0,개포2차현대아파트(220),77.75,65500,842.44373
2,2011-01-19,2011,1,19,서울특별시,강남구,개포동,658.0,1.0,개포6차우성아파트1동~8동,67.28,70500,1047.859691
3,2011-09-02,2011,9,2,서울특별시,강남구,개포동,658.0,1.0,개포6차우성아파트1동~8동,79.97,85000,1062.898587
4,2011-12-17,2011,12,17,서울특별시,강남구,개포동,658.0,1.0,개포6차우성아파트1동~8동,67.28,68000,1010.701546


In [None]:
# 최근에 체결된 가격이 계속 유지된다고 생각을 하고 모든 날짜의 가격들을 결정
# 이를 위해서 그룹
import numpy as np
pivot_table_area_deal = df_deal.pivot_table(index=['year','month','day'], columns=['address_1','address_2','address_3','address_4'], values='area_deal_price')
pivot_table_area_deal


Unnamed: 0_level_0,Unnamed: 1_level_0,address_1,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,...,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구
Unnamed: 0_level_1,Unnamed: 1_level_1,address_2,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,...,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동
Unnamed: 0_level_2,Unnamed: 1_level_2,address_3,12.0,12.0,138.0,140.0,141.0,166.0,172.0,176.0,177.0,179.0,...,307.0,314.0,318.0,331.0,413.0,438.0,450.0,452.0,453.0,454.0
Unnamed: 0_level_3,Unnamed: 1_level_3,address_4,0.0,2.0,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,...,76.0,1.0,81.0,64.0,8.0,0.0,0.0,0.0,0.0,0.0
year,month,day,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
2011,1,1,,,,,,,,,,,...,,,,,,,,,,
2011,1,2,,,,,,,,,,,...,,,,,,,431.726908,,,
2011,1,3,,,,,,,,,,,...,,,,,,,,,,
2011,1,4,1018.685955,,,,,,,,,,...,,,,,,,,,,
2011,1,5,1087.781432,,2101.057579,,1887.191539,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,4,26,,,,,,,,,,,...,,,,,,,,,,
2023,4,27,,,,,,,,,,,...,,,,,,,,,,
2023,4,28,,,,,,,,,,,...,,,,,,,,,,
2023,4,29,,,,,,,,,,,...,,,,,,,,,,


In [None]:
pivot_table_area_deal.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4501 entries, (2011, 1, 1) to (2023, 4, 30)
Columns: 8890 entries, ('강남구', '개포동', 12.0, 0.0) to ('중랑구', '중화동', 454.0, 0.0)
dtypes: float64(8890)
memory usage: 305.3 MB


In [None]:
# 2011년 1월 1일부터 2022년 12월 31일 까지의 모든 일자들을 리스트에 선언 
from datetime import datetime, timedelta

start_date = datetime(2011, 1, 1)  # 시작 날짜
end_date = datetime(2023, 4, 30)  # 끝 날짜

date_list = []
current_date = start_date
while current_date <= end_date:
    date_tuple = (current_date.year, current_date.month, current_date.day)
    date_list.append(date_tuple)
    current_date += timedelta(days=1)

print(date_list)

[(2011, 1, 1), (2011, 1, 2), (2011, 1, 3), (2011, 1, 4), (2011, 1, 5), (2011, 1, 6), (2011, 1, 7), (2011, 1, 8), (2011, 1, 9), (2011, 1, 10), (2011, 1, 11), (2011, 1, 12), (2011, 1, 13), (2011, 1, 14), (2011, 1, 15), (2011, 1, 16), (2011, 1, 17), (2011, 1, 18), (2011, 1, 19), (2011, 1, 20), (2011, 1, 21), (2011, 1, 22), (2011, 1, 23), (2011, 1, 24), (2011, 1, 25), (2011, 1, 26), (2011, 1, 27), (2011, 1, 28), (2011, 1, 29), (2011, 1, 30), (2011, 1, 31), (2011, 2, 1), (2011, 2, 2), (2011, 2, 3), (2011, 2, 4), (2011, 2, 5), (2011, 2, 6), (2011, 2, 7), (2011, 2, 8), (2011, 2, 9), (2011, 2, 10), (2011, 2, 11), (2011, 2, 12), (2011, 2, 13), (2011, 2, 14), (2011, 2, 15), (2011, 2, 16), (2011, 2, 17), (2011, 2, 18), (2011, 2, 19), (2011, 2, 20), (2011, 2, 21), (2011, 2, 22), (2011, 2, 23), (2011, 2, 24), (2011, 2, 25), (2011, 2, 26), (2011, 2, 27), (2011, 2, 28), (2011, 3, 1), (2011, 3, 2), (2011, 3, 3), (2011, 3, 4), (2011, 3, 5), (2011, 3, 6), (2011, 3, 7), (2011, 3, 8), (2011, 3, 9), (2011,

In [None]:
len(date_list)

4503

In [None]:
pivot_table_area_deal.index

MultiIndex([(2011, 1,  1),
            (2011, 1,  2),
            (2011, 1,  3),
            (2011, 1,  4),
            (2011, 1,  5),
            (2011, 1,  6),
            (2011, 1,  7),
            (2011, 1,  8),
            (2011, 1,  9),
            (2011, 1, 10),
            ...
            (2023, 4, 21),
            (2023, 4, 22),
            (2023, 4, 23),
            (2023, 4, 24),
            (2023, 4, 25),
            (2023, 4, 26),
            (2023, 4, 27),
            (2023, 4, 28),
            (2023, 4, 29),
            (2023, 4, 30)],
           names=['year', 'month', 'day'], length=4501)

In [None]:
# 기간 내 모든 날짜들에서 '거래날짜'들 빼서 '거래날짜'에서 없는 날짜들을 고름
print(set(date_list) - set(pivot_table_area_deal.index)) # '모든날짜'에 있고 '거래날짜'에 없는 날짜
print(set(pivot_table_area_deal.index) - set(date_list)) # 잘못 추가 생성된 날짜 

{(2016, 2, 9), (2022, 9, 11)}
set()


In [None]:
# 빈 날짜 들(거래날짜에서 포함되지 않은 날짜의 거래가격들)을 null 로 채워서 row로 추가
pivot_table_area_deal.loc[(2016, 2, 9)]=np.nan
pivot_table_area_deal.loc[(2022, 9, 11)]=np.nan

In [None]:
# 연, 월, 일 로 정렬을 함 - 정렬을 하지 않으면 바로 위에서 추가한 row들이 적절한 위치에 들어가 있지 않는다
pivot_table_area_deal = pivot_table_area_deal.sort_values(by=['year','month','day'])
pivot_table_area_deal

Unnamed: 0_level_0,Unnamed: 1_level_0,address_1,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,...,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구
Unnamed: 0_level_1,Unnamed: 1_level_1,address_2,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,...,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동
Unnamed: 0_level_2,Unnamed: 1_level_2,address_3,12.0,12.0,138.0,140.0,141.0,166.0,172.0,176.0,177.0,179.0,...,307.0,314.0,318.0,331.0,413.0,438.0,450.0,452.0,453.0,454.0
Unnamed: 0_level_3,Unnamed: 1_level_3,address_4,0.0,2.0,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,...,76.0,1.0,81.0,64.0,8.0,0.0,0.0,0.0,0.0,0.0
year,month,day,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
2011,1,1,,,,,,,,,,,...,,,,,,,,,,
2011,1,2,,,,,,,,,,,...,,,,,,,431.726908,,,
2011,1,3,,,,,,,,,,,...,,,,,,,,,,
2011,1,4,1018.685955,,,,,,,,,,...,,,,,,,,,,
2011,1,5,1087.781432,,2101.057579,,1887.191539,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,4,26,,,,,,,,,,,...,,,,,,,,,,
2023,4,27,,,,,,,,,,,...,,,,,,,,,,
2023,4,28,,,,,,,,,,,...,,,,,,,,,,
2023,4,29,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# 가장 최근에 체결된 값이 거래가격으로 유지 됨으로 ffill()을 사용
pivot_table_area_deal=pivot_table_area_deal.ffill()
pivot_table_area_deal

Unnamed: 0_level_0,Unnamed: 1_level_0,address_1,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,...,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구
Unnamed: 0_level_1,Unnamed: 1_level_1,address_2,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,...,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동
Unnamed: 0_level_2,Unnamed: 1_level_2,address_3,12.0,12.0,138.0,140.0,141.0,166.0,172.0,176.0,177.0,179.0,...,307.0,314.0,318.0,331.0,413.0,438.0,450.0,452.0,453.0,454.0
Unnamed: 0_level_3,Unnamed: 1_level_3,address_4,0.0,2.0,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,...,76.0,1.0,81.0,64.0,8.0,0.0,0.0,0.0,0.0,0.0
year,month,day,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
2011,1,1,,,,,,,,,,,...,,,,,,,,,,
2011,1,2,,,,,,,,,,,...,,,,,,,431.726908,,,
2011,1,3,,,,,,,,,,,...,,,,,,,431.726908,,,
2011,1,4,1018.685955,,,,,,,,,,...,,,,,,,431.726908,,,
2011,1,5,1087.781432,,2101.057579,,1887.191539,,,,,,...,,,,,,,431.726908,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,4,26,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746
2023,4,27,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746
2023,4,28,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746
2023,4,29,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746


In [None]:
# null 값을 채움 - 값을 채우지 않으면 추후 stack을 할 때 null 값을 계산을 안함
pivot_table_area_deal = pivot_table_area_deal.fillna(0)
pivot_table_area_deal

Unnamed: 0_level_0,Unnamed: 1_level_0,address_1,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,...,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구
Unnamed: 0_level_1,Unnamed: 1_level_1,address_2,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,...,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동
Unnamed: 0_level_2,Unnamed: 1_level_2,address_3,12.0,12.0,138.0,140.0,141.0,166.0,172.0,176.0,177.0,179.0,...,307.0,314.0,318.0,331.0,413.0,438.0,450.0,452.0,453.0,454.0
Unnamed: 0_level_3,Unnamed: 1_level_3,address_4,0.0,2.0,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,...,76.0,1.0,81.0,64.0,8.0,0.0,0.0,0.0,0.0,0.0
year,month,day,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
2011,1,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000
2011,1,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,431.726908,0.000000,0.000000,0.000000
2011,1,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,431.726908,0.000000,0.000000,0.000000
2011,1,4,1018.685955,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,431.726908,0.000000,0.000000,0.000000
2011,1,5,1087.781432,0.000000,2101.057579,0.000000,1887.191539,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,431.726908,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,4,26,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746
2023,4,27,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746
2023,4,28,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746
2023,4,29,2712.477396,1779.004227,3297.187014,2487.219819,4324.324324,1413.594063,1342.758827,2172.968275,2136.100092,3014.696646,...,589.761736,872.199239,466.954023,956.130484,595.238095,818.61013,1029.116466,727.417008,1006.355932,1131.141746


>> stack을 할 때 null 값을 계산을 안함으로, 계산 시 값 변경을 예방하기 위해서 null 값들을 채워야 한다

### 피봇테이블 -> 데이터프레임

In [None]:
# 컬럼을 slice해서 값을 처리할 때, 컬럼의 개수가 많으면, row가 많을 때 보다 메모리를 많이 소모함으로 전치를 시킴킴
pivot_table_area_deal = pivot_table_area_deal.T
pivot_table_area_deal

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,year,2011,2011,2011,2011,2011,2011,2011,2011,2011,2011,...,2023,2023,2023,2023,2023,2023,2023,2023,2023,2023
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,month,1,1,1,1,1,1,1,1,1,1,...,4,4,4,4,4,4,4,4,4,4
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,day,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
address_1,address_2,address_3,address_4,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3
강남구,개포동,12.0,0.0,0.0,0.000000,0.000000,1018.685955,1087.781432,1040.914561,1054.852321,1054.852321,1054.852321,1054.852321,...,2712.477396,2712.477396,2712.477396,2712.477396,2712.477396,2712.477396,2712.477396,2712.477396,2712.477396,2712.477396
강남구,개포동,12.0,2.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1779.004227,1779.004227,1779.004227,1779.004227,1779.004227,1779.004227,1779.004227,1779.004227,1779.004227,1779.004227
강남구,개포동,138.0,0.0,0.0,0.000000,0.000000,0.000000,2101.057579,2101.057579,2101.057579,2101.057579,2101.057579,2101.057579,...,3297.187014,3297.187014,3297.187014,3297.187014,3297.187014,3297.187014,3297.187014,3297.187014,3297.187014,3297.187014
강남구,개포동,140.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1565.991903,2064.490759,...,2487.219819,2487.219819,2487.219819,2487.219819,2487.219819,2487.219819,2487.219819,2487.219819,2487.219819,2487.219819
강남구,개포동,141.0,0.0,0.0,0.000000,0.000000,0.000000,1887.191539,1887.191539,1887.191539,1887.191539,1887.191539,1887.191539,...,4324.324324,4324.324324,4324.324324,4324.324324,4324.324324,4324.324324,4324.324324,4324.324324,4324.324324,4324.324324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
중랑구,중화동,438.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,818.610130,818.610130,818.610130,818.610130,818.610130,818.610130,818.610130,818.610130,818.610130,818.610130
중랑구,중화동,450.0,0.0,0.0,431.726908,431.726908,431.726908,431.726908,485.274431,485.274431,485.274431,485.274431,485.274431,...,1029.116466,1029.116466,1029.116466,1029.116466,1029.116466,1029.116466,1029.116466,1029.116466,1029.116466,1029.116466
중랑구,중화동,452.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,727.417008,727.417008,727.417008,727.417008,727.417008,727.417008,727.417008,727.417008,727.417008,727.417008
중랑구,중화동,453.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1006.355932,1006.355932,1006.355932,1006.355932,1006.355932,1006.355932,1006.355932,1006.355932,1006.355932,1006.355932


- pandas는 row 개수가 많은것이, column의 개수가 많은 것보다 더 메모리 부담이 크다. why?

In [None]:
# 피봇테이블을 일반데이터프레임화 
df_area_deal = pivot_table_area_deal.stack(level=[0,1,2])
df_area_deal =df_area_deal.reset_index()
df_area_deal

Unnamed: 0,address_1,address_2,address_3,address_4,year,month,day,0
0,강남구,개포동,12.0,0.0,2011,1,1,0.000000
1,강남구,개포동,12.0,0.0,2011,1,2,0.000000
2,강남구,개포동,12.0,0.0,2011,1,3,0.000000
3,강남구,개포동,12.0,0.0,2011,1,4,1018.685955
4,강남구,개포동,12.0,0.0,2011,1,5,1087.781432
...,...,...,...,...,...,...,...,...
40031665,중랑구,중화동,454.0,0.0,2023,4,26,1131.141746
40031666,중랑구,중화동,454.0,0.0,2023,4,27,1131.141746
40031667,중랑구,중화동,454.0,0.0,2023,4,28,1131.141746
40031668,중랑구,중화동,454.0,0.0,2023,4,29,1131.141746


In [None]:
df_area_deal.columns = ['address_1','address_2','address_3','address_4','year','month','day','area_deal'] # 컬럼명 수정 
df_area_deal = df_area_deal.astype({'address_3': 'int16', 'address_4': 'int16','year':'int16', 'month':'int16', 'day':'int16', 'area_deal':'float32'})
df_area_deal = df_area_deal.drop(df_area_deal[df_area_deal.area_deal == 0].index) # 위에서 값이 null인 값들을 0으로 처리했으므로, 0인 값들을 제거한다 
df_area_deal

Unnamed: 0,address_1,address_2,address_3,address_4,year,month,day,area_deal
3,강남구,개포동,12,0,2011,1,4,1018.685974
4,강남구,개포동,12,0,2011,1,5,1087.781372
5,강남구,개포동,12,0,2011,1,6,1040.914551
6,강남구,개포동,12,0,2011,1,7,1054.852295
7,강남구,개포동,12,0,2011,1,8,1054.852295
...,...,...,...,...,...,...,...,...
40031665,중랑구,중화동,454,0,2023,4,26,1131.141724
40031666,중랑구,중화동,454,0,2023,4,27,1131.141724
40031667,중랑구,중화동,454,0,2023,4,28,1131.141724
40031668,중랑구,중화동,454,0,2023,4,29,1131.141724


In [None]:
df_area_deal.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33909436 entries, 3 to 40031669
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address_1  object 
 1   address_2  object 
 2   address_3  int16  
 3   address_4  int16  
 4   year       int16  
 5   month      int16  
 6   day        int16  
 7   area_deal  float32
dtypes: float32(1), int16(5), object(2)
memory usage: 1.2+ GB


### 파일저장

In [None]:
df_area_deal.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_deal.pkl')

## df_area_full_rent 파일 생성

- df_area_deal 파일생성 부분 참조

In [None]:
df_full_rent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1498500 entries, 0 to 1498499
Data columns (total 12 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   date             1498500 non-null  object 
 1   year             1498500 non-null  int64  
 2   month            1498500 non-null  int64  
 3   day              1498500 non-null  int64  
 4   address_0        1498500 non-null  object 
 5   address_1        1498500 non-null  object 
 6   address_2        1498500 non-null  object 
 7   address_3        1498500 non-null  float64
 8   address_4        1498500 non-null  float64
 9   name             1498500 non-null  object 
 10  area             1498500 non-null  float64
 11  full_rent_price  1498500 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 137.2+ MB


In [None]:
import numpy as np
# 면적당 가격을 추가
df_full_rent['area_full_rent_price'] = df_full_rent['full_rent_price'] / df_full_rent['area']
pivot_table_area_full_rent = df_full_rent.pivot_table(index=['year','month','day'], columns=['address_1','address_2','address_3','address_4'], values='area_full_rent_price')
pivot_table_area_full_rent

Unnamed: 0_level_0,Unnamed: 1_level_0,address_1,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,...,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구
Unnamed: 0_level_1,Unnamed: 1_level_1,address_2,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,...,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동
Unnamed: 0_level_2,Unnamed: 1_level_2,address_3,12.0,12.0,138.0,140.0,141.0,166.0,172.0,176.0,177.0,179.0,...,307.0,314.0,318.0,331.0,413.0,438.0,450.0,452.0,453.0,454.0
Unnamed: 0_level_3,Unnamed: 1_level_3,address_4,0.0,2.0,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,...,76.0,1.0,81.0,64.0,8.0,0.0,0.0,0.0,0.0,0.0
year,month,day,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
2011,1,1,,,,,,,,,,,...,,,,,,,,,,
2011,1,2,,,,,,,,,,,...,,,,,,,,,,
2011,1,3,430.053124,469.099032,,,190.044764,,,,,,...,,,,,,,,,,
2011,1,4,416.009890,,,259.109312,159.620342,,,,,,...,,,203.665988,,,,251.004016,,,
2011,1,5,,,217.090981,267.487606,212.476466,,,,,,...,,,,,,,190.408188,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,4,26,766.075782,,,,,,,,,,...,,,,,,,,,,
2023,4,27,818.527648,,,,,,,,,,...,,,,,,,,,,
2023,4,28,,,,,,,,,,,...,,,,,,,,,,
2023,4,29,,,,,,,,,,,...,,,,,,,632.184286,,,


In [None]:
from datetime import datetime, timedelta

start_date = datetime(2011, 1, 1)  # 시작 날짜
end_date = datetime(2023, 4, 30)  # 끝 날짜

date_list = []
current_date = start_date
while current_date <= end_date:
    date_tuple = (current_date.year, current_date.month, current_date.day)
    date_list.append(date_tuple)
    current_date += timedelta(days=1)

# 기간 내 모든 날짜들에서 '거래날짜'들 빼서 '거래날짜'에서 없는 날짜들을 고름
print(set(date_list) - set(pivot_table_area_full_rent.index)) # '모든날짜'에 있고 '거래날짜'에 없는 날짜
print(set(pivot_table_area_full_rent.index) - set(date_list)) # 잘못 추가 생성된 날짜 

set()
set()


In [None]:
pivot_table_area_full_rent = pivot_table_area_full_rent.ffill()
pivot_table_area_full_rent = pivot_table_area_full_rent.fillna(0)
pivot_table_area_full_rent = pivot_table_area_full_rent.T
df_area_full_rent = pivot_table_area_full_rent.stack(level=[0,1,2])
df_area_full_rent =df_area_full_rent.reset_index()
df_area_full_rent.columns = ['address_1','address_2','address_3','address_4','year','month','day','area_full_rent'] # 컬럼명 수정 
df_area_full_rent = df_area_full_rent.drop(df_area_full_rent[df_area_full_rent.area_full_rent == 0].index) # 위에서 값이 null인 값들을 0으로 처리했으므로, 0인 값들을 제거한다 
df_area_full_rent = df_area_full_rent.astype({'address_3': 'int16', 'address_4': 'int16','year':'int16', 'month':'int16', 'day':'int16', 'area_full_rent':'float32'})
df_area_full_rent

Unnamed: 0,address_1,address_2,address_3,address_4,year,month,day,area_full_rent
2,강남구,개포동,12,0,2011,1,3,430.053131
3,강남구,개포동,12,0,2011,1,4,416.009888
4,강남구,개포동,12,0,2011,1,5,416.009888
5,강남구,개포동,12,0,2011,1,6,416.009888
6,강남구,개포동,12,0,2011,1,7,400.000000
...,...,...,...,...,...,...,...,...
41828362,중랑구,중화동,454,0,2023,4,26,466.804993
41828363,중랑구,중화동,454,0,2023,4,27,466.804993
41828364,중랑구,중화동,454,0,2023,4,28,466.804993
41828365,중랑구,중화동,454,0,2023,4,29,466.804993


In [None]:
df_area_full_rent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35627911 entries, 2 to 41828366
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   address_1       object 
 1   address_2       object 
 2   address_3       int16  
 3   address_4       int16  
 4   year            int16  
 5   month           int16  
 6   day             int16  
 7   area_full_rent  float32
dtypes: float32(1), int16(5), object(2)
memory usage: 1.3+ GB


In [None]:
df_area_full_rent.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_full_rent.pkl')

## df_area_year_rent 파일 생성

- df_area_deal 파일 생성 참조
- 아파트월세 피봇테이블 -> 아파트 월별 연세 피봇테이블
- 보증금은 계약시의 상황마다 다를 것
- 전월세전환률을 적용하여서 월세에서의 보증금을 변환
- 거래들마다 상황에 따라 보증금과 월세금액은 다를 수 있음으로, 보증금의 5.8% 값에 월세*12을 더하여 1년간 들어가는 금액인 연세를 계산

In [None]:
df_month_rent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672285 entries, 0 to 672284
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   date              672285 non-null  object 
 1   year              672285 non-null  int64  
 2   month             672285 non-null  int64  
 3   day               672285 non-null  int64  
 4   address_0         672285 non-null  object 
 5   address_1         672285 non-null  object 
 6   address_2         672285 non-null  object 
 7   address_3         672285 non-null  float64
 8   address_4         672285 non-null  float64
 9   name              672285 non-null  object 
 10  area              672285 non-null  float64
 11  rent_deposit      672285 non-null  int64  
 12  month_rent_price  672285 non-null  int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 66.7+ MB


In [None]:
# 보증금의 5.8% 값에 월세*12을 더하여 1년간 들어가는 금액인 연세를 계산
df_month_rent['year_rent_price'] = (df_month_rent['rent_deposit']*0.058)+(df_month_rent['month_rent_price']*12)
df_month_rent['area_year_rent_price'] = df_month_rent['year_rent_price'] / df_month_rent['area']
pivot_table_area_year_rent = df_month_rent.pivot_table(index=['year','month','day'], columns=['address_1','address_2','address_3','address_4'], values='area_year_rent_price')
pivot_table_area_year_rent

Unnamed: 0_level_0,Unnamed: 1_level_0,address_1,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,강남구,...,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구,중랑구
Unnamed: 0_level_1,Unnamed: 1_level_1,address_2,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,개포동,...,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동,중화동
Unnamed: 0_level_2,Unnamed: 1_level_2,address_3,12.0,12.0,138.0,140.0,141.0,172.0,176.0,177.0,179.0,185.0,...,307.0,307.0,314.0,318.0,331.0,438.0,450.0,452.0,453.0,454.0
Unnamed: 0_level_3,Unnamed: 1_level_3,address_4,0.0,2.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,6.0,76.0,1.0,81.0,64.0,0.0,0.0,0.0,0.0,0.0
year,month,day,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
2011,1,1,,,,,,,,,,,...,,,,,,,,,,
2011,1,2,,,,,,,,,,,...,,,,,,,,,,
2011,1,3,,,,,,,,,,,...,,,,,,,,,,
2011,1,4,,,,,,,,,,29.702312,...,,,,,,,,,,
2011,1,5,,,18.284371,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,4,26,,,,,,,,,,,...,,,,,,,,,,
2023,4,27,31.722742,,,,,,,,,,...,,,,,,,,,,
2023,4,28,,,,,,,,,,,...,,,,,,,,,,
2023,4,29,28.332912,,99.341124,,,,,,,,...,,,,,,,,,,


In [None]:
from datetime import datetime, timedelta

start_date = datetime(2011, 1, 1)  # 시작 날짜
end_date = datetime(2023, 4, 30)  # 끝 날짜

date_list = []
current_date = start_date
while current_date <= end_date:
    date_tuple = (current_date.year, current_date.month, current_date.day)
    date_list.append(date_tuple)
    current_date += timedelta(days=1)

# 기간 내 모든 날짜들에서 '거래날짜'들 빼서 '거래날짜'에서 없는 날짜들을 고름
print(set(date_list) - set(pivot_table_area_year_rent.index)) # '모든날짜'에 있고 '거래날짜'에 없는 날짜
print(set(pivot_table_area_year_rent.index) - set(date_list)) # 잘못 추가 생성된 날짜

set()
set()


In [None]:
pivot_table_area_year_rent=pivot_table_area_year_rent.ffill()
pivot_table_area_year_rent = pivot_table_area_year_rent.fillna(0)

# 피봇테이블을 일반데이터프레임화 
pivot_table_area_year_rent = pivot_table_area_year_rent.T
df_area_year_rent = pivot_table_area_year_rent.stack(level=[0,1,2])
df_area_year_rent = df_area_year_rent.reset_index()
df_area_year_rent.columns = ['address_1','address_2','address_3','address_4','year','month','day','area_year_rent'] # 컬럼명 수정 
df_area_year_rent = df_area_year_rent.drop(df_area_year_rent[df_area_year_rent.area_year_rent == 0].index) # 위에서 값이 null인 값들을 0으로 처리했으므로, 0인 값들을 제거한다
df_area_year_rent = df_area_year_rent.astype({'address_3': 'int16', 'address_4': 'int16','year':'int16', 'month':'int16', 'day':'int16', 'area_year_rent':'float32'}) 
df_area_year_rent

Unnamed: 0,address_1,address_2,address_3,address_4,year,month,day,area_year_rent
6,강남구,개포동,12,0,2011,1,7,30.255503
7,강남구,개포동,12,0,2011,1,8,30.255503
8,강남구,개포동,12,0,2011,1,9,30.255503
9,강남구,개포동,12,0,2011,1,10,30.255503
10,강남구,개포동,12,0,2011,1,11,30.255503
...,...,...,...,...,...,...,...,...
37910752,중랑구,중화동,454,0,2023,4,26,22.199171
37910753,중랑구,중화동,454,0,2023,4,27,22.199171
37910754,중랑구,중화동,454,0,2023,4,28,22.199171
37910755,중랑구,중화동,454,0,2023,4,29,22.199171


In [None]:
df_area_year_rent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28251402 entries, 6 to 37910756
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   address_1       object 
 1   address_2       object 
 2   address_3       int16  
 3   address_4       int16  
 4   year            int16  
 5   month           int16  
 6   day             int16  
 7   area_year_rent  float32
dtypes: float32(1), int16(5), object(2)
memory usage: 1023.8+ MB


In [None]:
df_area_year_rent.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_year_rent.pkl')

## df_area_all 생성

### 3개의 파일 병합

- 가치평가 컬럼들을 구하기 위해서 merge를 통해, 매매가, 전세가, 연세가가 다 있는 정보들 만을 거름

In [None]:
import pandas as pd

df_area_deal = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_deal.pkl')
df_area_full_rent = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_full_rent.pkl')

In [None]:
df_area_deal_full_rent = pd.merge(df_area_deal,df_area_full_rent, on=['address_1', 'address_2', 'address_3', 'address_4', 'year', 'month','day'])

In [None]:
df_area_deal_full_rent.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_deal_full_rent.pkl')

- 메모리 부족 이슈로 나누어서 실행

In [None]:
import pandas as pd

df_area_deal_full_rent = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_deal_full_rent.pkl')
df_area_year_rent = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_year_rent.pkl')

In [None]:
df_area_all = pd.merge(df_area_deal_full_rent, df_area_year_rent , on=['address_1', 'address_2', 'address_3', 'address_4', 'year', 'month','day'])

In [None]:
df_area_all.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_all.pkl')

# df_original_dataset 생성

## 일별로 종합 수치들을 확인

- 추후 지역별을 추가하여서 그룹화 가능?

In [None]:
import pandas as pd

df_area_all = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_all.pkl')
df_area_all.head()

Unnamed: 0,address_1,address_2,address_3,address_4,year,month,day,area_deal,area_full_rent,area_year_rent
0,강남구,개포동,12,0,2011,1,7,1054.852295,400.0,30.255503
1,강남구,개포동,12,0,2011,1,8,1054.852295,400.0,30.255503
2,강남구,개포동,12,0,2011,1,9,1054.852295,400.0,30.255503
3,강남구,개포동,12,0,2011,1,10,1054.852295,420.425629,30.255503
4,강남구,개포동,12,0,2011,1,11,1006.830261,434.408142,30.255503


In [None]:
df_area_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25054084 entries, 0 to 25054083
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   address_1       object 
 1   address_2       object 
 2   address_3       int16  
 3   address_4       int16  
 4   year            int16  
 5   month           int16  
 6   day             int16  
 7   area_deal       float32
 8   area_full_rent  float32
 9   area_year_rent  float32
dtypes: float32(3), int16(5), object(2)
memory usage: 1.1+ GB


In [None]:
# 일별로 초반의 데이터들은 체결의 개수가 적어서 데이터의 신빙성이 확보되기가 어렵다 판단
# 일별 거래(매매,전세, 월세) 체결 개수들을 파악해서 너무 개수가 적은 데이터 들을 제거하는 과정 필요 
df_area_all_count = df_area_all.groupby(["year","month","day"])[["area_deal","area_full_rent","area_year_rent"]].count()
df_area_all_count

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,area_deal,area_full_rent,area_year_rent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011,1,2,1,1,1
2011,1,3,6,6,6
2011,1,4,18,18,18
2011,1,5,43,43,43
2011,1,6,79,79,79
...,...,...,...,...,...
2023,4,26,7606,7606,7606
2023,4,27,7606,7606,7606
2023,4,28,7607,7607,7607
2023,4,29,7607,7607,7607


>> 데이터셋을 사용할 때, 각 수치들을 도출한 표본이 어느정도 이상이어야지 데이터로서의 가치가 있다

In [None]:
df_area_all_count.describe() # min의 값과 1분위수의 차이가 매우 큼을 확인

Unnamed: 0,area_deal,area_full_rent,area_year_rent
count,4502.0,4502.0,4502.0
mean,5565.100844,5565.100844,5565.100844
std,1644.460048,1644.460048,1644.460048
min,1.0,1.0,1.0
25%,4441.0,4441.0,4441.0
50%,6014.5,6014.5,6014.5
75%,6913.75,6913.75,6913.75
max,7607.0,7607.0,7607.0


In [None]:
# boxplot 을 통해서 이상치가 있음을 확인 
import plotly.express as px
fig = px.box(df_area_all_count, y="area_deal")
fig.show()

In [None]:
# 막대그래프를 통해서 체결 개수는 순차적으로 증가함을 확인 
# 즉, 특정 개수 이하인 value 기준으로 row들을 제거하면, 과거일자들의 value들도 특정개수 이하일 것이므로, 제거해도 괜찮음을 확인 
import plotly.express as px

df_area_all_count_2 = df_area_all_count.reset_index()
fig = px.bar(df_area_all_count, x=df_area_all_count_2.index, y='area_deal')
fig.show()

In [None]:
# 이상치 제거를 위한 변수들을 선언
q1=df_area_all_count['area_deal'].quantile(0.25)
q2=df_area_all_count['area_deal'].quantile(0.5)
q3=df_area_all_count['area_deal'].quantile(0.75)
iqr=q3-q1
iqr

2472.75

In [None]:
# 이상치의 인덱스들을 확인 
df_area_all_count.loc[df_area_all_count['area_deal']<q1-1.5*iqr,'area_deal'].index


MultiIndex([(2011, 1,  2),
            (2011, 1,  3),
            (2011, 1,  4),
            (2011, 1,  5),
            (2011, 1,  6),
            (2011, 1,  7),
            (2011, 1,  8),
            (2011, 1,  9),
            (2011, 1, 10),
            (2011, 1, 11),
            (2011, 1, 12),
            (2011, 1, 13),
            (2011, 1, 14),
            (2011, 1, 15),
            (2011, 1, 16),
            (2011, 1, 17),
            (2011, 1, 18),
            (2011, 1, 19),
            (2011, 1, 20),
            (2011, 1, 21),
            (2011, 1, 22),
            (2011, 1, 23),
            (2011, 1, 24),
            (2011, 1, 25),
            (2011, 1, 26),
            (2011, 1, 27),
            (2011, 1, 28),
            (2011, 1, 29),
            (2011, 1, 30),
            (2011, 1, 31)],
           names=['year', 'month', 'day'])

## df_area_micro 생성

In [None]:
# 일별로 그룹화해서, 서울 아파트 거래 평균 가격들을 도출 
df_area_micro=df_area_all.groupby(["year","month","day"])[["area_deal","area_full_rent","area_year_rent"]].mean()
df_area_micro

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,area_deal,area_full_rent,area_year_rent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011,1,2,595.000000,259.771637,18.880535
2011,1,3,519.548096,274.167999,16.547453
2011,1,4,704.231018,326.309021,21.519497
2011,1,5,768.772095,326.750854,21.022175
2011,1,6,709.595642,303.017273,19.990875
...,...,...,...,...,...
2023,4,26,1024.179443,580.640137,29.545254
2023,4,27,1024.288940,580.863586,29.561056
2023,4,28,1024.102783,581.085632,29.577700
2023,4,29,1023.981995,581.219116,29.587328


In [None]:
# 위에서 구한 데이터의 개수가 적어서 데이터로서의 가치가 떨어지는 데이터들을 제거 
df_area_micro.drop(df_area_all_count.loc[df_area_all_count['area_deal']<q1-1.5*iqr,'area_deal'].index,inplace=True)
df_area_micro

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,area_deal,area_full_rent,area_year_rent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011,2,1,650.274597,304.564636,21.068266
2011,2,2,650.676758,303.993530,21.060005
2011,2,3,650.676758,304.175018,21.060005
2011,2,4,650.503906,303.648407,21.046247
2011,2,5,650.336243,303.861664,21.049116
...,...,...,...,...,...
2023,4,26,1024.179443,580.640137,29.545254
2023,4,27,1024.288940,580.863586,29.561056
2023,4,28,1024.102783,581.085632,29.577700
2023,4,29,1023.981995,581.219116,29.587328


In [None]:
df_area_micro.reset_index(inplace=True)
df_area_micro

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent
0,2011,2,1,650.274597,304.564636,21.068266
1,2011,2,2,650.676758,303.993530,21.060005
2,2011,2,3,650.676758,304.175018,21.060005
3,2011,2,4,650.503906,303.648407,21.046247
4,2011,2,5,650.336243,303.861664,21.049116
...,...,...,...,...,...,...
4467,2023,4,26,1024.179443,580.640137,29.545254
4468,2023,4,27,1024.288940,580.863586,29.561056
4469,2023,4,28,1024.102783,581.085632,29.577700
4470,2023,4,29,1023.981995,581.219116,29.587328


### 가치평가 지표 컬럼 추가

In [None]:
df_area_micro['deal_full_rent_rate'] = 100*(df_area_micro['area_full_rent'] / df_area_micro['area_deal'])
df_area_micro['deal_year_rent_multiple'] = df_area_micro['area_deal']/ df_area_micro['area_year_rent']
df_area_micro

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple
0,2011,2,1,650.274597,304.564636,21.068266,46.836311,30.865122
1,2011,2,2,650.676758,303.993530,21.060005,46.719593,30.896324
2,2011,2,3,650.676758,304.175018,21.060005,46.747486,30.896324
3,2011,2,4,650.503906,303.648407,21.046247,46.678951,30.908308
4,2011,2,5,650.336243,303.861664,21.049116,46.723778,30.896132
...,...,...,...,...,...,...,...,...
4467,2023,4,26,1024.179443,580.640137,29.545254,56.693203,34.664772
4468,2023,4,27,1024.288940,580.863586,29.561056,56.708954,34.649944
4469,2023,4,28,1024.102783,581.085632,29.577700,56.740944,34.624153
4470,2023,4,29,1023.981995,581.219116,29.587328,56.760674,34.608803


## 월별 평균 종합 수치들을 확인

In [None]:
df_area_micro_month = df_area_micro.groupby(["year","month"])[["area_deal","area_full_rent","area_year_rent"]].mean().copy()
df_area_micro_month.reset_index(inplace=True)
df_area_micro_month

Unnamed: 0,year,month,area_deal,area_full_rent,area_year_rent
0,2011,2,641.325623,300.576172,20.937654
1,2011,3,615.491150,297.227692,20.475676
2,2011,4,600.585022,295.111023,20.246986
3,2011,5,589.073303,295.179962,20.066994
4,2011,6,581.239624,297.469910,19.992504
...,...,...,...,...,...
142,2022,12,1060.324219,605.178101,30.067490
143,2023,1,1050.356201,596.822388,29.944647
144,2023,2,1040.774170,588.755737,29.721769
145,2023,3,1031.198730,583.412354,29.565010


In [None]:
df_area_micro_month['area_deal'] = df_area_micro_month['area_deal'].shift(1)
df_area_micro_month['area_full_rent'] = df_area_micro_month['area_full_rent'].shift(1)
df_area_micro_month['area_year_rent'] = df_area_micro_month['area_year_rent'].shift(1)
df_area_micro_month = df_area_micro_month.dropna()
df_area_micro_month.columns = ['year','month','last_month_area_deal','last_month_area_full_count', 'last_month_area_year_rent']
df_area_micro_month

Unnamed: 0,year,month,last_month_area_deal,last_month_area_full_count,last_month_area_year_rent
1,2011,3,641.325623,300.576172,20.937654
2,2011,4,615.491150,297.227692,20.475676
3,2011,5,600.585022,295.111023,20.246986
4,2011,6,589.073303,295.179962,20.066994
5,2011,7,581.239624,297.469910,19.992504
...,...,...,...,...,...
142,2022,12,1068.705444,612.513855,30.279573
143,2023,1,1060.324219,605.178101,30.067490
144,2023,2,1050.356201,596.822388,29.944647
145,2023,3,1040.774170,588.755737,29.721769


## 6개월전 종합 수치 병합

In [None]:
# df_area_micro_month의 6개월 후, 달을 컬럼으로 구한후, df_area_micro의 year, month와 merge 하면 6개월 전 수치들을 구할 수 있음 
df_area_micro_month_6m = df_area_micro_month.copy()
df_area_micro_month_6m.loc[df_area_micro_month_6m['month']<7, '6m_after_year'] = df_area_micro_month_6m['year'] 
df_area_micro_month_6m.loc[df_area_micro_month_6m['month']<7, '6m_after_month'] = df_area_micro_month_6m['month'] + 6
df_area_micro_month_6m.loc[df_area_micro_month_6m['month']>=7, '6m_after_year'] = df_area_micro_month_6m['year'] + 1
df_area_micro_month_6m.loc[df_area_micro_month_6m['month']>=7, '6m_after_month'] = df_area_micro_month_6m['month'] - 6

df_area_micro_month_6m



Unnamed: 0,year,month,last_month_area_deal,last_month_area_full_count,last_month_area_year_rent,6m_after_year,6m_after_month
1,2011,3,641.325623,300.576172,20.937654,2011.0,9.0
2,2011,4,615.491150,297.227692,20.475676,2011.0,10.0
3,2011,5,600.585022,295.111023,20.246986,2011.0,11.0
4,2011,6,589.073303,295.179962,20.066994,2011.0,12.0
5,2011,7,581.239624,297.469910,19.992504,2012.0,1.0
...,...,...,...,...,...,...,...
142,2022,12,1068.705444,612.513855,30.279573,2023.0,6.0
143,2023,1,1060.324219,605.178101,30.067490,2023.0,7.0
144,2023,2,1050.356201,596.822388,29.944647,2023.0,8.0
145,2023,3,1040.774170,588.755737,29.721769,2023.0,9.0


In [None]:
df_area_micro_month_6m = df_area_micro_month_6m.drop(['year','month'],axis=1)
df_area_micro_month_6m = df_area_micro_month_6m.astype({'6m_after_year':'int16', '6m_after_month' : 'int16'})
df_area_micro_month_6m.rename(columns = {'last_month_area_deal' : '6m_before_area_deal_mean', 'last_month_area_full_count' : '6m_before_area_full_rent_mean', 
                                      'last_month_area_year_rent' : '6m_before_area_year_rent_mean'}, inplace = True)
df_area_micro_month_6m

Unnamed: 0,6m_before_area_deal_mean,6m_before_area_full_rent_mean,6m_before_area_year_rent_mean,6m_after_year,6m_after_month
1,641.325623,300.576172,20.937654,2011,9
2,615.491150,297.227692,20.475676,2011,10
3,600.585022,295.111023,20.246986,2011,11
4,589.073303,295.179962,20.066994,2011,12
5,581.239624,297.469910,19.992504,2012,1
...,...,...,...,...,...
142,1068.705444,612.513855,30.279573,2023,6
143,1060.324219,605.178101,30.067490,2023,7
144,1050.356201,596.822388,29.944647,2023,8
145,1040.774170,588.755737,29.721769,2023,9


In [None]:
df_area_micro_month_6m['6m_before_deal_full_rent_rate'] = 100*(df_area_micro_month_6m['6m_before_area_full_rent_mean'] / df_area_micro_month_6m['6m_before_area_deal_mean'])
df_area_micro_month_6m['6m_before_deal_year_rent_multiple'] = df_area_micro_month_6m['6m_before_area_deal_mean']/ df_area_micro_month_6m['6m_before_area_year_rent_mean']
df_area_micro_month_6m

Unnamed: 0,6m_before_area_deal_mean,6m_before_area_full_rent_mean,6m_before_area_year_rent_mean,6m_after_year,6m_after_month,6m_before_deal_full_rent_rate,6m_before_deal_year_rent_multiple
1,641.325623,300.576172,20.937654,2011,9,46.867950,30.630251
2,615.491150,297.227692,20.475676,2011,10,48.291142,30.059626
3,600.585022,295.111023,20.246986,2011,11,49.137260,29.662933
4,589.073303,295.179962,20.066994,2011,12,50.109207,29.355333
5,581.239624,297.469910,19.992504,2012,1,51.178532,29.072878
...,...,...,...,...,...,...,...
142,1068.705444,612.513855,30.279573,2023,6,57.313625,35.294601
143,1060.324219,605.178101,30.067490,2023,7,57.074814,35.264809
144,1050.356201,596.822388,29.944647,2023,8,56.820953,35.076591
145,1040.774170,588.755737,29.721769,2023,9,56.569016,35.017235


In [None]:
df_area_micro = pd.merge(df_area_micro,df_area_micro_month_6m, left_on=['year','month'], right_on=['6m_after_year','6m_after_month'],how = 'left') # inner로 하면, 12개월 파트를 병합할 때 사라지는 데이터가 더 많아짐 ㅠㅠㅠ
df_area_micro

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,6m_before_area_year_rent_mean,6m_after_year,6m_after_month,6m_before_deal_full_rent_rate,6m_before_deal_year_rent_multiple
0,2011,2,1,650.274597,304.564636,21.068266,46.836311,30.865122,,,,,,,
1,2011,2,2,650.676758,303.993530,21.060005,46.719593,30.896324,,,,,,,
2,2011,2,3,650.676758,304.175018,21.060005,46.747486,30.896324,,,,,,,
3,2011,2,4,650.503906,303.648407,21.046247,46.678951,30.908308,,,,,,,
4,2011,2,5,650.336243,303.861664,21.049116,46.723778,30.896132,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4467,2023,4,26,1024.179443,580.640137,29.545254,56.693203,34.664772,1079.784546,619.757629,30.16025,2023.0,4.0,57.39642,35.801579
4468,2023,4,27,1024.288940,580.863586,29.561056,56.708954,34.649944,1079.784546,619.757629,30.16025,2023.0,4.0,57.39642,35.801579
4469,2023,4,28,1024.102783,581.085632,29.577700,56.740944,34.624153,1079.784546,619.757629,30.16025,2023.0,4.0,57.39642,35.801579
4470,2023,4,29,1023.981995,581.219116,29.587328,56.760674,34.608803,1079.784546,619.757629,30.16025,2023.0,4.0,57.39642,35.801579


## 12개월전 종합 수치 병합 

In [None]:
df_area_micro_month_12m = df_area_micro_month.copy()
df_area_micro_month_12m['12m_after_year'] = df_area_micro_month_12m['year']+1
df_area_micro_month_12m['12m_after_month'] = df_area_micro_month_12m['month']

df_area_micro_month_12m = df_area_micro_month_12m.drop(['year','month'],axis=1)
df_area_micro_month_12m = df_area_micro_month_12m.astype({'12m_after_year':'int16', '12m_after_month' : 'int16'})
df_area_micro_month_12m.rename(columns = {'last_month_area_deal' : '12m_before_area_deal_mean', 'last_month_area_full_count' : '12m_before_area_full_rent_mean', 
                                      'last_month_area_year_rent' : '12m_before_area_year_rent_mean'}, inplace = True)

df_area_micro_month_12m['12m_before_deal_full_rent_rate'] = 100*(df_area_micro_month_12m['12m_before_area_full_rent_mean'] / df_area_micro_month_12m['12m_before_area_deal_mean'])
df_area_micro_month_12m['12m_before_deal_year_rent_multiple'] =df_area_micro_month_12m['12m_before_area_deal_mean']/ df_area_micro_month_12m['12m_before_area_year_rent_mean']


df_area_micro = pd.merge(df_area_micro, df_area_micro_month_12m, left_on=['year','month'], right_on=['12m_after_year','12m_after_month'],how = 'left')
df_area_micro

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,...,6m_after_month,6m_before_deal_full_rent_rate,6m_before_deal_year_rent_multiple,12m_before_area_deal_mean,12m_before_area_full_rent_mean,12m_before_area_year_rent_mean,12m_after_year,12m_after_month,12m_before_deal_full_rent_rate,12m_before_deal_year_rent_multiple
0,2011,2,1,650.274597,304.564636,21.068266,46.836311,30.865122,,,...,,,,,,,,,,
1,2011,2,2,650.676758,303.993530,21.060005,46.719593,30.896324,,,...,,,,,,,,,,
2,2011,2,3,650.676758,304.175018,21.060005,46.747486,30.896324,,,...,,,,,,,,,,
3,2011,2,4,650.503906,303.648407,21.046247,46.678951,30.908308,,,...,,,,,,,,,,
4,2011,2,5,650.336243,303.861664,21.049116,46.723778,30.896132,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4467,2023,4,26,1024.179443,580.640137,29.545254,56.693203,34.664772,1079.784546,619.757629,...,4.0,57.39642,35.801579,1082.813354,608.582642,28.969515,2023.0,4.0,56.203835,37.377682
4468,2023,4,27,1024.288940,580.863586,29.561056,56.708954,34.649944,1079.784546,619.757629,...,4.0,57.39642,35.801579,1082.813354,608.582642,28.969515,2023.0,4.0,56.203835,37.377682
4469,2023,4,28,1024.102783,581.085632,29.577700,56.740944,34.624153,1079.784546,619.757629,...,4.0,57.39642,35.801579,1082.813354,608.582642,28.969515,2023.0,4.0,56.203835,37.377682
4470,2023,4,29,1023.981995,581.219116,29.587328,56.760674,34.608803,1079.784546,619.757629,...,4.0,57.39642,35.801579,1082.813354,608.582642,28.969515,2023.0,4.0,56.203835,37.377682


## df_area_micro 컬럼 수정

In [None]:
df_area_micro = df_area_micro.drop(['6m_after_year','6m_after_month', '12m_after_year', '12m_after_month'], axis=1)

df_area_micro['6m_before_area_deal_mean'] = 100*((df_area_micro['area_deal'] - df_area_micro['6m_before_area_deal_mean'])/ df_area_micro['6m_before_area_deal_mean'])
df_area_micro['6m_before_area_full_rent_mean'] = 100*((df_area_micro['area_full_rent'] - df_area_micro['6m_before_area_full_rent_mean'])/ df_area_micro['6m_before_area_full_rent_mean'])
df_area_micro['6m_before_area_year_rent_mean'] = 100*((df_area_micro['area_year_rent'] - df_area_micro['6m_before_area_year_rent_mean'])/ df_area_micro['6m_before_area_year_rent_mean'])
df_area_micro['6m_before_deal_full_rent_rate'] = 100*((df_area_micro['deal_full_rent_rate'] - df_area_micro['6m_before_deal_full_rent_rate'])/ df_area_micro['6m_before_deal_full_rent_rate'])
df_area_micro['6m_before_deal_year_rent_multiple'] = 100*((df_area_micro['deal_year_rent_multiple'] - df_area_micro['6m_before_deal_year_rent_multiple'])/ df_area_micro['6m_before_deal_year_rent_multiple'])


df_area_micro['12m_before_area_deal_mean'] = 100*((df_area_micro['area_deal'] - df_area_micro['12m_before_area_deal_mean'])/ df_area_micro['12m_before_area_deal_mean'])
df_area_micro['12m_before_area_full_rent_mean'] = 100*((df_area_micro['area_full_rent'] - df_area_micro['12m_before_area_full_rent_mean'])/ df_area_micro['12m_before_area_full_rent_mean'])
df_area_micro['12m_before_area_year_rent_mean'] = 100*((df_area_micro['area_year_rent'] - df_area_micro['12m_before_area_year_rent_mean'])/ df_area_micro['12m_before_area_year_rent_mean'])
df_area_micro['12m_before_deal_full_rent_rate'] = 100*((df_area_micro['deal_full_rent_rate'] - df_area_micro['12m_before_deal_full_rent_rate'])/ df_area_micro['12m_before_deal_full_rent_rate'])
df_area_micro['12m_before_deal_year_rent_multiple'] = 100*((df_area_micro['deal_year_rent_multiple'] - df_area_micro['12m_before_deal_year_rent_multiple'])/ df_area_micro['12m_before_deal_year_rent_multiple'])

df_area_micro.head()

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,6m_before_area_year_rent_mean,6m_before_deal_full_rent_rate,6m_before_deal_year_rent_multiple,12m_before_area_deal_mean,12m_before_area_full_rent_mean,12m_before_area_year_rent_mean,12m_before_deal_full_rent_rate,12m_before_deal_year_rent_multiple
0,2011,2,1,650.274597,304.564636,21.068266,46.836311,30.865122,,,,,,,,,,
1,2011,2,2,650.676758,303.99353,21.060005,46.719593,30.896324,,,,,,,,,,
2,2011,2,3,650.676758,304.175018,21.060005,46.747486,30.896324,,,,,,,,,,
3,2011,2,4,650.503906,303.648407,21.046247,46.678951,30.908308,,,,,,,,,,
4,2011,2,5,650.336243,303.861664,21.049116,46.723778,30.896132,,,,,,,,,,


In [None]:
df_area_micro = df_area_micro.dropna()
df_area_micro.head()

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,6m_before_area_year_rent_mean,6m_before_deal_full_rent_rate,6m_before_deal_year_rent_multiple,12m_before_area_deal_mean,12m_before_area_full_rent_mean,12m_before_area_year_rent_mean,12m_before_deal_full_rent_rate,12m_before_deal_year_rent_multiple
394,2012,3,1,548.170105,309.03598,20.537769,56.375927,26.69083,-4.654602,1.893891,1.922042,6.868177,-6.452623,-14.525463,2.814531,-1.909885,20.286734,-12.861209
395,2012,3,2,547.971802,308.738037,20.522972,56.341957,26.700411,-4.689094,1.795654,1.848608,6.803782,-6.419044,-14.556384,2.715407,-1.980558,20.214254,-12.829931
396,2012,3,3,547.847473,309.738129,20.533909,56.537292,26.680136,-4.710719,2.1254,1.902884,7.174067,-6.490106,-14.57577,3.048131,-1.928323,20.631031,-12.896124
397,2012,3,4,548.025452,309.982178,20.534513,56.563461,26.688017,-4.679762,2.205867,1.905884,7.223673,-6.462483,-14.548018,3.129325,-1.925435,20.686869,-12.870394
398,2012,3,5,547.768799,309.442413,20.514805,56.491428,26.701145,-4.724403,2.027898,1.808077,7.087125,-6.416471,-14.588037,2.949749,-2.019566,20.533175,-12.827534


In [None]:
df_area_micro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4078 entries, 394 to 4471
Data columns (total 18 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   year                                4078 non-null   int64  
 1   month                               4078 non-null   int64  
 2   day                                 4078 non-null   int64  
 3   area_deal                           4078 non-null   float32
 4   area_full_rent                      4078 non-null   float32
 5   area_year_rent                      4078 non-null   float32
 6   deal_full_rent_rate                 4078 non-null   float32
 7   deal_year_rent_multiple             4078 non-null   float32
 8   6m_before_area_deal_mean            4078 non-null   float32
 9   6m_before_area_full_rent_mean       4078 non-null   float32
 10  6m_before_area_year_rent_mean       4078 non-null   float32
 11  6m_before_deal_full_rent_rate       4078 

In [None]:
df_area_micro.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_micro.pkl')

## final_economic 과의 병합

In [None]:
import pandas as pd
df_economic = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/final_economic.pkl')
df_economic.head()

Unnamed: 0,date,year,month,day,apartment_index,kospi_index,korea_rp,korea_3_year,korea_10_year,us_3_month,...,us_10_year_12m_before,korea_10-3_year_12m_before,us_10-2_year_12m_before,us_10-3_year_month_12m_before,last_month_total_apartment_supply_12m_before,last_month_total_unsold_count_12m_before,last_month_total_unsold_ratio_12m_before,last_month_total_deal_count_12m_before,last_month_total_full_rent_count_12m_before,last_month_total_month_rent_count_12m_before
0,2012-02-01,2012,2,1,86.800003,1959.23999,3.25,3.38,3.75,0.061,...,-1.735429,-0.436429,-1.199357,-1.668893,-3520.0,-379.0,61.257435,-4393.0,-1891.0,-237.0
1,2012-02-02,2012,2,2,86.800003,1984.300049,3.25,3.38,3.76,0.084,...,-1.742429,-0.426429,-1.206357,-1.698893,-3520.0,-379.0,61.257435,-4393.0,-1891.0,-237.0
2,2012-02-03,2012,2,3,86.800003,1972.339966,3.25,3.38,3.76,0.079,...,-1.641429,-0.426429,-1.117357,-1.592893,-3520.0,-379.0,61.257435,-4393.0,-1891.0,-237.0
3,2012-02-04,2012,2,4,86.800003,1972.339966,3.25,3.38,3.76,0.079,...,-1.641429,-0.426429,-1.117357,-1.592893,-3520.0,-379.0,61.257435,-4393.0,-1891.0,-237.0
4,2012-02-05,2012,2,5,86.800003,1972.339966,3.25,3.38,3.76,0.079,...,-1.641429,-0.426429,-1.117357,-1.592893,-3520.0,-379.0,61.257435,-4393.0,-1891.0,-237.0


In [None]:
# 이 부분에서 2023년 이후 거시경제 지표가 없음으로 제거가 됨 2023년 이후 데이터는 병합과정에서 제거가 됨 
df_original_dataset = pd.merge(df_area_micro,df_economic, on = ['year','month','day'])
df_original_dataset

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,...,us_10_year_12m_before,korea_10-3_year_12m_before,us_10-2_year_12m_before,us_10-3_year_month_12m_before,last_month_total_apartment_supply_12m_before,last_month_total_unsold_count_12m_before,last_month_total_unsold_ratio_12m_before,last_month_total_deal_count_12m_before,last_month_total_full_rent_count_12m_before,last_month_total_month_rent_count_12m_before
0,2012,3,1,548.170105,309.035980,20.537769,56.375927,26.690830,-4.654602,1.893891,...,-1.374226,-0.392419,-0.987774,-1.352387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0
1,2012,3,2,547.971802,308.738037,20.522972,56.341957,26.700411,-4.689094,1.795654,...,-1.425226,-0.407419,-1.023774,-1.395387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0
2,2012,3,3,547.847473,309.738129,20.533909,56.537292,26.680136,-4.710719,2.125400,...,-1.425226,-0.407419,-1.023374,-1.395487,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0
3,2012,3,4,548.025452,309.982178,20.534513,56.563461,26.688017,-4.679762,2.205867,...,-1.425226,-0.407419,-1.023374,-1.395487,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0
4,2012,3,5,547.768799,309.442413,20.514805,56.491428,26.701145,-4.724403,2.027898,...,-1.397226,-0.392419,-1.014774,-1.382387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,2022,12,27,1058.162842,602.649963,30.013388,56.952477,35.256363,-2.863783,-2.328694,...,2.393742,-0.434968,-1.326813,-1.857890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0
3954,2022,12,28,1057.969971,602.549805,29.992699,56.953396,35.274250,-2.881488,-2.344926,...,2.430742,-0.378968,-1.264513,-1.966890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0
3955,2022,12,29,1057.233032,602.409302,30.018431,56.979805,35.219463,-2.949137,-2.367698,...,2.364742,-0.380968,-1.338713,-1.998890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0
3956,2022,12,30,1056.862427,602.243958,30.013784,56.984138,35.212566,-2.983157,-2.394495,...,2.423742,-0.375968,-1.342013,-1.921890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0


In [None]:
df_original_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3958 entries, 0 to 3957
Data columns (total 68 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   year                                          3958 non-null   int64  
 1   month                                         3958 non-null   int64  
 2   day                                           3958 non-null   int64  
 3   area_deal                                     3958 non-null   float32
 4   area_full_rent                                3958 non-null   float32
 5   area_year_rent                                3958 non-null   float32
 6   deal_full_rent_rate                           3958 non-null   float32
 7   deal_year_rent_multiple                       3958 non-null   float32
 8   6m_before_area_deal_mean                      3958 non-null   float32
 9   6m_before_area_full_rent_mean                 3958 non-null   f

In [None]:
# date 컬럼의 타입을 변경 
df_original_dataset['date'] = pd.to_datetime(df_original_dataset['date'])
df_original_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3958 entries, 0 to 3957
Data columns (total 68 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   year                                          3958 non-null   int64         
 1   month                                         3958 non-null   int64         
 2   day                                           3958 non-null   int64         
 3   area_deal                                     3958 non-null   float32       
 4   area_full_rent                                3958 non-null   float32       
 5   area_year_rent                                3958 non-null   float32       
 6   deal_full_rent_rate                           3958 non-null   float32       
 7   deal_year_rent_multiple                       3958 non-null   float32       
 8   6m_before_area_deal_mean                      3958 non-null   float3

In [None]:
df_original_dataset.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_original_dataset_without_future.pkl')

## 1년후 가격 병합

In [None]:
df_future = df_original_dataset[['date','area_deal']].copy()
df_future.head()

Unnamed: 0,date,area_deal
0,2012-03-01,548.170105
1,2012-03-02,547.971802
2,2012-03-03,547.847473
3,2012-03-04,548.025452
4,2012-03-05,547.768799


In [None]:
df_future['date'] = df_future['date'] - pd.Timedelta(days=365)
df_future.head()

Unnamed: 0,date,area_deal
0,2011-03-02,548.170105
1,2011-03-03,547.971802
2,2011-03-04,547.847473
3,2011-03-05,548.025452
4,2011-03-06,547.768799


In [None]:
df_future['year'] = df_future['date'].dt.year
df_future['month'] = df_future['date'].dt.month
df_future['day'] = df_future['date'].dt.day
df_future.rename(columns = {'area_deal' : 'future_area_deal'}, inplace = True)
df_future.drop('date',axis=1,inplace=True)

df_future.head()

Unnamed: 0,future_area_deal,year,month,day
0,548.170105,2011,3,2
1,547.971802,2011,3,3
2,547.847473,2011,3,4
3,548.025452,2011,3,5
4,547.768799,2011,3,6


In [None]:
# 데이터 프레임 병합 
df_original_dataset = pd.merge(df_original_dataset,df_future, on = ['year','month','day'])
df_original_dataset

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,...,korea_10-3_year_12m_before,us_10-2_year_12m_before,us_10-3_year_month_12m_before,last_month_total_apartment_supply_12m_before,last_month_total_unsold_count_12m_before,last_month_total_unsold_ratio_12m_before,last_month_total_deal_count_12m_before,last_month_total_full_rent_count_12m_before,last_month_total_month_rent_count_12m_before,future_area_deal
0,2012,3,1,548.170105,309.035980,20.537769,56.375927,26.690830,-4.654602,1.893891,...,-0.392419,-0.987774,-1.352387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,512.318481
1,2012,3,2,547.971802,308.738037,20.522972,56.341957,26.700411,-4.689094,1.795654,...,-0.407419,-1.023774,-1.395387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,512.909119
2,2012,3,3,547.847473,309.738129,20.533909,56.537292,26.680136,-4.710719,2.125400,...,-0.407419,-1.023374,-1.395487,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,513.148926
3,2012,3,4,548.025452,309.982178,20.534513,56.563461,26.688017,-4.679762,2.205867,...,-0.407419,-1.023374,-1.395487,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,513.173767
4,2012,3,5,547.768799,309.442413,20.514805,56.491428,26.701145,-4.724403,2.027898,...,-0.392419,-1.014774,-1.382387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,513.639587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,2021,12,27,1083.472168,607.116211,28.812990,56.034313,37.603600,7.364027,4.543994,...,-0.272226,-0.024955,0.574126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1058.162842
3589,2021,12,28,1083.753906,606.659790,28.849360,55.977631,37.565960,7.391945,4.465400,...,-0.298226,-0.069055,0.576126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1057.969971
3590,2021,12,29,1083.566772,606.564270,28.839167,55.978481,37.572750,7.373402,4.448951,...,-0.311226,0.006845,0.658126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1057.233032
3591,2021,12,30,1083.580566,606.848999,28.829924,56.004047,37.585274,7.374769,4.497981,...,-0.262226,-0.020455,0.614126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1056.862427


In [None]:
# 미래 변화율 컬럼을 추가 
df_original_dataset['future_change_rate'] = 100*((df_original_dataset['future_area_deal'] - df_original_dataset['area_deal'])/df_original_dataset['area_deal'])
df_original_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3593 entries, 0 to 3592
Data columns (total 70 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   year                                          3593 non-null   int64         
 1   month                                         3593 non-null   int64         
 2   day                                           3593 non-null   int64         
 3   area_deal                                     3593 non-null   float32       
 4   area_full_rent                                3593 non-null   float32       
 5   area_year_rent                                3593 non-null   float32       
 6   deal_full_rent_rate                           3593 non-null   float32       
 7   deal_year_rent_multiple                       3593 non-null   float32       
 8   6m_before_area_deal_mean                      3593 non-null   float3

## 파일 저장

In [None]:
df_original_dataset.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_original_dataset.pkl')

# 기계학습

- 앞에서의 기계학습 방법들을 적용시켰을 경우, 모델의 정확도가 너무 낮은 문제가 발생, 원래목표와의 다른 차선책으로 집값의 가격들의 합(지수처럼)의 변경을 예측하는 모델을 생성

## 기계학습에 사용할 원본 데이터셋 생성

In [2]:
import pandas as pd

df_original_dataset = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_original_dataset.pkl')
df_original_dataset

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,...,us_10-2_year_12m_before,us_10-3_year_month_12m_before,last_month_total_apartment_supply_12m_before,last_month_total_unsold_count_12m_before,last_month_total_unsold_ratio_12m_before,last_month_total_deal_count_12m_before,last_month_total_full_rent_count_12m_before,last_month_total_month_rent_count_12m_before,future_area_deal,future_change_rate
0,2012,3,1,548.170105,309.035980,20.537769,56.375927,26.690830,-4.654602,1.893891,...,-0.987774,-1.352387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,512.318481,-6.540236
1,2012,3,2,547.971802,308.738037,20.522972,56.341957,26.700411,-4.689094,1.795654,...,-1.023774,-1.395387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,512.909119,-6.398629
2,2012,3,3,547.847473,309.738129,20.533909,56.537292,26.680136,-4.710719,2.125400,...,-1.023374,-1.395487,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,513.148926,-6.333615
3,2012,3,4,548.025452,309.982178,20.534513,56.563461,26.688017,-4.679762,2.205867,...,-1.023374,-1.395487,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,513.173767,-6.359501
4,2012,3,5,547.768799,309.442413,20.514805,56.491428,26.701145,-4.724403,2.027898,...,-1.014774,-1.382387,-2468.0,-513.0,102.561394,-2078.0,794.0,-73.0,513.639587,-6.230587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,2021,12,27,1083.472168,607.116211,28.812990,56.034313,37.603600,7.364027,4.543994,...,-0.024955,0.574126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1058.162842,-2.335946
3589,2021,12,28,1083.753906,606.659790,28.849360,55.977631,37.565960,7.391945,4.465400,...,-0.069055,0.576126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1057.969971,-2.379132
3590,2021,12,29,1083.566772,606.564270,28.839167,55.978481,37.572750,7.373402,4.448951,...,0.006845,0.658126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1057.233032,-2.430283
3591,2021,12,30,1083.580566,606.848999,28.829924,56.004047,37.585274,7.374769,4.497981,...,-0.020455,0.614126,789.0,2.0,-1.540453,-5084.0,1313.0,-278.0,1056.862427,-2.465727


In [None]:
pd.set_option('display.max_rows', 70)

df_original_dataset.corr()['future_area_deal'].sort_values(ascending=False).to_frame()

  df_original_dataset.corr()['future_area_deal'].sort_values(ascending=False).to_frame()


Unnamed: 0,future_area_deal
future_area_deal,1.0
area_deal,0.978133
deal_year_rent_multiple,0.967116
area_full_rent,0.96146
year,0.957828
area_year_rent,0.93252
apartment_index,0.863615
12m_before_area_deal_mean,0.840139
6m_before_area_deal_mean,0.79336
12m_before_deal_year_rent_multiple,0.74197


In [None]:
# 상관관계들만을 컬럼으로 한 데이터 프레임 생성성
df_corr = df_original_dataset.corr(numeric_only=False)['future_area_deal']
df_corr.head()

year              0.957828
month             0.061658
day               0.007000
area_deal         0.978133
area_full_rent    0.961460
Name: future_area_deal, dtype: float64

In [None]:
df_corr.info()

<class 'pandas.core.series.Series'>
Index: 70 entries, year to future_change_rate
Series name: future_area_deal
Non-Null Count  Dtype  
--------------  -----  
70 non-null     float64
dtypes: float64(1)
memory usage: 3.1+ KB


In [None]:
# 컬럼명(series의)을 수정 
df_corr.name = 'correlation'
df_corr.info()

<class 'pandas.core.series.Series'>
Index: 70 entries, year to future_change_rate
Series name: correlation
Non-Null Count  Dtype  
--------------  -----  
70 non-null     float64
dtypes: float64(1)
memory usage: 3.1+ KB


In [None]:
# 상관계쑤가 0.6 이상이거나, -0.6 이하인것 (양의 상관관계나 음의 상관관계가 있는 컬럼들만을 고름)
list(df_corr[(df_corr >= 0.6) | (df_corr <= -0.6)].index)

['year',
 'area_deal',
 'area_full_rent',
 'area_year_rent',
 'deal_year_rent_multiple',
 '6m_before_area_deal_mean',
 '6m_before_deal_full_rent_rate',
 '6m_before_deal_year_rent_multiple',
 '12m_before_area_deal_mean',
 '12m_before_deal_full_rent_rate',
 '12m_before_deal_year_rent_multiple',
 'date',
 'apartment_index',
 'kospi_index',
 'korea_rp',
 'korea_3_year',
 'korea_10_year',
 'us_10-2_year',
 'us_10-3_year_month',
 'last_month_total_unsold_count',
 'last_month_total_month_rent_count',
 'future_area_deal']

In [None]:
# 상관계쑤가 0.6 이상이거나, -0.6 이하인것 (양의 상관관계나 음의 상관관계가 있는 컬럼들만을 고름)
learning_feature_list = list(df_corr[(df_corr >= 0.7) | (df_corr <= -0.7)].index)
learning_feature_list

['year',
 'area_deal',
 'area_full_rent',
 'area_year_rent',
 'deal_year_rent_multiple',
 '6m_before_area_deal_mean',
 '6m_before_deal_full_rent_rate',
 '12m_before_area_deal_mean',
 '12m_before_deal_full_rent_rate',
 '12m_before_deal_year_rent_multiple',
 'date',
 'apartment_index',
 'kospi_index',
 'korea_rp',
 'korea_3_year',
 'last_month_total_month_rent_count',
 'future_area_deal']

In [None]:
# year, date, apartment_index는 실제 사용하는 컬럼들이 아니기에, 제거 
to_remove = ['year','apartment_index', 'korea_rp']
for x in to_remove:
    learning_feature_list.remove(x)
learning_feature_list

['area_deal',
 'area_full_rent',
 'area_year_rent',
 'deal_year_rent_multiple',
 '6m_before_area_deal_mean',
 '6m_before_deal_full_rent_rate',
 '12m_before_area_deal_mean',
 '12m_before_deal_full_rent_rate',
 '12m_before_deal_year_rent_multiple',
 'date',
 'kospi_index',
 'korea_3_year',
 'last_month_total_month_rent_count',
 'future_area_deal']

- 거시경제 지표와, 미시경제 지표, 생성한 가치평가 지표 다 상관관계가 있음


In [None]:
# 사용할 컬러명들만 선택해서 학습&테스트 데이터셋을 확보 
df_train_test = df_original_dataset[learning_feature_list]
df_train_test

Unnamed: 0,area_deal,area_full_rent,area_year_rent,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_deal_full_rent_rate,12m_before_area_deal_mean,12m_before_deal_full_rent_rate,12m_before_deal_year_rent_multiple,date,kospi_index,korea_3_year,last_month_total_month_rent_count,future_area_deal
0,548.170105,309.035980,20.537769,26.690830,-4.654602,6.868177,-14.525463,20.286734,-12.861209,2012-03-01,2030.250000,3.430,2638,512.318481
1,547.971802,308.738037,20.522972,26.700411,-4.689094,6.803782,-14.556384,20.214254,-12.829931,2012-03-02,2034.630005,3.485,2638,512.909119
2,547.847473,309.738129,20.533909,26.680136,-4.710719,7.174067,-14.575770,20.631031,-12.896124,2012-03-03,2034.630005,3.485,2638,513.148926
3,548.025452,309.982178,20.534513,26.688017,-4.679762,7.223673,-14.548018,20.686869,-12.870394,2012-03-04,2034.630005,3.485,2638,513.173767
4,547.768799,309.442413,20.514805,26.701145,-4.724403,7.087125,-14.588037,20.533175,-12.827534,2012-03-05,2016.060059,3.490,2638,513.639587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3588,1083.472168,607.116211,28.812990,37.603600,7.364027,-2.626612,15.883393,-5.929564,5.545363,2021-12-27,2999.550049,1.776,6661,1058.162842
3589,1083.753906,606.659790,28.849360,37.565960,7.391945,-2.725111,15.913527,-6.024723,5.439716,2021-12-28,3020.239990,1.786,6661,1057.969971
3590,1083.566772,606.564270,28.839167,37.572750,7.373402,-2.723633,15.893512,-6.023295,5.458775,2021-12-29,2993.290039,1.783,6661,1057.233032
3591,1083.580566,606.848999,28.829924,37.585274,7.374769,-2.679206,15.894987,-5.980375,5.493926,2021-12-30,2977.649902,1.802,6661,1056.862427


In [None]:
df_train_test.to_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_train_test.pkl')

## 학습, 테스트 데이터 셋 선언

In [3]:
import pandas as pd

df_train_test = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_train_test.pkl')
df_train_test.head()

Unnamed: 0,area_deal,area_full_rent,area_year_rent,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_deal_full_rent_rate,12m_before_area_deal_mean,12m_before_deal_full_rent_rate,12m_before_deal_year_rent_multiple,date,kospi_index,korea_3_year,last_month_total_month_rent_count,future_area_deal
0,548.170105,309.03598,20.537769,26.69083,-4.654602,6.868177,-14.525463,20.286734,-12.861209,2012-03-01,2030.25,3.43,2638,512.318481
1,547.971802,308.738037,20.522972,26.700411,-4.689094,6.803782,-14.556384,20.214254,-12.829931,2012-03-02,2034.630005,3.485,2638,512.909119
2,547.847473,309.738129,20.533909,26.680136,-4.710719,7.174067,-14.57577,20.631031,-12.896124,2012-03-03,2034.630005,3.485,2638,513.148926
3,548.025452,309.982178,20.534513,26.688017,-4.679762,7.223673,-14.548018,20.686869,-12.870394,2012-03-04,2034.630005,3.485,2638,513.173767
4,547.768799,309.442413,20.514805,26.701145,-4.724403,7.087125,-14.588037,20.533175,-12.827534,2012-03-05,2016.060059,3.49,2638,513.639587


In [4]:
df_train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3593 entries, 0 to 3592
Data columns (total 14 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   area_deal                           3593 non-null   float32       
 1   area_full_rent                      3593 non-null   float32       
 2   area_year_rent                      3593 non-null   float32       
 3   deal_year_rent_multiple             3593 non-null   float32       
 4   6m_before_area_deal_mean            3593 non-null   float32       
 5   6m_before_deal_full_rent_rate       3593 non-null   float32       
 6   12m_before_area_deal_mean           3593 non-null   float32       
 7   12m_before_deal_full_rent_rate      3593 non-null   float32       
 8   12m_before_deal_year_rent_multiple  3593 non-null   float32       
 9   date                                3593 non-null   datetime64[ns]
 10  kospi_index             

In [5]:
train_columns = list(df_train_test.columns)

to_remove = ['future_area_deal','date']
for x in to_remove:
    train_columns.remove(x)
train_columns


['area_deal',
 'area_full_rent',
 'area_year_rent',
 'deal_year_rent_multiple',
 '6m_before_area_deal_mean',
 '6m_before_deal_full_rent_rate',
 '12m_before_area_deal_mean',
 '12m_before_deal_full_rent_rate',
 '12m_before_deal_year_rent_multiple',
 'kospi_index',
 'korea_3_year',
 'last_month_total_month_rent_count']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_train_test[train_columns], df_train_test['future_area_deal'], test_size=0.3, random_state=42)


In [7]:
# 정렬을 하지 않으면 추후 시각화를 통해서 모델의 성능을 파악할 대, 그래프가 의도한 대로 나오지 않음 
X_test_sorted = X_test.sort_index()
y_test_sorted = y_test.sort_index()

In [None]:
X_train

Unnamed: 0,area_deal,area_full_rent,area_year_rent,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_deal_full_rent_rate,12m_before_area_deal_mean,12m_before_deal_full_rent_rate,12m_before_deal_year_rent_multiple,kospi_index,korea_3_year,last_month_total_month_rent_count
1978,633.178101,448.272217,23.312729,27.160189,6.144519,-2.594918,10.822819,-3.899040,10.063616,2402.709961,1.725,3864
1377,559.042236,409.758392,22.827459,24.489902,3.788705,2.911656,7.190088,6.791331,-0.723249,1949.040039,1.768,4172
407,513.955444,318.966705,20.378195,25.220852,-1.813341,5.358573,-5.688158,9.559248,-5.113914,1924.229980,2.675,3352
1883,604.688477,438.613190,23.324080,25.925503,3.038203,0.116519,8.020798,-2.292268,6.931191,2209.459961,1.673,4925
1157,537.340332,381.172485,21.926785,24.506117,2.834441,4.043371,3.909104,5.206642,0.015039,2127.169922,1.850,4413
...,...,...,...,...,...,...,...,...,...,...,...,...
1130,533.708130,377.128540,21.803717,24.477852,2.794503,3.812716,3.322755,4.630808,-0.003693,2045.420044,1.712,5655
1294,553.112305,399.430481,22.453661,24.633503,5.017668,2.715031,7.075886,6.565420,0.577138,1975.449951,1.673,4305
860,515.233154,349.829315,21.021002,24.510399,0.609227,1.869143,0.765832,8.314969,-2.895599,2000.500000,2.614,2804
3507,1074.258911,606.498108,28.203966,38.088932,8.372123,-2.195719,17.460936,-4.607842,6.379736,2959.459961,1.704,6184


In [None]:
X_test_sorted

Unnamed: 0,area_deal,area_full_rent,area_year_rent,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_deal_full_rent_rate,12m_before_area_deal_mean,12m_before_deal_full_rent_rate,12m_before_deal_year_rent_multiple,kospi_index,korea_3_year,last_month_total_month_rent_count
0,548.170105,309.035980,20.537769,26.690830,-4.654602,6.868177,-14.525463,20.286734,-12.861209,2030.250000,3.430,2638
7,546.781677,309.040894,20.527166,26.636978,-4.896097,7.141244,-14.741957,20.594090,-13.037023,2000.760010,3.480,2638
12,545.325134,308.750000,20.508192,26.590601,-5.149439,7.326300,-14.969071,20.802380,-13.188432,2025.040039,3.475,2638
14,545.194580,308.387939,20.531059,26.554625,-5.172147,7.226110,-14.989429,20.689611,-13.305886,2043.760010,3.565,2638
17,544.595398,309.191559,20.509285,26.553602,-5.276365,7.623809,-15.082857,21.137245,-13.309224,2034.439941,3.580,2638
...,...,...,...,...,...,...,...,...,...,...,...,...
3581,1082.718262,607.646851,28.764551,37.640713,7.289321,-2.473635,15.802759,-5.781777,5.649532,2963.000000,1.741,6661
3585,1083.359253,606.215698,28.791737,37.627438,7.352838,-2.760908,15.871316,-6.059305,5.612271,3012.429932,1.802,6661
3586,1083.347656,606.836914,28.802065,37.613541,7.351689,-2.660220,15.870076,-5.962033,5.573266,3012.429932,1.795,6661
3587,1083.281128,607.223083,28.803324,37.609589,7.345097,-2.592293,15.862960,-5.896410,5.562173,3012.429932,1.795,6661


## 모델 적용

### 선형회귀 모델

#### 기본모델

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go

# Creating a Linear Regression model
model = LinearRegression()

# Training the model on the training set
model.fit(X_train, y_train)


# Making predictions on the testing set
y_pred = model.predict(X_test_sorted)


# Evaluating the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test_sorted , y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print()

# Creating the traces
trace1 = go.Scatter(
    x = y_test_sorted.index,
    y = y_test_sorted.values,
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = y_test_sorted.index,
    y = list(y_pred),
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Linear Regression Predict future price for test dataset', xaxis=dict(title='index'), yaxis=dict(title='미래평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()


Mean Squared Error: 188.0048911043631
Root Mean Squared Error: 13.71148755986611



### 다항회귀 모델

#### 기본 모델(2차항)

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


# Creating polynomial features
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_sorted_poly = poly.transform(X_test_sorted)

# Creating a Polynomial Regression model
model = LinearRegression()

# Training the model on the training set
model.fit(X_train_poly, y_train)

# Making predictions on the testing set
y_pred = model.predict(X_test_sorted_poly)

# Evaluating the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test_sorted, y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print()

# Creating the traces
trace1 = go.Scatter(
    x = y_test_sorted.index,
    y = y_test_sorted.values,
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = y_test_sorted.index,
    y = list(y_pred),
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Linear Regression Poly2 Predict future price for test dataset', xaxis=dict(title='index'), yaxis=dict(title='미래평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()



Mean Squared Error: 9.011577542291947
Root Mean Squared Error: 3.0019289702276346



#### 기본 모델(3차항)

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


# Creating polynomial features
poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X_train)
X_test_sorted_poly = poly.transform(X_test_sorted)

# Creating a Polynomial Regression model
model = LinearRegression()

# Training the model on the training set
model.fit(X_train_poly, y_train)

# Making predictions on the testing set
y_pred = model.predict(X_test_sorted_poly)

# Evaluating the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test_sorted, y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print()

# Creating the traces
trace1 = go.Scatter(
    x = y_test_sorted.index,
    y = y_test_sorted.values,
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = y_test_sorted.index,
    y = list(y_pred),
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Linear Regression Poly2 Predict future price for test dataset', xaxis=dict(title='index'), yaxis=dict(title='미래평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()


Mean Squared Error: 1.415579409172937
Root Mean Squared Error: 1.1897812442516216



#### 기본 모델(4차항)

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np



# Creating polynomial features
poly = PolynomialFeatures(degree=4)
X_train_poly = poly.fit_transform(X_train)
X_test_sorted_poly = poly.transform(X_test_sorted)

# Creating a Polynomial Regression model
model = LinearRegression()

# Training the model on the training set
model.fit(X_train_poly, y_train)

# Making predictions on the testing set
y_pred = model.predict(X_test_sorted_poly)

# Evaluating the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test_sorted, y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

print()

# Creating the traces
trace1 = go.Scatter(
    x = y_test_sorted.index,
    y = y_test_sorted.values,
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = y_test_sorted.index,
    y = list(y_pred),
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Linear Regression Poly4 Predict future price for test dataset', xaxis=dict(title='index'), yaxis=dict(title='미래평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()


Mean Squared Error: 1.3479647570803281
Root Mean Squared Error: 1.1610188444122378



### Gradient Boosting 모델

In [None]:
# Importing required libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go


# Creating a Gradient Boosting model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

# Training the model on the training set
model.fit(X_train, y_train)

# Making predictions on the testing set
y_pred = model.predict(X_test_sorted)

# Evaluating the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test_sorted , y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print()

final_pred = model.predict(df_train_test[train_columns])
final_pred = final_pred.tolist()


# Creating the traces
trace1 = go.Scatter(
    x = y_test_sorted.index,
    y = y_test_sorted.values,
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = y_test_sorted.index,
    y = list(y_pred),
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Gradient Boosting Predict future price for test dataset', xaxis=dict(title='index'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()


Mean Squared Error: 2.788776814137118
Root Mean Squared Error: 1.6699631175978462



### XGBoost 모델

In [None]:
# Importing required libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go


# Creating an XGBoost model
model = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

# Training the model on the training set
model.fit(X_train, y_train)


# Making predictions on the testing set
y_pred = model.predict(X_test_sorted)


# Evaluating the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test_sorted , y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))


# Creating the traces
trace1 = go.Scatter(
    x = y_test_sorted.index,
    y = y_test_sorted.values,
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = y_test_sorted.index,
    y = list(y_pred),
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='XGBoost Predict future price for test dataset', xaxis=dict(title='index'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()


Mean Squared Error: 3.2676165
Root Mean Squared Error: 1.807655


### RandomForest Regressor 모델



In [None]:
# Importing required libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go


# Creating a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Training the model on the training set
model.fit(X_train, y_train)

# Making predictions on the testing set
y_pred = model.predict(X_test_sorted)

# Evaluating the model using Mean Squared Error (MSE)
mse = mean_squared_error(y_test_sorted , y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print()



# Creating the traces
trace1 = go.Scatter(
    x = y_test_sorted.index,
    y = y_test_sorted.values,
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = y_test_sorted.index,
    y = list(y_pred),
    mode = 'lines',
    name = 'predict_value'
)


# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='RandomForest Regressor Predict future price for test dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()


Mean Squared Error: 0.6788606847170302
Root Mean Squared Error: 0.8239300241628716



>> rmse가 mse보다 수치가 크게 나옴 -> mse 오류가 너무 작아서 루트를 씌웠을 때 오히려 그 수치가 더 크게 나온듯

## 실용성 테스트

- '2022년도와 2023년도의 실제 값' 과 '2021년도와 2022년도'를 기반으로 예측한 '2022년도와 2023년의 예측값을'을 비교

### 실용성 테스트를 할 데이터셋 불러오기

In [8]:
import pandas as pd
df_final_pred_dataset = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_original_dataset_without_future.pkl')
# df_final_pred_dataset = df_final_pred_dataset.loc[~((df_final_pred_dataset['year']==2022)&(df_final_pred_dataset['month']>4)),:]
df_final_pred_dataset = df_final_pred_dataset.loc[df_final_pred_dataset['year']>2019,:]
df_final_pred_dataset['future_date'] = df_final_pred_dataset['date'] + pd.Timedelta(days=365)
df_final_pred_dataset

Unnamed: 0,year,month,day,area_deal,area_full_rent,area_year_rent,deal_full_rent_rate,deal_year_rent_multiple,6m_before_area_deal_mean,6m_before_area_full_rent_mean,...,korea_10-3_year_12m_before,us_10-2_year_12m_before,us_10-3_year_month_12m_before,last_month_total_apartment_supply_12m_before,last_month_total_unsold_count_12m_before,last_month_total_unsold_ratio_12m_before,last_month_total_deal_count_12m_before,last_month_total_full_rent_count_12m_before,last_month_total_month_rent_count_12m_before,future_date
2862,2020,1,1,827.498657,501.468811,24.241657,60.600559,34.135399,9.222917,4.931760,...,0.133516,0.186881,0.064158,-7585.0,124.0,2.224947,8041.0,3242.0,1380.0,2020-12-31
2863,2020,1,2,828.189575,501.314484,24.235041,60.531364,34.173229,9.314113,4.899467,...,0.127516,0.142881,0.043558,-7585.0,124.0,2.224947,8041.0,3242.0,1380.0,2021-01-01
2864,2020,1,3,828.056519,501.238098,24.259838,60.531872,34.132812,9.296550,4.883484,...,0.099516,0.097281,-0.027442,-7585.0,124.0,2.224947,8041.0,3242.0,1380.0,2021-01-02
2865,2020,1,4,828.664185,502.152924,24.265913,60.597878,34.149311,9.376757,5.074910,...,0.105516,0.097281,-0.027442,-7585.0,124.0,2.224947,8041.0,3242.0,1380.0,2021-01-03
2866,2020,1,5,828.829102,502.262726,24.251171,60.599072,34.176868,9.398524,5.097886,...,0.105516,0.089281,-0.063942,-7585.0,124.0,2.224947,8041.0,3242.0,1380.0,2021-01-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,2022,12,27,1058.162842,602.649963,30.013388,56.952477,35.256363,-2.863783,-2.328694,...,-0.434968,-1.326813,-1.857890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0,2023-12-27
3954,2022,12,28,1057.969971,602.549805,29.992699,56.953396,35.274250,-2.881488,-2.344926,...,-0.378968,-1.264513,-1.966890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0,2023-12-28
3955,2022,12,29,1057.233032,602.409302,30.018431,56.979805,35.219463,-2.949137,-2.367698,...,-0.380968,-1.338713,-1.998890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0,2023-12-29
3956,2022,12,30,1056.862427,602.243958,30.013784,56.984138,35.212566,-2.983157,-2.394495,...,-0.375968,-1.342013,-1.921890,-266.0,811.0,46.509003,-695.0,-2296.0,1048.0,2023-12-30


In [9]:
# 일자별 실제 평당 가격을 알기 위해서 사용(2023년도 포함)
df_area_micro = pd.read_pickle('/content/drive/MyDrive/house_price/after_data/ver_4/df_area_micro.pkl')
df_final_actual =  df_area_micro[['year','month','day','area_deal']].copy()
df_final_actual['date'] = pd.to_datetime(df_final_actual[['year','month','day']])
df_final_actual = df_final_actual.loc[~(df_final_actual['year']==2012),:]
df_final_actual = df_final_actual.loc[~((df_final_actual['year']==2013)&(df_final_actual['month']<3)),:]
df_final_actual = df_final_actual.loc[df_final_actual['year']>2020,:]
df_final_actual

Unnamed: 0,year,month,day,area_deal,date
3622,2021,1,1,959.375732,2021-01-01
3623,2021,1,2,960.076843,2021-01-02
3624,2021,1,3,960.203613,2021-01-03
3625,2021,1,4,959.832642,2021-01-04
3626,2021,1,5,960.375427,2021-01-05
...,...,...,...,...,...
4467,2023,4,26,1024.179443,2023-04-26
4468,2023,4,27,1024.288940,2023-04-27
4469,2023,4,28,1024.102783,2023-04-28
4470,2023,4,29,1023.981995,2023-04-29


### RandomForest Regressor 모델 적용

In [10]:
# Importing required libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go


# Creating a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Training the model on the training set
model.fit(X_train, y_train)

# Making predictions on the testing set
y_pred = model.predict(df_final_pred_dataset[train_columns])

final_pred = model.predict(df_final_pred_dataset[train_columns])
final_pred = final_pred.tolist()



# Creating the traces
trace1 = go.Scatter(
    x = df_final_actual['date'],
    y = df_final_actual['area_deal'],
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred,
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Final RandomForest Regressor Predict future price for total dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()






- 2023도 예측값이 상당히 이상하게 나오는데, 그 이유로는 2022년도에 있었던 경제지표들이 그간에 잘 없었던 수치들이 나와서 그런게 아닌가 하는 가설
- 과적합으 오류?

### XGBoost 모델 적용

In [None]:
# Importing required libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go

# Creating an XGBoost model
model = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=70, learning_rate=0.1, max_depth=3, random_state=0)

# Training the model on the training set
model.fit(X_train, y_train)

# Making predictions on the testing set
y_pred = model.predict(df_final_pred_dataset[train_columns])

final_pred = model.predict(df_final_pred_dataset[train_columns])
final_pred = final_pred.tolist()



# Creating the traces
trace1 = go.Scatter(
    x = df_final_actual['date'],
    y = df_final_actual['area_deal'],
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred,
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Final XGBoost Regressor Predict future price for total dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()






- 2023도 예측값이 상당히 이상하게 나오는데, 그 이유로는 2022년도에 있었던 경제지표들이 그간에 잘 없었던 수치들이 나와서 그런게 아닌가 하는 가설
- 과적합으 오류?

### Gradient Boosting 모델 적용

In [11]:
# Importing required libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go

# Creating a Gradient Boosting model
model = GradientBoostingRegressor(n_estimators=80, learning_rate=0.1, max_depth=3, random_state=0)

# Training the model on the training set
model.fit(X_train, y_train)

# Making predictions on the testing set

final_pred = model.predict(df_final_pred_dataset[train_columns])
final_pred = final_pred.tolist()



# Creating the traces
trace1 = go.Scatter(
    x = df_final_actual['date'],
    y = df_final_actual['area_deal'],
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred,
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Final Gradient Boosting Regressor Predict future price for total dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()






#### 20일 평균 이동선 추가

In [12]:
import pandas as pd

df_with_average_move = pd.DataFrame({'date':df_final_pred_dataset['future_date'], 'predict_price':final_pred})
df_with_average_move.reset_index(inplace=True,drop=True)
print(df_with_average_move.info())
print()
print(len(df_with_average_move))
print()
df_with_average_move.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1096 non-null   datetime64[ns]
 1   predict_price  1096 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 17.2 KB
None

1096



Unnamed: 0,date,predict_price
0,2020-12-31,953.425807
1,2021-01-01,961.45952
2,2021-01-02,961.45952
3,2021-01-03,961.45952
4,2021-01-04,961.45952


In [13]:
# 30일 지수 평균 이동선 생성
for cal_index in range(20,len(df_with_average_move)):
    start_index = cal_index-20
    sum_value = 0
    for i in range(start_index, cal_index):
        sum_value += df_with_average_move.loc[i]['predict_price']
    
    df_with_average_move.loc[cal_index,'20days_average_price'] = sum_value/20

In [14]:
df_with_average_move = df_with_average_move.dropna(subset=['20days_average_price'])
df_with_average_move.head()

Unnamed: 0,date,predict_price,20days_average_price
20,2021-01-20,968.881385,963.322589
21,2021-01-21,968.881385,964.095367
22,2021-01-22,968.881385,964.466461
23,2021-01-23,971.71809,964.837554
24,2021-01-24,971.71809,965.350482


In [15]:
# Importing required libraries
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go




# Creating the traces
trace1 = go.Scatter(
    x = df_final_actual['date'],
    y = df_final_actual['area_deal'],
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = df_with_average_move['date'],
    y = df_with_average_move['predict_price'],
    mode = 'lines',
    name = 'predict_value'
)

trace3 = go.Scatter(
    x = df_with_average_move['date'],
    y = df_with_average_move['20days_average_price'],
    mode = 'lines',
    name = 'predict_value_20days_average'
)

# Combining the traces and creating the layout
data = [trace1, trace2, trace3]
layout = go.Layout(title='Final Gradient Boosting Regressor Predict future price for total dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()






### Linear Regressor 모델 적용

In [16]:
# Importing required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go
# Creating a Linear Regression model
model = LinearRegression()

# Training the model on the training set
model.fit(X_train, y_train)


# Making predictions on the testing set

final_pred = model.predict(df_final_pred_dataset[train_columns])
final_pred = final_pred.tolist()



# Creating the traces
trace1 = go.Scatter(
    x = df_final_actual['date'],
    y = df_final_actual['area_deal'],
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred,
    mode = 'lines',
    name = 'predict_value'
)

# Combining the traces and creating the layout
data = [trace1, trace2]
layout = go.Layout(title='Final Linear regressor Predict future price for total dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()






#### 20일 평균 이동선 추가

In [17]:
import pandas as pd

df_with_average_move = pd.DataFrame({'date':df_final_pred_dataset['future_date'], 'predict_price':final_pred})
df_with_average_move.reset_index(inplace=True,drop=True)

# 20일 지수 평균 이동선 생성
for cal_index in range(20,len(df_with_average_move)):
    start_index = cal_index-20
    sum_value = 0
    for i in range(start_index, cal_index):
        sum_value += df_with_average_move.loc[i]['predict_price']
    
    df_with_average_move.loc[cal_index,'20days_average_price'] = sum_value/20

df_with_average_move = df_with_average_move.dropna(subset=['20days_average_price'])

In [18]:
# Importing required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go





# Creating the traces
trace1 = go.Scatter(
    x = df_final_actual['date'],
    y = df_final_actual['area_deal'],
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = df_with_average_move['date'],
    y = df_with_average_move['predict_price'],
    mode = 'lines',
    name = 'predict_value'
)

trace3 = go.Scatter(
    x = df_with_average_move['date'],
    y = df_with_average_move['20days_average_price'],
    mode = 'lines',
    name = 'predict_value_20days_average'
)

# Combining the traces and creating the layout
data = [trace1, trace2, trace3]
layout = go.Layout(title='Final Linear regressor Predict future price for total dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()






- 성능 평가에서는 Linear Regressor 모델이 RandomForest Regressor 모델 보다 더 성능이 안좋았는데, 실제 시각화를 해보니 오히려 Linear Regressor가 더 실제값과 비슷하게 움직임
- 학습을 할 때, 어느정도 오류의 범위를 수용하는 식으로 학습을 하는게 더 추세들을 파악할 때는 도움이 되는 것인가?? 아니면 손실함수 설정에 따라서 다른 것인가?

### 앙상블 모델 적용

In [19]:
# Importing required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import plotly.graph_objs as go


# Creating a Linear Regression model
lr_model = LinearRegression()

# Creating a Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=80, learning_rate=0.1, max_depth=3, random_state=0)

# VotingRegressor를 통해 앙상블 모델 생성
ensemble_model_1 = VotingRegressor([('lr', lr_model), ('gb', gb_model)], weights=[0.3, 0.7])

# VotingRegressor를 통해 앙상블 모델 생성
ensemble_model_2 = VotingRegressor([('lr', lr_model), ('gb', gb_model)], weights=[0.4, 0.6])


# Training the model on the training set
ensemble_model_1.fit(X_train, y_train)
# Making predictions on the testing set
final_pred_1 = ensemble_model_1.predict(df_final_pred_dataset[train_columns])
final_pred_1 = final_pred_1.tolist()


# Training the model on the training set
ensemble_model_2.fit(X_train, y_train)
# Making predictions on the testing set
final_pred_2 = ensemble_model_2.predict(df_final_pred_dataset[train_columns])
final_pred_2 = final_pred_2.tolist()



# Training the model on the training set
lr_model.fit(X_train, y_train)
# Making predictions on the testing set
final_pred_3 = lr_model.predict(df_final_pred_dataset[train_columns])
final_pred_3 = final_pred_3.tolist()


# Training the model on the training set
gb_model.fit(X_train, y_train)
# Making predictions on the testing set
final_pred_4 = gb_model.predict(df_final_pred_dataset[train_columns])
final_pred_4 = final_pred_4.tolist()


# Creating the traces
trace1 = go.Scatter(
    x = df_final_actual['date'],
    y = df_final_actual['area_deal'],
    mode = 'lines',
    name = 'actual_value'
)


trace2 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred_1,
    mode = 'lines',
    name = 'ensemble_1 predict_value'
)

trace3 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred_2,
    mode = 'lines',
    name = 'ensemble_2 predict_value'
)


trace4 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred_3,
    mode = 'lines',
    name = 'Linear predict_value'
)

trace5 = go.Scatter(
    x = df_final_pred_dataset['future_date'],
    y = final_pred_4,
    mode = 'lines',
    name = 'Gradient Boosting predict_value'
)
# Combining the traces and creating the layout
data = [trace1, trace2, trace3, trace4,trace5]
layout = go.Layout(title='Final Ensemble Model Predict future price for total dataset', xaxis=dict(title='Date'), yaxis=dict(title='평당가격'))

# Creating the figure and plotting it
fig = go.Figure(data=data, layout=layout)
fig.show()

## corr 을 통해 추세 성능 평가

In [46]:
import pandas as pd

corr_df = pd.DataFrame({'date':df_final_actual['date'],'actual_value':df_final_actual['area_deal'], 'ensemble_1_value':final_pred_1[:850], 'ensemble_2_value':final_pred_2[:850],'linear_value':final_pred_3[:850],'Gradient_boosting_value':final_pred_4[:850]})

corr_df.reset_index(inplace=True, drop=True)

corr_df

Unnamed: 0,date,actual_value,ensemble_1_value,ensemble_2_value,linear_value,Gradient_boosting_value
0,2021-01-01,959.375732,958.841019,960.646089,971.476513,953.425807
1,2021-01-02,960.076843,964.620066,965.673581,971.994674,961.459520
2,2021-01-03,960.203613,964.317255,965.269833,970.985303,961.459520
3,2021-01-04,959.832642,965.163631,966.398334,973.806556,961.459520
4,2021-01-05,960.375427,965.693517,967.104849,975.572843,961.459520
...,...,...,...,...,...,...
845,2023-04-26,1024.179443,1028.256686,1027.450325,1022.612157,1030.675770
846,2023-04-27,1024.288940,1028.365674,1027.537441,1022.568045,1030.850372
847,2023-04-28,1024.102783,1027.692011,1026.697424,1020.729906,1030.675770
848,2023-04-29,1023.981995,1027.553437,1026.512659,1020.267994,1030.675770


In [47]:
corr_df_2023 = corr_df.loc[corr_df['date'].dt.year==2023,:].copy()
corr_df_2023 

Unnamed: 0,date,actual_value,ensemble_1_value,ensemble_2_value,linear_value,Gradient_boosting_value
730,2023-01-01,1056.691528,1056.138200,1054.896442,1047.445896,1059.863473
731,2023-01-02,1056.478394,1050.345460,1049.024023,1041.095400,1054.309771
732,2023-01-03,1055.845093,1050.481302,1049.205146,1041.548208,1054.309771
733,2023-01-04,1055.708862,1050.146769,1048.759102,1040.433099,1054.309771
734,2023-01-05,1055.373779,1048.623669,1046.728301,1035.356096,1054.309771
...,...,...,...,...,...,...
845,2023-04-26,1024.179443,1028.256686,1027.450325,1022.612157,1030.675770
846,2023-04-27,1024.288940,1028.365674,1027.537441,1022.568045,1030.850372
847,2023-04-28,1024.102783,1027.692011,1026.697424,1020.729906,1030.675770
848,2023-04-29,1023.981995,1027.553437,1026.512659,1020.267994,1030.675770


In [48]:
corr_df_2023.corr()['actual_value'].sort_values(ascending=False).to_frame()





Unnamed: 0,actual_value
actual_value,1.0
Gradient_boosting_value,0.820576
ensemble_1_value,0.808807
ensemble_2_value,0.787422
linear_value,0.346316


# 결론2

- 모델들 마다 정확한 수치들은 차이가 있지만, 실제 데이터와 비슷하게 움직이는 모델들을 적용시켰을 때, 2023년 말 까지는 서울 아파트 전체의 값들이 유지되거나 내려가는 추세임으로 현재 아파트를 매수하기에는 어려움을 알 수 있다
- corr()로만으만 성능을 평가하면, Gradient boosting 모델이 성능이 가장 좋지만, 시각화를 한 결과 너무 직선으로 움직여서 과연 신빙성이 있을까라는 의문
- Gradient boosting 모델과 Linear regressor 모델을 섞어서 만든 ensemble_1 모델과 ensemble_2 모델은 둘 다 Gradient boosting 모델과 큰 차이가 안나면서도 좋은 성능을 보임 

# 최종결론&보완할 점

- 기계학습 모델의 성능을 효과적으로 파악하기 위해서는, 그래프를 사용하는 것이  시각적으로 효용성이 있음
- 모델이 테스트를 통해서는 성능이 좋을 수는 있어도, 실제 미래의 값들은 다를 수 있음
- 각 모델의 동작과정을 알아야지, 어느 상황에서 어느 모델을 사용할 수 있는지 확인이 가능할 듯함
- 회귀 모델에서 오차와 성능을 어떻게 설정하는지가 모델의 성능의 큰 영향을 미치는듯 하다
- 회귀모델을 통해서 정확한 값을 얻으려 노력하기 보다 추세를 보려고 노력하는 것이 더 맞는 방향이지 않을까?
- 굳이 회귀 모델을 통해서 정확한 수익률을 예측하고자 하면 그 예측한 수익률 자체에 오류가 있을 텐데 그럴거면 미래변화율을 카테고리화 해서 어떤 카테고리에 속할지 예측하는 식으로 하는 게 더 효용성이 있지 않을까
- 데이터 분석을 진행 할 때, 최종 생성할 테이블과, 중간에 생성을 할 테이블들에 대한 구조들(스키마 테이블)을 미리 설계를 해놔야 추후 데이터들을 전처리하거나 생성할 때 더 효율적으로 일을 처리할 수 있음을 배움
- 판다스를 통해서 데이터를 전처리를 할 때, 메모리를 효율적으로 사용하는 식으로 코딩할 능력의 필요성을 느낌(메모리 부족으로 여러번에 나누어서 실행하면 번거롭고 원하는 결과와 다른 결과가 나올 수도 있음)
- col의 개수가 더 많을 때가, row의 개수가 많을 때보다 더 데이터를 처리하는데 오래 걸림
- 생각했던 가설이 참이 아닌 경우, 왜 참이 아니었는지 판단하고 검증하는 능력이 필요함
- 파이썬의 문법들을 활용해서 더 효율적인 함수를 만들 수 있는 능력이 필요
- 데이터 시각화를 어디에 적용하면 좋을지 판단하는 능력의 향상이 필요
- 데이터 분석을 하기위해서 사용하는 방식으로 시각화와 기계학습 모델 사용 뿐이 아닌, 다른 통계적 방법들에 대한 학습 필요
- csv파일, pkl파일, mysql 데이터베이스 사용시의 차이점들 비교하여 학습할 필요 


- df_area_after.info() 메서드는 데이터프레임의 각 열의 데이터 유형에 따라 추정 메모리 사용량을 계산하며, 이를 총합하여 보고합니다. 이는 데이터프레임의 실제 메모리 사용량과는 차이가 있을 수 있습니다.

- 반면, sys.getsizeof(df_area_after)는 데이터프레임 객체 자체가 차지하는 메모리 크기를 보고합니다. 이는 데이터프레임의 실제 메모리 사용량과 더 가깝습니다.

- 따라서, df_area_after.info() 메서드가 보고하는 메모리 사용량과 sys.getsizeof(df_area_after)가 보고하는 메모리 사용량은 일치하지 않을 수 있습니다. 예를 들어, df_area_after가 매우 큰 데이터프레임인 경우, df_area_after.info() 메서드가 보고하는 메모리 사용량은 sys.getsizeof(df_area_after)가 보고하는 메모리 사용량보다 작을 수 있습니다. 하지만, df_area_after가 작은 데이터프레임인 경우에는 df_area_after.info() 메서드가 보고하는 메모리 사용량이 sys.getsizeof(df_area_after)가 보고하는 메모리 사용량보다 작을 수 있습니다.

- 함수를 사용해서 하면 메모리에 할당하는 데이터프레임들의 용량을 줄일 수 있음