In [2]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the NanumGothic TTF file with Matplotlib and make it the default
# font (presumably so Korean labels render in plots).
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # path of the installed TTF file
    name='NanumBarunGothic')                        # family name under which this font is registered
fm.fontManager.ttflist.insert(0, fe)              # add the entry at the front of Matplotlib's font list
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # default font size and family
plt.rc('font', family='NanumBarunGothic')  # NOTE(review): repeats the family already set on the previous line
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

In [2]:
# Load jibun_xy_by_naver.csv into `df` and display it to inspect its columns.
df = pd.read_csv('jibun_xy_by_naver.csv')
df

Unnamed: 0.1,Unnamed: 0,지번주소,좌표X,좌표Y,도로명주소
0,0,서울특별시 강남구 개포동 1164-12,127.052949,37.473193,서울특별시 강남구 논현로2길 34
1,1,서울특별시 강남구 개포동 1164-13,127.053094,37.473287,서울특별시 강남구 논현로2길 36
2,2,서울특별시 강남구 개포동 1164-14,127.053239,37.473382,서울특별시 강남구 논현로2길 38
3,3,서울특별시 강남구 개포동 1164-20,127.054110,37.473948,서울특별시 강남구 논현로2길 50
4,4,서울특별시 강남구 개포동 1164-25,127.054827,37.474392,서울특별시 강남구 논현로2길 62
...,...,...,...,...,...
8948,8948,서울특별시 중랑구 중화동 438,127.077761,37.604575,서울특별시 중랑구 중랑역로 124
8949,8949,서울특별시 중랑구 중화동 450,127.081923,37.597155,서울특별시 중랑구 동일로 752
8950,8950,서울특별시 중랑구 중화동 452,127.078533,37.599890,서울특별시 중랑구 중랑역로 72
8951,8951,서울특별시 중랑구 중화동 453,127.080084,37.602687,서울특별시 중랑구 동일로136길 15


In [3]:
# Remove the '도로명주소' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: when the column is already gone (second run, or a CSV that
# was saved back without it) the drop is a no-op rather than a KeyError.
df = df.drop(columns=['도로명주소'], errors='ignore')
df

Unnamed: 0.1,Unnamed: 0,지번주소,좌표X,좌표Y
0,0,서울특별시 강남구 개포동 1164-12,127.052949,37.473193
1,1,서울특별시 강남구 개포동 1164-13,127.053094,37.473287
2,2,서울특별시 강남구 개포동 1164-14,127.053239,37.473382
3,3,서울특별시 강남구 개포동 1164-20,127.054110,37.473948
4,4,서울특별시 강남구 개포동 1164-25,127.054827,37.474392
...,...,...,...,...
8948,8948,서울특별시 중랑구 중화동 438,127.077761,37.604575
8949,8949,서울특별시 중랑구 중화동 450,127.081923,37.597155
8950,8950,서울특별시 중랑구 중화동 452,127.078533,37.599890
8951,8951,서울특별시 중랑구 중화동 453,127.080084,37.602687


In [4]:
# Remove the 'Unnamed: 0' column (presumably a row index written by an earlier
# to_csv() call — verify against how the file was produced).
# errors='ignore' plus reassignment makes the cell safe to re-run when the
# column is already absent.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

Unnamed: 0,지번주소,좌표X,좌표Y
0,서울특별시 강남구 개포동 1164-12,127.052949,37.473193
1,서울특별시 강남구 개포동 1164-13,127.053094,37.473287
2,서울특별시 강남구 개포동 1164-14,127.053239,37.473382
3,서울특별시 강남구 개포동 1164-20,127.054110,37.473948
4,서울특별시 강남구 개포동 1164-25,127.054827,37.474392
...,...,...,...
8948,서울특별시 중랑구 중화동 438,127.077761,37.604575
8949,서울특별시 중랑구 중화동 450,127.081923,37.597155
8950,서울특별시 중랑구 중화동 452,127.078533,37.599890
8951,서울특별시 중랑구 중화동 453,127.080084,37.602687


In [5]:
# Count missing values in each remaining column of df.
df.isnull().sum()

지번주소    0
좌표X     0
좌표Y     0
dtype: int64

In [6]:
# Save the cleaned table. index=False stops pandas from writing the RangeIndex
# as an extra unnamed first column, which would come back as 'Unnamed: 0' on the
# next read_csv().
# NOTE(review): this overwrites the same file that was read at the top of the
# notebook — confirm that replacing the source file is intended.
df.to_csv('jibun_xy_by_naver.csv', encoding='utf-8-sig', index=False)

In [1]:
from haversine import haversine, Unit

# Two sample coordinates, each a (latitude, longitude) pair
point1, point2 = (37.481, 126.983), (37.498, 127.027)

# Distance between the two points via haversine(), once per unit
distance_km = haversine(point1, point2, unit=Unit.KILOMETERS)
distance_miles = haversine(point1, point2, unit=Unit.MILES)

# Report both results (kilometres first, then miles)
print(f"두 지점 사이의 거리: {distance_km:.2f} km")
print(f"두 지점 사이의 거리: {distance_miles:.2f} miles")


두 지점 사이의 거리: 4.32 km
두 지점 사이의 거리: 2.68 miles


In [5]:
# Check the Python type of the value returned by haversine().
type(distance_km)

float

In [3]:
# Locations of the raw train / test tables, then load both in that order.
train_path, test_path = '../train.csv', '../test.csv'
dt, dt_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [4]:
# Extra coordinate columns for the train / test rows.
dt_newXY_forTrain, dt_newXY_forTest = map(
    pd.read_csv, ('newXY_for_train.csv', 'newXY_for_test.csv'))

# Elementary-school columns for the train / test rows.
dt_elemSchool_forTrain, dt_elemSchool_forTest = map(
    pd.read_csv, ('elemSchool_for_train.csv', 'elemSchool_for_test.csv'))

In [5]:
# Print (train shape, test shape) for the base tables and each auxiliary table,
# one pair per line.
for train_part, test_part in (
    (dt, dt_test),
    (dt_newXY_forTrain, dt_newXY_forTest),
    (dt_elemSchool_forTrain, dt_elemSchool_forTest),
):
    print(train_part.shape, test_part.shape)

(1118822, 52) (9272, 51)
(1118822, 2) (9272, 2)
(1118822, 2) (9272, 2)


In [38]:
# pd.set_option('display.max_columns', None)

In [6]:
# Display the training DataFrame that was read from train_path.
dt

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,건축면적,주차대수,기타/의무/임대/임의=1/2/3/4,단지승인일,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일,target
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,124000
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,123500
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,91500
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,130000
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,117000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200707,12,11,1998,...,0.0,366.0,의무,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000
1118818,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200708,25,10,1998,...,0.0,366.0,의무,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000
1118819,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200708,31,20,1998,...,0.0,366.0,의무,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,28000
1118820,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200709,15,8,1998,...,0.0,366.0,의무,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,29000


In [37]:
# Append the extra-coordinate and elementary-school columns to the right of
# the train and test tables (column-wise concat, aligned on the row index).
dt = pd.concat([dt, dt_newXY_forTrain, dt_elemSchool_forTrain], axis=1)
dt_test = pd.concat([dt_test, dt_newXY_forTest, dt_elemSchool_forTest], axis=1)

# Show both widened tables, train first.
display(dt, dt_test)

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일,target,좌표X_2,좌표Y_2,최단거리초등학교명,최단거리초등학교Km
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,124000,127.056859,37.476276,서울포이초등학교,0.385832
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,123500,127.056859,37.476276,서울포이초등학교,0.385832
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,91500,127.056859,37.476276,서울포이초등학교,0.385832
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,130000,127.056859,37.476276,서울포이초등학교,0.385832
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,117000,127.056859,37.476276,서울포이초등학교,0.385832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200707,12,11,1998,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000,126.905543,37.612989,서울구현초등학교,0.350313
1118818,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200708,25,10,1998,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000,126.905543,37.612989,서울구현초등학교,0.350313
1118819,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200708,31,20,1998,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,28000,126.905543,37.612989,서울구현초등학교,0.350313
1118820,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200709,15,8,1998,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,29000,126.905543,37.612989,서울구현초등학교,0.350313


Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,단지승인일,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일,좌표X_2,좌표Y_2,최단거리초등학교명,최단거리초등학교Km
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.9700,202307,26,5,1987,...,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,127.056859,37.476276,서울포이초등학교,0.385832
1,서울특별시 강남구 개포동,651-1,651.0,1.0,개포더샵트리에,108.2017,202308,15,10,2021,...,2022-02-23 13:01:10.0,Y,N,127.056394,37.484892,2022-02-23 11:05:05.0,127.057257,37.484829,서울개일초등학교,0.159846
2,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,161.0000,202307,28,15,1984,...,1984-12-22 00:00:00.0,Y,N,127.055990,37.483894,2013-03-07 09:46:28.0,127.056019,37.483975,서울개일초등학교,0.291634
3,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,133.4600,202308,10,14,1984,...,1984-12-22 00:00:00.0,Y,N,127.055990,37.483894,2013-03-07 09:46:28.0,127.056019,37.483975,서울개일초등학교,0.291634
4,서울특별시 강남구 개포동,652,652.0,0.0,개포우성3차,104.4300,202308,18,6,1984,...,1984-12-22 00:00:00.0,Y,N,127.055990,37.483894,2013-03-07 09:46:28.0,127.056019,37.483975,서울개일초등학교,0.291634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.6500,202307,19,13,2014,...,2015-09-09 15:30:27.0,Y,N,127.106720,37.618870,2014-09-01 13:05:03.0,127.106467,37.617195,서울새솔초등학교,0.437157
9268,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.6200,202307,25,12,2014,...,2015-09-09 15:30:27.0,Y,N,127.106720,37.618870,2014-09-01 13:05:03.0,127.106467,37.617195,서울새솔초등학교,0.437157
9269,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,101.6500,202308,27,12,2014,...,2015-09-09 15:30:27.0,Y,N,127.106720,37.618870,2014-09-01 13:05:03.0,127.106467,37.617195,서울새솔초등학교,0.437157
9270,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.9400,202309,2,18,2014,...,2015-09-09 15:30:27.0,Y,N,127.106720,37.618870,2014-09-01 13:05:03.0,127.106467,37.617195,서울새솔초등학교,0.437157


### trainset_0715.csv 에 최단거리초등학교 컬럼 붙이기 (단순 concat)
- 행 순서가 동일하다고 가정하고, 키 없이 우측에 그대로 concat 하기

In [7]:
# Load the 0715 dataset, then show its column index, the frame itself and its shape.
dt_0715 = pd.read_csv('../trainset_0715.csv')
display(dt_0715.columns, dt_0715, dt_0715.shape)

Index(['시군구', '번지', '본번', '부번', '아파트명', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도',
       ...
       '대형학원수', '종합학원수', '입시학원수', '외국어학원수', '예체능학원수', '독서실수', '정보학원수',
       '특수교육학원수', '직업기술학원수', '기타학원수'],
      dtype='object', length=116)

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,대형학원수,종합학원수,입시학원수,외국어학원수,예체능학원수,독서실수,정보학원수,특수교육학원수,직업기술학원수,기타학원수
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.65,202307,19,13,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1128090,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.62,202307,25,12,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1128091,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,101.65,202308,27,12,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1128092,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.94,202309,2,18,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0


(1128094, 116)

In [8]:
# Show the `is_test` column of dt_0715.
dt_0715['is_test']

0          0
1          0
2          0
3          0
4          0
          ..
1128089    1
1128090    1
1128091    1
1128092    1
1128093    1
Name: is_test, Length: 1128094, dtype: int64

In [9]:
# Read the train and test elementary-school tables and stack them row-wise
# (train rows first). The original row labels are kept at this point.
dt_elem_train, dt_elem_test = map(
    pd.read_csv, ('elemSchool_for_train.csv', 'elemSchool_for_test.csv'))
dt_elem_concat = pd.concat([dt_elem_train, dt_elem_test])
dt_elem_concat

Unnamed: 0,최단거리초등학교명,최단거리초등학교Km
0,서울포이초등학교,0.385832
1,서울포이초등학교,0.385832
2,서울포이초등학교,0.385832
3,서울포이초등학교,0.385832
4,서울포이초등학교,0.385832
...,...,...
9267,서울새솔초등학교,0.437157
9268,서울새솔초등학교,0.437157
9269,서울새솔초등학교,0.437157
9270,서울새솔초등학교,0.437157


In [10]:
# Replace the repeated per-file row labels with one continuous 0..n-1 index.
dt_elem_concat = dt_elem_concat.reset_index(drop=True)
dt_elem_concat

Unnamed: 0,최단거리초등학교명,최단거리초등학교Km
0,서울포이초등학교,0.385832
1,서울포이초등학교,0.385832
2,서울포이초등학교,0.385832
3,서울포이초등학교,0.385832
4,서울포이초등학교,0.385832
...,...,...
1128089,서울새솔초등학교,0.437157
1128090,서울새솔초등학교,0.437157
1128091,서울새솔초등학교,0.437157
1128092,서울새솔초등학교,0.437157


In [11]:
# Attach the elementary-school columns to the right of dt_0715.
# A column-wise concat aligns rows on the index, so any mismatch (different
# length, or an index that was not reset) would be padded with NaN or
# misaligned without an error. Fail loudly instead.
assert dt_0715.index.equals(dt_elem_concat.index), (
    "dt_0715 and dt_elem_concat must share the same row index before a "
    "column-wise concat"
)
dt_0715 = pd.concat([dt_0715, dt_elem_concat], axis=1)
dt_0715

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,입시학원수,외국어학원수,예체능학원수,독서실수,정보학원수,특수교육학원수,직업기술학원수,기타학원수,최단거리초등학교명,최단거리초등학교Km
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,서울포이초등학교,0.385832
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,서울포이초등학교,0.385832
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,서울포이초등학교,0.385832
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,서울포이초등학교,0.385832
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,서울포이초등학교,0.385832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.65,202307,19,13,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,서울새솔초등학교,0.437157
1128090,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.62,202307,25,12,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,서울새솔초등학교,0.437157
1128091,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,101.65,202308,27,12,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,서울새솔초등학교,0.437157
1128092,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.94,202309,2,18,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,서울새솔초등학교,0.437157


In [17]:
# Number of rows in dt_0715 whose 'k-전체세대수' value is missing.
dt_0715['k-전체세대수'].isnull().sum()

263415

### trainset_0715.csv 에 최단거리초등학교 컬럼 붙이기 (좌표 키 merge)
- X, Y 좌표(`좌표X_2`, `좌표Y_2`)를 키로 left merge 하기

In [22]:
# Reload the 0715 dataset from disk, discarding the columns added to dt_0715
# above, then show its column index, the frame itself and its shape.
dt_0715 = pd.read_csv('../trainset_0715.csv')
display(dt_0715.columns, dt_0715, dt_0715.shape)

Index(['시군구', '번지', '본번', '부번', '아파트명', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도',
       ...
       '대형학원수', '종합학원수', '입시학원수', '외국어학원수', '예체능학원수', '독서실수', '정보학원수',
       '특수교육학원수', '직업기술학원수', '기타학원수'],
      dtype='object', length=116)

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,대형학원수,종합학원수,입시학원수,외국어학원수,예체능학원수,독서실수,정보학원수,특수교육학원수,직업기술학원수,기타학원수
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,0.0,2.0,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.65,202307,19,13,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1128090,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.62,202307,25,12,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1128091,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,101.65,202308,27,12,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
1128092,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.94,202309,2,18,2014,...,0.0,0.0,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0


(1128094, 116)

In [23]:
# Read the extra-coordinate tables for the train and test rows.
dt_newXY_for_Train = pd.read_csv('newXY_for_train.csv')
dt_newXY_for_Test = pd.read_csv('newXY_for_test.csv')

# Stack them row-wise (train first) with a fresh continuous 0..n-1 index.
dt_newXY = pd.concat([dt_newXY_for_Train, dt_newXY_for_Test], ignore_index=True)
dt_newXY

Unnamed: 0,좌표X_2,좌표Y_2
0,127.056859,37.476276
1,127.056859,37.476276
2,127.056859,37.476276
3,127.056859,37.476276
4,127.056859,37.476276
...,...,...
1128089,127.106467,37.617195
1128090,127.106467,37.617195
1128091,127.106467,37.617195
1128092,127.106467,37.617195


In [24]:
# Attach the 좌표X_2 / 좌표Y_2 columns to the right of dt_0715.
# A column-wise concat aligns rows on the index, so a length or index mismatch
# would be padded with NaN or misaligned without an error. Fail loudly instead.
assert dt_0715.index.equals(dt_newXY.index), (
    "dt_0715 and dt_newXY must share the same row index before a "
    "column-wise concat"
)
dt_0715 = pd.concat([dt_0715, dt_newXY], axis=1)
dt_0715

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,입시학원수,외국어학원수,예체능학원수,독서실수,정보학원수,특수교육학원수,직업기술학원수,기타학원수,좌표X_2,좌표Y_2
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,23.0,1.0,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.65,202307,19,13,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195
1128090,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.62,202307,25,12,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195
1128091,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,101.65,202308,27,12,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195
1128092,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.94,202309,2,18,2014,...,6.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195


In [26]:
# Load the lookup table read from elemSchool_with_XY.csv; the merge below joins
# it to dt_0715 on the 좌표X_2 / 좌표Y_2 columns.
dt_elemSchool = pd.read_csv('elemSchool_with_XY.csv')
dt_elemSchool

Unnamed: 0,좌표X_2,좌표Y_2,최단거리초등학교명,최단거리초등학교Km
0,127.056859,37.476276,서울포이초등학교,0.385832
1,127.056019,37.483975,서울개일초등학교,0.291634
2,127.076624,37.496296,서울대진초등학교,0.148287
3,127.061582,37.478484,서울개현초등학교,0.256165
4,127.068516,37.487597,서울개포초등학교,0.190272
...,...,...,...,...
8950,126.960768,37.500333,서울은로초등학교,0.321256
8951,126.935943,37.544878,서울신석초등학교,0.166289
8952,127.063437,37.548714,서울성수초등학교,0.487360
8953,126.996582,37.564606,서울충무초등학교,0.505446


In [28]:
# Left-join the elementary-school columns onto dt_0715 by coordinate pair.
# If a (좌표X_2, 좌표Y_2) pair occurred more than once in dt_elemSchool, a left
# merge would silently duplicate the matching dt_0715 rows, so the row count is
# checked before dt_0715 is rebound.
# NOTE(review): the join keys are floats; matching relies on both tables
# holding bit-identical coordinate values.
merged_0715 = pd.merge(dt_0715, dt_elemSchool, on=['좌표X_2', '좌표Y_2'], how='left')
assert len(merged_0715) == len(dt_0715), (
    "left merge changed the row count - dt_elemSchool has duplicate coordinate keys"
)
dt_0715 = merged_0715
dt_0715

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,예체능학원수,독서실수,정보학원수,특수교육학원수,직업기술학원수,기타학원수,좌표X_2,좌표Y_2,최단거리초등학교명,최단거리초등학교Km
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276,서울포이초등학교,0.385832
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,22,4,1987,...,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276,서울포이초등학교,0.385832
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712,28,5,1987,...,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276,서울포이초등학교,0.385832
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,3,4,1987,...,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276,서울포이초등학교,0.385832
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801,8,2,1987,...,14.0,0.0,0.0,0.0,1.0,1.0,127.056859,37.476276,서울포이초등학교,0.385832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.65,202307,19,13,2014,...,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195,서울새솔초등학교,0.437157
1128090,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.62,202307,25,12,2014,...,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195,서울새솔초등학교,0.437157
1128091,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,101.65,202308,27,12,2014,...,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195,서울새솔초등학교,0.437157
1128092,서울특별시 중랑구 신내동,816,816.0,0.0,신내우디안1단지,84.94,202309,2,18,2014,...,4.0,0.0,0.0,0.0,0.0,0.0,127.106467,37.617195,서울새솔초등학교,0.437157


In [30]:
# Count missing values in each of the two merged-in columns (name first, then Km).
display(*(
    dt_0715[column].isnull().sum()
    for column in ('최단거리초등학교명', '최단거리초등학교Km')
))

0

0

In [33]:
# Spot-check three rows of the merged table by position.
dt_0715.iloc[[10000, 20000, 30000]]

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,예체능학원수,독서실수,정보학원수,특수교육학원수,직업기술학원수,기타학원수,좌표X_2,좌표Y_2,최단거리초등학교명,최단거리초등학교Km
10000,서울특별시 강북구 미아동,811,811.0,0.0,두산위브트레지움,84.95,201806,6,4,2011,...,33.0,0.0,0.0,0.0,0.0,3.0,127.016938,37.617769,서울삼각산초등학교,0.275652
20000,서울특별시 관악구 신림동,1644,1644.0,0.0,강남,46.72,201801,20,5,1974,...,,,,,,,126.903924,37.484238,서울영림초등학교,0.366597
30000,서울특별시 노원구 공릉동,705,705.0,0.0,동부,59.34,201803,17,7,1996,...,,,,,,,127.078614,37.628082,서울공연초등학교,0.366343


### 앙상블 예시: 모델별 예측값 평균

In [21]:
import numpy as np

# Example predictions from three models
predictions = dict(
    model_1=np.array([0.1, 0.4, 0.5, 0.9]),
    model_2=np.array([0.2, 0.3, 0.5, 0.8]),
    model_3=np.array([0.3, 0.3, 0.4, 0.7]),
)

# Collect each model's prediction array into a list
predictions_list = list(predictions.values())

# Element-wise mean across the models
ensemble_pred = np.stack(predictions_list).mean(axis=0)

print(ensemble_pred)


[0.2        0.33333333 0.46666667 0.8       ]


### 팀원 데이터셋 확인

In [12]:
# Load ../trainset_0718_3.csv for inspection.
# Note: this rebinds `df`, which earlier in the notebook held the table read
# from jibun_xy_by_naver.csv.
df = pd.read_csv('../trainset_0718_3.csv')
df

Unnamed: 0.1,Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년,계약년월,계약년월일,...,target_차분,target_누적합계,target_Lag1,target_Lag2,target_Lag3,target_Lag4,target_Lag5,target_Lag6,target_Lag7,target_변화율
0,0,서울특별시 관악구 신림동,1717,1717.0,0.0,두영,115.3000,2007,200701,2007-01-15,...,,35400.0,,,,,,,,
1,1,서울특별시 도봉구 방학동,740,740.0,0.0,동양크레오,84.9661,2007,200701,2007-01-02,...,,24000.0,,,,,,,,
2,2,서울특별시 중구 신당동,843,843.0,0.0,삼성,59.9700,2007,200701,2007-01-28,...,,27800.0,,,,,,,,
3,3,서울특별시 도봉구 방학동,740,740.0,0.0,동양크레오,84.9661,2007,200701,2007-01-02,...,0.0,48000.0,24000.0,,,,,,,0.0
4,4,서울특별시 도봉구 방학동,740,740.0,0.0,동양크레오,84.9358,2007,200701,2007-01-02,...,0.0,72000.0,24000.0,24000.0,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,1128089,서울특별시 금천구 시흥동,1002-1,1002.0,1.0,럭키남서울,41.2200,2023,202309,2023-09-08,...,,,,,51000.0,49500.0,59500.0,45000.0,61000.0,0.0
1128090,1128090,서울특별시 금천구 시흥동,1013,1013.0,0.0,관악산벽산타운5,59.3400,2023,202309,2023-09-16,...,,,,,,,,,,0.0
1128091,1128091,서울특별시 금천구 시흥동,1013,1013.0,0.0,관악산벽산타운5,84.9700,2023,202309,2023-09-16,...,,,,,,,,,,0.0
1128092,1128092,서울특별시 송파구 송파동,163-14,163.0,14.0,현대,59.9100,2023,202309,2023-09-04,...,,,,,,,,,,0.0


In [22]:
# Build the distinct (계약년월, 기준금리) pairs found in df.
# Chaining returns a fresh frame at each step; the previous in-place
# drop_duplicates/reset_index on a column selection is the pattern pandas flags
# with SettingWithCopyWarning (hidden here by the global warnings filter).
dfInterest = (
    df[['계약년월', '기준금리']]
    .drop_duplicates()
    .reset_index(drop=True)
)
dfInterest

Unnamed: 0,계약년월,기준금리
0,200701,4.5
1,200702,4.5
2,200703,4.5
3,200704,4.5
4,200705,4.5
...,...,...
196,202305,3.5
197,202306,3.5
198,202307,3.5
199,202308,3.5


In [14]:
# Display the '계약년월' and '기준금리' columns of df for every row.
df[['계약년월', '기준금리']]

Unnamed: 0,계약년월,기준금리
0,200701,4.5
1,200701,4.5
2,200701,4.5
3,200701,4.5
4,200701,4.5
...,...,...
1128089,202309,3.5
1128090,202309,3.5
1128091,202309,3.5
1128092,202309,3.5


In [6]:
# Show every column name of df as a plain Python list.
display(list(df.columns))

['Unnamed: 0',
 '시군구',
 '번지',
 '본번',
 '부번',
 '아파트명',
 '전용면적(㎡)',
 '계약년',
 '계약년월',
 '계약년월일',
 '층',
 '건축년도',
 '도로명',
 'target',
 'is_test',
 '번지주소',
 '구',
 '동',
 '도로명주소',
 'k-아파트코드',
 'k-아파트명',
 'k-단지분류(아파트,주상복합등등)',
 'kapt도로명주소',
 'k-전체동수',
 'k-전체세대수',
 'k-건설사(시공사)',
 '준공일자',
 'k-연면적',
 'k-주거전용면적',
 'k-관리비부과면적',
 'k-전용면적별세대현황(60㎡이하)',
 'k-전용면적별세대현황(60㎡~85㎡이하)',
 'k-85㎡~135㎡이하',
 '주차대수',
 'Nearest_Bus_Station_Distance',
 'Nearest_Subway_Station_Distance',
 '동수_x',
 '합계 세대수',
 '분양 세대수',
 '서울특별시_부동산소비심리지수',
 '미분양_구',
 '기준금리',
 '국고채-회사채',
 '생산자물가지수',
 '소비자물가지수',
 '서울특별시_가계대출',
 '서울특별시_주택담보대출',
 '서울특별시_주택매매거래량',
 '서울특별시_아파트매매거래량',
 '전국매매가격지수',
 '전국전세가격지수',
 '서울특별시_주택미분양',
 '대형학원수',
 '종합학원수',
 '입시학원수',
 '외국어학원수',
 '독서실수',
 '모든학원수',
 'brand',
 'lower_outliers',
 'upper_outliers',
 '강남여부',
 '가격높은동',
 '재건축예상',
 '미분양_이동평균',
 '미분양_3개월대비',
 '지번주소',
 '지번주소.1',
 '좌표X_2',
 '좌표X_2.1',
 '좌표X_2.2',
 '좌표Y_2',
 '좌표Y_2.1',
 '좌표Y_2.2',
 '1번째_가까운_역_거리',
 '1번째_가까운_역_도보시간',
 '2번째_가까운_역_거리',
 '2번째_가까운_역_도보시간',
 

In [10]:
# Display a hand-picked subset of df's columns side by side.
df[['시군구', '번지', '계약년', '계약년월', '계약년월일', '서울특별시_부동산소비심리지수', '기준금리', '소비자물가지수', '서울특별시_가계대출', '서울특별시_주택담보대출', '서울특별시_주택매매거래량', '서울특별시_아파트매매거래량', '전국매매가격지수']]

Unnamed: 0,시군구,번지,계약년,계약년월,계약년월일,서울특별시_부동산소비심리지수,기준금리,소비자물가지수,서울특별시_가계대출,서울특별시_주택담보대출,서울특별시_주택매매거래량,서울특별시_아파트매매거래량,전국매매가격지수
0,서울특별시 관악구 신림동,1717,2007,200701,2007-01-15,,4.5,76.496,,,16826.0,6353.0,66.892
1,서울특별시 도봉구 방학동,740,2007,200701,2007-01-02,,4.5,76.496,,,16826.0,6353.0,66.892
2,서울특별시 중구 신당동,843,2007,200701,2007-01-28,,4.5,76.496,,,16826.0,6353.0,66.892
3,서울특별시 도봉구 방학동,740,2007,200701,2007-01-02,,4.5,76.496,,,16826.0,6353.0,66.892
4,서울특별시 도봉구 방학동,740,2007,200701,2007-01-02,,4.5,76.496,,,16826.0,6353.0,66.892
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,서울특별시 금천구 시흥동,1002-1,2023,202309,2023-09-08,100.9,3.5,112.850,407049.8,248147.3,6396.0,3845.0,93.280
1128090,서울특별시 금천구 시흥동,1013,2023,202309,2023-09-16,100.9,3.5,112.850,407049.8,248147.3,6396.0,3845.0,93.280
1128091,서울특별시 금천구 시흥동,1013,2023,202309,2023-09-16,100.9,3.5,112.850,407049.8,248147.3,6396.0,3845.0,93.280
1128092,서울특별시 송파구 송파동,163-14,2023,202309,2023-09-04,100.9,3.5,112.850,407049.8,248147.3,6396.0,3845.0,93.280


In [11]:
# Smallest value in the '계약년' column of df.
df['계약년'].min()

2007