In [2]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

import random

In [3]:
train_path = '../data/interim/train_final_ver1.csv'
test_path = '../data/interim/test_final_ver1.csv'

df_train = pd.read_csv(train_path, index_col=0)
df_test = pd.read_csv(test_path, index_col=0)

In [4]:
bus_path = '../data/external/bus_feature.csv'
subway_path = '../data/external/subway_feature.csv'

df_bus = pd.read_csv(bus_path)
df_subway = pd.read_csv(subway_path)

display(df_bus)
display(df_subway)

Unnamed: 0,노드 ID,정류소번호,정류소명,X좌표,Y좌표,정류소 타입
0,100000001,1001,종로2가사거리,126.987752,37.569808,중앙차로
1,100000002,1002,창경궁.서울대학교병원,126.996566,37.579183,중앙차로
2,100000003,1003,명륜3가.성대입구,126.998251,37.582581,중앙차로
3,100000004,1004,종로2가.삼일교,126.987613,37.568579,중앙차로
4,100000005,1005,혜화동로터리.여운형활동터,127.001744,37.586243,중앙차로
...,...,...,...,...,...,...
12579,124000334,25995,우성아파트,127.139338,37.550386,일반차로
12580,124000333,25996,우성아파트,127.140046,37.550643,일반차로
12581,124000332,25997,조일약국,127.123596,37.533630,일반차로
12582,124000331,25998,성내시장,127.125497,37.536155,일반차로


Unnamed: 0,역사_ID,역사명,호선,위도,경도
0,9996,미사,5호선,37.560927,127.193877
1,9995,강일,5호선,37.557490,127.175930
2,4929,김포공항,김포골드라인,37.562360,126.801868
3,4928,고촌,김포골드라인,37.601243,126.770345
4,4927,풍무,김포골드라인,37.612488,126.732387
...,...,...,...,...,...
763,154,종로5가,1호선,37.570926,127.001849
764,153,종로3가,1호선,37.570406,126.991847
765,152,종각,1호선,37.570161,126.982923
766,151,시청,1호선,37.565715,126.977088


In [5]:
df_bus = df_bus.rename(columns={'X좌표' : 'bus_x', 'Y좌표' : 'bus_y'})
df_subway = df_subway.rename(columns={'위도' : 'sub_y', '경도' : 'sub_x'})

display(df_bus)
display(df_subway)

Unnamed: 0,노드 ID,정류소번호,정류소명,bus_x,bus_y,정류소 타입
0,100000001,1001,종로2가사거리,126.987752,37.569808,중앙차로
1,100000002,1002,창경궁.서울대학교병원,126.996566,37.579183,중앙차로
2,100000003,1003,명륜3가.성대입구,126.998251,37.582581,중앙차로
3,100000004,1004,종로2가.삼일교,126.987613,37.568579,중앙차로
4,100000005,1005,혜화동로터리.여운형활동터,127.001744,37.586243,중앙차로
...,...,...,...,...,...,...
12579,124000334,25995,우성아파트,127.139338,37.550386,일반차로
12580,124000333,25996,우성아파트,127.140046,37.550643,일반차로
12581,124000332,25997,조일약국,127.123596,37.533630,일반차로
12582,124000331,25998,성내시장,127.125497,37.536155,일반차로


Unnamed: 0,역사_ID,역사명,호선,sub_y,sub_x
0,9996,미사,5호선,37.560927,127.193877
1,9995,강일,5호선,37.557490,127.175930
2,4929,김포공항,김포골드라인,37.562360,126.801868
3,4928,고촌,김포골드라인,37.601243,126.770345
4,4927,풍무,김포골드라인,37.612488,126.732387
...,...,...,...,...,...
763,154,종로5가,1호선,37.570926,127.001849
764,153,종로3가,1호선,37.570406,126.991847
765,152,종각,1호선,37.570161,126.982923
766,151,시청,1호선,37.565715,126.977088


In [6]:
display(df_train)
display(df_test)

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,124000,개포6차우성,201712,언주로 3
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,123500,개포6차우성,201712,언주로 3
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,91500,개포6차우성,201712,언주로 3
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,130000,개포6차우성,201801,언주로 3
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,117000,개포6차우성,201801,언주로 3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,혼합식,개별난방,서울특별시 은평구 구산동,59.94,12,11,1998,4.0,366.0,45515.0,0.0,126.905638,37.612962,20000,갈현현대,200707,서오릉로21길 36
1118818,혼합식,개별난방,서울특별시 은평구 구산동,59.94,25,10,1998,4.0,366.0,45515.0,0.0,126.905638,37.612962,20000,갈현현대,200708,서오릉로21길 36
1118819,혼합식,개별난방,서울특별시 은평구 구산동,84.83,31,20,1998,4.0,366.0,45515.0,0.0,126.905638,37.612962,28000,갈현현대,200708,서오릉로21길 36
1118820,혼합식,개별난방,서울특별시 은평구 구산동,84.83,15,8,1998,4.0,366.0,45515.0,0.0,126.905638,37.612962,29000,갈현현대,200709,서오릉로21길 36


Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,아파트명,계약년월,도로명
0,계단식,개별난방,서울특별시 강남구 개포동,79.9700,26,5,1987,8.0,270.0,22637.0,4858.00,127.057210,37.476763,개포6차우성,202307,언주로 3
1,계단식,지역난방,서울특별시 강남구 개포동,108.2017,15,10,2021,2.0,232.0,44951.0,2724.46,127.056394,37.484892,개포더샵트리에,202308,개포로 311
2,혼합식,지역난방,서울특별시 강남구 개포동,161.0000,28,15,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,개포우성3차,202307,개포로 307
3,혼합식,지역난방,서울특별시 강남구 개포동,133.4600,10,14,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,개포우성3차,202308,개포로 307
4,혼합식,지역난방,서울특별시 강남구 개포동,104.4300,18,6,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,개포우성3차,202308,개포로 307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,혼합식,개별난방,서울특별시 중랑구 신내동,84.6500,19,13,2014,16.0,1402.0,190866.0,14171.00,127.106720,37.618870,신내우디안1단지,202307,신내역로1길 85
9268,혼합식,개별난방,서울특별시 중랑구 신내동,84.6200,25,12,2014,16.0,1402.0,190866.0,14171.00,127.106720,37.618870,신내우디안1단지,202307,신내역로1길 85
9269,혼합식,개별난방,서울특별시 중랑구 신내동,101.6500,27,12,2014,16.0,1402.0,190866.0,14171.00,127.106720,37.618870,신내우디안1단지,202308,신내역로1길 85
9270,혼합식,개별난방,서울특별시 중랑구 신내동,84.9400,2,18,2014,16.0,1402.0,190866.0,14171.00,127.106720,37.618870,신내우디안1단지,202309,신내역로1길 85


In [7]:
df_train['is_test'] = 0
df_test['is_test'] = 1
df_all = pd.concat([df_train, df_test])
df_all

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,124000.0,개포6차우성,201712,언주로 3,0
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,123500.0,개포6차우성,201712,언주로 3,0
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,91500.0,개포6차우성,201712,언주로 3,0
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,130000.0,개포6차우성,201801,언주로 3,0
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,117000.0,개포6차우성,201801,언주로 3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,혼합식,개별난방,서울특별시 중랑구 신내동,84.65,19,13,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202307,신내역로1길 85,1
9268,혼합식,개별난방,서울특별시 중랑구 신내동,84.62,25,12,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202307,신내역로1길 85,1
9269,혼합식,개별난방,서울특별시 중랑구 신내동,101.65,27,12,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202308,신내역로1길 85,1
9270,혼합식,개별난방,서울특별시 중랑구 신내동,84.94,2,18,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202309,신내역로1길 85,1


In [8]:
display(df_bus.describe())
display(df_subway.describe())
display(df_all.describe())

Unnamed: 0,노드 ID,정류소번호,bus_x,bus_y
count,12584.0,12584.0,12584.0,12584.0
mean,113178300.0,14295.31246,126.984517,37.55056
std,6964900.0,6957.804578,0.086485,0.054734
min,100000000.0,1001.0,126.721031,37.430537
25%,107900200.0,8780.75,126.915837,37.502978
50%,113900100.0,14601.5,126.992572,37.549841
75%,119000300.0,20511.25,127.049804,37.589799
max,167000600.0,25999.0,127.18176,37.690489


Unnamed: 0,역사_ID,sub_y,sub_x
count,768.0,768.0,768.0
mean,2297.946615,37.509654,126.969929
std,1418.164103,0.15126,0.183336
min,150.0,36.769502,126.441442
25%,1276.75,37.470875,126.851355
50%,1955.5,37.52124,126.993792
75%,3131.25,37.575834,127.070481
max,9996.0,37.9481,127.723792


Unnamed: 0,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,계약년월,is_test
count,1128094.0,1128094.0,1128094.0,1128094.0,250887.0,251969.0,251969.0,251815.0,1128094.0,1128094.0,1118822.0,1128094.0,1128094.0
mean,77.16028,15.79966,8.881412,1998.791,14.810871,1186.767436,161901.0,190491.5,126.9943,37.55437,57991.53,201482.9,0.008219173
std,29.36448,8.722162,5.989302,9.35854,17.682771,1197.452046,184311.3,1735029.0,0.7329411,0.2224538,46426.02,423.7691,0.09028635
min,10.02,1.0,-4.0,1961.0,1.0,59.0,0.0,0.0,0.0,0.0,350.0,200701.0,0.0
25%,59.65,8.0,4.0,1992.0,5.0,405.0,53666.0,0.0,126.9175,37.50836,30500.0,201111.0,0.0
50%,81.87,16.0,8.0,2000.0,10.0,768.0,102624.0,1735.0,127.0199,37.5475,44800.0,201508.0,0.0
75%,84.96,23.0,12.0,2005.0,17.0,1622.0,203904.0,8414.21,127.0675,37.60063,69800.0,201805.0,0.0
max,424.32,31.0,69.0,2023.0,124.0,9510.0,9591851.0,31596200.0,127.18,37.68929,1450000.0,202309.0,1.0


In [9]:
df_all.columns

Index(['복도유형', '난방방식', '시군구', '전용면적', '계약일', '층', '건축년도', 'k-전체동수', 'k-전체세대수',
       'k-연면적', '건축면적', 'x', 'y', 'target', '아파트명', '계약년월', '도로명', 'is_test'],
      dtype='object')

In [10]:
df_all[(df_all['아파트명'] == '삼성') & (df_all['시군구'] == '서울특별시 종로구 평창동')]

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test
92239,계단식,개별난방,서울특별시 종로구 평창동,84.93,31,13,1998,,,,,126.979915,37.611731,46000.0,삼성,201710,평창문화로 172,0
92240,계단식,개별난방,서울특별시 종로구 평창동,84.93,3,9,1998,,,,,126.979915,37.611731,45700.0,삼성,201711,평창문화로 172,0
92241,혼합식,개별난방,서울특별시 종로구 평창동,59.97,27,4,1998,,,,,126.979915,37.611731,34000.0,삼성,201711,평창문화로 171,0
92242,계단식,개별난방,서울특별시 종로구 평창동,84.98,13,3,1998,,,,,126.979915,37.611731,40000.0,삼성,201712,평창문화로 172,0
92243,혼합식,개별난방,서울특별시 종로구 평창동,59.95,22,14,1998,,,,,126.979915,37.611731,38800.0,삼성,201801,평창문화로 172,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115601,계단식,개별난방,서울특별시 종로구 평창동,84.93,27,6,1998,,,,,0.000000,0.000000,30700.0,삼성,200706,,0
1115602,계단식,개별난방,서울특별시 종로구 평창동,84.93,6,5,1998,,,,,0.000000,0.000000,25000.0,삼성,200707,,0
1115603,혼합식,개별난방,서울특별시 종로구 평창동,59.97,18,7,1998,,,,,126.979915,37.611731,23000.0,삼성,200707,평창문화로 172,0
1115604,혼합식,개별난방,서울특별시 종로구 평창동,59.97,1,10,1998,,,,,126.979915,37.611731,20000.0,삼성,200709,평창문화로 172,0


## 평창 삼성아파트의 위도 경도가 0으로 기입되어있음.

    - 따라서 실제 위도 경도로 바꿔주기

In [11]:
df_all.loc[(df_all['아파트명'] == '삼성') & (df_all['x'] < 1), 'x'] = 126.9788
df_all.loc[(df_all['아파트명'] == '삼성') & (df_all['y'] < 1), 'y'] = 37.6133

In [12]:
df_all.describe()

Unnamed: 0,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,계약년월,is_test
count,1128094.0,1128094.0,1128094.0,1128094.0,250887.0,251969.0,251969.0,251815.0,1128094.0,1128094.0,1118822.0,1128094.0,1128094.0
mean,77.16028,15.79966,8.881412,1998.791,14.810871,1186.767436,161901.0,190491.5,126.9984,37.5556,57991.53,201482.9,0.008219173
std,29.36448,8.722162,5.989302,9.35854,17.682771,1197.452046,184311.3,1735029.0,0.09067939,0.05680886,46426.02,423.7691,0.09028635
min,10.02,1.0,-4.0,1961.0,1.0,59.0,0.0,0.0,126.7983,37.4344,350.0,200701.0,0.0
25%,59.65,8.0,4.0,1992.0,5.0,405.0,53666.0,0.0,126.9175,37.5084,30500.0,201111.0,0.0
50%,81.87,16.0,8.0,2000.0,10.0,768.0,102624.0,1735.0,127.0199,37.5475,44800.0,201508.0,0.0
75%,84.96,23.0,12.0,2005.0,17.0,1622.0,203904.0,8414.21,127.0675,37.60077,69800.0,201805.0,0.0
max,424.32,31.0,69.0,2023.0,124.0,9510.0,9591851.0,31596200.0,127.18,37.68929,1450000.0,202309.0,1.0


## 버스 좌표와 전체 데이터셋의 좌표를 통해 정류장과의 거리 피쳐 생성

In [13]:
df_bus

Unnamed: 0,노드 ID,정류소번호,정류소명,bus_x,bus_y,정류소 타입
0,100000001,1001,종로2가사거리,126.987752,37.569808,중앙차로
1,100000002,1002,창경궁.서울대학교병원,126.996566,37.579183,중앙차로
2,100000003,1003,명륜3가.성대입구,126.998251,37.582581,중앙차로
3,100000004,1004,종로2가.삼일교,126.987613,37.568579,중앙차로
4,100000005,1005,혜화동로터리.여운형활동터,127.001744,37.586243,중앙차로
...,...,...,...,...,...,...
12579,124000334,25995,우성아파트,127.139338,37.550386,일반차로
12580,124000333,25996,우성아파트,127.140046,37.550643,일반차로
12581,124000332,25997,조일약국,127.123596,37.533630,일반차로
12582,124000331,25998,성내시장,127.125497,37.536155,일반차로


In [14]:
import math

# 위경도를 이용해 두 지점간의 거리를 구하는 함수를 생성합니다.
def haversine_distance(lat1, lon1, lat2, lon2): # y1, x1, y2, x2
    radius = 6371.0

    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = radius * c
    return distance

In [15]:
bus_sample = df_bus.iloc[:10]
all_sample = df_all.iloc[:10]

display(bus_sample)
display(all_sample)

Unnamed: 0,노드 ID,정류소번호,정류소명,bus_x,bus_y,정류소 타입
0,100000001,1001,종로2가사거리,126.987752,37.569808,중앙차로
1,100000002,1002,창경궁.서울대학교병원,126.996566,37.579183,중앙차로
2,100000003,1003,명륜3가.성대입구,126.998251,37.582581,중앙차로
3,100000004,1004,종로2가.삼일교,126.987613,37.568579,중앙차로
4,100000005,1005,혜화동로터리.여운형활동터,127.001744,37.586243,중앙차로
5,101000305,1006,서대문역사거리,126.966893,37.566137,중앙차로
6,100000380,1007,서울역사박물관.경희궁앞,126.97038,37.569135,중앙차로
7,100000379,1008,서울역사박물관.경희궁앞,126.97076,37.569515,중앙차로
8,100000384,1009,광화문,126.976357,37.570142,중앙차로
9,100000385,1010,광화문,126.978086,37.570217,중앙차로


Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,124000.0,개포6차우성,201712,언주로 3,0
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,123500.0,개포6차우성,201712,언주로 3,0
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,91500.0,개포6차우성,201712,언주로 3,0
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,130000.0,개포6차우성,201801,언주로 3,0
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,117000.0,개포6차우성,201801,언주로 3,0
5,계단식,개별난방,서울특별시 강남구 개포동,79.97,11,1,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,130000.0,개포6차우성,201801,언주로 3,0
6,계단식,개별난방,서울특별시 강남구 개포동,79.97,19,2,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,139500.0,개포6차우성,201803,언주로 3,0
7,계단식,개별난방,서울특별시 강남구 개포동,54.98,5,5,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,107500.0,개포6차우성,201804,언주로 3,0
8,계단식,개별난방,서울특별시 강남구 개포동,79.97,28,3,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,145000.0,개포6차우성,201806,언주로 3,0
9,계단식,개별난방,서울특별시 강남구 개포동,54.98,9,3,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,112000.0,개포6차우성,201807,언주로 3,0


In [16]:
df_loc = df_all[['x', 'y']]
unique_loc = set()
for i in range(len(df_loc)):
    unique_loc.add((df_loc['x'].values[i], df_loc['y'].values[i]))

# len(unique_loc) - 5887

In [17]:
unique_list = list(unique_loc)

In [18]:
unique_x, unique_y = [], []
for i in range(len(unique_list)):
    x, y = unique_list[i]
    unique_x.append(x)
    unique_y.append(y)

In [19]:
unique_x

[126.9516695,
 127.0540147273029,
 127.04754234729876,
 126.90453540408564,
 127.040462,
 126.9780911,
 126.87209913171048,
 126.942148701351,
 127.0332882,
 127.0354867,
 126.8889704,
 126.9490413,
 127.0618732,
 127.05873,
 126.9133162,
 126.86346960780637,
 127.0275895,
 126.9339504,
 126.8571418,
 127.0705565,
 126.82748279768444,
 127.1224052,
 126.948372,
 126.9448801,
 126.9183568,
 126.9071609,
 126.9530377,
 127.047983,
 127.12077535,
 127.1437144,
 126.9111664,
 126.819535,
 126.9159733,
 126.9397339,
 127.0428551,
 126.8997497,
 126.9735558,
 127.0503904,
 127.0464339,
 126.94179,
 127.0386293,
 127.0341005,
 127.0345357,
 127.0310684,
 126.9174157,
 126.896786,
 127.070637,
 126.8822117,
 126.8462422,
 126.91308384422445,
 126.9509079,
 127.0168969,
 127.0503113,
 127.1122234,
 126.9427466,
 127.0204439,
 126.8659598,
 126.8280182,
 127.0322547,
 127.1247501,
 126.9534382,
 127.0701578,
 127.0569387,
 126.8479062,
 127.0493092,
 127.0707049,
 126.8576779,
 127.0657325,
 127

In [20]:
ser_x = pd.Series(unique_x)
ser_y = pd.Series(unique_y)
display(ser_x)
display(ser_y)

0       126.951669
1       127.054015
2       127.047542
3       126.904535
4       127.040462
           ...    
5882    127.033902
5883    127.057734
5884    126.918684
5885    126.894966
5886    127.044834
Length: 5887, dtype: float64

0       37.480261
1       37.489476
2       37.583577
3       37.589763
4       37.590592
          ...    
5882    37.486772
5883    37.539845
5884    37.526537
5885    37.513878
5886    37.625137
Length: 5887, dtype: float64

In [21]:
df_loc = pd.DataFrame({'x' : ser_x, 'y' : ser_y})
df_loc

Unnamed: 0,x,y
0,126.951669,37.480261
1,127.054015,37.489476
2,127.047542,37.583577
3,126.904535,37.589763
4,127.040462,37.590592
...,...,...
5882,127.033902,37.486772
5883,127.057734,37.539845
5884,126.918684,37.526537
5885,126.894966,37.513878


In [22]:
display(df_bus)
display(df_loc)

Unnamed: 0,노드 ID,정류소번호,정류소명,bus_x,bus_y,정류소 타입
0,100000001,1001,종로2가사거리,126.987752,37.569808,중앙차로
1,100000002,1002,창경궁.서울대학교병원,126.996566,37.579183,중앙차로
2,100000003,1003,명륜3가.성대입구,126.998251,37.582581,중앙차로
3,100000004,1004,종로2가.삼일교,126.987613,37.568579,중앙차로
4,100000005,1005,혜화동로터리.여운형활동터,127.001744,37.586243,중앙차로
...,...,...,...,...,...,...
12579,124000334,25995,우성아파트,127.139338,37.550386,일반차로
12580,124000333,25996,우성아파트,127.140046,37.550643,일반차로
12581,124000332,25997,조일약국,127.123596,37.533630,일반차로
12582,124000331,25998,성내시장,127.125497,37.536155,일반차로


Unnamed: 0,x,y
0,126.951669,37.480261
1,127.054015,37.489476
2,127.047542,37.583577
3,126.904535,37.589763
4,127.040462,37.590592
...,...,...
5882,127.033902,37.486772
5883,127.057734,37.539845
5884,126.918684,37.526537
5885,126.894966,37.513878


In [23]:
bus_sample = df_bus.iloc[:10]
loc_sample = df_loc.iloc[:10]

display(bus_sample)
display(loc_sample)

Unnamed: 0,노드 ID,정류소번호,정류소명,bus_x,bus_y,정류소 타입
0,100000001,1001,종로2가사거리,126.987752,37.569808,중앙차로
1,100000002,1002,창경궁.서울대학교병원,126.996566,37.579183,중앙차로
2,100000003,1003,명륜3가.성대입구,126.998251,37.582581,중앙차로
3,100000004,1004,종로2가.삼일교,126.987613,37.568579,중앙차로
4,100000005,1005,혜화동로터리.여운형활동터,127.001744,37.586243,중앙차로
5,101000305,1006,서대문역사거리,126.966893,37.566137,중앙차로
6,100000380,1007,서울역사박물관.경희궁앞,126.97038,37.569135,중앙차로
7,100000379,1008,서울역사박물관.경희궁앞,126.97076,37.569515,중앙차로
8,100000384,1009,광화문,126.976357,37.570142,중앙차로
9,100000385,1010,광화문,126.978086,37.570217,중앙차로


Unnamed: 0,x,y
0,126.951669,37.480261
1,127.054015,37.489476
2,127.047542,37.583577
3,126.904535,37.589763
4,127.040462,37.590592
5,126.978091,37.481291
6,126.872099,37.538179
7,126.942149,37.505792
8,127.033288,37.609448
9,127.035487,37.576978


In [24]:
def get_bus_min_dist(df1, df2): # 거리 차이를 계산할 데이터 프레임 --> 전체 데이터, 버스 or 지하철 데이터
    min_dist = []
    for i in range(len(df1)):
        x, y = df1['x'].values[i], df1['y'].values[i]
        dist = []
        for j in range(len(df2)):
            tar_x, tar_y = df2['bus_x'].values[j], df2['bus_y'].values[j]
            harv_dist = haversine_distance(y, x, tar_y, tar_x)
            dist.append(harv_dist)
        min_dist.append(min(dist))

    return min_dist
        

In [25]:
bus_dist = []
for i in range(len(loc_sample)):
    x, y = loc_sample['x'].values[i], loc_sample['y'].values[i]
    temp = []
    for j in range(len(bus_sample)):
        bus_x, bus_y = bus_sample['bus_x'].values[j], bus_sample['bus_y'].values[j]
        harv_dist = haversine_distance(y, x, bus_y, bus_x)
        temp.append(harv_dist)
    bus_dist.append(min(temp))

print(bus_dist)

[9.64284860789448, 10.566669054415284, 4.046458141171095, 6.090913311687588, 3.4456297446204127, 9.486000579988003, 8.916113257314377, 7.055862186941602, 3.7922297364627693, 3.1468429737774706]


In [26]:
bus_min_dist = get_bus_min_dist(df_loc, df_bus)

In [27]:
df_loc['bus_min_dist'] = bus_min_dist
display(df_loc)

Unnamed: 0,x,y,bus_min_dist
0,126.951669,37.480261,0.066548
1,127.054015,37.489476,0.139641
2,127.047542,37.583577,0.116220
3,126.904535,37.589763,0.140379
4,127.040462,37.590592,0.067162
...,...,...,...
5882,127.033902,37.486772,0.058074
5883,127.057734,37.539845,0.177540
5884,126.918684,37.526537,0.030601
5885,126.894966,37.513878,0.035820


In [28]:
display(df_all)

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,124000.0,개포6차우성,201712,언주로 3,0
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,123500.0,개포6차우성,201712,언주로 3,0
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,91500.0,개포6차우성,201712,언주로 3,0
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,130000.0,개포6차우성,201801,언주로 3,0
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,117000.0,개포6차우성,201801,언주로 3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,혼합식,개별난방,서울특별시 중랑구 신내동,84.65,19,13,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202307,신내역로1길 85,1
9268,혼합식,개별난방,서울특별시 중랑구 신내동,84.62,25,12,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202307,신내역로1길 85,1
9269,혼합식,개별난방,서울특별시 중랑구 신내동,101.65,27,12,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202308,신내역로1길 85,1
9270,혼합식,개별난방,서울특별시 중랑구 신내동,84.94,2,18,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202309,신내역로1길 85,1


In [29]:
df_bus = pd.merge(df_all, df_loc, how='inner')
display(df_bus)

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test,bus_min_dist
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,124000.0,개포6차우성,201712,언주로 3,0,0.061783
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,123500.0,개포6차우성,201712,언주로 3,0,0.061783
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,91500.0,개포6차우성,201712,언주로 3,0,0.061783
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,130000.0,개포6차우성,201801,언주로 3,0,0.061783
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,117000.0,개포6차우성,201801,언주로 3,0,0.061783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,복도식,개별난방,서울특별시 중랑구 신내동,49.77,22,7,1996,,,,,127.093511,37.616533,,신내6대주,202307,신내로19길 42,1,0.053753
1128090,복도식,개별난방,서울특별시 중랑구 신내동,39.84,7,3,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.053753
1128091,혼합식,개별난방,서울특별시 중랑구 신내동,59.76,17,4,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.053753
1128092,복도식,개별난방,서울특별시 중랑구 신내동,39.84,2,9,1996,,,,,127.093511,37.616533,,신내6대주,202309,신내로19길 42,1,0.053753


In [32]:
bus_train = df_bus[df_bus['is_test'] == 0]
bus_test = df_bus[df_bus['is_test'] == 1]

display(bus_train)
display(bus_test)

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test,bus_min_dist
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,124000.0,개포6차우성,201712,언주로 3,0,0.061783
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,123500.0,개포6차우성,201712,언주로 3,0,0.061783
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,91500.0,개포6차우성,201712,언주로 3,0,0.061783
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,130000.0,개포6차우성,201801,언주로 3,0,0.061783
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,117000.0,개포6차우성,201801,언주로 3,0,0.061783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126418,혼합식,개별난방,서울특별시 강동구 둔촌동,73.98,2,2,1995,,,,,127.141232,37.530097,23800.0,성원,200707,양재대로102가길 22,0,0.141061
1126419,혼합식,개별난방,서울특별시 강동구 둔촌동,73.98,25,11,1995,,,,,127.141232,37.530097,26000.0,성원,200709,양재대로102가길 22,0,0.141061
1126420,계단식,지역난방,서울특별시 서초구 반포동,84.07,30,5,1979,,,,,126.996770,37.499668,100000.0,미주,200708,신반포로16길 15-20,0,0.043369
1126421,혼합식,지역난방,서울특별시 송파구 송파동,74.04,7,10,1995,,,,,127.106904,37.509822,31000.0,호수임광,200705,송파대로48길,0,0.090750


Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test,bus_min_dist
101,계단식,개별난방,서울특별시 강남구 개포동,79.97,26,5,1987,8.0,270.0,22637.0,4858.00,127.057210,37.476763,,개포6차우성,202307,언주로 3,1,0.061783
261,혼합식,지역난방,서울특별시 강남구 개포동,161.00,28,15,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,,개포우성3차,202307,개포로 307,1,0.098282
262,혼합식,지역난방,서울특별시 강남구 개포동,133.46,10,14,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,,개포우성3차,202308,개포로 307,1,0.098282
263,혼합식,지역난방,서울특별시 강남구 개포동,104.43,18,6,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,,개포우성3차,202308,개포로 307,1,0.098282
3669,복도식,지역난방,서울특별시 강남구 개포동,74.25,28,8,1983,6.0,940.0,93810.0,6524.00,127.068028,37.487802,,개포주공5단지,202307,삼성로4길 17,1,0.131192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,복도식,개별난방,서울특별시 중랑구 신내동,49.77,22,7,1996,,,,,127.093511,37.616533,,신내6대주,202307,신내로19길 42,1,0.053753
1128090,복도식,개별난방,서울특별시 중랑구 신내동,39.84,7,3,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.053753
1128091,혼합식,개별난방,서울특별시 중랑구 신내동,59.76,17,4,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.053753
1128092,복도식,개별난방,서울특별시 중랑구 신내동,39.84,2,9,1996,,,,,127.093511,37.616533,,신내6대주,202309,신내로19길 42,1,0.053753


In [33]:
bus_train.to_csv('../data/interim/bus_train.csv', index=False)
bus_test.to_csv('../data/interim/bus_test.csv', index=False)