In [64]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register a TTF font file with Matplotlib and make it the default font family.
# NOTE(review): the file is NanumGothic.ttf but it is registered under the family
# name 'NanumBarunGothic' — confirm that this alias is intentional.
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # path where the .ttf file is stored
    name='NanumBarunGothic')                        # family name to register this font under
fm.fontManager.ttflist.insert(0, fe)              # add the font entry to Matplotlib's font list
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # set the default font size and family
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

import random

In [65]:
# Locations of the interim train / test tables.
train_path = '../data/interim/train_final_ver1.csv'
test_path = '../data/interim/test_final_ver1.csv'

# Read both files the same way; the first CSV column is used as the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_path, test_path)
)

In [66]:
# Load the external subway-station table and show it.
# The latitude/longitude columns are named 위도 / 경도 in the raw file.
subway_path = '../data/external/subway_feature.csv'

df_subway = pd.read_csv(subway_path)

display(df_subway)

Unnamed: 0,역사_ID,역사명,호선,위도,경도
0,9996,미사,5호선,37.560927,127.193877
1,9995,강일,5호선,37.557490,127.175930
2,4929,김포공항,김포골드라인,37.562360,126.801868
3,4928,고촌,김포골드라인,37.601243,126.770345
4,4927,풍무,김포골드라인,37.612488,126.732387
...,...,...,...,...,...
763,154,종로5가,1호선,37.570926,127.001849
764,153,종로3가,1호선,37.570406,126.991847
765,152,종각,1호선,37.570161,126.982923
766,151,시청,1호선,37.565715,126.977088


In [67]:
# Give the station coordinates a `sub_` prefix: 위도 (latitude) -> sub_y, 경도 (longitude) -> sub_x.
coord_renames = {'위도' : 'sub_y', '경도' : 'sub_x'}
df_subway = df_subway.rename(columns=coord_renames)
display(df_subway)

Unnamed: 0,역사_ID,역사명,호선,sub_y,sub_x
0,9996,미사,5호선,37.560927,127.193877
1,9995,강일,5호선,37.557490,127.175930
2,4929,김포공항,김포골드라인,37.562360,126.801868
3,4928,고촌,김포골드라인,37.601243,126.770345
4,4927,풍무,김포골드라인,37.612488,126.732387
...,...,...,...,...,...
763,154,종로5가,1호선,37.570926,127.001849
764,153,종로3가,1호선,37.570406,126.991847
765,152,종각,1호선,37.570161,126.982923
766,151,시청,1호선,37.565715,126.977088


In [68]:
# Tag every row with its origin (0 = train, 1 = test) so both frames can be
# processed together and separated again afterwards.
df_train['is_test'] = 0
df_test['is_test'] = 1
# NOTE(review): concat keeps each frame's own index labels, so labels can repeat
# between the train part and the test part of df_all.
df_all = pd.concat([df_train, df_test])
display(df_all)

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,124000.0,개포6차우성,201712,언주로 3,0
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,123500.0,개포6차우성,201712,언주로 3,0
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,91500.0,개포6차우성,201712,언주로 3,0
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,130000.0,개포6차우성,201801,언주로 3,0
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.05721,37.476763,117000.0,개포6차우성,201801,언주로 3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,혼합식,개별난방,서울특별시 중랑구 신내동,84.65,19,13,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202307,신내역로1길 85,1
9268,혼합식,개별난방,서울특별시 중랑구 신내동,84.62,25,12,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202307,신내역로1길 85,1
9269,혼합식,개별난방,서울특별시 중랑구 신내동,101.65,27,12,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202308,신내역로1길 85,1
9270,혼합식,개별난방,서울특별시 중랑구 신내동,84.94,2,18,2014,16.0,1402.0,190866.0,14171.0,127.10672,37.618870,,신내우디안1단지,202309,신내역로1길 85,1


In [69]:
# Summary statistics of both tables, used to sanity-check the coordinate ranges.
display(df_all.describe(), df_subway.describe())

Unnamed: 0,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,계약년월,is_test
count,1128094.0,1128094.0,1128094.0,1128094.0,250887.0,251969.0,251969.0,251815.0,1128094.0,1128094.0,1118822.0,1128094.0,1128094.0
mean,77.16028,15.79966,8.881412,1998.791,14.810871,1186.767436,161901.0,190491.5,126.9943,37.55437,57991.53,201482.9,0.008219173
std,29.36448,8.722162,5.989302,9.35854,17.682771,1197.452046,184311.3,1735029.0,0.7329411,0.2224538,46426.02,423.7691,0.09028635
min,10.02,1.0,-4.0,1961.0,1.0,59.0,0.0,0.0,0.0,0.0,350.0,200701.0,0.0
25%,59.65,8.0,4.0,1992.0,5.0,405.0,53666.0,0.0,126.9175,37.50836,30500.0,201111.0,0.0
50%,81.87,16.0,8.0,2000.0,10.0,768.0,102624.0,1735.0,127.0199,37.5475,44800.0,201508.0,0.0
75%,84.96,23.0,12.0,2005.0,17.0,1622.0,203904.0,8414.21,127.0675,37.60063,69800.0,201805.0,0.0
max,424.32,31.0,69.0,2023.0,124.0,9510.0,9591851.0,31596200.0,127.18,37.68929,1450000.0,202309.0,1.0


Unnamed: 0,역사_ID,sub_y,sub_x
count,768.0,768.0,768.0
mean,2297.946615,37.509654,126.969929
std,1418.164103,0.15126,0.183336
min,150.0,36.769502,126.441442
25%,1276.75,37.470875,126.851355
50%,1955.5,37.52124,126.993792
75%,3131.25,37.575834,127.070481
max,9996.0,37.9481,127.723792


In [70]:
# Rows of the apartment named '삼성' whose coordinate is below 1 get a fixed
# replacement value (x -> 126.9788, y -> 37.6133).
# NOTE(review): presumably these are the real coordinates of that complex — confirm the source.
samsung_rows = df_all['아파트명'] == '삼성'
for axis, fixed_value in (('x', 126.9788), ('y', 37.6133)):
    df_all.loc[samsung_rows & (df_all[axis] < 1), axis] = fixed_value

# Re-check the value ranges after the patch.
display(df_all.describe(), df_subway.describe())

Unnamed: 0,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,계약년월,is_test
count,1128094.0,1128094.0,1128094.0,1128094.0,250887.0,251969.0,251969.0,251815.0,1128094.0,1128094.0,1118822.0,1128094.0,1128094.0
mean,77.16028,15.79966,8.881412,1998.791,14.810871,1186.767436,161901.0,190491.5,126.9984,37.5556,57991.53,201482.9,0.008219173
std,29.36448,8.722162,5.989302,9.35854,17.682771,1197.452046,184311.3,1735029.0,0.09067939,0.05680886,46426.02,423.7691,0.09028635
min,10.02,1.0,-4.0,1961.0,1.0,59.0,0.0,0.0,126.7983,37.4344,350.0,200701.0,0.0
25%,59.65,8.0,4.0,1992.0,5.0,405.0,53666.0,0.0,126.9175,37.5084,30500.0,201111.0,0.0
50%,81.87,16.0,8.0,2000.0,10.0,768.0,102624.0,1735.0,127.0199,37.5475,44800.0,201508.0,0.0
75%,84.96,23.0,12.0,2005.0,17.0,1622.0,203904.0,8414.21,127.0675,37.60077,69800.0,201805.0,0.0
max,424.32,31.0,69.0,2023.0,124.0,9510.0,9591851.0,31596200.0,127.18,37.68929,1450000.0,202309.0,1.0


Unnamed: 0,역사_ID,sub_y,sub_x
count,768.0,768.0,768.0
mean,2297.946615,37.509654,126.969929
std,1418.164103,0.15126,0.183336
min,150.0,36.769502,126.441442
25%,1276.75,37.470875,126.851355
50%,1955.5,37.52124,126.993792
75%,3131.25,37.575834,127.070481
max,9996.0,37.9481,127.723792


In [71]:
import math

# Distance between two points given as latitude/longitude pairs.
def haversine_distance(lat1, lon1, lat2, lon2, radius=6371.0): # y1, x1, y2, x2
    """Return the great-circle distance between two points (haversine formula).

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius: Radius of the sphere; the result is expressed in the same unit.
            Defaults to 6371.0 (Earth's mean radius in kilometres).

    Returns:
        The distance along the sphere surface as a float.
    """
    phi1 = math.radians(lat1)
    lam1 = math.radians(lon1)
    phi2 = math.radians(lat2)
    lam2 = math.radians(lon2)

    half_dlat = (phi2 - phi1) / 2
    half_dlon = (lam2 - lam1) / 2
    a = math.sin(half_dlat) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(half_dlon) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    central_angle = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * central_angle

In [72]:
# Collect the distinct (x, y) coordinate pairs that occur in df_all.
df_loc = df_all[['x', 'y']]

# Pair the two column arrays in a single pass instead of looking up
# `.values[i]` on each column for every row.
unique_loc = set(zip(df_loc['x'].values, df_loc['y'].values))

unique_list = list(unique_loc)

# Split the pairs back into two parallel lists (both empty when there are no rows).
unique_x = [x_val for x_val, _ in unique_list]
unique_y = [y_val for _, y_val in unique_list]

In [73]:
# Wrap the two coordinate lists as Series and show them.
ser_x, ser_y = pd.Series(unique_x), pd.Series(unique_y)
display(ser_x, ser_y)

0       126.951669
1       127.054015
2       127.047542
3       126.904535
4       127.040462
           ...    
5882    127.033902
5883    127.057734
5884    126.918684
5885    126.894966
5886    127.044834
Length: 5887, dtype: float64

0       37.480261
1       37.489476
2       37.583577
3       37.589763
4       37.590592
          ...    
5882    37.486772
5883    37.539845
5884    37.526537
5885    37.513878
5886    37.625137
Length: 5887, dtype: float64

In [74]:
# One row per distinct coordinate pair.
df_loc = pd.DataFrame(dict(x=ser_x, y=ser_y))

display(df_loc, df_subway)

Unnamed: 0,x,y
0,126.951669,37.480261
1,127.054015,37.489476
2,127.047542,37.583577
3,126.904535,37.589763
4,127.040462,37.590592
...,...,...
5882,127.033902,37.486772
5883,127.057734,37.539845
5884,126.918684,37.526537
5885,126.894966,37.513878


Unnamed: 0,역사_ID,역사명,호선,sub_y,sub_x
0,9996,미사,5호선,37.560927,127.193877
1,9995,강일,5호선,37.557490,127.175930
2,4929,김포공항,김포골드라인,37.562360,126.801868
3,4928,고촌,김포골드라인,37.601243,126.770345
4,4927,풍무,김포골드라인,37.612488,126.732387
...,...,...,...,...,...
763,154,종로5가,1호선,37.570926,127.001849
764,153,종로3가,1호선,37.570406,126.991847
765,152,종각,1호선,37.570161,126.982923
766,151,시청,1호선,37.565715,126.977088


In [75]:
# First ten rows of each table, for a quick trial run of the distance function.
subway_sample = df_subway.head(10)
loc_sample = df_loc.head(10)

display(subway_sample, loc_sample)

Unnamed: 0,역사_ID,역사명,호선,sub_y,sub_x
0,9996,미사,5호선,37.560927,127.193877
1,9995,강일,5호선,37.55749,127.17593
2,4929,김포공항,김포골드라인,37.56236,126.801868
3,4928,고촌,김포골드라인,37.601243,126.770345
4,4927,풍무,김포골드라인,37.612488,126.732387
5,4926,사우(김포시청),김포골드라인,37.620249,126.719731
6,4925,걸포북변,김포골드라인,37.63165,126.705975
7,4924,운양,김포골드라인,37.653867,126.68393
8,4923,장기,김포골드라인,37.643986,126.669017
9,4922,마산,김포골드라인,37.640732,126.644344


Unnamed: 0,x,y
0,126.951669,37.480261
1,127.054015,37.489476
2,127.047542,37.583577
3,126.904535,37.589763
4,127.040462,37.590592
5,126.978091,37.481291
6,126.872099,37.538179
7,126.942149,37.505792
8,127.033288,37.609448
9,127.035487,37.576978


In [76]:
def get_subway_min_dist(df1, df2, x_col='sub_x', y_col='sub_y', radius=6371.0, chunk_size=2048):
    """Return, for every row of ``df1``, the distance to the nearest row of ``df2``.

    Distances use the same haversine formula as ``haversine_distance``, but are
    evaluated with NumPy broadcasting instead of a Python double loop.

    Args:
        df1: Frame of query locations with longitude in ``'x'`` and latitude
            in ``'y'`` (degrees).
        df2: Frame of target locations (e.g. subway stations or bus stops).
        x_col: Name of the longitude column in ``df2``. Defaults to ``'sub_x'``.
        y_col: Name of the latitude column in ``df2``. Defaults to ``'sub_y'``.
        radius: Sphere radius; results are in the same unit (default 6371.0).
        chunk_size: Number of ``df1`` rows processed per batch; bounds the size
            of the intermediate ``chunk_size x len(df2)`` distance matrix.

    Returns:
        list[float]: one minimum distance per row of ``df1``, in row order.
        An empty list when ``df1`` has no rows. NaN coordinates propagate to
        a NaN result.

    Raises:
        ValueError: if ``df1`` is non-empty and ``df2`` has no rows.
    """
    n_points = len(df1)
    if n_points == 0:
        return []
    if len(df2) == 0:
        raise ValueError("df2 is empty: cannot compute a minimum distance")

    chunk_size = max(1, int(chunk_size))

    # Pull the coordinate columns out once and convert them to radians.
    src_lat = np.radians(df1['y'].to_numpy(dtype=float))
    src_lon = np.radians(df1['x'].to_numpy(dtype=float))
    tgt_lat = np.radians(df2[y_col].to_numpy(dtype=float))
    tgt_lon = np.radians(df2[x_col].to_numpy(dtype=float))
    cos_tgt_lat = np.cos(tgt_lat)

    min_dist = []
    for start in range(0, n_points, chunk_size):
        # Column vectors broadcast against the 1-D target arrays -> (rows, targets).
        lat = src_lat[start:start + chunk_size, None]
        lon = src_lon[start:start + chunk_size, None]

        a = (np.sin((tgt_lat - lat) / 2) ** 2
             + np.cos(lat) * cos_tgt_lat * np.sin((tgt_lon - lon) / 2) ** 2)
        # Guard against rounding pushing `a` marginally outside [0, 1].
        a = np.clip(a, 0.0, 1.0)
        dist = radius * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        min_dist.extend(dist.min(axis=1).tolist())

    return min_dist
        

In [77]:
# Trial run on the samples to check that the function returns sensible values.
# Only the first 10 stations are searched here, so these distances are not the final ones.
sub_min_dist = get_subway_min_dist(loc_sample, subway_sample)
print(sub_min_dist)

[16.058474418875416, 13.145090047345583, 11.681155779670387, 9.547070853572544, 12.493212374112847, 17.966432108933997, 6.7500549731014985, 13.876918862330527, 13.833543528870635, 12.566543477505954]


In [78]:
# Run on the full data: every distinct location against every station.
sub_min_dist = get_subway_min_dist(df_loc, df_subway)

In [79]:
# Store the nearest-station distance next to each distinct location
# (sub_min_dist is in the same row order as df_loc).
df_loc['sub_min_dist'] = sub_min_dist
display(df_loc)

Unnamed: 0,x,y,sub_min_dist
0,126.951669,37.480261,0.144627
1,127.054015,37.489476,0.204798
2,127.047542,37.583577,0.320424
3,126.904535,37.589763,0.794475
4,127.040462,37.590592,0.367183
...,...,...,...
5882,127.033902,37.486772,0.255192
5883,127.057734,37.539845,0.549306
5884,126.918684,37.526537,0.188387
5885,126.894966,37.513878,0.451294


In [80]:
# Attach sub_min_dist to every row of df_all by joining on the coordinate pair.
# NOTE(review): this rebinds df_subway, which previously held the station table,
# to the merged table; earlier cells that expect station columns cannot be re-run after this.
df_subway = df_all.merge(df_loc, how='inner', on=['x', 'y'])
display(df_subway)

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test,sub_min_dist
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,124000.0,개포6차우성,201712,언주로 3,0,1.129775
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,123500.0,개포6차우성,201712,언주로 3,0,1.129775
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,91500.0,개포6차우성,201712,언주로 3,0,1.129775
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,130000.0,개포6차우성,201801,언주로 3,0,1.129775
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,117000.0,개포6차우성,201801,언주로 3,0,1.129775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,복도식,개별난방,서울특별시 중랑구 신내동,49.77,22,7,1996,,,,,127.093511,37.616533,,신내6대주,202307,신내로19길 42,1,0.203699
1128090,복도식,개별난방,서울특별시 중랑구 신내동,39.84,7,3,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.203699
1128091,혼합식,개별난방,서울특별시 중랑구 신내동,59.76,17,4,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.203699
1128092,복도식,개별난방,서울특별시 중랑구 신내동,39.84,2,9,1996,,,,,127.093511,37.616533,,신내6대주,202309,신내로19길 42,1,0.203699


In [81]:
# Count missing values per column.
df_subway.isna().sum()

복도유형                 0
난방방식                 0
시군구                  0
전용면적                 0
계약일                  0
층                    0
건축년도                 0
k-전체동수          877207
k-전체세대수         876125
k-연면적           876125
건축면적            876279
x                    0
y                    0
target            9272
아파트명              2136
계약년월                 0
도로명                  0
is_test              0
sub_min_dist         0
dtype: int64

In [82]:
# Split the merged table back into its train (0) and test (1) rows via the is_test flag.
origin_flag = df_subway['is_test']
subway_train = df_subway.loc[origin_flag.eq(0)]
subway_test = df_subway.loc[origin_flag.eq(1)]

display(subway_train, subway_test)

Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test,sub_min_dist
0,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,3,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,124000.0,개포6차우성,201712,언주로 3,0,1.129775
1,계단식,개별난방,서울특별시 강남구 개포동,79.97,22,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,123500.0,개포6차우성,201712,언주로 3,0,1.129775
2,계단식,개별난방,서울특별시 강남구 개포동,54.98,28,5,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,91500.0,개포6차우성,201712,언주로 3,0,1.129775
3,계단식,개별난방,서울특별시 강남구 개포동,79.97,3,4,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,130000.0,개포6차우성,201801,언주로 3,0,1.129775
4,계단식,개별난방,서울특별시 강남구 개포동,79.97,8,2,1987,8.0,270.0,22637.0,4858.0,127.057210,37.476763,117000.0,개포6차우성,201801,언주로 3,0,1.129775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126418,혼합식,개별난방,서울특별시 강동구 둔촌동,73.98,2,2,1995,,,,,127.141232,37.530097,23800.0,성원,200707,양재대로102가길 22,0,0.509011
1126419,혼합식,개별난방,서울특별시 강동구 둔촌동,73.98,25,11,1995,,,,,127.141232,37.530097,26000.0,성원,200709,양재대로102가길 22,0,0.509011
1126420,계단식,지역난방,서울특별시 서초구 반포동,84.07,30,5,1979,,,,,126.996770,37.499668,100000.0,미주,200708,신반포로16길 15-20,0,0.423304
1126421,혼합식,지역난방,서울특별시 송파구 송파동,74.04,7,10,1995,,,,,127.106904,37.509822,31000.0,호수임광,200705,송파대로48길,0,0.472486


Unnamed: 0,복도유형,난방방식,시군구,전용면적,계약일,층,건축년도,k-전체동수,k-전체세대수,k-연면적,건축면적,x,y,target,아파트명,계약년월,도로명,is_test,sub_min_dist
101,계단식,개별난방,서울특별시 강남구 개포동,79.97,26,5,1987,8.0,270.0,22637.0,4858.00,127.057210,37.476763,,개포6차우성,202307,언주로 3,1,1.129775
261,혼합식,지역난방,서울특별시 강남구 개포동,161.00,28,15,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,,개포우성3차,202307,개포로 307,1,0.413746
262,혼합식,지역난방,서울특별시 강남구 개포동,133.46,10,14,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,,개포우성3차,202308,개포로 307,1,0.413746
263,혼합식,지역난방,서울특별시 강남구 개포동,104.43,18,6,1984,5.0,405.0,63304.0,61064.24,127.055990,37.483894,,개포우성3차,202308,개포로 307,1,0.413746
3669,복도식,지역난방,서울특별시 강남구 개포동,74.25,28,8,1983,6.0,940.0,93810.0,6524.00,127.068028,37.487802,,개포주공5단지,202307,삼성로4길 17,1,0.221601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128089,복도식,개별난방,서울특별시 중랑구 신내동,49.77,22,7,1996,,,,,127.093511,37.616533,,신내6대주,202307,신내로19길 42,1,0.203699
1128090,복도식,개별난방,서울특별시 중랑구 신내동,39.84,7,3,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.203699
1128091,혼합식,개별난방,서울특별시 중랑구 신내동,59.76,17,4,1996,,,,,127.093511,37.616533,,신내6대주,202308,신내로19길 42,1,0.203699
1128092,복도식,개별난방,서울특별시 중랑구 신내동,39.84,2,9,1996,,,,,127.093511,37.616533,,신내6대주,202309,신내로19길 42,1,0.203699


In [83]:
# Write both splits to CSV without the index column.
for split_frame, out_path in (
    (subway_train, '../data/interim/subway_train.csv'),
    (subway_test, '../data/interim/subway_test.csv'),
):
    split_frame.to_csv(out_path, index=False)