# 特徴量の作成用（最寄り駅情報）

In [1]:
import os
# Work from the project root so the relative 'input/...' paths used below resolve
# (presumably this is also what makes the `src` package importable -- confirm).
# The root can be overridden with the ACCOMMODATION_PROJECT_ROOT environment
# variable so the notebook runs on other machines; the default is the original
# author's local checkout, which keeps the previous behaviour unchanged.
os.chdir(os.environ.get(
    'ACCOMMODATION_PROJECT_ROOT',
    '/Users/kinoshitashouhei/Desktop/competitions/05_Prob_Space/accommodation_charge_prediction/',
))

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import warnings
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
from geopy.distance import geodesic

from src.config import *
import src.preprocessing as pr

# Silence warnings for the rest of the session: no category or module filter is
# given, so the 'ignore' action applies to every warning.
warnings.filterwarnings('ignore')

In [3]:
# Load the input tables. Train and test are parsed identically (COL_LAST_REVIEW
# as a date, dtypes from DICT_DTYPES); the station list and the sample
# submission are read with pandas defaults.
df_train, df_test = (
    pd.read_csv(csv_path, parse_dates=[COL_LAST_REVIEW], dtype=DICT_DTYPES)
    for csv_path in ('input/train_data.csv', 'input/test_data.csv')
)
df_station_list = pd.read_csv('input/station_list.csv')
sample_sub = pd.read_csv('input/submission.csv')

In [4]:
# Columns taken from both train and test when building the combined frame.
LIST_USE_COL = [
    # location
    COL_NEIGHBOURHOOD,
    COL_LATITUDE,
    COL_LONGITUDE,
    # listing attributes
    COL_ROOM_TYPE,
    COL_MINIMUM_NIGHTS,
    # review activity and availability
    COL_NUMBER_OF_REVIEWS,
    COL_REVIEWS_PER_MONTH,
    COL_AVAILABILITY_365,
]

In [5]:
# Columns earmarked for label encoding (judging by the name; this list is not
# used anywhere in the visible cells).
LIST_LABEL_ENC = [
    COL_NEIGHBOURHOOD,
    COL_ROOM_TYPE,
]

In [6]:
# Stack the selected columns of train on top of test and renumber the rows
# 0..n-1, so the first len(df_train) rows are the training rows.
df_all = pd.concat(
    [frame[LIST_USE_COL] for frame in (df_train, df_test)]
).reset_index(drop=True)

In [7]:
# Replace missing values in every column of the combined frame with 0.
df_all = df_all.fillna(0)

#### 2点間の距離から、最寄りの駅を計算

In [46]:
%%time
# For every listing in df_all, find the position of the closest station in
# df_station_list, measured by geodesic distance in kilometres.
#
# The station coordinates do not change between listings, so they are extracted
# once as (latitude, longitude) pairs instead of being looked up with
# `.loc[j, col]` for every listing/station pair. Rows are taken positionally,
# which matches the previous label-based lookup because both frames carry a
# default 0..n-1 index at this point.
station_coordinates = list(zip(df_station_list[COL_LATITUDE].to_numpy(),
                               df_station_list[COL_LONGITUDE].to_numpy()))
listing_coordinates = zip(df_all[COL_LATITUDE].to_numpy(),
                          df_all[COL_LONGITUDE].to_numpy())

# NOTE: despite its name, list_distance holds the index of the nearest station
# (not a distance); the name is kept because a later cell reads it.
list_distance = []
for coordinate in listing_coordinates:
    distances_km = [geodesic(coordinate, station).km for station in station_coordinates]
    # index(min(...)) returns the first station on ties, exactly as before.
    list_distance.append(distances_km.index(min(distances_km)))

CPU times: user 7min 23s, sys: 776 ms, total: 7min 24s
Wall time: 7min 24s


In [50]:
# Attach the per-listing nearest-station position as a new column.
df_all = df_all.assign(nearest_station_index=list_distance)

In [51]:
# Preview the first rows of df_all, which now carry nearest_station_index.
df_all.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,availability_365,nearest_station_index
0,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2.21,173,154
1,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2.11,9,21
2,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,3.46,288,91
3,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,1.76,87,110
4,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2.0,156,11


In [59]:
# Expose the station table's row index as a join key with the same name as the
# column on df_all.
df_station_list = df_station_list.assign(nearest_station_index=df_station_list.index)

In [60]:
# Preview the station table together with its nearest_station_index key column.
df_station_list.head()

Unnamed: 0,station_name,longitude,latitude,No,nearest_station_index
0,東京,139.766103,35.681391,0,0
1,新橋,139.758587,35.666195,1,1
2,品川,139.738999,35.62876,2,2
3,大崎,139.728439,35.619772,3,3
4,五反田,139.723822,35.625974,4,4


In [61]:
# Left-join the station name onto every listing through nearest_station_index;
# only the name and the key are taken from the station table.
df_merge = df_all.merge(
    df_station_list[[COL_STATION_NAME, 'nearest_station_index']],
    how='left',
    on='nearest_station_index',
)

In [62]:
# Preview the merged frame; the station name was joined on nearest_station_index.
df_merge.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,reviews_per_month,availability_365,nearest_station_index,station_name
0,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2.21,173,154,月島
1,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2.11,9,21,鶯谷
2,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,3.46,288,91,鐘ヶ淵
3,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,1.76,87,110,幡ヶ谷
4,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2.0,156,11,新大久保


In [65]:
# Keep the identifier columns of each dataset so they can be put next to the
# station features.
df_train_join = df_train.loc[:, [COL_ID, COL_HOST_ID]]
df_test_join = df_test.loc[:, [COL_ID, COL_HOST_ID]]

In [71]:
# Split the merged frame back into train and test parts by position: the first
# len(df_train) rows are the training rows, the rest are the test rows.
# The split is only valid if the merge neither added nor dropped rows, so that
# invariant is checked instead of silently mislabelling rows.
assert len(df_merge) == len(df_train) + len(df_test), \
    'merge changed the row count; positional train/test split is invalid'

# COL_STATION_NAME is used (rather than a literal) to match the column that the
# merge cell selected from the station table.
df_train_station_info = (
    df_merge.iloc[:df_train.shape[0]]
    .reset_index(drop=True)[['nearest_station_index', COL_STATION_NAME]]
)
df_test_station_info = (
    df_merge.iloc[df_train.shape[0]:]
    .reset_index(drop=True)[['nearest_station_index', COL_STATION_NAME]]
)

In [72]:
# Preview the station columns split off for the training rows.
df_train_station_info.head()

Unnamed: 0,nearest_station_index,station_name
0,154,月島
1,21,鶯谷
2,91,鐘ヶ淵
3,110,幡ヶ谷
4,11,新大久保


In [76]:
# Put the id columns in front of the station features for the training rows.
# The station columns are selected explicitly so that re-running this cell
# (where df_train_station_info already contains the id columns) produces the
# same frame instead of duplicating id/host_id.
df_train_station_info = pd.concat(
    [
        df_train_join,
        df_train_station_info[['nearest_station_index', COL_STATION_NAME]],
    ],
    axis=1,
)

In [77]:
# Preview the training station info with the id columns prepended.
df_train_station_info.head()

Unnamed: 0,id,host_id,nearest_station_index,station_name
0,1,242899459,154,月島
1,2,308879948,21,鶯谷
2,3,300877823,91,鐘ヶ淵
3,4,236935461,110,幡ヶ谷
4,5,243408889,11,新大久保


In [79]:
# Put the id columns in front of the station features for the test rows.
# The station columns are selected explicitly so that re-running this cell
# (where df_test_station_info already contains the id columns) produces the
# same frame instead of duplicating id/host_id.
df_test_station_info = pd.concat(
    [
        df_test_join,
        df_test_station_info[['nearest_station_index', COL_STATION_NAME]],
    ],
    axis=1,
)

In [81]:
# Persist the station features next to the raw inputs, without the row index.
df_train_station_info.to_csv(
    path_or_buf='input/train_data_station_info.csv',
    index=False,
)
df_test_station_info.to_csv(
    path_or_buf='input/test_data_station_info.csv',
    index=False,
)