In [2]:
import os
import sys
import pandas as pd
import numpy as np
import geocoder
import feather
import re
from collections import Counter
import glob

sys.path.append("/Users/ueda/Desktop/kaggle/mynavi/code/src/")
from util_tool import reduce_mem_usage

In [3]:
train = pd.read_csv('../input/train.csv')
target = train["賃料"]
train = train.drop("賃料", axis=1)
test = pd.read_csv('../input/test.csv')

train_length = train.shape[0]
test_length = test.shape[0]
all_df = pd.concat([train, test], axis=0, ignore_index=True)

In [4]:
def area_feature(df_input):
    df_out = pd.DataFrame()
    s = df_input["面積"]
    df_out["float面積"] = [float(i.split("m")[0]) for i in s]
    return df_out

def modify_miss(df_input):
    df_input.loc[df_input["間取り"].str.contains("11R"), "間取り"] = "1R"
    
    df_input.loc[df_input["築年数"].str.contains("520年5ヶ月"), "築年数"] = "52年5ヶ月"
    
    df_input.loc[df_input["築年数"].str.contains("1019年7ヶ月"), "築年数"] = "19年7ヶ月"
    
    df_input.loc[df_input["面積"]=="430.1m2", "面積"] = "43.01m2"
    #idx = df_input["築年数"].str.contains("1019年7ヶ月")
    #df_input["築年数"][idx] = "19年7ヶ月"    
    df_input.loc[df_input["面積"]==("1m2"), "面積"] = "10m2"
    df_input.loc[df_input["面積"]==("5.83m2"), "面積"] = "58.3m2"
    
    "三田線 三田(兵庫県)駅 徒歩14分"
    
    return df_input

def direction_feature(df_input):
    """価格の高い順に値振り分け　南は高いらしい"""
    df_out = pd.DataFrame()
    #direct_dict= {'南':8, '東':5, '西':4, '南東':7, '南西':6, '北':1, '北西':2, '北東':3}
    direct_dict= {'南':1, '東':0.95, '西':0.93, '南東':0.975, '南西':0.965, '北':0.85, '北西':0.89, '北東':0.90}
    df_out["int方角"] = df_input["方角"].map(direct_dict)
    return df_out


def built_floor_feature(df_input):
    """
    所在階の中に{／}を含まないものの中から、現実的な長さのものを一軒家とみなす
    階層情報から、低層、高層、タワーマンションで分割
    
    """
    
    search_house = ["1階建", "2階建", "3階建", "4階建", "2階建（地下1階）", "3階建（地下1階）"] 
    df_out = pd.DataFrame()
    df_out["living_floor"] = df_input["所在階"].str.split('／', expand=True)[0].str.split("（", expand=True)[0].str.replace('[^0-9]', '').fillna(0)
    cnt = -1
    for i in df_out["living_floor"]:
        cnt += 1
        if len(str(i)) == 0:
            df_out["living_floor"][df_out["living_floor"].index[cnt]]=np.nan
            
    df_out["living_floor"] = df_out["living_floor"].astype(float)
    df_out["rent_rate_by_floor"] = 1.01 ** df_out["living_floor"]

    return df_out

all_df = modify_miss(all_df)
all_df = pd.concat([all_df, area_feature(all_df), direction_feature(all_df), built_floor_feature(all_df)], axis=1)

In [5]:
def address_feature(df_input):
    df_out = pd.DataFrame()
    s = df_input["所在地"]
    df_out["所在_区"] = [i.split('区')[0] + '区' for i in s]
    return df_out

all_df = pd.concat([all_df, address_feature(all_df)], axis=1)

org_columns = all_df.columns

In [6]:
META_PATH = '../input/meta_csv/'

In [7]:
mt = []
for p in glob.glob(META_PATH + '*'):
    mt.append(pd.read_csv(p, encoding='shift_jis'))

In [8]:
tmp = pd.concat(mt, axis=1)
tmp = tmp.loc[:,~tmp.columns.duplicated()].drop(["調査年", "/項目"], axis=1)#.describe()
place = tmp["地域"]

In [9]:
obj_data = [pd.Series(tmp[col]).str.replace("[-,]", "") for col in list(tmp.columns) if tmp[col].dtype == "O"]
obj_data = [pd.Series(np.where(series == "X", np.nan, series), name=series.name) for series in obj_data]
num_data = [pd.Series(tmp[col]) for col in list(tmp.columns) if tmp[col].dtype != "O"]

drop_col = []
for v in obj_data:
    try:
        v.astype(float)
    except:
        drop_col.append(v.name)

tmp_ = pd.concat(obj_data, axis=1).drop(drop_col, axis=1).astype(float)


meta_df = pd.concat([place] + num_data + [tmp_], axis=1)
meta_df["地域"] = meta_df["地域"].str.replace(" ", "")
meta_col = [col for col in meta_df.columns if col != "地域"]

In [10]:
meta_dict = meta_df.set_index("地域").to_dict()

In [11]:
tmp2 = [all_df["所在_区"].map(meta_dict[col]).rename(col) for col in meta_col]
meta_features = pd.concat(tmp2, axis=1)
meta_features = reduce_mem_usage(meta_features)

Memory usage of dataframe is 166.56 MB
column =  348
0
50
100
150
200
250
300
Memory usage after optimization is: 80.23 MB
Decreased by 51.8%


In [12]:
use_col = ['A1700_外国人人口【人】',
           'C210851_事業所数（民営）（複合サービス事業）【所】',
           'A1231_年齢中位数【歳】',
           
           '#B01101_総面積（都道府県面積に占める割合）【％】',
           
           'C120110_課税対象所得【千円】',
           '#E0110201_小学校数（可住地面積100km2当たり）【校】',
           '#E0110202_中学校数（可住地面積100km2当たり）【校】',
           '#E0110203_高等学校数（可住地面積100km2当たり）【校】',
           '#I0950102_一般病院数（可住地面積100km2当たり）【施設】',
           
           '#H06127_小売店数（人口千人当たり）【店】',
           '#H06130_飲食店数（人口千人当たり）【店】',
           '#H06131_大型小売店数（人口10万人当たり）【店】',
           '#H06132_百貨店，総合スーパー数（人口10万人当たり）【店】',
           
           
           'H1100_総住宅数【住宅】',
           'H1101_居住世帯あり住宅数【住宅】',
           'H1102_居住世帯なし住宅数【住宅】',
           'H110202_空き家数【住宅】',
           'H110203_建築中住宅数【住宅】',
           'H1201_専用住宅数【住宅】',
           'H1310_持ち家数【住宅】',
           'H1320_借家数【住宅】',
           'H1401_一戸建住宅数【住宅】',
           'H1402_長屋建住宅数【住宅】',
           'H1403_共同住宅数【住宅】',
           'H140301_共同住宅数（木造）【住宅】',

           'H2130_１住宅当たり延べ面積【ｍ2】',
           'H213010_１住宅当たり延べ面積（持ち家）【ｍ2】',
           'H213020_１住宅当たり延べ面積（借家）【ｍ2】',
           
           
           'H2101_居住室の畳数５．９畳以下住宅数【住宅】',
           'H2102_居住室の畳数６．０〜１１．９畳住宅数【住宅】',
           'H2103_居住室の畳数１２．０〜１７．９畳住宅数【住宅】',
           'H2104_居住室の畳数１８．０〜２３．９畳住宅数【住宅】',
           'H2105_居住室の畳数２４．０〜２９．９畳住宅数【住宅】',
           'H2106_居住室の畳数３０．０〜３５．９畳住宅数【住宅】',
           'H2107_居住室の畳数３６．０〜４７．９畳住宅数【住宅】',
           'H2108_居住室の畳数４８．０畳以上住宅数【住宅】',
           
           'H2230_高齢者等用設備住宅数【住宅】',
           'H2110_１住宅当たり居住室数【室】',
           'H211010_１住宅当たり居住室数（持ち家）【室】',
           'H211020_１住宅当たり居住室数（借家）【室】',
           
           
           'H2500_着工居住用建築物床面積【ｍ2】',
           #'H2600_着工新設住宅床面積【ｍ2】',
           #'H2601_着工新設持家床面積【ｍ2】',
           #'H2602_着工新設分譲住宅床面積【ｍ2】',
           #'H2603_着工新設貸家床面積【ｍ2】',
           #'H2604_着工新設給与住宅床面積【ｍ2】',
           'H3100_総世帯数【世帯】',
           'H311110_６５歳以上の世帯員のいる世帯数（住宅・土地統計調査結果）【世帯】',
           
           'H4102_専用住宅の１か月当たり家賃・間代【円】',
           'H4104_専用住宅の１畳当たり家賃【円】',
           "H3300_総世帯人員【人】",
           "H7207_軽自動車等の賦課期日現在台数【台】",
           
           "H7401_家計を主に支える者が雇用者である普通世帯数【世帯】",
           "H740101_家計を主に支える者が雇用者である普通世帯数（通勤時間３０分未満）【世帯】",
           'H740102_家計を主に支える者が雇用者である普通世帯数（通勤時間３０分〜１時間）【世帯】',
           'H740103_家計を主に支える者が雇用者である普通世帯数（通勤時間１時間〜１時間３０分）【世帯】',
           "H740104_家計を主に支える者が雇用者である普通世帯数（通勤時間１時間３０分以上）【世帯】",
           "C2207_従業者数（経済センサス‐基礎調査結果）【人】",
           
           #"H810402_住居地域面積【ｈａ】",
           #"H810404_商業地域面積【ｈａ】",
           
           #'A1101_総人口【人】',
          ]

In [13]:
#use_col.sort()
#use_col

In [14]:
#meta_features[use_col].head()

In [15]:
meta_features["all_house_area"] = meta_features['H2500_着工居住用建築物床面積【ｍ2】'] * meta_features['#B01101_総面積（都道府県面積に占める割合）【％】']
meta_features["num_person_per_family"] = meta_features['A1101_総人口【人】'] / meta_features['H3100_総世帯数【世帯】']
meta_features["old_per_family"] = meta_features['H311110_６５歳以上の世帯員のいる世帯数（住宅・土地統計調査結果）【世帯】'] / meta_features['H3100_総世帯数【世帯】']

meta_features["30min_to_go_work"] = meta_features["H740101_家計を主に支える者が雇用者である普通世帯数（通勤時間３０分未満）【世帯】"] / meta_features["H7401_家計を主に支える者が雇用者である普通世帯数【世帯】"]
meta_features["60min_to_go_work"] = meta_features['H740102_家計を主に支える者が雇用者である普通世帯数（通勤時間３０分〜１時間）【世帯】'] / meta_features["H7401_家計を主に支える者が雇用者である普通世帯数【世帯】"]
meta_features["90min_to_go_work"] = meta_features['H740103_家計を主に支える者が雇用者である普通世帯数（通勤時間１時間〜１時間３０分）【世帯】'] / meta_features["H7401_家計を主に支える者が雇用者である普通世帯数【世帯】"]
meta_features["120min_to_go_work"] = meta_features["H740104_家計を主に支える者が雇用者である普通世帯数（通勤時間１時間３０分以上）【世帯】"] / meta_features["H7401_家計を主に支える者が雇用者である普通世帯数【世帯】"]



In [16]:
meta_features["tree/all_house_num"] = meta_features['H140301_共同住宅数（木造）【住宅】'] / meta_features['H1403_共同住宅数【住宅】']
meta_features["yesliving/living_house_num"] = meta_features['H1101_居住世帯あり住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["having/rent_house_num"] = meta_features['H1310_持ち家数【住宅】'] / meta_features['H1320_借家数【住宅】']
meta_features["single/manshon_house_num"] = meta_features['H1401_一戸建住宅数【住宅】'] / meta_features['H1403_共同住宅数【住宅】']
meta_features["old/all_manshon_house_num"] = meta_features['H140301_共同住宅数（木造）【住宅】'] / meta_features['H1403_共同住宅数【住宅】']
meta_features["old/all_having_house_num"] = meta_features['H1402_長屋建住宅数【住宅】'] / meta_features['H1401_一戸建住宅数【住宅】']
meta_features["余り割合_house_num"] = meta_features['H110202_空き家数【住宅】'] / meta_features['H1100_総住宅数【住宅】']

meta_features["num_school"] = (meta_features['#E0110201_小学校数（可住地面積100km2当たり）【校】'] + meta_features['#E0110202_中学校数（可住地面積100km2当たり）【校】'] + 
                               meta_features['#E0110203_高等学校数（可住地面積100km2当たり）【校】']) * meta_features['#B01101_総面積（都道府県面積に占める割合）【％】']

meta_features["rent/allave_house_area"] = meta_features['H213020_１住宅当たり延べ面積（借家）【ｍ2】'] / meta_features['H2130_１住宅当たり延べ面積【ｍ2】']

meta_features["5.9jou/all_house_num"] = meta_features['H2101_居住室の畳数５．９畳以下住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["6-11.9jou/all_house_num"] = meta_features['H2102_居住室の畳数６．０〜１１．９畳住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["12-17.9jou/all_house_num"] = meta_features['H2103_居住室の畳数１２．０〜１７．９畳住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["18-23.9jou/all_house_num"] = meta_features['H2104_居住室の畳数１８．０〜２３．９畳住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["24-29.9jou/all_house_num"] = meta_features['H2105_居住室の畳数２４．０〜２９．９畳住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["30-35.9jou/all_house_num"] = meta_features['H2106_居住室の畳数３０．０〜３５．９畳住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["36-47.9jou/all_house_num"] = meta_features['H2107_居住室の畳数３６．０〜４７．９畳住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']
meta_features["48-jou/all_house_num"] = meta_features['H2108_居住室の畳数４８．０畳以上住宅数【住宅】'] / meta_features['H1100_総住宅数【住宅】']

In [17]:
meta_features["平均的な賃料相場"] = meta_features['H4104_専用住宅の１畳当たり家賃【円】'] * all_df["float面積"]
meta_features["平均的な賃料相場_方角"] = meta_features["平均的な賃料相場"] * all_df["int方角"]
meta_features["平均的な賃料相場_階数_方角"] = meta_features["平均的な賃料相場"] * all_df["int方角"] * all_df["rent_rate_by_floor"]
meta_features["平均的な賃料相場_階数"] = meta_features["平均的な賃料相場"] * all_df["rent_rate_by_floor"]
#meta_features.corr()
#list(meta_features.columns)
use_col = ["A6108_昼夜間人口比率【％】", "平均的な賃料相場", 'H4104_専用住宅の１畳当たり家賃【円】', 
           "平均的な賃料相場_方角", "平均的な賃料相場_階数_方角", "平均的な賃料相場_階数" #"120min_to_go_work", "30min_to_go_work"
          ]
#meta_features["C120110_課税対象所得【千円】"].hist()

In [18]:
meta_features = meta_features[use_col]

In [50]:
#meta_features["平均賃料_mean"] = meta_features[["平均的な賃料相場_方角", "平均的な賃料相場_階数_方角", "平均的な賃料相場_階数", "平均的な賃料相場"]].mean(axis=1)
#meta_features["平均賃料_std"] = meta_features[["平均的な賃料相場_方角", "平均的な賃料相場_階数_方角", "平均的な賃料相場_階数", "平均的な賃料相場"]].std(axis=1)
#meta_features["平均賃料_min"] = meta_features[["平均的な賃料相場_方角", "平均的な賃料相場_階数_方角", "平均的な賃料相場_階数", "平均的な賃料相場"]].min(axis=1)
#meta_features["平均賃料_max"] = meta_features[["平均的な賃料相場_方角", "平均的な賃料相場_階数_方角", "平均的な賃料相場_階数", "平均的な賃料相場"]].max(axis=1)
meta_features.columns

Index(['A6108_昼夜間人口比率【％】', '平均的な賃料相場', 'H4104_専用住宅の１畳当たり家賃【円】', '平均的な賃料相場_方角',
       '平均的な賃料相場_階数_方角', '平均的な賃料相場_階数'],
      dtype='object')

In [53]:
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler()
scalar.fit(all_eki_df[["0_people", "1_people", "2_people"]])
normalized_people = pd.DataFrame(scalar.transform(all_eki_df[["0_people", "1_people", "2_people"]]))

for col in ['平均的な賃料相場', '平均的な賃料相場_方角','平均的な賃料相場_階数_方角', '平均的な賃料相場_階数']:
    meta_features[f"駅_0利用率*{col}"] = meta_features[col] * normalized_people[0]
    meta_features[f"駅_1利用率*{col}"] = meta_features[col] * normalized_people[1]
    meta_features[f"駅_2利用率*{col}"] = meta_features[col] * normalized_people[2]

In [55]:
output_dir = '../code/feature_csv/'
meta_features.to_feather(os.path.join(output_dir, 'meta_features.feather'))
meta_features.columns

Index(['A6108_昼夜間人口比率【％】', '平均的な賃料相場', 'H4104_専用住宅の１畳当たり家賃【円】', '平均的な賃料相場_方角',
       '平均的な賃料相場_階数_方角', '平均的な賃料相場_階数', '駅_0利用率*平均的な賃料相場', '駅_1利用率*平均的な賃料相場',
       '駅_2利用率*平均的な賃料相場', '駅_0利用率*平均的な賃料相場_方角', '駅_1利用率*平均的な賃料相場_方角',
       '駅_2利用率*平均的な賃料相場_方角', '駅_0利用率*平均的な賃料相場_階数_方角', '駅_1利用率*平均的な賃料相場_階数_方角',
       '駅_2利用率*平均的な賃料相場_階数_方角', '駅_0利用率*平均的な賃料相場_階数', '駅_1利用率*平均的な賃料相場_階数',
       '駅_2利用率*平均的な賃料相場_階数'],
      dtype='object')

# 駅情報

In [20]:
import geopandas
import feather


In [21]:
eki_geoson = geopandas.read_file("../input/S12-17_GML/S12-17_NumberOfPassengers.geojson")
access_df = feather.read_dataframe("../code/feature_csv/access_feature.feather")
base_df = feather.read_dataframe("../code/feature_csv/base_feature1.feather")

In [22]:
access_df["路線_0"].value_counts().to_dict()

{'山手線': 5112,
 '総武線・中央線（各停）': 2806,
 '都営大江戸線': 2716,
 '東西線': 2693,
 '都営三田線': 2609,
 '西武池袋線': 2441,
 '有楽町線': 2309,
 '都営新宿線': 2268,
 '京浜東北線': 2048,
 '日比谷線': 1970,
 '京王線': 1960,
 '東急田園都市線': 1948,
 '千代田線': 1895,
 '丸ノ内線(池袋－荻窪)': 1855,
 '東武スカイツリーライン': 1698,
 '東武東上線': 1653,
 '中央線（快速）': 1602,
 '都営浅草線': 1578,
 '小田急小田原線': 1578,
 '西武新宿線': 1521,
 '東急東横線': 1432,
 '南北線': 1357,
 '京急本線': 1236,
 '半蔵門線': 1127,
 '東急池上線': 1064,
 '常磐線': 1057,
 '京王井の頭線': 1009,
 '日暮里・舎人ライナー': 876,
 '京成本線': 839,
 '東急目黒線': 826,
 '東急大井町線': 732,
 '丸ノ内線(中野坂上－方南町)': 644,
 'つくばエクスプレス': 627,
 '銀座線': 617,
 '埼京線': 616,
 '京王新線': 533,
 '京成押上線': 480,
 '総武本線': 415,
 '東急多摩川線': 387,
 '副都心線': 379,
 '東急世田谷線': 361,
 '京急空港線': 329,
 '京葉線(東京－蘇我)': 184,
 '常磐線快速': 184,
 'ゆりかもめ': 140,
 '京成金町線': 131,
 '横須賀線': 122,
 'りんかい線': 121,
 '東武亀戸線': 115,
 '都電荒川線': 102,
 '西武有楽町線': 77,
 '東武大師線': 77,
 '高崎線': 75,
 '東京モノレール羽田線': 44,
 '東武伊勢崎線(押上－曳舟)': 38,
 '北総線': 38,
 '東海道新幹線': 24,
 '西武豊島線': 23,
 '東海道本線': 12,
 '東北本線（宇都宮線）': 12,
 '湘南新宿ライン(東北本線－横須賀線)': 5,
 '中央本線(東京－松本)

In [23]:
access_df.head()

Unnamed: 0,路線_0,駅_0,駅からの距離_0,路線_1,駅_1,駅からの距離_1,路線_2,駅_2,駅からの距離_2,駅_0からの距離_mean,...,駅_2からの距離_mean_diff,access_way_count,駅_0_緯度,駅_0_経度,駅_1_緯度,駅_1_経度,駅_2_緯度,駅_2_経度,catuse_路線_0,catuse_駅_0
0,都営三田線,西巣鴨駅,320,埼京線,板橋駅,1120.0,都電荒川線,西ケ原四丁目駅,560.0,654.704595,...,-116.756757,3,35.743717,139.728583,35.745564,139.720024,35.744734,139.733086,都営三田線,西巣鴨駅
1,都営大江戸線,勝どき駅,400,有楽町線,月島駅,720.0,日比谷線,築地駅,1600.0,651.213873,...,-650.03096,3,35.658962,139.777175,35.663861,139.784085,35.66788,139.772516,都営大江戸線,勝どき駅
2,京王線,笹塚駅,480,京王線,代田橋駅,560.0,京王線,明大前駅,1360.0,1048.602151,...,-166.0,3,35.674556,139.667221,35.670976,139.658763,35.668379,139.65088,京王線,笹塚駅
3,総武線・中央線（各停）,高円寺駅,720,丸ノ内線(池袋－荻窪),新高円寺駅,240.0,丸ノ内線(池袋－荻窪),東高円寺駅,1120.0,1062.289417,...,-252.207407,3,35.704942,139.649909,35.697908,139.648405,35.697881,139.657623,総武線・中央線（各停）,高円寺駅
4,京成金町線,京成金町駅,400,常磐線,金町(東京都)駅,560.0,京成金町線,柴又駅,1360.0,1451.552795,...,81.170213,3,35.768598,139.870441,35.769538,139.870476,35.756573,139.875197,京成金町線,京成金町駅


In [24]:
use_railway = ['東日本旅客鉄道', '東武鉄道', '東京地下鉄', '東京都', '東京急行電鉄', '西武鉄道','京浜急行電鉄',
 '京成電鉄','札幌市','京王電鉄', 'ゆりかもめ', '東京モノレール', '東京臨海高速鉄道']

#eki_geoson["運営会社"].value_counts().to_dict()

In [25]:
"""実行は一度だけ"""
#eki_geoson[eki_geoson["運営会社"]=='東京地下鉄']
#use_cols = ['駅名', '運営会社', '路線名', '鉄道区分', '事業者種別', '重複2016', '有無2016', '備考2016', '乗降客数16']
#use_cols = ['駅名', '運営会社', '路線名', '乗降客数16']
#eki_geoson.loc[eki_geoson["運営会社"]=='東京地下鉄', "路線名"] = eki_geoson[eki_geoson["運営会社"]=='東京地下鉄'][use_cols]["路線名"].str.split("号線", expand=True)[1].values



'実行は一度だけ'

In [26]:
#eki_geoson[eki_geoson["運営会社"]==use_railway[1]]["路線名"].value_counts()#[use_cols]

In [27]:
eki_0 = access_df["駅_0"].str.split("駅", expand=True)
eki_0["駅名"] = np.where(eki_0[1]=="前", eki_0[0].astype(str) + "駅" + eki_0[1].astype(str), eki_0[0].astype(str))
eki_0_df = eki_0["駅名"].str.split("(", expand=True)[[0]]
eki_0_df.columns = ["eki0_name"]


eki_1 = access_df["駅_1"].str.split("駅", expand=True)
eki_1["駅名"] = np.where(eki_1[1]=="前", eki_1[0].astype(str) + "駅" + eki_1[1].astype(str), eki_1[0].astype(str))
eki_1_df = eki_1["駅名"].str.split("(", expand=True)[[0]]
eki_1_df.columns = ["eki1_name"]


eki_2 = access_df["駅_2"].str.split("駅", expand=True)
eki_2["駅名"] = np.where(eki_2[1]=="前", eki_2[0].astype(str) + "駅" + eki_2[1].astype(str), eki_2[0].astype(str))
eki_2_df = eki_2["駅名"].str.split("(", expand=True)[[0]]
eki_2_df.columns = ["eki2_name"]

In [28]:
all_eki_df = pd.concat([eki_0_df, eki_1_df, eki_2_df], axis=1)

In [29]:
tokyo_df = eki_geoson[eki_geoson.centroid.x.astype(int) == 139][eki_geoson.centroid.y.astype(int) == 35]
tokyo_df = tokyo_df.reset_index(drop=True)

  result = super(GeoDataFrame, self).__getitem__(key)


In [30]:
tokyo_dict = tokyo_df.groupby("駅名").sum()[["乗降客数14", "乗降客数15", "乗降客数16"]].max(axis=1).to_dict()

In [31]:
def num_people(text):
    if type(text)==float:
        return np.nan
    else:
        if text == "NaN":
            return np.nan
        else:         
            text = re.sub("ケ", "ヶ", text)
            text = re.sub("・", "･", text)
            return tokyo_dict[text]
    
all_eki_df["0_people"] = all_eki_df["eki0_name"].apply(lambda x:num_people(x))
all_eki_df["1_people"] = all_eki_df["eki1_name"].apply(lambda x:num_people(x))
all_eki_df["2_people"] = all_eki_df["eki2_name"].apply(lambda x:num_people(x))

In [32]:
all_eki_df["駅からの距離_0"] = access_df["駅からの距離_0"]
all_eki_df["駅からの距離_1"] = access_df["駅からの距離_1"]
all_eki_df["駅からの距離_2"] = access_df["駅からの距離_2"]

In [33]:
all_eki_df["mean_people"] = all_eki_df[["0_people", "1_people", "2_people"]].mean(axis=1)
all_eki_df["max_people"] = all_eki_df[["0_people", "1_people", "2_people"]].max(axis=1)
all_eki_df["min_people"] = all_eki_df[["0_people", "1_people", "2_people"]].min(axis=1)
all_eki_df["std_people"] = all_eki_df[["0_people", "1_people", "2_people"]].std(axis=1)


all_eki_df["people/meter_0"] = all_eki_df["0_people"] / all_eki_df["駅からの距離_0"]
all_eki_df["people/meter_1"] = all_eki_df["1_people"] / all_eki_df["駅からの距離_1"]
all_eki_df["people/meter_2"] = all_eki_df["2_people"] / all_eki_df["駅からの距離_2"]

all_eki_df["mean_people/meter_0"] = all_eki_df[["people/meter_0", "people/meter_1", "people/meter_2"]].mean(axis=1)
all_eki_df["max_people/meter_0"] = all_eki_df[["people/meter_0", "people/meter_1", "people/meter_2"]].max(axis=1)
all_eki_df["min_people/meter_0"] = all_eki_df[["people/meter_0", "people/meter_1", "people/meter_2"]].min(axis=1)
all_eki_df["std_people/meter_0"] = all_eki_df[["people/meter_0", "people/meter_1", "people/meter_2"]].std(axis=1)

In [34]:
all_eki_df["所在_区"] = all_df["所在_区"]
meta_features["平均的な賃料相場"]
#all_eki_df.groupby("所在_区").max()

0         95847.90
1        119212.50
2        149058.00
3        350300.16
4        161059.26
           ...    
62727    156833.92
62728    143663.46
62729    289294.72
62730    349195.20
62731    113884.86
Name: 平均的な賃料相場, Length: 62732, dtype: float64

In [35]:
already_col = ['eki0_name', 'eki1_name', 'eki2_name','駅からの距離_0', '駅からの距離_1', '駅からの距離_2', "所在_区"]

use_cols = [col for col in all_eki_df.columns if col not in already_col]

In [143]:
output_dir = '../code/feature_csv/'
all_eki_df[use_cols].to_feather(os.path.join(output_dir, 'eki_people.feather'))
all_eki_df.columns

Index(['eki0_name', 'eki1_name', 'eki2_name', '0_people', '1_people',
       '2_people', '駅からの距離_0', '駅からの距離_1', '駅からの距離_2', 'mean_people',
       'max_people', 'min_people', 'std_people', 'people/meter_0',
       'people/meter_1', 'people/meter_2', 'mean_people/meter_0',
       'max_people/meter_0', 'min_people/meter_0', 'std_people/meter_0',
       '所在_区'],
      dtype='object')

# 地価

In [144]:
import xml.etree.ElementTree as ET
import geopandas
import feather

In [145]:
geoson = geopandas.read_file("../input/L01-19_13_GML/L01-19_13.geojson")

In [146]:
geoson = geoson[geoson.centroid.x.astype(int) == 139][geoson.centroid.y.astype(int) == 35]
geoson = geoson[["geometry", "L01_006", "L01_024", "L01_045", "L01_046"]]
geoson[["L01_006", "L01_024", "L01_046"]] = geoson[["L01_006", "L01_024", "L01_046"]].astype(int)

#006:価格、 024:地籍、　046:駅からの距離

  result = super(GeoDataFrame, self).__getitem__(key)


In [147]:
#geoson["L01_045"].value_counts().to_dict()
all_eki_df["eki0_name"]
all_eki_df["eki0_tika"] = all_eki_df["eki0_name"].map(geoson.groupby("L01_045").mean()["L01_006"])
all_eki_df["eki1_tika"] = all_eki_df["eki1_name"].map(geoson.groupby("L01_045").mean()["L01_006"])
all_eki_df["eki2_tika"] = all_eki_df["eki2_name"].map(geoson.groupby("L01_045").mean()["L01_006"])

In [149]:
tika_df = all_eki_df[["eki0_tika", "eki1_tika", "eki2_tika"]]
tika_df["tika_mean"] = tika_df.mean(axis=1)
tika_df["tika_min"] = tika_df.min(axis=1)
tika_df["tika_max"] = tika_df.max(axis=1)
tika_df["tika_std"] = tika_df.std(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

In [157]:
tika_df["tika×area"] = tika_df["tika_mean"] * all_df["float面積"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [158]:
output_dir = '../code/feature_csv/'
tika_df.to_feather(os.path.join(output_dir, 'tika_feature.feather'))
tika_df.columns

Index(['eki0_tika', 'eki1_tika', 'eki2_tika', 'tika_mean', 'tika_min',
       'tika_max', 'tika_std', 'tika×area'],
      dtype='object')

In [127]:
len(train+test)

62732