In [1]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 2.8MB/s eta 0:00:011
Collecting branca>=0.3.0
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.1


In [2]:
import os,sys,time,re,pickle
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
%matplotlib inline

pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 150)

import folium

In [6]:
path = '../input'


def _load_pickle(name, base=path):
    """Return the object stored in ``<base>/data/pr_data/<name>.pickle``.

    The path is assembled with os.path.join so that a trailing slash on ``base``
    does not produce a doubled separator.

    NOTE(review): pickle.load can execute arbitrary code while unpickling; only
    point this at files produced by the project's own preprocessing step.
    """
    file_path = os.path.join(base, 'data', 'pr_data', '{}.pickle'.format(name))
    with open(file_path, mode='rb') as f:
        return pickle.load(f)


# Load the preprocessed data (column names translated to English, dtypes already converted).
train = _load_pickle('train')
test = _load_pickle('test')
stop_station_location = _load_pickle('stop_station_location')
tunnel_location = _load_pickle('tunnel_location')
# Per the original note: timetable data with rows and columns swapped.
diagram = _load_pickle('diagram')
kanazawa_nosnow = _load_pickle('kanazawa_nosnow')
weather = _load_pickle('weather')

# Quick look at what was loaded.
print('train_shape:',train.shape)
display(train.head(3))
print('test_shape:',test.shape)
display(test.head(3))
display(stop_station_location.head(1))
display(tunnel_location.head(1))
display(diagram.head(3))
display(kanazawa_nosnow)
display(weather.head(1))


FileNotFoundError: [Errno 2] No such file or directory: '../input//data/pr_data/train.pickle'

In [None]:
train_shape: (15315, 10)

In [None]:
test_shape: (2430, 7)

In [None]:
array([2290,  552, 3502, 9536,  554, 2292, 3506, 3508,  560,  564,  566,
        568, 8528, 8530, 3510,  572, 8532, 3512, 3514, 3516,  728,  734],
      dtype=int32)

## 2.foliumでのマッピング

In [None]:
# Draw the stations and tunnels on a folium map.
map1 = folium.Map(location=[36.876868, 137.247129], zoom_start=9, tiles='openstreetmap')

# Station markers: the three stations listed here are red, all other (local-stop) stations are blue.
highlighted_stations = ['金沢','富山','長野']
for station_row in stop_station_location.values.tolist():
    station_name = station_row[0]
    marker_color = 'red' if station_name in highlighted_stations else 'blue'
    marker = folium.Marker(
        [station_row[2], station_row[3]],
        popup='{}駅'.format(station_name),
        icon=folium.Icon(color=marker_color),
    )
    marker.add_to(map1)

# Tunnels: one green line per row, drawn between the row's two coordinate pairs.
for tunnel_row in tunnel_location.values.tolist():
    endpoints = [[tunnel_row[3], tunnel_row[4]], [tunnel_row[5], tunnel_row[6]]]
    tunnel_line = folium.PolyLine(endpoints, popup=tunnel_row[0], color="green", weight=5, opacity=1)
    tunnel_line.add_to(map1)

# Save the map as HTML next to the data and show it inline.
map1.save('{}/data/map1.html'.format(path))
display(map1)

In [None]:
# Length of each tunnel: end position minus start position.
tunnel_length = tunnel_location.copy().set_index('tunnel_name').astype(float)
tunnel_length['tunnel_length'] = tunnel_length['end_length'] - tunnel_length['start_length']

# Per-station table. 'total_length' is the original 'length' column and the new
# 'length' is its row-to-row difference (NaN for the first row).
station_data = (
    stop_station_location[['stop_station', 'longitude', 'latitude', 'length']]
    .copy()
    .rename(columns={'length': 'total_length'})
)
station_data['length'] = station_data['total_length'].diff()

# Decide which station section each tunnel belongs to.
# Stations are visited in reverse row order, skipping the last row of station_data;
# every tunnel whose 'start_latitude' is below the station's 'latitude' (column 2) is
# labelled with that station, so later iterations overwrite earlier ones.
# NOTE(review): tunnels that match no station keep a NaN label and are dropped by the
# groupby below - confirm that skipping the last station is intended.
station_li = list(reversed(station_data.values.tolist()))
for sl in station_li[1:]:
    tunnel_length.loc[tunnel_length['start_latitude'] < sl[2], 'stop_station'] = sl[0]

# Total tunnel length per station section, attached to the station table.
tunnel_per_station = tunnel_length.groupby('stop_station')['tunnel_length'].sum()
station_data = pd.merge(station_data, tunnel_per_station, how='left', left_on='stop_station', right_index=True)

# Sections without any tunnel come out of the left merge as NaN. Treat them as 0 so
# that their whole length counts as open-air distance instead of turning into NaN
# (which cumsum would silently leave out of the running total).
station_data['tunnel_length'] = station_data['tunnel_length'].fillna(0)

# Open-air (non-tunnel) length per section and its running total.
station_data['field_length'] = station_data['length'] - station_data['tunnel_length']
station_data['field_total_length'] = station_data['field_length'].cumsum()
display(station_data.head())

## おまけEDA：屋外距離と着雪量の関係

In [None]:
# Compare the snow accretion between the Kanazawa-Toyama and Toyama-Itoigawa sections.
# Condition (per the original note): accretion at Kanazawa station is 0, and the train
# that picked up snow stops at both Toyama and Itoigawa.

# Flag rows whose rail_number is listed in kanazawa_nosnow (1.0 if listed, 0.0 otherwise).
train['start_from_zero'] = train['rail_number'].isin(kanazawa_nosnow).astype(float)

# Combine both conditions into one mask. Chaining two boolean selections would apply a
# full-length mask to an already filtered frame, which pandas has to reindex (with a warning).
has_snow = train['sum'] > 0
starts_from_zero = train['start_from_zero'] == 1
data = train[has_snow & starts_from_zero].sort_values(['date','rail_number','stop_station'])

# Keep only rows whose first two columns occur more than once.
# NOTE(review): presumably the first two columns are date and rail_number - confirm.
# .copy() makes this an independent frame so the column assignments below do not
# write into a slice of `train`.
data = data[data.iloc[:,:2].duplicated(keep=False)].copy()

# Accretion per section within each (date, rail_number) group:
#   'kanazawa-toyama' = 'sum' of the group's first row (rows are sorted by stop_station above)
#   'toyama-itoi'     = max('sum') - min('sum') within the group
by_run = data.groupby(['date','rail_number'])['sum']
data['kanazawa-toyama'] = by_run.transform(lambda x: x.iloc[0])
data['toyama-itoi'] = by_run.transform(lambda x: x.max()-x.min())
# NOTE(review): the ratio is inf when 'toyama-itoi' is 0 - check whether such groups occur.
data['section_ratio'] = data['kanazawa-toyama']/data['toyama-itoi']

# One row per group: keep the repeated occurrences only (the first occurrence is dropped).
data_uni = data[data.iloc[:,:2].duplicated()]

display(data.head(6))
display(data_uni)

In [None]:
fig = plt.figure(facecolor='w',figsize=(6,6))

# Left panel: snow accretion in the two sections.
# The columns are passed as a wide-form DataFrame so that seaborn draws one box per
# column and labels it with the column name; the target axes is given explicitly.
ax = fig.add_subplot(1, 2, 1)
sns.boxplot(data=data_uni[['kanazawa-toyama','toyama-itoi']], ax=ax)
ax.set_ylabel('target')

# Right panel: ratio of the accretion in the two sections.
ax = fig.add_subplot(1, 2, 2)
sns.boxplot(data=data_uni[['section_ratio']], ax=ax)
ax.set_ylabel('ratio')

fig.subplots_adjust(hspace=.2,wspace=.2)
plt.show()

# Open-air length of each section, read from the cumulative open-air length.
# NOTE(review): presumably rows 2 and 4 of station_data are Toyama and Itoigawa -
# verify against the station order before relying on these positions.
len_kanazawa_toyama = station_data['field_total_length'].iloc[2]
len_toyama_itoi = station_data['field_total_length'].iloc[4]- station_data['field_total_length'].iloc[2]
length_ratio = len_kanazawa_toyama/len_toyama_itoi

# Print the length ratio next to the observed accretion ratios for comparison.
print('section_length\nkanazawa-toyama:{:.2f}km\ntoyama-itoi:{:.2f}km'.format(len_kanazawa_toyama,len_toyama_itoi))
print('length_ratio(kanazawa-toyama/toyama-itoi):{}'.format(length_ratio))
print('------------------------')
print('target_ratio(kanazawa_toyama/toyama-itoi):\n{}'.format(data_uni['section_ratio'].values))

In [None]:
# Attach the weather records to the per-section accretion data.
data_w = data.copy()

# Hour component of the diagram's '金沢' column, matched to rail_number through the index.
start_hour = pd.Series(pd.to_datetime(diagram['金沢']).dt.hour,name='start_hour')
data_w = pd.merge(data_w,start_hour,how='left',left_on='rail_number',right_index=True)

# Build the timestamp as date + start hour. Doing this arithmetically (instead of
# concatenating strings) keeps working when the left merge leaves a missing hour:
# the column is then float, which would format as e.g. "9.0:00" or "nan:00", whereas
# here a missing hour simply becomes NaT.
# NOTE(review): assumes 'date' carries no time-of-day component - confirm.
data_w['datetime'] = pd.to_datetime(data_w['date']) + pd.to_timedelta(data_w['start_hour'], unit='h')

# Match weather rows on station name (weather's 'location') and timestamp.
data_w = pd.merge(data_w,weather,how='left',left_on=['stop_station','datetime'],right_on=['location','datetime'])
data_w = data_w.drop(['start_hour','location'],axis=1)
data_w = data_w.sort_values(['section_ratio','rail_number','stop_station'],ascending=False)

# Show the ten rows with the largest section_ratio (the frame is sorted in descending order).
display(data_w[:10])