# Kaggle Bike Sharing Demand(レンタサイクルの需要予測)

## データの確認

In [747]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # グラフ描画用
import japanize_matplotlib
import seaborn as sns; sns.set() # グラフ描画用
from sklearn.preprocessing import OneHotEncoder

In [748]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
df = pd.concat([train, test])
df = df.reset_index(drop=True)

## データの加工

やること
- 日付から年、月、日、時間の列を生成 ok
- season番号の振り直し ok
- weekend列の作成 ok
- 新規利用者比率
- 降水量の推定. weatherとhumidityで特徴量作れるか
- キャンペーン日の特定(月初めの決まった曜日？)

In [749]:
df['datetime'] = pd.to_datetime(df['datetime'])
df['year'] = pd.to_datetime(df['datetime']).dt.strftime('%Y')
df['month'] = pd.to_datetime(df['datetime']).dt.strftime('%m')
df['day'] = pd.to_datetime(df['datetime']).dt.strftime('%d')
df['weekday'] = pd.to_datetime(df['datetime']).dt.strftime('%a')
df['hour'] = pd.to_datetime(df['datetime']).dt.strftime('%H')

In [750]:
df = df.sort_values('datetime')
period=24*30
df['casual_mov_ave'] = df['casual'].rolling(window=period, center=True).mean()
df['registered_mov_ave'] = df['registered'].rolling(window=period, center=True).mean()

In [751]:
df['weekend'] = 0
df.loc[(df['holiday'].values==0) & (df['workingday'].values==0),'weekend'] = 1

In [752]:
df['day_type'] = 'holiday'
df.loc[(df['workingday'].values==1), 'day_type'] = 'workingday'
df.loc[(df['weekend'].values==1), 'day_type'] = 'weekend'

In [753]:
df['month'] = df['month'].astype('int')
spring_begin = 3
summer_begin = 6
fall_begin = 9
df['season_rev'] = 4 #12,1,2月は面倒なので先に代入
df.loc[(df['month'].values >= spring_begin) & (df['month'].values < (spring_begin+3)),'season_rev'] = 1
df.loc[(df['month'].values >= summer_begin) & (df['month'].values < (summer_begin+3)),'season_rev'] = 2
df.loc[(df['month'].values >= fall_begin  ) & (df['month'].values < (fall_begin+3  )),'season_rev'] = 3
# df.head()

In [754]:
df['casual_ratio'] = df['casual']/df['count']*100

## データの前処理
時系列データをint型に変更
day_typeをLabelEncoderで数値に変換

In [755]:
from sklearn.preprocessing import LabelEncoder

In [756]:
df['year'] = df['year'].astype('int')
df['day'] = df['day'].astype('int')
df['hour'] = df['hour'].astype('int')

In [757]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,...,month,day,weekday,hour,casual_mov_ave,registered_mov_ave,weekend,day_type,season_rev,casual_ratio
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3.0,...,1,1,Sat,0,,,1,weekend,4,18.75
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8.0,...,1,1,Sat,1,,,1,weekend,4,20.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5.0,...,1,1,Sat,2,,,1,weekend,4,15.625
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3.0,...,1,1,Sat,3,,,1,weekend,4,23.076923
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0.0,...,1,1,Sat,4,,,1,weekend,4,0.0


In [758]:
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'year', 'month', 'day', 'weekday', 'hour', 'casual_mov_ave',
       'registered_mov_ave', 'weekend', 'day_type', 'season_rev',
       'casual_ratio'],
      dtype='object')

In [759]:
df.drop(['holiday', 'workingday', 'season', 'atemp', 'weekend', 'count', 'casual_ratio','year', 'month', 'day'], axis=1, inplace=True)

In [760]:
# le = LabelEncoder()
# le.fit(df['weekday'])
# df['weekday'] = le.transform(df['weekday'])
# le.fit(df['day_type'])
# df['day_type'] = le.transform(df['day_type'])

In [761]:
df = pd.get_dummies(df, columns=['weather', 'weekday', 'season_rev', 'day_type'])

In [762]:
df = df.sort_index()
df.head()
# df.drop([''],axis=1, inplace=True)
# train = df.iloc[:len(train),:].copy()
# test  = df.iloc[len(train):,:].copy()
# test.drop(['casual','registered'], axis=1, inplace=True)
# display(train.head())
# display(train.shape)
# display(test.head())
# # display(test.shape)
# # test.shape

Unnamed: 0,datetime,temp,humidity,windspeed,casual,registered,hour,casual_mov_ave,registered_mov_ave,weather_1,...,weekday_Thu,weekday_Tue,weekday_Wed,season_rev_1,season_rev_2,season_rev_3,season_rev_4,day_type_holiday,day_type_weekend,day_type_workingday
0,2011-01-01 00:00:00,9.84,81,0.0,3.0,13.0,0,,,1,...,0,0,0,0,0,0,1,0,1,0
1,2011-01-01 01:00:00,9.02,80,0.0,8.0,32.0,1,,,1,...,0,0,0,0,0,0,1,0,1,0
2,2011-01-01 02:00:00,9.02,80,0.0,5.0,27.0,2,,,1,...,0,0,0,0,0,0,1,0,1,0
3,2011-01-01 03:00:00,9.84,75,0.0,3.0,10.0,3,,,1,...,0,0,0,0,0,0,1,0,1,0
4,2011-01-01 04:00:00,9.84,75,0.0,0.0,1.0,4,,,1,...,0,0,0,0,0,0,1,0,1,0


In [763]:
df.to_csv('output/df_prepro_one-hot-encoding.csv',index=False)