## Preprocessing「前処理」

In [13]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

In [9]:
# Load the raw CSV tables from the working directory.
station, trip, status, weather = (
    pd.read_csv(f"{table}.csv") for table in ("station", "trip", "status", "weather")
)
# The sample submission file has no header row, so column names are supplied here.
sample_submission = pd.read_csv("sample_submit.csv", names=["id", "count"], header=None)

In [10]:
# Parse the date columns and derive calendar features from the weather date.
# pd.to_datetime with an explicit format parses each column in one vectorized
# call (instead of one datetime.strptime call per row) and can be re-run safely
# on a column that has already been converted.
trip["start_date"] = pd.to_datetime(trip["start_date"], format="%m/%d/%Y %H:%M")
weather["date"] = pd.to_datetime(weather["date"], format="%Y-%m-%d")
weather["month"] = weather["date"].dt.month
weather["year"] = weather["date"].dt.year
weather["day"] = weather["date"].dt.day
# Monday is 0 and Sunday is 6, matching datetime.weekday().
weather["weekday"] = weather["date"].dt.weekday
station["installation_date"] = pd.to_datetime(station["installation_date"], format="%m/%d/%Y")

In [11]:
import os
import random
# Pin the random seeds
def set_seed(seed: int):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


set_seed(seed=42)

In [12]:
# Build the tables used for validation and prediction.
# Attach the station attributes and the same-day weather record to every status row.
# NOTE: this reassigns `status`, so run it once per kernel; re-running would merge
# the already-merged table again.
status = status.merge(station, on="station_id").merge(weather, on=["year", "month", "day"])
# Training rows: predict == 0 and a non-missing bikes_available value.
# `status` already carries the station columns, so it is not merged with `station`
# a second time; that extra merge only produced duplicated *_x / *_y columns.
train_status = status[(status.predict == 0) & status.bikes_available.notna()].reset_index(drop=True)

## Learning「学習」
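今回の最終提出には時系列モデルや回帰モデルを使用していないため、学習フェーズは割愛し、CVスコアのみ計算します。

The final submission does not use a time-series or regression model, so there is no training phase; only the cross-validation (CV) score is computed.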

In [14]:
# Hyperparameters of the history-window estimate used in the CV and prediction loops
date_span = 180  # length of the look-back window, in days
min_date_count = 20  # each half of the window needs more than this many rows before the trend term is applied
alpha = 20  # the (recent half mean - older half mean) difference is divided by this before being added

In [50]:
# Sorted array of the distinct dates present in the training rows.
train_dates = np.unique(train_status["date"].to_numpy())

In [51]:
# A date with no earlier data cannot serve as a validation target, so keep only
# the dates later than 30 days after the most recent station installation.
latest_installation = station.installation_date.max()
val_cutoff = latest_installation + datetime.timedelta(days=30)
val_dates = train_dates[train_dates > val_cutoff]

In [72]:
# Shuffled 3-fold cross-validation over the candidate validation dates.
# random_state is pinned so the folds, and therefore the reported RMSE values,
# are the same every time this cell is run.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
full_span = datetime.timedelta(days=date_span)
half_span = datetime.timedelta(days=date_span // 2)
history_cols = ["station_id", "hour", "date", "bikes_available"]
empty_history = train_status[history_cols].iloc[:0]
scores = []
for _, val_index in kf.split(val_dates):
    in_val_fold = train_status.date.isin(val_dates[val_index])
    t_status, v_status = train_status[~in_val_fold], train_status[in_val_fold]
    # Split the fold's history by (station_id, hour) once, so each prediction
    # filters only its own station/hour rows instead of scanning the whole table.
    history_by_key = {
        key: group for key, group in t_status[history_cols].groupby(["station_id", "hour"])
    }
    pred = []
    for i, row in enumerate(v_status.itertuples(), start=1):
        if i % 10000 == 0:
            print(f"iter: {i}")
        history = history_by_key.get((row.station_id, row.hour), empty_history)
        # Only rows strictly before the target date may be used.
        past = history[history.date < row.date]
        # Base window: the last `date_span` days (lower bound exclusive).
        window = past[past.date > row.date - full_span]
        # Two halves of the window for the trend term (lower bounds inclusive).
        recent = past[past.date >= row.date - half_span]
        older = past[(past.date < row.date - half_span) & (past.date >= row.date - full_span)]
        if len(recent) > min_date_count and len(older) > min_date_count:
            diff = recent.bikes_available.mean() - older.bikes_available.mean()
        else:
            diff = 0
        # An empty window yields NaN here, which makes the fold's RMSE NaN and so
        # surfaces the missing history instead of hiding it.
        pred.append(window.bikes_available.mean() + diff / alpha)
    score = ((v_status.bikes_available.values - np.array(pred)) ** 2).mean() ** 0.5
    print(f"RMSE: {score}")
    scores.append(score)
print(f"CV mean RMSE: {np.mean(scores)}")

iter: 10000
iter: 20000
iter: 30000
iter: 40000
iter: 50000
iter: 60000
iter: 70000
iter: 80000
iter: 90000
iter: 100000
iter: 110000
iter: 120000
iter: 130000
RMSE: 3.343846553329613
iter: 10000
iter: 20000
iter: 30000
iter: 40000
iter: 50000
iter: 60000
iter: 70000
iter: 80000
iter: 90000
iter: 100000
iter: 110000
iter: 120000
iter: 130000
RMSE: 3.4162244217775957
iter: 10000
iter: 20000
iter: 30000
iter: 40000
iter: 50000
iter: 60000
iter: 70000
iter: 80000
iter: 90000
iter: 100000
iter: 110000
iter: 120000
iter: 130000
RMSE: 3.2708297865590956


## Predicting「予測」

In [42]:
# Attach the merged status columns to every submission id, ordered by id.
submit_df = pd.merge(sample_submission, status, on="id").sort_values(by="id")

In [9]:
# Predict bikes_available for every submission row with the same history-window
# rule that the cross-validation cell evaluates.
result_d = {
    "id": [],
    "bikes": [],
}
full_span = datetime.timedelta(days=date_span)
half_span = datetime.timedelta(days=date_span // 2)
history_cols = ["station_id", "hour", "date", "bikes_available"]
empty_history = train_status[history_cols].iloc[:0]
# Split the training history by (station_id, hour) once, so each prediction
# filters only its own station/hour rows instead of scanning the whole table.
history_by_key = {
    key: group for key, group in train_status[history_cols].groupby(["station_id", "hour"])
}
# Last-resort value for rows that have no earlier history at all.
overall_mean = train_status.bikes_available.mean()
for i, row in enumerate(submit_df.itertuples(), start=1):
    if i % 1000 == 0:
        print(f"iter: {i}")
    history = history_by_key.get((row.station_id, row.hour), empty_history)
    # Only rows strictly before the target date may be used.
    past = history[history.date < row.date]
    # Base window: the last `date_span` days (lower bound exclusive).
    window = past[past.date > row.date - full_span]
    # Two halves of the window for the trend term (lower bounds inclusive).
    recent = past[past.date >= row.date - half_span]
    older = past[(past.date < row.date - half_span) & (past.date >= row.date - full_span)]
    if len(recent) > min_date_count and len(older) > min_date_count:
        diff = recent.bikes_available.mean() - older.bikes_available.mean()
    else:
        diff = 0
    base = window.bikes_available.mean()
    if pd.isna(base):
        # Empty look-back window: fall back to all earlier rows for this
        # station/hour, then to the overall mean, so no NaN reaches the submission.
        base = past.bikes_available.mean() if len(past) else overall_mean
    result_d["bikes"].append(base + diff / alpha)
    result_d["id"].append(row.id)

iter: 1000
iter: 2000
iter: 3000
iter: 4000
iter: 5000
iter: 6000
iter: 7000
iter: 8000
iter: 9000
iter: 10000
iter: 11000
iter: 12000
iter: 13000
iter: 14000
iter: 15000
iter: 16000
iter: 17000
iter: 18000
iter: 19000
iter: 20000
iter: 21000
iter: 22000
iter: 23000
iter: 24000
iter: 25000
iter: 26000
iter: 27000
iter: 28000
iter: 29000
iter: 30000
iter: 31000
iter: 32000
iter: 33000
iter: 34000
iter: 35000
iter: 36000
iter: 37000
iter: 38000
iter: 39000
iter: 40000
iter: 41000
iter: 42000
iter: 43000
iter: 44000
iter: 45000
iter: 46000
iter: 47000
iter: 48000
iter: 49000
iter: 50000
iter: 51000
iter: 52000
iter: 53000
iter: 54000
iter: 55000
iter: 56000
iter: 57000
iter: 58000
iter: 59000
iter: 60000
iter: 61000
iter: 62000
iter: 63000
iter: 64000
iter: 65000
iter: 66000
iter: 67000
iter: 68000
iter: 69000
iter: 70000
iter: 71000
iter: 72000
iter: 73000
iter: 74000
iter: 75000
iter: 76000
iter: 77000
iter: 78000
iter: 79000
iter: 80000
iter: 81000
iter: 82000
iter: 83000
iter: 84000
i

In [14]:
# Write the predictions as headerless "id,bikes" rows.
submission = pd.DataFrame(result_d)
submission.to_csv("final_submission.csv", index=False, header=False)