In [10]:
import sys
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

import math
from datetime import datetime, timedelta, time, date
from sklearn.cluster import KMeans
%matplotlib inline

In [11]:
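# Uses the trick from the IJCAI-17 rank-1 solution: the "SJH" model, a home-made
# time-series weighted regression model (you will not find it by searching for it).
# The core idea is to derive the prediction directly from the competition's loss function.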

In [12]:
# Locations of the raw tollgate records for the training and test periods.
train_path = '../dataset/training/volume(table 6)_training.csv'
test_path = '../dataset/testing_phase1/volume(table 6)_test1.csv'

# Read each file, then convert its `time` column to pandas datetimes.
train_df = pd.read_csv(train_path)
train_df['time'] = pd.to_datetime(train_df['time'])

test_df = pd.read_csv(test_path)
test_df['time'] = pd.to_datetime(test_df['time'])

In [13]:
# Training period: 2016-09-19 .. 2016-10-17 (inclusive).
NUM_TRAIN_DAYS = 29

# Test period: 2016-10-18 .. 2016-10-24 (inclusive).
# The misspelled name is kept so existing references keep working.
NUM_TSET_DAYS = 7

# Holiday windows (inclusive date bounds).
NATIONNAL_START = date(2016, 10, 1)
NATIONNAL_END = date(2016, 10, 9)

MID_AUTUMN_START = date(2016, 9, 15)
MID_AUTUMN_END = date(2016, 9, 18)


# Inclusive bounds of the training, validation and test splits.
TRAIN_START_DAY = date(2016, 9, 19)
TRAIN_END_DAY = date(2016, 10, 17)

VALI_START_DAY = date(2016, 10, 11)
VALI_END_DAY = date(2016, 10, 17)

TEST_START_DAY = date(2016, 10, 18)
TEST_END_DAY = date(2016, 10, 24)

In [14]:
# Preview the first five raw records.
train_df.head(n=5)

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-09-19 23:09:25,2,0,1,0,
1,2016-09-19 23:11:53,2,0,1,0,
2,2016-09-19 23:13:54,2,0,1,0,
3,2016-09-19 23:17:48,1,0,1,1,
4,2016-09-19 23:16:07,2,0,1,0,


In [15]:
def MAPE(pred, true):
    """Return the absolute relative error ``abs((true - pred) / true)``.

    No guard is applied for ``true == 0``; the division is performed as-is.
    """
    relative_error = (true - pred) / true
    return abs(relative_error)

def cal_mape(df_pred, df_true):
    """Return the mean absolute percentage error between two value containers.

    Both arguments must expose ``.values``; entries are paired by position
    (labels are ignored). The mean is taken over ``len(df_pred.values)``
    entries, so an empty prediction raises ``ZeroDivisionError``.
    """
    predicted = df_pred.values
    actual = df_true.values
    count = len(predicted)

    total = 0.0
    for position, predicted_value in enumerate(predicted):
        actual_value = actual[position]
        total += abs((predicted_value - actual_value) / actual_value)
    return total / count

def per_20min(dt):
    """Return the start of the 20-minute window that contains ``dt``.

    The result is a new ``datetime`` built from the year, month, day and hour
    of ``dt`` with the minute floored to 0, 20 or 40 and the seconds set to 0.
    """
    window_start_minute = (dt.minute // 20) * 20
    return datetime(dt.year, dt.month, dt.day, dt.hour, window_start_minute, 0)

# Between 09-19 and 10-17 the only holiday is National Day, so it is the only one handled.
def remove_holiday(df):
    """Return the rows of ``df`` whose ``time`` is outside the National Day window.

    A row is kept when its calendar date is strictly before ``NATIONNAL_START``
    or strictly after ``NATIONNAL_END``.
    """
    record_dates = df.time.dt.date
    before_holiday = record_dates < NATIONNAL_START
    after_holiday = record_dates > NATIONNAL_END
    return df.loc[before_holiday | after_holiday]

def select_time(df):
    """Aggregate raw records into 20-minute traffic volumes for the rush hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``time`` column plus ``tollgate_id`` and ``direction``.
        The frame is NOT modified (the original implementation overwrote its
        ``time`` column in place).

    Returns
    -------
    pandas.DataFrame
        Columns ``tollgate_id``, ``direction``, ``time`` (start of the
        20-minute window) and ``volume`` (number of records in that window),
        restricted to windows starting in hours 6-9 or 15-18.
    """
    # Floor every timestamp to the start of its 20-minute window. 20 minutes
    # divides the hour evenly, so the bins start at :00, :20 and :40.
    binned = df.assign(time=df.time.dt.floor('20min'))

    vehicle_columns = ['has_etc', 'vehicle_type', 'vehicle_model']
    if set(vehicle_columns).issubset(binned.columns):
        binned = binned.drop(vehicle_columns, axis=1)

    # Count records per (tollgate, direction, window). Naming the count column
    # in reset_index avoids the dict form of rename_axis, which newer pandas
    # versions no longer accept.
    volumes = (binned.groupby(['tollgate_id', 'direction', 'time'])
                     .size()
                     .reset_index(name='volume'))

    hour = volumes.time.dt.hour
    in_morning = (hour >= 6) & (hour < 10)
    in_evening = (hour >= 15) & (hour < 19)
    return volumes.loc[in_morning | in_evening]

def slice_time(df):
    """Split ``df`` into the "previous two hours" and the "following two hours".

    Returns a tuple ``(df_prev2h, df_follow2h)``: rows whose ``time`` hour is
    6, 7, 15 or 16, and rows whose hour is 8, 9, 17 or 18 respectively.
    """
    hour_of_day = df.time.dt.hour
    prev_hours = [6, 7, 15, 16]
    follow_hours = [8, 9, 17, 18]
    df_prev2h = df.loc[hour_of_day.isin(prev_hours)]
    df_follow2h = df.loc[hour_of_day.isin(follow_hours)]
    return df_prev2h, df_follow2h

def complete_miss_time(df, holiday_start=None, holiday_end=None):
    """Build the full grid of prediction windows and fill in missing volumes.

    For every non-holiday day between the earliest and latest ``time`` in
    ``df``, one row is produced for each of the 5 tollgate/direction pairs and
    each of the 12 twenty-minute windows (08:00-09:40 and 17:00-18:40). The
    volume is taken from ``df`` when present, otherwise it is NaN and then
    filled by linear interpolation.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns ``tollgate_id``, ``direction``, ``time`` (datetime) and
        ``volume``. Row order does not matter.
    holiday_start, holiday_end : datetime.date, optional
        Inclusive window of days to skip. Default to ``NATIONNAL_START`` and
        ``NATIONNAL_END``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tollgate_id`` (int), ``direction`` (int), ``time``
        (datetime) and ``volume`` (float), ordered by day, then
        tollgate/direction pair, then window. Empty when ``df`` is empty.
    """
    if holiday_start is None:
        holiday_start = NATIONNAL_START
    if holiday_end is None:
        holiday_end = NATIONNAL_END

    columns = ['tollgate_id', 'direction', 'time', 'volume']
    if df.empty:
        return pd.DataFrame(columns=columns)

    # Use the true min/max instead of the first/last row: the input is usually
    # sorted by tollgate and direction, not by time, so the first and last rows
    # are not guaranteed to carry the earliest and latest day.
    start_day = df.time.min().date()
    end_day = df.time.max().date()
    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    hour_min = [(8, 0), (8, 20), (8, 40), (9, 0), (9, 20), (9, 40),
                (17, 0), (17, 20), (17, 40), (18, 0), (18, 20), (18, 40)]

    # One pass over the input builds a lookup table, replacing a full boolean
    # scan per grid cell. setdefault keeps the first value if a key repeats.
    observed = {}
    for toll, dire, stamp, volume in zip(df.tollgate_id, df.direction,
                                         df.time, df.volume):
        observed.setdefault((int(toll), int(dire), pd.Timestamp(stamp)), volume)

    # Collect plain records and build the frame once; growing a DataFrame row
    # by row is quadratic and DataFrame.append no longer exists in pandas 2.
    rows = []
    for offset in range((end_day - start_day).days + 1):
        day = start_day + timedelta(days=offset)
        if holiday_start <= day <= holiday_end:
            continue
        for toll, dire in toll_dire:
            for h, m in hour_min:
                slot = pd.Timestamp(datetime(day.year, day.month, day.day, h, m, 0))
                rows.append({'tollgate_id': toll,
                             'direction': dire,
                             'time': slot,
                             'volume': observed.get((toll, dire, slot), np.nan)})

    df_comp = pd.DataFrame(rows, columns=columns)
    df_comp['tollgate_id'] = df_comp['tollgate_id'].astype(int)
    df_comp['direction'] = df_comp['direction'].astype(int)
    df_comp['time'] = pd.to_datetime(df_comp['time'])
    # NOTE: interpolation runs down the whole column in row order, so a gap is
    # filled from its neighbouring rows even when those belong to a different
    # tollgate, direction or day. Kept as in the original implementation.
    df_comp['volume'] = df_comp['volume'].astype(float).interpolate(method='linear')
    return df_comp

def repeat_days(df, start_day, end_day):
    """Replicate ``df`` once for every day from ``start_day`` to ``end_day``.

    In each copy the date part of ``time`` is replaced by the target day while
    the hour and minute are kept and the seconds are reset to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``time`` column. It is not modified.
    start_day, end_day : datetime.date
        Inclusive range of target days.

    Returns
    -------
    pandas.DataFrame
        The copies stacked in day order, keeping each copy's original index
        labels. When the range is empty (``end_day`` before ``start_day``) an
        empty frame with the columns ``tollgate_id``, ``direction``, ``time``
        and ``volume`` is returned.
    """
    frames = []
    for offset in range((end_day - start_day).days + 1):
        day = start_day + timedelta(days=offset)
        # `day=day` binds the current day to the lambda explicitly.
        shifted = df.time.apply(
            lambda t, day=day: datetime(day.year, day.month, day.day,
                                        t.hour, t.minute, 0))
        frames.append(df.assign(time=shifted))

    if not frames:
        return pd.DataFrame(columns=['tollgate_id', 'direction', 'time', 'volume'])
    # A single concat replaces repeated DataFrame.append calls, which copy the
    # accumulated frame every iteration and were removed in pandas 2.0.
    return pd.concat(frames)

In [16]:
# Preprocessing pipeline: drop holiday records, aggregate to 20-minute volumes,
# split into the two-hour halves, then complete the target windows.
# NOTE: `train_df` is overwritten here, so re-running this cell alone feeds the
# already aggregated frame back into select_time.
train_df = select_time(remove_holiday(train_df))
train_prev2h, train_follow2h = slice_time(train_df)
train_follow2h = complete_miss_time(train_follow2h)

In [17]:
# Sanity check finished: the completed data is consistent with the official tutorial.
# print() calls with a single argument behave the same on Python 2 and Python 3.
print(train_follow2h.shape[0])
print(train_follow2h.isnull().sum().sum())
print(train_follow2h.time.dt.date.unique())
train_follow2h.head()

1200
0
[datetime.date(2016, 9, 19) datetime.date(2016, 9, 20)
 datetime.date(2016, 9, 21) datetime.date(2016, 9, 22)
 datetime.date(2016, 9, 23) datetime.date(2016, 9, 24)
 datetime.date(2016, 9, 25) datetime.date(2016, 9, 26)
 datetime.date(2016, 9, 27) datetime.date(2016, 9, 28)
 datetime.date(2016, 9, 29) datetime.date(2016, 9, 30)
 datetime.date(2016, 10, 10) datetime.date(2016, 10, 11)
 datetime.date(2016, 10, 12) datetime.date(2016, 10, 13)
 datetime.date(2016, 10, 14) datetime.date(2016, 10, 15)
 datetime.date(2016, 10, 16) datetime.date(2016, 10, 17)]


Unnamed: 0,tollgate_id,direction,time,volume
0,1,0,2016-09-19 08:00:00,46.0
1,1,0,2016-09-19 08:20:00,56.0
2,1,0,2016-09-19 08:40:00,41.0
3,1,0,2016-09-19 09:00:00,50.0
4,1,0,2016-09-19 09:20:00,49.0


In [80]:
def _sjh_best_volume(volumes, train_days, pred_stamp, train_start, a_1, a_2):
    """Return the integer candidate volume with the smallest weighted MAPE.

    ``volumes`` holds the historical volumes of one unit and ``train_days`` the
    (midnight-normalized) day of each of them. Returns NaN when there is no
    usable history.
    """
    if volumes.empty:
        return np.nan
    v_min = volumes.min()
    v_max = volumes.max()
    if pd.isnull(v_min):
        return np.nan

    # The weights depend only on the dates, not on the candidate value, so they
    # are computed once per unit instead of once per candidate.
    days_since_start = (train_days - train_start).dt.days
    days_to_pred = (pred_stamp - train_days).dt.days
    weights = a_1 * days_since_start + a_2 * (10.0 / days_to_pred)

    v_pred = v_min
    loss_min = float('inf')
    # The upper bound is inclusive so that the largest observed volume is a
    # candidate too (the original range stopped one short of it).
    for v_i in range(int(v_min), int(v_max) + 1):
        v_f = float(v_i)
        # Absolute percentage error against every historical volume; assumes
        # the volumes are non-zero because each one is used as a divisor.
        mape_all = ((volumes - v_f) / volumes).abs()
        loss_f = (mape_all * weights).sum()
        if loss_f < loss_min:
            loss_min = loss_f
            v_pred = v_f
    return v_pred


# Search for the prediction that minimises the (time-weighted) loss.
def sjh_predict(df, phase = 'test'):
    """Predict one volume per tollgate/direction/window for each target day.

    For every target day and every unit (tollgate, direction, 20-minute
    window), all rows of ``df`` for that unit dated strictly before the target
    day are used as history. Each integer between the smallest and largest
    historical volume (inclusive) is tried, and the one minimising
    ``sum(weight * abs((true - candidate) / true))`` is kept, where

        weight = 0.35 * (days since 2016-09-18)
               + 0.65 * 10 / (days between the record and the target day)

    Parameters
    ----------
    df : pandas.DataFrame
        Columns ``tollgate_id``, ``direction``, ``time`` (datetime), ``volume``.
    phase : str
        ``'validation'`` predicts 2016-10-11 .. 2016-10-17; any other value
        predicts 2016-10-18 .. 2016-10-24.

    Returns
    -------
    pandas.DataFrame
        Columns ``tollgate_id``, ``direction``, ``time``, ``volume``; 5 pairs x
        12 windows per target day, ordered by day, pair, window. ``volume`` is
        NaN for units without any history.

    Notes
    -----
    NOTE(review): in the validation phase the history of a target day includes
    earlier validation days when ``df`` contains them, whereas in the test
    phase no data from the predicted week is available. The validation score
    may therefore be optimistic.

    The list of target days and each target day are printed as progress output.
    """
    columns = ['tollgate_id', 'direction', 'time', 'volume']
    train_start_day = date(2016, 9, 18)
    a_1, a_2 = 0.35, 0.65

    test_days = [(10, day_num) for day_num in range(18, 25)]
    validation_days = [(10, day_num) for day_num in range(11, 18)]
    if phase == 'validation':
        pred_days = validation_days
    else:
        pred_days = test_days
    print(pred_days)

    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    hour_min = [(8, 0), (8, 20), (8, 40), (9, 0), (9, 20), (9, 40),
                (17, 0), (17, 20), (17, 40), (18, 0), (18, 20), (18, 40)]

    # Select the rows of every unit once; only the date cut-off changes between
    # target days.
    hours = df.time.dt.hour
    minutes = df.time.dt.minute
    record_days = df.time.dt.normalize()
    train_start = pd.Timestamp(train_start_day)
    units = []
    for toll, dire in toll_dire:
        pair_mask = (df.tollgate_id == toll) & (df.direction == dire)
        for h, m in hour_min:
            unit_mask = pair_mask & (hours == h) & (minutes == m)
            units.append((toll, dire, h, m,
                          df.volume[unit_mask].astype(float),
                          record_days[unit_mask]))

    # Collect plain records and build the frame once instead of appending to a
    # DataFrame inside the loop.
    records = []
    for pred_m, pred_d in pred_days:
        pred_day = date(2016, pred_m, pred_d)
        print(pred_day)
        pred_stamp = pd.Timestamp(pred_day)
        for toll, dire, h, m, unit_volumes, unit_days in units:
            seen = unit_days < pred_stamp
            v_pred = _sjh_best_volume(unit_volumes[seen], unit_days[seen],
                                      pred_stamp, train_start, a_1, a_2)
            records.append({'tollgate_id': toll,
                            'direction': dire,
                            'time': datetime(2016, int(pred_m), int(pred_d), h, m, 0),
                            'volume': v_pred})

    df_pred = pd.DataFrame(records, columns=columns)
    df_pred.tollgate_id = df_pred.tollgate_id.astype(int)
    df_pred.direction = df_pred.direction.astype(int)
    return df_pred

In [81]:
# Predict the validation week from the completed training volumes.
validation_pred = sjh_predict(df=train_follow2h, phase='validation')

[(10, 11), (10, 12), (10, 13), (10, 14), (10, 15), (10, 16), (10, 17)]
2016-10-11
2016-10-12
2016-10-13
2016-10-14
2016-10-15
2016-10-16
2016-10-17


In [82]:
# Ground truth for validation: observed volumes inside the validation window (inclusive).
day_all = train_follow2h.time.dt.date
in_validation_window = (day_all >= VALI_START_DAY) & (day_all <= VALI_END_DAY)
validation_true = train_follow2h.loc[in_validation_window]

In [83]:
# Both frames must have the same number of rows to be compared position by position.
print(validation_pred.shape[0])
print(validation_true.shape[0])
# Only the last bare expression of a cell is rendered, so just the second
# head() is displayed.
validation_true.head()
validation_pred.head()

420
420


Unnamed: 0,tollgate_id,direction,time,volume
0,1,0,2016-10-11 08:00:00,47.0
1,1,0,2016-10-11 08:20:00,48.0
2,1,0,2016-10-11 08:40:00,50.0
3,1,0,2016-10-11 09:00:00,51.0
4,1,0,2016-10-11 09:20:00,44.0


In [84]:
# Score the validation predictions against the observed volumes (paired by position).
vali_pred_vol = validation_pred['volume']
vali_true_vol = validation_true['volume']
vali_mape = cal_mape(vali_pred_vol, vali_true_vol)
print('validation mape: %.8f' % vali_mape)

validation mape: 0.17230635


In [61]:
# Predict the test week from the completed training volumes.
test_pred = sjh_predict(df=train_follow2h, phase='test')

[(10, 18), (10, 19), (10, 20), (10, 21), (10, 22), (10, 23), (10, 24)]
2016-10-18
2016-10-19
2016-10-20
2016-10-21
2016-10-22
2016-10-23
2016-10-24


In [62]:
# Row count and the distinct days covered by the test predictions.
print(test_pred.shape[0])
print(test_pred.time.dt.date.unique())
test_pred.head()

420
[datetime.date(2016, 10, 18) datetime.date(2016, 10, 19)
 datetime.date(2016, 10, 20) datetime.date(2016, 10, 21)
 datetime.date(2016, 10, 22) datetime.date(2016, 10, 23)
 datetime.date(2016, 10, 24)]


Unnamed: 0,tollgate_id,direction,time,volume
0,1,0,2016-10-18 08:00:00,45.0
1,1,0,2016-10-18 08:20:00,47.0
2,1,0,2016-10-18 08:40:00,47.0
3,1,0,2016-10-18 09:00:00,45.0
4,1,0,2016-10-18 09:20:00,48.0


In [63]:
# Build the submission table in the required output format.
output_csv = test_pred.copy()
start_time = output_csv.time
# Vectorized: each window ends 20 minutes after it starts.
end_time = start_time + timedelta(minutes=20)
# Half-open window label: "[start,end)".
output_csv['time'] = '[' + start_time.astype(str) + ',' + end_time.astype(str) + ')'
# rename(columns=...) and reindex(columns=...) replace the dict form of
# rename_axis and reindex_axis, which newer pandas versions no longer provide.
output_csv = output_csv.rename(columns={'time': 'time_window'})
output_csv = output_csv.reindex(
    columns=['tollgate_id', 'time_window', 'direction', 'volume'])

In [85]:
# Row count and preview of the submission table.
print(output_csv.shape[0])
output_csv.head()

420


Unnamed: 0,tollgate_id,time_window,direction,volume
0,1,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",0,45.0
1,1,"[2016-10-18 08:20:00,2016-10-18 08:40:00)",0,47.0
2,1,"[2016-10-18 08:40:00,2016-10-18 09:00:00)",0,47.0
3,1,"[2016-10-18 09:00:00,2016-10-18 09:20:00)",0,45.0
4,1,"[2016-10-18 09:20:00,2016-10-18 09:40:00)",0,48.0


In [None]:
# Write the submission file without the index column.
output_path = 'results/task2_sjh_weight_35_65and10.csv'
output_csv.to_csv(output_path, index=False)

In [34]:
# Scratch check: datetime.date() drops the time-of-day part.
x = datetime(year=2016, month=1, day=1, hour=1, minute=1, second=1)
print(x.date())

2016-01-01


[]
