In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

import math
from datetime import datetime, timedelta, time, date
%matplotlib inline

In [2]:
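# Uses the trick from the IJCAI-17 rank-1 solution: the SJH model (a weighted time-series regression model; self-devised, so searching for it turns up nothing...)
# The core idea is to work directly from the competition's loss function.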

In [3]:
# Load the training and phase-1 test trajectory tables.
train_path = '../dataset/training/trajectories(table 5)_training.csv'
test_path = '../dataset/testing_phase1/trajectories(table 5)_test1.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Parse the trip start stamps so the `.dt` accessor can be used later on.
train_df['starting_time'] = pd.to_datetime(train_df['starting_time'])
test_df['starting_time'] = pd.to_datetime(test_df['starting_time'])

# Sanity checks: the weekday codes present in the training data, and the
# weekday code of 2017-05-15 (date.weekday() counts Monday as 0).
print(train_df['starting_time'].dt.weekday.unique())
print(date(2017, 5, 15).weekday())

[1 2 3 4 5 6 0]
0


In [4]:
# from 9-19 to 10-17
# NOTE(review): 9-19..10-17 is 29 days; the value 91 matches 2016-07-19..2016-10-17
# inclusive (the first raw rows shown by train_df.head() are dated 07-19) -- confirm
# which range is meant. This constant is not referenced in the visible cells.
NUM_TRAIN_DAYS = 91

# from 10-18 to 10-24
# NOTE(review): the name is presumably a typo for NUM_TEST_DAYS; it is not
# referenced in the visible cells.
NUM_TSET_DAYS = 7

# define Holiday
# National Day holiday bounds (inclusive); remove_holiday and complete_miss_time
# skip every day inside this range.
NATIONNAL_START = date(2016,10,1)
NATIONNAL_END = date(2016,10,9)

# Mid-Autumn holiday bounds; not referenced in the visible cells.
MID_AUTUMN_START = date(2016,9,15)
MID_AUTUMN_END = date(2016,9,18)


# Training window (inclusive) kept by remove_holiday.
TRAIN_START_DAY = date(2016,9,19)
TRAIN_END_DAY = date(2016,10,17)

# Validation week (inclusive): the last seven days of the training window.
VALI_START_DAY = date(2016,10,11)
VALI_END_DAY = date(2016,10,17)

# Test week (inclusive); not referenced in the visible cells.
TEST_START_DAY = date(2016,10,18)
TEST_END_DAY = date(2016,10,24)

In [5]:
# Preview the first raw trajectory rows (one row per vehicle trip).
train_df.head()

Unnamed: 0,intersection_id,tollgate_id,vehicle_id,starting_time,travel_seq,travel_time
0,B,3,1065642,2016-07-19 00:14:24,105#2016-07-19 00:14:24#9.56;100#2016-07-19 00...,70.85
1,B,3,1047198,2016-07-19 00:35:56,105#2016-07-19 00:35:56#11.58;100#2016-07-19 0...,148.79
2,B,1,1086390,2016-07-19 00:37:15,105#2016-07-19 00:37:15#5.26;100#2016-07-19 00...,79.76
3,A,2,1071181,2016-07-19 00:37:59,110#2016-07-19 00:37:59#13.74;123#2016-07-19 0...,58.05
4,B,1,1065807,2016-07-19 00:56:21,105#2016-07-19 00:56:21#16.08;100#2016-07-19 0...,137.98


In [15]:
def MAPE(pred, true):
    """Absolute percentage error of ``pred`` measured against ``true``.

    Evaluates ``abs((true - pred) / true)`` with the plain arithmetic
    operators, so it works for anything that supports ``-``, ``/`` and
    ``abs``. There is no guard for ``true == 0``.

    NOTE(review): this notebook uses Python 2 print statements; under
    Python 2 two integer arguments would be floor-divided -- pass floats.
    """
    deviation = true - pred
    return abs(deviation / true)

def cal_mape(df_pred, df_true):
    """Mean absolute percentage error between two aligned value sequences.

    Values are paired by position, not by index label, so both inputs must
    already be in the same order.

    Parameters
    ----------
    df_pred : pandas Series or other 1-D array-like of predictions.
    df_true : pandas Series or other 1-D array-like of ground-truth values.

    Returns
    -------
    float
        ``mean(|pred - true| / |true|)``; ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. The previous loop
        silently ignored surplus ground-truth values.
    """
    # Casting to float also avoids integer floor division under Python 2.
    pred_values = np.asarray(df_pred, dtype=float)
    true_values = np.asarray(df_true, dtype=float)
    if pred_values.shape != true_values.shape:
        raise ValueError('cal_mape: predictions have shape %s but truths have shape %s'
                         % (pred_values.shape, true_values.shape))
    if pred_values.size == 0:
        # Nothing to average; report "undefined" instead of dividing by zero.
        return float('nan')
    return float(np.mean(np.abs((pred_values - true_values) / true_values)))

def per_20min(dt):
    """Floor ``dt`` to the start of its 20-minute window.

    Returns a new plain ``datetime`` with the same year, month, day and hour,
    the minute rounded down to 0, 20 or 40, and the seconds set to zero.
    """
    window_minute = (dt.minute // 20) * 20
    return datetime(dt.year, dt.month, dt.day, dt.hour, window_minute, 0)

# Between 9-19 and 10-17 the only holiday is National Day, so only that one is handled.
def remove_holiday(df):
    """Keep rows inside the training window that are not on the National Day holiday.

    Parameters
    ----------
    df : DataFrame with a datetime-typed ``starting_time`` column.

    Returns
    -------
    DataFrame
        The rows of ``df`` whose calendar date lies in
        [TRAIN_START_DAY, TRAIN_END_DAY] and outside
        [NATIONNAL_START, NATIONNAL_END]; the original index is preserved.
    """
    day_all = df.starting_time.dt.date
    in_train_window = (day_all >= TRAIN_START_DAY) & (day_all <= TRAIN_END_DAY)
    outside_holiday = (day_all < NATIONNAL_START) | (day_all > NATIONNAL_END)
    # Both conditions are evaluated on the same, unfiltered frame and applied
    # once, instead of indexing an already-filtered frame with a stale mask
    # and relying on implicit index alignment.
    return df.loc[in_train_window & outside_holiday]

def select_time(df):
    """Aggregate trips into 20-minute average travel times for the hours of interest.

    Steps:
      1. parse ``starting_time`` and floor it to its 20-minute window
         (for naive timestamps this matches applying ``per_20min`` row by row);
      2. drop ``vehicle_id`` and ``travel_seq`` when both are present;
      3. average the remaining columns per
         (intersection_id, tollgate_id, starting_time);
      4. rename ``travel_time`` to ``avg_travel_time``;
      5. keep only windows starting in [06:00, 10:00) or [15:00, 19:00).

    The input frame is left untouched; earlier versions overwrote its
    ``starting_time`` column in place.

    Parameters
    ----------
    df : DataFrame with ``intersection_id``, ``tollgate_id``,
        ``starting_time`` and ``travel_time`` columns.

    Returns
    -------
    DataFrame
        One row per route and 20-minute window, indexed by the position the
        row had before the hour filter was applied.
    """
    # assign() returns a new frame, so the caller's data is not mutated.
    bucketed = df.assign(
        starting_time=pd.to_datetime(df.starting_time).dt.floor('20min'))
    if {'vehicle_id', 'travel_seq'}.issubset(bucketed.columns):
        bucketed = bucketed.drop(['vehicle_id', 'travel_seq'], axis=1)
    averaged = (bucketed
                .groupby(['intersection_id', 'tollgate_id', 'starting_time'])
                .mean()
                .reset_index()
                .rename(columns={'travel_time': 'avg_travel_time'}))
    hour = averaged.starting_time.dt.hour
    in_window = ((hour >= 6) & (hour < 10)) | ((hour >= 15) & (hour < 19))
    return averaged.loc[in_window]

def slice_time(df):
    """Split ``df`` into the two-hour lead-in windows and the two-hour target windows.

    Returns a pair ``(df_prev2h, df_follow2h)``:
      * ``df_prev2h``   -- rows whose ``starting_time`` hour is 6, 7, 15 or 16;
      * ``df_follow2h`` -- rows whose ``starting_time`` hour is 8, 9, 17 or 18.
    Rows at any other hour appear in neither frame.
    """
    start_hour = df.starting_time.dt.hour
    lead_in_rows = start_hour.isin([6, 7, 15, 16])
    target_rows = start_hour.isin([8, 9, 17, 18])
    return df.loc[lead_in_rows], df.loc[target_rows]

def complete_miss_time(df, duration='follow2h', holiday_start=None, holiday_end=None):
    """Build the full (day x route x 20-minute window) grid and fill the gaps.

    Every non-holiday day between the earliest and the latest
    ``starting_time`` in ``df`` gets one row per route and per window.
    Windows observed in ``df`` keep their value (the first one if duplicated);
    missing windows are filled by linear interpolation down the
    ``avg_travel_time`` column.

    NOTE(review): the interpolation runs over the rows in output order
    (day, then route, then window), so a gap can be filled from a
    neighbouring route or day, and NaNs at the very top of the column are
    left as NaN -- confirm this is intended.

    Parameters
    ----------
    df : DataFrame with ``intersection_id``, ``tollgate_id``, datetime
        ``starting_time`` and ``avg_travel_time`` columns.
    duration : ``'prev2h'`` selects the 06:00-07:40 / 15:00-16:40 windows;
        any other value selects the 08:00-09:40 / 17:00-18:40 windows.
    holiday_start, holiday_end : optional ``date`` bounds (inclusive) of the
        days to skip. They default to the notebook-level ``NATIONNAL_START``
        and ``NATIONNAL_END``.

    Returns
    -------
    DataFrame
        Columns ``intersection_id``, ``tollgate_id`` (int),
        ``starting_time`` (datetime) and ``avg_travel_time``; empty when
        ``df`` is empty.
    """
    columns = ['intersection_id', 'tollgate_id', 'starting_time', 'avg_travel_time']
    if holiday_start is None:
        holiday_start = NATIONNAL_START
    if holiday_end is None:
        holiday_end = NATIONNAL_END
    if len(df) == 0:
        return pd.DataFrame(columns=columns)

    inter_toll = [('A',2), ('A',3), ('B',1), ('B',3), ('C',1), ('C',3)]
    if duration == 'prev2h':
        hour_min = [(6,0), (6,20), (6,40), (7,0), (7,20), (7,40),
                    (15,0), (15,20), (15,40), (16,0), (16,20), (16,40)]
    else:
        hour_min = [(8,0), (8,20), (8,40), (9,0), (9,20), (9,40),
                    (17,0), (17,20), (17,40), (18,0), (18,20), (18,40)]

    # One pass over the input builds a lookup; setdefault keeps the first
    # value per key, mirroring the former `.values[0]` selection.
    observed = {}
    for inter, toll, start, avg in zip(df.intersection_id, df.tollgate_id,
                                       df.starting_time, df.avg_travel_time):
        observed.setdefault((inter, toll, pd.Timestamp(start)), avg)

    # Use the true earliest/latest day rather than the first/last row, so the
    # grid does not depend on how the input happens to be sorted.
    start_day = df.starting_time.min().date()
    end_day = df.starting_time.max().date()

    rows = []
    for d in range((end_day - start_day).days + 1):
        day = start_day + timedelta(days=d)
        if holiday_start <= day <= holiday_end:
            continue
        for inter, toll in inter_toll:
            for h, m in hour_min:
                day_time = pd.Timestamp(datetime(day.year, day.month, day.day, h, m, 0))
                rows.append({'intersection_id': inter,
                             'tollgate_id': toll,
                             'starting_time': day_time,
                             'avg_travel_time': observed.get((inter, toll, day_time), np.nan)})

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in current pandas.
    df_comp = pd.DataFrame(rows, columns=columns)
    df_comp['tollgate_id'] = df_comp['tollgate_id'].astype(int)
    df_comp['starting_time'] = pd.to_datetime(df_comp['starting_time'])
    df_comp['avg_travel_time'] = df_comp['avg_travel_time'].astype(float).interpolate()
    return df_comp

def repeat_days(df, start_day, end_day):
    """Stack one copy of ``df`` per calendar day in [start_day, end_day].

    In each copy the date part of ``starting_time`` is replaced by that day;
    hour and minute are kept and the seconds are reset to zero. The copies
    keep their original index labels, so labels repeat in the result.

    Parameters
    ----------
    df : DataFrame with a datetime ``starting_time`` column and a
        ``tollgate_id`` column; it is not modified.
    start_day, end_day : ``date`` bounds, both inclusive.

    Returns
    -------
    DataFrame
        The stacked copies with ``tollgate_id`` cast to int; an empty frame
        with the four standard columns when ``end_day`` precedes ``start_day``.
    """
    copies = []
    for d in range((end_day - start_day).days + 1):
        day = start_day + timedelta(days=d)
        temp = df.copy()
        # `day=day` binds the current day explicitly instead of relying on
        # the enclosing loop variable.
        temp['starting_time'] = temp['starting_time'].apply(
            lambda t, day=day: datetime(day.year, day.month, day.day, t.hour, t.minute, 0))
        copies.append(temp)

    if copies:
        # A single concat replaces the repeated DataFrame.append calls, which
        # were quadratic and no longer exist in current pandas.
        df_repeat = pd.concat(copies)
    else:
        df_repeat = pd.DataFrame(columns=['intersection_id', 'tollgate_id',
                                          'starting_time', 'avg_travel_time'])
    df_repeat['tollgate_id'] = df_repeat['tollgate_id'].astype(int)
    return df_repeat

In [16]:
# Preparation pipeline; the order matters because each step consumes the
# previous step's output.
# 1. keep the training window and drop the National Day holiday rows
train_df = remove_holiday(train_df)
# 2. aggregate trips into 20-minute averages for the morning/afternoon hours
#    (train_df is rebound here, so it no longer holds the raw trips afterwards)
train_df = select_time(train_df)
# 3. split into the lead-in windows and the windows to be predicted
#    (train_prev2h is not used again in the visible cells)
train_prev2h, train_follow2h = slice_time(train_df)
# 4. complete the grid of target windows and interpolate the gaps
train_follow2h = complete_miss_time(train_follow2h)

In [17]:
# Data comparison finished: the result is consistent with the official tutorial.
# Row count and number of missing values after grid completion.
print(train_follow2h.shape[0])
print(train_follow2h.isnull().sum().sum())
train_follow2h.head(10)

1440
0


Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
0,A,2,2016-09-19 08:00:00,102.489333
1,A,2,2016-09-19 08:20:00,61.684667
2,A,2,2016-09-19 08:40:00,76.212778
3,A,2,2016-09-19 09:00:00,66.965625
4,A,2,2016-09-19 09:20:00,69.775
5,A,2,2016-09-19 09:40:00,141.191111
6,A,2,2016-09-19 17:00:00,63.533333
7,A,2,2016-09-19 17:20:00,63.537778
8,A,2,2016-09-19 17:40:00,64.111818
9,A,2,2016-09-19 18:00:00,58.54


In [18]:
# Maximum-likelihood style search: pick the prediction value that minimises the loss.
def _sjh_best_candidate(true_values, weights):
    """Return the candidate value with the smallest weighted MAPE.

    Candidates are the whole numbers ``int(min) .. int(max) - 1`` of the
    observed values. Ties resolve to the smallest candidate. When there are
    no candidates (or the best loss is not finite) the observed minimum is
    returned; when there are no observations at all the result is NaN.
    """
    keep = ~np.isnan(true_values)
    true_values = true_values[keep]
    weights = weights[keep]
    if true_values.size == 0:
        return np.nan
    avg_min = float(true_values.min())
    candidates = np.arange(int(avg_min), int(true_values.max()), dtype=float)
    if candidates.size == 0:
        return avg_min
    # Rows are candidates, columns are historical observations.
    ape = np.abs((true_values[np.newaxis, :] - candidates[:, np.newaxis])
                 / true_values[np.newaxis, :])
    losses = (ape * weights[np.newaxis, :]).sum(axis=1)
    best = int(np.argmin(losses))
    if not np.isfinite(losses[best]):
        return avg_min
    return float(candidates[best])


def sjh_predict(df, phase = 'test', a_1=0.82, a_2=0.42):
    """Predict the average travel time of every route and target window for one week.

    For each prediction day, route and 20-minute window, all rows of ``df``
    for the same route and time of day that are dated before the prediction
    day form the history. Each historical day gets the weight

        a_1 * (days since 2016-09-18) + a_2 * 10 / (days until the prediction day)

    and the prediction is the whole-number candidate between the historical
    minimum and maximum that minimises the weighted sum of absolute
    percentage errors against that history.

    Because every row dated before the prediction day is used, a ``df`` that
    contains the validation week lets later validation days be fitted on the
    observed values of earlier validation days.

    Parameters
    ----------
    df : DataFrame with ``intersection_id``, ``tollgate_id``, naive datetime
        ``starting_time`` and ``avg_travel_time`` columns.
    phase : ``'validation'`` predicts 2016-10-11..2016-10-17; any other value
        predicts 2016-10-18..2016-10-24.
    a_1, a_2 : weights of the two terms above; the defaults are the values
        that used to be hard-coded.

    Returns
    -------
    DataFrame
        One row per day, route and window (7 * 6 * 12 = 504 rows) with the
        columns ``intersection_id``, ``tollgate_id``, ``starting_time`` and
        ``avg_travel_time``. A slot without any history gets NaN.
    """
    columns = ['intersection_id', 'tollgate_id', 'starting_time', 'avg_travel_time']
    train_start_day = date(2016,9,18)
    if phase == 'validation':
        pred_days = [(10, day) for day in range(11, 18)]
    else:
        pred_days = [(10, day) for day in range(18, 25)]
    inter_toll = [('A',2), ('A',3), ('B',1), ('B',3), ('C',1), ('C',3)]
    hour_min = [(8,0), (8,20), (8,40), (9,0), (9,20), (9,40),
                (17,0), (17,20), (17,40), (18,0), (18,20), (18,40)]
    print(pred_days)

    # Per-row quantities that do not depend on the slot being predicted are
    # computed once instead of inside the innermost loop.
    starts = df.starting_time
    day_offset = ((starts.dt.normalize() - pd.Timestamp(train_start_day))
                  .dt.days.values.astype(float))
    hours = starts.dt.hour.values
    minutes = starts.dt.minute.values
    travel = df.avg_travel_time.values.astype(float)

    records = []
    for pred_m, pred_d in pred_days:
        pred_day = date(2016, pred_m, pred_d)
        print(pred_day)
        pred_offset = (pred_day - train_start_day).days
        is_past = day_offset < pred_offset
        for inter, toll in inter_toll:
            on_route = is_past & np.asarray(
                (df.intersection_id == inter) & (df.tollgate_id == toll), dtype=bool)
            for h, m in hour_min:
                unit = on_route & (hours == h) & (minutes == m)
                unit_offset = day_offset[unit]
                # Only strictly earlier days are selected, so the divisor is >= 1.
                weights = a_1 * unit_offset + a_2 * (10.0 / (pred_offset - unit_offset))
                records.append({
                    'intersection_id': inter,
                    'tollgate_id': toll,
                    'starting_time': datetime(2016, int(pred_m), int(pred_d), h, m, 0),
                    'avg_travel_time': _sjh_best_candidate(travel[unit], weights)})
    # Build the frame once; row-wise DataFrame.append is quadratic, no longer
    # exists in current pandas, and turned tollgate_id into floats.
    return pd.DataFrame(records, columns=columns)


In [19]:
# Predict the validation week (2016-10-11..2016-10-17) from the completed training grid.
# NOTE(review): the saved output of this cell lists 10-18..10-24, which
# sjh_predict only prints for the test phase -- the output looks stale; re-run.
validation_pred = sjh_predict(train_follow2h, phase='validation')

[(10, 18), (10, 19), (10, 20), (10, 21), (10, 22), (10, 23), (10, 24)]
2016-10-18
2016-10-19
2016-10-20
2016-10-21
2016-10-22
2016-10-23
2016-10-24


In [20]:
# Number of validation predictions, followed by a preview of the first rows.
print(validation_pred.shape[0])
validation_pred.head()

504


Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
0,A,2.0,2016-10-18 08:00:00,73.0
1,A,2.0,2016-10-18 08:20:00,83.0
2,A,2.0,2016-10-18 08:40:00,84.0
3,A,2.0,2016-10-18 09:00:00,73.0
4,A,2.0,2016-10-18 09:20:00,69.0


In [21]:
# Ground truth for the validation week: the rows of the completed training
# grid dated between VALI_START_DAY and VALI_END_DAY (inclusive).
day_all = train_follow2h.starting_time.dt.date
validation_true = train_follow2h.loc[((day_all >= VALI_START_DAY) & (day_all <= VALI_END_DAY))]
# Both counts must match for the positional comparison that follows.
print(validation_pred.shape[0])
print(validation_true.shape[0])
validation_true.head()

504
504


Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
936,A,2,2016-10-11 08:00:00,68.0925
937,A,2,2016-10-11 08:20:00,75.023636
938,A,2,2016-10-11 08:40:00,75.0032
939,A,2,2016-10-11 09:00:00,69.749091
940,A,2,2016-10-11 09:20:00,55.277222


In [22]:
# Validate against the data of October 11-17.
# cal_mape pairs the two series by position, so this relies on
# validation_pred and validation_true listing the same (day, route, window)
# slots in the same order.
vali_pred_avg = validation_pred.avg_travel_time
vali_true_avg = validation_true.avg_travel_time
vali_mape = cal_mape(df_pred=vali_pred_avg, df_true=vali_true_avg)
print ('validation mape: %.8f' %vali_mape)

validation mape: 0.16725734


In [84]:
# Predict the test week (2016-10-18..2016-10-24) from the completed training grid.
# NOTE(review): the saved preview further down shows fractional values
# (e.g. 72.3) although sjh_predict searches whole-number candidates -- the
# outputs may come from an earlier version of the function; re-run to confirm.
test_pred = sjh_predict(train_follow2h, phase='test')

[(10, 18), (10, 19), (10, 20), (10, 21), (10, 22), (10, 23), (10, 24)]
2016-10-18
2016-10-19
2016-10-20
2016-10-21
2016-10-22
2016-10-23
2016-10-24


In [85]:
# Number of test predictions, followed by a preview of the first rows.
print(test_pred.shape[0])
test_pred.head()

504


Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
0,A,2.0,2016-10-18 08:00:00,72.3
1,A,2.0,2016-10-18 08:20:00,83.2
2,A,2.0,2016-10-18 08:40:00,84.3
3,A,2.0,2016-10-18 09:00:00,72.5
4,A,2.0,2016-10-18 09:20:00,69.3


In [86]:
# Build the output table in the required submission format.
output_csv = test_pred.copy()
start_time = pd.to_datetime(output_csv.starting_time)
# Each window is 20 minutes long; the addition is vectorised over the column.
end_time = start_time + timedelta(minutes=20)
# Time windows are written as the half-open interval "[start,end)".
output_csv['starting_time'] = '[' + start_time.astype(str) + ',' + end_time.astype(str) + ')'
# rename(columns=...) and reindex(columns=...) replace the removed
# rename_axis(mapping, axis=...) and reindex_axis calls.
output_csv = output_csv.rename(columns={'starting_time': 'time_window'})
output_csv = output_csv.reindex(columns=['intersection_id', 'tollgate_id',
                                         'time_window', 'avg_travel_time'])
output_csv.tollgate_id = output_csv.tollgate_id.astype(int)

In [87]:
# Row count of the submission table, followed by a preview of the first rows.
print(output_csv.shape[0])
output_csv.head()

504


Unnamed: 0,intersection_id,tollgate_id,time_window,avg_travel_time
0,A,2,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",72.3
1,A,2,"[2016-10-18 08:20:00,2016-10-18 08:40:00)",83.2
2,A,2,"[2016-10-18 08:40:00,2016-10-18 09:00:00)",84.3
3,A,2,"[2016-10-18 09:00:00,2016-10-18 09:20:00)",72.5
4,A,2,"[2016-10-18 09:20:00,2016-10-18 09:40:00)",69.3


In [88]:
# Write the submission file without the index column.
# NOTE(review): the path is relative to the working directory and the
# `results` folder is not created anywhere in the visible cells -- presumably
# it must already exist.
output_csv.to_csv('results/task1_sjh_weight_82_42_and10.csv', index=False)