In [1]:
import sys
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import math
from datetime import datetime, timedelta, time, date

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
%matplotlib inline

### 使用KNN regressor进行预测
> 共有十份待预测的数据集（五个收费站-方向组合，每个组合分上午、下午分别预测），先用最简单的 KNN。以上午为例：从训练集中找到 6~8 点流量与待预测日最相近的几天，然后用这几天 8~10 点的流量做加权平均，作为测试集的预测值。
计划完成的目标有：
1. 10份数据集的预测
2. 线下验证（用10月11~17日的数据集）

In [2]:
# Load the raw traffic-volume records used by the KNN predictor.
train_path = '../dataset/training/volume(table 6)_training.csv'
test_path = '../dataset/testing_phase1/volume(table 6)_test1.csv'

train_df = pd.read_csv(train_path)
train_df['time'] = pd.to_datetime(train_df['time'])

test_df = pd.read_csv(test_path)
test_df['time'] = pd.to_datetime(test_df['time'])

In [3]:
# Number of training days: 2016-09-19 .. 2016-10-17 with the holiday removed.
NUM_TRAIN_DAYS = 20

# Number of test days: 2016-10-18 .. 2016-10-24.
# NOTE(review): the name is misspelled ("TSET"); kept as-is in case it is referenced.
NUM_TSET_DAYS = 7

# National Day holiday range (inclusive on both ends).
NATIONNAL_START = date(year=2016, month=10, day=1)
NATIONNAL_END = date(year=2016, month=10, day=9)

# Mid-Autumn holiday range (inclusive on both ends).
MID_AUTUMN_START = date(year=2016, month=9, day=15)
MID_AUTUMN_END = date(year=2016, month=9, day=18)

# Full training period.
TRAIN_START_DAY = date(year=2016, month=9, day=19)
TRAIN_END_DAY = date(year=2016, month=10, day=17)

# Offline validation period (the last week of the training period).
VALI_START_DAY = date(year=2016, month=10, day=11)
VALI_END_DAY = date(year=2016, month=10, day=17)

# Test period.
TEST_START_DAY = date(year=2016, month=10, day=18)
TEST_END_DAY = date(year=2016, month=10, day=24)

In [4]:
# Preview the first raw training records (one row per recorded vehicle pass).
train_df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-09-19 23:09:25,2,0,1,0,
1,2016-09-19 23:11:53,2,0,1,0,
2,2016-09-19 23:13:54,2,0,1,0,
3,2016-09-19 23:17:48,1,0,1,1,
4,2016-09-19 23:16:07,2,0,1,0,


In [5]:
def MAPE(pred, true):
    """Return the absolute percentage error |true - pred| / |true|.

    Works element-wise on scalars or numpy arrays. A zero in ``true``
    is not guarded against.

    NOTE(review): a later cell defines another function with the same
    name (a weighted distance); that definition shadows this one.
    """
    error = true - pred
    return abs(error / true)

def cal_mape(pred_values, true_values):
    """Return the mean absolute percentage error between two sequences.

    Parameters
    ----------
    pred_values : array-like of numbers
        Predicted values.
    true_values : array-like of numbers
        Ground-truth values, same length as ``pred_values``.

    Returns
    -------
    float
        mean(|pred - true| / |true|) over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of elements.

    Notes
    -----
    Inputs are converted to float first, so integer inputs are never
    subject to integer division. A zero in ``true_values`` is not masked:
    it yields ``inf``/``nan`` following numpy's division rules.
    """
    pred = np.asarray(pred_values, dtype=float).reshape(-1)
    true = np.asarray(true_values, dtype=float).reshape(-1)
    if pred.shape != true.shape:
        raise ValueError(
            'pred_values and true_values must have the same length '
            '(got %d and %d)' % (pred.size, true.size))
    if pred.size == 0:
        raise ValueError('cannot compute MAPE of empty input')
    return np.mean(np.abs((pred - true) / true))

def per_20min(dt):
    """Floor a datetime to the start of its 20-minute window.

    Returns a new ``datetime`` with minute set to 0, 20 or 40 and seconds
    set to 0; year, month, day and hour are copied from ``dt``.
    """
    window_start_minute = (dt.minute // 20) * 20
    return datetime(dt.year, dt.month, dt.day, dt.hour, window_start_minute, 0)

def remove_holiday(df):
    """Drop every row whose date falls inside the National Day holiday.

    Between 9-19 and 10-17 the only holiday is National Day, so only the
    NATIONNAL_START..NATIONNAL_END range (inclusive) is filtered out.
    ``df`` must have a datetime ``time`` column.
    """
    record_days = df.time.dt.date
    before_holiday = record_days < NATIONNAL_START
    after_holiday = record_days > NATIONNAL_END
    return df.loc[before_holiday | after_holiday]

def select_time(df):
    """Aggregate raw records into 20-minute volumes for the hours of interest.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records with at least ``tollgate_id``, ``direction`` and a
        datetime ``time`` column. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``tollgate_id``, ``direction``, ``time`` (start of the
        20-minute window) and ``volume`` (number of records in the window),
        restricted to 06:00-10:00 and 15:00-19:00 and sorted by
        tollgate, direction and time.

    Notes
    -----
    The input frame is not modified. Timestamps are floored with
    ``Series.dt.floor('20min')`` and the count column is named through
    ``reset_index(name=...)``.
    """
    keys = ['tollgate_id', 'direction', 'time']
    # Work on a copy of the key columns only: the count per group does not
    # depend on the remaining columns, and the caller's frame stays untouched.
    binned = df[keys].copy()
    binned['time'] = binned['time'].dt.floor('20min')

    volume = binned.groupby(keys).size().reset_index(name='volume')

    hour = volume['time'].dt.hour
    in_window = ((hour >= 6) & (hour < 10)) | ((hour >= 15) & (hour < 19))
    return volume.loc[in_window].sort_values(keys)

def slice_am_pm(df):
    """Split ``df`` into (morning, afternoon) parts by the hour of ``time``.

    Rows with hour < 12 go to the first frame, rows with hour >= 12 to the
    second. The original index labels are preserved.
    """
    hour_of_day = df.time.dt.hour
    morning_mask = hour_of_day.lt(12)
    afternoon_mask = hour_of_day.ge(12)
    return df.loc[morning_mask], df.loc[afternoon_mask]

def slice_time(df):
    """Split ``df`` into the observed and the to-be-predicted two-hour windows.

    Returns ``(df_prev2h, df_follow2h)``: rows whose hour is in 6-7 or 15-16
    and rows whose hour is in 8-9 or 17-18, respectively. Rows outside those
    hours appear in neither frame.
    """
    hour_of_day = df.time.dt.hour
    observed_hours = hour_of_day.isin([6, 7, 15, 16])
    target_hours = hour_of_day.isin([8, 9, 17, 18])
    return df.loc[observed_hours], df.loc[target_hours]

def complete_miss_time(df, df_type='train', holiday_start=None, holiday_end=None):
    """Build the complete 20-minute grid of volumes and fill the gaps.

    For every non-holiday day between the earliest and latest timestamp in
    ``df``, every (tollgate, direction) pair and every expected 20-minute
    slot, one row is emitted. Slots missing from ``df`` get NaN, which is
    then filled by linear interpolation over the whole output column.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns ``tollgate_id``, ``direction``, ``time`` (datetime) and
        ``volume``. If a slot occurs more than once the first row wins.
    df_type : str
        ``'test'`` expects only the slots of 06:00-08:00 and 15:00-17:00;
        any other value expects 06:00-10:00 and 15:00-19:00.
    holiday_start, holiday_end : datetime.date, optional
        Inclusive range of days to skip. Default to the module-level
        ``NATIONNAL_START`` / ``NATIONNAL_END``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tollgate_id`` (int), ``direction`` (int), ``time``
        (datetime) and ``volume`` (float), ordered by day, then
        (tollgate, direction), then slot, with a fresh 0..n-1 index.
        An empty frame with these columns is returned for empty input.

    Notes
    -----
    The day range is taken from the minimum and maximum timestamp, so it
    does not depend on the row order of ``df``.
    NOTE(review): interpolation runs over the concatenated column, so a
    gap at the edge of one series is filled using values of the
    neighbouring series; kept as in the original implementation.
    """
    columns = ['tollgate_id', 'direction', 'time', 'volume']

    times = pd.to_datetime(df['time'])
    first_seen, last_seen = times.min(), times.max()
    if pd.isnull(first_seen):
        # No usable timestamp at all (e.g. empty input).
        return pd.DataFrame(columns=columns)
    start_day, end_day = first_seen.date(), last_seen.date()

    # Resolve the holiday range lazily so the module constants stay the default.
    if holiday_start is None:
        holiday_start = NATIONNAL_START
    if holiday_end is None:
        holiday_end = NATIONNAL_END

    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    if df_type == 'test':
        hours = (6, 7, 15, 16)
    else:
        hours = (6, 7, 8, 9, 15, 16, 17, 18)
    hour_min = [(h, m) for h in hours for m in (0, 20, 40)]

    # One pass over the input builds a lookup table; setdefault keeps the
    # first occurrence of a slot.
    observed = {}
    for toll, dire, stamp, vol in zip(df['tollgate_id'], df['direction'],
                                      times, df['volume']):
        observed.setdefault((toll, dire, pd.Timestamp(stamp)), vol)

    records = []
    for offset in range((end_day - start_day).days + 1):
        day = start_day + timedelta(days=offset)
        if holiday_start <= day <= holiday_end:
            continue
        for toll, dire in toll_dire:
            for h, m in hour_min:
                slot = pd.Timestamp(datetime(day.year, day.month, day.day, h, m, 0))
                volume = observed.get((toll, dire, slot), np.nan)
                records.append((toll, dire, slot, volume))

    # Build the frame once instead of appending row by row.
    df_comp = pd.DataFrame(records, columns=columns)
    df_comp['tollgate_id'] = df_comp['tollgate_id'].astype(int)
    df_comp['direction'] = df_comp['direction'].astype(int)
    df_comp['time'] = pd.to_datetime(df_comp['time'])
    df_comp['volume'] = df_comp['volume'].astype(float).interpolate(method='linear')
    return df_comp

In [6]:
# 数据预处理，分别是训练集，验证集和测试集
train_df = remove_holiday(train_df)
train_df = select_time(train_df)
train_df = complete_miss_time(train_df, df_type='train')
train_am, train_pm = slice_am_pm(train_df)
test_df = select_time(test_df)
test_df = complete_miss_time(test_df, df_type='test')
test_am, test_pm = slice_am_pm(test_df)


days_all = train_df.time.dt.date
nonvali_train_df = train_df.loc[((days_all < VALI_START_DAY) | (days_all > VALI_END_DAY))]
nonvali_train_am, nonvali_train_pm = slice_am_pm(nonvali_train_df)
vali_df = train_df.loc[((days_all >= VALI_START_DAY) & (days_all <= VALI_END_DAY))]
vali_prev2h, vali_follow2h = slice_time(vali_df)
vali_prev2h_am, vali_prev2h_pm = slice_am_pm(vali_prev2h)
vali_follow2h_am, vali_follow2h_pm = slice_am_pm(vali_follow2h)

In [7]:
# 数据对比完毕，和官方教程一致
print(train_df.isnull().sum().sum(), test_df.isnull().sum().sum())
print(len(train_df.time.dt.date.unique()))
vali_prev2h_am.head(10)

(0, 0)
20


Unnamed: 0,tollgate_id,direction,time,volume
1560,1,0,2016-10-11 06:00:00,9.0
1561,1,0,2016-10-11 06:20:00,26.0
1562,1,0,2016-10-11 06:40:00,18.0
1563,1,0,2016-10-11 07:00:00,30.0
1564,1,0,2016-10-11 07:20:00,40.0
1565,1,0,2016-10-11 07:40:00,52.0
1584,1,1,2016-10-11 06:00:00,37.0
1585,1,1,2016-10-11 06:20:00,51.0
1586,1,1,2016-10-11 06:40:00,56.0
1587,1,1,2016-10-11 07:00:00,72.0


In [8]:
# Preview the morning part of the completed test set.
test_am.head(10)

Unnamed: 0,tollgate_id,direction,time,volume
0,1,0,2016-10-18 06:00:00,13.0
1,1,0,2016-10-18 06:20:00,17.0
2,1,0,2016-10-18 06:40:00,21.0
3,1,0,2016-10-18 07:00:00,31.0
4,1,0,2016-10-18 07:20:00,28.0
5,1,0,2016-10-18 07:40:00,47.0
12,1,1,2016-10-18 06:00:00,37.0
13,1,1,2016-10-18 06:20:00,47.0
14,1,1,2016-10-18 06:40:00,72.0
15,1,1,2016-10-18 07:00:00,68.0


In [9]:
def generate_knn_set(df_am, df_pm):
    """Reshape per-slot volumes into one (days x slots) matrix per series.

    For the training set each row holds the 12 slots of a half-day window
    (e.g. 06:00-10:00): the first 6 are the features, the last 6 the labels.
    For the test and validation sets each row holds only the first 6 slots.

    Returns two dicts (morning, afternoon) keyed by the position 0..4 of the
    (tollgate, direction) pair in ``toll_dire``. The number of rows of every
    matrix is the number of distinct days found in ``df_am``.
    """
    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    n_days = len(df_am.time.dt.date.unique())

    def _day_matrix(frame, toll, dire):
        # Volumes of one series, one row per day.
        series_rows = frame.loc[(frame.tollgate_id == toll) & (frame.direction == dire)]
        return series_rows.volume.values.reshape(n_days, -1)

    set_am_data = {}
    set_pm_data = {}
    for position, (toll, dire) in enumerate(toll_dire):
        set_am_data[position] = _day_matrix(df_am, toll, dire)
        set_pm_data[position] = _day_matrix(df_pm, toll, dire)
    return set_am_data, set_pm_data

def generate_vali_true(df_am, df_pm):
    """Flatten the ground-truth volumes into vectors ordered by series.

    Parameters
    ----------
    df_am, df_pm : pandas.DataFrame
        Morning / afternoon frames with ``tollgate_id``, ``direction`` and
        ``volume`` columns.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        1-D arrays of the morning and afternoon volumes, concatenated in the
        order of ``toll_dire``; within a series the row order of the input
        frame is kept.

    Notes
    -----
    The rows are selected from the frames passed as arguments, not from
    module-level variables.
    """
    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    vali_true_am = []
    vali_true_pm = []
    for toll, dire in toll_dire:
        df_am_unit = df_am.loc[(df_am.tollgate_id == toll) & (df_am.direction == dire)]
        df_pm_unit = df_pm.loc[(df_pm.tollgate_id == toll) & (df_pm.direction == dire)]
        vali_true_am.append(df_am_unit.volume.values)
        vali_true_pm.append(df_pm_unit.volume.values)
    # concatenate also copes with series of different lengths.
    return np.concatenate(vali_true_am), np.concatenate(vali_true_pm)

In [10]:
# Full training set and test set, used for the final prediction.
train_set_am, train_set_pm = generate_knn_set(train_am, train_pm)
test_set_am, test_set_pm = generate_knn_set(test_am, test_pm)


# Offline validation: training set without the validation days, the
# validation features (first two hours) and the validation ground truth.
nonvali_train_set_am, nonvali_train_set_pm = generate_knn_set(nonvali_train_am, nonvali_train_pm)
vali_set_am, vali_set_pm = generate_knn_set(vali_prev2h_am, vali_prev2h_pm)
vali_true_am, vali_true_pm = generate_vali_true(vali_follow2h_am, vali_follow2h_pm)

In [11]:
# Size of the flattened morning ground-truth vector.
print(vali_true_am.shape)

(210,)


In [12]:
# Inspect the training matrices of the first (tollgate, direction) series.
train_set_am_i = train_set_am[0]
train_set_pm_i = train_set_pm[0]
print(train_set_am_i.shape, train_set_pm_i.shape)
print(train_set_am_i[0:2,:])
print(train_set_pm_i[0:2,:])

((20, 12), (20, 12))
[[  8.  13.  32.  39.  31.  43.  46.  56.  41.  50.  49.  44.]
 [ 13.  20.  29.  38.  30.  31.  53.  50.  45.  51.  68.  59.]]
[[ 48.  57.  43.  42.  46.  55.  36.  34.  24.  29.  27.  18.]
 [ 71.  64.  73.  48.  57.  48.  38.  42.  38.  26.  29.  20.]]


In [13]:
# Inspect the test matrices of the first (tollgate, direction) series.
test_set_am_i = test_set_am[0]
test_set_pm_i = test_set_pm[0]
print(test_set_am_i.shape, test_set_pm_i.shape)
print(test_set_am_i[0:2,:])
print(test_set_pm_i[0:2,:])

((7, 6), (7, 6))
[[ 13.  17.  21.  31.  28.  47.]
 [ 12.  16.  17.  22.  38.  41.]]
[[ 52.  38.  35.  57.  45.  53.]
 [ 40.  34.  50.  39.  40.  36.]]


In [149]:
# Weights applied to the six feature slots when measuring the distance
# between two days; later slots weigh more.
# (Commented-out alternative: W = np.arange(0.1, 3.2, 0.6).)
a = np.arange(0.1, 1.2, 0.5)
b = np.arange(1.6, 3.3, 0.8)
W = np.concatenate([a, b])

def MAPE(x, y):
    """Weighted L1 distance between two feature vectors.

    Computes sum(W * |x - y|); used as the KNN distance metric.
    NOTE(review): despite its name this is not a percentage error, and it
    replaces the earlier function of the same name.
    """
    weighted_gaps = W * np.abs(x - y)
    return np.sum(weighted_gaps)

def knn_fit(set_am, set_pm, n_neighbors=5, weights='distance', algorithm='auto', metric=MAPE):
    """Fit one KNN regressor per series and half-day.

    Ten data sets have to be predicted, so the KNN fitting is wrapped here.
    Each matrix in ``set_am`` / ``set_pm`` (keyed 0..len-1) is split into its
    first 6 columns (features) and the remaining columns (labels).

    Returns two dicts (morning, afternoon) with the fitted
    ``KNeighborsRegressor`` for each key.
    """
    n_features = 6

    def _fit_one(day_matrix):
        # A fresh regressor per matrix, configured from the arguments.
        regressor = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights,
                                        algorithm=algorithm, metric=metric)
        return regressor.fit(day_matrix[:, :n_features], day_matrix[:, n_features:])

    knn_set_am = {}
    knn_set_pm = {}
    for key in range(len(set_am)):
        knn_set_am[key] = _fit_one(set_am[key])
        knn_set_pm[key] = _fit_one(set_pm[key])
    return knn_set_am, knn_set_pm

def knn_predict(knn_set_am, knn_set_pm, set_am, set_pm):
    """Predict with every fitted regressor and flatten the results.

    ``knn_set_*`` and ``set_*`` are indexed by 0..len(set_am)-1. Returns two
    1-D arrays (morning, afternoon) holding the predictions of all series,
    in key order and row-major within a series.
    """
    keys = range(len(set_am))
    pred_list_am = [knn_set_am[key].predict(set_am[key]) for key in keys]
    pred_list_pm = [knn_set_pm[key].predict(set_pm[key]) for key in keys]
    pred_vec_am = np.array(pred_list_am).reshape(-1)
    pred_vec_pm = np.array(pred_list_pm).reshape(-1)
    return pred_vec_am, pred_vec_pm

In [151]:
def validatoin_mape(train_set_am, train_set_pm, vali_set_am, vali_set_pm,
                    vali_true_am, vali_true_pm):
    """Print the validation MAPE for n_neighbors = 1..12.

    For each neighbour count the regressors are fitted on the training sets,
    the validation sets are predicted, and the morning, afternoon and
    combined MAPE are printed. Nothing is returned.
    """
    vali_true = np.concatenate((vali_true_am, vali_true_pm), axis=0)
    for n_neig in range(1, 13):
        knn_set_am, knn_set_pm = knn_fit(train_set_am, train_set_pm, n_neighbors=n_neig)
        vali_pred_am, vali_pred_pm = knn_predict(knn_set_am, knn_set_pm, vali_set_am, vali_set_pm)
        vali_pred = np.concatenate((vali_pred_am, vali_pred_pm), axis=0)
        # Compute all three scores first, then report them together.
        report = (
            ('n_neighbors: %i, validation_mape_am: %.4f',
             cal_mape(pred_values=vali_pred_am, true_values=vali_true_am)),
            ('n_neighbors: %i, validation_mape_pm: %.4f',
             cal_mape(pred_values=vali_pred_pm, true_values=vali_true_pm)),
            ('n_neighbors: %i, validation_mape: %.4f',
             cal_mape(pred_values=vali_pred, true_values=vali_true)),
        )
        for template, score in report:
            print(template % (n_neig, score))

# Run the neighbour-count sweep on the held-out validation week.
validatoin_mape(nonvali_train_set_am, nonvali_train_set_pm, vali_set_am, vali_set_pm,
                vali_true_am, vali_true_pm)


n_neighbors: 1, validation_mape_am: 0.1611
n_neighbors: 1, validation_mape_pm: 0.1982
n_neighbors: 1, validation_mape: 0.1797
n_neighbors: 2, validation_mape_am: 0.1385
n_neighbors: 2, validation_mape_pm: 0.1765
n_neighbors: 2, validation_mape: 0.1575
n_neighbors: 3, validation_mape_am: 0.1323
n_neighbors: 3, validation_mape_pm: 0.1699
n_neighbors: 3, validation_mape: 0.1511
n_neighbors: 4, validation_mape_am: 0.1316
n_neighbors: 4, validation_mape_pm: 0.1584
n_neighbors: 4, validation_mape: 0.1450
n_neighbors: 5, validation_mape_am: 0.1340
n_neighbors: 5, validation_mape_pm: 0.1521
n_neighbors: 5, validation_mape: 0.1431
n_neighbors: 6, validation_mape_am: 0.1323
n_neighbors: 6, validation_mape_pm: 0.1520
n_neighbors: 6, validation_mape: 0.1422
n_neighbors: 7, validation_mape_am: 0.1304
n_neighbors: 7, validation_mape_pm: 0.1471
n_neighbors: 7, validation_mape: 0.1387
n_neighbors: 8, validation_mape_am: 0.1307
n_neighbors: 8, validation_mape_pm: 0.1477
n_neighbors: 8, validation_mape:

In [153]:
# Refit on the non-validation days with the chosen neighbour count and
# inspect the validation predictions.
# NOTE(review): 7 presumably comes from the sweep output above — confirm.
n_neig_vali = 7
knn_set_am, knn_set_pm = knn_fit(nonvali_train_set_am, nonvali_train_set_pm, n_neighbors=n_neig_vali)
vali_pred_am, vali_pred_pm = knn_predict(knn_set_am, knn_set_pm, vali_set_am, vali_set_pm)
print(nonvali_train_set_am[0].shape)
print(vali_pred_am.shape, vali_pred_pm.shape)
print(vali_pred_am[:10])
print(vali_pred_pm[:10])

(13, 12)
((210,), (210,))
[ 47.21856885  55.38295418  51.48485838  49.42944623  53.77077996
  54.00594186  44.28913187  52.72344447  51.84010777  50.3827153 ]
[ 40.58750223  40.0300713   28.44966677  25.58068392  22.60240673
  22.27107402  41.65727332  39.13339482  26.98377921  25.08322745]


In [154]:
# Fit on the full training set and predict the test period.
# NOTE(review): n_neig_test differs from n_neig_vali; the reason for 9 is
# not visible here — confirm.
print(train_set_am[0].shape)
n_neig_test = 9
knn_set_am, knn_set_pm = knn_fit(train_set_am, train_set_pm, n_neighbors=n_neig_test)
test_pred_am, test_pred_pm = knn_predict(knn_set_am, knn_set_pm, test_set_am, test_set_pm)

(20, 12)


In [160]:
# Inspect the flattened test predictions.
print(test_pred_am.shape, test_pred_pm.shape)
print(test_pred_am[:10])
print(test_pred_pm[:10])
# Distinct test days. NOTE(review): `a` reuses the name of a weight array
# defined in an earlier cell and does not appear to be read afterwards.
a = test_df.time.dt.date.unique()

((210,), (210,))
[ 44.1211325   54.31907795  47.75557354  51.10269589  53.11123722
  50.06026153  45.3373537   54.86775639  52.42823342  47.55322128]
[ 42.53379419  39.9155647   30.18085236  27.15144177  25.65775308
  20.57705832  42.76903467  35.48750736  26.67681461  26.653471  ]


In [161]:
def generate_output(pred_am, pred_pm, test_days=None):
    """Attach tollgate, direction and time window to the flat predictions.

    Parameters
    ----------
    pred_am, pred_pm : sequence of float
        Flat morning / afternoon predictions ordered by (tollgate,
        direction) pair, then day, then 20-minute slot.
    test_days : iterable of datetime.date, optional
        Days to label the predictions with. Defaults to the distinct days of
        the module-level ``test_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tollgate_id`` (int), ``direction`` (int), ``time``
        (start of the predicted window) and ``volume``. For every series and
        day the six morning slots (08:00-09:40) come first, followed by the
        six afternoon slots (17:00-18:40).

    The number of consumed morning and afternoon predictions is printed.
    """
    columns = ['tollgate_id', 'direction', 'time', 'volume']
    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    if test_days is None:
        test_days = test_df.time.dt.date.unique()
    # Materialise once: the days are iterated again for every series.
    test_days = list(test_days)
    hour_min_am = [(8, 0), (8, 20), (8, 40), (9, 0), (9, 20), (9, 40)]
    hour_min_pm = [(17, 0), (17, 20), (17, 40), (18, 0), (18, 20), (18, 40)]

    # Collect plain tuples and build the frame once instead of appending
    # to a DataFrame row by row.
    records = []
    i, j = 0, 0
    for toll, dire in toll_dire:
        for test_d in test_days:
            for hour, minute in hour_min_am:
                test_time = datetime(test_d.year, test_d.month, test_d.day, hour, minute, 0)
                records.append((toll, dire, test_time, pred_am[i]))
                i += 1
            for hour, minute in hour_min_pm:
                test_time = datetime(test_d.year, test_d.month, test_d.day, hour, minute, 0)
                records.append((toll, dire, test_time, pred_pm[j]))
                j += 1

    output_df = pd.DataFrame(records, columns=columns)
    output_df['tollgate_id'] = output_df['tollgate_id'].astype(int)
    output_df['direction'] = output_df['direction'].astype(int)
    output_df['time'] = pd.to_datetime(output_df['time'])
    print(i, j)
    return output_df

# Label the test predictions with tollgate, direction and time window.
output_df = generate_output(test_pred_am, test_pred_pm)

(210, 210)


In [162]:
# Check the size of the labelled prediction table and preview it.
print(output_df.shape)
output_df.head(10)

(420, 4)


Unnamed: 0,tollgate_id,direction,time,volume
0,1,0,2016-10-18 08:00:00,44.121132
1,1,0,2016-10-18 08:20:00,54.319078
2,1,0,2016-10-18 08:40:00,47.755574
3,1,0,2016-10-18 09:00:00,51.102696
4,1,0,2016-10-18 09:20:00,53.111237
5,1,0,2016-10-18 09:40:00,50.060262
6,1,0,2016-10-18 17:00:00,42.533794
7,1,0,2016-10-18 17:20:00,39.915565
8,1,0,2016-10-18 17:40:00,30.180852
9,1,0,2016-10-18 18:00:00,27.151442


In [158]:
# 生成对应格式的输出csv文件
output_csv = output_df.copy()
start_time= output_csv.time
end_time = start_time.apply(lambda dt: dt + timedelta(minutes=20))
output_csv['time']  = '['+ start_time.astype(str) + ',' + end_time.astype(str) + ')'
output_csv = output_csv.rename_axis({'time':'time_window'}, axis='columns')
output_csv = output_csv.reindex_axis(['tollgate_id', 'time_window', 'direction', 'volume'],
                            axis='columns')

In [163]:
# Preview the formatted submission table.
output_csv.head(10)

Unnamed: 0,tollgate_id,time_window,direction,volume
0,1,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",0,44.121132
1,1,"[2016-10-18 08:20:00,2016-10-18 08:40:00)",0,54.319078
2,1,"[2016-10-18 08:40:00,2016-10-18 09:00:00)",0,47.755574
3,1,"[2016-10-18 09:00:00,2016-10-18 09:20:00)",0,51.102696
4,1,"[2016-10-18 09:20:00,2016-10-18 09:40:00)",0,53.111237
5,1,"[2016-10-18 09:40:00,2016-10-18 10:00:00)",0,50.060262
6,1,"[2016-10-18 17:00:00,2016-10-18 17:20:00)",0,42.533794
7,1,"[2016-10-18 17:20:00,2016-10-18 17:40:00)",0,39.915565
8,1,"[2016-10-18 17:40:00,2016-10-18 18:00:00)",0,30.180852
9,1,"[2016-10-18 18:00:00,2016-10-18 18:20:00)",0,27.151442


In [164]:
# Write the submission file (without the index column).
# NOTE(review): assumes the `results/` directory already exists.
output_csv.to_csv('results/task2_knn_9_WMAPE.csv', index=False)

In [178]:

# Scratch cell: checks how np.vstack appends 1-D rows to a 2-D array.
# NOTE(review): reuses the names `a` and `b`, overwriting earlier values.
a = np.array([[1,2,3],[2,3,4]])
b = np.array([4,5,6])
c = np.vstack((a,b))
c = np.vstack((c,b))
print(a.shape)
print(c.shape)
print(c)

(2, 3)
(4, 3)
[[1 2 3]
 [2 3 4]
 [4 5 6]
 [4 5 6]]


In [51]:
# Scratch cell: checks the element-wise weighted sum np.sum(a*b).
a = np.array([0.1,0.2,0.3])
b = np.array([1,2,3])
print(np.sum(a*b))

1.4


In [128]:
# Scratch cell: tries another pair of np.arange ranges for a 6-element
# weight vector. The result is stored in `c`, not in `W`.
a = np.arange(0.1,0.8,0.3)
b = np.arange(1.5,2.6,0.5)
c =  np.concatenate((a,b), axis=0)
print(c)

[ 0.1  0.4  0.7  1.5  2.   2.5]
