In [285]:
import sys
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import math
from datetime import datetime, timedelta, time, date

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
%matplotlib inline

### 使用KNN regressor进行预测
> 分成十份待预测的数据集的预测（五个方向，每个方向都分上下午预测），先用最简单的KNN。以上午为例，从训练集中找到最相近的几个6~8.然后用最相近的那几天的8~10做个加权平均作为测试集的预测集。
计划完成的目标有：
1. 10份数据集的预测
2. 线下验证（用10月18~24日的数据集，与代码中的 VALI_START_DAY / VALI_END_DAY 一致）

In [287]:
# Load the raw toll records used for the KNN volume prediction.
train_path_1 = '../dataset/training/volume(table 6)_training.csv'
train_path_2 = '../dataset/dataSet_phase2/volume(table 6)_training2.csv'

test_path = '../dataset/dataSet_phase2/volume(table 6)_test2.csv'

train_df_1 = pd.read_csv(train_path_1)
train_df_2 = pd.read_csv(train_path_2)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True gives the stacked frame a fresh 0..n-1 index, as before.
train_df = pd.concat([train_df_1, train_df_2], ignore_index=True)

test_df = pd.read_csv(test_path)

# Parse the timestamp column so the .dt accessor can be used downstream.
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

In [288]:
# Number of training days: from 9-19 to 10-17 (except holiday),
# i.e. 12 days in September plus 10-10..10-17.
NUM_TRAIN_DAYS = 20

# from 10-18 to 10-24
# NOTE(review): "TSET" looks like a typo for "TEST"; the name is not referenced
# in the visible cells.
NUM_TSET_DAYS = 7

# define Holiday
# The National Day bounds are read by remove_holiday and complete_miss_time.
NATIONNAL_START = date(2016,10,1)
NATIONNAL_END = date(2016,10,9)

# Mid-Autumn bounds; not referenced in the visible cells.
MID_AUTUMN_START = date(2016,9,15)
MID_AUTUMN_END = date(2016,9,18)


# NOTE(review): TRAIN_END_DAY coincides with VALI_END_DAY, while the comment on
# NUM_TRAIN_DAYS stops at 10-17 — confirm which range is intended.
TRAIN_START_DAY = date(2016,9,19)
TRAIN_END_DAY = date(2016,10,24)

# Validation week; used to split train_df into non-validation and validation parts.
VALI_START_DAY = date(2016,10,18)
VALI_END_DAY = date(2016,10,24)

TEST_START_DAY = date(2016,10,25)
TEST_END_DAY = date(2016,10,31)

In [289]:
# Sanity check: frame dimensions and the last raw training records.
print(train_df.shape)
train_df.tail()

(672522, 6)


Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
672517,2016-10-24 21:15:56,1,1,5,0,1.0
672518,2016-10-24 22:19:00,3,1,1,0,0.0
672519,2016-10-24 23:20:35,1,1,1,0,0.0
672520,2016-10-24 20:20:26,3,1,1,0,0.0
672521,2016-10-24 22:11:38,1,1,2,0,1.0


In [291]:
# Sanity check: frame dimensions and the last raw test records.
print(test_df.shape)
test_df.tail()

(30999, 6)


Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
30994,2016-10-31 16:42:10,1,1,2,0,1.0
30995,2016-10-31 15:48:09,3,1,7,1,1.0
30996,2016-10-31 16:59:05,3,1,1,0,0.0
30997,2016-10-31 16:37:38,3,1,5,0,1.0
30998,2016-10-31 15:29:35,3,1,5,0,1.0


In [292]:
def MAPE(pred, true):
    """Return the absolute percentage error |true - pred| / |true|.

    Works element-wise when given arrays. No guard is applied for true == 0.
    """
    relative_error = (true - pred) / true
    return abs(relative_error)

def cal_mape(pred_values, true_values):
    """Return the mean absolute percentage error over paired values.

    Both arguments are indexed positionally from 0 to len(pred_values) - 1.
    Each term is |pred - true| / |true|, so the measure is not symmetric in
    its two arguments.
    """
    n_samples = len(pred_values)
    total_error = sum(
        (abs((pred_values[k] - true_values[k]) / true_values[k])
         for k in range(n_samples)),
        0.0,
    )
    return total_error / n_samples

def per_20min(dt):
    """Return dt rounded down to the start of its 20-minute window.

    The result is a plain datetime whose minute is 0, 20 or 40 and whose
    second is 0.
    """
    window_start_minute = (dt.minute // 20) * 20
    return datetime(dt.year, dt.month, dt.day, dt.hour, window_start_minute, 0)

# Between 9-19 and 10-17 the only holiday is National Day, so only that one is handled.
def remove_holiday(df):
    """Return the rows of df whose date lies outside the National Day holiday.

    A row is kept when its date is strictly before NATIONNAL_START or strictly
    after NATIONNAL_END.
    """
    record_dates = df.time.dt.date
    before_holiday = record_dates < NATIONNAL_START
    after_holiday = record_dates > NATIONNAL_END
    return df.loc[before_holiday | after_holiday]

def select_time(df):
    """Aggregate raw records into 20-minute volumes inside the two study windows.

    Parameters
    ----------
    df : DataFrame
        One row per record, with at least the columns ``tollgate_id``,
        ``direction`` and a datetime ``time``. Any other columns are ignored.
        The input frame is not modified.

    Returns
    -------
    DataFrame
        Columns ``tollgate_id``, ``direction``, ``time`` (start of the
        20-minute window) and ``volume`` (number of records in that window).
        Only windows starting in 06:00-09:59 or 15:00-18:59 are kept, sorted
        by tollgate, direction and time.
    """
    keys = ['tollgate_id', 'direction', 'time']
    # Floor the timestamps on a new frame instead of assigning into the
    # caller's frame, which silently overwrote its raw `time` column.
    binned = df[keys].assign(time=df['time'].dt.floor('20min'))
    # reset_index(name=...) labels the count column directly; the former
    # rename_axis({0: 'volume'}) call no longer renames labels in current pandas.
    counts = binned.groupby(keys).size().reset_index(name='volume')
    hour = counts['time'].dt.hour
    in_window = ((hour >= 6) & (hour < 10)) | ((hour >= 15) & (hour < 19))
    return counts.loc[in_window].sort_values(keys)

def slice_am_pm(df):
    """Split df into (morning, afternoon) rows using the hour of `time`.

    Rows with hour < 12 go to the first frame, rows with hour >= 12 to the second.
    """
    hour_of_day = df.time.dt.hour
    morning_mask = hour_of_day < 12
    afternoon_mask = hour_of_day >= 12
    return df.loc[morning_mask], df.loc[afternoon_mask]

def slice_time(df):
    """Split df into (first two hours, following two hours) of each window.

    The first frame holds rows whose hour is 6, 7, 15 or 16; the second holds
    rows whose hour is 8, 9, 17 or 18. Rows at other hours are in neither.
    """
    hour_of_day = df.time.dt.hour
    prev_mask = hour_of_day.isin((6, 7, 15, 16))
    follow_mask = hour_of_day.isin((8, 9, 17, 18))
    return df.loc[prev_mask], df.loc[follow_mask]

def complete_miss_time(df, df_type='train', holiday_start=None, holiday_end=None):
    """Build a gap-free grid of 20-minute volumes and interpolate missing slots.

    Parameters
    ----------
    df : DataFrame
        Aggregated volumes with columns ``tollgate_id``, ``direction``,
        datetime ``time`` and ``volume``.
    df_type : str
        ``'test'`` builds the 06:00-07:40 and 15:00-16:40 slots only
        (12 per day); any other value builds 06:00-09:40 and 15:00-18:40
        (24 per day).
    holiday_start, holiday_end : datetime.date, optional
        Inclusive range of days to skip. Default to the notebook constants
        NATIONNAL_START / NATIONNAL_END.

    Returns
    -------
    DataFrame
        Columns ``tollgate_id`` (int), ``direction`` (int), ``time``
        (datetime64) and ``volume`` (float). Rows are ordered by day, then
        tollgate/direction pair, then slot. Empty input yields an empty frame.
    """
    columns = ['tollgate_id', 'direction', 'time', 'volume']
    # Resolve the bounds lazily so explicit arguments work without the globals.
    if holiday_start is None:
        holiday_start = NATIONNAL_START
    if holiday_end is None:
        holiday_end = NATIONNAL_END

    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    if df_type == 'test':
        hours = (6, 7, 15, 16)
    else:
        hours = (6, 7, 8, 9, 15, 16, 17, 18)
    hour_min = [(h, m) for h in hours for m in (0, 20, 40)]

    records = []
    if len(df) > 0:
        # min/max instead of first/last row, so the day range does not depend
        # on how the caller happened to sort the frame.
        start_day = df['time'].min().date()
        end_day = df['time'].max().date()

        # One pass to index the observed volumes; setdefault keeps the first
        # match, mirroring the former `volume.values[0]` lookup.
        observed = {}
        for toll, dire, stamp, vol in zip(df['tollgate_id'], df['direction'],
                                          df['time'], df['volume']):
            observed.setdefault((toll, dire, pd.Timestamp(stamp)), vol)

        for offset in range((end_day - start_day).days + 1):
            day = start_day + timedelta(days=offset)
            if holiday_start <= day <= holiday_end:
                continue
            for toll, dire in toll_dire:
                for h, m in hour_min:
                    slot = pd.Timestamp(datetime(day.year, day.month, day.day, h, m, 0))
                    records.append({
                        'tollgate_id': toll,
                        'direction': dire,
                        'time': slot,
                        'volume': observed.get((toll, dire, slot), np.nan),
                    })

    # Build the frame once; DataFrame.append per row was quadratic and was
    # removed in pandas 2.0.
    df_comp = pd.DataFrame(records, columns=columns)
    df_comp['tollgate_id'] = df_comp['tollgate_id'].astype(int)
    df_comp['direction'] = df_comp['direction'].astype(int)
    df_comp['time'] = pd.to_datetime(df_comp['time'])
    # NOTE(review): as in the original, interpolation runs over the whole
    # column in row order, so a gap at a group boundary is filled from the
    # neighbouring tollgate/direction. Kept to preserve existing results.
    df_comp['volume'] = df_comp['volume'].astype(float).interpolate(method='linear')
    return df_comp

In [293]:
# Data preprocessing for the training, validation and test sets.
# (original note, translated: "not necessary for KNN")
# NOTE: train_df and test_df are rebound here, so this cell cannot be re-run
# without first re-running the loading cell.
train_df = select_time(train_df)
train_df = complete_miss_time(train_df, df_type='train')
train_am, train_pm = slice_am_pm(train_df)
test_df = select_time(test_df)
test_df = complete_miss_time(test_df, df_type='test')
test_am, test_pm = slice_am_pm(test_df)


# Hold out the validation week (VALI_START_DAY..VALI_END_DAY) from the training data.
days_all = train_df.time.dt.date
nonvali_train_df = train_df.loc[((days_all < VALI_START_DAY) | (days_all > VALI_END_DAY))]
nonvali_train_am, nonvali_train_pm = slice_am_pm(nonvali_train_df)
vali_df = train_df.loc[((days_all >= VALI_START_DAY) & (days_all <= VALI_END_DAY))]
# First two hours of each window are the validation inputs, the following two the targets.
vali_prev2h, vali_follow2h = slice_time(vali_df)
vali_prev2h_am, vali_prev2h_pm = slice_am_pm(vali_prev2h)
vali_follow2h_am, vali_follow2h_pm = slice_am_pm(vali_follow2h)

In [307]:
# Data check finished; consistent with the official tutorial.
# Prints the remaining null counts, the number of distinct training days and
# a preview of the morning frames.
print(train_df.isnull().sum().sum(), test_df.isnull().sum().sum())
print(len(train_df.time.dt.date.unique()))
print(train_am.head(10))
print(test_am.shape, test_pm.shape)
print(test_am.head(10))

(0, 0)
27
   tollgate_id  direction                time  volume
0            1          0 2016-09-19 06:00:00     8.0
1            1          0 2016-09-19 06:20:00    13.0
2            1          0 2016-09-19 06:40:00    32.0
3            1          0 2016-09-19 07:00:00    39.0
4            1          0 2016-09-19 07:20:00    31.0
5            1          0 2016-09-19 07:40:00    43.0
6            1          0 2016-09-19 08:00:00    46.0
7            1          0 2016-09-19 08:20:00    56.0
8            1          0 2016-09-19 08:40:00    41.0
9            1          0 2016-09-19 09:00:00    50.0
((210, 4), (210, 4))
    tollgate_id  direction                time  volume
0             1          0 2016-10-25 06:00:00    10.0
1             1          0 2016-10-25 06:20:00    17.0
2             1          0 2016-10-25 06:40:00    30.0
3             1          0 2016-10-25 07:00:00    25.0
4             1          0 2016-10-25 07:20:00    38.0
5             1          0 2016-10-25 07:40:0

In [295]:
# Persist the completed 20-minute volume tables.
# NOTE(review): assumes the processed_data/ directory already exists — confirm.
train_df.to_csv('processed_data/train_data_combined.csv', index=False)
test_df.to_csv('processed_data/test_data_combined.csv', index=False)

In [253]:
# For the training set each row holds the 12 slots of 6~10 (morning case):
# the first 6 slots are features and the last 6 are labels.
# For the test and validation sets each row holds only the 6 slots of 6~8.
def generate_knn_set(df_am, df_pm):
    """Reshape morning/afternoon volumes into one (days, slots) array per pair.

    Returns two dicts keyed 0..4, following the order of the five
    tollgate/direction pairs below. Each value is the pair's ``volume``
    column reshaped to one row per day. The number of days is taken from
    df_am and used for both frames.
    """
    toll_dire = [(1,0), (1,1), (2,0), (3,0), (3,1)]
    n_days = len(df_am.time.dt.date.unique())
    set_am_data, set_pm_data = {}, {}
    for key, (toll, dire) in enumerate(toll_dire):
        am_mask = (df_am.tollgate_id == toll) & (df_am.direction == dire)
        pm_mask = (df_pm.tollgate_id == toll) & (df_pm.direction == dire)
        set_am_data[key] = df_am.loc[am_mask].volume.values.reshape(n_days, -1)
        set_pm_data[key] = df_pm.loc[pm_mask].volume.values.reshape(n_days, -1)
    return set_am_data, set_pm_data

def generate_vali_true(df_am, df_pm):
    """Flatten the true validation volumes into one vector per half-day.

    Parameters
    ----------
    df_am, df_pm : DataFrame
        Morning and afternoon target rows with columns ``tollgate_id``,
        ``direction`` and ``volume``.

    Returns
    -------
    (ndarray, ndarray)
        The ``volume`` values of df_am and df_pm, concatenated pair by pair
        in the order of the five tollgate/direction pairs below. Within a
        pair the row order of the input frame is kept.
    """
    toll_dire = [(1,0), (1,1), (2,0), (3,0), (3,1)]
    vali_true_am = []
    vali_true_pm = []
    for toll, dire in toll_dire:
        # Slice the arguments themselves. The previous body indexed the
        # globals vali_follow2h_am / vali_follow2h_pm, so it only worked when
        # the caller happened to pass exactly those frames.
        am_mask = (df_am.tollgate_id == toll) & (df_am.direction == dire)
        pm_mask = (df_pm.tollgate_id == toll) & (df_pm.direction == dire)
        vali_true_am.append(df_am.loc[am_mask].volume.values)
        vali_true_pm.append(df_pm.loc[pm_mask].volume.values)
    # concatenate gives the same flat order as array(...).reshape(-1) and also
    # works when the pairs have different numbers of rows.
    return np.concatenate(vali_true_am), np.concatenate(vali_true_pm)

In [254]:
# Full training data and test inputs, used for the final prediction.
train_set_am, train_set_pm = generate_knn_set(train_am, train_pm)
test_set_am, test_set_pm = generate_knn_set(test_am, test_pm)


# Validation split: fit on the non-validation days, predict from the first two
# hours of the validation week and compare with the following two hours.
nonvali_train_set_am, nonvali_train_set_pm = generate_knn_set(nonvali_train_am, nonvali_train_pm)
vali_set_am, vali_set_pm = generate_knn_set(vali_prev2h_am, vali_prev2h_pm)
vali_true_am, vali_true_pm = generate_vali_true(vali_follow2h_am, vali_follow2h_pm)

In [263]:
# Length of the flattened morning validation targets.
print(vali_true_am.shape)

(210,)


In [264]:
# Preview the (days, slots) arrays of the first tollgate/direction pair (key 0).
train_set_am_i = train_set_am[0]
train_set_pm_i = train_set_pm[0]
print(train_set_am_i.shape, train_set_pm_i.shape)
print(train_set_am_i[0:2,:])
print(train_set_pm_i[0:2,:])

((27, 12), (27, 12))
[[  8.  13.  32.  39.  31.  43.  46.  56.  41.  50.  49.  44.]
 [ 13.  20.  29.  38.  30.  31.  53.  50.  45.  51.  68.  59.]]
[[ 48.  57.  43.  42.  46.  55.  36.  34.  24.  29.  27.  18.]
 [ 71.  64.  73.  48.  57.  48.  38.  42.  38.  26.  29.  20.]]


In [308]:
# Preview the validation inputs of the first tollgate/direction pair (key 0).
vali_set_am_i = vali_set_am[0]
vali_set_pm_i = vali_set_pm[0]
print(vali_set_am_i.shape, vali_set_pm_i.shape)
print(vali_set_am_i[0:2,:])
print(vali_set_pm_i[0:2,:])

((7, 6), (7, 6))
[[ 13.  17.  21.  31.  28.  47.]
 [ 12.  16.  17.  22.  38.  41.]]
[[ 52.  38.  35.  57.  45.  53.]
 [ 40.  34.  50.  39.  40.  36.]]


In [258]:
# W holds the weights used in the distance computation.
# The three pieces are concatenated in order a, b, c; get_knn_features lays out
# its columns as 6 time-slot volumes followed by the row mean and the row std.
# NOTE(review): np.arange with a fractional step depends on float rounding for
# its length; a and b are expected to be about [0.1, 0.6, 1.1] and
# [1.8, 2.7, 3.6], giving 8 weights in total — confirm.
a = np.arange(0.1,1.2,0.5)
b = np.arange(1.8,3.7,0.9)
c = np.array([4.0, 3.2])
W =  np.concatenate((a,b,c), axis=0)

def MAPE(x, y):
    """Return the W-weighted sum of absolute differences between x and y.

    Despite its name this is a weighted L1 distance, not a percentage error.
    It rebinds the name MAPE that an earlier cell defined as MAPE(pred, true);
    knn_fit uses this version as its default ``metric``.
    """
    return np.sum(W * np.abs(x-y))

def _append_summary_stats(times):
    """Return a float32 (rows, 8) array: the 6 slot values, then row mean and row std."""
    n_times = 6
    n_features = 8
    features = np.zeros((times.shape[0], n_features), dtype='float32')
    features[:, :n_times] = times
    # axis=1 computes the statistic over each row.
    features[:, -2] = np.mean(times, axis=1)
    features[:, -1] = np.std(times, axis=1)
    return features


def get_knn_features(times_am, times_pm):
    """Build the KNN feature matrices for the morning and afternoon inputs.

    Each input is a (rows, 6) array of 20-minute volumes. Two extra
    dimensions are added, the row mean and the row standard deviation,
    giving float32 arrays of shape (rows, 8).
    """
    return _append_summary_stats(times_am), _append_summary_stats(times_pm)
    
    
# KNN is wrapped in helpers because ten datasets have to be predicted.
def knn_fit(set_am, set_pm, n_neighbors=5,weights='distance',algorithm='auto',metric=MAPE):
    """Fit one KNeighborsRegressor per key for the morning and afternoon sets.

    set_am / set_pm map keys 0..len-1 to arrays whose first 6 columns are the
    input slots and whose remaining columns are the labels. Returns two dicts
    of fitted regressors with the same keys.
    """
    n_times = 6

    def _fit_one(features, labels):
        # Every regressor shares the hyper-parameters passed to knn_fit.
        model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights,
                                    algorithm=algorithm, metric=metric)
        return model.fit(features, labels)

    knn_set_am, knn_set_pm = {}, {}
    for key in range(len(set_am)):
        rows_am, rows_pm = set_am[key], set_pm[key]
        features_am, features_pm = get_knn_features(rows_am[:, :n_times],
                                                    rows_pm[:, :n_times])
        knn_set_am[key] = _fit_one(features_am, rows_am[:, n_times:])
        knn_set_pm[key] = _fit_one(features_pm, rows_pm[:, n_times:])
    return knn_set_am, knn_set_pm

def knn_predict(knn_set_am, knn_set_pm, set_am, set_pm):
    """Predict with the per-key regressors and flatten the results.

    For each key 0..len(set_am)-1 the inputs are turned into features with
    get_knn_features and passed to that key's regressor. The per-key
    predictions are stacked and flattened into one vector for the morning
    and one for the afternoon.
    """
    preds_am, preds_pm = [], []
    for key in range(len(set_am)):
        model_am, model_pm = knn_set_am[key], knn_set_pm[key]
        features_am, features_pm = get_knn_features(set_am[key], set_pm[key])
        preds_am.append(model_am.predict(features_am))
        preds_pm.append(model_pm.predict(features_pm))
    return np.array(preds_am).reshape(-1), np.array(preds_pm).reshape(-1)

In [259]:
def validatoin_mape(train_set_am, train_set_pm, vali_set_am, vali_set_pm,
                    vali_true_am, vali_true_pm):
    """Print the validation MAPE for n_neighbors = 1..15.

    For every neighbour count the regressors are refit on the training sets,
    the validation inputs are predicted, and three scores are printed: the
    morning MAPE, the afternoon MAPE and the MAPE over both combined.
    """
    vali_true = np.concatenate((vali_true_am, vali_true_pm), axis=0)
    for n_neig in range(1, 16):
        knn_set_am, knn_set_pm = knn_fit(train_set_am, train_set_pm, n_neighbors=n_neig)
        vali_pred_am, vali_pred_pm = knn_predict(knn_set_am, knn_set_pm, vali_set_am, vali_set_pm)
        vali_pred = np.concatenate((vali_pred_am, vali_pred_pm), axis=0)
        comparisons = (
            (vali_pred_am, vali_true_am),
            (vali_pred_pm, vali_true_pm),
            (vali_pred, vali_true),
        )
        # Score everything first, then report, so the output order matches the scores.
        scores = [cal_mape(pred_values=pred, true_values=true) for pred, true in comparisons]
        print('n_neighbors: %i, validation_mape_am: %.4f' %(n_neig, scores[0]))
        print('n_neighbors: %i, validation_mape_pm: %.4f' %(n_neig, scores[1]))
        print('n_neighbors: %i, validation_mape: %.4f' %(n_neig, scores[2]))
        
# Sweep n_neighbors on the held-out validation week.
validatoin_mape(nonvali_train_set_am, nonvali_train_set_pm, vali_set_am, vali_set_pm,
                    vali_true_am, vali_true_pm)


n_neighbors: 1, validation_mape_am: 0.1693
n_neighbors: 1, validation_mape_pm: 0.2515
n_neighbors: 1, validation_mape: 0.2104
n_neighbors: 2, validation_mape_am: 0.1452
n_neighbors: 2, validation_mape_pm: 0.2128
n_neighbors: 2, validation_mape: 0.1790
n_neighbors: 3, validation_mape_am: 0.1370
n_neighbors: 3, validation_mape_pm: 0.2019
n_neighbors: 3, validation_mape: 0.1695
n_neighbors: 4, validation_mape_am: 0.1370
n_neighbors: 4, validation_mape_pm: 0.1923
n_neighbors: 4, validation_mape: 0.1646
n_neighbors: 5, validation_mape_am: 0.1414
n_neighbors: 5, validation_mape_pm: 0.1801
n_neighbors: 5, validation_mape: 0.1607
n_neighbors: 6, validation_mape_am: 0.1404
n_neighbors: 6, validation_mape_pm: 0.1777
n_neighbors: 6, validation_mape: 0.1591
n_neighbors: 7, validation_mape_am: 0.1404
n_neighbors: 7, validation_mape_pm: 0.1780
n_neighbors: 7, validation_mape: 0.1592
n_neighbors: 8, validation_mape_am: 0.1403
n_neighbors: 8, validation_mape_pm: 0.1764
n_neighbors: 8, validation_mape:

In [278]:
# Refit on the non-validation days with a fixed neighbour count and preview the predictions.
# NOTE(review): 9 is presumably taken from the sweep above, whose printed output is truncated — confirm.
n_neig_vali = 9
knn_set_am, knn_set_pm = knn_fit(nonvali_train_set_am, nonvali_train_set_pm, n_neighbors=n_neig_vali)
vali_pred_am, vali_pred_pm = knn_predict(knn_set_am, knn_set_pm, vali_set_am, vali_set_pm)
print(nonvali_train_set_am[0].shape)
print(vali_pred_am.shape, vali_pred_pm.shape)
print(vali_pred_am[:10])
print(vali_pred_pm[:10])

(20, 12)
((210,), (210,))
[ 47.0813038   53.64899503  49.60925577  51.87302159  54.71101957
  52.16840598  46.26689094  54.46953543  52.44920992  48.32777105]
[ 43.99052659  40.49373747  29.80058237  27.73708617  25.68164451
  20.89796929  43.03143059  35.83134076  26.99119054  26.94341547]


In [309]:
# Preview the test inputs of the first tollgate/direction pair (key 0).
test_set_am_i = test_set_am[0]
test_set_pm_i = test_set_pm[0]
print(test_set_am_i.shape, test_set_pm_i.shape)
print(test_set_am_i[0:2,:])
print(test_set_pm_i[0:2,:])

((7, 6), (7, 6))
[[ 10.  17.  30.  25.  38.  44.]
 [ 12.   9.  25.  31.  32.  45.]]
[[ 52.  44.  46.  50.  40.  50.]
 [ 38.  36.  44.  49.  45.  46.]]


In [279]:
# Final model: fit on all training days and predict the test week.
# This rebinds knn_set_am / knn_set_pm from the validation cell.
# NOTE(review): the neighbour count differs from n_neig_vali; the reason is not recorded here.
print(train_set_am[0].shape)
n_neig_test = 11
knn_set_am, knn_set_pm = knn_fit(train_set_am, train_set_pm, n_neighbors=n_neig_test)
test_pred_am, test_pred_pm = knn_predict(knn_set_am, knn_set_pm, test_set_am, test_set_pm)

(27, 12)


In [280]:
# Preview the flattened test predictions.
print(test_pred_am.shape, test_pred_pm.shape)
print(test_pred_am[:10])
print(test_pred_pm[:10])
# NOTE(review): `a` is not read again in the visible cells and rebinds the
# name used while building W.
a = test_df.time.dt.date.unique()

((210,), (210,))
[ 47.56485948  52.74332904  49.28139767  48.74363536  48.27880783
  52.0517563   46.47924998  50.37178537  51.00060992  50.76158913]
[ 43.69252664  40.94893701  28.90628703  27.16561616  23.88564645
  19.56686143  42.4225925   39.43912726  29.19340607  26.80018715]


In [237]:
def generate_output(pred_am, pred_pm, test_days=None):
    """Attach tollgate, direction and window start time to flat predictions.

    Parameters
    ----------
    pred_am, pred_pm : sequence of float
        Flattened predictions ordered by tollgate/direction pair, then day,
        then slot (the order produced by knn_predict).
    test_days : iterable of datetime.date, optional
        Days being predicted. Defaults to the distinct dates of the
        notebook-level ``test_df``.

    Returns
    -------
    DataFrame
        Columns ``tollgate_id`` (int), ``direction`` (int), ``time``
        (datetime64) and ``volume``. For each pair and day the six morning
        slots (08:00-09:40) come first, then the six afternoon slots
        (17:00-18:40). The number of consumed morning and afternoon
        predictions is printed.
    """
    if test_days is None:
        test_days = test_df.time.dt.date.unique()
    toll_dire = [(1,0), (1,1), (2,0), (3,0), (3,1)]
    hour_min_am = [(8,0), (8,20), (8,40), (9,0), (9,20), (9,40)]
    hour_min_pm = [(17,0), (17,20), (17,40), (18,0), (18,20), (18,40)]

    # Collect plain records and build the frame once; DataFrame.append per row
    # was quadratic and was removed in pandas 2.0.
    records = []
    i, j = 0, 0
    for toll, dire in toll_dire:
        for test_d in test_days:
            for hour, minute in hour_min_am:
                test_time = datetime(test_d.year, test_d.month, test_d.day, int(hour), int(minute), 0)
                records.append({'tollgate_id': toll, 'direction': dire,
                                'time': test_time, 'volume': pred_am[i]})
                i += 1
            for hour, minute in hour_min_pm:
                test_time = datetime(test_d.year, test_d.month, test_d.day, int(hour), int(minute), 0)
                records.append({'tollgate_id': toll, 'direction': dire,
                                'time': test_time, 'volume': pred_pm[j]})
                j += 1

    output_df = pd.DataFrame(records, columns=['tollgate_id','direction','time','volume'])
    output_df['tollgate_id'] = output_df['tollgate_id'].astype(int)
    output_df['direction'] = output_df['direction'].astype(int)
    output_df['time'] = pd.to_datetime(output_df['time'])
    print(i, j)
    return output_df

# Label the flat test predictions with tollgate, direction and window start time.
output_df = generate_output(test_pred_am, test_pred_pm)

(210, 210)


In [330]:
# Sanity check: size and first rows of the labelled predictions.
print(output_df.shape)
output_df.head(10)

(420, 4)


Unnamed: 0,tollgate_id,direction,time,volume
0,1,0,2016-10-25 08:00:00,47.564859
1,1,0,2016-10-25 08:20:00,52.743329
2,1,0,2016-10-25 08:40:00,49.281398
3,1,0,2016-10-25 09:00:00,48.743635
4,1,0,2016-10-25 09:20:00,48.278808
5,1,0,2016-10-25 09:40:00,52.051756
6,1,0,2016-10-25 17:00:00,43.692527
7,1,0,2016-10-25 17:20:00,40.948937
8,1,0,2016-10-25 17:40:00,28.906287
9,1,0,2016-10-25 18:00:00,27.165616


In [339]:
# Regroup the predictions so that all rows of one calendar day are contiguous;
# within a day the original row order (and original index labels) is kept.
days_all = output_df.time.dt.date
days_unique = output_df.time.dt.date.unique()
day_frames = []
for this_day in days_unique:
    df_unit = output_df.loc[(days_all == this_day)]
    day_frames.append(df_unit)
# One concat instead of DataFrame.append in a loop (removed in pandas 2.0).
if day_frames:
    output_csv = pd.concat(day_frames)
else:
    output_csv = pd.DataFrame(columns=['tollgate_id', 'direction','time', 'volume'])
output_csv['tollgate_id'] = output_csv['tollgate_id'].astype(int)
output_csv['direction'] = output_csv['direction'].astype(int)
print(output_csv.head(30))

     tollgate_id  direction                time      volume
0              1          0 2016-10-25 08:00:00   47.564859
1              1          0 2016-10-25 08:20:00   52.743329
2              1          0 2016-10-25 08:40:00   49.281398
3              1          0 2016-10-25 09:00:00   48.743635
4              1          0 2016-10-25 09:20:00   48.278808
5              1          0 2016-10-25 09:40:00   52.051756
6              1          0 2016-10-25 17:00:00   43.692527
7              1          0 2016-10-25 17:20:00   40.948937
8              1          0 2016-10-25 17:40:00   28.906287
9              1          0 2016-10-25 18:00:00   27.165616
10             1          0 2016-10-25 18:20:00   23.885646
11             1          0 2016-10-25 18:40:00   19.566861
84             1          1 2016-10-25 08:00:00  116.843146
85             1          1 2016-10-25 08:20:00  120.142943
86             1          1 2016-10-25 08:40:00  118.494636
87             1          1 2016-10-25 0

In [340]:
# Generate the output table in the required CSV format: the start time is
# replaced by a "[start,end)" 20-minute window string in a `time_window` column.
start_time = output_csv.time
# Vectorised add instead of a row-wise apply.
end_time = start_time + timedelta(minutes=20)
time_window = '['+ start_time.astype(str) + ',' + end_time.astype(str) + ')'
# assign + column selection replaces rename_axis(mapping) / reindex_axis,
# which no longer exist in that form in current pandas.
output_csv = (output_csv
              .assign(time_window=time_window)
              .loc[:, ['tollgate_id', 'time_window', 'direction', 'volume']])

In [341]:
# Preview the submission-formatted table.
output_csv.head(30)

Unnamed: 0,tollgate_id,time_window,direction,volume
0,1,"[2016-10-25 08:00:00,2016-10-25 08:20:00)",0,47.564859
1,1,"[2016-10-25 08:20:00,2016-10-25 08:40:00)",0,52.743329
2,1,"[2016-10-25 08:40:00,2016-10-25 09:00:00)",0,49.281398
3,1,"[2016-10-25 09:00:00,2016-10-25 09:20:00)",0,48.743635
4,1,"[2016-10-25 09:20:00,2016-10-25 09:40:00)",0,48.278808
5,1,"[2016-10-25 09:40:00,2016-10-25 10:00:00)",0,52.051756
6,1,"[2016-10-25 17:00:00,2016-10-25 17:20:00)",0,43.692527
7,1,"[2016-10-25 17:20:00,2016-10-25 17:40:00)",0,40.948937
8,1,"[2016-10-25 17:40:00,2016-10-25 18:00:00)",0,28.906287
9,1,"[2016-10-25 18:00:00,2016-10-25 18:20:00)",0,27.165616


In [342]:
# Write the submission file.
# NOTE(review): assumes the results_phase2/ directory already exists — confirm.
output_csv.to_csv('results_phase2/task2_knn_11_WMAPE_MEAN_STD.csv', index=False)

In [13]:
# Scratch check of the layout used by get_knn_features: raw values first,
# then row mean and row std in the last two columns.
# NOTE: this rebinds the module-level names `a` and `b` used while building W.
a = np.array([[1,2],[3,4]])
b = np.zeros((2,4),dtype='float32')
b[:,:2] = a
b[:,-2] = np.mean(a, axis=1)
b[:,-1] = np.std(a, axis=1)
print(a)
print(b)

[[1 2]
 [3 4]]
[[ 1.   2.   1.5  0.5]
 [ 3.   4.   3.5  0.5]]


In [354]:
# True volumes of the validation week (the following-two-hour slots only),
# plus several saved prediction files to compare against each other.
_,train_follow2h = slice_time(train_df)
day_all = train_follow2h.time.dt.date
vali_true = train_follow2h.loc[((day_all >= VALI_START_DAY) & (day_all <= VALI_END_DAY))]
knn_pred = pd.read_csv('results_phase2/task2_knn_11_WMAPE_MEAN_STD.csv')
sjh_pred  =pd.read_csv('results_phase2/task2_phase2_sjh_35_65and10_.csv')
zjx_pred_2 = pd.read_csv('results_phase2/task2_n2.csv')
zjx_pred_3 = pd.read_csv('results_phase2/task2_n3.csv')
# NOTE(review): the comparisons below pair rows by position, so every file is
# presumably already in the same row order — confirm before trusting the scores.
# knn_pred = knn_pred.sort_values(['tollgate_id', 'direction'])
# sjh_pred = sjh_pred.sort_values(['tollgate_id', 'direction'])
# zjx_pred = zjx_pred.sort_values(['tollgate_id', 'direction'])
# vali_true = vali_true.sort_values(['tollgate_id', 'direction'])
print(vali_true.shape)
print(vali_true.head(10))

(420, 4)
      tollgate_id  direction                time  volume
2406            1          0 2016-10-18 08:00:00    50.0
2407            1          0 2016-10-18 08:20:00    41.0
2408            1          0 2016-10-18 08:40:00    49.0
2409            1          0 2016-10-18 09:00:00    61.0
2410            1          0 2016-10-18 09:20:00    39.0
2411            1          0 2016-10-18 09:40:00    46.0
2418            1          0 2016-10-18 17:00:00    40.0
2419            1          0 2016-10-18 17:20:00    42.0
2420            1          0 2016-10-18 17:40:00    23.0
2421            1          0 2016-10-18 18:00:00    27.0


In [358]:
# Pairwise MAPE between the saved prediction files. cal_mape divides by the
# "true" argument, so it is not symmetric and each pair is scored both ways.
sjh_knn_mape = cal_mape(true_values=knn_pred.volume.values, pred_values=sjh_pred.volume.values)
knn_sjh_mape = cal_mape(true_values=sjh_pred.volume.values, pred_values=knn_pred.volume.values)
sjh_zjx_2_mape = cal_mape(true_values=zjx_pred_2.volume.values, pred_values=sjh_pred.volume.values)
zjx_2_sjh_mape = cal_mape(true_values=sjh_pred.volume.values, pred_values=zjx_pred_2.volume.values)
sjh_zjx_3_mape = cal_mape(true_values=zjx_pred_3.volume.values, pred_values=sjh_pred.volume.values)
zjx_3_sjh_mape = cal_mape(true_values=sjh_pred.volume.values, pred_values=zjx_pred_3.volume.values)

zjx_2_knn_mape = cal_mape(true_values=knn_pred.volume.values, pred_values=zjx_pred_2.volume.values)
knn_zjx_2_mape = cal_mape(true_values=zjx_pred_2.volume.values, pred_values=knn_pred.volume.values)
zjx_3_knn_mape = cal_mape(true_values=knn_pred.volume.values, pred_values=zjx_pred_3.volume.values)
knn_zjx_3_mape = cal_mape(true_values=zjx_pred_3.volume.values, pred_values=knn_pred.volume.values)

zjx_2_zjx_3_mape = cal_mape(true_values=zjx_pred_3.volume.values, pred_values=zjx_pred_2.volume.values)
zjx_3_zjx_2_mape = cal_mape(true_values=zjx_pred_2.volume.values, pred_values=zjx_pred_3.volume.values)


print('sjh_pred_knn_true_mape: %.4f, knn_pred_sjh_true: %.4f' %(sjh_knn_mape, knn_sjh_mape))
print('sjh_pred_zjx_2_true_mape: %.4f, zjx_2_pred_sjh_true: %.4f' %(sjh_zjx_2_mape, zjx_2_sjh_mape))
# Label fixed: the second value on this line is zjx_3_sjh_mape, but it was
# printed under the name zjx_2_pred_sjh_true.
print('sjh_pred_zjx_3_true_mape: %.4f, zjx_3_pred_sjh_true: %.4f' %(sjh_zjx_3_mape, zjx_3_sjh_mape))

print('\n')
print('zjx_2_pred_knn_true_mape: %.4f, knn_pred_zjx_2_true: %.4f' %(zjx_2_knn_mape, knn_zjx_2_mape))
print('zjx_3_pred_knn_true_mape: %.4f, knn_pred_zjx_3_true: %.4f' %(zjx_3_knn_mape, knn_zjx_3_mape))

print('\n')
print('zjx_2_pred_zjx_3_true_mape: %.4f, zjx_3_pred_zjx_2_true: %.4f' %(zjx_2_zjx_3_mape, zjx_3_zjx_2_mape))



print('\n')
# Score the zjx files against the true validation-week volumes.
# NOTE(review): this assumes those files cover the validation week in the same
# row order as vali_true — confirm.
zjx_2_vali_mape = cal_mape(true_values=vali_true.volume.values, pred_values=zjx_pred_2.volume.values)
vali_zjx_2_mape = cal_mape(true_values=zjx_pred_2.volume.values, pred_values=vali_true.volume.values)
zjx_3_vali_mape = cal_mape(true_values=vali_true.volume.values, pred_values=zjx_pred_3.volume.values)
vali_zjx_3_mape = cal_mape(true_values=zjx_pred_3.volume.values, pred_values=vali_true.volume.values)
print('zjx_2_pred_vali_true_mape: %.4f, vali_pred_zjx_2_true: %.4f' %(zjx_2_vali_mape, vali_zjx_2_mape))
print('zjx_3_pred_vali_true_mape: %.4f, vali_pred_zjx_3_true: %.4f' %(zjx_3_vali_mape, vali_zjx_3_mape))

sjh_pred_knn_true_mape: 0.0956, knn_pred_sjh_true: 0.1164
sjh_pred_zjx_2_true_mape: 0.3522, zjx_2_pred_sjh_true: 0.2604
sjh_pred_zjx_3_true_mape: 0.4070, zjx_2_pred_sjh_true: 0.3483


zjx_2_pred_knn_true_mape: 0.1755, knn_pred_zjx_2_true: 0.2888
zjx_3_pred_knn_true_mape: 0.2176, knn_pred_zjx_3_true: 0.3236


zjx_2_pred_zjx_3_true_mape: 0.1371, zjx_3_pred_zjx_2_true: 0.1447


zjx_2_pred_vali_true_mape: 0.2860, vali_pred_zjx_2_true: 0.3255
zjx_3_pred_vali_true_mape: 0.3626, vali_pred_zjx_3_true: 0.3648


In [77]:
# Scratch check: int() parses a numeric string. Rebinds the module-level name `a`.
a = '123'
print(int(a) == 123)

True
