In [20]:
import sys
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import math
from datetime import datetime, timedelta, time, date

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
%matplotlib inline

### 使用KNN regressor进行预测
> 先用最简单的 KNN。以上午为例：从训练集中找出 6~8 点时段流量与待预测日最相近的几天，然后对这几天 8~10 点时段的流量做加权平均，作为测试集的预测值。

计划完成的目标有：
1. 10份数据集的预测
2. 线下验证（用10月11~17日的数据集）

In [21]:
# Load the raw records used for the KNN volume prediction.
# Both paths are relative to the notebook's working directory.
train_path = '../dataset/training/volume(table 6)_training.csv'
test_path = '../dataset/testing_phase1/volume(table 6)_test1.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
# Parse `time` into datetimes so the helper functions can use the `.dt` accessor.
train_df.time = pd.to_datetime(train_df.time)
test_df.time = pd.to_datetime(test_df.time)

In [22]:
# Number of training days: 2016-09-19 to 2016-10-17 with the holiday days excluded
NUM_TRAIN_DAYS = 20

# Number of test days: 2016-10-18 to 2016-10-24
# NOTE(review): "TSET" looks like a typo for "TEST"; the name is kept because
# other cells may reference it.
NUM_TSET_DAYS = 7

# National Day holiday range. remove_holiday / complete_miss_time keep only days
# strictly before the start or strictly after the end, so both endpoints are excluded.
# NOTE(review): "NATIONNAL" is a misspelling of "NATIONAL"; kept because the
# helper functions reference these exact names.
NATIONNAL_START = date(2016,10,1)
NATIONNAL_END = date(2016,10,9)

# Mid-Autumn holiday range; not referenced by any code visible in this notebook.
MID_AUTUMN_START = date(2016,9,15)
MID_AUTUMN_END = date(2016,9,18)


# Full training period
TRAIN_START_DAY = date(2016,9,19)
TRAIN_END_DAY = date(2016,10,17)

# Offline validation period (the last week of the training period)
VALI_START_DAY = date(2016,10,11)
VALI_END_DAY = date(2016,10,17)

# Test period
TEST_START_DAY = date(2016,10,18)
TEST_END_DAY = date(2016,10,24)

In [23]:
# Preview the first rows of the raw training records.
train_df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-09-19 23:09:25,2,0,1,0,
1,2016-09-19 23:11:53,2,0,1,0,
2,2016-09-19 23:13:54,2,0,1,0,
3,2016-09-19 23:17:48,1,0,1,1,
4,2016-09-19 23:16:07,2,0,1,0,


In [24]:
def MAPE(pred, true):
    """Return the absolute percentage error of ``pred`` relative to ``true``.

    No averaging is done here; with NumPy arrays the result is element-wise.
    """
    relative_error = (true - pred) / true
    return abs(relative_error)

def cal_mape(pred_values, true_values):
    """Mean absolute percentage error of two positionally aligned sequences.

    Args:
        pred_values: indexable sequence of predictions.
        true_values: indexable sequence of ground-truth values, at least as
            long as ``pred_values``.

    Returns:
        The average of ``|pred - true| / |true|`` over ``len(pred_values)`` items.
    """
    n_samples = len(pred_values)
    running_total = 0.0
    for idx in range(n_samples):
        predicted = pred_values[idx]
        actual = true_values[idx]
        running_total += abs((predicted - actual) / actual)
    return running_total / n_samples

def per_20min(dt):
    """Floor a datetime to the start of its 20-minute window.

    Args:
        dt: any object exposing ``year``, ``month``, ``day``, ``hour`` and
            ``minute`` (``datetime.datetime`` or ``pandas.Timestamp``).

    Returns:
        A naive ``datetime.datetime`` whose minute is 0, 20 or 40 and whose
        seconds are zero.
    """
    # Integer floor division gives the window index directly; no float
    # round-trip through math.floor is needed.
    window_start = (dt.minute // 20) * 20
    return datetime(dt.year, dt.month, dt.day, dt.hour, window_start, 0)

# The 09-19 .. 10-17 window only contains the National Day holiday, so that is
# the only holiday filtered out here.
def remove_holiday(df):
    """Drop every row whose date lies within [NATIONNAL_START, NATIONNAL_END].

    Args:
        df: frame with a datetime ``time`` column.

    Returns:
        The rows of ``df`` dated strictly before the holiday start or strictly
        after the holiday end.
    """
    record_dates = df.time.dt.date
    before_holiday = record_dates < NATIONNAL_START
    after_holiday = record_dates > NATIONNAL_END
    return df.loc[before_holiday | after_holiday]

def select_time(df):
    """Aggregate raw records into 20-minute volumes for the rush-hour windows.

    Each row of ``df`` counts as one unit of volume. Rows are bucketed by
    (tollgate_id, direction, 20-minute window) and only windows starting in
    06:00-09:40 or 15:00-18:40 are kept.

    Args:
        df: frame with ``tollgate_id``, ``direction`` and a datetime ``time``
            column. Any other columns are ignored. The frame is not modified.

    Returns:
        A new frame with columns ``tollgate_id``, ``direction``, ``time``
        (window start) and ``volume`` (row count), sorted by the first three.
    """
    # Floor the timestamps in a derived frame (same 20-minute buckets as
    # per_20min) so that the caller's `time` column is left untouched.
    windowed = df.assign(time=df['time'].dt.floor('20min'))

    # reset_index(name=...) names the count column directly; renaming column 0
    # through rename_axis with a mapping does not rename labels in current pandas.
    counts = (windowed
              .groupby(['tollgate_id', 'direction', 'time'])
              .size()
              .reset_index(name='volume'))

    hour = counts['time'].dt.hour
    in_morning = (hour >= 6) & (hour < 10)
    in_afternoon = (hour >= 15) & (hour < 19)
    counts = counts.loc[in_morning | in_afternoon]
    return counts.sort_values(['tollgate_id', 'direction', 'time'])

def slice_am_pm(df):
    """Split ``df`` into rows before noon and rows from noon onwards.

    Args:
        df: frame with a datetime ``time`` column.

    Returns:
        ``(df_am, df_pm)`` where ``df_am`` holds rows with hour < 12 and
        ``df_pm`` holds rows with hour >= 12.
    """
    hour_of_day = df['time'].dt.hour
    morning_rows = df.loc[hour_of_day.lt(12)]
    afternoon_rows = df.loc[hour_of_day.ge(12)]
    return morning_rows, afternoon_rows

def slice_time(df):
    """Split ``df`` into the leading and the trailing two hours of each window.

    Args:
        df: frame with a datetime ``time`` column.

    Returns:
        ``(df_prev2h, df_follow2h)``: rows whose hour is 6, 7, 15 or 16, and
        rows whose hour is 8, 9, 17 or 18. Rows at other hours appear in neither.
    """
    hour_of_day = df['time'].dt.hour
    leading_hours = [6, 7, 15, 16]
    trailing_hours = [8, 9, 17, 18]
    leading_rows = df.loc[hour_of_day.isin(leading_hours)]
    trailing_rows = df.loc[hour_of_day.isin(trailing_hours)]
    return leading_rows, trailing_rows

def complete_miss_time(df, df_type='train', holiday_start=None, holiday_end=None):
    """Expand ``df`` to a full (day, tollgate/direction, 20-minute slot) grid.

    Every non-holiday day between the earliest and latest date in ``df`` gets
    one row per tollgate/direction pair and per time slot. Slots with no
    matching row in ``df`` get a volume filled by linear interpolation.

    Args:
        df: frame with ``tollgate_id``, ``direction``, datetime ``time`` and
            ``volume`` columns.
        df_type: ``'test'`` builds only the 06:00-07:40 and 15:00-16:40 slots
            (12 per day); any other value builds 06:00-09:40 and 15:00-18:40
            (24 per day).
        holiday_start: first excluded day (inclusive). Defaults to the
            module-level ``NATIONNAL_START``.
        holiday_end: last excluded day (inclusive). Defaults to the
            module-level ``NATIONNAL_END``.

    Returns:
        A new frame with columns ``tollgate_id`` (int), ``direction`` (int),
        ``time`` (datetime) and ``volume`` (float), ordered by day, then
        tollgate/direction pair, then slot. An empty ``df`` yields an empty
        frame with those four columns.
    """
    columns = ['tollgate_id', 'direction', 'time', 'volume']
    if len(df) == 0:
        # Nothing to span; previously this raised IndexError on values[0].
        return pd.DataFrame(columns=columns)

    if holiday_start is None:
        holiday_start = NATIONNAL_START
    if holiday_end is None:
        holiday_end = NATIONNAL_END

    toll_dire = [(1, 0), (1, 1), (2, 0), (3, 0), (3, 1)]
    if df_type == 'test':
        slot_hours = [6, 7, 15, 16]
    else:
        slot_hours = [6, 7, 8, 9, 15, 16, 17, 18]
    hour_min = [(h, m) for h in slot_hours for m in (0, 20, 40)]

    # Use the true date extent rather than the first/last row, so the result
    # does not depend on how the input happens to be ordered.
    start_day = df['time'].min().date()
    end_day = df['time'].max().date()

    # One pass over the input builds an O(1) lookup; setdefault keeps the first
    # occurrence, matching the former `volume.values[0]` choice.
    observed = {}
    for toll, dire, stamp, vol in zip(df['tollgate_id'], df['direction'],
                                      df['time'], df['volume']):
        observed.setdefault((toll, dire, stamp), vol)

    # Collect plain tuples and build the frame once; appending row by row to a
    # DataFrame is quadratic and DataFrame.append no longer exists in pandas 2.
    records = []
    for offset in range((end_day - start_day).days + 1):
        day = start_day + timedelta(days=offset)
        if holiday_start <= day <= holiday_end:
            continue
        for toll, dire in toll_dire:
            for h, m in hour_min:
                stamp = pd.Timestamp(datetime(day.year, day.month, day.day, h, m, 0))
                volume = observed.get((toll, dire, stamp), np.nan)
                records.append((toll, dire, stamp, volume))

    df_comp = pd.DataFrame(records, columns=columns)
    df_comp['tollgate_id'] = df_comp['tollgate_id'].astype(int)
    df_comp['direction'] = df_comp['direction'].astype(int)
    df_comp['time'] = pd.to_datetime(df_comp['time'])
    # NOTE(review): interpolation runs over the whole flattened column, so a gap
    # next to a group boundary is filled from the neighbouring
    # tollgate/direction/day. Kept as-is so existing results do not change.
    df_comp['volume'] = df_comp['volume'].astype(float).interpolate(method='linear')
    return df_comp

In [25]:
# Preprocess the data into the training, validation and test sets.
# NOTE: train_df / test_df are rebound to their aggregated form here, so this
# cell is not idempotent; reload the raw CSVs before running it again.
train_df = remove_holiday(train_df)
train_df = select_time(train_df)
train_df = complete_miss_time(train_df, df_type='train')
train_am, train_pm = slice_am_pm(train_df)
# The test frame is aggregated and completed the same way, with df_type='test'.
test_df = select_time(test_df)
test_df = complete_miss_time(test_df, df_type='test')
test_am, test_pm = slice_am_pm(test_df)


# Hold out VALI_START_DAY..VALI_END_DAY (inclusive) for offline validation;
# every other training day forms the "nonvali" training subset.
days_all = train_df.time.dt.date
nonvali_train_df = train_df.loc[((days_all < VALI_START_DAY) | (days_all > VALI_END_DAY))]
nonvali_train_am, nonvali_train_pm = slice_am_pm(nonvali_train_df)
vali_df = train_df.loc[((days_all >= VALI_START_DAY) & (days_all <= VALI_END_DAY))]
vali_am, vali_pm = slice_am_pm(vali_df)

In [29]:
# Data check finished; the author notes the result is consistent with the official tutorial.
# Total number of missing values in the train and test frames.
print(train_df.isnull().sum().sum(), test_df.isnull().sum().sum())
# Number of distinct days in the full training set, the non-validation subset
# and the validation subset.
print(len(train_df.time.dt.date.unique()), len(nonvali_train_df.time.dt.date.unique()),
     len(vali_df.time.dt.date.unique()))
# Shapes of the morning / afternoon halves of each split.
print(train_am.shape, train_pm.shape)
print(nonvali_train_am.shape, nonvali_train_pm.shape)
print(vali_am.shape, vali_pm.shape)
vali_am.head(20)

(0, 0)
(20, 13, 7)
((1200, 4), (1200, 4))
((780, 4), (780, 4))
((420, 4), (420, 4))


Unnamed: 0,tollgate_id,direction,time,volume
1560,1,0,2016-10-11 06:00:00,9.0
1561,1,0,2016-10-11 06:20:00,26.0
1562,1,0,2016-10-11 06:40:00,18.0
1563,1,0,2016-10-11 07:00:00,30.0
1564,1,0,2016-10-11 07:20:00,40.0
1565,1,0,2016-10-11 07:40:00,52.0
1566,1,0,2016-10-11 08:00:00,49.0
1567,1,0,2016-10-11 08:20:00,49.0
1568,1,0,2016-10-11 08:40:00,40.0
1569,1,0,2016-10-11 09:00:00,45.0


In [28]:
# Shapes of the morning / afternoon test frames, followed by a preview of the morning rows.
print(test_am.shape, test_pm.shape)
test_am.head(20)

((210, 4), (210, 4))


Unnamed: 0,tollgate_id,direction,time,volume
0,1,0,2016-10-18 06:00:00,13.0
1,1,0,2016-10-18 06:20:00,17.0
2,1,0,2016-10-18 06:40:00,21.0
3,1,0,2016-10-18 07:00:00,31.0
4,1,0,2016-10-18 07:20:00,28.0
5,1,0,2016-10-18 07:40:00,47.0
12,1,1,2016-10-18 06:00:00,37.0
13,1,1,2016-10-18 06:20:00,47.0
14,1,1,2016-10-18 06:40:00,72.0
15,1,1,2016-10-18 07:00:00,68.0


In [30]:
# Rows built with the default width hold 12 slots (e.g. 06:00-09:40 for the
# morning): the first 6 are used as features and the last 6 as labels.
# Rows built with df_type='test' hold only the 6 feature slots.
# Validation frames are also reshaped to 12 slots and split by generate_vali.
def generate_knn_arr(df_am, df_pm, df_type='train'):
    """Reshape the ``volume`` columns into one row per run of consecutive slots.

    Args:
        df_am: morning frame with a ``volume`` column.
        df_pm: afternoon frame with a ``volume`` column.
        df_type: ``'test'`` selects 6 columns per row; any other value selects 12.

    Returns:
        ``(arr_am_data, arr_pm_data)``, two 2-D arrays of shape ``(-1, 6)`` or
        ``(-1, 12)``.
    """
    slots_per_row = 6 if df_type == 'test' else 12
    morning, afternoon = (frame.volume.values.reshape(-1, slots_per_row)
                          for frame in (df_am, df_pm))
    return morning, afternoon

def generate_vali(arr_am, arr_pm):
    """Split 2-D arrays column-wise into features (first 6) and labels (rest).

    Args:
        arr_am: 2-D morning array.
        arr_pm: 2-D afternoon array.

    Returns:
        ``(feature_am, label_am, feature_pm, label_pm)``.
    """
    split_at = 6

    def _split(arr):
        # Columns [0, split_at) are features; the remaining columns are labels.
        return arr[:, :split_at], arr[:, split_at:]

    (feature_am, label_am), (feature_pm, label_pm) = _split(arr_am), _split(arr_pm)
    return feature_am, label_am, feature_pm, label_pm

In [31]:
# Full training set (12 slots per row) and test set (6 feature slots per row).
train_arr_am, train_arr_pm = generate_knn_arr(train_am, train_pm, df_type='train')
test_arr_am, test_arr_pm = generate_knn_arr(test_am, test_pm, df_type='test')


# Offline validation: fit on the non-validation days, evaluate on the held-out days.
nonvali_train_arr_am, nonvali_train_arr_pm = generate_knn_arr(nonvali_train_am, nonvali_train_pm,
                                                             df_type='train')
# 'vali' is not 'test', so validation rows are 12 wide and are split into
# features / labels right after.
vali_arr_am, vali_arr_pm = generate_knn_arr(vali_am, vali_pm, df_type='vali')
vali_feature_am, vali_label_am, vali_feature_pm, vali_label_pm = generate_vali(vali_arr_am, vali_arr_pm)

In [33]:
# Inspect the training arrays: shapes, then the first two morning and afternoon rows.
print(train_arr_am.shape, train_arr_pm.shape)
print(train_arr_am[0:2,:])
print(train_arr_pm[0:2,:])

((100, 12), (100, 12))
[[   8.   13.   32.   39.   31.   43.   46.   56.   41.   50.   49.   44.]
 [  39.   57.   72.   70.   83.  118.  123.  119.  116.  120.  151.  131.]]
[[  48.   57.   43.   42.   46.   55.   36.   34.   24.   29.   27.   18.]
 [  77.  104.   92.  110.  122.  116.   96.  116.  112.   67.   94.   65.]]


In [34]:
# Inspect the validation split: feature / label shapes, then the first two morning rows of each.
print(vali_feature_am.shape, vali_feature_pm.shape)
print(vali_label_am.shape, vali_label_pm.shape)
print(vali_feature_am[0:2,:])
print(vali_label_am[0:2,:])

((35, 6), (35, 6))
((35, 6), (35, 6))
[[  9.  26.  18.  30.  40.  52.]
 [ 37.  51.  56.  72.  94.  93.]]
[[  49.   49.   40.   45.   47.   44.]
 [ 130.  108.  126.  129.  108.  123.]]


In [35]:
# Inspect the test feature arrays: shapes, then the first two morning and afternoon rows.
print(test_arr_am.shape, test_arr_pm.shape)
print(test_arr_am[0:2,:])
print(test_arr_pm[0:2,:])

((35, 6), (35, 6))
[[  13.   17.   21.   31.   28.   47.]
 [  37.   47.   72.   68.   94.  105.]]
[[  52.   38.   35.   57.   45.   53.]
 [  72.  102.  116.  102.   84.  113.]]


In [52]:
def knn_fit(arr_am, arr_pm, n_neighbors=5,weights='distance',algorithm='auto',metric='minkowski'):
    """Fit one KNeighborsRegressor for the morning and one for the afternoon.

    Each input row is split into its first 6 columns (features) and its
    remaining columns (multi-output labels).

    Args:
        arr_am: 2-D morning array.
        arr_pm: 2-D afternoon array.
        n_neighbors, weights, algorithm, metric: forwarded unchanged to
            ``KNeighborsRegressor``.

    Returns:
        ``(knn_reg_am, knn_reg_pm)``, the two fitted regressors.
    """
    split_at = 6

    def _fit(arr):
        # Features are the leading columns, labels the trailing ones.
        features, labels = arr[:, :split_at], arr[:, split_at:]
        model = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights,
                                    algorithm=algorithm, metric=metric)
        return model.fit(features, labels)

    return _fit(arr_am), _fit(arr_pm)

def knn_predict(knn_reg_am, knn_reg_pm, feature_am, feature_pm):
    """Predict with both regressors and flatten each result to 1-D.

    Args:
        knn_reg_am: object with a ``predict`` method, used for ``feature_am``.
        knn_reg_pm: object with a ``predict`` method, used for ``feature_pm``.
        feature_am: morning feature array.
        feature_pm: afternoon feature array.

    Returns:
        ``(pred_vec_am, pred_vec_pm)``, each prediction array reshaped with
        ``reshape(-1)``.
    """
    raw_am = knn_reg_am.predict(feature_am)
    raw_pm = knn_reg_pm.predict(feature_pm)
    return tuple(block.reshape(-1) for block in (raw_am, raw_pm))

In [54]:
def validatoin_mape(train_arr_am, train_arr_pm, vali_feature_am, vali_feature_pm,
                    vali_label_am, vali_label_pm):
    """Print validation MAPE for n_neighbors = 1..19.

    For every neighbour count the morning and afternoon regressors are fitted
    on the training arrays, used to predict the validation features, and
    scored with ``cal_mape`` for the morning, the afternoon and both combined.

    Args:
        train_arr_am, train_arr_pm: training arrays passed to ``knn_fit``.
        vali_feature_am, vali_feature_pm: validation features.
        vali_label_am, vali_label_pm: validation labels (flattened before scoring).

    Returns:
        None; the scores are only printed.
    """
    true_am = vali_label_am.reshape(-1)
    true_pm = vali_label_pm.reshape(-1)
    true_all = np.concatenate((true_am, true_pm), axis=0)

    for k in range(1, 20):
        model_am, model_pm = knn_fit(train_arr_am, train_arr_pm, n_neighbors=k)
        pred_am, pred_pm = knn_predict(model_am, model_pm, vali_feature_am, vali_feature_pm)
        pred_all = np.concatenate((pred_am, pred_pm), axis=0)

        # Compute all three scores before printing any of them.
        report = (
            ('n_neighbors: %i, validation_mape_am: %.4f',
             cal_mape(pred_values=pred_am, true_values=true_am)),
            ('n_neighbors: %i, validation_mape_pm: %.4f',
             cal_mape(pred_values=pred_pm, true_values=true_pm)),
            ('n_neighbors: %i, validation_mape: %.4f',
             cal_mape(pred_values=pred_all, true_values=true_all)),
        )
        for template, score in report:
            print(template % (k, score))
        
# Sweep n_neighbors on the held-out week, fitting only on the non-validation days.
validatoin_mape(nonvali_train_arr_am, nonvali_train_arr_pm, vali_feature_am, vali_feature_pm,
                    vali_label_am, vali_label_pm)

n_neighbors: 1, validation_mape_am: 0.1912
n_neighbors: 1, validation_mape_pm: 0.2377
n_neighbors: 1, validation_mape: 0.2144
n_neighbors: 2, validation_mape_am: 0.1696
n_neighbors: 2, validation_mape_pm: 0.2009
n_neighbors: 2, validation_mape: 0.1852
n_neighbors: 3, validation_mape_am: 0.1652
n_neighbors: 3, validation_mape_pm: 0.1857
n_neighbors: 3, validation_mape: 0.1755
n_neighbors: 4, validation_mape_am: 0.1488
n_neighbors: 4, validation_mape_pm: 0.1814
n_neighbors: 4, validation_mape: 0.1651
n_neighbors: 5, validation_mape_am: 0.1493
n_neighbors: 5, validation_mape_pm: 0.1765
n_neighbors: 5, validation_mape: 0.1629
n_neighbors: 6, validation_mape_am: 0.1492
n_neighbors: 6, validation_mape_pm: 0.1767
n_neighbors: 6, validation_mape: 0.1630
n_neighbors: 7, validation_mape_am: 0.1443
n_neighbors: 7, validation_mape_pm: 0.1704
n_neighbors: 7, validation_mape: 0.1574
n_neighbors: 8, validation_mape_am: 0.1434
n_neighbors: 8, validation_mape_pm: 0.1703
n_neighbors: 8, validation_mape:

In [12]:
# Neighbour count chosen from the validation results.
n_neig = 10
# NOTE(review): these models are fit on train_arr_*, which is built from the
# full train_df and therefore includes the validation days; the predictions
# below are in-sample and should not be read as a validation score.
knn_reg_am, knn_reg_pm = knn_fit(train_arr_am, train_arr_pm, n_neighbors=n_neig)
vali_pred_am, vali_pred_pm = knn_predict(knn_reg_am, knn_reg_pm, vali_feature_am, vali_feature_pm)
vali_pred = np.concatenate((vali_pred_am, vali_pred_pm), axis=0)

[[ 0.5  1. ]]


In [4]:
# Scratch experiment: check how reshape(-1) flattens a stack of 2-D arrays and
# how reshape(-1, 3) regroups the flat vector into rows.
a = np.array([[1,2,3],[4,5,6],[7,8,9]])
b = a+1
c = b+1  # not used below
x = []
x.append(a)
x.append(b)
print(x)
# Stacking the list of two (3, 3) arrays gives a (2, 3, 3) array.
y = np.array(x)
print(y.shape)
print(y)
# Flatten to 1-D in row-major order.
z = y.reshape(-1)
print(z.shape)
print(z)
# Regroup into rows of 3.
w = z.reshape(-1,3)
print(w.shape)
print(w)
# Flattening again reproduces z.
q = w.reshape(-1)
print(q.shape)
print(q)

[array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]]), array([[ 2,  3,  4],
       [ 5,  6,  7],
       [ 8,  9, 10]])]
(2, 3, 3)
[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]]

 [[ 2  3  4]
  [ 5  6  7]
  [ 8  9 10]]]
(18,)
[ 1  2  3  4  5  6  7  8  9  2  3  4  5  6  7  8  9 10]
(6, 3)
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [ 2  3  4]
 [ 5  6  7]
 [ 8  9 10]]
(18,)
[ 1  2  3  4  5  6  7  8  9  2  3  4  5  6  7  8  9 10]


In [17]:
def my_fun(x, p=None):
    """Scratch helper: add the offsets in ``p`` to ``x`` when ``p`` has 1 or 2 items.

    Args:
        x: base value.
        p: optional sequence of offsets. ``None`` (the default) is treated as
            an empty sequence.

    Returns:
        ``x + p[0]`` for one offset, ``x + p[0] + p[1]`` for two offsets, and
        ``x`` unchanged for any other length.
    """
    # None replaces the former mutable default `[]`, which is a single list
    # object shared by every call.
    if p is None:
        p = []
    if len(p) == 1:
        return x + p[0]
    elif len(p) == 2:
        return x + p[0] + p[1]
    else:
        return x
# Exercise my_fun with an empty and a one-element offset list.
# NOTE(review): the recorded output below (1, 2, 4) does not match these calls;
# the third call repeats the second, so the saved output looks stale.
print(my_fun(1,[]))
print(my_fun(1,[1]))
print(my_fun(1,[1]))

1
2
4


In [40]:
# Scratch experiment: concatenating two 1-D arrays along axis 0 joins them end to end.
a = np.array([1,2,3])
b = np.array([4,5,6])
c = np.concatenate((a,b), axis=0)
print(a.shape)
print(c.shape)
print(c)

(3,)
(6,)
[1 2 3 4 5 6]
