In [6]:
import sys
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

import math
from datetime import datetime, timedelta, time, date

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
%matplotlib inline

### 使用KNN regressor进行预测
> 分成十二份待预测的数据集的预测（六个方向，每个方向都分上下午预测），先用最简单的KNN。以上午为例，从训练集中找到6~8点时段与待预测日最相近的几天，然后用这几天8~10点的数据做加权平均，作为测试集的预测值。
计划完成的目标有：
1. 12份数据集的预测
2. 线下验证（用10月11~17日的数据集）

In [7]:
# using KNN to predict
# Raw trajectory tables; both paths are relative to the notebook's working directory.
train_path = '../dataset/training/trajectories(table 5)_training.csv'
test_path = '../dataset/testing_phase1/trajectories(table 5)_test1.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
# Parse the timestamps once so the `.dt` accessors used by the helper functions work.
train_df.starting_time = pd.to_datetime(train_df.starting_time)
test_df.starting_time = pd.to_datetime(test_df.starting_time)

In [8]:
# from 9-19 to 10-17 (except holiday)
# NOTE(review): NUM_TRAIN_DAYS / NUM_TSET_DAYS are not referenced by any cell visible here.
NUM_TRAIN_DAYS = 20

# from 10-18 to 10-24
# NOTE(review): "TSET" looks like a typo for "TEST"; left unchanged because other code may use it.
NUM_TSET_DAYS = 7

# define Holiday
# National Day holiday range (both bounds are treated as inclusive by the filters).
NATIONNAL_START = date(2016,10,1)
NATIONNAL_END = date(2016,10,9)

# Mid-Autumn holiday range (both bounds are treated as inclusive by remove_holiday).
MID_AUTUMN_START = date(2016,9,15)
MID_AUTUMN_END = date(2016,9,18)

# NOTE(review): TRAIN_*_DAY and TEST_*_DAY are not referenced by the visible cells — confirm usage.
TRAIN_START_DAY = date(2016,9,19)
TRAIN_END_DAY = date(2016,10,17)

# Days held out of the training frame for offline validation.
VALI_START_DAY = date(2016,10,11)
VALI_END_DAY = date(2016,10,17)

TEST_START_DAY = date(2016,10,18)
TEST_END_DAY = date(2016,10,24)

In [9]:
# Quick look at the two columns that together identify a route.
# NOTE(review): the scratch name `a` is reassigned by later cells.
a = train_df.loc[:,('intersection_id', 'tollgate_id')]
a.head()

Unnamed: 0,intersection_id,tollgate_id
0,B,3
1,B,3
2,B,1
3,A,2
4,B,1


In [10]:
def MAPE(pred, true):
    """Return the absolute percentage error ``|(true - pred) / true|``.

    NOTE(review): a later cell rebinds the name ``MAPE`` to a weighted
    distance function, which shadows this helper from that point on.
    """
    relative_error = (true - pred) / true
    return abs(relative_error)

def cal_mape(pred_values, true_values):
    """Return the mean absolute percentage error of two equal-length sequences.

    Both arguments are indexed positionally with ``[i]`` for
    ``i in range(len(pred_values))``; the absolute relative errors
    ``|pred - true| / |true|`` are summed and divided by the number of
    predictions.
    """
    n_samples = len(pred_values)
    error_sum = 0.0
    for idx in range(n_samples):
        actual = true_values[idx]
        error_sum += abs((pred_values[idx] - actual) / actual)
    return error_sum / n_samples

def per_20min(dt):
    """Round ``dt`` down to the start of its 20-minute window.

    Returns a new ``datetime`` with the same year/month/day/hour, the minute
    floored to 0, 20 or 40, and seconds set to zero.
    """
    window_minute = int(dt.minute - dt.minute % 20)
    return datetime(dt.year, dt.month, dt.day, dt.hour, window_minute, 0)

# Holiday filter: rows dated inside either holiday range are discarded.
def remove_holiday(df):
    """Return the rows of ``df`` that fall outside both holiday ranges.

    A row is kept only when the calendar date of its ``starting_time`` is
    strictly before or strictly after the Mid-Autumn range AND strictly
    before or strictly after the National Day range.
    """
    trip_date = df.starting_time.dt.date
    outside_mid_autumn = (trip_date < MID_AUTUMN_START) | (trip_date > MID_AUTUMN_END)
    outside_national = (trip_date < NATIONNAL_START) | (trip_date > NATIONNAL_END)
    return df.loc[outside_mid_autumn & outside_national]

def select_time(df):
    """Aggregate trajectories into 20-minute windows and keep the rush hours.

    Steps:
      1. floor ``starting_time`` to the start of its 20-minute window;
      2. drop ``vehicle_id`` / ``travel_seq`` when both are present;
      3. average the remaining columns per
         (intersection_id, tollgate_id, starting_time);
      4. rename ``travel_time`` to ``avg_travel_time``;
      5. keep only windows starting in 06:00-09:59 or 15:00-18:59.

    The input frame is left untouched; a new frame is returned. Its index is
    the post-aggregation row number and is not reset after the hour filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``intersection_id``, ``tollgate_id`` and a datetime64
        ``starting_time`` column.
    """
    # Copy first: assigning the floored timestamps on the argument itself
    # would silently rewrite the caller's frame.
    df = df.copy()
    # Vectorised floor; 20 minutes divides the hour evenly, so this yields
    # hh:00 / hh:20 / hh:40 with seconds cleared.
    df['starting_time'] = df['starting_time'].dt.floor('20min')
    if {'vehicle_id','travel_seq'}.issubset(df.columns):
        df = df.drop(['vehicle_id','travel_seq'], axis=1)
    df = df.groupby(['intersection_id', 'tollgate_id', 'starting_time']).mean()
    df = df.reset_index()
    # `rename_axis` with a mapping no longer relabels columns in current
    # pandas; `rename(columns=...)` is the supported spelling.
    df = df.rename(columns={'travel_time':'avg_travel_time'})
    hour = df.starting_time.dt.hour
    in_morning = (hour >= 6) & (hour < 10)
    in_afternoon = (hour >= 15) & (hour < 19)
    return df.loc[in_morning | in_afternoon]

def slice_am_pm(df):
    """Split ``df`` into (morning, afternoon) frames on the starting hour.

    Rows whose ``starting_time`` hour is below 12 go to the first frame, rows
    with hour 12 or later to the second. Original index labels are preserved.
    """
    start_hour = df.starting_time.dt.hour
    morning_rows = start_hour < 12
    afternoon_rows = start_hour >= 12
    return df.loc[morning_rows], df.loc[afternoon_rows]

def slice_time(df):
    """Split ``df`` into the observed and the to-be-predicted two-hour blocks.

    Returns ``(df_prev2h, df_follow2h)``: the first holds rows starting in
    hours 6, 7, 15 or 16, the second rows starting in hours 8, 9, 17 or 18.
    Rows at any other hour appear in neither frame.
    """
    start_hour = df.starting_time.dt.hour
    # Hours are whole numbers, so membership in these sets is the same test
    # as the half-open ranges [6, 8) / [15, 17) and [8, 10) / [17, 19).
    is_prev2h = start_hour.isin([6, 7, 15, 16])
    is_follow2h = start_hour.isin([8, 9, 17, 18])
    return df.loc[is_prev2h], df.loc[is_follow2h]

def complete_miss_time(df, df_type='train'):
    """Expand ``df`` to a complete (day, route, 20-minute window) grid.

    For every calendar day between the earliest and latest ``starting_time``
    in ``df`` (National Day range excluded), every one of the six routes and
    every expected window, one row is emitted. Windows absent from ``df`` get
    NaN, and the ``avg_travel_time`` column is then filled by
    ``Series.interpolate()`` with its default settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns ``intersection_id``, ``tollgate_id``, ``starting_time``
        (datetime64) and ``avg_travel_time``.
    df_type : str
        ``'test'`` emits only the 06:00-07:40 and 15:00-16:40 windows; any
        other value emits 06:00-09:40 and 15:00-18:40.

    Returns
    -------
    pandas.DataFrame
        Columns ``intersection_id``, ``tollgate_id`` (int), ``starting_time``
        (datetime64) and ``avg_travel_time``, ordered by day, then route, then
        window, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` has no rows, since the date range cannot be inferred.

    Notes
    -----
    NOTE(review): only the National Day range is skipped here. Days in the
    Mid-Autumn range that lie between the first and last day are still
    emitted, even though ``remove_holiday`` drops their observations, so their
    values come purely from interpolation. Changing this alters the number of
    days downstream, so it is flagged rather than changed.
    NOTE(review): interpolation runs down the whole column in row order, so a
    gap at a route or day boundary is filled from neighbouring rows of a
    different route or half-day. Leading NaNs presumably stay NaN with the
    default interpolation direction — confirm.
    """
    if df.empty:
        raise ValueError('complete_miss_time() needs at least one row to infer the date range')

    # Use the true min/max date instead of trusting the first/last row order.
    day_all = df.starting_time.dt.date
    start_day, end_day = day_all.min(), day_all.max()

    inter_toll = [('A',2), ('A',3), ('B',1), ('B',3), ('C',1), ('C',3)]
    if df_type == 'test':
        hours = (6, 7, 15, 16)
    else:
        hours = (6, 7, 8, 9, 15, 16, 17, 18)
    hour_min = [(h, m) for h in hours for m in (0, 20, 40)]

    # One pass over the input builds a lookup, replacing a full boolean scan
    # of the frame for every grid cell. setdefault keeps the first value when
    # a key repeats, matching the previous `.values[0]` choice.
    observed = {}
    for inter, toll, stamp, avg in zip(df.intersection_id, df.tollgate_id,
                                       df.starting_time, df.avg_travel_time):
        observed.setdefault((inter, toll, stamp), avg)

    # Collect plain records and build the frame once: appending row by row is
    # quadratic, and DataFrame.append no longer exists in current pandas.
    records = []
    for d in range((end_day - start_day).days + 1):
        day = start_day + timedelta(days=d)
        if NATIONNAL_START <= day <= NATIONNAL_END:
            continue
        for inter, toll in inter_toll:
            for h, m in hour_min:
                day_time = pd.Timestamp(datetime(day.year, day.month, day.day, h, m, 0))
                records.append({
                    'intersection_id': inter,
                    'tollgate_id': toll,
                    'starting_time': day_time,
                    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
                    'avg_travel_time': observed.get((inter, toll, day_time), np.nan),
                })

    df_comp = pd.DataFrame(records, columns=['intersection_id', 'tollgate_id',
                                             'starting_time', 'avg_travel_time'])
    df_comp['tollgate_id'] = df_comp['tollgate_id'].astype(int)
    df_comp['starting_time'] = pd.to_datetime(df_comp['starting_time'])
    df_comp['avg_travel_time'] = df_comp['avg_travel_time'].astype(float).interpolate()
    return df_comp

In [11]:
# Data preprocessing for the training, validation and test sets.
# NOTE(review): this cell overwrites train_df / test_df, so re-running it without
# re-running the loading cell feeds already-processed frames back in.
train_df = remove_holiday(train_df)
train_df = select_time(train_df)
train_df = complete_miss_time(train_df, df_type='train')
train_am, train_pm = slice_am_pm(train_df)
# The test frame is not passed through remove_holiday.
test_df = select_time(test_df)
test_df = complete_miss_time(test_df, df_type='test')
test_am, test_pm = slice_am_pm(test_df)


# Hold out the VALI_START_DAY..VALI_END_DAY days (inclusive) for offline validation.
days_all = train_df.starting_time.dt.date
nonvali_train_df = train_df.loc[((days_all < VALI_START_DAY) | (days_all > VALI_END_DAY))]
nonvali_train_am, nonvali_train_pm = slice_am_pm(nonvali_train_df)
vali_df = train_df.loc[((days_all >= VALI_START_DAY) & (days_all <= VALI_END_DAY))]
# prev2h = windows used as features, follow2h = windows whose values are the ground truth.
vali_prev2h, vali_follow2h = slice_time(vali_df)
vali_prev2h_am, vali_prev2h_pm = slice_am_pm(vali_prev2h)
vali_follow2h_am, vali_follow2h_pm = slice_am_pm(vali_follow2h)

In [12]:
# Data comparison finished; consistent with the official tutorial.
# Total number of missing values left in each frame after gap filling.
print(train_df.isnull().sum().sum(), test_df.isnull().sum().sum())
# Number of distinct calendar days in the completed training frame.
print(len(train_df.starting_time.dt.date.unique()))
vali_follow2h_am.head(20)

(0, 0)
82


Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
10806,A,2,2016-10-11 08:00:00,68.0925
10807,A,2,2016-10-11 08:20:00,75.023636
10808,A,2,2016-10-11 08:40:00,75.0032
10809,A,2,2016-10-11 09:00:00,69.749091
10810,A,2,2016-10-11 09:20:00,55.277222
10811,A,2,2016-10-11 09:40:00,70.264286
10830,A,3,2016-10-11 08:00:00,146.16125
10831,A,3,2016-10-11 08:20:00,168.431818
10832,A,3,2016-10-11 08:40:00,180.401429
10833,A,3,2016-10-11 09:00:00,142.722857


In [13]:
# Inspect the morning part of the completed test frame.
test_am.head(20)

Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
0,A,2,2016-10-18 06:00:00,41.097143
1,A,2,2016-10-18 06:20:00,43.681667
2,A,2,2016-10-18 06:40:00,68.024286
3,A,2,2016-10-18 07:00:00,52.608333
4,A,2,2016-10-18 07:20:00,56.165556
5,A,2,2016-10-18 07:40:00,63.598571
12,A,3,2016-10-18 06:00:00,69.48
13,A,3,2016-10-18 06:20:00,142.01
14,A,3,2016-10-18 06:40:00,233.68
15,A,3,2016-10-18 07:00:00,143.6475


In [14]:
# For the training set each row holds the 12 windows of 06:00-10:00 (morning):
# the first 6 windows are features and the last 6 are labels.
# For the test and validation sets each row only holds the 6 windows of 06:00-08:00.
def generate_knn_set(df_am, df_pm):
    """Turn long-format frames into one (n_days, n_windows) array per route.

    The day count is the number of distinct dates in ``df_am``; each route's
    ``avg_travel_time`` values are reshaped to that many rows, so every route
    must contribute the same number of windows per day, already in day/window
    order, in both frames.

    Returns ``(set_am_data, set_pm_data)``, two dicts keyed by the route's
    position (0..5) in the fixed route list.
    """
    routes = [('A',2), ('A',3), ('B',1), ('B',3), ('C',1), ('C',3)]
    n_days = len(df_am.starting_time.dt.date.unique())
    set_am_data, set_pm_data = {}, {}
    for key, (inter, toll) in enumerate(routes):
        on_route_am = (df_am.intersection_id == inter) & (df_am.tollgate_id == toll)
        on_route_pm = (df_pm.intersection_id == inter) & (df_pm.tollgate_id == toll)
        day_rows_am = df_am.loc[on_route_am].avg_travel_time.values.reshape(n_days, -1)
        day_rows_pm = df_pm.loc[on_route_pm].avg_travel_time.values.reshape(n_days, -1)
        set_am_data[key] = day_rows_am
        set_pm_data[key] = day_rows_pm
    return set_am_data, set_pm_data

def generate_vali_true(df_am, df_pm):
    """Flatten the ground-truth travel times route by route.

    For each of the six routes, in fixed order, the ``avg_travel_time`` values
    of that route are taken from ``df_am`` / ``df_pm`` in row order and
    concatenated into one 1-D array per half-day.

    Parameters
    ----------
    df_am, df_pm : pandas.DataFrame
        Frames with ``intersection_id``, ``tollgate_id`` and
        ``avg_travel_time`` columns.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Flat arrays of true values for the morning and afternoon frames.
    """
    inter_toll = [('A',2), ('A',3), ('B',1), ('B',3), ('C',1), ('C',3)]
    vali_true_am = []
    vali_true_pm = []
    for inter, toll in inter_toll:
        # Select from the frames passed in. Reading the notebook globals
        # `vali_follow2h_am` / `vali_follow2h_pm` here only worked when the
        # caller happened to pass those exact objects.
        on_route_am = (df_am.intersection_id == inter) & (df_am.tollgate_id == toll)
        on_route_pm = (df_pm.intersection_id == inter) & (df_pm.tollgate_id == toll)
        vali_true_am.append(df_am.loc[on_route_am].avg_travel_time.values)
        vali_true_pm.append(df_pm.loc[on_route_pm].avg_travel_time.values)
    # concatenate (rather than np.array(...).reshape) stays flat even when the
    # routes contribute different numbers of rows.
    return np.concatenate(vali_true_am), np.concatenate(vali_true_pm)

In [15]:
# Per-route day matrices for the full training frame and for the test frame.
train_set_am, train_set_pm = generate_knn_set(train_am, train_pm)
test_set_am, test_set_pm = generate_knn_set(test_am, test_pm)


# Offline validation: train on the non-validation days, use the first two hours of the
# validation days as features and the following two hours as ground truth.
nonvali_train_set_am, nonvali_train_set_pm = generate_knn_set(nonvali_train_am, nonvali_train_pm)
vali_set_am, vali_set_pm = generate_knn_set(vali_prev2h_am, vali_prev2h_pm)
vali_true_am, vali_true_pm = generate_vali_true(vali_follow2h_am, vali_follow2h_pm)

In [16]:
# Shapes of the flattened validation ground-truth vectors.
print(vali_true_am.shape, vali_true_pm.shape)

((252,), (252,))


In [17]:
# Inspect the day matrices stored under key 0 (the first route in the route list).
train_set_am_i = train_set_am[0]
train_set_pm_i = train_set_pm[0]
print(train_set_am_i.shape, train_set_pm_i.shape)
# First two days only, to keep the output short.
print(train_set_am_i[0:2,:])
print(train_set_pm_i[0:2,:])

((82, 12), (82, 12))
[[  46.356        48.588        66.6425       64.681        85.676
    58.96888889   81.60285714   80.20785714   63.44846154   78.05117647
    69.03833333   69.65714286]
 [  44.095        59.87444444   88.815        77.84777778   94.91916667
    73.67727273   61.007       332.162        50.995        44.935        34.115
    23.295     ]]
[[  79.42083333   72.314        58.741        59.6375       51.97
    61.60090909  105.39166667   83.82857143   65.675        85.11333333
    50.56666667   65.07384615]
 [  62.18         64.33222222   70.33307692   67.11         82.793125
    52.87142857   50.165        81.348        75.47555556   67.8875
    85.105        78.33263158]]


In [18]:
# Same inspection for the test matrices stored under key 0.
test_set_am_i = test_set_am[0]
test_set_pm_i = test_set_pm[0]
print(test_set_am_i.shape, test_set_pm_i.shape)
print(test_set_am_i[0:2,:])
print(test_set_pm_i[0:2,:])

((7, 6), (7, 6))
[[  41.09714286   43.68166667   68.02428571   52.60833333   56.16555556
    63.59857143]
 [  40.01         57.00727273   37.412        69.584       100.114
    58.29214286]]
[[ 99.96066667  88.36055556  74.886       63.30272727  71.362       99.26090909]
 [ 64.46285714  62.30777778  57.33        67.584       55.19        72.47636364]]


In [25]:
# W holds the per-feature weights used by the distance function below.
# It is the concatenation of a (3 values), b (3 values) and c (2 values): 8 weights in
# total, the same as n_features in get_knn_features.
# NOTE(review): np.arange with float steps is sensitive to rounding at the end point;
# the intended values look like [0.1, 0.6, 1.1] and [1.6, 2.4, 3.2] — confirm.
# NOTE(review): the scratch names a, b, c are reused by other cells.
a = np.arange(0.1,1.2,0.5)
b = np.arange(1.6,3.3,0.8)
c = np.array([4.0, 1.5])
W =  np.concatenate((a,b,c), axis=0)

# NOTE(review): this redefines MAPE(pred, true) from an earlier cell. Despite the name
# it is a weighted sum of absolute differences, not a percentage error.
def MAPE(x, y):
    """Return the W-weighted sum of absolute element-wise differences of x and y."""
    return np.sum(W * np.abs(x-y))

def get_knn_features(times_am, times_pm):
    """Build the KNN feature matrices for the morning and afternoon blocks.

    Each input is a 2-D array with one row per day and six window values per
    row. Each output is a float32 array of shape (n_days, 8): the six window
    values followed by the row mean and the row standard deviation.
    """
    def _with_summary_stats(times):
        # Six raw windows plus two summary columns (mean, std) per row.
        features = np.zeros((times.shape[0], 8), dtype='float32')
        features[:, :6] = times
        features[:, -2] = np.mean(times, axis=1)
        features[:, -1] = np.std(times, axis=1)
        return features

    return _with_summary_stats(times_am), _with_summary_stats(times_pm)


# KNN is wrapped in helpers because a separate model is fitted for every route
# and for each half of the day.
def knn_fit(set_am, set_pm, n_neighbors=5,weights='distance',algorithm='auto',metric=MAPE):
    """Fit one KNeighborsRegressor per route for the morning and afternoon sets.

    ``set_am`` / ``set_pm`` map route keys 0..len(set_am)-1 to 2-D arrays with
    one row per day. The first six columns of a row are turned into features
    by ``get_knn_features``; the remaining columns are the regression targets.
    The keyword arguments are passed through to ``KNeighborsRegressor``; the
    default ``metric`` is whatever ``MAPE`` was bound to when this function
    was defined.

    Returns ``(knn_set_am, knn_set_pm)``, dicts of fitted regressors that use
    the same keys as the inputs.
    """
    n_times = 6
    knn_set_am, knn_set_pm = {}, {}
    for route in range(len(set_am)):
        day_rows_am, day_rows_pm = set_am[route], set_pm[route]
        features_am, features_pm = get_knn_features(day_rows_am[:, :n_times],
                                                    day_rows_pm[:, :n_times])
        regressor_am = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights,
                                           algorithm=algorithm, metric=metric)
        regressor_pm = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights,
                                           algorithm=algorithm, metric=metric)
        # fit() returns the estimator itself, so the fitted model is stored.
        knn_set_am[route] = regressor_am.fit(features_am, day_rows_am[:, n_times:])
        knn_set_pm[route] = regressor_pm.fit(features_pm, day_rows_pm[:, n_times:])
    return knn_set_am, knn_set_pm

def knn_predict(knn_set_am, knn_set_pm, set_am, set_pm):
    """Predict with the per-route regressors and flatten the results.

    For every route key 0..len(set_am)-1 the six-window rows in ``set_am`` /
    ``set_pm`` are converted with ``get_knn_features`` and passed to the
    matching fitted regressor. Predictions are stacked in route order and
    flattened, so each returned vector is ordered route, then day, then
    predicted window.
    """
    pred_list_am, pred_list_pm = [], []
    for route in range(len(set_am)):
        features_am, features_pm = get_knn_features(set_am[route], set_pm[route])
        route_pred_am = knn_set_am[route].predict(features_am)
        route_pred_pm = knn_set_pm[route].predict(features_pm)
        pred_list_am.append(route_pred_am)
        pred_list_pm.append(route_pred_pm)
    return np.array(pred_list_am).reshape(-1), np.array(pred_list_pm).reshape(-1)

In [26]:
def validatoin_mape(train_set_am, train_set_pm, vali_set_am, vali_set_pm,
                    vali_true_am, vali_true_pm):
    """Print the validation MAPE for every neighbour count from 1 to 75.

    For each ``n_neighbors`` the per-route models are refitted on the training
    sets and used to predict the validation sets. Three scores are printed:
    morning only, afternoon only, and both concatenated. Nothing is returned.
    """
    vali_true = np.concatenate((vali_true_am, vali_true_pm), axis=0)
    for n_neig in range(1, 76):
        knn_set_am, knn_set_pm = knn_fit(train_set_am, train_set_pm, n_neighbors=n_neig)
        vali_pred_am, vali_pred_pm = knn_predict(knn_set_am, knn_set_pm, vali_set_am, vali_set_pm)
        vali_pred = np.concatenate((vali_pred_am, vali_pred_pm), axis=0)
        # Score everything first, then report, so one k's lines stay together.
        score_am = cal_mape(pred_values=vali_pred_am, true_values=vali_true_am)
        score_pm = cal_mape(pred_values=vali_pred_pm, true_values=vali_true_pm)
        score_all = cal_mape(pred_values=vali_pred, true_values=vali_true)
        report = (
            ('n_neighbors: %i, validation_mape_am: %.4f', score_am),
            ('n_neighbors: %i, validation_mape_pm: %.4f', score_pm),
            ('n_neighbors: %i, validation_mape: %.4f', score_all),
        )
        for template, score in report:
            print(template %(n_neig, score))
        
# Sweep n_neighbors on the hold-out split: fit on the non-validation days and score
# against the follow-up windows of the validation days.
validatoin_mape(nonvali_train_set_am, nonvali_train_set_pm, vali_set_am, vali_set_pm,
                    vali_true_am, vali_true_pm)

n_neighbors: 1, validation_mape_am: 0.3085
n_neighbors: 1, validation_mape_pm: 0.2602
n_neighbors: 1, validation_mape: 0.2843
n_neighbors: 2, validation_mape_am: 0.2627
n_neighbors: 2, validation_mape_pm: 0.2454
n_neighbors: 2, validation_mape: 0.2540
n_neighbors: 3, validation_mape_am: 0.2482
n_neighbors: 3, validation_mape_pm: 0.2274
n_neighbors: 3, validation_mape: 0.2378
n_neighbors: 4, validation_mape_am: 0.2356
n_neighbors: 4, validation_mape_pm: 0.2242
n_neighbors: 4, validation_mape: 0.2299
n_neighbors: 5, validation_mape_am: 0.2265
n_neighbors: 5, validation_mape_pm: 0.2198
n_neighbors: 5, validation_mape: 0.2231
n_neighbors: 6, validation_mape_am: 0.2239
n_neighbors: 6, validation_mape_pm: 0.2194
n_neighbors: 6, validation_mape: 0.2216
n_neighbors: 7, validation_mape_am: 0.2219
n_neighbors: 7, validation_mape_pm: 0.2167
n_neighbors: 7, validation_mape: 0.2193
n_neighbors: 8, validation_mape_am: 0.2196
n_neighbors: 8, validation_mape_pm: 0.2176
n_neighbors: 8, validation_mape:

n_neighbors: 65, validation_mape_am: 0.2014
n_neighbors: 65, validation_mape_pm: 0.2044
n_neighbors: 65, validation_mape: 0.2029
n_neighbors: 66, validation_mape_am: 0.2009
n_neighbors: 66, validation_mape_pm: 0.2042
n_neighbors: 66, validation_mape: 0.2026
n_neighbors: 67, validation_mape_am: 0.2011
n_neighbors: 67, validation_mape_pm: 0.2042
n_neighbors: 67, validation_mape: 0.2027
n_neighbors: 68, validation_mape_am: 0.2015
n_neighbors: 68, validation_mape_pm: 0.2042
n_neighbors: 68, validation_mape: 0.2028
n_neighbors: 69, validation_mape_am: 0.2015
n_neighbors: 69, validation_mape_pm: 0.2038
n_neighbors: 69, validation_mape: 0.2026
n_neighbors: 70, validation_mape_am: 0.2018
n_neighbors: 70, validation_mape_pm: 0.2038
n_neighbors: 70, validation_mape: 0.2028
n_neighbors: 71, validation_mape_am: 0.2020
n_neighbors: 71, validation_mape_pm: 0.2036
n_neighbors: 71, validation_mape: 0.2028
n_neighbors: 72, validation_mape_am: 0.2021
n_neighbors: 72, validation_mape_pm: 0.2035
n_neighbo

In [16]:
# Refit on the non-validation days with a fixed neighbour count and inspect the
# validation predictions.
# NOTE(review): this cell's execution count is lower than that of the KNN definition
# cells above it, so the stored output may predate their latest edit — re-run to confirm.
n_neig_vali = 64
knn_set_am, knn_set_pm = knn_fit(nonvali_train_set_am, nonvali_train_set_pm, n_neighbors=n_neig_vali)
vali_pred_am, vali_pred_pm = knn_predict(knn_set_am, knn_set_pm, vali_set_am, vali_set_pm)
print(vali_pred_am.shape, vali_pred_pm.shape)
print(vali_pred_am[:10])
print(vali_pred_pm[:10])

((252,), (252,))
[ 78.43130331  86.27401932  82.78856847  75.64481272  79.30112634
  80.24345594  80.06715785  87.30473473  83.46233126  77.466487  ]
[ 71.4691913   71.33862912  72.01841968  71.15069406  65.57948653
  70.68002065  72.0815162   69.02046293  70.08780883  68.9816092 ]


In [17]:
# Final models: fit on the full training sets and predict the test windows.
# NOTE(review): n_neig_test differs from n_neig_vali; presumably it was chosen relative
# to the larger number of training days available here — confirm.
n_neig_test = 78
knn_set_am, knn_set_pm = knn_fit(train_set_am, train_set_pm, n_neighbors=n_neig_test)
test_pred_am, test_pred_pm = knn_predict(knn_set_am, knn_set_pm, test_set_am, test_set_pm)

In [18]:
# Inspect the flattened test predictions and list the distinct test days.
print(test_pred_am.shape, test_pred_pm.shape)
print(test_pred_am[:10])
print(test_pred_pm[:10])
# NOTE(review): reuses the scratch name `a`.
a = test_df.starting_time.dt.date.unique()
print(a)

((252,), (252,))
[ 82.00601432  87.86892733  83.6886908   76.70090975  79.30635724
  78.14974486  81.07566728  88.732436    84.39312034  77.45038453]
[ 74.63414382  73.14044446  73.99224076  71.95888394  67.08928331
  70.50891884  72.86131477  71.81427952  70.15323782  70.40672478]
[datetime.date(2016, 10, 18) datetime.date(2016, 10, 19)
 datetime.date(2016, 10, 20) datetime.date(2016, 10, 21)
 datetime.date(2016, 10, 22) datetime.date(2016, 10, 23)
 datetime.date(2016, 10, 24)]


In [19]:
def generate_output(pred_am, pred_pm, test_days=None):
    """Arrange flat prediction vectors into a long-format result frame.

    Rows are emitted route by route, then day by day; within a day the six
    morning windows (08:00-09:40) come first, followed by the six afternoon
    windows (17:00-18:40). ``pred_am`` and ``pred_pm`` are consumed
    sequentially in that same route/day/window order.

    Parameters
    ----------
    pred_am, pred_pm : indexable sequences of numbers
        Flat predictions; each needs at least 6 routes * n_days * 6 entries.
    test_days : iterable of datetime.date, optional
        Days to emit. When omitted, the distinct dates of the notebook-level
        ``test_df.starting_time`` are used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``intersection_id``, ``tollgate_id`` (int), ``starting_time``
        (datetime64) and ``avg_travel_time``.

    Side effects
    ------------
    Prints the number of morning and afternoon predictions consumed.
    """
    if test_days is None:
        test_days = test_df.starting_time.dt.date.unique()
    inter_toll = [('A',2), ('A',3), ('B',1), ('B',3), ('C',1), ('C',3)]
    hour_min_am = [(8,0), (8,20), (8,40), (9,0), (9,20), (9,40)]
    hour_min_pm = [(17,0), (17,20), (17,40), (18,0), (18,20), (18,40)]

    # Gather plain records and build the frame once: appending row by row is
    # quadratic, and DataFrame.append no longer exists in current pandas.
    records = []
    i, j = 0, 0
    for inter, toll in inter_toll:
        for test_d in test_days:
            for hour, minute in hour_min_am:
                records.append({
                    'intersection_id': inter, 'tollgate_id': toll,
                    'starting_time': datetime(test_d.year, test_d.month, test_d.day,
                                              int(hour), int(minute), 0),
                    'avg_travel_time': pred_am[i]})
                i += 1
            for hour, minute in hour_min_pm:
                records.append({
                    'intersection_id': inter, 'tollgate_id': toll,
                    'starting_time': datetime(test_d.year, test_d.month, test_d.day,
                                              int(hour), int(minute), 0),
                    'avg_travel_time': pred_pm[j]})
                j += 1

    output_df = pd.DataFrame(records, columns=['intersection_id','tollgate_id',
                                               'starting_time','avg_travel_time'])
    output_df['tollgate_id'] = output_df['tollgate_id'].astype(int)
    output_df['starting_time'] = pd.to_datetime(output_df['starting_time'])
    print(i, j)
    return output_df

# Long-format frame of test predictions, one row per route, day and predicted window.
output_df = generate_output(test_pred_am, test_pred_pm)

(252, 252)


In [20]:
# Check the result frame's shape and its first rows.
print(output_df.shape)
output_df.head(10)

(504, 4)


Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
0,A,2,2016-10-18 08:00:00,82.006014
1,A,2,2016-10-18 08:20:00,87.868927
2,A,2,2016-10-18 08:40:00,83.688691
3,A,2,2016-10-18 09:00:00,76.70091
4,A,2,2016-10-18 09:20:00,79.306357
5,A,2,2016-10-18 09:40:00,78.149745
6,A,2,2016-10-18 17:00:00,74.634144
7,A,2,2016-10-18 17:20:00,73.140444
8,A,2,2016-10-18 17:40:00,73.992241
9,A,2,2016-10-18 18:00:00,71.958884


In [21]:
# Build the submission table: replace the window start with a "[start,end)" label
# covering 20 minutes and put the columns in the expected order.
output_csv = output_df.copy()
start_time = output_csv.starting_time
end_time = start_time + timedelta(minutes=20)
# Explicit format string so the label never depends on pandas' default
# datetime-to-string rendering.
time_format = '%Y-%m-%d %H:%M:%S'
output_csv['time_window'] = ('[' + start_time.dt.strftime(time_format) + ','
                             + end_time.dt.strftime(time_format) + ')')
# Plain column selection replaces rename_axis(mapping) / reindex_axis, which
# are not available for this purpose in current pandas.
output_csv = output_csv[['intersection_id', 'tollgate_id',
                         'time_window', 'avg_travel_time']]

In [22]:
# Preview the formatted submission rows.
output_csv.head(10)

Unnamed: 0,intersection_id,tollgate_id,time_window,avg_travel_time
0,A,2,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",82.006014
1,A,2,"[2016-10-18 08:20:00,2016-10-18 08:40:00)",87.868927
2,A,2,"[2016-10-18 08:40:00,2016-10-18 09:00:00)",83.688691
3,A,2,"[2016-10-18 09:00:00,2016-10-18 09:20:00)",76.70091
4,A,2,"[2016-10-18 09:20:00,2016-10-18 09:40:00)",79.306357
5,A,2,"[2016-10-18 09:40:00,2016-10-18 10:00:00)",78.149745
6,A,2,"[2016-10-18 17:00:00,2016-10-18 17:20:00)",74.634144
7,A,2,"[2016-10-18 17:20:00,2016-10-18 17:40:00)",73.140444
8,A,2,"[2016-10-18 17:40:00,2016-10-18 18:00:00)",73.992241
9,A,2,"[2016-10-18 18:00:00,2016-10-18 18:20:00)",71.958884


In [23]:
# Write the submission file; the path is relative to the notebook's working directory.
# NOTE(review): the file name says "minkowski", but knn_fit defaults to the custom
# weighted metric — confirm which metric produced this file.
# NOTE(review): presumably the results/ directory must already exist — confirm.
output_csv.to_csv('results/task1_knn_78_minkowski.csv', index=False)

In [40]:
# Scratch check of np.concatenate on 1-D arrays; not part of the pipeline.
# NOTE(review): this overwrites the names a, b, c that were used to build W. W itself
# is unaffected, but re-running only the W cell's last line after this would not be.
a = np.array([1,2,3])
b = np.array([4,5,6])
c = np.concatenate((a,b), axis=0)
print(a.shape)
print(c.shape)
print(c)

(3,)
(6,)
[1 2 3 4 5 6]
