In [185]:
import numpy as np
import pandas as pd
import math
from datetime import datetime, timedelta, time
from sklearn.svm import SVR

In [186]:
# using SVR to predict
train_path = '../dataset/training/trajectories(table 5)_training.csv'
test_path = '../dataset/testing_phase1/trajectories(table 5)_test1.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

train_df.head()

Unnamed: 0,intersection_id,tollgate_id,vehicle_id,starting_time,travel_seq,travel_time
0,B,3,1065642,2016-07-19 00:14:24,105#2016-07-19 00:14:24#9.56;100#2016-07-19 00...,70.85
1,B,3,1047198,2016-07-19 00:35:56,105#2016-07-19 00:35:56#11.58;100#2016-07-19 0...,148.79
2,B,1,1086390,2016-07-19 00:37:15,105#2016-07-19 00:37:15#5.26;100#2016-07-19 00...,79.76
3,A,2,1071181,2016-07-19 00:37:59,110#2016-07-19 00:37:59#13.74;123#2016-07-19 0...,58.05
4,B,1,1065807,2016-07-19 00:56:21,105#2016-07-19 00:56:21#16.08;100#2016-07-19 0...,137.98


In [187]:
def per_20min(dt):
    minute = int(math.floor(dt.minute / 20) * 20)
    second = 0
    dt_new = datetime(dt.year, dt.month, dt.day, dt.hour,minute, 0)
    return dt_new

def time_window(df):
    df.starting_time = pd.to_datetime(df.starting_time)
    df['starting_time'] = df.starting_time.apply(
        per_20min)
    if {'vehicle_id','travel_seq'}.issubset(df.columns):
        df = df.drop(['vehicle_id', 'travel_seq'], axis=1)
    df = df.groupby(['intersection_id', 'tollgate_id', 'starting_time']).mean()
    df = df.reset_index()
    df = df.rename_axis({'travel_time':'avg_travel_time'}, axis='columns')
    return df
    
def generate_features(df):
    # 分别给月，天，时，分权重...
    df['hour'] = df['starting_time'].apply(t.hour) 
    return df
    
def slice_feature_label(df):
    df = generate_features(df)
    st = df['starting_time'].dt.time;
    # 以2h为单位预测, st表示start_time
    st_6_00 = time(6,0,0)
    st_8_00 = time(8,0,0)
    st_10_00 = time(10,0,0) 
    
    st_15_00 = time(15,0,0)
    st_17_00 = time(17,0,0)
    st_19_00 = time(19,0,0) 
    train_features = df.loc[((st >= st_6_00) & (st < st_8_00)) 
                     | ((st >= st_15_00) & (st < st_17_00))]
    train_labels = df.loc[(st >= st_8_00) & (st < st_10_00) 
                       | ((st >= st_17_00) & (st < st_19_00))]
    return train_features, train_labels


In [188]:
# 生成20每分钟的平均行驶时间
train_df = time_window(train_df)
test_df = time_window(test_df)
train_df.head()


Unnamed: 0,intersection_id,tollgate_id,starting_time,avg_travel_time
0,A,2,2016-07-19 00:20:00,58.05
1,A,2,2016-07-19 01:20:00,56.87
2,A,2,2016-07-19 01:40:00,77.74
3,A,2,2016-07-19 02:20:00,42.64
4,A,2,2016-07-19 02:40:00,40.173333


In [189]:
# 生成训练集的特征和标签，测试集的特征
train_features, train_labels = slice_feature_label(train_df)
train_features = train_features.ix[:, 'avg_travel_time':'hour']
train_labels = train_labels.ix[:, 'avg_travel_time']
    
test_features = generate_features(test_df)
test_features = test_features.ix[:, 'avg_travel_time':'hour']
train_features.head(10)
# train_labels.head(20)

Unnamed: 0,avg_travel_time,hour
13,46.356,6.0
14,48.588,6.0
15,66.6425,6.0
16,64.681,7.0
17,85.676,7.0
18,58.968889,7.0
40,79.420833,15.0
41,72.314,15.0
42,58.741,15.0
43,59.6375,16.0


In [190]:
# 发现有数据缺少，features和labels的维度不对应
print train_features.shape[0]
print train_labels.shape[0]
print test_features.shape[0]

5032
5162
448


In [191]:
# SVR拟合
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
train_num = train_features.shape[0]
svr_rbf.fit(train_features, train_labels[:train_num])

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [192]:
# SVR预测
test_pred = svr_rbf.predict(test_features)
print test_pred.size
test_pred[:10]

448


array([  78.51456414,   67.52449113,   81.03978677,   69.11540408,
         72.73833804,   73.94572676,  108.59826949,  103.42464977,
         80.63631978,   78.96038047])

In [194]:
# 生成对应格式的输出csv文件
intersection_id = test_df.intersection_id
tollgate_id = test_df.tollgate_id
start_time= test_df.starting_time.apply(lambda dt: dt+timedelta(hours=2))
end_time = start_time.apply(lambda dt: dt + timedelta(minutes=20))
time_window  = '['+ start_time.astype(np.str) + ',' + end_time.astype(np.str) + ')'
data = {'intersection_id':intersection_id, 'tollgate_id': tollgate_id,
        'time_window': time_window, 'avg_travel_time': test_pred}
columns = ['intersection_id', 'tollgate_id', 'time_window', 'avg_travel_time']
test_baseline = pd.DataFrame(data=data, columns=columns)
test_baseline.to_csv('task1_svr_rbf.csv', index=False)
test_baseline.head()

Unnamed: 0,intersection_id,tollgate_id,time_window,avg_travel_time
0,A,2,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",78.514564
1,A,2,"[2016-10-18 08:20:00,2016-10-18 08:40:00)",67.524491
2,A,2,"[2016-10-18 08:40:00,2016-10-18 09:00:00)",81.039787
3,A,2,"[2016-10-18 09:00:00,2016-10-18 09:20:00)",69.115404
4,A,2,"[2016-10-18 09:20:00,2016-10-18 09:40:00)",72.738338
