In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
# get all date list ,check out which date are missing
def get_date_list(start, end, toFormat):
    date_list = []
    date = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
    end = datetime.datetime.strptime(end, '%Y-%m-%d %H:%M:%S')
    while date <= end:
        date_list.append(date.strftime(toFormat))
        date = date + datetime.timedelta(minutes=20)
    return date_list

def GetTimeSeries(train,intersection_id,tollgate_id):
    '''
    Get intersection_id & tollgate_id 's Time Series
    '''
    Tag_Data = train[train.intersection_id == intersection_id]
    Tag_Data = Tag_Data[Tag_Data.tollgate_id == tollgate_id]
    
    MinTime = min(Tag_Data.start_time)
    MaxTime = max(Tag_Data.start_time)
    DataTimeRange = pd.date_range(start = MinTime , end = MaxTime , freq = '20Min')
    ts_0 = pd.Series([0]*len(DataTimeRange),index=DataTimeRange)
    ts =pd.Series(Tag_Data.avg_travel_time.values , index = Tag_Data.start_time)
    TS = ts_0+ts
#   先用前一个时刻的数据做填充。。。暂时想法。也是比较方便的//或者使用相近的历史数据进行填充/
    TS = TS.fillna(method='pad')
#     TS = TS.fillna(0)
    return TS
def DrawTsList(ts_list):
    ts_list.T.plot(kind='line',marker='o',legend=False,figsize=[24,12])
    plt.show()
def Get_Part_of_TimeSeries(TS,TimeRange):
    '''
    Input [start_time,end_time]
    '''
    return TS[TimeRange[0]:TimeRange[1]]

def GenerateTs_0(Time):
    '''
    Input [start_time,end_time]
    '''
    timerange = pd.date_range(start = Time[0],end = Time[1] ,freq = '20Min')
    ts = pd.Series(np.zeros(len(timerange)),index = timerange)
    return ts

def TsList(train,intersection_id,tollgate_id,Time):
    '''
    Input [start_time,end_time]
    '''
    ts_list=[]
    ts = GetTimeSeries(train,intersection_id,tollgate_id)
    for i in range(0,92):
        TimeRange = Time + datetime.timedelta(i)
        ts_part = Get_Part_of_TimeSeries(ts,TimeRange)
        if len(ts_part) == 0 or ts_part.isnull().any():
            ts_list.append(GenerateTs_0(TimeRange))
        else:
            ts_list.append(ts_part)
    c = []
    for x in ts_list:
        a = list(np.array(x))
        c.append(a)        
#     return np.array(ts_list)
    return c
def TrueFalseListCombine(TFlist1,TFlist2):
    return [l1 and l2 for l1,l2 in zip(TFlist1,TFlist2)]

def ExceptOutlier(ts_list):
    Mean = pd.DataFrame([np.mean(i) for i in ts_list])
    mean_low = Mean > Mean.quantile(0.2)
    mean_up = Mean < Mean.quantile(0.8)
    TF = TrueFalseListCombine(mean_low.values,mean_up.values)
    mean_index = Mean[TF].index.values    
    Std = pd.DataFrame([np.std(i) for i in ts_list])
    std_low = Std > Std.quantile(0.2)
    std_up = Std < Std.quantile(0.8)
    TF = TrueFalseListCombine(std_low.values,std_up.values)
    std_index = Std[TF].index.values  
    valid_index = list(set(mean_index)&set(std_index))
    return valid_index 

In [2]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn import neighbors

def L2(pred,true):
    loss = np.square(pred-true)
    return loss.mean()

def L1(pred,true):
    loss = np.abs(pred-true)
    return loss.mean()

def MAPE(pred,true):
    print "pred",pred
    print "true",true
    loss = np.abs((pred-true)/(true))
    return loss.mean()

#This function chooses the best point estimate for a numpy array, according to a particular loss.
#The loss function should take two numpy arrays as arguments, and return a scalar. One example is MAPE, see above.
def solver(x,loss):
    mean = x.mean()
    best = loss(mean,x)
    result = mean
    for i in x:
        score = loss(i,x)
        if score < best:
            best = score
            result = i
    return result
class NonparametricKNN(object):
    def __init__(self,n_neighbors=5,loss='L2'):
        if loss in ['L1','L2','MAPE']:
            loss = {'L1':L1,'L2':L2,'MAPE':MAPE}[loss]
        self.model = NearestNeighbors(n_neighbors,algorithm='auto',n_jobs=-1)
        self.solver = lambda x:solver(x,loss)
    def fit(self,train,target):#All inputs should be numpy arrays.
        self.model.fit(train)
        self.f=np.vectorize(lambda x:target[x])
        self.f =lambda x:target[x]
        return self
#     在以前的历史数据中找到相同模式的历史数据，用它们的值作为预测值
    def predict(self,test):#Return predictions as a numpy array.
        neighbors = self.model.kneighbors(test,return_distance=False)
        neighbors = self.f(neighbors)
        result = np.apply_along_axis(self.solver,1,neighbors)
        return result

In [3]:
train = pd.read_csv("C:/Users/Administrator/PycharmProjects/Traffic/Data/training_20min_avg_travel_time-Copy1.csv")
train['start_time'] = train['time_window'].apply(lambda x :(x.split(','))[0][1:])
train['start_time'] = pd.to_datetime(train['start_time'])
# Turn A,B,C to 1,2,3
train.replace('B',2,inplace=True)
train.replace('A',1,inplace=True)
train.replace('C',3,inplace=True)
train.drop(['time_window'],axis=1,inplace=True)

In [4]:
test = pd.read_csv("C:/Users/Administrator/PycharmProjects/Traffic/Data/test1_20min_avg_travel_time.csv")
test['start_time'] = test['time_window'].apply(lambda x :(x.split(','))[0][1:])
test['start_time'] = pd.to_datetime(test['start_time'])
# Turn A,B,C to 1,2,3
test.replace('B',2,inplace=True)
test.replace('A',1,inplace=True)
test.replace('C',3,inplace=True)
test.drop(['time_window'],axis=1,inplace=True)

In [12]:
def run(intersection_id,tollgate_id,train,test,isMorning):    
   
    start_time= []
    end_time=[]
    if(isMorning):
        train_Time = np.array([pd.datetime(2016,7,18,6,0,0),pd.datetime(2016,7,18,9,40,0)])
        test_Time = np.array([pd.datetime(2016,10,18,6,0,0),pd.datetime(2016,10,18,9,40,0)])
        
        MoriningTime = []
        MoriningTime1 = get_date_list('2016-10-18 08:00:00', '2016-10-18 09:40:00','%Y-%m-%d %H:%M:%S')
        MoriningTime2 = get_date_list('2016-10-19 08:00:00', '2016-10-19 09:40:00','%Y-%m-%d %H:%M:%S')
        MoriningTime3 = get_date_list('2016-10-20 08:00:00', '2016-10-20 09:40:00','%Y-%m-%d %H:%M:%S')
        MoriningTime4 = get_date_list('2016-10-21 08:00:00', '2016-10-21 09:40:00','%Y-%m-%d %H:%M:%S')
        MoriningTime5 = get_date_list('2016-10-22 08:00:00', '2016-10-22 09:40:00','%Y-%m-%d %H:%M:%S')
        MoriningTime6 = get_date_list('2016-10-23 08:00:00', '2016-10-23 09:40:00','%Y-%m-%d %H:%M:%S')
        MoriningTime7 = get_date_list('2016-10-24 08:00:00', '2016-10-24 09:40:00','%Y-%m-%d %H:%M:%S')
        MoriningTimeSet = [MoriningTime1,MoriningTime2,MoriningTime3,MoriningTime4,MoriningTime5,MoriningTime6,MoriningTime7]
        
        MoriningEndTime = []
        MoriningEndTime1 = get_date_list('2016-10-18 08:20:00', '2016-10-18 10:00:00','%Y-%m-%d %H:%M:%S')
        MoriningEndTime2 = get_date_list('2016-10-19 08:20:00', '2016-10-19 10:00:00','%Y-%m-%d %H:%M:%S')
        MoriningEndTime3 = get_date_list('2016-10-20 08:20:00', '2016-10-20 10:00:00','%Y-%m-%d %H:%M:%S')
        MoriningEndTime4 = get_date_list('2016-10-21 08:20:00', '2016-10-21 10:00:00','%Y-%m-%d %H:%M:%S')
        MoriningEndTime5 = get_date_list('2016-10-22 08:20:00', '2016-10-22 10:00:00','%Y-%m-%d %H:%M:%S')
        MoriningEndTime6 = get_date_list('2016-10-23 08:20:00', '2016-10-23 10:00:00','%Y-%m-%d %H:%M:%S')
        MoriningEndTime7 = get_date_list('2016-10-24 08:20:00', '2016-10-24 10:00:00','%Y-%m-%d %H:%M:%S')
        MoriningEndTimeSet = [MoriningEndTime1,MoriningEndTime2,MoriningEndTime3,MoriningEndTime4,MoriningEndTime5,MoriningEndTime6,MoriningEndTime7]
        for y in MoriningTimeSet:
            for x in y:
                MoriningTime.append(x)
        for y in MoriningEndTimeSet:
            for x in y:
                MoriningEndTime.append(x)        
        start_time = MoriningTime  
        end_time = MoriningEndTime
    else:
        train_Time = np.array([pd.datetime(2016,7,18,15,0,0),pd.datetime(2016,7,18,18,40,0)])
        test_Time = np.array([pd.datetime(2016,10,18,15,0,0),pd.datetime(2016,10,18,18,40,0)])
        AfterNoonTime = []    
        AfterNoonTime1 = get_date_list('2016-10-18 17:00:00', '2016-10-18 18:40:00','%Y-%m-%d %H:%M:%S')
        AfterNoonTime2 = get_date_list('2016-10-19 17:00:00', '2016-10-19 18:40:00','%Y-%m-%d %H:%M:%S')
        AfterNoonTime3 = get_date_list('2016-10-20 17:00:00', '2016-10-20 18:40:00','%Y-%m-%d %H:%M:%S')
        AfterNoonTime4 = get_date_list('2016-10-21 17:00:00', '2016-10-21 18:40:00','%Y-%m-%d %H:%M:%S')
        AfterNoonTime5 = get_date_list('2016-10-22 17:00:00', '2016-10-22 18:40:00','%Y-%m-%d %H:%M:%S')
        AfterNoonTime6 = get_date_list('2016-10-23 17:00:00', '2016-10-23 18:40:00','%Y-%m-%d %H:%M:%S')
        AfterNoonTime7 = get_date_list('2016-10-24 17:00:00', '2016-10-24 18:40:00','%Y-%m-%d %H:%M:%S')
        AfterNoonTimeSet = [AfterNoonTime1,AfterNoonTime2,AfterNoonTime3,AfterNoonTime4,AfterNoonTime5,AfterNoonTime6,AfterNoonTime7]    
        for y in AfterNoonTimeSet:
            for x in y:
                AfterNoonTime.append(x)
        AfterNoonEndTime = []
        AfterNoonEndTime1 = get_date_list('2016-10-18 17:20:00', '2016-10-18 19:00:00','%Y-%m-%d %H:%M:%S')
        AfterNoonEndTime2 = get_date_list('2016-10-19 17:20:00', '2016-10-19 19:00:00','%Y-%m-%d %H:%M:%S')
        AfterNoonEndTime3 = get_date_list('2016-10-20 17:20:00', '2016-10-20 19:00:00','%Y-%m-%d %H:%M:%S')
        AfterNoonEndTime4 = get_date_list('2016-10-21 17:20:00', '2016-10-21 19:00:00','%Y-%m-%d %H:%M:%S')
        AfterNoonEndTime5 = get_date_list('2016-10-22 17:20:00', '2016-10-22 19:00:00','%Y-%m-%d %H:%M:%S')
        AfterNoonEndTime6 = get_date_list('2016-10-23 17:20:00', '2016-10-23 19:00:00','%Y-%m-%d %H:%M:%S')
        AfterNoonEndTime7 = get_date_list('2016-10-24 17:20:00', '2016-10-24 19:00:00','%Y-%m-%d %H:%M:%S')
        AfterNoonEndTimeSet = [AfterNoonEndTime1,AfterNoonEndTime2,AfterNoonEndTime3,AfterNoonEndTime4,AfterNoonEndTime5,AfterNoonEndTime6,AfterNoonEndTime7]        
        for y in AfterNoonEndTimeSet:
            for x in y:
                AfterNoonEndTime.append(x)   
        start_time = AfterNoonTime         
        end_time = AfterNoonEndTime  
    
    ts_list = TsList(train,intersection_id,tollgate_id,train_Time)
    # 此处的过滤策略是：对每天特定时间段的数据求均值与标准差，然后将均值与标准差落在10%分位数以下和90%分位数以上的日子去除 
    # 我觉得，去除异常的时候，应该把缺省值给补充回来
    valid_index = ExceptOutlier(ts_list)
    ts_list = pd.DataFrame(ts_list)
    ts_list = ts_list.ix[valid_index]
    
    td = TsList(test,intersection_id,tollgate_id,test_Time)
    td_list = pd.DataFrame(td)
#     td_list = td_list.fillna(method='pad')
    td_list = td_list[0:7]
    col =[0,1,2,3,4,5]
    label1=[6]
    label2=[7]
    label3=[8]
    label4=[9]
    label5=[10]
    label6=[11]
    
   
    label =[label2,label3,label4,label5,label6]
    
    train = np.array(ts_list[col])
    target =np.array(ts_list[label1])
    test = np.array(td_list[col])
    model = NonparametricKNN(n_neighbors=3,loss='MAPE')
    #Train

    model.fit(train,target)
    #Predict
    Y_predict=model.predict(test)
    result =pd.DataFrame(Y_predict)
    result.rename(columns={0:label1[0]},inplace = True) 
    for eachLabel in label:
        train = np.array(ts_list[col])
        target =np.array(ts_list[eachLabel])
        test = np.array(td_list[col])
        model = NonparametricKNN(n_neighbors=3,loss='MAPE')
        #Train
        model.fit(train,target)
        #Predict
        Y_predict=model.predict(test)
        Y_predict=pd.DataFrame(Y_predict)
        Y_predict.rename(columns={0:eachLabel[0]},inplace = True) 
        print Y_predict
        result = result.join(Y_predict)
    avg_travel_time =[]
    for  x in  result.values:
        for a in x:
            avg_travel_time.append(a)   
    result = pd.DataFrame(columns=["avg_travel_time"],data=avg_travel_time)
    result['start_time'] = start_time
    result['end_time'] = end_time 
    result['intersection_id']=intersection_id
    result['tollgate_id']=tollgate_id
    return result

In [25]:
Sum = pd.DataFrame({'avg_travel_time':[],'start_time':[],'end_time':[],'intersection_id':[],'tollgate_id':[]})
for intersection_id,intersection_id_group in train.groupby(['intersection_id']):
        for tollgate_id,tollgate_id_group  in intersection_id_group.groupby(['tollgate_id']):
            Morning =  run(intersection_id,tollgate_id,train,test,True)
            Afternoon =  run(intersection_id,tollgate_id,train,test,False)
            Sum = Sum.merge(Morning,how='outer')
            Sum = Sum.merge(Afternoon,how='outer')

pred 95.9433333333
true [  86.16  133.58   68.09]
pred 86.16
true [  86.16  133.58   68.09]
pred 133.58
true [  86.16  133.58   68.09]
pred 68.09
true [  86.16  133.58   68.09]
pred 60.7333333333
true [ 55.45  61.3   65.45]
pred 55.45
true [ 55.45  61.3   65.45]
pred 61.3
true [ 55.45  61.3   65.45]
pred 65.45
true [ 55.45  61.3   65.45]
pred 79.1633333333
true [ 56.51  85.51  95.47]
pred 56.51
true [ 56.51  85.51  95.47]
pred 85.51
true [ 56.51  85.51  95.47]
pred 95.47
true [ 56.51  85.51  95.47]
pred 104.666666667
true [  98.71   81.71  133.58]
pred 98.71
true [  98.71   81.71  133.58]
pred 81.71
true [  98.71   81.71  133.58]
pred 133.58
true [  98.71   81.71  133.58]
pred 72.2133333333
true [ 59.86  82.59  74.19]
pred 59.86
true [ 59.86  82.59  74.19]
pred 82.59
true [ 59.86  82.59  74.19]
pred 74.19
true [ 59.86  82.59  74.19]
pred 74.18
true [ 66.84  81.71  73.99]
pred 66.84
true [ 66.84  81.71  73.99]
pred 81.71
true [ 66.84  81.71  73.99]
pred 73.99
true [ 66.84  81.71  73.99]

In [26]:
Sum

Unnamed: 0,avg_travel_time,start_time,end_time,intersection_id,tollgate_id
0,86.16,2016-10-18 08:00:00,2016-10-18 08:20:00,1,2
1,83.28,2016-10-18 08:20:00,2016-10-18 08:40:00,1,2
2,84.31,2016-10-18 08:40:00,2016-10-18 09:00:00,1,2
3,69.75,2016-10-18 09:00:00,2016-10-18 09:20:00,1,2
4,66.51,2016-10-18 09:20:00,2016-10-18 09:40:00,1,2
5,65.32,2016-10-18 09:40:00,2016-10-18 10:00:00,1,2
6,61.30,2016-10-19 08:00:00,2016-10-19 08:20:00,1,2
7,66.75,2016-10-19 08:20:00,2016-10-19 08:40:00,1,2
8,72.87,2016-10-19 08:40:00,2016-10-19 09:00:00,1,2
9,78.27,2016-10-19 09:00:00,2016-10-19 09:20:00,1,2


In [27]:
# intersection_id,tollgate_id,time_window,avg_travel_time
# A,2,"[2016-10-18 08:00:00,2016-10-18 08:20:00)",681.3

In [28]:
# Turn A,B,C to 1,2,3
Sum['intersection_id'].replace(2,'B',inplace=True)
Sum['intersection_id'].replace(1,'A',inplace=True)
Sum['intersection_id'].replace(3,'C',inplace=True)
Sum['time_window'] = "["+Sum['start_time']+","+Sum['end_time']+")"
Sum.drop(['start_time','end_time'],axis=1,inplace=True)

In [29]:
col = ['intersection_id','tollgate_id','time_window','avg_travel_time']
Sum = Sum.ix[:,col]

In [30]:
Sum.to_csv('knn_predict.csv',index=False)