In [1]:
import pandas as pd
import os
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from datatool import ade,vid_is_unique,foot2meter,vehicle2track,reset_idx,getNeighborGraph,graph2seq,get_displacement,train_test_val_split,matlab2dataframe
from collections import Counter
import pickle as pkl

def add_change_label(data):
    data = data.sort_values(by=["Vehicle_ID","Frame_ID"])
    changes = []
    for vid, df in data.groupby("Vehicle_ID"):
        lane_id = df.Lane_ID.to_numpy()
        diff = lane_id[1:] - lane_id[:-1]
        changed_or_not = (diff!=0)*1 # 1 : change; 0 : unchange
        changed_or_not = np.hstack((np.zeros(1),changed_or_not))
        changes.append(changed_or_not)
    change_result = np.hstack(changes)
    return change_result

def graph2seq(data,graph_list,seq_length=16,max_vnum=30,down_sample_rate=5,sort_func="distance"):
    x,y,v_id,f_id,l = data.Local_X,data.Local_Y,data.Vehicle_ID,data.Frame_ID,data.label
    vehicle_num, frame_num = v_id.max()+1, f_id.max()+1
    sparse_X = csr_matrix((x, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的坐标x
    sparse_Y = csr_matrix((y, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的坐标y
    sparse_L = csr_matrix((l, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的下一时刻是否lane change
    seq_windows,label = [],[]
    for v,graph in enumerate(graph_list):
        if graph.data.size==0:
            continue
        row = np.unique(graph.tocoo().row)
        col = np.unique(graph.tocoo().col)
        row_start,row_end = row.min(), row.max()+1
        col_start,col_end = col.min(), col.max()+1
        dense_v = v - row_start
        dense_I = graph[row_start:row_end,col_start:col_end].toarray()
        dense_x = sparse_X[row_start:row_end,col_start:col_end].toarray()
        dense_y = sparse_Y[row_start:row_end,col_start:col_end].toarray()
        dense_xy = np.stack((dense_x,dense_y),axis=2) # (vum,total_seq,2)
        dense_l = sparse_L[row_start:row_end,col_start:col_end].toarray()
        if dense_xy.shape[0]<max_vnum:
            padding_num = max_vnum-dense_xy.shape[0]
            padding_xy = np.zeros((padding_num,dense_xy.shape[1],dense_xy.shape[2]))
            padding_I = np.zeros((padding_num,dense_I.shape[1]))
            dense_xy = np.vstack([dense_xy,padding_xy])
            dense_I = np.vstack([dense_I,padding_I])
            dense_l = np.vstack([dense_l,padding_I])
        for i in range(dense_xy.shape[1]): # for loop on sequence dim
            if (i+seq_length)*down_sample_rate > dense_xy.shape[1]:
                break
            window = dense_xy[:,i:i+seq_length*down_sample_rate:down_sample_rate,:] # (vum=30,seq=16,2)
            window_l = dense_l[:,i:i+seq_length*down_sample_rate:down_sample_rate] # (vum=30,seq=16)
            if sort_func == "duration":
                dense_seq_I = dense_I[:,i:(i+seq_length)*down_sample_rate:down_sample_rate]
                related_score = dense_seq_I.sum(axis=1)
                related_score[dense_v] = related_score[dense_v] + 100 # actually 1 is enough
                related_rank = np.argsort(-related_score)
            elif sort_func == "distance":
                related_score = ade(window[:,:6,:],window[dense_v,:6,:])
                related_rank = np.argsort(related_score)
            window = window[related_rank[:max_vnum],:,:]    
            seq_windows.append(window)
            label.append(window_l[0,6])
    if len(seq_windows)==0:
        seq_data = None
        seq_label = None
    else:
        seq_data = np.stack(seq_windows)#(n,vum=30,seq=16,2)
        seq_label = np.stack(label)
    return seq_data,seq_label
# 1
root_folder = "../../../data/"
# 2
us101_folder = "US-101-LosAngeles-CA/us-101-vehicle-trajectory-data/vehicle-trajectory-data"
i80_folder = "I-80-Emeryville-CA/vehicle-trajectory-data"
#3
us101_f1 = "0820am-0835am/trajectories-0820am-0835am.csv"
us101_f2 = "0805am-0820am/trajectories-0805am-0820am.csv"
us101_f3 = "0750am-0805am/trajectories-0750am-0805am.csv"

i80_f1 = "0500pm-0515pm/trajectories-0500-0515.csv"
i80_f2 = "0400pm-0415pm/trajectories-0400-0415.csv"
i80_f3 = "0515pm-0530pm/trajectories-0515-0530.csv"

path_list = []
path_list.append(os.path.join(root_folder,us101_folder,us101_f1))
path_list.append(os.path.join(root_folder,us101_folder,us101_f2))
path_list.append(os.path.join(root_folder,us101_folder,us101_f3))
path_list.append(os.path.join(root_folder,i80_folder,i80_f1))
path_list.append(os.path.join(root_folder,i80_folder,i80_f2))
path_list.append(os.path.join(root_folder,i80_folder,i80_f3))


seq_data_list, seq_label_list = [], []
for path in path_list:
    print(path)
    data = pd.read_csv(path)
    useful_data = data[["Vehicle_ID","Frame_ID","Local_X","Local_Y","Lane_ID"]]
    useful_data = useful_data.sort_values(by=["Vehicle_ID","Frame_ID"])
    label = add_change_label(useful_data)
    useful_data["label"] = pd.Series(label)
    print(useful_data.shape,label.shape,Counter(label))
    begin = useful_data[useful_data["label"]==1].index - 20
    end = useful_data[useful_data["label"]==1].index + 20
    new_label = np.zeros(useful_data.shape[0])
    for b,e in zip(begin,end):
        new_label[b:e] = 1
    useful_data['label'] = new_label
    useful_data = reset_idx(useful_data)
    useful_data = foot2meter(useful_data)
    neighbor_graph = getNeighborGraph(useful_data,radius=50) 
    seq_data,seq_label = graph2seq(useful_data,neighbor_graph,seq_length=17)
    seq_data_list.append(seq_data)
    seq_label_list.append(seq_label)
    print(Counter(seq_label)[0]/(Counter(seq_label)[0]+Counter(seq_label)[1]))
    print("\n")

seq_data_array = np.concatenate(seq_data_list)
seq_label_array = np.concatenate(seq_label_list)
print(Counter(seq_label_array)[0]/(Counter(seq_label_array)[0]+Counter(seq_label_array)[1]))


../../../data/US-101-LosAngeles-CA/us-101-vehicle-trajectory-data/vehicle-trajectory-data/0820am-0835am/trajectories-0820am-0835am.csv
(1048575, 6) (1048575,) Counter({0.0: 1047910, 1.0: 665})
0.923280226720304


../../../data/US-101-LosAngeles-CA/us-101-vehicle-trajectory-data/vehicle-trajectory-data/0805am-0820am/trajectories-0805am-0820am.csv
(1048575, 6) (1048575,) Counter({0.0: 1047866, 1.0: 709})
0.9531038368602975


../../../data/US-101-LosAngeles-CA/us-101-vehicle-trajectory-data/vehicle-trajectory-data/0750am-0805am/trajectories-0750am-0805am.csv
(1048575, 6) (1048575,) Counter({0.0: 1047374, 1.0: 1201})
0.9529667598633116


../../../data/I-80-Emeryville-CA/vehicle-trajectory-data/0500pm-0515pm/trajectories-0500-0515.csv
(1048575, 6) (1048575,) Counter({0.0: 1047852, 1.0: 723})
0.9742373063106536


../../../data/I-80-Emeryville-CA/vehicle-trajectory-data/0400pm-0415pm/trajectories-0400-0415.csv
(1048575, 6) (1048575,) Counter({0.0: 1047688, 1.0: 887})
0.9922059309888657


../.

In [2]:
seq_data_array.shape, seq_label_array.shape

((1107315, 30, 17, 2), (1107315,))

In [3]:
positive_samples = seq_data_array[seq_label_array==1]
negative_samples = seq_data_array[seq_label_array==0]
print(positive_samples.shape,negative_samples.shape)

(39999, 30, 17, 2) (1067316, 30, 17, 2)


In [4]:
positive_sample_number = positive_samples.shape[0]
negative_sample_number = negative_samples.shape[0]
for i in range(negative_sample_number//positive_sample_number):
    neg_data_seg = negative_samples[positive_sample_number*i:positive_sample_number*(i+1)]
    ensemble_data_seg = np.vstack((neg_data_seg,positive_samples))
    label = np.hstack((np.zeros(neg_data_seg.shape[0]),np.ones(positive_samples.shape[0])))
    save = {
        "data":ensemble_data_seg,
        "label":label,
    }
    with open(f"data/data_{i}.pkl","wb") as f:
        pkl.dump(save,f)

In [5]:
import os
import pickle as pkl

In [4]:
folder = "data"
file_list = [f for f in os.listdir(folder) if not f.startswith('.')]
for file in file_list:
    path = os.path.join(folder,file)
    with open(path,"rb") as f:
        data_dict = pkl.load(f)
    data = data_dict['data']
    label = data_dict['label']