In [2]:
import pandas as pd
import os
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from datatool import ade,vid_is_unique,foot2meter,vehicle2track,reset_idx,getNeighborGraph,graph2seq,get_displacement,train_test_val_split,matlab2dataframe
from collections import Counter
import time
import pickle as pkl

In [2]:
def add_change_label(data):
    data = data.sort_values(by=["id","frame"])
    changes = []
    for vid, df in data.groupby("id"):
        lane_id = df.laneId.to_numpy()
        diff = lane_id[1:] - lane_id[:-1]
        changed_or_not = (diff!=0)*1 # 1 : change; 0 : unchange
        changed_or_not = np.hstack((np.zeros(1),changed_or_not))
        changes.append(changed_or_not)
    change_result = np.hstack(changes)
    return change_result

def getNeighborGraph(data,radius=10):
    x,y,v_id,f_id = data.x,data.y,data.id,data.frame

    vehicle_num, frame_num = v_id.max()+1, f_id.max()+1
    sparse_X = csr_matrix((x, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的坐标x
    sparse_Y = csr_matrix((y, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的坐标y
    I_mat = (sparse_X!=0)*1 # i行:车id;j列:时间;元素为i车j时刻是否出现,出现为1,否则为0
    mask = []
    for v in range(I_mat.shape[0]):
        concurrent_mask = I_mat.multiply(I_mat[v]) #同或 [1,0,1,0,0,0,1] & [1,0,1,1,1,0,0] = [1,0,1,0,0,0,0]

        # 邻居xy坐标
        concurrent_X = concurrent_mask.multiply(sparse_X) 
        concurrent_Y = concurrent_mask.multiply(sparse_Y)

        # 自己xy坐标
        self_x = concurrent_mask.multiply(sparse_X[v])
        self_y = concurrent_mask.multiply(sparse_Y[v])

        # 差值
        delta_x = self_x - concurrent_X
        delta_y = self_y - concurrent_Y

        # 邻居x坐标在半径以内的指示矩阵
        x_in_id = np.where((delta_x.data>-radius) & (delta_x.data<radius))
        xc = delta_x.tocoo()
        xrow_in = xc.row[x_in_id]
        xcol_in = xc.col[x_in_id]
        xI_data = np.ones(xrow_in.shape[0])
        xneighbor_in_mat = csr_matrix((xI_data, (xrow_in, xcol_in)), shape=(I_mat.shape[0], I_mat.shape[1]))

        # 邻居y坐标在半径以内的指示矩阵
        y_in_id = np.where((delta_y.data>-radius) & (delta_y.data<radius))
        yc = delta_y.tocoo()
        yrow_in = yc.row[y_in_id]
        ycol_in = yc.col[y_in_id]
        yI_data = np.ones(yrow_in.shape[0])
        yneighbor_in_mat = csr_matrix((yI_data, (yrow_in, ycol_in)), shape=(I_mat.shape[0], I_mat.shape[1]))

        neighbor_in_mat = xneighbor_in_mat.multiply(yneighbor_in_mat).tolil()
        neighbor_in_mat[v] = I_mat[v]
        mask.append(neighbor_in_mat.tocsr())
    return mask

def graph2seq(data,graph_list,seq_length=16,max_vnum=30,down_sample_rate=5,sort_func="distance"):
    x,y,v_id,f_id,l = data.x,data.y,data.id,data.frame,data.label
    vehicle_num, frame_num = v_id.max()+1, f_id.max()+1
    sparse_X = csr_matrix((x, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的坐标x
    sparse_Y = csr_matrix((y, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的坐标y
    sparse_L = csr_matrix((l, (v_id, f_id)), shape=(int(vehicle_num), int(frame_num))) # i行:车id;j列:时间;元素为i车j时刻的下一时刻是否lane change
    seq_windows,label = [],[]
    for v,graph in enumerate(graph_list):
        if graph.data.size==0:
            continue
        row = np.unique(graph.tocoo().row)
        col = np.unique(graph.tocoo().col)
        row_start,row_end = row.min(), row.max()+1
        col_start,col_end = col.min(), col.max()+1
        dense_v = v - row_start
        dense_I = graph[row_start:row_end,col_start:col_end].toarray()
        dense_x = sparse_X[row_start:row_end,col_start:col_end].toarray()
        dense_y = sparse_Y[row_start:row_end,col_start:col_end].toarray()
        dense_xy = np.stack((dense_x,dense_y),axis=2) # (vum,total_seq,2)
        dense_l = sparse_L[row_start:row_end,col_start:col_end].toarray()
        if dense_xy.shape[0]<max_vnum:
            padding_num = max_vnum-dense_xy.shape[0]
            padding_xy = np.zeros((padding_num,dense_xy.shape[1],dense_xy.shape[2]))
            padding_I = np.zeros((padding_num,dense_I.shape[1]))
            dense_xy = np.vstack([dense_xy,padding_xy])
            dense_I = np.vstack([dense_I,padding_I])
            dense_l = np.vstack([dense_l,padding_I])
        for i in range(dense_xy.shape[1]): # for loop on sequence dim
            if (i+seq_length)*down_sample_rate > dense_xy.shape[1]:
                break
            window = dense_xy[:,i:i+seq_length*down_sample_rate:down_sample_rate,:] # (vum=30,seq=16,2)
            window_l = dense_l[:,i:i+seq_length*down_sample_rate:down_sample_rate] # (vum=30,seq=16)
            if sort_func == "duration":
                dense_seq_I = dense_I[:,i:(i+seq_length)*down_sample_rate:down_sample_rate]
                related_score = dense_seq_I.sum(axis=1)
                related_score[dense_v] = related_score[dense_v] + 100 # actually 1 is enough
                related_rank = np.argsort(-related_score)
            elif sort_func == "distance":
                related_score = ade(window[:,:6,:],window[dense_v,:6,:])
                related_rank = np.argsort(related_score)
            window = window[related_rank[:max_vnum],:,:]    
            seq_windows.append(window)
            label.append((window_l[0,6:].sum()>0)*1) # 30,17,2 (0-5),6,(7-16)
    if len(seq_windows)==0:
        seq_data = None
        seq_label = None
    else:
        seq_data = np.stack(seq_windows)#(n,vum=30,seq=16,2)
        seq_label = np.stack(label)
    return seq_data,seq_label

In [10]:
selected_col = ["frame","id","x","y","laneId"]
total_length = 0
for i in range(60):
    t1 = time.time()
    if i < 9:
        path = f"highD-dataset-v1.0/data/0{i+1}_tracks.csv"
    else:
        path = f"highD-dataset-v1.0/data/{i+1}_tracks.csv"
    data = pd.read_csv(path)
    useful_data = data[selected_col]
    useful_data = useful_data.sort_values(by=["id","frame"])
    label = add_change_label(useful_data)
    useful_data["label"] = pd.Series(label)
    '''
    uni_id = selected_data.id.unique()
    mapping_uni_id = np.arange(uni_id.shape[0])
    new_uni_id = mapping_uni_id + total_length
    total_length += uni_id.shape[0]
    id_dict = dict(zip(uni_id, new_uni_id))
    new_id = np.vectorize(id_dict.get)(data.id)
    selected_data.id = new_id
    '''
    begin = useful_data[useful_data["label"]==1].index - 20
    end = useful_data[useful_data["label"]==1].index + 20
    new_label = np.zeros(useful_data.shape[0])
    for b,e in zip(begin,end):
        new_label[b:e] = 1
    useful_data['label'] = new_label
    neighbor_graph = getNeighborGraph(useful_data,radius=50)
    seq_data,seq_label = graph2seq(useful_data,neighbor_graph,seq_length=17)
    t2 = time.time()
    print(f"file_{i}  processed. time: {t2-t1:.2f}. data shape {seq_data.shape}. label shape {seq_label.shape}.")
    file_data = {"data":seq_data,"label":seq_label}
    with open(f"pickle_data/data_{i}.pkl","wb") as f:
        pkl.dump(file_data,f)

file_0  processed. time: 10.41. data shape (52619, 30, 17, 2). label shape (52619,).
file_1  processed. time: 11.72. data shape (57405, 30, 17, 2). label shape (57405,).
file_2  processed. time: 8.57. data shape (45710, 30, 17, 2). label shape (45710,).
file_3  processed. time: 12.55. data shape (59114, 30, 17, 2). label shape (59114,).
file_4  processed. time: 13.21. data shape (60612, 30, 17, 2). label shape (60612,).
file_5  processed. time: 15.90. data shape (68786, 30, 17, 2). label shape (68786,).
file_6  processed. time: 7.37. data shape (37417, 30, 17, 2). label shape (37417,).
file_7  processed. time: 19.65. data shape (69908, 30, 17, 2). label shape (69908,).
file_8  processed. time: 16.01. data shape (64080, 30, 17, 2). label shape (64080,).
file_9  processed. time: 7.53. data shape (37657, 30, 17, 2). label shape (37657,).
file_10  processed. time: 27.78. data shape (102683, 30, 17, 2). label shape (102683,).
file_11  processed. time: 64.25. data shape (181866, 30, 17, 2). 

In [2]:
pickle_folder = "pickle_data"
total_data,total_label = [],[]
for file_path in os.listdir(pickle_folder):
    if "_" not in file_path:
        continue
    path = os.path.join(pickle_folder,file_path)
    with open(path,"rb") as f:
        pickle_file = pkl.load(f)
    label = pickle_file["label"]
    data = pickle_file["data"]
    pos_data = data[label==1]
    neg_data = data[label==0]
    pos_data_num = pos_data.shape[0]
    total_data.append(pos_data.repeat(10,axis=0))
    total_data.append(neg_data[:pos_data_num*10])
    total_label.append(np.ones(pos_data_num*10))
    total_label.append(np.zeros(pos_data_num*10))
data_array = np.vstack(total_data)
label_array = np.hstack(total_label)
print(data_array.shape, label_array.shape)

(234800, 30, 17, 2) (234800,)


In [4]:
pickle_folder = "pickle_data"
pos_data_list, neg_data_list = [], []
for file_path in os.listdir(pickle_folder):
    if "_" not in file_path:
        continue
    path = os.path.join(pickle_folder,file_path)
    with open(path,"rb") as f:
        pickle_file = pkl.load(f)
    label = pickle_file["label"]
    data = pickle_file["data"]
    pos_data = data[label==1]
    neg_data = data[label==0]
    pos_data_num = pos_data.shape[0]
    pos_data_list.append(pos_data)
    neg_data_list.append(neg_data[:pos_data_num*10])
pos_data_array = np.vstack(pos_data_list)
neg_data_array = np.vstack(neg_data_list)
print(pos_data_array.shape,neg_data_array.shape)

(22326, 30, 17, 2) (223260, 30, 17, 2)


In [3]:
pickle_folder = "pickle_data"
keep_data_list, right_data_list,left_data_list = [], [], []
for file_path in os.listdir(pickle_folder):
    if "data_" not in file_path:
        continue
    path = os.path.join(pickle_folder,file_path)
    with open(path,"rb") as f:
        pickle_file = pkl.load(f)
    label = pickle_file["label"]
    data = pickle_file["data"]
    left_number = right_number = keep_number = 0
    if (label == 1).any():
        left_data = data[label==1]
        left_data_list.append(left_data)
        left_number = left_data.shape[0]
    if (label == -1).any():
        right_data = data[label==-1]
        right_data_list.append(right_data)
        right_number = right_data.shape[0]
    if (label == 0).any():
        keep_number = max(left_number,right_number)*10
        keep_data = data[label==0]
        keep_data_list.append(keep_data[:keep_number])
    print(f"file: {file_path}, keep: {keep_number}, right: {right_number}, left: {left_number}")
keep_data_array = np.vstack(keep_data_list)
right_data_array = np.vstack(right_data_list)
left_data_array = np.vstack(left_data_list)
print(keep_data_array.shape,right_data_array.shape,left_data_array.shape)

file: data_8.pkl, keep: 950, right: 0, left: 95
file: data_3.pkl, keep: 2530, right: 253, left: 11
file: data_30.pkl, keep: 550, right: 55, left: 0
file: data_17.pkl, keep: 1020, right: 102, left: 0
file: data_25.pkl, keep: 260, right: 26, left: 0
file: data_44.pkl, keep: 1410, right: 66, left: 141
file: data_0.pkl, keep: 1500, right: 0, left: 150
file: data_57.pkl, keep: 1340, right: 134, left: 0
file: data_38.pkl, keep: 1330, right: 3, left: 133
file: data_16.pkl, keep: 1300, right: 107, left: 130
file: data_51.pkl, keep: 430, right: 6, left: 43
file: data_33.pkl, keep: 700, right: 0, left: 70
file: data_49.pkl, keep: 1850, right: 25, left: 185
file: data_4.pkl, keep: 1600, right: 87, left: 160
file: data_9.pkl, keep: 2810, right: 281, left: 157
file: data_46.pkl, keep: 2110, right: 3, left: 211
file: data_14.pkl, keep: 540, right: 43, left: 54
file: data_13.pkl, keep: 370, right: 26, left: 37
file: data_36.pkl, keep: 720, right: 72, left: 6
file: data_2.pkl, keep: 940, right: 0, lef

In [5]:
_ = {
    "right_data":right_data_array,
    "left_data":left_data_array,
    "keep_data":keep_data_array
}
with open("pickle_data/23w10v1_3cls.pkl","wb") as f:
    pkl.dump(_,f)

In [5]:
_ = {
    "pos_data":pos_data_array,
    "neg_data":neg_data_array,
}
with open("pickle_data/23w10v1.pkl","wb") as f:
    pkl.dump(_,f)

In [2]:
import pickle as pkl
import numpy as np
from datatool import train_test_val_split

with open("pickle_data/23w10v1.pkl","rb") as f:
    data = pkl.load(f)
pos_data,neg_data = data['pos_data'],data['neg_data']
print(pos_data.shape,neg_data.shape)


(22326, 30, 17, 2) (223260, 30, 17, 2)


In [4]:
pos_train, pos_val, pos_test = train_test_val_split(pos_data,test_size=0.2,val_size=0.1,seed=0)
neg_train, neg_val, neg_test = train_test_val_split(neg_data,test_size=0.2,val_size=0.1,seed=0)
pos_train, pos_val, pos_test = pos_train.repeat(10,axis=0),pos_val.repeat(10,axis=0),pos_test.repeat(10,axis=0)
print(pos_train.shape,pos_val.shape,pos_test.shape)
print(neg_train.shape,neg_val.shape,neg_test.shape)

(156280, 30, 17, 2) (22320, 30, 17, 2) (44660, 30, 17, 2)
(156282, 30, 17, 2) (22326, 30, 17, 2) (44652, 30, 17, 2)


In [7]:
X_train, X_val, X_test = np.vstack((pos_train,neg_train)),np.vstack((pos_val,neg_val)),np.vstack((pos_test,neg_test))
y_train = np.hstack((np.ones(pos_train.shape[0]),np.zeros(neg_train.shape[0])))
y_val = np.hstack((np.ones(pos_val.shape[0]),np.zeros(neg_val.shape[0])))
y_test = np.hstack((np.ones(pos_test.shape[0]),np.zeros(neg_test.shape[0])))
print(X_train.shape,y_train.shape)
print(X_val.shape,y_val.shape)
print(X_test.shape,y_test.shape)

(312562, 30, 17, 2) (312562,)
(44646, 30, 17, 2) (44646,)
(89312, 30, 17, 2) (89312,)
