In [60]:
import pandas as pd
import numpy as np

#### Loading Data from CSV File

In [61]:
def load_data(file_path, chunk_size=1000, selected_columns=None):
    """
    Load data from a CSV file in chunks.

    Parameters:
    - file_path: str, path to the CSV file
    - chunk_size: int, number of rows per chunk
    - selected_columns: list, columns to select from the CSV

    Returns:
    - generator of DataFrames
    """
    # Read the CSV file in chunks

    if selected_columns is None:
        chunk_iter = pd.read_csv(file_path, chunksize=chunk_size)
    else:
        chunk_iter = pd.read_csv(file_path, usecols=selected_columns, chunksize=chunk_size)

    # Process each chunk
    for chunk in chunk_iter:
        yield chunk

In [62]:
chunk_data = False
chunk_size = 10000
num_chunks = 10
selected_columns = ['Vehicle_ID', 'Frame_ID', 'Total_Frames', 'Global_Time', 'Local_X', 'Local_Y', 'Global_X', 'Global_Y', 'v_length', 'v_Width', 'v_Class', 'v_Vel', 'v_Acc', 'Lane_ID', 'Section_ID', 'Direction', 'Movement', 'Preceding', 'Following', 'Space_Headway', 'Time_Headway', 'Location']

if chunk_data:
    # Load data in chunks
    trajectories_data = pd.DataFrame()
    cur_chunk = 0
    for chunk in load_data('../data/ngsim.csv', chunk_size=chunk_size, selected_columns=selected_columns):
        # Process the chunk (e.g., append to a list or DataFrame)
        trajectories_data = pd.concat([trajectories_data, chunk], ignore_index=True)

        cur_chunk += 1
        if cur_chunk >= num_chunks:
            break
else:
    trajectories_data = pd.read_csv('../data/ngsim.csv')  

In [63]:
trajectories_data.tail()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
11850521,1355,5622,938,1113438127100,18.275,1464.336,6042648.149,2134520.896,12.3,6.8,...,,,,,,1351,1361,26.91,15.92,i-80
11850522,1474,5688,401,1113438133700,5.509,560.412,6042767.84,2133624.549,17.8,7.9,...,,,,,,1468,1480,74.49,1.73,i-80
11850523,398,2368,654,1113437801700,18.903,635.841,6042771.741,2133701.076,15.2,8.5,...,,,,,,385,406,90.95,3.77,i-80
11850524,599,875,577,1113436854400,77.094,621.358,6042831.282,2133693.854,15.3,6.4,...,,,,,,0,611,0.0,0.0,i-80
11850525,2006,6461,879,1113437413000,73.816,695.035,6042818.859,2133766.666,14.8,6.9,...,,,,,,2009,2014,25.47,2.26,i-80


In [64]:
trajectories_data.shape

(11850526, 25)

In [247]:
# print all the columns
print(trajectories_data.columns)

Index(['Vehicle_ID', 'Frame_ID', 'Total_Frames', 'Global_Time', 'Local_X',
       'Local_Y', 'Global_X', 'Global_Y', 'v_length', 'v_Width', 'v_Class',
       'v_Vel', 'v_Acc', 'Lane_ID', 'O_Zone', 'D_Zone', 'Int_ID', 'Section_ID',
       'Direction', 'Movement', 'Preceding', 'Following', 'Space_Headway',
       'Time_Headway', 'Location'],
      dtype='object')


#### Choosing a Location: (Iterate over later)

In [248]:
locations = trajectories_data['Location'].unique()
locations

array(['us-101', 'i-80', 'lankershim', 'peachtree'], dtype=object)

In [249]:
chosen_location = 'us-101'

In [250]:
# Extract the data for the chosen location
us_101_data = trajectories_data[trajectories_data['Location'] == chosen_location]
us_101_data.head()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
0,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,6.9,...,,,,,,500,523,119.1,5.11,us-101
1,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,6.9,...,,,,,,500,523,119.1,5.11,us-101
3,2127,6459,567,1118847624800,19.632,1775.614,6452425.122,1872172.475,13.5,6.9,...,,,,,,2124,2132,48.92,1.3,us-101
4,1033,4827,592,1118848324700,6.202,1701.144,6452347.673,1872258.452,13.5,4.4,...,,,,,,1029,1040,38.81,0.92,us-101
5,1033,4827,592,1118848324700,6.202,1701.144,6452347.673,1872258.452,13.5,4.4,...,,,,,,1029,1040,38.81,0.92,us-101


#### For Each Location:

##### Drop duplicates:

In [251]:
us_101_data_unique = us_101_data.drop_duplicates(subset=['Vehicle_ID', 'Frame_ID'])
us_101_data_unique.shape

(3614404, 25)

##### Choose lanes


In [252]:
chosen_lanes = [3, 4, 5]

In [253]:
# Extract the data for the chosen lanes
us_101_data_lanes = us_101_data_unique[us_101_data_unique['Lane_ID'].isin(chosen_lanes)]
# us_101_data_lanes.head()
us_101_data_lanes.shape

(2055628, 25)

##### We have the dataset with -- One Location, 3 particular lanes, and unique combination of (vehicle_ID and frame_ID)

##### Sort via frame id:

In [254]:
us_101_data_lanes.head()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
0,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,6.9,...,,,,,,500,523,119.1,5.11,us-101
6,1890,9157,628,1118849672700,53.514,817.521,6451655.238,1872800.663,24.0,8.5,...,,,,,,1882,1897,102.65,2.27,us-101
7,744,3392,996,1118848181200,28.878,490.086,6451422.353,1873041.018,15.0,5.9,...,,,,,,740,752,37.8,1.54,us-101
15,496,2766,771,1118848118600,52.141,1656.454,6452284.002,1872253.037,15.0,4.9,...,,,,,,510,502,68.21,2.09,us-101
20,1779,5317,490,1118847510600,31.09,733.063,6451632.732,1872850.265,28.0,7.9,...,,,,,,1769,1784,189.54,3.8,us-101


In [255]:
# add index as a column
us_101_data_lanes = us_101_data_lanes.reset_index(drop=False)
us_101_data_lanes.head()

Unnamed: 0,index,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
0,0,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,...,,,,,,500,523,119.1,5.11,us-101
1,6,1890,9157,628,1118849672700,53.514,817.521,6451655.238,1872800.663,24.0,...,,,,,,1882,1897,102.65,2.27,us-101
2,7,744,3392,996,1118848181200,28.878,490.086,6451422.353,1873041.018,15.0,...,,,,,,740,752,37.8,1.54,us-101
3,15,496,2766,771,1118848118600,52.141,1656.454,6452284.002,1872253.037,15.0,...,,,,,,510,502,68.21,2.09,us-101
4,20,1779,5317,490,1118847510600,31.09,733.063,6451632.732,1872850.265,28.0,...,,,,,,1769,1784,189.54,3.8,us-101


<p>We need to check for vehicles which are in 70 continous frames</p>

<p>For this I was thinking, why not take 70 continous frames and then check which vehicles will be in those particular frames<p>

<p>We get to directly use those rows are they will be valid, we can the ones where we have at least one car in each of the chosen lanes<p>

<p> Then we shift to the vehicles and make a new dataset from each row, just pick from the original dataset the values with this car and this frame ID.... as they are already unique, this should give use the dataset we require<p>

<p>Let's try<p>

In [256]:
# choose vehicle ID and Frame ID from the data

us_101_data_vf = us_101_data_lanes[['Vehicle_ID', 'Frame_ID', 'index']]
us_101_data_vf.head()

Unnamed: 0,Vehicle_ID,Frame_ID,index
0,515,2330,0
1,1890,9157,6
2,744,3392,7
3,496,2766,15
4,1779,5317,20


###### Attempt 1:

In [257]:
# group by Frame ID
# us_101_data_grouped_by_frame = us_101_data_vf.groupby('Frame_ID').agg(list)
# us_101_data_grouped_by_frame['num_vehicles'] = us_101_data_grouped_by_frame['Vehicle_ID'].apply(lambda x: len(x))

us_101_data_grouped_by_frame = us_101_data_vf.groupby('Frame_ID').agg({
    'Vehicle_ID': lambda x: set(sorted(list(x))),
    'index': lambda x: x
}).reset_index()
us_101_data_grouped_by_frame['num_vehicles'] = us_101_data_grouped_by_frame['Vehicle_ID'].apply(len)
us_101_data_grouped_by_frame.head()


Unnamed: 0,Frame_ID,Vehicle_ID,index,num_vehicles
0,8,{5},8242843,1
1,9,{5},5805738,1
2,11,{5},459432,1
3,13,{5},1407728,1
4,16,{5},5740198,1


In [258]:
num_vehicles = 5

In [259]:
# Find rows where number of vehicles is qual to 5
us_101_data_grouped_by_frame = us_101_data_grouped_by_frame[us_101_data_grouped_by_frame['num_vehicles'] >= num_vehicles]
us_101_data_grouped_by_frame.head()

Unnamed: 0,Frame_ID,Vehicle_ID,index,num_vehicles
29,47,"{5, 8, 9, 14, 18}","[5627159, 5643551, 6955024, 7197319, 7294292]",5
32,50,"{5, 8, 9, 14, 18}","[2027655, 4590784, 4776750, 5360365, 7996501]",5
34,52,"{5, 8, 9, 14, 18, 21}","[1041275, 1048794, 1707937, 4973215, 7367604, ...",6
36,54,"{5, 8, 9, 18, 21}","[23481, 4709138, 5907206, 7476449, 8347983]",5
37,55,"{5, 8, 9, 18, 21}","[2349355, 3924112, 4273550, 7478071, 8436879]",5


In [260]:
num_frames = 70

In [261]:
frame_ids = us_101_data_grouped_by_frame['Frame_ID'].sort_values().reset_index(drop=True)
frame_ids 

0         47
1         50
2         52
3         54
4         55
        ... 
9876    9928
9877    9929
9878    9930
9879    9931
9880    9932
Name: Frame_ID, Length: 9881, dtype: int64

In [262]:
group = frame_ids - frame_ids.index
groups = frame_ids.groupby(group)


In [263]:
continuous_sequences = [g.tolist() for _, g in groups if len(g) >= num_frames]
len(continuous_sequences)

1

In [264]:
valid_frame_ids = [fid for seq in continuous_sequences for fid in seq]
us_101_data_grouped_by_frame_filtered = us_101_data_grouped_by_frame[us_101_data_grouped_by_frame['Frame_ID'].isin(valid_frame_ids)]
us_101_data_grouped_by_frame_filtered.head()

Unnamed: 0,Frame_ID,Vehicle_ID,index,num_vehicles
42,60,"{8, 9, 14, 18, 21}","[119289, 2581969, 3216933, 3236728, 7035608]",5
43,61,"{5, 8, 9, 18, 21}","[647716, 1748076, 2564133, 5315276, 7492481]",5
44,62,"{5, 8, 9, 18, 21}","[2549780, 2652866, 4030786, 7963650, 8350604]",5
45,63,"{5, 8, 9, 18, 21, 31}","[3231673, 4104096, 4292328, 5121890, 7818400, ...",6
46,64,"{8, 9, 18, 21, 31}","[1046846, 4012190, 4806580, 6238622, 6556731]",5


In [265]:
us_101_data_grouped_by_frame_filtered.shape

(9873, 4)

In [266]:
veh5frame70data = []

for start in range(len(us_101_data_grouped_by_frame_filtered) - num_frames + 1):
    window = us_101_data_grouped_by_frame_filtered.iloc[start:start + num_frames]

    sets = window['Vehicle_ID'].apply(set).tolist()
    if sets:
        common_ids = set.intersection(*sets)
        if len(common_ids) >= num_vehicles:
            index_union = set().union(*window['index'].tolist())
            veh5frame70data.append({
                'Frame_ID': window['Frame_ID'].tolist(),
                'intersections': common_ids,
                'index': list(index_union)
            })
        else:
            print(common_ids, 'discarding', start, 'to', start + num_frames)

{8, 18, 21} discarding 0 to 70
{8, 18, 21} discarding 1 to 71
{8, 18, 21} discarding 2 to 72
{8, 18, 21, 31} discarding 3 to 73
{8, 18, 21, 31} discarding 4 to 74
{8, 18, 21, 31} discarding 5 to 75
{8, 18, 21, 31} discarding 6 to 76
{8, 18, 21, 31} discarding 7 to 77
{8, 18, 21, 31} discarding 8 to 78
{8, 18, 21, 31} discarding 9 to 79


In [267]:
len(veh5frame70data)

9794

##### Choosing one index to go forward, later loop:

In [268]:
chosen_continuous_sequence_index = 0

In [272]:
chosen_continuous_sequence = veh5frame70data[chosen_continuous_sequence_index]
chosen_continuous_sequence['intersections']

{8, 18, 20, 21, 31}

In [273]:
us_101_data_lanes.head()

Unnamed: 0,index,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
0,0,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,...,,,,,,500,523,119.1,5.11,us-101
1,6,1890,9157,628,1118849672700,53.514,817.521,6451655.238,1872800.663,24.0,...,,,,,,1882,1897,102.65,2.27,us-101
2,7,744,3392,996,1118848181200,28.878,490.086,6451422.353,1873041.018,15.0,...,,,,,,740,752,37.8,1.54,us-101
3,15,496,2766,771,1118848118600,52.141,1656.454,6452284.002,1872253.037,15.0,...,,,,,,510,502,68.21,2.09,us-101
4,20,1779,5317,490,1118847510600,31.09,733.063,6451632.732,1872850.265,28.0,...,,,,,,1769,1784,189.54,3.8,us-101


In [None]:
# get values from us_101_data_lanes with the same index as the chosen_continuous_sequence['index']

us_101_data_lanes_filtered = us_101_data_lanes[us_101_data_lanes['index'].isin(chosen_continuous_sequence['index'])]
us_101_data_lanes_filtered = us_101_data_lanes_filtered[us_101_data_lanes_filtered['Vehicle_ID'].isin(list(chosen_continuous_sequence['intersections']))]
us_101_data_lanes_filtered = us_101_data_lanes_filtered[us_101_data_lanes_filtered['Frame_ID'].isin(chosen_continuous_sequence['Frame_ID'])]

In [279]:
us_101_data_lanes_filtered

Unnamed: 0,index,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
4301,16142,8,132,448,1118846992100,40.197,451.870,6451416.197,1873031.299,15.5,...,,,,,,5,21,58.87,1.68,us-101
19976,75424,31,83,465,1118846987200,52.845,108.243,6451160.099,1873265.959,15.5,...,,,,,,18,35,56.49,1.63,us-101
29059,109667,31,102,465,1118846989100,51.095,173.116,6451206.126,1873219.176,15.5,...,,,,,,18,35,57.45,1.74,us-101
29694,111985,8,117,448,1118846990600,40.901,395.478,6451374.253,1873068.997,15.5,...,,,,,,5,21,60.23,1.49,us-101
32543,122727,20,135,414,1118846992400,26.628,310.731,6451321.828,1873138.036,17.5,...,,,,,,9,47,130.53,2.93,us-101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2034080,8594954,31,123,465,1118846991200,52.319,246.480,6451257.751,1873164.238,15.5,...,,,,,,18,35,66.68,1.78,us-101
2037128,8610674,31,79,465,1118846986800,52.633,94.538,6451150.731,1873276.341,15.5,...,,,,,,18,35,56.22,1.73,us-101
2039564,8622894,18,96,291,1118846988500,50.955,209.563,6451231.418,1873192.949,14.5,...,,,,,,14,31,89.85,2.57,us-101
2042454,8637441,18,74,291,1118846986300,51.429,134.242,6451178.854,1873247.877,14.5,...,,,,,,14,31,81.43,2.57,us-101
