In [2]:
import pandas as pd
import numpy as np

In [68]:
def load_data(file_path, chunk_size=1000, selected_columns=None):
    """
    Lazily stream a CSV file as a sequence of DataFrame chunks.

    Parameters:
    - file_path: str, location of the CSV file to read
    - chunk_size: int, maximum number of rows in each yielded chunk
    - selected_columns: list or None, columns to keep; None keeps every column

    Yields:
    - DataFrame holding the next block of at most `chunk_size` rows
    """
    # `usecols=None` is pandas' default, so a single call covers both the
    # "all columns" and the "selected columns" case.
    reader = pd.read_csv(file_path, usecols=selected_columns, chunksize=chunk_size)

    # Hand the chunks to the caller one at a time
    for frame in reader:
        yield frame

In [69]:
# Loading options
chunk_data = True        # True: keep only the first `num_chunks` chunks; False: read the whole file
chunk_size = 10000       # rows per chunk
num_chunks = 10          # how many chunks to keep when `chunk_data` is True
selected_columns = None  # None keeps every column

if chunk_data:
    # Gather the chunks in a list and concatenate once at the end. Calling
    # pd.concat inside the loop re-copies everything accumulated so far on
    # every pass, which makes the load quadratic in the number of chunks.
    # zip() asks range() for its next value before it asks the generator, so
    # no chunk beyond the first `num_chunks` is ever parsed.
    chunks = [
        chunk
        for _, chunk in zip(
            range(num_chunks),
            load_data('../data/ngsim.csv', chunk_size=chunk_size, selected_columns=selected_columns),
        )
    ]
    # pd.concat raises on an empty list, so fall back to an empty frame
    trajectories_data = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
else:
    trajectories_data = pd.read_csv('../data/ngsim.csv')

In [70]:
# Preview the last rows of the loaded frame (tail() shows five rows by default)
trajectories_data.tail()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
99995,1690,8344,404,1118849591400,63.296,1021.238,6451802.288,1872658.894,16.0,6.9,...,,,,,,0,1699,0.0,0.0,us-101
99996,1154,8620,807,1118936541900,-24.897,568.041,6452129.123,1873321.688,17.0,7.3,...,201.0,0.0,3.0,4.0,1.0,1140,1171,43.23,3.77,lankershim
99997,832,4568,891,1118849213800,6.579,1973.726,6452557.4,1872075.583,16.5,6.9,...,,,,,,826,837,84.58,2.4,us-101
99998,1729,8524,703,1118849609400,39.312,982.817,6451789.041,1872702.188,14.5,5.9,...,,,,,,1720,1734,77.62,2.28,us-101
99999,123,1465,1022,1118935826400,-6.823,51.919,6451920.969,1872850.072,16.6,7.6,...,201.0,1.0,0.0,4.0,1.0,0,165,0.0,0.0,lankershim


In [71]:
# Number of rows currently held in `trajectories_data`
len(trajectories_data)

100000

In [72]:
# Preview the first rows of the loaded frame (head() shows five rows by default)
trajectories_data.head()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
0,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,6.9,...,,,,,,500,523,119.1,5.11,us-101
1,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,6.9,...,,,,,,500,523,119.1,5.11,us-101
2,2224,6548,1902,1113437421700,41.429,472.901,6042814.264,2133542.012,14.3,6.9,...,,,,,,2208,2211,53.34,2.01,i-80
3,2127,6459,567,1118847624800,19.632,1775.614,6452425.122,1872172.475,13.5,6.9,...,,,,,,2124,2132,48.92,1.3,us-101
4,1033,4827,592,1118848324700,6.202,1701.144,6452347.673,1872258.452,13.5,4.4,...,,,,,,1029,1040,38.81,0.92,us-101


In [73]:
# List every column name of the loaded frame (prints the pandas Index repr)
print(trajectories_data.columns)


Index(['Vehicle_ID', 'Frame_ID', 'Total_Frames', 'Global_Time', 'Local_X',
       'Local_Y', 'Global_X', 'Global_Y', 'v_length', 'v_Width', 'v_Class',
       'v_Vel', 'v_Acc', 'Lane_ID', 'O_Zone', 'D_Zone', 'Int_ID', 'Section_ID',
       'Direction', 'Movement', 'Preceding', 'Following', 'Space_Headway',
       'Time_Headway', 'Location'],
      dtype='object')


In [12]:
# Reload the complete dataset; this replaces whatever `trajectories_data` held before
trajectories_data = pd.read_csv('../data/ngsim.csv')

# Distinct values of the Location column (stored for reference, not printed)
locations = trajectories_data['Location'].unique()

# Step 1: keep only the rows recorded at the us-101 location
us101_data = trajectories_data.loc[trajectories_data['Location'].eq('us-101')]
print(us101_data.shape[0])

# Step 2: when two rows share the same (Vehicle_ID, Frame_ID) pair, keep the first one
us101_data = us101_data.loc[~us101_data.duplicated(subset=['Vehicle_ID', 'Frame_ID'], keep='first')]
print(us101_data.shape[0])

# Step 3: keep only rows whose Lane_ID is 3, 4 or 5
us101_data = us101_data.loc[us101_data['Lane_ID'].isin([3, 4, 5])]
print(us101_data.shape[0])


4802933
3614404
2055628


In [13]:
# Order the us-101 rows by ascending Frame_ID
us101_data = us101_data.sort_values(by='Frame_ID')

In [14]:
# Narrow the data to one vehicle for experimentation.
# Note: this overwrites `us101_data`, which from here on holds that single vehicle only.
vehicle_id = 251  # Replace with the desired Vehicle_ID
us101_data = us101_data.loc[us101_data['Vehicle_ID'].eq(vehicle_id)]

# Optional preview of the filtered rows (disabled)
# print(vehicle_data.head())

In [16]:
# Difference in Global_Time between each row and the row before it (the first
# row has no predecessor, so it gets NaN).
# assign() returns a new frame instead of writing a column into `us101_data`,
# which was produced by boolean filtering; writing into such a frame triggers
# pandas' SettingWithCopyWarning. It also keeps the cell safe to re-run.
us101_data = us101_data.assign(Time_Difference=us101_data['Global_Time'].diff())

# Show the first 100 rows to verify
print(us101_data[['Frame_ID', 'Global_Time', 'Time_Difference']][:100])

         Frame_ID    Global_Time  Time_Difference
3689294      1177  1118848874700              NaN
769121       1178  1118848874800            100.0
7241583      1179  1118848874900            100.0
3180957      1180  1118848875000            100.0
2132112      1181  1118848875100            100.0
...           ...            ...              ...
8230781      1272  1118848884200            100.0
3153399      1273  1118848884300            100.0
7811916      1274  1118848884400            100.0
4053430      1275  1118848884500            100.0
2284116      1276  1118848884600            100.0

[100 rows x 3 columns]


In [45]:
# Keep only the rows recorded at the us-101 location
us101_data = trajectories_data.loc[trajectories_data['Location'].eq('us-101')]
print(us101_data.shape[0])

4802933


In [46]:
# When two rows share the same (Vehicle_ID, Frame_ID) pair, keep only the first one
repeated_pair = us101_data.duplicated(subset=['Vehicle_ID', 'Frame_ID'], keep='first')
us101_data = us101_data[~repeated_pair]
print(us101_data.shape[0])


3614404


In [47]:
# Keep only rows whose Lane_ID is 3, 4 or 5
us101_data = us101_data.loc[us101_data['Lane_ID'].isin([3, 4, 5])]
print(us101_data.shape[0])

2055628


In [48]:
# Group the us-101 rows by Frame_ID; this only builds the GroupBy object,
# nothing is aggregated here
us101_data_grouped = us101_data.groupby('Frame_ID')

In [49]:
# Column reference (copied from the earlier `trajectories_data.columns` printout):
#   Vehicle_ID, Frame_ID, Total_Frames, Global_Time, Local_X, Local_Y,
#   Global_X, Global_Y, v_length, v_Width, v_Class, v_Vel, v_Acc, Lane_ID,
#   O_Zone, D_Zone, Int_ID, Section_ID, Direction, Movement, Preceding,
#   Following, Space_Headway, Time_Headway, Location
#
# Count the missing values in the Preceding and Following columns
for neighbour_column in ('Preceding', 'Following'):
    print(us101_data[neighbour_column].isna().sum())



0
0


In [50]:
# Pull the rows of a single vehicle, ordered by ascending Frame_ID
v_id = 251
vehicle_data = (
    us101_data
    .loc[us101_data['Vehicle_ID'].eq(v_id)]
    .sort_values('Frame_ID')
)
# Show rows 10 through 19 (by position) of that ordered slice
print(vehicle_data.iloc[10:20])


         Vehicle_ID  Frame_ID  Total_Frames    Global_Time  Local_X  Local_Y  \
8437164         251      1187           912  1118848875700   38.684  104.963   
3260003         251      1188           912  1118848875800   38.645  107.754   
4846510         251      1189           912  1118848875900   38.615  110.549   
6572266         251      1190           912  1118848876000   38.583  113.348   
8628909         251      1191           912  1118848876100   38.552  116.149   
3898706         251      1192           912  1118848876200   38.522  118.967   
3917923         251      1193           912  1118848876300   38.490  121.770   
2465042         251      1194           912  1118848876400   38.459  124.455   
3548783         251      1195           912  1118848876500   38.428  127.024   
4055477         251      1196           912  1118848876600   38.445  129.589   

            Global_X     Global_Y  v_length  v_Width  ...  D_Zone  Int_ID  \
8437164  6451144.379  1873304.220      17.

In [51]:
# Print the distinct Lane_ID values present in the us-101 data
# (unique() returns them in order of first appearance, not sorted)
lane_ids = us101_data['Lane_ID'].unique()
print(lane_ids)

[3 5 4]


In [52]:
# Print the smallest value found in the v_Vel column of the us-101 data.
# NOTE(review): the name `directions` is misleading — this reads v_Vel, not the
# Direction column; confirm which of the two was intended.
directions = us101_data['v_Vel'].unique().min()
print(directions)

0.0


In [53]:
# filter the data for each frame id, considering only 6 vehicles within a range of 10 meters
# us101_data = us101_data[us101_data['Frame_ID'] == 0]
# # print the data
# print(us101_data)
# # now for each vehicle id, get the data for the next 10 frames
# us101_data = us101_data[us101_data['Vehicle_ID'].isin([1, 2, 3, 4, 5, 6])]
# # print the data
# print(us101_data)
# # now for each vehicle id, get the data for the next 10 frames
# us101_data = us101_data[us101_data['Frame_ID'].isin([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]


In [54]:
# Distinct Section_ID values across the full dataset (NaN is reported as a value too)
unique_section_ids = trajectories_data['Section_ID'].unique()
print(unique_section_ids)

[nan  3.  0.  2.  4.  5.  1.  6.]


In [55]:
# get the number of vehicles in one frame id
def get_num_vehicles_in_frame(frame_id, data=None):
    """
    Count the distinct vehicles that appear in a single frame.

    Parameters:
    - frame_id: value matched against the 'Frame_ID' column
    - data: DataFrame with 'Frame_ID' and 'Vehicle_ID' columns; when omitted,
      the notebook-level `us101_data` frame is used (the original behaviour)

    Returns:
    - int, number of distinct 'Vehicle_ID' values among the matching rows
      (0 when no row matches)
    """
    if data is None:
        # Fall back to the notebook global so existing one-argument calls still work
        data = us101_data
    return len(data.loc[data['Frame_ID'] == frame_id, 'Vehicle_ID'].unique())

# Number of distinct vehicles that `us101_data` contains for frame 2330
print(get_num_vehicles_in_frame(2330))

165


In [56]:
# filter the 

In [57]:
# calculate the average distance between vehicles in one frame id
def get_average_distance_between_vehicles_in_frame(frame_id, data=None):
    """
    Mean Euclidean distance over every unordered pair of vehicles in a frame.

    Only rows whose 'Section_ID' is missing are considered. If a vehicle has
    several rows in the frame, its first row supplies the coordinates.
    Distances are measured in the ('Global_X', 'Global_Y') plane.

    Parameters:
    - frame_id: value matched against the 'Frame_ID' column
    - data: DataFrame with 'Frame_ID', 'Section_ID', 'Vehicle_ID', 'Global_X'
      and 'Global_Y' columns; when omitted, the notebook-level `us101_data`
      frame is used (the original behaviour)

    Returns:
    - mean pairwise distance, or 0 when fewer than two vehicles qualify
    """
    if data is None:
        # Fall back to the notebook global so existing one-argument calls still work
        data = us101_data

    in_frame = data[(data['Frame_ID'] == frame_id) & data['Section_ID'].isna()]
    # One row per vehicle, keeping the first occurrence
    first_rows = in_frame.drop_duplicates(subset='Vehicle_ID', keep='first')
    coords = first_rows[['Global_X', 'Global_Y']].to_numpy(dtype=float)

    if len(coords) < 2:
        return 0

    # Index pairs (i, j) with i < j enumerate every unordered pair exactly once.
    # This replaces a nested Python loop that re-filtered the frame per pair.
    # Note: it materialises n*(n-1)/2 pairs at once.
    left, right = np.triu_indices(len(coords), k=1)
    deltas = coords[left] - coords[right]
    return np.sqrt((deltas ** 2).sum(axis=1)).mean()
# Mean pairwise vehicle distance for frame 2330
print(get_average_distance_between_vehicles_in_frame(2330))

716.5957648201978


In [58]:
# Count how many rows of the full dataset share one (Vehicle_ID, Frame_ID) pair
vehicle_id = 517  # Replace with the desired Vehicle_ID
frame_id = 2330  # Replace with the desired Frame_ID
vehicle_instances_count = len(
    trajectories_data.loc[
        trajectories_data['Frame_ID'].eq(frame_id)
        & trajectories_data['Vehicle_ID'].eq(vehicle_id)
    ]
)
print(f"Number of instances for Vehicle_ID {vehicle_id} in Frame_ID {frame_id}: {vehicle_instances_count}")

Number of instances for Vehicle_ID 517 in Frame_ID 2330: 3


In [59]:
# Distinct Vehicle_ID values in the frame selected by `frame_id` (set in an earlier cell)
unique_vehicles_count = trajectories_data.loc[
    trajectories_data['Frame_ID'].eq(frame_id), 'Vehicle_ID'
].nunique()
print(f"Number of unique vehicles in Frame_ID {frame_id}: {unique_vehicles_count}")

Number of unique vehicles in Frame_ID 2330: 654


In [60]:
# Show every row of the full dataset for one vehicle in one frame.
# NOTE(review): `trajectories_data` holds several Location values, so a single
# (Vehicle_ID, Frame_ID) pair may match rows from more than one location —
# confirm whether a Location filter is wanted here.
vehicle_id = 514
frame_id = 2330
vehicle_data = trajectories_data.loc[
    trajectories_data['Vehicle_ID'].eq(vehicle_id)
    & trajectories_data['Frame_ID'].eq(frame_id)
]
print(vehicle_data)


          Vehicle_ID  Frame_ID  Total_Frames    Global_Time  Local_X  \
107797           514      2330          1183  1118848075000    6.072   
107798           514      2330          1183  1118848075000    6.072   
10101867         514      2330           627  1113433367900   42.967   

           Local_Y     Global_X     Global_Y  v_length  v_Width  ...  D_Zone  \
107797     318.445  6451312.678  1873175.468      13.5      5.4  ...     NaN   
107798     318.445  6451312.678  1873175.468      13.5      5.4  ...     NaN   
10101867  1550.504  6042658.724  2134609.539      16.8      7.9  ...     NaN   

          Int_ID  Section_ID  Direction  Movement  Preceding  Following  \
107797       NaN         NaN        NaN       NaN        507        521   
107798       NaN         NaN        NaN       NaN        507        521   
10101867     NaN         NaN        NaN       NaN        517        522   

          Space_Headway  Time_Headway  Location  
107797            27.75          6.77  

In [61]:
# Count every row of the full dataset that carries the chosen Frame_ID
frame_id = 2330  # Replace with the desired Frame_ID
instances_count = len(trajectories_data.loc[trajectories_data['Frame_ID'].eq(frame_id)])
print(f"Number of instances for Frame_ID {frame_id}: {instances_count}")

Number of instances for Frame_ID 2330: 1312


In [62]:
# All rows of the full dataset with Vehicle_ID 1, ordered by ascending Frame_ID
vehicle_data = (
    trajectories_data
    .loc[trajectories_data['Vehicle_ID'].eq(1)]
    .sort_values('Frame_ID')
)
print(vehicle_data)
# Possible follow-up (disabled): keep one row per Frame_ID and count them
# vehicle_data = vehicle_data.drop_duplicates(subset='Frame_ID')
# print(len(vehicle_data))

          Vehicle_ID  Frame_ID  Total_Frames    Global_Time  Local_X  \
11526551           1        12           884  1113433136100   16.884   
11227535           1        13           884  1113433136200   16.938   
8914469            1        14           884  1113433136300   16.991   
10414622           1        15           884  1113433136400   17.045   
11206268           1        16           884  1113433136500   17.098   
...              ...       ...           ...            ...      ...   
10138773           1       891           884  1113433224000   15.980   
11054532           1       892           884  1113433224100   15.998   
10279610           1       893           884  1113433224200   16.021   
11715808           1       894           884  1113433224300   16.037   
10038743           1       895           884  1113433224400   16.055   

           Local_Y     Global_X     Global_Y  v_length  v_Width  ...  D_Zone  \
11526551    48.213  6042842.116  2133117.662      14.3 

In [63]:
# Number of distinct Frame_ID values across the full dataset
unique_frame_ids = trajectories_data['Frame_ID'].unique()
print(unique_frame_ids.size)

11691
