# Dataset Summary Statistics

In [1]:
# ruff: noqa: E402

%load_ext autoreload
%autoreload 2

# find the root of the project
import os
from pathlib import Path
import sys
import polars as pl
import dotenv


ROOT = Path(os.getcwd()).parent
while not ROOT.joinpath(".git").exists():
    ROOT = ROOT.parent

# add the root to the python path
sys.path.append(str(ROOT))


dotenv.load_dotenv(ROOT.joinpath(".env"))

from src.utils import check_gpu_available

GPU = check_gpu_available()
print(f"GPU available: {GPU}")

GPU available: True


## Read in the Trajectories


In [2]:
processed_traj_df = pl.scan_parquet(
    # Path(os.environ.get("RAW_DATA_DIR")).joinpath("*.parquet")
    ROOT
    / "data/merged_march.parquet"
).select(
    [
        "epoch_time",
        "lane",
        "lane_index",
        "s_velocity_smooth",
        "vehicle_id",
        "front_s_smooth",
        "association_distance",
        "length_s",
    ]
)

In [3]:
processed_traj_df.select(
    (pl.col("epoch_time").max() - pl.col("epoch_time").min()).dt.total_seconds() / 3600
).collect()

epoch_time
f64
26.743333


In [4]:
res = (
    pl.scan_parquet(ROOT / "data/merged_march.parquet")
    .filter((pl.col("object_id").list.len() > 1).any().over("vehicle_id"))
    .sort(pl.col("epoch_time"))
    .group_by("vehicle_id")
    .agg(
        (
            pl.col("centroid_x_smooth").diff() ** 2
            + pl.col("centroid_y_smooth").diff() ** 2
        )
        .sqrt()
        .sum()
        .alias("distance")
    )
    .select(pl.col("distance").mean())
    .collect(streaming=True, )
)

res

distance
f64
672.288466


In [5]:
res = (
    pl.scan_parquet(ROOT / "data/merged_march.parquet")
    .filter((pl.col("object_id").list.len() == 1).all().over("vehicle_id"))
    .sort(pl.col("epoch_time"))
    .group_by("vehicle_id")
    .agg(
        (
            pl.col("centroid_x_smooth").diff() ** 2
            + pl.col("centroid_y_smooth").diff() ** 2
        )
        .sqrt()
        .sum()
        .alias("distance")
    )
    .select(pl.col("distance").mean())
    .collect(streaming=True, )
)

res

distance
f64
170.78881


In [6]:
res = (
    pl.scan_parquet(ROOT / "data/merged_march.parquet")
    .filter((pl.col("object_id").list.len() > 1).any().over("vehicle_id"))
    .sort(pl.col("epoch_time"))
    .group_by("vehicle_id")
    .agg(
        (
            pl.col("centroid_x_smooth").diff() ** 2
            + pl.col("centroid_y_smooth").diff() ** 2
        )
        .sqrt()
        .sum()
        .alias("distance")
    )
    .select(pl.col("distance"))
    .collect(streaming=True, background=True)
)

In [7]:
res = (
    pl.scan_parquet(ROOT / "data/merged_march.parquet")
    # .filter((pl.col("object_id").list.len() > 1).any().over("vehicle_id"))
    .sort(pl.col("epoch_time"))
    .group_by("vehicle_id")
    .agg(
        (
            pl.col("centroid_x_smooth").diff() ** 2
            + pl.col("centroid_y_smooth").diff() ** 2
        )
        .sqrt()
        .sum()
        .alias("distance"),
        (pl.col("epoch_time").max() - pl.col("epoch_time").min())
        .dt.total_seconds()
        .alias("duration"),
        (pl.col("front_s_smooth") - pl.col("back_s_smooth")).max().alias("length"),
        pl.col("s_velocity_smooth").min().alias("min_vel"),
        pl.col("s_velocity_smooth").max().alias("max_vel"),
        pl.col("association_distance")
        .mean()
        .fill_null(0)
        .alias("association_distance"),
    )
    .filter(
        (pl.col("distance") < 1600)
        & (pl.col("min_vel") > -5)
        & (pl.col("max_vel") < 40)
    )
    # .select(pl.col("distance"))
    .collect(
        streaming=True,
    )
)

In [8]:
# from scipy.stats import chi2

# res.select(
#     ((pl.col("association_distance") <= chi2.ppf(0.99, 4)).sum() / pl.count()).alias(
#         "clean_association"
#     ),
#     ((pl.col("association_distance") > chi2.ppf(0.99, 4)).sum() / pl.count()).alias(
#         "bad_association"
#     ),
# )

In [9]:
res.describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

describe,vehicle_id,distance,duration,length,min_vel,max_vel,association_distance
str,f64,f64,f64,f64,f64,f64,f64
"""count""",79204.0,79204.0,79204.0,79204.0,79204.0,79204.0,79204.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",39976.908035,412.561129,174.071726,5.616789,9.363161,20.484005,
"""std""",23123.603091,361.368795,2683.007568,3.122576,7.34748,5.367138,
"""min""",0.0,0.0,0.0,0.0,-4.863656,0.946723,0.0
"""5%""",3989.0,34.52441,3.0,1.73368,-0.369933,9.953668,0.0
"""25%""",19970.0,132.719084,9.0,4.131753,3.123658,17.494608,0.0
"""50%""",39905.0,275.27994,18.0,5.122049,7.792594,21.324322,0.0
"""75%""",59977.0,608.276493,38.0,6.266292,15.458559,24.297579,2.955949
"""90%""",72096.0,1047.67536,59.0,8.519613,20.342579,26.492929,4.954889


In [10]:
res.select((pl.col("min_vel") < 0.5).sum() / pl.count())

min_vel
f64
0.114969


In [11]:
res.filter(pl.col("min_vel") < 0)

vehicle_id,distance,duration,length,min_vel,max_vel,association_distance
u64,f64,i64,f64,f32,f32,f32
10099,727.548825,61,6.806669,-0.251334,26.739601,2.897162
39193,924.753243,86,5.084647,-0.330173,22.026722,3.350018
35001,955.523061,94,5.771488,-0.373887,22.445753,275.133362
32640,608.259909,78,5.427303,-0.66965,23.610588,2.102478
12061,739.528323,72,8.186692,-0.747412,22.486504,3.528691
27793,601.429025,62,6.36541,-0.547155,23.361601,2.857253
27528,556.063843,80,5.31412,-0.326002,23.877031,3.11438
16128,541.3971,78,10.172734,-0.679661,25.045385,4.985929
45487,502.376939,100,11.123776,-0.729449,20.435953,9.966311
19138,592.786444,62,12.526717,-1.847045,28.760643,18.465469


In [12]:
# res.

In [13]:
res.select(
    (pl.col("length") >= 8).sum() / pl.col("length").count().alias("percent_truck"),
    ((pl.col("length") < 8).sum() / pl.col("length").count()).alias("percent_car"),
)

length,percent_car
f64,f64
0.119754,0.880246


In [14]:
res.select(
    (pl.col("length") >= 8).sum().alias("percent_truck"),
    (pl.col("length") < 8).sum().alias("percent_car"),
)

percent_truck,percent_car
u32,u32
9485,69719


In [15]:
res.select(
    (pl.col("duration").sum() / 3600),
    (pl.col("distance").sum() / 1000),
)

duration,distance
f64,f64
3829.771389,32676.491645


In [16]:
res["distance"].sum() / 1e3

32676.491644544167

In [17]:
processed_traj_df.select(
    pl.col("epoch_time")
    .min()
    .dt.convert_time_zone(time_zone="US/Central")
    .alias("min_epoch_time"),
    pl.col("epoch_time")
    .max()
    .dt.convert_time_zone(time_zone="US/Central")
    .alias("max_epoch_time"),
).collect().transpose()

column_0
"datetime[ms, US/Central]"
2023-03-12 16:41:27.300 CDT
2023-03-13 19:26:04.200 CDT


In [18]:
processed_traj_df.select(pl.col("vehicle_id").n_unique()).collect()

vehicle_id
u32
80185


In [19]:
# pl.scan_parquet(ROOT / "data/merged_ma/rch.parquet").columns

In [20]:
pl.scan_parquet(ROOT / "data/merged_march.parquet").select(pl.col("object_id")).explode(
    "object_id"
).approx_n_unique().collect()

object_id
u32
211106


In [21]:
211062 / 80185

2.6321880650994576

In [22]:
# pl.scan_parquet(
#     # Path(os.environ.get("RAW_DATA_DIR")).joinpath("*.parquet")
#     ROOT / "data/merged_march.parquet"
# ).columns

In [3]:
lf_df = (
    processed_traj_df.lazy()
    .select(
        [
            "epoch_time",
            "lane",
            "lane_index",
            "front_s_smooth",
            "vehicle_id",
            "s_velocity_smooth",
            "length_s",
        ]
    )
    .with_row_count(name="row_count")
    .sort(["epoch_time", "front_s_smooth"])
    .with_columns(
        pl.col(["vehicle_id", "front_s_smooth", "length_s"])
        .shift(-1)
        .over(["epoch_time", "lane", "lane_index"])
        .name.map(lambda x: f"{x}_leader")
    )
    .filter(pl.col("vehicle_id_leader").is_not_null())
    .with_columns(
        ((pl.col("epoch_time").diff().dt.total_milliseconds() / 1000) > 0.1)
        .cum_sum()
        .over(["vehicle_id", "vehicle_id_leader", "lane", "lane_index"])
        .alias("other_leader"),
        pl.col("s_velocity_smooth")
        .rolling_mean(
            window_size=int(3 / 0.1),
            center=True,
            min_periods=1,
        )
        .over(
            [
                "vehicle_id",
                "lane",
            ]
        )
        .alias("s_velocity_smooth_rolling"),
        (
            pl.col("front_s_smooth_leader")
            - pl.col("length_s_leader")
            - pl.col("front_s_smooth")
        ).alias("dist_diff"),
        (
            (pl.col("front_s_smooth_leader") - pl.col("front_s_smooth"))
            / pl.col("s_velocity_smooth")
        ).alias("time_headway"),
        (pl.col("epoch_time").diff().dt.total_milliseconds() / 1000)
        .over(["vehicle_id", "lane"])
        .alias("time_diff"),
    )
    .with_columns(
        (pl.col("s_velocity_smooth_rolling").diff() / pl.col("time_diff"))
        .over(["vehicle_id", "lane"])
        .alias("acceleration")
    )
    .group_by(["lane", "lane_index", "vehicle_id", "vehicle_id_leader", "other_leader"])
    .agg(
        (pl.col("epoch_time").diff().dt.total_milliseconds() / 1000)
        .sum()
        .alias("time_diff"),
        pl.col("dist_diff").mean().alias("dist_diff"),
        pl.col("dist_diff").min().alias("dist_diff_min"),
        pl.col("dist_diff").max().alias("dist_diff_max"),
        pl.col("time_headway").mean().alias("time_headway"),
        pl.col("time_headway").min().alias("time_headway_min"),
        pl.col("time_headway").max().alias("time_headway_max"),
        pl.col("row_count").min().alias("start_index"),
        pl.col("row_count").max().alias("end_index"),
        pl.col("acceleration").min().alias("min_acceleration"),
        pl.col("acceleration").max().alias("max_acceleration"),
        pl.col("s_velocity_smooth").min().alias("min_velocity"),
        pl.col("s_velocity_smooth").max().alias("max_velocity"),
        pl.col("epoch_time").min().alias("epoch_time_min"),
        pl.col("epoch_time").max().alias("epoch_time_max"),
        pl.col("acceleration")
        .filter(pl.col("time_headway") > 5)
        .max()
        .alias("free_accel"),
        (
            ((pl.col("time_headway") > 5) & (pl.col("s_velocity_smooth") > 20)).sum()
            / 10
        ).alias("cruising"),
    )
    .filter(
        pl.col("vehicle_id_leader").is_not_null()
        & (pl.col("dist_diff_min") > 3)
        & pl.col("other_leader").is_not_null()
        & (pl.col("time_headway_min") > 0.3)
    )
    .collect()
)

In [4]:
lf_df.shape

(193473, 22)

### Create a DataFrame of Association Likelihood Scores

In [5]:
assoc_df = (
    processed_traj_df.lazy()
    .group_by(["vehicle_id"])
    .agg(
        pl.col("association_distance").mean().fill_null(0).alias("association_distance")
    )
    .collect(streaming=True)
)

assoc_df.head()

vehicle_id,association_distance
u64,f32
15866,0.0
34477,0.0
57021,0.0
61874,0.0
60430,0.0


### Identify Good Tra 

In [6]:
lf_df = lf_df.join(assoc_df, on="vehicle_id", how="left").join(
    assoc_df.rename({"vehicle_id": "vehicle_id_leader"}),
    on="vehicle_id_leader",
    how="left",
    suffix="_leader",
)

In [7]:
lf_df.shape

(193473, 24)

### Number of Leader-Follower Pairs in Image

In [10]:
tl = (
    pl.lit("2023-03-13 06:50:00")
    .str.strptime(
        pl.Datetime(time_unit="ns", time_zone="US/Central"),
    )
    .dt.convert_time_zone("UTC")
)
th = (
    pl.lit("2023-03-13 07:00:00")
    .str.strptime(
        pl.Datetime(time_unit="ns", time_zone="US/Central"),
    )
    .dt.convert_time_zone("UTC")
)


lf_df.filter(
    (pl.col("lane") == "EBL1")
    & (pl.col("lane_index") == 0)
    & pl.col("epoch_time_min").is_between(tl, th)
    & (pl.col("time_diff") > 10)
    & pl.col("time_headway_min").is_between(0.5, 5)
)

lane,lane_index,vehicle_id,vehicle_id_leader,other_leader,time_diff,dist_diff,dist_diff_min,dist_diff_max,time_headway,time_headway_min,time_headway_max,start_index,end_index,min_acceleration,max_acceleration,min_velocity,max_velocity,epoch_time_min,epoch_time_max,free_accel,cruising,association_distance,association_distance_leader
str,u16,u64,u64,u32,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f32,f32,"datetime[ms, UTC]","datetime[ms, UTC]",f64,f64,f32,f32
"""EBL1""",0,13576,13572,0,53.1,56.697943,41.005933,69.921246,2.934773,2.12695,3.619201,188170,20405986,-1.639919,1.09293,18.486311,24.159533,2023-03-13 11:59:48.400 UTC,2023-03-13 12:00:41.500 UTC,,0.0,2.472676,3.24241
"""EBL1""",0,13531,13535,0,13.2,82.693118,67.46608,93.426004,3.882856,3.211779,4.473044,3561851,8172176,-0.34996,0.153828,21.891325,22.916315,2023-03-13 11:58:35.500 UTC,2023-03-13 11:58:48.700 UTC,,0.0,2.226898,1.946663
"""EBL1""",0,13326,13318,0,32.7,104.217446,81.901858,177.342597,5.328666,4.048843,7.17582,183946,12248529,-0.981026,0.803146,16.581247,25.47538,2023-03-13 11:52:06.800 UTC,2023-03-13 11:52:39.500 UTC,0.803146,7.1,3.494298,2.722257
"""EBL1""",0,13302,13330,0,21.1,69.498395,55.54223,113.749581,3.794207,3.161084,5.120674,8360422,15995244,-1.439781,1.055908,16.618658,23.277256,2023-03-13 11:52:12.200 UTC,2023-03-13 11:52:33.300 UTC,0.010891,0.2,3.981827,2.625889
"""EBL1""",0,13430,13440,0,12.8,74.750419,27.329483,117.582658,5.239998,4.82253,5.655293,13060558,16372240,-1.699142,0.103855,6.960227,22.058346,2023-03-13 11:56:26.100 UTC,2023-03-13 11:56:38.900 UTC,0.103855,3.7,2.409041,2.257096
"""EBL1""",0,13446,13430,1,14.7,20.151891,5.592392,63.148547,4.605849,1.983233,6.905156,15410183,17372670,-1.457796,1.610265,1.780016,12.075953,2023-03-13 11:56:45.700 UTC,2023-03-13 11:57:00.400 UTC,0.565295,0.0,2.425365,2.409041
"""EBL1""",0,13535,13528,0,13.2,25.017902,17.735633,34.929658,1.160927,0.812848,1.61968,4682121,11040381,-0.5616,0.532246,22.299212,26.039637,2023-03-13 11:58:35.500 UTC,2023-03-13 11:58:48.700 UTC,,0.0,1.946663,4.312599
"""EBL1""",0,13378,13376,0,68.0,50.232137,9.190963,103.989548,30.036087,4.379134,297.835491,4540617,15622463,-1.835413,2.066784,0.055265,20.880713,2023-03-13 11:53:45.300 UTC,2023-03-13 11:54:53.300 UTC,2.066784,7.2,4.241367,2.859252
"""EBL1""",0,13452,13446,0,39.3,64.100546,20.51271,136.166089,4.996953,1.608105,13.026326,11543895,20502988,-0.996552,1.741066,7.811153,18.780396,2023-03-13 11:56:37.200 UTC,2023-03-13 11:57:16.500 UTC,1.741066,0.0,2.192642,2.425365
"""EBL1""",0,13396,13384,0,36.5,27.816594,18.440071,49.304898,1.713497,1.207302,3.558036,4887589,17841606,-1.108074,1.003361,15.015748,20.872761,2023-03-13 11:54:34.700 UTC,2023-03-13 11:55:11.200 UTC,,0.0,2.350451,3.364394


### All Calibrateable Trajectories

In [11]:
from scipy.stats import chi2

lf_df.filter(
    # (pl.col("time_diff") > 10)
    # & (pl.col("time_headway") < 15)
    (
        pl.col("time_headway_min").is_between(0.5, 5)
    )
    & (pl.col("association_distance") < chi2.ppf(0.999, 4))
    & (pl.col("association_distance_leader") < chi2.ppf(0.999, 4))
    & (pl.col("min_acceleration") < -0.2)
    & (pl.col("max_acceleration") > 0.2)
    & (pl.col("free_accel") > 0.2)
    & (pl.col('cruising') > 2)
    # cruising
    # & (pl.col("min_velocity") < 5)
    # & (pl.col("max_velocity") > 15)
)

lane,lane_index,vehicle_id,vehicle_id_leader,other_leader,time_diff,dist_diff,dist_diff_min,dist_diff_max,time_headway,time_headway_min,time_headway_max,start_index,end_index,min_acceleration,max_acceleration,min_velocity,max_velocity,epoch_time_min,epoch_time_max,free_accel,cruising,association_distance,association_distance_leader
str,u16,u64,u64,u32,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f32,f32,"datetime[ms, UTC]","datetime[ms, UTC]",f64,f64,f32,f32
"""WBL1""",0,1134,1080,1,23.0,100.235391,29.81752,161.419947,5.103114,2.250453,7.481436,12379694,20192263,-0.336761,1.280899,15.281343,22.683762,2023-03-12 22:10:11.600 UTC,2023-03-12 22:10:34.600 UTC,0.427532,12.3,4.604354,3.764969
"""WBL1""",1,1380,1370,0,24.9,102.75825,93.205347,114.349957,4.583499,3.918141,6.052227,7652660,18864488,-1.46122,1.408024,19.148617,25.709221,2023-03-12 22:17:45.600 UTC,2023-03-12 22:18:10.500 UTC,1.408024,5.1,2.220452,3.229852
"""WBL1""",0,1731,1724,0,30.0,76.317456,13.858784,105.410816,5.326592,4.543598,6.584079,776338,8809815,-1.68417,0.277214,2.986057,21.706232,2023-03-12 22:26:48.400 UTC,2023-03-12 22:27:18.400 UTC,0.277214,2.3,3.029629,2.162552
"""EBL1""",1,2421,2433,0,7.5,98.22375,68.925808,139.789953,5.305919,4.559159,7.337386,11170713,13373810,-1.516418,0.47802,15.7387,22.513,2023-03-12 22:45:34.200 UTC,2023-03-12 22:45:41.700 UTC,0.47802,2.2,2.147361,3.695714
"""WBL1""",0,3302,3257,0,14.4,129.377491,108.311167,144.875037,5.86583,4.372897,13.254208,11391831,17660288,-0.716629,4.029102,10.153414,28.410604,2023-03-12 23:10:27.800 UTC,2023-03-12 23:10:42.200 UTC,4.029102,6.8,2.315214,3.813095
"""EBL1""",1,3900,3891,0,29.6,78.556324,7.055241,282.531508,7.763328,3.427567,16.841736,4822131,10977993,-1.700783,1.798172,0.812778,23.923952,2023-03-12 23:29:07.700 UTC,2023-03-12 23:29:37.300 UTC,1.416731,6.0,2.958744,3.16544
"""WBL1""",0,6241,6236,1,11.7,105.693242,44.238776,149.305911,5.76559,3.963256,8.027363,4815619,8237409,-1.996469,0.885067,12.423812,21.404545,2023-03-13 00:52:01.300 UTC,2023-03-13 00:52:13 UTC,0.885067,4.0,2.654603,2.213043
"""EBL1""",1,6312,6326,0,17.1,81.125043,31.315604,161.426472,4.082997,2.140515,6.972827,11930138,17980301,-0.927067,1.574287,16.425121,24.251059,2023-03-13 00:55:07.100 UTC,2023-03-13 00:55:24.200 UTC,1.574287,4.6,1.884732,2.520603
"""WBL1""",0,8704,8701,0,43.2,114.501748,48.077526,191.130835,4.852655,2.05213,8.480266,282037,19854033,-1.986713,2.162971,21.790676,30.516462,2023-03-13 03:47:14.200 UTC,2023-03-13 03:47:57.400 UTC,0.665646,22.9,2.314481,3.057546
"""EBL1""",0,10073,10075,0,38.5,223.052189,15.323687,353.057115,12.306968,2.005872,33.387642,3932907,16754602,-1.593018,1.775331,10.307685,22.797905,2023-03-13 09:37:22.500 UTC,2023-03-13 09:38:01 UTC,1.775331,21.4,2.521836,2.137369
