In [1]:
# ruff: noqa: E402

%load_ext autoreload
%autoreload 2

# find the root of the project
import os
from pathlib import Path
import sys
import polars as pl
import dotenv


def _find_project_root(start: Path, marker: str = ".git") -> Path:
    """Return the closest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory where the upward search begins.
        marker: Name of the file or directory that identifies the project root.

    Returns:
        The first directory, walking upwards from ``start``, that contains ``marker``.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its ancestors contains
            ``marker``. (A plain ``while`` loop on ``.parent`` would spin forever
            here, because the filesystem root is its own parent.)
    """
    for candidate in (start, *start.parents):
        if candidate.joinpath(marker).exists():
            return candidate
    raise FileNotFoundError(
        f"No '{marker}' entry found in '{start}' or any of its parent directories"
    )


# The search starts one level above the current working directory.
ROOT = _find_project_root(Path(os.getcwd()).parent)

# add the root to the python path (only once, so re-running the cell is harmless)
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))


# Load environment variables from the .env file at the project root.
dotenv.load_dotenv(dotenv_path=ROOT / ".env")

# Imported here rather than at the top of the cell because `src` only becomes
# importable once ROOT has been added to sys.path.
from src.utils import check_gpu_available

# Record GPU availability once and report it.
GPU = check_gpu_available()
print(f"GPU available: {GPU}")

GPU available: True


## Read in the Trajectories


In [2]:
# Columns of the merged trajectory file that the rest of the notebook uses.
_trajectory_columns = [
    "epoch_time",
    "lane",
    "lane_index",
    "s_velocity_smooth",
    "vehicle_id",
    "front_s_smooth",
    "association_distance",
    "length_s",
]

# Lazy scan: nothing is read until a downstream query calls `.collect()`.
# Alternative source (disabled): every "*.parquet" file under the directory
# named by the RAW_DATA_DIR environment variable.
processed_traj_df = pl.scan_parquet(ROOT / "data/merged_march.parquet").select(
    _trajectory_columns
)

In [3]:
# Mean travelled distance of vehicles that have at least one sample whose
# `object_id` list holds more than one entry.
_has_multi_object_sample = (
    (pl.col("object_id").list.len() > 1).any().over("vehicle_id")
)

# Euclidean length of the step between consecutive centroid positions.
_step_length = (
    pl.col("centroid_x_smooth").diff() ** 2
    + pl.col("centroid_y_smooth").diff() ** 2
).sqrt()

res = (
    pl.scan_parquet(ROOT / "data/merged_march.parquet")
    .filter(_has_multi_object_sample)
    # time-order the rows so that the per-vehicle diffs follow the trajectory
    .sort("epoch_time")
    .group_by("vehicle_id")
    .agg(_step_length.sum().alias("distance"))
    .select(pl.col("distance").mean())
    .collect(streaming=True, background=True)
)

# The query runs in the background; block until its result is available.
res.fetch_blocking()

distance
f64
626.112695


In [4]:
# Mean travelled distance of vehicles whose every sample has exactly one
# entry in its `object_id` list.
_only_single_object_samples = (
    (pl.col("object_id").list.len() == 1).all().over("vehicle_id")
)

# Euclidean length of the step between consecutive centroid positions.
_centroid_step = (
    pl.col("centroid_x_smooth").diff() ** 2
    + pl.col("centroid_y_smooth").diff() ** 2
).sqrt()

res = (
    pl.scan_parquet(ROOT / "data/merged_march.parquet")
    .filter(_only_single_object_samples)
    # time-order the rows so that the per-vehicle diffs follow the trajectory
    .sort("epoch_time")
    .group_by("vehicle_id")
    .agg(_centroid_step.sum().alias("distance"))
    .select(pl.col("distance").mean())
    .collect(streaming=True, background=True)
)

# The query runs in the background; block until its result is available.
res.fetch_blocking()

distance
f64
156.949462


In [5]:
# First and last sample timestamps, displayed in US/Central local time.
_epoch_time = pl.col("epoch_time")
_epoch_bounds = [
    _epoch_time.min()
    .dt.convert_time_zone(time_zone="US/Central")
    .alias("min_epoch_time"),
    _epoch_time.max()
    .dt.convert_time_zone(time_zone="US/Central")
    .alias("max_epoch_time"),
]

# Transposed so that the two timestamps are stacked vertically.
processed_traj_df.select(_epoch_bounds).collect().transpose()

column_0
"datetime[ms, US/Central]"
2023-03-12 16:41:27.300 CDT
2023-03-13 19:26:04.200 CDT


In [6]:
# Number of distinct vehicle ids in the trajectory data.
processed_traj_df.select(pl.n_unique("vehicle_id")).collect()

vehicle_id
u32
82725


In [7]:
# List the columns kept by the lazy trajectory query.
processed_traj_df.columns

['epoch_time',
 'lane',
 'lane_index',
 's_velocity_smooth',
 'vehicle_id',
 'front_s_smooth',
 'association_distance',
 'length_s']

In [8]:
# Approximate number of distinct object ids: each row stores a list of ids,
# so the lists are flattened to one id per row before counting.
_object_ids = pl.scan_parquet(ROOT / "data/merged_march.parquet").select("object_id")
_object_ids.explode("object_id").approx_n_unique().collect()

object_id
u32
211062


In [9]:
# Average number of object ids per vehicle id. Both counts are copied by hand
# from the outputs of earlier cells (the object-id count is approximate), so
# they must be refreshed if the underlying data changes.
_n_object_ids, _n_vehicle_ids = 211062, 82725
_n_object_ids / _n_vehicle_ids

2.551368993653672

In [10]:
# Disabled helper: list every column available in the merged parquet file.
# pl.scan_parquet(
#     # Path(os.environ.get("RAW_DATA_DIR")).joinpath("*.parquet")
#     ROOT / "data/merged_march.parquet"
# ).columns

In [18]:
# Build one row per leader-follower "segment": a stretch of consecutive samples
# in which the same follower has the same leader in the same lane. The stages are
#   1. pair every sample with the next vehicle ahead of it in its lane,
#   2. derive per-sample gap, headway, speed and acceleration features,
#   3. aggregate those features per segment and keep plausible segments.
lf_df = (
    processed_traj_df.lazy()
    .select(
        [
            "epoch_time",
            "lane",
            "lane_index",
            "front_s_smooth",
            "vehicle_id",
            "s_velocity_smooth",
            "length_s",
        ]
    )
    # The row number is assigned BEFORE sorting, so it refers to the row's
    # position in the scanned (unsorted) data.
    .with_row_count(name="row_count")
    .sort(["epoch_time", "front_s_smooth"])
    .with_columns(
        # Within one timestamp and lane, rows are ordered by `front_s_smooth`, so
        # shift(-1) fetches the values of the row with the next-larger position.
        # NOTE(review): this treats the next-larger `front_s_smooth` as the vehicle
        # ahead, i.e. assumes position increases in the travel direction -- confirm.
        pl.col(["vehicle_id", "front_s_smooth", "length_s"])
        .shift(-1)
        .over(["epoch_time", "lane", "lane_index"])
        .name.map(lambda x: f"{x}_leader")
    )
    # Rows without a following row in their window have no leader; drop them.
    .filter(pl.col("vehicle_id_leader").is_not_null())
    .with_columns(
        # Segment id: running count of time gaps larger than 0.1 s between
        # consecutive samples of the same (follower, leader, lane) combination.
        # NOTE(review): the first sample of each combination has a null diff and
        # therefore presumably a null segment id; such rows would form their own
        # group and be removed by the `other_leader` not-null filter at the end
        # -- confirm this is intended.
        ((pl.col("epoch_time").diff().dt.total_milliseconds() / 1000) > 0.1)
        .cum_sum()
        .over(["vehicle_id", "vehicle_id_leader", "lane", "lane_index"])
        .alias("other_leader"),
        # Centered rolling mean of the follower speed over int(5 / 0.1) = 50 samples
        # (presumably 5 s at a 0.1 s sampling period -- TODO confirm).
        pl.col("s_velocity_smooth")
        .rolling_mean(
            window_size=int(5 / 0.1),
            center=True,
            min_periods=1,
        )
        .over(
            [
                "vehicle_id",
                "lane",
            ]
        )
        .alias("s_velocity_smooth_rolling"),
        # Leader position minus leader length minus follower position
        # (presumably the gap from the leader's rear to the follower's front).
        (
            pl.col("front_s_smooth_leader")
            - pl.col("length_s_leader")
            - pl.col("front_s_smooth")
        ).alias("dist_diff"),
        # Front-to-front spacing divided by the follower's (unsmoothed) speed.
        # NOTE(review): there is no guard against a zero or negative speed here.
        (
            (pl.col("front_s_smooth_leader") - pl.col("front_s_smooth"))
            / pl.col("s_velocity_smooth")
        ).alias("time_headway"),
        # Seconds elapsed since the previous sample of the same vehicle and lane.
        (pl.col("epoch_time").diff().dt.total_milliseconds() / 1000)
        .over(["vehicle_id", "lane"])
        .alias("time_diff"),
    )
    .with_columns(
        # Change in rolling-mean speed divided by the elapsed time between samples.
        (pl.col("s_velocity_smooth_rolling").diff() / pl.col("time_diff"))
        .over(["vehicle_id", "lane"])
        .alias("acceleration")
    )
    .group_by(["lane", "lane_index", "vehicle_id", "vehicle_id_leader", "other_leader"])
    .agg(
        # Sum of the time differences between consecutive samples of the segment,
        # in seconds; this replaces the per-sample `time_diff` column.
        (pl.col("epoch_time").diff().dt.total_milliseconds() / 1000)
        .sum()
        .alias("time_diff"),
        pl.col("dist_diff").mean().alias("dist_diff"),
        pl.col("dist_diff").min().alias("dist_diff_min"),
        pl.col("dist_diff").max().alias("dist_diff_max"),
        pl.col("time_headway").mean().alias("time_headway"),
        pl.col("time_headway").min().alias("time_headway_min"),
        pl.col("time_headway").max().alias("time_headway_max"),
        # Smallest / largest pre-sort row number among the segment's samples.
        pl.col("row_count").min().alias("start_index"),
        pl.col("row_count").max().alias("end_index"),
        pl.col("acceleration").min().alias("min_acceleration"),
        pl.col("acceleration").max().alias("max_acceleration"),
        pl.col("s_velocity_smooth").min().alias("min_velocity"),
        pl.col("s_velocity_smooth").max().alias("max_velocity"),
        pl.col('epoch_time').min().alias('epoch_time_min'),
        pl.col('epoch_time').max().alias('epoch_time_max'),
    )
    # Keep segments that have a leader and a segment id, whose smallest gap
    # exceeds 5 and whose smallest time headway exceeds 0.3.
    .filter(
        pl.col("vehicle_id_leader").is_not_null()
        & (pl.col("dist_diff_min") > 5)
        & pl.col("other_leader").is_not_null()
        & (pl.col('time_headway_min') > 0.3)
    )
    .collect(streaming=True)
)

In [12]:
# (rows, columns) of the aggregated leader-follower table.
lf_df.shape

(203886, 18)

### Create a DataFrame of Association Likelihood Scores

In [13]:
# Mean association distance per vehicle; a vehicle whose mean is null
# (no non-null association distances) is given 0.
_mean_association_distance = (
    pl.col("association_distance").mean().fill_null(0).alias("association_distance")
)

_assoc_query = processed_traj_df.lazy().group_by("vehicle_id").agg(
    _mean_association_distance
)
assoc_df = _assoc_query.collect(streaming=True)

assoc_df.head()

vehicle_id,association_distance
u64,f32
34742,2.966619
67376,0.0
3016,0.0
62788,0.0
52433,6.847196


### Identify Good Trajectories

In [14]:
# Attach the mean association distance of the follower (`association_distance`)
# and of the leader (`association_distance_leader`) to every leader-follower row.
#
# The cell reassigns `lf_df`, so it must be safe to run more than once: any
# association columns left over from a previous run are dropped first. Without
# this, a second run would produce suffixed duplicates or a duplicate-column error.
_association_columns = ["association_distance", "association_distance_leader"]

lf_df = (
    lf_df.drop([name for name in _association_columns if name in lf_df.columns])
    .join(assoc_df, on="vehicle_id", how="left")
    .join(
        # Rename explicitly instead of relying on the join suffix, so the leader
        # column gets its name regardless of which columns the left side has.
        assoc_df.rename(
            {
                "vehicle_id": "vehicle_id_leader",
                "association_distance": "association_distance_leader",
            }
        ),
        on="vehicle_id_leader",
        how="left",
    )
)

In [15]:
# (rows, columns) of the leader-follower table after joining the association distances.
lf_df.shape

(203886, 20)

### Number of Leader-Follower Pairs in Image

In [21]:
def _central_to_utc(timestamp: str) -> pl.Expr:
    """Literal expression for a US/Central wall-clock timestamp, converted to UTC."""
    return (
        pl.lit(timestamp)
        .str.strptime(pl.Datetime(time_unit="ns", time_zone="US/Central"))
        .dt.convert_time_zone("UTC")
    )


# Lower and upper bound of the time window of interest.
tl = _central_to_utc("2023-03-13 06:50:00")
th = _central_to_utc("2023-03-13 07:00:00")

# Segments in lane "EBL1" with lane index 0 that start inside the window,
# last longer than 10 s and have a minimum time headway between 0.5 and 5.
_on_target_lane = (pl.col('lane') == "EBL1") & (pl.col('lane_index') == 0)
_starts_in_window = pl.col('epoch_time_min').is_between(tl, th)
_long_enough = pl.col("time_diff") > 10
_plausible_headway = pl.col("time_headway_min").is_between(0.5, 5)

lf_df.filter(
    _on_target_lane & _starts_in_window & _long_enough & _plausible_headway
)

lane,lane_index,vehicle_id,vehicle_id_leader,other_leader,time_diff,dist_diff,dist_diff_min,dist_diff_max,time_headway,time_headway_min,time_headway_max,start_index,end_index,min_acceleration,max_acceleration,min_velocity,max_velocity,epoch_time_min,epoch_time_max
str,u16,u64,u64,u32,f64,f64,f64,f64,f64,f64,f64,u32,u32,f64,f64,f32,f32,"datetime[ms, UTC]","datetime[ms, UTC]"
"""EBL1""",0,15232,15246,0,17.1,61.595212,29.484831,132.686659,3.289776,2.02894,6.022152,11199124,16313594,-1.122456,0.101242,13.084476,22.393555,2023-03-13 11:55:25.900 UTC,2023-03-13 11:55:43 UTC
"""EBL1""",0,15138,15128,0,53.2,109.378927,82.1475,177.173132,5.739074,3.863377,7.547487,183339,19084528,-0.806789,0.890732,15.282641,25.58284,2023-03-13 11:52:06.800 UTC,2023-03-13 11:53:00 UTC
"""EBL1""",0,15420,15416,0,53.1,57.413766,45.275952,70.104978,2.959576,2.341868,3.619779,186276,20420710,-0.788918,0.642853,18.387064,24.716419,2023-03-13 11:59:48.400 UTC,2023-03-13 12:00:41.500 UTC
"""EBL1""",0,15178,15187,0,13.0,37.121525,31.741635,40.064406,2.15896,1.677611,2.579108,31514,2845155,-0.911999,0.32093,19.496853,24.0644,2023-03-13 11:53:19.300 UTC,2023-03-13 11:53:32.300 UTC
"""EBL1""",0,15199,15227,0,13.6,33.366658,8.605306,99.298825,2.731712,1.554732,6.032595,14006828,16749634,-1.518183,0.082169,6.824132,16.803793,2023-03-13 11:54:39.700 UTC,2023-03-13 11:54:53.300 UTC
"""EBL1""",0,15215,15218,0,13.2,22.593125,15.609777,36.187603,2.054622,1.606559,2.757128,17238797,20166192,0.080566,1.796026,6.762259,16.475994,2023-03-13 11:54:38.600 UTC,2023-03-13 11:54:51.800 UTC
"""EBL1""",0,15280,15273,0,26.0,84.318722,19.595995,136.14923,6.548182,2.523157,14.068292,11523400,17289463,-0.981827,1.453094,7.420079,16.582457,2023-03-13 11:56:37 UTC,2023-03-13 11:57:03 UTC
"""EBL1""",0,57872,78938,1,13.8,42.956786,23.957619,59.81746,2.993469,2.759195,3.271158,18359323,20921916,0.026531,1.540308,8.728446,19.929274,2023-03-13 11:51:37.500 UTC,2023-03-13 11:51:51.300 UTC
"""EBL1""",0,15170,15179,0,15.4,125.88519,60.150585,195.540517,5.8524,2.793365,8.647518,9063789,15375274,-0.316544,0.331039,21.263809,23.349247,2023-03-13 11:53:21.100 UTC,2023-03-13 11:53:36.500 UTC
"""EBL1""",0,15213,15219,0,36.5,53.02888,32.183113,60.393001,2.978042,2.161918,3.365443,4286186,16647422,-0.575314,1.074314,12.886529,23.048544,2023-03-13 11:54:34.700 UTC,2023-03-13 11:55:11.200 UTC


### All Calibratable Trajectories

In [16]:
from scipy.stats import chi2

# Upper bound on the mean association distance: the 95th percentile of a
# chi-square distribution with 4 degrees of freedom.
_association_cutoff = chi2.ppf(0.95, 4)

# Active criteria: segment longer than 10 s, minimum time headway between
# 0.5 and 5, and both follower and leader below the association cutoff.
# Criteria tried earlier and currently disabled: mean time_headway < 15,
# dist_diff_min between 1 and 10 (as an alternative to the headway bound),
# min_acceleration < -0.5, max_acceleration > 0.5, min_velocity < 5 and
# max_velocity > 15.
_is_calibratable = (
    (pl.col("time_diff") > 10)
    & pl.col("time_headway_min").is_between(0.5, 5)
    & (pl.col("association_distance") < _association_cutoff)
    & (pl.col("association_distance_leader") < _association_cutoff)
)

lf_df.filter(_is_calibratable).shape

(28906, 20)