# Find Lanes


In [None]:
import os
import sys
from pathlib import Path
import polars as pl
import pandas as pd
import plotly.graph_objects as go
import utm

# Locate the repository root: start at the parent of the working directory and
# walk upwards until a directory containing ".git" is found.
repo_root = Path(os.getcwd()).parent

while not (repo_root / ".git").exists():
    if repo_root.parent == repo_root:
        # Reached the filesystem root without finding a repository; without
        # this guard the loop would never end because root.parent is root.
        raise FileNotFoundError(
            f"No '.git' directory found above {Path(os.getcwd())}"
        )
    repo_root = repo_root.parent

# Make the repository importable (e.g. ``src.filtering``) without adding a
# duplicate entry each time the cell is re-run.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

# set the pandas plotting backend to plotly
pd.options.plotting.backend = "plotly"


## Load Radar Data


In [None]:
## Read in the Data
from src.filtering import Filtering


# Read every parquet file matching the glob and materialise the result as a
# single polars DataFrame.
# NOTE(review): this is an absolute path on one developer's machine; it has to
# be edited (or made configurable) before the cell runs anywhere else.
df = pl.scan_parquet(
    "/Users/max/Library/CloudStorage/Box-Box/Radar-Data/new_format/167865*.parquet"
).collect()


# create the file paths
network_outline_file = repo_root / "geo_data" / "network_outline.geojson"
radar_locations_file = repo_root / "geo_data" / "radar_origins.json"

# Project helper whose methods are piped over the frame below.
f = Filtering(
    radar_location_path=radar_locations_file,
    network_boundary_path=network_outline_file,
)

# Each step receives the frame and returns a new one; the steps run in the
# order they are written.
df = (
    df.pipe(f.create_object_id)
    # resample to 10 Hz
    # NOTE(review): presumably the 100 is a period in milliseconds — confirm
    # against Filtering.resample.
    .pipe(f.resample, 100)
    # remove objects that travel for very little time
    .pipe(f.filter_short_trajectories, minimum_distance_m=100, minimum_duration_s=5)
    # clip trajectories to not include the constant speed at the end
    .pipe(f.clip_trajectory_end)
    # rotate the heading measurements to world coordinates
    .pipe(f.rotate_heading)
    # update the centroid coordinates to the actual center of the object
    .pipe(
        f.correct_center,
        x_col="utm_x",
        y_col="utm_y",
    )
    # convert the h3 integer to a string
    .pipe(f.int_h3_2_str)
    # filter out objects that are not in the network
    .pipe(f.filter_network_boundaries)
)


### Just Look at One TL


In [None]:
# Resolution used by ``f.radar_to_h3`` when assigning H3 cells.
f.h3_resolution = 14

# Keep the rows reported by one radar (selected by IP address) and pass them
# through the project's H3 tagging step.
_single_radar = pl.col("ip") == "10.160.7.137"
interest_df = f.radar_to_h3(df.filter(_single_radar))


#### Plot the x/y data in a scatter plot


In [None]:
# Keep only detections that fall in H3 cells visited by more than 10 distinct
# objects.
_objects_per_cell = pl.col("object_id").n_unique().over("h3").alias("n_objects")
_df = interest_df.with_columns([_objects_per_cell]).filter(pl.col("n_objects") > 10)

fig = go.Figure()
fig.add_trace(go.Scatter(x=_df["utm_x"], y=_df["utm_y"], mode="markers"))

# One layout update: equal axis scaling, white background, hidden axes and
# grid lines, and no margins so the points fill the whole image.
_hidden_axis = dict(showgrid=False, zeroline=False, visible=False)
fig.update_layout(
    autosize=False,
    plot_bgcolor="white",
    xaxis=dict(scaleanchor="y", scaleratio=1, **_hidden_axis),
    yaxis=dict(**_hidden_axis),
    margin=dict(l=0, r=0, b=0, t=0, pad=0),
)

# write to a huge figure
fig.write_image(repo_root / "lane-finder.png", width=2400, height=2400)


In [None]:
interest_df["epoch_time"].max() - interest_df["epoch_time"].min()


### Build a Grid Distance Function


In [None]:
import numpy as np
import numba as nb
import h3


def gen_cust_dist_func(parallel=True, cell_distance=None):
    """Build a function that compares rows of two arrays cell by cell.

    Parameters
    ----------
    parallel : bool, optional
        Currently ignored; kept so existing callers keep working.
        NOTE(review): an earlier ``numba.njit`` variant was disabled here,
        presumably because the h3 call cannot be compiled in nopython mode —
        confirm before re-enabling it.
    cell_distance : callable, optional
        ``cell_distance(a, b)`` returning the distance between two cells.
        Defaults to ``h3.h3_distance``.

    Returns
    -------
    callable
        ``cust_dot_T(A, B)`` taking two 2-D arrays with the same number of
        columns and returning a float array of shape
        ``(A.shape[0], B.shape[0])``. Entry ``[i, j]`` is the mean of
        ``cell_distance(A[i, k], B[j, k])`` over all columns ``k``.
    """
    if cell_distance is None:
        cell_distance = h3.h3_distance

    def cust_dot_T(A, B):
        if A.shape[1] != B.shape[1]:
            raise ValueError(
                f"A and B must have the same number of columns, "
                f"got {A.shape[1]} and {B.shape[1]}"
            )

        n_steps = A.shape[1]
        # Always a float result: the inputs typically hold H3 strings, so
        # reusing their dtype would store the distances as text.
        out = np.zeros((A.shape[0], B.shape[0]), dtype=np.float64)
        if n_steps == 0:
            return out

        for i in range(A.shape[0]):
            for j in range(B.shape[0]):
                total = 0
                for k in range(n_steps):
                    total += cell_distance(A[i, k], B[j, k])
                # mean over the steps (np.mean of the scalar sum was a no-op)
                out[i, j] = total / n_steps
        return out

    return cust_dot_T


cust_dist_func = gen_cust_dist_func(parallel=True)


In [None]:
from scipy.spatial.distance import pdist, squareform, cdist


def get_h3_distance_matrix(h3_list, distance_func=None):
    """Calculate the pairwise distances between all h3 indices in a list

    Parameters
    ----------
    h3_list : list
        list of h3 indices; every entry is converted with ``str`` before
        being handed to the distance function
    distance_func : callable, optional
        ``distance_func(a, b)`` returning the distance between two cells.
        Defaults to ``h3.h3_distance``.

    Returns
    -------
    np.ndarray
        condensed distance vector of length ``n * (n - 1) / 2`` in the same
        pair order as ``scipy.spatial.distance.pdist``
        (``(0, 1), (0, 2), ..., (1, 2), ...``); pass it to ``squareform``
        to obtain the square matrix
    """
    from itertools import combinations

    if distance_func is None:
        distance_func = h3.h3_distance

    cells = [str(cell) for cell in h3_list]
    n_pairs = len(cells) * (len(cells) - 1) // 2

    # The previous implementation passed the array to pdist twice, which made
    # the second copy collide with the ``metric`` argument and raise.
    return np.fromiter(
        (distance_func(a, b) for a, b in combinations(cells, 2)),
        dtype=np.float64,
        count=n_pairs,
    )


In [None]:
# res = get_h3_distance_matrix(h3s)


#### Compress Each Trajectory into its H3 representation


New idea:

- Compress each trajectory into h3 representation
- Then each trajectory is a sequence of 5 h3 steps
- Use the center coordinates of the h3 cells to cluster the trajectories (using Euclidean distance)
- The distance can be precomputed using h3 center -> UTM x,y -> pairwise distance matrix
-


#### Create a Unique H3 -> X,Y mapping & Then a Distance Matrix


In [None]:
import utm
import h3
import numpy as np

# Build a lookup table with one row per distinct H3 cell and the UTM
# coordinates of that cell's centre.
map_df = (
    interest_df.groupby(["object_id", "h3"])
    .first()
    .sort("epoch_time")
    # keep only the distinct cell ids
    .select([pl.col("h3").unique()])
    .with_columns(
        [
            # centre of the cell; the result is indexed as (lat, lon) below
            pl.col("h3")
            .apply(
                lambda x: h3.h3_to_geo(
                    x,
                ),
            )
            .alias("latlon"),
        ]
    )
    .with_columns(
        [
            pl.col("latlon").apply(lambda x: utm.from_latlon(x[0], x[1])).alias("utm"),
        ]
    )
    # keep the first two entries of the utm result
    # (presumably easting and northing — confirm against the utm package)
    .with_columns([pl.col("utm").apply(lambda x: list(x[:2])).alias("utm_coords")])
    .drop(["utm", "latlon"])
    # .to_pandas()
)


# one row per cell, in the same order as map_df["h3"]
point_array = np.array(map_df["utm_coords"].to_list())


In [None]:
# Labels for both axes: the H3 index of every row of ``point_array``.
h3s = map_df["h3"].to_list()

# Pairwise Euclidean distances between the cell centres, expanded from the
# condensed vector to a square matrix.
dist = squareform(pdist(point_array, metric="euclidean"))

# Same matrix with the H3 indices attached to rows and columns.
dist_df = pd.DataFrame(dist, index=h3s, columns=h3s)


In [None]:
# Number of consecutive rows that make up one sub-trajectory.
sub_sequence_length = 10

# Cut every object's track into chunks of ``sub_sequence_length`` rows, keep
# the chunks (as "-"-joined H3 strings) that occur more than once, and attach
# the cell ids and cell-centre coordinates of every step.
sub_trajectories = (
    interest_df
    #  create a running count of each vehicle index
    .with_columns([pl.col("h3").cumcount().over("object_id").alias("vehicle_index")])
    # integer division puts rows 0..9 in chunk 0, rows 10..19 in chunk 1, ...
    .groupby("object_id", pl.col("vehicle_index") // sub_sequence_length)
    .agg([pl.col("h3")])
    # the joined string acts as the identifier of the chunk
    .with_columns([pl.col("h3").arr.join("-")])
    .groupby(pl.col("h3"))
    .count()
    # keep chunk identifiers that were seen more than once
    .filter(pl.col("count") > 1)
    # back to one row per step so the coordinates can be joined in
    .with_columns([pl.col("h3").str.split("-").alias("h3s")])
    .explode("h3s")
    .join(map_df, left_on="h3s", right_on="h3")
    .groupby("h3")
    .agg(
        [
            pl.col("h3s"),
            pl.col("utm_coords"),
        ]
    )
    .select([pl.col("h3"), pl.col("h3s"), pl.col("utm_coords")])
    # drop chunks that do not have exactly sub_sequence_length steps
    .filter(pl.col("h3s").arr.lengths() == sub_sequence_length)
)

sub_trajectories


In [None]:
from numba import jit
import h3


@jit(nopython=True)
def euclidean_distance(points1, points2):
    """Return the mean row-wise Euclidean distance between two arrays.

    Rows are paired by position: the squared differences are summed along
    axis 1, square-rooted, and the resulting per-row distances are averaged.
    """
    deltas = points1 - points2
    per_row = np.sqrt((deltas ** 2).sum(axis=1))
    return per_row.mean()


# @jit(nopython=True)
def compute_distances(pairs, distance_func=None):
    """Sum step-wise cell distances for every pair of rows in ``pairs``.

    Parameters
    ----------
    pairs : sequence of sequences
        One row per sub-trajectory, one entry per step. Entries are
        converted with ``str`` before being handed to the distance function.
    distance_func : callable, optional
        ``distance_func(a, b)`` returning the distance between two cells.
        Defaults to ``h3.h3_distance``.

    Returns
    -------
    np.ndarray
        Condensed vector of length ``n * (n - 1) / 2``; entry ``k`` is the sum
        of the step-wise distances of the ``k``-th row pair in the order
        ``(0, 1), (0, 2), ..., (1, 2), ...``. ``squareform`` turns it into a
        square matrix.
    """
    from itertools import combinations

    if distance_func is None:
        distance_func = h3.h3_distance

    n_rows = len(pairs)
    # Start from zeros: the totals are accumulated per pair, so uninitialised
    # memory (np.empty) would leak garbage into the result.
    res = np.zeros(n_rows * (n_rows - 1) // 2)

    for k, (p1, p2) in enumerate(combinations(range(n_rows), 2)):
        res[k] = sum(
            distance_func(str(z1), str(z2)) for z1, z2 in zip(pairs[p1], pairs[p2])
        )

    return res


In [None]:
import itertools
from tqdm import tqdm

# pairs = np.array([k for v in sub_trajectories['utm_coords'].to_list() for k in v]).reshape(len(sub_trajectories['utm_coords']), 5, 2)
# Array built from the per-sub-trajectory lists of H3 cell ids.
pairs = np.array(sub_trajectories["h3s"].to_list())

# Condensed pairwise distances, expanded to a square matrix.
res = squareform(compute_distances(pairs))

# Label rows and columns with the "-"-joined identifier of each sub-trajectory.
_labels = sub_trajectories["h3"].to_list()
distance_df = pd.DataFrame(res, index=_labels, columns=_labels)


In [None]:
# create a dataframe of the combinations
distance_df


In [None]:
# Split every object's track into 2-second windows and treat each window as
# its own pseudo-object, then list the cells seen in each window.
h3_obj_ids = (
    interest_df
    # one group per object and 2-second window of epoch_time
    .groupby_dynamic("epoch_time", every="2s", by="object_id")
    .agg([pl.col("h3")])
    # the row number makes the id of every window unique
    .with_row_count()
    .with_columns(
        [(pl.col("object_id") + "-" + pl.col("row_nr").cast(str)).alias("object_id")]
    )
    .drop(["row_nr", "epoch_time"])
    # one row per (window, cell observation)
    .explode("h3")
    .with_columns(
        [
            # number of rows that fall in each cell
            pl.col("object_id").count().over("h3").alias("count"),
        ]
    )
    # keep cells that appear in more than 2 rows
    .filter(pl.col("count") > 2)
    .drop("count")
)


# Wide table: one row per window id, one column per cell, values are how often
# the cell occurs in that window (0 where it does not occur).
h3_dataframe = (
    h3_obj_ids.pivot(
        values="h3",
        index="object_id",
        columns="h3",
        aggregate_function="count",
    )
    .fill_null(0)
    .to_pandas()
)


In [None]:
# Names of the cell columns, i.e. every column except the id column.
h3s = h3_dataframe.drop(columns="object_id").columns.to_list()


In [None]:
# do tsne on the h3 dataframe

from sklearn.manifold import TSNE

# Embed the per-window cell-count vectors in two dimensions with t-SNE, using
# the cosine metric.
tsne = TSNE(n_components=2, perplexity=30, n_iter=1000, verbose=1, metric="cosine")

tsne_results = tsne.fit_transform(h3_dataframe.drop(columns="object_id").to_numpy())

# merge the tsne results with the original dataframe
# (both frames carry the default positional index, so the rows line up)
tsne_df = pd.DataFrame(tsne_results, columns=["tsne_x", "tsne_y"])

tsne_df = pd.concat([h3_dataframe["object_id"], tsne_df], axis=1)

# plot the tsne results
fig = go.Figure()

# Plot at most 1000 randomly chosen points. DataFrame.sample raises when asked
# for more rows than exist, so the request is capped at the frame length.
_plot_df = tsne_df.sample(min(1000, len(tsne_df)))

fig.add_trace(
    go.Scatter(
        x=_plot_df["tsne_x"],
        y=_plot_df["tsne_y"],
        mode="markers",
    )
)


### Clustering on Sub Trajectories


In [None]:
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from scipy.spatial import ConvexHull
import numpy as np
import h3


In [None]:
# Transposed copy of the count table (cells as rows).
target_df = h3_dataframe.drop(columns="object_id").T.copy()

# Cluster the sub-trajectories directly on the precomputed distance matrix.
dbscan = DBSCAN(eps=20, min_samples=5, metric="precomputed").fit(distance_df)

dbscan.labels_


In [None]:
# One row per (sub-trajectory, cell): attach the cluster label to every
# sub-trajectory identifier, then split the identifier back into its cells.
label_df = (
    pd.DataFrame(dbscan.labels_, columns=["label"], index=distance_df.index)
    .reset_index()
    .rename(columns={"index": "h3"})
)
label_df["h3s"] = label_df["h3"].str.split("-")
label_df = label_df.explode("h3s")


In [None]:
# plot the clusters
fig = go.Figure()

# Every label is drawn, including -1 if present.
for cluster, _df in label_df.groupby("label"):
    # get the set of h3 cells for the cluster
    h3s = _df["h3s"].unique().tolist()

    # plot the extent of the h3 cells
    geometry = h3.h3_set_to_multi_polygon(h3s, geo_json=True)

    # A cluster can consist of several disconnected polygons. Draw the outer
    # ring of each one; a ``None`` entry breaks the line so that separate
    # polygons are not joined together.
    _lon, _lat = [], []
    for _polygon in geometry:
        _outer_ring = _polygon[0]
        _lon.extend(point[0] for point in _outer_ring)
        _lat.extend(point[1] for point in _outer_ring)
        _lon.append(None)
        _lat.append(None)

    fig.add_trace(
        go.Scattermapbox(
            lon=_lon,
            lat=_lat,
            mode="lines",
            name=f"cluster {cluster}",
        )
    )

# make the plot square
fig.update_layout(
    width=800,
    height=800,
    autosize=False,
    showlegend=True,
    xaxis=dict(
        scaleanchor="y",
        scaleratio=1,
    ),
)

# add mapbox
fig.update_layout(
    # Use the same base tiles as the other cluster maps in this notebook.
    # NOTE(review): this replaces "stamen-terrain", which recent plotly
    # releases document as requiring a Stadia Maps token — confirm.
    mapbox_style="open-street-map",
    # center on Tuscaloosa
    mapbox_center_lat=33.2,
    mapbox_center_lon=-87.5,
    mapbox_zoom=10,
)


fig.show()


### Clustering on H3 Vectors


In [None]:
# tsne_df[tsne_df["cluster"] == cluster,].to_list()


In [None]:
## Cluster using DBSCAN
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from scipy.spatial import ConvexHull
import numpy as np
import h3


# Cluster the per-window cell-count vectors with the cosine metric and store
# the label next to each window id.
dbscan = DBSCAN(eps=0.2, min_samples=7, metric="cosine")
dbscan.fit(h3_dataframe.drop(columns="object_id"))
tsne_df["cluster"] = dbscan.labels_

# create a dictionary of included hexagons
cluster_h3s = {}

for cluster in tsne_df["cluster"].unique():
    # negative labels are skipped
    if cluster < 0:
        continue

    # window ids assigned to this cluster
    _member_ids = tsne_df.loc[tsne_df["cluster"] == cluster, "object_id"].to_list()
    _member_rows = h3_obj_ids.filter(pl.col("object_id").is_in(_member_ids))

    # distinct cells touched by those windows
    h3s = _member_rows["h3"].unique().to_list()
    cluster_h3s[cluster] = set(h3s)


In [None]:
# plot the clusters
fig = go.Figure()

for cluster, h3s in cluster_h3s.items():
    if cluster == -1:
        continue

    # plot the extent of the h3 cells
    geometry = h3.h3_set_to_multi_polygon(h3s, geo_json=True)

    # A cluster can consist of several disconnected polygons. Draw the outer
    # ring of each one; a ``None`` entry separates the rings so every polygon
    # is outlined and filled on its own.
    _lon, _lat = [], []
    for _polygon in geometry:
        _outer_ring = _polygon[0]
        _lon.extend(point[0] for point in _outer_ring)
        _lat.extend(point[1] for point in _outer_ring)
        _lon.append(None)
        _lat.append(None)

    fig.add_trace(
        go.Scattermapbox(
            lon=_lon,
            lat=_lat,
            mode="lines",
            name=f"cluster {cluster}",
            fill="toself",
        )
    )

# make the plot square
fig.update_layout(
    width=800,
    height=800,
    autosize=False,
    showlegend=True,
    xaxis=dict(
        scaleanchor="y",
        scaleratio=1,
    ),
)

# add mapbox
fig.update_layout(
    mapbox_style="open-street-map",
    # center on Tuscaloosa
    mapbox_center_lat=33.2,
    mapbox_center_lon=-87.5,
    mapbox_zoom=10,
)


fig.show()


In [None]:
# One row per (cell, object): the mean in-direction speed inside the cell and
# the first epoch_time of the group, ordered per object by that time.
_mean_speed = pl.col("f32_velocityInDir_mps").mean().alias("f32_velocityInDir_mps")
_first_seen = pl.col("epoch_time").first().alias("epoch_time")

vehicle_h3s = (
    interest_df.groupby(["h3", "object_id"])
    .agg([_mean_speed, _first_seen])
    .sort(["object_id", "epoch_time"])
)


### Consume all Clusters completely in other Clusters


In [None]:
# Snapshot of the cluster ids; ``cluster_h3s`` itself shrinks during the scan.
clusters = list(cluster_h3s)

interest_df = interest_df.sort(["object_id", "epoch_time"]).with_columns([])
start_end = {}

# ordered pairs of clusters that share at least one cell
overlaps = []

for cluster_1 in clusters:
    for cluster_2 in clusters:
        # skip self-pairs and clusters that were already absorbed
        if cluster_1 == cluster_2:
            continue
        if cluster_1 not in cluster_h3s or cluster_2 not in cluster_h3s:
            continue

        c1_h3s = cluster_h3s[cluster_1]
        c2_h3s = cluster_h3s[cluster_2]
        _n_shared = len(c1_h3s & c2_h3s)

        if _n_shared:
            overlaps.append((cluster_1, cluster_2))

        # try to consume the small ones.
        # A cluster whose cells are all shared is fully contained in the other.
        if _n_shared >= len(c2_h3s):
            del cluster_h3s[cluster_2]
            print(f"popping {cluster_2}")
        elif _n_shared >= len(c1_h3s):
            del cluster_h3s[cluster_1]
            print(f"popping {cluster_1}")

#     if (cluster_1 in cluster_h3s) and (cluster_1 not in start_end):
#         # build paths that include the cluster


#     if (cluster_1 in cluster_h3s) and (cluster_1 not in start_end):

#         # find the start and end of each segment by looking at vehicles in the cluster
#         start_n_ends = (
#             interest_df.filter(
#                 pl.col('h3').is_in(list(cluster_h3s[cluster_1]))
#             ).groupby([
#                 'object_id'
#             ]).agg([
#                 pl.col('h3').first().alias('start'),
#                 pl.col('h3').last().alias('end')
#             ])
#         )


#         # take any hexagon which has 10% of the arrivals or departures
#         starts = start_n_ends['start'].value_counts().with_columns([
#             (pl.col('counts') / pl.col('counts').sum())
#         ]).filter(
#             pl.col('counts') > 0.1
#         )['start'].to_list()

#         ends = start_n_ends['end'].value_counts().with_columns([
#             (pl.col('counts') / pl.col('counts').sum())
#         ]).filter(
#             pl.col('counts') > 0.1
#         )['end'].to_list()

#         start_end[cluster_1] = (set(starts), set(ends))


# # create a distance matrix of overlap #

# overlaps_counts = {}
# for c1, c2 in overlaps:
#     # first try c1->c2:
#     s, e = start_end[c1]
#     s2, e2 = start_end[c2]
#     overlaps_counts[(c1, c2)] = len(e.intersection(s2))
#     overlaps_counts[(c2, c1)] = e2.intersection(s)


# # istead create a prob using vehicle transitions
# # if a vehicle goes from 1 -> 2 -> 1, then discard 2 from its trajectory
# # what to do if it is multiple at the same time


### Map the Label of Each Cluster to the Trajectories


In [None]:
# For every object, derive the list of clusters it passes through.
# Input: ``vehicle_h3s`` (one row per cell and object) and ``cluster_h3s``
# (cluster id -> set of cells).
cluster_df = (
    vehicle_h3s.with_columns(
        [
            # pl.when(pl.col('h3').is_in(list(v))).then(pl.lit(str(c))).otherwise(None).alias(str(c)) for c, v in cluster_h3s.items()
            # ids of all clusters whose cell set contains this cell
            # (the lambda parameter is named h3 and hides the h3 module
            # inside the lambda only)
            pl.col("h3")
            .apply(lambda h3: [c for c, v in cluster_h3s.items() if h3 in v])
            .alias("clusters")
        ]
    )
    # one row per (cell, cluster) membership; cells in no cluster are dropped
    .explode("clusters")
    .filter(pl.col("clusters").is_not_null())
    # per object and cluster: first/last time in the group and number of rows
    .groupby(["object_id", "clusters"])
    .agg(
        [
            pl.col("epoch_time").first().alias("start_time"),
            pl.col("epoch_time").last().alias("end_time"),
            pl.col("h3").len().alias("num_h3s"),
        ]
    )
    .sort(["object_id", "start_time", "end_time"])
    .with_columns(
        [
            # start/end time of the previous row of the same object; the
            # backward fill gives the first row a value instead of null
            *(
                pl.col(c)
                .shift(1)
                .backward_fill(1)
                .over("object_id")
                .alias(f"{c}_shifted")
                for c in ["start_time", "end_time"]
            ),
            (pl.col("end_time") - pl.col("start_time")).dt.seconds().alias("duration"),
            # share of the cluster's cells that this object touched
            (
                pl.col("num_h3s")
                / pl.col("clusters").apply(lambda x: len(cluster_h3s[x]))
            ).alias("percent_h3s"),
        ]
    )
    # keep rows that end no earlier than the previous row, cover more than a
    # quarter of the cluster's cells and have a positive duration
    .filter(
        (pl.col("end_time") >= pl.col("end_time_shifted"))
        & (pl.col("percent_h3s") > 0.25)
        & (pl.col("duration") > 0)
    )
    # rows were removed, so recompute the previous-row columns and apply the
    # end-time check once more
    .with_columns(
        [
            pl.col(c).shift(1).backward_fill(1).over("object_id").alias(f"{c}_shifted")
            for c in ["start_time", "end_time"]
        ]
    )
    .filter(pl.col("end_time") >= pl.col("end_time_shifted"))
    # collect the remaining clusters of each object into one list
    .groupby(["object_id"])
    .agg([pl.col("clusters")])
    .with_columns(
        [
            pl.col('clusters').arr.lengths().alias('num_clusters'),
        ]
    )
)

# Transition table: one row per consecutive pair of clusters in an object's
# list ("from" -> "to"); the last cluster of each object has no successor and
# is dropped.
to_from_df = (
    cluster_df.explode("clusters")
    .with_columns(
        [pl.col("clusters").shift(-1).over("object_id").alias("clusters_shifted")]
    )
    .filter(pl.col("clusters_shifted").is_not_null())
    .rename({"clusters": "from", "clusters_shifted": "to"})
    .to_pandas()
)


In [None]:
cluster_df.filter(pl.col('num_clusters') > 15).to_pandas().iloc[0]['clusters']

In [None]:
# Collapse to one row per (from, to) transition. After ``count`` the remaining
# columns (e.g. "object_id") hold the number of non-null rows in that group.
to_from_df = to_from_df.groupby(by=['from', 'to']).count().reset_index()

In [None]:
import networkx as nx

# Directed graph of cluster transitions: one edge per (from, to) row, with the
# row's "object_id" value (a count at this point) stored as an edge attribute.
# NOTE(review): edge_key is documented for multigraphs; with nx.DiGraph it
# presumably has no effect — confirm against the networkx documentation.
G = nx.from_pandas_edgelist(
    to_from_df, 
    source="from", 
    target="to", 
    edge_attr="object_id",
    edge_key="object_id",
    create_using=nx.DiGraph
)


In [None]:
## Find the Shortest Path
G.get_edge_data(7, 54)

In [None]:
# plot the graph
nx.draw(G, with_labels=True)


In [None]:
# Nodes with at least one outgoing edge and no incoming edge.
sources = [
    node
    for node in G.nodes()
    if G.out_degree(node) > 0 and G.in_degree(node) < 1
]
sources


In [None]:
# Nodes with at least one incoming edge and no outgoing edge.
sinks = [
    node
    for node in G.nodes()
    if G.out_degree(node) < 1 and G.in_degree(node) >= 1
]
sinks


In [None]:
# Print, for every (source, sink) combination, all simple edge paths between
# them that respect the cutoff of 19.
for source in sources:
    for target in sinks:
        print(list(nx.simple_paths.all_simple_edge_paths(G, source, target, cutoff=19)))

In [None]:
G.in_degree(5)


In [None]:
# plot the clusters
fig = go.Figure()

for cluster, h3s in cluster_h3s.items():
    if cluster == -1:
        continue

    # if cluster not in sources + sinks:
    #     continue
    # hand-picked subset of cluster ids to display
    if cluster not in [18, 38, 34, 54,  1, 29, 30,  2, 31,  3,  4, 32, 58, 40,  5, 37, 39, 9,  0]:
        continue

    # graph sources are drawn in red, everything else in blue
    color = "red" if cluster in sources else "blue"

    # plot the extent of the h3 cells
    geometry = h3.h3_set_to_multi_polygon(h3s, geo_json=True)

    # A cluster can consist of several disconnected polygons. Draw the outer
    # ring of each one; a ``None`` entry separates the rings so every polygon
    # is outlined and filled on its own.
    _lon, _lat = [], []
    for _polygon in geometry:
        _outer_ring = _polygon[0]
        _lon.extend(point[0] for point in _outer_ring)
        _lat.extend(point[1] for point in _outer_ring)
        _lon.append(None)
        _lat.append(None)

    fig.add_trace(
        go.Scattermapbox(
            lon=_lon,
            lat=_lat,
            mode="lines",
            name=f"cluster {cluster}",
            fill="toself",
            fillcolor=color,
            line_color=color,
        )
    )

# make the plot square
fig.update_layout(
    width=800,
    height=800,
    autosize=False,
    showlegend=True,
    xaxis=dict(
        scaleanchor="y",
        scaleratio=1,
    ),
)

# add mapbox
fig.update_layout(
    mapbox_style="open-street-map",
    # center on Tuscaloosa
    mapbox_center_lat=33.2,
    mapbox_center_lon=-87.5,
    mapbox_zoom=10,
)


fig.show()


## Geometry Methods


In [None]:
# Re-run the project's H3 tagging step on the selected radar's data with the
# resolution set to 14.
f.h3_resolution = 14
interest_df = f.radar_to_h3(interest_df)


In [None]:
# One row per object holding the distinct H3 cells it was seen in.
_distinct_cells = pl.col("h3").unique()
traj_df = interest_df.groupby("object_id").agg([_distinct_cells])


In [None]:
# Pick up to two cells per resolution-13 parent cell, taken from the frame
# after it has been sorted by row count in descending order.
significant_hex = (
    interest_df.groupby("h3")
    .agg(
        [
            # number of rows recorded in the cell
            pl.count("object_id").alias("count"),
        ]
    )
    .with_columns(
        [
            # parent cells at resolutions 12 and 13; only h3_13 is used below
            pl.col("h3").apply(lambda x: h3.h3_to_parent(x, 12)).alias("h3_12"),
            pl.col("h3").apply(lambda x: h3.h3_to_parent(x, 13)).alias("h3_13"),
        ]
    )
    .sort("count", descending=True)
    # NOTE(review): taking head(2) per group relies on the sorted row order
    # being kept inside each group — confirm for the polars version in use.
    .groupby(["h3_13"])
    .agg(
        [
            pl.col("h3").head(2),
            pl.col("count").head(2),
        ]
    )
    # back to one row per selected cell
    .explode(["h3", "count"])
)


In [None]:
# create a fully connected graph of the h3 cells
from networkx import from_pandas_edgelist, Graph


# Every ordered pair of significant cells, including each cell paired with
# itself (Cartesian product of the column with itself).
h3_product = list(itertools.product(significant_hex["h3"], repeat=2))

# make a dataframe of the product
h3_product_df = pl.DataFrame(h3_product, schema=["h3_1", "h3_2"])


In [None]:
# Distinct (cell, object) combinations, restricted to cells that occur in the
# first column of the pair table.
vehicle_parts = (
    interest_df.unique(maintain_order=False, subset=["h3", "object_id"])
    .select(["h3", "object_id"])
    .filter(pl.col("h3").is_in(h3_product_df["h3_1"].to_list()))
)


In [None]:
# For every ordered pair of significant cells, list the objects that were seen
# in both cells.
# The first join attaches the objects seen in h3_1. The second join matches on
# the cell AND the object id, so only objects that also appear in h3_2 remain.
# Joining on both keys replaces the former filter on "object_id_1" /
# "object_id_2": the joins never produced columns with those names, so that
# filter could not be evaluated.
h3_product_df.lazy().join(
    vehicle_parts.lazy(),
    left_on="h3_1",
    right_on="h3",
    how="inner",
).join(
    vehicle_parts.lazy(),
    left_on=["h3_2", "object_id"],
    right_on=["h3", "object_id"],
    how="inner",
).collect()


In [None]:
# check if the connections appear in the dataset
# NOTE(review): ``c`` is overwritten on every iteration and is not used by any
# of the cells shown here, so the loop currently yields no stored result.
for h3_1, h3_2 in tqdm(h3_product):
    # number of traj_df rows kept by the is_in filter for this pair of cells
    c = traj_df.filter(pl.col("h3").is_in([h3_1, h3_2])).shape[0]


In [None]:
interest_df


## Clustering Trajectories Themselves


In [None]:
from scipy.spatial.distance import directed_hausdorff
from sklearn.cluster import DBSCAN
import numpy as np


In [None]:
def hausdorff(u, v):
    """Return the symmetric Hausdorff distance between point sets u and v.

    This is the larger of the two directed Hausdorff distances
    (u to v and v to u).
    """
    forward = directed_hausdorff(u, v)[0]
    backward = directed_hausdorff(v, u)[0]
    return backward if backward > forward else forward


In [None]:
veh_df = interest_df.groupby(["object_id", "h3"]).first()


In [None]:
# One list of [x, y] positions per object.
# ``veh_df`` comes out of a group-by, so its row order is not guaranteed. The
# Fréchet distance computed from these lists depends on the order of the
# points, so sort by time before collecting the positions of each object.
_trajectories = (
    veh_df.sort("epoch_time")
    .with_columns(
        pl.struct(
            [
                "f32_positionX_m",
                "f32_positionY_m",
            ]
        )
        .apply(lambda x: [x["f32_positionX_m"], x["f32_positionY_m"]])
        .alias("position")
    )
    .groupby("object_id")
    .agg(["position"])
)

# Random subset of at most 100 objects; capping at the frame height keeps
# ``sample`` from failing when fewer objects are available.
vehicles_ = (
    _trajectories.sample(min(100, _trajectories.height))["position"].to_list()
)


In [None]:
# calculate the pairwise distance matrix
import similaritymeasures

# Symmetric matrix of Fréchet distances between all sampled trajectories; the
# diagonal stays at zero.
_n_vehicles = len(vehicles_)
distance_matrix = np.zeros((_n_vehicles, _n_vehicles))

for i in range(_n_vehicles):
    u = vehicles_[i]
    for j in range(i + 1, _n_vehicles):
        # compute each pair once and mirror it across the diagonal
        distance_matrix[i, j] = similaritymeasures.frechet_dist(u, vehicles_[j])
        distance_matrix[j, i] = distance_matrix[i, j]


In [None]:
# Cluster the sampled trajectories on the precomputed Fréchet distances.
cl = DBSCAN(eps=100, min_samples=2, metric="precomputed").fit(distance_matrix)

# add the cluster labels to the tsne dataframe


### Create the Sub Trajectories


In [None]:
# Pair every sampled trajectory with the cluster label it received.
_labelled = {
    "cluster": cl.labels_,
    "position": vehicles_,
}
cluster_df = pl.DataFrame(_labelled)

# number of trajectories per label
cluster_df["cluster"].value_counts()


#### Plot the clusters


In [None]:
# One marker trace per cluster label containing every point of every
# trajectory assigned to that label.
fig = go.Figure()

for cluster, c_df in cluster_df.groupby("cluster"):
    v = c_df["position"].to_list()

    # flatten the list of trajectories into one list of [x, y] points
    _points = [point for trajectory in v for point in trajectory]

    fig.add_trace(
        go.Scatter(
            x=[point[0] for point in _points],
            y=[point[1] for point in _points],
            mode="markers",
            name=f"cluster {cluster}",
        )
    )

fig.show()


In [None]:
v


# next steps:

- use the convex hull to get the shape of the cluster
- find overlaps and merge clusters
- if there are three or more clusters, it is a junction


In [None]:
tsne_df["cluster"].unique()


In [None]:
# NOTE(review): ``h3_df`` is not defined in any cell shown in this notebook;
# this cell presumably relies on a cell that was removed — confirm.
h3_df["count"].median()


In [None]:
# Cells whose count is at least 6.
# NOTE(review): ``h3_df`` is not defined in any cell shown in this notebook;
# this cell presumably relies on a cell that was removed — confirm.
h3_df.loc[h3_df["count"] >= 6].h3


## Load Data


In [None]:
# create the file paths
network_outline_file = repo_root / "geo_data" / "network_outline.geojson"


In [None]:
import json
from shapely.geometry import Polygon

# Load the network outline. The handle is named ``fp`` so that it does not
# overwrite ``f``, the Filtering instance used by earlier cells.
with open(network_outline_file, "r", encoding="utf-8") as fp:
    json_data = json.load(fp)

# Polygon built from the first coordinate ring of the first feature.
ls = Polygon(json_data["features"][0]["geometry"]["coordinates"][0])


In [None]:
import osmnx as ox


In [None]:
# Build the OSMnx street graph for the area inside the outline polygon and
# draw it.
road_network = ox.graph_from_polygon(ls)
ox.plot_graph(road_network)

# project to UTM
road_network = ox.project_graph(road_network)


In [None]:
import numpy as np

# For every edge, compute the end points of a short segment through the edge
# midpoint, perpendicular to the straight line between the edge's two nodes.
# NOTE(review): p1 and p2 are not used after being computed in the cells shown
# here; the loop looks unfinished (see the trailing comment).
for e in road_network.edges:
    # get the nodes
    # (each edge is unpacked as a 3-tuple; this rebinds the names u, v and _)
    u, v, _ = e
    # get the node attributes
    u_data = road_network.nodes[u]
    v_data = road_network.nodes[v]

    # get the midpoint between the nodes
    pp = (u_data["x"] + v_data["x"]) / 2, (u_data["y"] + v_data["y"]) / 2
    # get the angle
    angle = np.arctan2(v_data["y"] - u_data["y"], v_data["x"] - u_data["x"])
    # rotate by 90 degrees to point across the edge
    angle += np.pi / 2

    # draw a line perpendicular to the edge
    # (10 coordinate units to either side of the midpoint; presumably metres
    # since the graph was projected — confirm)
    p1 = pp[0] + np.cos(angle) * 10, pp[1] + np.sin(angle) * 10
    p2 = pp[0] - np.cos(angle) * 10, pp[1] - np.sin(angle) * 10

    #  add some width to the polygon

In [None]:
u_data
