# Graph-learning Preprocessing
### Preprocessing the road network for doing graph learning
*Written by - Rasmus Bergman rbvp20@student.aau.dk*

This is a preprocessing pipeline that builds a dataset for graph learning on the accident data from vejman.dk.

### **Prerequisites**
- Running mastra.ipynb
- Running vejman.ipynb
- Running OSM_fetching.ipynb
- Running aadt_tracker.ipynb

In [1]:
# Import libraries
import geopandas as gpd
import pandas as pd
import osmnx as ox
import constants as c
import numpy as np
import json
import os
from sklearn.preprocessing import OneHotEncoder

# Config
# Turn on OSMnx's console logging and its cache through the `ox.settings`
# module (see the OSMnx settings documentation for the exact semantics).
ox.settings.console_log = True
ox.settings.use_cache = True

# Parameters

# Radius of the buffer drawn around every accident; nodes that fall inside a
# buffer are attributed to that accident.
ACCIDENT_RADIUS_METERS = 50

# Resolution of the geographic grid used for the train/validation/test split.
X_SLICES, Y_SLICES = 30, 50

# Every grid cell draws a uniform number in [0, 1). Cells below TEST_THRESHOLD
# are test cells, cells in [TEST_THRESHOLD, VALIDATION_THRESHOLD) are
# validation cells, and all remaining cells are training cells.
TEST_THRESHOLD, VALIDATION_THRESHOLD = 0.2, 0.3

# Values of the node "highway" column that get their own one-hot column.
# The order defines the column order of the encoded output.
NODE_CATEGORIES = [
    "give_way",
    "crossing",
    "turning_circle",
    "traffic_signals",
    "bus_stop",
]

# Values of the edge "highway" column that get their own one-hot column.
# The order defines the column order of the encoded output.
EDGE_CATEGORIES = [
    "residential", "tertiary", "secondary", "primary",
    "motorway", "living_street", "motorway_link", "trunk",
    "secondary_link", "tertiary_link", "primary_link", "trunk_link",
]

# Fix NumPy's global seed so the random geographic split is reproducible.
np.random.seed(420)

In [2]:
# Read the vejman.dk accidents and replace each accident geometry with a
# buffer of ACCIDENT_RADIUS_METERS around it. The untouched geometry is kept in
# "old_geometry" so exact distances can still be measured later.
# NOTE(review): the buffer distance is expressed in the CRS units of the file;
# this presumes the accidents are stored in a metric CRS (the nodes are
# reprojected to EPSG:25832 elsewhere) - TODO confirm.
accidents = gpd.read_file(c.VEJMAN_PATH)
accidents["old_geometry"] = accidents["geometry"]
accidents_buffered = accidents["geometry"].buffer(ACCIDENT_RADIUS_METERS)
accidents["geometry"] = accidents_buffered

In [3]:
# Read every node of the OSM network and reproject it to EPSG:25832 in one go
gdf_nodes = gpd.read_file(c.NODE_GDF_PATH).to_crs("EPSG:25832")

In [4]:
# Pair every node with each accident whose buffer (ACCIDENT_RADIUS_METERS)
# it falls in; a node appears once per matching accident.
all_accident_nodes = gpd.sjoin(gdf_nodes, accidents, how="inner")


In [5]:
# Distance from each node to the original (unbuffered) accident geometry,
# turned into a weight: 1 on top of the accident, falling linearly to 0 at
# ACCIDENT_RADIUS_METERS.
node_to_accident = all_accident_nodes["geometry"].distance(all_accident_nodes["old_geometry"])
all_accident_nodes["distance"] = 1 - node_to_accident / ACCIDENT_RADIUS_METERS


In [6]:
# Add up the per-accident weights of every node (one row per osmid)
all_accident_nodes_grouped = (
    all_accident_nodes
    .groupby("osmid")["distance"]
    .sum()
    .reset_index()
)

# Normalise by the largest total so the scores lie between 0 and 1
highest_total = all_accident_nodes_grouped["distance"].max()
all_accident_nodes_grouped["distance"] = all_accident_nodes_grouped["distance"] / highest_total

In [34]:
# Attach the accident score to every node; nodes that were not near any
# accident get a score of 0.
# Note: accident_nodes is the same object as gdf_nodes (no copy is made), so
# the new column is visible through both names.
accident_nodes = gdf_nodes
score_by_osmid = all_accident_nodes_grouped.set_index("osmid")["distance"]
accident_nodes["accident_score"] = accident_nodes["osmid"].map(score_by_osmid).fillna(0)

In [35]:
# Load the per-node table stored at c.NODE_CSV_PATH. It is joined onto the
# nodes by "osmid" below, where its "AADT" column becomes "traffic_flow".
aadt = pd.read_csv(c.NODE_CSV_PATH)

In [36]:
# Left-join the AADT table onto the nodes and expose it as "traffic_flow";
# nodes without a match get a flow of 0.
# NOTE(review): presumes at most one aadt row per osmid, otherwise the join
# duplicates nodes - TODO confirm.
accident_nodes = (
    accident_nodes
    .merge(aadt, on="osmid", how="left")
    .rename(columns={"AADT": "traffic_flow"})
)
accident_nodes["traffic_flow"] = accident_nodes["traffic_flow"].fillna(0)

In [None]:
# Draw one uniform number per grid cell and threshold it into split masks:
# below TEST_THRESHOLD -> test, from there up to VALIDATION_THRESHOLD ->
# validation. Cells in neither mask are training cells.
train_test_split = np.random.uniform(low=0, high=1, size=(X_SLICES, Y_SLICES))
geographic_test_mask = train_test_split < TEST_THRESHOLD
geographic_validation_mask = ~geographic_test_mask & (train_test_split < VALIDATION_THRESHOLD)

In [None]:
# Bin the node coordinates into equal-width slices per axis; with labels=False
# pd.cut returns the integer bin number, which is the node's grid cell index.
for axis, n_slices in (("x", X_SLICES), ("y", Y_SLICES)):
    accident_nodes[f"{axis}_slice"] = pd.cut(accident_nodes[axis], n_slices, labels=False)

In [None]:
# Look up, for every node, the split of the grid cell it lies in
node_cell = (accident_nodes["x_slice"], accident_nodes["y_slice"])
test_mask = geographic_test_mask[node_cell]
validation_mask = geographic_validation_mask[node_cell]
# Everything that is neither test nor validation is used for training
train_mask = ~test_mask & ~validation_mask

# One boolean column per split, one row per node
splits = pd.DataFrame({
    "test": np.ravel(test_mask),
    "validation": np.ravel(validation_mask),
    "train": np.ravel(train_mask),
})

In [None]:
# Persist the split masks (the row index is written as the first column)
splits_path = os.path.join(c.GRAPH_DATASET_DIR, "splits.csv")
splits.to_csv(splits_path)

In [None]:
# One-hot encode the node "highway" value against the fixed NODE_CATEGORIES
# list, producing a dense array with one column per listed category.
node_category_encoder = OneHotEncoder(
    categories=[NODE_CATEGORIES],
    handle_unknown="infrequent_if_exist",
    sparse_output=False,
)

# The encoder expects a 2-D input: one row per node, a single feature column
node_highway_column = accident_nodes["highway"].values.reshape(-1, 1)
oneHot = node_category_encoder.fit_transform(node_highway_column)

In [None]:
# Swap the raw columns for the one-hot node categories (joined on the row index)
node_one_hot = pd.DataFrame(oneHot, columns=NODE_CATEGORIES)
accident_nodes = (
    accident_nodes
    .drop(columns=["highway", "ref", "geometry", "street_count"])
    .join(node_one_hot)
)

# Remember each node's row label before re-indexing by osmid, so edges can
# later refer to nodes by that label.
accident_nodes["old_index"] = accident_nodes.index
accident_nodes = accident_nodes.set_index("osmid")

In [None]:
# Write node positions and node features as two separate CSV files (no index column)
pos_path = os.path.join(c.GRAPH_DATASET_DIR, "pos.csv")
nodes_path = os.path.join(c.GRAPH_DATASET_DIR, "nodes.csv")
node_feature_columns = ["accident_score", "traffic_flow", *NODE_CATEGORIES]

accident_nodes[["x", "y"]].to_csv(pos_path, index=False)
accident_nodes[node_feature_columns].to_csv(nodes_path, index=False)

In [None]:
edges = []

# The edge file is parsed as text, line by line, instead of through geopandas;
# the original note on this cell reports that as orders of magnitude faster.
# The parsing relies on the file layout: four header lines that are skipped,
# then one JSON feature per line (optionally followed by a comma), and a line
# containing only "]" that ends the feature list.

# Build the osmid -> old_index lookup once. A plain dict lookup per endpoint
# avoids materialising a whole node row with `.loc[...]` for every edge.
# NOTE(review): presumes osmid is unique in accident_nodes - TODO confirm.
old_index_by_osmid = accident_nodes["old_index"].to_dict()

# GeoJSON is UTF-8 by specification, so do not rely on the platform default.
with open(c.EDGE_GDF_PATH, encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        if i <= 4:
            continue
        stripped_line = line.strip().removesuffix(",")
        if stripped_line == "]":
            break
        row = json.loads(stripped_line)
        prop = row["properties"]
        # Raises KeyError if an edge refers to a node that is not in accident_nodes
        from_node = old_index_by_osmid[prop["u"]]
        to_node = old_index_by_osmid[prop["v"]]

        edges.append([from_node, to_node, prop["lanes"], prop["highway"], prop["maxspeed"], prop["length"], prop["oneway"]])

        if i % 10_000 == 0:
            print(f"Processed {i} lines", end="\r")

In [None]:
# Turn the parsed rows into a table
edge_columns = ["from", "to", "lanes", "highway", "maxspeed", "length", "oneway"]
edges = pd.DataFrame(edges, columns=edge_columns)

# Type conversion: endpoints are integer node positions, and a missing lane
# count is treated as a single lane.
edges = edges.astype({"from": int, "to": int})
edges["lanes"] = edges["lanes"].fillna(1).astype(int)

In [None]:
# Find nearest speed limit for edges with no speed limit
#
# "signals" and "none" are placeholders, not numeric limits. Every such edge
# takes the limit of the closest edge that has a usable value. The search moves
# outwards ring by ring: first the edges that end where this edge starts or
# start where this edge ends, then the edges connected to those in the same
# way, and so on.
#
# Each edge is visited at most once per search, so the search ends when the
# reachable part of the network is exhausted instead of looping forever. If no
# usable limit can be reached, the edge gets NaN.
# NOTE(review): decide whether NaN is acceptable downstream or must be imputed.

PLACEHOLDER_SPEEDS = ("signals", "none")


def _is_usable_speed(value):
    """Return True if `value` is a real speed limit, not a placeholder or missing."""
    if value is None or value in PLACEHOLDER_SPEEDS:
        return False
    return not (isinstance(value, float) and np.isnan(value))


for i, edge in edges[(edges["maxspeed"] == "signals") | (edges["maxspeed"] == "none")].iterrows():
    new_speed = np.nan

    # Edges already inspected in this search; the edge itself never qualifies
    visited = pd.Series(False, index=edges.index)
    visited.loc[i] = True

    is_neighbour = (edges["to"] == edge["from"]) | (edges["from"] == edge["to"])
    ring = edges[is_neighbour & ~visited]

    while not ring.empty:
        usable = ring["maxspeed"].map(_is_usable_speed).astype(bool)
        if usable.any():
            # First usable edge of the closest ring, in table order
            new_speed = ring.loc[usable, "maxspeed"].iloc[0]
            break
        visited.loc[ring.index] = True
        is_connected = edges["to"].isin(ring["from"]) | edges["from"].isin(ring["to"])
        ring = edges[is_connected & ~visited]

    edges.loc[i, "maxspeed"] = new_speed

edges["maxspeed"] = edges["maxspeed"].astype(float)

In [None]:
# One-hot encode the edge "highway" value against the fixed EDGE_CATEGORIES
# list and replace the raw column with the encoded columns.
edge_category_encoder = OneHotEncoder(
    categories=[EDGE_CATEGORIES],
    handle_unknown="infrequent_if_exist",
    sparse_output=False,
)

# The encoder expects a 2-D input: one row per edge, a single feature column
edge_highway_column = edges["highway"].values.reshape(-1, 1)
oneHot = edge_category_encoder.fit_transform(edge_highway_column)

edge_one_hot = pd.DataFrame(oneHot, columns=EDGE_CATEGORIES)
edges = edges.drop(columns=["highway"]).join(edge_one_hot)

In [None]:
# Write the edge list and the edge features as two separate CSV files (no index column)
edges_path = os.path.join(c.GRAPH_DATASET_DIR, "edges.csv")
edge_attributes_path = os.path.join(c.GRAPH_DATASET_DIR, "edges_attributes.csv")
edge_feature_columns = ["lanes", "maxspeed", "length", "oneway", *EDGE_CATEGORIES]

edges[["from", "to"]].to_csv(edges_path, index=False)
edges[edge_feature_columns].to_csv(edge_attributes_path, index=False)