In [1]:
# Import libraries
import geopandas as gpd
import pandas as pd
import os
import matplotlib.pyplot as plt
from pyproj import Transformer
import constants as c
import networkx as nx
import numpy as np
import json
from sklearn.preprocessing import OneHotEncoder

# Parameters

# Size of the geographic grid used for the train/validation/test split:
# node x coordinates are cut into X_SLICES bins, y coordinates into Y_SLICES bins.
X_SLICES = 30
Y_SLICES = 50

# Each grid cell receives a uniform draw in [0, 1):
#   draw <  TEST_THRESHOLD                          -> test cell
#   TEST_THRESHOLD <= draw < VALIDATION_THRESHOLD   -> validation cell
#   otherwise                                       -> train cell
# Note that VALIDATION_THRESHOLD is an upper bound, not a share: the validation
# band is VALIDATION_THRESHOLD - TEST_THRESHOLD wide.
TEST_THRESHOLD = 0.2
VALIDATION_THRESHOLD = 0.3

# Values of the node "highway" column that get their own one-hot column.
# The order here is the column order of the encoded output.
NODE_CATEGORIES = [
    "give_way", 
    "crossing", 
    "turning_circle", 
    "traffic_signals", 
    "bus_stop"
]

# Values of the edge "highway" column that get their own one-hot column.
# The order here is the column order of the encoded output.
EDGE_CATEGORIES = [
    "residential",
    "tertiary",
    "secondary",
    "primary",
    "motorway",
    "living_street",
    "motorway_link",
    "trunk",
    "secondary_link",
    "tertiary_link",
    "primary_link",
    "trunk_link"
]

# Seed NumPy's global RNG so random draws are repeatable on a fresh
# top-to-bottom run of the notebook.
np.random.seed(9000)

In [25]:
# Assign every cell of the X_SLICES x Y_SLICES grid to test / validation / train
# by drawing one uniform number per cell and thresholding it.
#
# A dedicated RandomState is used instead of the global NumPy RNG so that this
# cell is idempotent: re-running it (or running it out of order) always yields
# the same split. RandomState(9000) produces the same stream as the global RNG
# right after np.random.seed(9000), so a fresh top-to-bottom run is unchanged.
split_rng = np.random.RandomState(9000)
train_test_split = split_rng.uniform(0, 1, (X_SLICES, Y_SLICES))

# Cells below TEST_THRESHOLD are test cells; cells in
# [TEST_THRESHOLD, VALIDATION_THRESHOLD) are validation cells.
geographic_test_mask = train_test_split < TEST_THRESHOLD
geographic_validation_mask = (train_test_split >= TEST_THRESHOLD) & (train_test_split < VALIDATION_THRESHOLD)

In [3]:
# Load the node table (one row per graph node) from GeoJSON.
nodes_geojson_path = "../data/graph/accident_flow_nodes.geojson"
df_nodes = gpd.read_file(nodes_geojson_path)

In [4]:
# Bin node coordinates into equal-width intervals; labels=False stores the
# integer bin number, which is later used to index the grid masks.
slice_spec = {
    "x_slice": ("x", X_SLICES),
    "y_slice": ("y", Y_SLICES),
}
for slice_column, (coord_column, n_bins) in slice_spec.items():
    df_nodes[slice_column] = pd.cut(df_nodes[coord_column], n_bins, labels=False)

In [26]:
# Look up each node's grid cell in the per-cell masks to get per-node flags.
node_x_slice = df_nodes["x_slice"]
node_y_slice = df_nodes["y_slice"]

test_mask = geographic_test_mask[node_x_slice, node_y_slice]
validation_mask = geographic_validation_mask[node_x_slice, node_y_slice]
# A node is a training node when it is neither a test nor a validation node.
train_mask = ~test_mask & ~validation_mask

# One boolean column per split, one row per node (same order as df_nodes).
splits = pd.DataFrame({
    split_name: node_mask.flatten()
    for split_name, node_mask in (
        ("test", test_mask),
        ("validation", validation_mask),
        ("train", train_mask),
    )
})

In [29]:
# Persist the per-node split flags; the row index is written as the first column.
splits.to_csv("../data/OSM/graphs/splits.csv", index=True)

In [8]:
# One-hot encode the node "highway" tag using the fixed NODE_CATEGORIES list,
# so the output always has one column per listed category, in that order.
# The encoder expects a 2-D input, hence the reshape to a single column.
node_highway_column = df_nodes["highway"].values.reshape(-1, 1)

node_category_encoder = OneHotEncoder(
    categories=[NODE_CATEGORIES],
    handle_unknown="infrequent_if_exist",
    sparse_output=False,
)
oneHot = node_category_encoder.fit_transform(node_highway_column)

In [9]:
# Remove the raw columns that are not exported ("highway" has already been
# one-hot encoded above).
df_nodes = df_nodes.drop(["highway", "ref", "geometry", "street_count"], axis="columns")

In [10]:
# Attach the one-hot columns to the node table; the join aligns on the row index.
node_category_flags = pd.DataFrame(oneHot, columns=NODE_CATEGORIES)
df_nodes = df_nodes.join(node_category_flags)

In [11]:
# Keep the current row index as "old_index", then re-index the table by
# "osmid" so rows can be looked up by OSM node id.
df_nodes = df_nodes.assign(old_index=df_nodes.index).set_index("osmid")

In [12]:
# Export the node tables. Rows are written in df_nodes order and without the
# osmid index, so row k of every file refers to the same node.
df_nodes[["x", "y"]].to_csv("../data/OSM/graphs/pos.csv", index=False)

# NOTE: splits.csv is deliberately not written here. No visible cell adds
# "test"/"validation" columns to df_nodes, so selecting them would raise a
# KeyError on a fresh run; and if it did succeed it would overwrite the
# splits.csv exported from the `splits` frame with a different layout
# (no "train" column, no index column).

df_nodes[[
    "accident_score",
    "traffic_flow"
] + NODE_CATEGORIES].to_csv("../data/OSM/graphs/nodes.csv", index=False)

In [13]:
# Stream the edge GeoJSON one line at a time and collect one record per edge:
# [from, to, lanes, highway, maxspeed, length, oneway], where from/to are the
# positional node indices ("old_index") of the edge's u/v OSM node ids.

# Build the osmid -> old_index lookup once. Doing df_nodes.loc[id]["old_index"]
# inside the loop materialises a whole row per lookup (twice per edge), which is
# very slow on millions of edges and also turns the index into a float.
if not df_nodes.index.is_unique:
    raise ValueError("df_nodes has duplicate osmid values; cannot map edge endpoints to node rows")
old_index_by_osmid = df_nodes["old_index"].to_dict()

edges = []

with open(c.EDGE_GDF_PATH) as f:
    for i, line in enumerate(f, start=1):
        # The first four lines are skipped -- presumably the FeatureCollection
        # header preceding the one-feature-per-line list; verify against the file.
        if i <= 4:
            continue
        # Each feature line ends with a separating comma, except the last one.
        stripped_line = line.strip().removesuffix(",")
        # A lone "]" closes the feature list: nothing left to read.
        if stripped_line == "]":
            break
        prop = json.loads(stripped_line)["properties"]
        # A KeyError here means the edge references a node id missing from df_nodes.
        from_node = old_index_by_osmid[prop["u"]]
        to_node = old_index_by_osmid[prop["v"]]

        edges.append([from_node, to_node, prop["lanes"], prop["highway"], prop["maxspeed"], prop["length"], prop["oneway"]])

        if i % 10_000 == 0:
            print(f"Processed {i} lines", end="\r")

Processed 4120000 lines

In [14]:
# Turn the list of edge records into a table with named columns.
edge_columns = ["from", "to", "lanes", "highway", "maxspeed", "length", "oneway"]
edges = pd.DataFrame(data=edges, columns=edge_columns)

In [16]:
# Type conversion: edge endpoints become integer node indices.
for endpoint_column in ("from", "to"):
    edges[endpoint_column] = edges[endpoint_column].astype(int)

# Edges without a lane count are treated as single-lane.
lanes_filled = edges["lanes"].fillna(1)
edges["lanes"] = lanes_filled.astype(int)

In [17]:
# Find nearest speed limit for edges with no speed limit
#
# Edges whose maxspeed is a placeholder ("signals" or "none") copy the speed of
# the nearest connected edge. The search starts with the edges that end at this
# edge's start node or start at its end node, and widens one ring at a time.
# Within a ring the first usable edge in table order wins.
#
# Compared with a plain "!= 'signals'" test this also rejects "none" and missing
# values as donors (either would break the float conversion below or stop the
# scan of a ring early), and it stops once a ring contains no edge that has not
# been examined yet, instead of looping forever when no usable speed is reachable.
UNUSABLE_SPEEDS = ["signals", "none"]
unresolved_count = 0

for i, edge in edges[edges["maxspeed"].isin(UNUSABLE_SPEEDS)].iterrows():
    new_speed = np.nan  # stays NaN if no usable speed can be reached
    examined = set()

    neighbouring_edges = edges[(edges["to"] == edge["from"]) | (edges["from"] == edge["to"])]

    while True:
        ring_speeds = neighbouring_edges["maxspeed"]
        usable_speeds = ring_speeds[ring_speeds.notna() & ~ring_speeds.isin(UNUSABLE_SPEEDS)]
        if not usable_speeds.empty:
            new_speed = usable_speeds.iloc[0]
            break

        # If this ring brought no new edges, widening it again cannot either:
        # the reachable part of the graph is exhausted.
        newly_examined = set(neighbouring_edges.index) - examined
        if not newly_examined:
            unresolved_count += 1
            break
        examined |= newly_examined

        neighbouring_edges = edges[edges["to"].isin(neighbouring_edges["from"]) | edges["from"].isin(neighbouring_edges["to"])]

    edges.loc[i, "maxspeed"] = new_speed

if unresolved_count:
    print(f"{unresolved_count} edges have no reachable speed limit; maxspeed left as NaN")

edges["maxspeed"] = edges["maxspeed"].astype(float)

In [18]:
# One hot encoding of edge categories
#
# The edge "highway" tag is encoded against the fixed EDGE_CATEGORIES list, so
# the output always has one column per listed category, in that order. The raw
# "highway" column is then replaced by those columns.
edge_category_encoder = OneHotEncoder(
    categories=[EDGE_CATEGORIES],
    handle_unknown="infrequent_if_exist",
    sparse_output=False,
)

# The encoder expects a 2-D input, hence the reshape to a single column.
edge_highway_column = edges["highway"].values.reshape(-1, 1)
oneHot = edge_category_encoder.fit_transform(edge_highway_column)

edge_category_flags = pd.DataFrame(oneHot, columns=EDGE_CATEGORIES)
# The join aligns on the row index.
edges = edges.drop(columns=["highway"]).join(edge_category_flags)

In [19]:
# Export the edge list and the edge attributes as two files with matching row
# order (row k of edges_attributes.csv describes row k of edges.csv).
edge_endpoints = edges[["from", "to"]]
edge_endpoints.to_csv("../data/OSM/graphs/edges.csv", index=False)

edge_attribute_columns = ["lanes", "maxspeed", "length", "oneway"] + EDGE_CATEGORIES
edges[edge_attribute_columns].to_csv("../data/OSM/graphs/edges_attributes.csv", index=False)