# Tabular Dataset Preprocessing
### Building the tabular dataset
*Written by - Rasmus Bergman rbvp20@student.aau.dk*

This is a preprocessing pipeline for making a dataset which can be used to do tabular data analysis
on the accident data from vejman.dk.

### **Prerequisites**
- Running mastra.ipynb
- Running vejman.ipynb

In [1]:
# Import libraries
import geopandas as gpd
import os
import osmnx as ox
import constants as c
import xml.etree.ElementTree as ET
from shapely.geometry import LineString
import pandas as pd

# OSMnx settings: keep a local cache of requests and send the library log to the console
ox.settings.use_cache = True
ox.settings.console_log = True

# Parameters
# Upper bound, in metres, used as max_distance when pairing traffic signals with intersections
MAX_DISTANCE_TO_SIGNAL_METERS = 25

In [2]:
# Read the accident records exported from vejman.dk
accidents = gpd.read_file(c.VEJMAN_PATH)

# Read the mastra traffic counts and keep only the motor-vehicle rows ("MOTORKTJ")
traffic_flow = gpd.read_file(c.MASTRA_PATH)
is_motor_vehicle = traffic_flow["KOERETOEJSART"] == "MOTORKTJ"
traffic_flow = traffic_flow[is_motor_vehicle]

# Keep an accident only when both of its roads have a traffic count
vej1_has_count = accidents.VEJ1.isin(traffic_flow.VEJNR)
vej2_has_count = accidents.VEJ2.isin(traffic_flow.VEJNR)
accidents = accidents[vej1_has_count & vej2_has_count]

# Traffic counts whose road is missing from VEJ1 or missing from VEJ2 of the remaining accidents
# NOTE(review): a road that appears on only one side of an accident still ends up here - confirm intended
seen_as_vej1 = traffic_flow.VEJNR.isin(accidents.VEJ1)
seen_as_vej2 = traffic_flow.VEJNR.isin(accidents.VEJ2)
traffic_flow_unacounted = traffic_flow[~(seen_as_vej1 & seen_as_vej2)]


In [3]:
# Read the road network graph stored at c.SIMPLE_ROAD_NETWORK_PATH and project it to EPSG:25832
G = ox.project_graph(
    ox.load_graphml(c.SIMPLE_ROAD_NETWORK_PATH),
    to_crs="EPSG:25832",
)

In [4]:
# Reproject the traffic counts to the graph's CRS, then find the closest graph edge for every count point
traffic_flow_unacounted = traffic_flow_unacounted.to_crs("EPSG:25832")
count_points = traffic_flow_unacounted.geometry
nearest_edges = ox.nearest_edges(G, count_points.x, count_points.y)

In [5]:
# Replace each traffic count's geometry with the geometry of its nearest edge;
# an edge that carries no "geometry" attribute contributes None
new_geometry = [G.edges[edge_key].get("geometry") for edge_key in nearest_edges]
traffic_flow_unacounted["geometry"] = new_geometry

# Widen every road line by 10 CRS units so that it becomes a polygon around the road
buffered_roads = traffic_flow_unacounted["geometry"].buffer(10)
traffic_flow_unacounted["geometry"] = buffered_roads
traffic_flow_unacounted.crs = "EPSG:25832"

In [6]:
# Join the buffered roads with themselves: every pair of overlapping buffers becomes one row
unacounted_intersections = gpd.sjoin(traffic_flow_unacounted, traffic_flow_unacounted)

# Discard the rows in which a road was matched with itself
# NOTE(review): a self-join presumably reports each crossing in both orders (A-B and B-A) - confirm that is wanted
is_distinct_pair = unacounted_intersections.index != unacounted_intersections["index_right"]
unacounted_intersections = unacounted_intersections[is_distinct_pair]

print(len(traffic_flow_unacounted))
print(len(unacounted_intersections))


17951
21140


In [16]:
# For every joined row, look up the geometry of its right-hand road by index label
# and buffer it by 10 CRS units
# NOTE(review): these geometries were already buffered by 10 when the road polygons were built -
# confirm the additional buffer is intended
road_polygons = traffic_flow_unacounted["geometry"]
partner_geometries = [road_polygons[label] for label in unacounted_intersections["index_right"]]
second_geom = gpd.GeoSeries(
    partner_geometries,
    index=unacounted_intersections.index,
    crs="EPSG:25832",
).buffer(10)


In [18]:

# Overlap between each road buffer and the buffer of the road it was joined with
intersection_geometry = unacounted_intersections["geometry"].intersection(second_geom)
unacounted_intersections["geometry"] = intersection_geometry

# Drop the pairs whose buffers do not overlap. The .copy() makes the result an
# independent frame, so the column writes below do not target a slice.
has_overlap = ~unacounted_intersections["geometry"].is_empty
unacounted_intersections = unacounted_intersections[has_overlap].copy()

# Split multi-part overlaps into one row per part. explode() returns a new frame
# instead of modifying in place, so its result must be assigned. It also repeats
# the index label of every row it splits, hence the renumbering.
unacounted_intersections = unacounted_intersections.explode(index_parts=False).reset_index(drop=True)

# Reduce every overlap polygon to a single point
unacounted_intersections["geometry"] = unacounted_intersections["geometry"].centroid

# These candidates carry no recorded accidents
unacounted_intersections["KRYDS_UHELD"] = 0

# Map the sjoin "_left"/"_right" suffixes onto the VEJ1/VEJ2 naming and keep only the model columns
unacounted_intersections = unacounted_intersections.rename(
    columns={
        "AADT_left": "VEJ1_AADT",
        "AADT_right": "VEJ2_AADT",
        "HAST_GRAENSE_left": "VEJ1_HAST_GRAENSE",
        "HAST_GRAENSE_right": "VEJ2_HAST_GRAENSE",
    }
)
unacounted_intersections = unacounted_intersections[
    ["KRYDS_UHELD", "VEJ1_AADT", "VEJ1_HAST_GRAENSE", "VEJ2_AADT", "VEJ2_HAST_GRAENSE", "geometry"]
]

In [20]:
# Count the accidents for every (VEJ1, VEJ2) road pair.
# Each pair takes the coordinates of its last accident; that should be good enough because
# accidents between the same two roads lie very close together.
pair_aggregations = {"X_KOORDINAT": "last", "Y_KOORDINAT": "last", "KRYDS_UHELD": "count"}
accidents_by_road_pair = accidents.groupby(["VEJ1", "VEJ2"])
accident_counts = accidents_by_road_pair.agg(pair_aggregations).reset_index()

In [21]:
# Attach AADT and speed limit for VEJ1 and VEJ2 to every accident count by
# looking the road ids up in the traffic flow data.
#
# The lookup table keeps the first traffic-flow row per VEJNR, i.e. the same row
# a first-match search would pick. Mapping whole columns replaces a per-row scan
# of traffic_flow and also creates the four columns when accident_counts is empty.
road_attributes = traffic_flow.drop_duplicates(subset="VEJNR").set_index("VEJNR")

for road_column in ("VEJ1", "VEJ2"):
    road_ids = accident_counts[road_column]
    accident_counts[f"{road_column}_AADT"] = road_ids.map(road_attributes["AADT"])
    accident_counts[f"{road_column}_HAST_GRAENSE"] = road_ids.map(road_attributes["HAST_GRAENSE"])

# Turn the X/Y coordinate columns into point geometries in EPSG:25832
accident_points = gpd.points_from_xy(
    accident_counts.X_KOORDINAT,
    accident_counts.Y_KOORDINAT,
    crs="EPSG:25832",
)
accounted_intersections = gpd.GeoDataFrame(accident_counts, geometry=accident_points)

# The coordinates now live in the geometry and the road ids are no longer needed
accounted_intersections = accounted_intersections.drop(columns=["X_KOORDINAT", "Y_KOORDINAT", "VEJ1", "VEJ2"])

In [22]:
# Remove the candidate intersections that already exist among the accounted ones.
#
# The candidates still carry the index of the spatial self-join, which repeats a
# label once per crossing found for the same road. Filtering on that index would
# discard every crossing of a road as soon as one of them is a duplicate, so each
# candidate gets its own label first.
unacounted_intersections = unacounted_intersections.reset_index(drop=True)

# A candidate within 10 CRS units of an accounted intersection is treated as the same intersection
duplicate_intersections = gpd.sjoin_nearest(
    unacounted_intersections,
    accounted_intersections,
    how="inner",
    max_distance=10,
)
is_duplicate = unacounted_intersections.index.isin(duplicate_intersections.index)
unacounted_intersections = unacounted_intersections[~is_duplicate]

# Stack both sets into one frame with a fresh 0..n-1 index
all_intersections = gpd.GeoDataFrame(
    pd.concat([accounted_intersections, unacounted_intersections], ignore_index=True),
    crs="EPSG:25832",
)

In [23]:
# Read the node table and keep the nodes whose "highway" value is "traffic_signals", reprojected to EPSG:25832
node_gdf = gpd.read_file(c.NODE_GDF_PATH)
is_traffic_signal = node_gdf["highway"] == "traffic_signals"
traffic_signal_features = node_gdf[is_traffic_signal].to_crs("EPSG:25832")

In [25]:
# Pair every traffic signal with its closest intersection, skipping signals whose closest
# intersection is further away than MAX_DISTANCE_TO_SIGNAL_METERS
traffic_signal_nearby = gpd.sjoin_nearest(
    traffic_signal_features,
    all_intersections,
    how="inner",
    max_distance=MAX_DISTANCE_TO_SIGNAL_METERS,
)

# Closest graph node and closest graph edge for every intersection point
intersection_points = all_intersections.geometry
nearest_nodes = ox.nearest_nodes(G, intersection_points.x, intersection_points.y)
nearest_edges = ox.nearest_edges(G, intersection_points.x, intersection_points.y)

# TRAFFIC_SIGNAL is True for the intersections that at least one signal was paired with above
# NOTE(review): an intersection within range of a signal that lies closer to another
# intersection stays False - confirm this is the intended rule
signalised_labels = traffic_signal_nearby["index_right"]
all_intersections["TRAFFIC_SIGNAL"] = all_intersections.index.isin(signalised_labels)

# JUNCTION: "junction" attribute of the nearest edge (describes roundabouts and other junction types);
# None when the edge does not have it
all_intersections["JUNCTION"] = [G.edges[edge_key].get("junction") for edge_key in nearest_edges]

# AMOUNT: "street_count" attribute of the nearest node, i.e. the number of streets meeting there;
# None when the node does not have it
all_intersections["AMOUNT"] = [G.nodes[node_id].get("street_count") for node_id in nearest_nodes]

In [26]:
# HIGHWAY: "highway" attribute of the nearest edge; None when the edge does not have it
all_intersections["HIGHWAY"] = [G.edges[edge_key].get("highway") for edge_key in nearest_edges]

In [32]:
# Store HIGHWAY and JUNCTION as plain strings (missing values become the text "None")
# NOTE(review): presumably needed because these attributes can hold lists, which GeoJSON cannot store - confirm
for column_name in ("HIGHWAY", "JUNCTION"):
    all_intersections[column_name] = all_intersections[column_name].astype(str)


In [33]:
# Write the finished intersection table to disk in GeoJSON format
output_path = c.TABULAR_INTERSECTIONS
all_intersections.to_file(output_path, driver="GeoJSON")