In [None]:
%pip install pyrosm networkx tqdm contextily folium

In [None]:
import math

import boto3
import folium
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import pyrosm
from tqdm import tqdm

In [None]:
# Notebook configuration: which city extract to use and where it lives in S3.
CITY_ID = 1_000_000
# File name of the OSM extract for CITY_ID.
MAP_FILE = f"{CITY_ID}-latest.osm.pbf"
S3_BUCKET = "some_bucket"
# Plain string: there is nothing to interpolate here, so no f-prefix.
S3_SUBDIR = "subdir_path"
S3_DATA = "data_path"

# Prepare a subgraph of Bucharest

## Extract Bucharest graph from OSM

In [None]:
# Download the OSM extract from S3 to a local file.
# The object key reuses MAP_FILE so the file name is defined in exactly one place.
s3 = boto3.client("s3")
s3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{MAP_FILE}", "bucharest.pbf")

In [None]:
# Parse the local extract and pull out the "driving+service" road network
# as two frames: its nodes and its edges.
osm = pyrosm.OSM("bucharest.pbf")
nodes, edges = osm.get_network(network_type="driving+service", nodes=True)
# Key every edge by its (u, v) pair.
edges["edge"] = [*zip(edges["u"], edges["v"])]

In [None]:
nodes.id.nunique(), edges.id.nunique()

In [None]:
nodes.head()

In [None]:
edges.head()

## Load Aggregated Tracking Data

In [None]:
from pyspark.sql.functions import col, to_date
# NOTE(review): `spark` is not defined in the cells shown here; presumably the
# notebook runtime provides a SparkSession under that name - confirm.
edge_speeds = spark.read.parquet(f"s3://{S3_BUCKET}/{S3_SUBDIR}/{S3_DATA}/full_with_lags.parquet")
# Derive a calendar date from the bucket timestamp so it can be filtered on below.
edge_speeds = edge_speeds.withColumn("date", to_date(col("minute_bucket")))
# You can write the data back partitioned this way for faster / more memory-efficient access during training.
edge_speeds = edge_speeds.repartition(col("minute_bucket"))

# Keep only 2023-07-03 .. 2023-08-06 (both ends inclusive) before converting to pandas.
speeds_df = edge_speeds.filter("date >= '2023-07-03' and date <= '2023-08-06'").toPandas()
# TODO: Add test set
speeds_df.head()

In [None]:
speeds_df.minute_bucket.min(), speeds_df.minute_bucket.max()

In [None]:
# Drop helper columns that are not needed downstream. Rebinding (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run
# after the columns are already gone.
speeds_df = speeds_df.drop(columns=["__index_level_0__", "date"], errors="ignore")

In [None]:
# Key every speed row by its (start_node, end_node) pair.
speeds_df["edge"] = [*zip(speeds_df["start_node"], speeds_df["end_node"])]

In [None]:
speeds_df.edge.nunique()

In [None]:
# Undirected graph whose edges are the distinct (start_node, end_node) pairs in the speed data.
bolt_data_graph = nx.Graph()
bolt_data_graph.add_edges_from(speeds_df["edge"].unique())
num_of_nodes = bolt_data_graph.number_of_nodes()

# Every node id that appears as an endpoint in the speed data.
node_ids = list(set(speeds_df["start_node"]) | set(speeds_df["end_node"]))

# Look up lat/lon for those ids in the OSM nodes and attach them as node attributes.
# .loc raises KeyError if an id from the speed data is absent from the OSM nodes.
node_rows = nodes.set_index("id").loc[node_ids]
node_dict = node_rows[["lat", "lon"]].T.to_dict()
bolt_data_graph.add_nodes_from(node_dict.items())
# Attaching attributes must not have added any node beyond the edge endpoints.
assert bolt_data_graph.number_of_nodes() == num_of_nodes

In [None]:
m = folium.Map(location=[44.435608, 26.102297], zoom_start=15)

# Draw every graph edge as a straight segment between its endpoints' (lat, lon).
node_data = bolt_data_graph.nodes(data=True)
for u, v in bolt_data_graph.edges:
    start = (node_data[u]["lat"], node_data[u]["lon"])
    end = (node_data[v]["lat"], node_data[v]["lon"])
    folium.PolyLine(locations=[start, end], color="blue").add_to(m)

m

In [None]:
bolt_data_graph.number_of_nodes(), bolt_data_graph.number_of_edges()

## Extract a k-hop neighbourhood around a node of interest

In [None]:
# Some popular node in the city centre of Bucharest
central_node_id = 2389982923
# Neighbourhood of the central node, limited to radius 20.
k_hop_subgraph = nx.ego_graph(
    bolt_data_graph,
    n=central_node_id,
    radius=20,
    undirected=True,
)

In [None]:
k_hop_subgraph.number_of_nodes(), k_hop_subgraph.number_of_edges()

In [None]:
k_hop_subgraph_reversed_edges = {(y, x) for (x, y) in k_hop_subgraph.edges}

In [None]:
# Keep the speed rows whose edge belongs to the subgraph, matched in either
# direction: the graph stores each edge once, the speed data is keyed by
# (start_node, end_node).
subgraph_edge_keys = set(k_hop_subgraph.edges) | k_hop_subgraph_reversed_edges
# .copy() makes this an independent frame, so columns can be added to it later
# without pandas' SettingWithCopyWarning.
subgraph_speeds_df = speeds_df[speeds_df.edge.isin(subgraph_edge_keys)].copy()
subgraph_speeds_df.shape

In [None]:
UNIQUE_EDGES = subgraph_speeds_df.edge.unique()
len(UNIQUE_EDGES)

In [None]:
m = folium.Map(location=[44.435608, 26.102297], zoom_start=15)

# Build the id -> (lat, lon) lookup once instead of scanning the whole `nodes`
# frame for every edge endpoint and every node. keep="first" mirrors the
# previous "first matching row" behaviour should an id appear more than once.
unique_nodes = nodes.drop_duplicates("id", keep="first")
lat_lon_by_id = dict(zip(unique_nodes["id"], zip(unique_nodes["lat"], unique_nodes["lon"])))

# Add edges to the map
for u, v in k_hop_subgraph.edges():
    folium.PolyLine(locations=[lat_lon_by_id[u], lat_lon_by_id[v]], color="blue", weight=5).add_to(m)

# Add nodes to the map
for node in k_hop_subgraph.nodes():
    folium.CircleMarker(location=lat_lon_by_id[node], radius=5, color="red", fill=True, fill_color="red").add_to(m)

m

# Prepare the dataset for a GNN

## Install and import PyTorch libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
import torch
!pip install -q torch-scatter~=2.1.0 torch-sparse~=0.6.16 torch-cluster~=1.6.0 torch-spline-conv~=1.2.1 torch-geometric==2.2.0 -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install -q torch-geometric-temporal==0.54.0

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
from torch import nn
from torch_geometric.nn import GATv2Conv as GAT
from torch_geometric_temporal.signal import StaticGraphTemporalSignal, temporal_signal_split

In [None]:
# Calendar features derived from the bucket timestamp. assign() returns a new
# frame, which avoids SettingWithCopyWarning when subgraph_speeds_df is a
# filtered slice of speeds_df, and keeps the cell idempotent.
# Note: "day" holds the weekday index, not the day of the month.
subgraph_speeds_df = subgraph_speeds_df.assign(
    day=subgraph_speeds_df.minute_bucket.dt.weekday,
    hour=subgraph_speeds_df.minute_bucket.dt.hour,
    minute=subgraph_speeds_df.minute_bucket.dt.minute,
)

In [None]:
# Training rows: every bucket up to one week before the latest bucket,
# i.e. the final week is held out.
train_cutoff = subgraph_speeds_df.minute_bucket.max() - pd.Timedelta(1, 'W')
train_subgraph_speeds_df = subgraph_speeds_df[subgraph_speeds_df.minute_bucket <= train_cutoff]

In [None]:
# Fallback statistics, all computed on the training rows only.

# Mean speed over every training row.
MEAN_SPEED = train_subgraph_speeds_df["speed_kmh"].mean()

# Mean speed per edge; astype(int) truncates each mean to a whole number.
# NOTE(review): confirm the integer truncation is intended.
EDGE_AVG_DICT = (
    train_subgraph_speeds_df
    .groupby("edge")["speed_kmh"]
    .mean()
    .astype(int)
    .to_dict()
)

# Mean speed keyed by (edge, day, hour, minute).
EDGE_15_MIN_BUCKET_DICT = (
    train_subgraph_speeds_df
    .groupby(["edge", "day", "hour", "minute"])["speed_kmh"]
    .mean()
    .to_dict()
)

In [None]:
# 15-minute timestamp grid spanning the training rows (both ends included).
TRAIN_DATE_RANGE = pd.date_range(
    start=train_subgraph_speeds_df.minute_bucket.min(),
    end=train_subgraph_speeds_df.minute_bucket.max(),
    freq="15min",
)
# Grid for the held-out tail; inclusive="right" leaves out the last training
# bucket so the two grids do not overlap.
VALID_DATE_RANGE = pd.date_range(
    start=train_subgraph_speeds_df.minute_bucket.max(),
    end=subgraph_speeds_df.minute_bucket.max(),
    freq="15min",
    inclusive="right",
)
len(TRAIN_DATE_RANGE), len(VALID_DATE_RANGE)

In [None]:
# Two-way lookup between an edge and its position in UNIQUE_EDGES.
idx_edge_map = dict(enumerate(UNIQUE_EDGES))
edge_idx_map = {edge: idx for idx, edge in idx_edge_map.items()}

In [None]:
# Speed columns used as model inputs. Going by the column names: the bucket's
# own speed, then its 15/30/45/60-minute lags and its 1/2/3-week lags.
SPEED_FEATURES = [
    "speed_kmh",
    "speed_kmh_lag_15_m",
    "speed_kmh_lag_30_m",
    "speed_kmh_lag_45_m",
    "speed_kmh_lag_60_m",
    "speed_kmh_lag_1_W",
    "speed_kmh_lag_2_W",
    "speed_kmh_lag_3_W",
]

## Imputation of missing values with the per-edge mean speed (global mean as fallback)

In [None]:
# Build one snapshot per minute bucket:
#   xs[t][e] = the SPEED_FEATURES values of edge e at bucket t
#   ys[t][e] = speed_kmh of edge e at bucket t
# A value that is missing (edge absent from the bucket) or NaN is replaced by
# the edge's training mean, or by the global training mean when the edge has none.
# NOTE(review): "speed_kmh" is both the first input feature and the target, so
# the model sees the value it is asked to predict - confirm this is intended.
xs = []
ys = []
for minute_bucket, minute_bucket_group in tqdm(subgraph_speeds_df.groupby("minute_bucket")):
    # {feature: {edge: value}} for this bucket.
    edge_dict = minute_bucket_group[["edge"] + SPEED_FEATURES].set_index("edge").to_dict()
    measurements = []
    targets = []
    for edge in UNIQUE_EDGES:
        fallback = EDGE_AVG_DICT.get(edge, MEAN_SPEED)
        row = []
        for feature in SPEED_FEATURES:
            # Named `value` (not `col`) so pyspark's `col` import is not rebound.
            value = edge_dict[feature].get(edge, fallback)
            row.append(fallback if math.isnan(value) else value)
        measurements.append(row)
        target = edge_dict["speed_kmh"].get(edge, fallback)
        # Test the target itself for NaN; the previous check looked at the last
        # (already imputed) feature value, so NaN targets were never replaced.
        targets.append(fallback if math.isnan(target) else target)
    xs.append(measurements)
    ys.append(targets)
xs = np.array(xs, dtype=np.float32)
ys = np.array(ys, dtype=np.float32)

In [None]:
xs.shape, ys.shape

## Split the dataset

In [None]:
# Model graph: one node per road edge; two road edges are connected when they
# share at least one endpoint. Every road edge shares endpoints with itself, so
# the diagonal is set as well.
edge_node_sets = [set(edge) for edge in UNIQUE_EDGES]  # built once, not once per pair
num_edges = len(edge_node_sets)
adjacency_matrix = np.zeros((num_edges, num_edges), dtype=np.float32)

# The relation is symmetric, so visiting the upper triangle is enough.
for i in range(num_edges):
    for j in range(i, num_edges):
        if edge_node_sets[i] & edge_node_sets[j]:
            adjacency_matrix[i, j] = 1
            adjacency_matrix[j, i] = 1

# (2, E) integer array holding the row and column index of every non-zero entry.
edge_index = np.stack(adjacency_matrix.nonzero())

In [None]:
dataset = StaticGraphTemporalSignal(edge_index, adjacency_matrix[adjacency_matrix>0], xs, ys)
dataset[0]

In [None]:
train_dataset, valid_dataset = temporal_signal_split(dataset, train_ratio=4/5)
train_dataset.snapshot_count, valid_dataset.snapshot_count

# Create a Dummy GNN model

In [None]:
class DummyGNN(nn.Module):
    """Small baseline network: Linear -> ReLU -> GATv2 conv -> Linear -> ReLU -> Linear.

    Args:
        num_nodes: Number of graph nodes. Accepted for interface compatibility;
            none of the layers depends on it.
        in_channels: Number of input features per node.
        hidden_channels: Width of every hidden layer.
        out_channels: Number of outputs per node.
        edge_dim: Number of features per graph edge fed to the attention layer.
            Defaults to 1, i.e. one scalar weight per edge.
    """

    def __init__(self, num_nodes, in_channels, hidden_channels, out_channels, edge_dim=1):
        super().__init__()
        self.lin1 = nn.Linear(in_channels, hidden_channels)
        self.relu1 = nn.ReLU()
        # edge_dim must match the width of the edge features passed to forward();
        # it was previously tied to hidden_channels while edge_attr was ignored.
        self.gat = GAT(hidden_channels, hidden_channels, edge_dim=edge_dim)
        self.lin2 = nn.Linear(hidden_channels, hidden_channels)
        self.relu2 = nn.ReLU()
        self.lin3 = nn.Linear(hidden_channels, out_channels)

    def forward(self, edge_index, edge_attr, x):
        """Return one ``out_channels``-wide prediction per node.

        Args:
            edge_index: Graph connectivity passed to the attention layer.
            edge_attr: Per-edge features, or ``None``. A 1-D tensor is treated
                as one scalar feature per edge.
            x: Node feature matrix whose last dimension is ``in_channels``.
        """
        if edge_attr is not None and edge_attr.dim() == 1:
            # One scalar per edge -> column vector of width 1.
            edge_attr = edge_attr.unsqueeze(-1)
        hidden_state = self.relu1(self.lin1(x))
        hidden_state = self.gat(hidden_state, edge_index, edge_attr)
        hidden_state = self.relu2(self.lin2(hidden_state))
        return self.lin3(hidden_state)

# Training

In [None]:
def plot_curves(losses):
    """Plot the training and validation loss per epoch.

    Args:
        losses: Sequence with one ``(train_loss, validation_loss)`` pair per
            epoch, in epoch order. May be empty.
    """
    # reshape(-1, 2) also yields an empty (0, 2) array when no epoch was recorded.
    curves = np.asarray(losses, dtype=float).reshape(-1, 2)
    epochs = range(1, len(curves) + 1)  # epochs are numbered from 1
    # One plot call per curve so each line gets its own legend entry, instead
    # of relying on matplotlib splitting a list passed as `label`.
    plt.plot(epochs, curves[:, 0], label="Train")
    plt.plot(epochs, curves[:, 1], label="Validation")
    plt.legend()
    plt.xlabel("Epochs")
    plt.ylabel("Mean Squared Error")
    plt.title("Learning curves for a simple GNN")
    plt.show()

def train(train_dataset, valid_dataset, epochs=10):
    """Train a DummyGNN with one full-batch gradient step per epoch.

    Each epoch averages the MSE over every training snapshot, takes a single
    optimiser step on that average, then measures the average MSE over the
    validation snapshots. The learning curves are plotted at the end.

    Args:
        train_dataset: Iterable of snapshots exposing ``edge_index``,
            ``edge_attr``, ``x`` and ``y``, plus a ``snapshot_count`` attribute.
        valid_dataset: Same structure, used for validation only.
        epochs: Maximum number of epochs.

    Returns:
        The trained model, left in eval mode.
    """
    # TODO: Add data imputation here

    model = DummyGNN(len(UNIQUE_EDGES), len(SPEED_FEATURES), 32, 1)
    optimiser = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)
    mse = torch.nn.MSELoss()

    losses = []
    for epoch in range(1, epochs + 1):
        model.train()
        optimiser.zero_grad()
        loss = 0
        for snapshot in train_dataset:
            y_pred = model(snapshot.edge_index, snapshot.edge_attr, snapshot.x)
            loss += mse(y_pred.flatten(), snapshot.y)
        loss /= train_dataset.snapshot_count

        loss.backward()
        optimiser.step()

        model.eval()
        val_loss = 0
        # Validation needs no gradients; no_grad avoids building an autograd
        # graph for every validation snapshot.
        with torch.no_grad():
            for snapshot in valid_dataset:
                y_pred = model(snapshot.edge_index, snapshot.edge_attr, snapshot.x)
                val_loss += mse(y_pred.flatten(), snapshot.y)
        val_loss /= valid_dataset.snapshot_count

        # Plain floats: nothing downstream keeps the autograd graph alive.
        train_loss_value = float(loss)
        val_loss_value = float(val_loss)

        if epoch % 5 == 0:
            print(f"Epoch {epoch:>2} | Train MSE: {train_loss_value:.4f} | Valid MSE: {val_loss_value:.4f}")

        losses.append((train_loss_value, val_loss_value))

        # Stops as soon as train and validation MSE are within 0.01 of each other.
        # NOTE(review): this is a train/validation gap criterion, not the usual
        # "validation loss stopped improving" rule - confirm it is intended.
        if abs(train_loss_value - val_loss_value) < 0.01:
            print(f"Triggered early stopping on epoch: {epoch}")
            break

    plot_curves(losses)
    return model

In [None]:
# TODO: add evaluation on the test set
# Different 

## Training with data imputation by edge mean speed

In [None]:
model = train(train_dataset, valid_dataset, epochs=200)

In [None]:
# Save the trained model object to a local file, then upload that file to the
# models/ prefix under S3_SUBDIR in S3_BUCKET.
torch.save(model, "gnn_k20_5weeks_200epochs.pth")
s3.upload_file("gnn_k20_5weeks_200epochs.pth", S3_BUCKET, f"{S3_SUBDIR}/models/gnn_k20_5weeks_200epochs.pth")

# Naive baselines

In [None]:
def evaluate_global_mean_baseline(dataset):
    """MSE of predicting ``MEAN_SPEED`` for every edge in every snapshot.

    Args:
        dataset: Iterable of snapshots with a ``y`` tensor, plus a
            ``snapshot_count`` attribute.

    Returns:
        Summed squared error divided by ``snapshot_count * len(UNIQUE_EDGES)``.
    """
    squared_error_total = sum(
        ((MEAN_SPEED - snapshot.y) ** 2).sum() for snapshot in dataset
    )
    return squared_error_total / (dataset.snapshot_count * len(UNIQUE_EDGES))

def evaluate_edge_average_baseline(dataset):
    """MSE of predicting each edge's training mean for every snapshot.

    Edges without an entry in ``EDGE_AVG_DICT`` fall back to ``MEAN_SPEED``.

    Args:
        dataset: Iterable of snapshots whose ``y`` tensor is ordered like
            ``UNIQUE_EDGES``, plus a ``snapshot_count`` attribute.

    Returns:
        Summed squared error divided by ``snapshot_count * len(UNIQUE_EDGES)``.
    """
    # The prediction for an edge is the same in every snapshot, so build the
    # prediction vector once and compare whole tensors instead of looping over
    # individual elements in Python.
    predictions = torch.tensor(
        [EDGE_AVG_DICT.get(edge, MEAN_SPEED) for edge in UNIQUE_EDGES],
        dtype=torch.float32,
    )
    mse = 0
    for snapshot in dataset:
        mse += ((predictions - snapshot.y) ** 2).sum()
    mse /= dataset.snapshot_count * len(UNIQUE_EDGES)
    return mse

def edge_time_naive(edge, timestamp):
    """Naive speed estimate for ``edge`` at ``timestamp``.

    Looks up ``(edge, weekday, hour, minute)`` in ``EDGE_15_MIN_BUCKET_DICT``.
    If that key is missing, falls back to the edge's entry in
    ``EDGE_AVG_DICT``, and finally to ``MEAN_SPEED``.

    Args:
        edge: Edge key.
        timestamp: Object providing ``weekday()``, ``hour`` and ``minute``.
    """
    bucket_key = (edge, timestamp.weekday(), timestamp.hour, timestamp.minute)
    if bucket_key in EDGE_15_MIN_BUCKET_DICT:
        return EDGE_15_MIN_BUCKET_DICT[bucket_key]
    return EDGE_AVG_DICT.get(edge, MEAN_SPEED)

def evaluate_edge_time_average_baseline(dataset, date_range):
    """MSE of the ``edge_time_naive`` baseline over paired timestamps and snapshots.

    The i-th timestamp of ``date_range`` is paired with the i-th snapshot of
    ``dataset``; if one is longer than the other, the surplus is not evaluated.
    NOTE(review): this assumes the snapshots are in the same order and spacing
    as ``date_range`` - confirm against how the dataset was built.

    Args:
        dataset: Iterable of snapshots whose ``y`` tensor is ordered like
            ``UNIQUE_EDGES``.
        date_range: Iterable of timestamps accepted by ``edge_time_naive``.

    Returns:
        Summed squared error divided by the number of (snapshot, edge) pairs
        that were actually evaluated.

    Raises:
        ZeroDivisionError: If no (timestamp, snapshot) pair could be formed.
    """
    squared_error = 0
    evaluated_snapshots = 0
    for timestamp, snapshot in zip(date_range, dataset):
        # Build the whole prediction vector for this timestamp, then compare
        # tensors instead of looping over individual elements.
        predictions = snapshot.y.new_tensor(
            [edge_time_naive(edge, timestamp) for edge in UNIQUE_EDGES]
        )
        squared_error += ((predictions - snapshot.y) ** 2).sum()
        evaluated_snapshots += 1
    # Divide by what was evaluated: zip() stops at the shorter input, so using
    # dataset.snapshot_count here would understate the error on a length mismatch.
    return squared_error / (evaluated_snapshots * len(UNIQUE_EDGES))

## Baselines with edge average imputation

### Global average baseline

In [None]:
evaluate_global_mean_baseline(train_dataset), evaluate_global_mean_baseline(valid_dataset)

### Edge average baseline

In [None]:
evaluate_edge_average_baseline(train_dataset), evaluate_edge_average_baseline(valid_dataset)

### Baseline with average per edge, weekday, hour and 15-minute interval

In [None]:
evaluate_edge_time_average_baseline(train_dataset, TRAIN_DATE_RANGE), evaluate_edge_time_average_baseline(valid_dataset, VALID_DATE_RANGE)

# Inference

In [None]:
def plot_edge_time_series(speeds_df, edge):
    """Plot observed, GNN-predicted and naive-baseline speeds for one edge.

    The observed series is every row of ``speeds_df`` for ``edge``. The two
    prediction series are drawn against ``VALID_DATE_RANGE``. Relies on the
    notebook-level ``model``, ``valid_dataset``, ``edge_idx_map`` and
    ``VALID_DATE_RANGE``.
    NOTE(review): plotting predictions against ``VALID_DATE_RANGE`` assumes one
    validation snapshot per timestamp, in the same order - confirm.

    Args:
        speeds_df: Frame with ``edge``, ``minute_bucket`` and ``speed_kmh`` columns.
        edge: Edge key; must be present in ``edge_idx_map``.
    """
    one_edge_df = speeds_df[speeds_df.edge == edge][["minute_bucket", "speed_kmh"]].sort_values("minute_bucket")

    # Inference only: eval mode, and no autograd graph per snapshot.
    edge_position = edge_idx_map[edge]
    model.eval()
    edge_predictions = []
    with torch.no_grad():
        for snapshot in valid_dataset:
            y_pred = model(snapshot.edge_index, snapshot.edge_attr, snapshot.x)
            edge_predictions.append(y_pred.detach().numpy()[edge_position])
    edge_predictions = np.array(edge_predictions).reshape(-1)

    fig = go.Figure()

    # Add traces to the figure; their order must match `trace_names` below.
    fig.add_trace(go.Scatter(
        x=one_edge_df.minute_bucket,
        y=one_edge_df.speed_kmh,
        mode='markers',
        name='Ground Truth'
    ))
    fig.add_trace(go.Scatter(
        x=VALID_DATE_RANGE,
        y=edge_predictions,
        mode='markers',
        name='GNN predictions'
    ))
    fig.add_trace(go.Scatter(
        x=VALID_DATE_RANGE,
        y=[edge_time_naive(edge, ts) for ts in VALID_DATE_RANGE],
        mode='markers',
        name='Naive predictions'
    ))

    # Dropdown with one entry per trace plus "All". Each entry keeps the edge
    # in the title rather than replacing it with a generic "Trace N" label.
    base_title = f"Time series for edge {edge}"
    trace_names = ["Ground Truth", "GNN predictions", "Naive predictions"]
    buttons = [
        dict(
            label=name,
            method="update",
            args=[
                {"visible": [position == shown for position in range(len(trace_names))]},
                {"title": f"{base_title}: {name}"},
            ],
        )
        for shown, name in enumerate(trace_names)
    ]
    buttons.append(
        dict(
            label="All",
            method="update",
            args=[{"visible": [True] * len(trace_names)}, {"title": base_title}],
        )
    )

    fig.update_layout(
        title=base_title,
        title_x=0.5,
        xaxis=dict(
            title="Time [15-minute bucket]"
        ),
        yaxis=dict(
            title="Speed [km/h]"
        ),
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.1,
                xanchor="left",
                y=1.1,
                yanchor="top"
            ),
        ],
        # Horizontal legend above the plot area, right-aligned.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    fig.show()


def plot_edge(edge_id):
    """Return a folium map showing one edge and its two endpoints.

    Args:
        edge_id: ``(u, v)`` key as stored in the ``edge`` column of ``edges``.

    Returns:
        A ``folium.Map`` centred on the edge's first endpoint.

    Raises:
        ValueError: If ``edge_id`` is not present in ``edges``.
    """
    matches = edges[edges.edge == edge_id]
    if matches.empty:
        raise ValueError(f"Edge {edge_id} not found in the OSM edges")
    # Several rows may carry the same (u, v) key; they share both endpoints by
    # construction, so the first row is enough.
    u, v = matches[["u", "v"]].iloc[0]

    def lat_lon(node_id):
        # Coordinates of the first `nodes` row with this id.
        lat, lon = nodes[nodes["id"] == node_id][["lat", "lon"]].iloc[0]
        return float(lat), float(lon)

    # Look each endpoint up once and reuse it for the map centre, line and markers.
    start, end = lat_lon(u), lat_lon(v)
    m = folium.Map(location=start, zoom_start=20)

    # Add the edge to the map
    folium.PolyLine(locations=[start, end], color='red', weight=5).add_to(m)

    # Add its endpoints to the map
    for point in (start, end):
        folium.CircleMarker(location=point, radius=5, color='red', fill=True, fill_color='red').add_to(m)

    return m

## Imputation with edge average

In [None]:
import random

In [None]:
random.seed(123)
edge_sample = random.choices(UNIQUE_EDGES, k=5)
edge_sample

In [None]:
plot_edge_time_series(subgraph_speeds_df, (2160093525, 2351320132))

In [None]:
MEAN_SPEED, EDGE_AVG_DICT[(248729658, 6169982502)]

In [None]:
plot_edge_time_series(subgraph_speeds_df, (248729658, 6169982502))

In [None]:
plot_edge_time_series(subgraph_speeds_df, (2351321583, 2160093493))

In [None]:
plot_edge_time_series(subgraph_speeds_df, (257717273, 647171071))

In [None]:
plot_edge_time_series(subgraph_speeds_df, (6936490726, 647171071))