In [None]:
from collections import defaultdict
from functools import reduce
import os
import random

import boto3
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objs as go
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [None]:
# Notebook-wide configuration.
# NOTE(review): WEEKDAYS, HOURS, MINUTES, FALL_BACK_SPEED and S3_DATA are not referenced
# anywhere else in the visible notebook — confirm whether they are still needed.
WEEKDAYS = range(7)
HOURS = range(24)
MINUTES = [0, 15, 30, 45]
# Used to build the name of the OSM extract downloaded from S3 later in the notebook.
CITY_ID = 1_000_000
FALL_BACK_SPEED = 50

# S3 layout: predictions are read from {S3_SUBDIR}/{S3_PREDS}/{model}/{split}.parquet.
S3_BUCKET = "some_bucket"
# NOTE(review): the f-prefix has no placeholders, so this is a plain string.
S3_SUBDIR = f"subdir_path"
S3_DATA = "data"
S3_PREDS = "model_predictions"
# Each model name doubles as the local download directory and as the prefix of its
# "<model>_speed" prediction column.
MODELS = ["ridge_regression_317_edges_7_weeks", "lgbm_317_edges_7_weeks", "dummygnn_1000_epochs_64_hidden_channels_317_edges_7_weeks"]
DATA_SPLITS = ["train", "valid", "test"]
# Name of the ground-truth column in the prediction files.
LABEL = "speed_kmh"

# Shared S3 client used by the download helpers below.
s3 = boto3.client('s3')

# Downloading and matching model predictions

In [None]:
def merge_several_dataframes(df_list, on):
    """Merge a sequence of DataFrames into one, pairwise from left to right.

    Args:
        df_list: DataFrames to combine, in merge order.
        on: key (or list of keys) forwarded to ``pd.merge`` for every step.

    Returns:
        The accumulated merge result. ``how`` is not passed, so pandas' default
        join type applies. Overlapping non-key columns keep their name on the
        accumulated (left) side and get an ``_x`` suffix on the incoming side.
    """

    def _merge_pair(accumulated, incoming):
        return pd.merge(left=accumulated, right=incoming, on=on, suffixes=['', "_x"])

    return reduce(_merge_pair, df_list)


def load_and_merge():
    """Fetch every model's prediction files (when missing) and merge them per split.

    For each model in ``MODELS`` and each split in ``DATA_SPLITS`` the file
    ``{S3_SUBDIR}/{S3_PREDS}/{model}/{split}.parquet`` is downloaded from
    ``S3_BUCKET`` into a local directory named after the model. Files already on
    disk are reused, so a partially populated directory is completed rather than
    skipped.

    Returns:
        dict: split name -> DataFrame merged on ``start_node``, ``end_node`` and
        ``minute_bucket``. Each model's ``preds`` column is renamed to
        ``{model}_speed``; columns whose name starts with ``f"{LABEL}_"`` (the
        suffixed label duplicates created by the merge) are dropped.
    """
    for model in MODELS:
        # exist_ok: the directory may already exist but be empty or incomplete.
        os.makedirs(model, exist_ok=True)
        for split in DATA_SPLITS:
            local_path = f"{model}/{split}.parquet"
            if not os.path.exists(local_path):
                s3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{S3_PREDS}/{model}/{split}.parquet", local_path)

    datasets_dict = {}
    for split in DATA_SPLITS:
        df_list = [
            pd.read_parquet(f"{m}/{split}.parquet")
            .reset_index(drop=True)
            .rename(columns={"preds": f"{m}_speed"})
            for m in MODELS
        ]
        merged_df = merge_several_dataframes(df_list, on=["start_node", "end_node", "minute_bucket"])
        duplicate_label_cols = [col for col in merged_df.columns if col.startswith(f"{LABEL}_")]
        datasets_dict[split] = merged_df.drop(columns=duplicate_label_cols)
    return datasets_dict

In [None]:
datasets_dict = load_and_merge()

In [None]:
datasets_dict["train"].head()

In [None]:
datasets_dict["train"].shape, datasets_dict["valid"].shape, datasets_dict["test"].shape

In [None]:
# Stack the three splits into one frame; each split keeps its own row index.
merged_df = pd.concat([datasets_dict[split_name] for split_name in ("train", "valid", "test")])

# Edge error histogram

In [None]:
def calculate_edge_errors(df, pred_col, error_metric, label_col=None):
    """Compute one error value per (start_node, end_node) edge.

    Args:
        df: frame with ``start_node``, ``end_node``, the label column and the
            ``{pred_col}_speed`` prediction column.
        pred_col: prefix of the prediction column (the ``_speed`` suffix is added here).
        error_metric: callable invoked as ``error_metric(y_true, y_pred)`` on each
            edge's rows, matching the argument order of the sklearn metrics and of
            ``calc_global_metrics``.
        label_col: ground-truth column name; defaults to the notebook's ``LABEL``.

    Returns:
        Series indexed by (start_node, end_node) holding the metric value per edge.
    """
    if label_col is None:
        label_col = LABEL
    prediction_col = f"{pred_col}_speed"
    return df.groupby(["start_node", "end_node"])[[prediction_col, label_col]].apply(
        # Ground truth first: the order matters for asymmetric metrics such as MAPE.
        lambda group_df: error_metric(group_df[label_col], group_df[prediction_col])
    )


def plot_edge_error_histograms(preds_df, figure_title=None):
    """Plot per-edge MAE and RMSE histograms for every model in ``MODELS``.

    The left panel shows MAE and the right panel RMSE. Within a panel all models
    share the same bin edges (10 evenly spaced edges between the smallest and
    largest per-edge value across models), so the step histograms are comparable.

    Args:
        preds_df: frame with ``start_node``, ``end_node``, the ``LABEL`` column and
            one ``{model}_speed`` column per model.
        figure_title: optional title passed to ``fig.suptitle``.
    """
    metrics = [
        ("MAE", mean_absolute_error),
        ("RMSE", lambda x, y: np.sqrt(mean_squared_error(x, y))),
    ]

    # Compute all per-edge errors before creating the figure.
    error_frames = []
    for metric_name, metric_func in metrics:
        suffix = metric_name.lower()
        per_model = [
            pd.DataFrame(calculate_edge_errors(preds_df, model, metric_func), columns=[f"{model}_{suffix}"])
            for model in MODELS
        ]
        error_frames.append(merge_several_dataframes(per_model, on=["start_node", "end_node"]))

    fig, axes = plt.subplots(1, 2, figsize=(20, 5))
    for ax, (metric_name, _), errors_df in zip(axes, metrics, error_frames):
        suffix = metric_name.lower()
        values = errors_df.to_numpy()
        bins = np.linspace(values.min(), values.max(), 10)
        for model in MODELS:
            errors_df[f"{model}_{suffix}"].hist(bins=bins, label=model, histtype="step", ax=ax)
        # Decorate each axis once, after all models are drawn.
        ax.legend()
        ax.set_title(f"{metric_name} per edge histogram")
        ax.set_xlabel(f"{metric_name} range")
        ax.set_ylabel("Edge count")
    fig.suptitle(figure_title)

In [None]:
plot_edge_error_histograms(datasets_dict["train"])

In [None]:
plot_edge_error_histograms(datasets_dict["valid"])

In [None]:
plot_edge_error_histograms(datasets_dict["test"])

# Compute error metrics

In [None]:
def calc_global_metrics(df):
    """Summarise each model's overall error on ``df``.

    Returns:
        DataFrame indexed by model name with columns MAE, MSE, RMSE and MAPE, each
        computed between the ``LABEL`` column and the model's ``{model}_speed`` column.
    """
    y_true = df[LABEL]
    rows = []
    for model in MODELS:
        y_pred = df[f"{model}_speed"]
        mse = mean_squared_error(y_true, y_pred)
        rows.append([
            mean_absolute_error(y_true, y_pred),
            mse,
            np.sqrt(mse),
            mean_absolute_percentage_error(y_true, y_pred),
        ])
    return pd.DataFrame(rows, columns=["MAE", "MSE", "RMSE", "MAPE"], index=MODELS)

In [None]:
calc_global_metrics(datasets_dict["train"])

In [None]:
calc_global_metrics(datasets_dict["valid"])

In [None]:
calc_global_metrics(datasets_dict["test"])

In [None]:
def plot_daily_and_hourly_errors(preds_df, fig_title=None):
    """Plot MAE and RMSE of every model aggregated by weekday and by hour.

    Produces a 2x2 grid: rows are the grouping (weekday, hour) and columns are the
    metric (MAE, RMSE). Weekday and hour are derived from ``minute_bucket`` via the
    ``.dt`` accessor.

    Args:
        preds_df: frame with a datetime ``minute_bucket`` column, the ``LABEL``
            column and one ``{model}_speed`` column per model. It is not modified.
        fig_title: optional figure title.
    """
    timestamps = preds_df.minute_bucket.dt
    preds_with_time = preds_df.assign(weekday=timestamps.weekday, hour=timestamps.hour)
    metrics = [
        ("MAE", mean_absolute_error),
        ("RMSE", lambda x, y: np.sqrt(mean_squared_error(x, y))),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    for i, time_col in enumerate(["weekday", "hour"]):
        grouped = preds_with_time.groupby(time_col)
        for j, (metric_name, metric_func) in enumerate(metrics):
            ax = axes[i, j]
            for model in MODELS:
                pred_col = f"{model}_speed"
                # Select only the needed columns so apply never sees the grouping
                # column (operating on it is deprecated in recent pandas).
                errors = grouped[[pred_col, LABEL]].apply(
                    lambda group, pred_col=pred_col, metric_func=metric_func: metric_func(group[LABEL], group[pred_col])
                )
                errors.plot(label=model, ax=ax)
            # Decorate each axis once, after all models are drawn.
            ax.legend()
            ax.set_ylabel(metric_name)
            ax.set_title(f"{metric_name} per {time_col}")
            ax.grid(True)
    fig.suptitle(fig_title, fontsize=20)
    plt.tight_layout()

In [None]:
plot_daily_and_hourly_errors(datasets_dict["train"], "Model errors aggregated by weekday and hour, training set")

In [None]:
plot_daily_and_hourly_errors(datasets_dict["valid"], "Model errors aggregated by weekday and hour, validation set")

In [None]:
plot_daily_and_hourly_errors(datasets_dict["test"], "Model errors aggregated by weekday and hour, testing set")

In [None]:
def plot_daily_or_hourly_errors(preds_df, col, fig_title=None):
    """Plot MAE and RMSE of every model aggregated by weekday or by hour.

    Args:
        preds_df: frame with a datetime ``minute_bucket`` column, the ``LABEL``
            column and one ``{model}_speed`` column per model. It is not modified.
        col: ``"weekday"`` or ``"hour"``; selects which component of
            ``minute_bucket`` the errors are grouped by.
        fig_title: optional figure title.

    Raises:
        ValueError: if ``col`` is neither ``"weekday"`` nor ``"hour"``. Previously
            any other value was silently plotted as hourly data under the wrong label.
    """
    if col not in ("weekday", "hour"):
        raise ValueError(f"col must be 'weekday' or 'hour', got {col!r}")

    preds_with_time = preds_df.assign(**{col: getattr(preds_df.minute_bucket.dt, col)})
    grouped = preds_with_time.groupby(col)
    metrics = [
        ("MAE", mean_absolute_error),
        ("RMSE", lambda x, y: np.sqrt(mean_squared_error(x, y))),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    for ax, (metric_name, metric_func) in zip(axes, metrics):
        for model in MODELS:
            pred_col = f"{model}_speed"
            # Select only the needed columns so apply never sees the grouping
            # column (operating on it is deprecated in recent pandas).
            errors = grouped[[pred_col, LABEL]].apply(
                lambda group, pred_col=pred_col, metric_func=metric_func: metric_func(group[LABEL], group[pred_col])
            )
            errors.plot(label=model, ax=ax)
        # Decorate each axis once, after all models are drawn.
        ax.legend()
        ax.set_ylabel(metric_name)
        ax.set_title(f"{metric_name} per {col}")
        ax.grid(True)
    fig.suptitle(fig_title, fontsize=20)
    plt.tight_layout()

In [None]:
plot_daily_or_hourly_errors(datasets_dict["train"], "weekday", "Training set")

In [None]:
plot_daily_or_hourly_errors(datasets_dict["valid"], "weekday", "Validation set")

In [None]:
plot_daily_or_hourly_errors(datasets_dict["test"], "weekday", "Testing set")

In [None]:
plot_daily_or_hourly_errors(datasets_dict["train"], "hour", "Training set")

In [None]:
plot_daily_or_hourly_errors(datasets_dict["valid"], "hour", "Validation set")

In [None]:
plot_daily_or_hourly_errors(datasets_dict["test"], "hour", "Testing set")

# Edge time series visualisations

In [None]:
def plot_random_edge_and_neighbours_time_series(speeds_df):
    """Plot the time series of a randomly chosen edge followed by its neighbours.

    The edge is drawn from ``UNIQUE_EDGES``; its neighbours are the edges with a
    non-zero entry in its ``ADJACENCY_MATRIX`` row, excluding the edge itself.
    One figure is shown per edge via ``plot_edge_time_series``.
    """
    chosen_edge = random.choice(UNIQUE_EDGES)
    adjacency_row = ADJACENCY_MATRIX[EDGE_IDX_MAP[chosen_edge]]

    adjacent_edges = []
    for neighbour_idx in np.nonzero(adjacency_row)[0]:
        adjacent_edges.append(IDX_EDGE_MAP[neighbour_idx])
    # The chosen edge appears in its own adjacency row; it is plotted first instead.
    adjacent_edges.remove(chosen_edge)

    plot_edge_time_series(chosen_edge, speeds_df)
    for neighbour in adjacent_edges:
        plot_edge_time_series(neighbour, speeds_df)
    

def plot_sequence_of_edges(speeds_df, edge_list):
    """Show one time-series figure per edge, in the order given by ``edge_list``."""
    position = 0
    while position < len(edge_list):
        plot_edge_time_series(edge_list[position], speeds_df)
        position += 1


def plot_edge_time_series(edge, preds_df):
    """Show an interactive scatter plot of true and predicted speeds for one edge.

    Args:
        edge: ``(start_node, end_node)`` pair selecting the rows to plot.
        preds_df: frame with ``start_node``, ``end_node``, ``minute_bucket``, the
            ``LABEL`` column and one ``{model}_speed`` column per model.

    The figure has one marker trace for the ground truth and one per model in
    ``MODELS``, and is displayed with ``fig.show()``; nothing is returned.
    """
    start_node, end_node = edge[0], edge[1]
    edge_df = preds_df[(preds_df.start_node == start_node) & (preds_df.end_node == end_node)]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=edge_df.minute_bucket,
        # Read the ground truth through LABEL, like the rest of the notebook.
        y=edge_df[LABEL],
        mode='markers',
        name='Ground Truth'
    ))
    for model in MODELS:
        fig.add_trace(go.Scatter(
            x=edge_df.minute_bucket,
            y=edge_df[f"{model}_speed"],
            mode="markers",
            name=f"{model} predictions"
        ))

    fig.update_layout(
        title=f"Time series for edge {edge}",
        title_x=0.5,
        xaxis=dict(
            title="Time [15-minute bucket]",
            tickfont=dict(size=14)
        ),
        yaxis=dict(
            title="Speed [km/h]",
            tickfont=dict(size=14)
        ),
        # Horizontal legend placed just above the plot area, right-aligned.
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
    )
    fig.show()

In [None]:
def compute_adjacency_matrix(edges=None):
    """Build the edge-to-edge adjacency matrix of a list of edges.

    Two edges are adjacent when they share at least one node, so every edge is
    adjacent to itself and the diagonal is all ones.

    Args:
        edges: sequence of node tuples; defaults to the notebook's ``UNIQUE_EDGES``.

    Returns:
        tuple: ``(adjacency_matrix, edge_index)`` where ``adjacency_matrix`` is a
        square float32 array of zeros and ones, and ``edge_index`` is the
        ``(rows, cols)`` tuple of index arrays of its non-zero entries.
    """
    if edges is None:
        edges = UNIQUE_EDGES

    # Build each edge's node set once instead of inside the pairwise loop.
    node_sets = [set(edge) for edge in edges]
    n_edges = len(node_sets)
    adjacency_matrix = np.zeros((n_edges, n_edges), dtype=np.float32)

    for i in range(n_edges):
        # The relation is symmetric, so only the upper triangle needs visiting.
        for j in range(i, n_edges):
            if node_sets[i] & node_sets[j]:
                adjacency_matrix[i, j] = 1
                adjacency_matrix[j, i] = 1

    edge_index = (adjacency_matrix > 0).nonzero()
    return adjacency_matrix, edge_index
    

# Distinct (start_node, end_node) pairs across all splits, plus lookups in both
# directions between an edge and its position in UNIQUE_EDGES.
UNIQUE_EDGES = list(set(zip(merged_df.start_node, merged_df.end_node)))
IDX_EDGE_MAP = dict(enumerate(UNIQUE_EDGES))
EDGE_IDX_MAP = {edge: idx for idx, edge in IDX_EDGE_MAP.items()}
# Row/column i of the adjacency matrix corresponds to UNIQUE_EDGES[i].
ADJACENCY_MATRIX, EDGE_INDEX = compute_adjacency_matrix()

In [None]:
len(UNIQUE_EDGES)

In [None]:
plot_random_edge_and_neighbours_time_series(merged_df)

In [None]:
plot_random_edge_and_neighbours_time_series(merged_df)

In [None]:
plot_random_edge_and_neighbours_time_series(merged_df)

# Visualising a sequence of road segments

In [None]:
%pip install pyrosm networkx folium

In [None]:
import pyrosm
import folium
import networkx as nx

In [None]:
# Fetch the OSM extract for CITY_ID with the `s3` client created in the configuration
# cell. The local copy is reused on later runs, like the prediction files.
OSM_PBF_PATH = "bucharest.pbf"
if not os.path.exists(OSM_PBF_PATH):
    s3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{CITY_ID}-latest.osm.pbf", OSM_PBF_PATH)

In [None]:
# Parse the downloaded OSM extract and read its road network as node and edge tables.
osm = pyrosm.OSM("bucharest.pbf")
nodes, edges = osm.get_network(nodes=True, network_type="driving+service")
# Add an "edge" column holding each row's (u, v) pair as a tuple.
# presumably u/v are OSM node ids comparable to start_node/end_node — TODO confirm
edges["edge"] = list(zip(edges.u, edges.v))

In [None]:
# Undirected graph whose edges are the (start_node, end_node) pairs seen in the data.
# NOTE(review): nx.Graph ignores direction; confirm that is intended for these pairs.
G = nx.Graph()
G.add_edges_from(UNIQUE_EDGES)

In [None]:
G.number_of_nodes(), G.number_of_edges()

In [None]:
# Draw every observed edge (blue lines) and its end nodes (red markers) on a folium map.
unique_nodes = {node_id for edge in UNIQUE_EDGES for node_id in edge}
m = folium.Map(location=[44.435608, 26.102297], zoom_start=15)

# Index coordinates by node id once instead of scanning `nodes` for every lookup.
# drop_duplicates keeps the first row per id, matching the earlier `.iloc[0]` choice.
node_coords = nodes.drop_duplicates("id").set_index("id")[["lat", "lon"]]

# Add edges to the map
for u, v in UNIQUE_EDGES:
    lat0, lon0 = node_coords.loc[u]
    lat1, lon1 = node_coords.loc[v]
    folium.PolyLine(locations=[(lat0, lon0), (lat1, lon1)], color='blue').add_to(m)

# Add nodes to the map
for node in unique_nodes:
    lat, lon = node_coords.loc[node]
    folium.CircleMarker(location=(lat, lon), radius=5, popup=node, color='red', fill=True, fill_color='red').add_to(m)

m

In [None]:
# Materialise every simple path between the two hard-coded node ids.
# NOTE(review): a later cell indexes position 46467, so this list holds tens of
# thousands of paths; if only the shortest route is needed, a shortest-path query
# would presumably be much cheaper — confirm intent.
all_simple_paths = list(nx.all_simple_paths(G, 21763480, 4261982668))

In [None]:
sorted([(i, len(path)) for i, path in enumerate(all_simple_paths)], key=lambda x: x[1])[:3]

In [None]:
all_simple_paths[46467]

In [None]:
# NOTE(review): 46467 is a hard-coded position in `all_simple_paths`, presumably taken
# from the shortest-path listing above; it depends on the enumeration order — confirm.
path_nodes = all_simple_paths[46467]
# Consecutive node pairs along the path.
# NOTE(review): G is undirected, so a pair may be reversed relative to the
# (start_node, end_node) order in the data — verify before filtering on it.
path_edges = [(path_nodes[i], path_nodes[i+1]) for i in range(len(path_nodes)-1)]

In [None]:
plot_sequence_of_edges(merged_df, path_edges)

In [None]:
# Draw the selected path: its edges as blue lines and its nodes as red markers.
m = folium.Map(location=[44.435608, 26.102297], zoom_start=15)

# Index coordinates by node id once instead of scanning `nodes` for every lookup.
# drop_duplicates keeps the first row per id, matching the earlier `.iloc[0]` choice.
path_node_coords = nodes.drop_duplicates("id").set_index("id")[["lat", "lon"]]

# Add edges to the map
for u, v in path_edges:
    lat0, lon0 = path_node_coords.loc[u]
    lat1, lon1 = path_node_coords.loc[v]
    folium.PolyLine(locations=[(lat0, lon0), (lat1, lon1)], color='blue').add_to(m)

# Add nodes to the map
for node in path_nodes:
    lat, lon = path_node_coords.loc[node]
    folium.CircleMarker(location=(lat, lon), radius=5, popup=node, color='red', fill=True, fill_color='red').add_to(m)

m

In [None]:
%pip install contextily

In [None]:
import contextily as ctx

In [None]:

# Plot the OSM edge table and add a contextily basemap using the edges' CRS.
ax = edges.plot()
ctx.add_basemap(crs=edges.crs, ax=ax)