# Import libraries and define globals

In [None]:
# Install the pyrosm, tqdm and folium packages for this notebook.
%pip install pyrosm tqdm folium

In [None]:
# Install torch, torchvision and torchaudio from the PyTorch CPU wheel index.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
import torch
# torch is imported first because the wheel index URL below interpolates torch.__version__,
# so the PyG extension packages are fetched for the torch build that is actually installed.
!pip install -q torch-scatter~=2.1.0 torch-sparse~=0.6.16 torch-cluster~=1.6.0 torch-spline-conv~=1.2.1 torch-geometric==2.2.0 -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install -q torch-geometric-temporal==0.54.0

# Fix the random seeds (CPU and all CUDA devices) and force cuDNN into its
# deterministic, non-autotuning mode so that repeated runs are comparable.
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
import math
import os
import pickle

import boto3
import numpy as np
import pandas as pd
import pyrosm
from tqdm import tqdm
from torch_geometric_temporal.signal import StaticGraphTemporalSignal, temporal_signal_split

In [None]:
import warnings

# Silence every RuntimeWarning for the rest of the notebook.
# NOTE(review): presumably aimed at NumPy warnings such as "Mean of empty slice";
# this also hides any other RuntimeWarning, so confirm that is acceptable.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# City identifier; it is part of the OSM extract file name (see also the S3 key
# built in extract_city_graph).
CITY_ID = 1_000_000
MAP_FILE = f"{CITY_ID}-latest.osm.pbf"
# NOTE(review): presumably the name of the target column; not referenced in the visible code.
LABEL = "speed_kmh"
# Shared S3 client and the bucket / key prefixes used by every download and upload below.
S3 = boto3.client('s3')
S3_BUCKET = "some_bucket"
S3_SUBDIR = f"subdir_path"
S3_DATA = "data_path"
S3_PREDS = f"{S3_SUBDIR}/model_preds"
# Parquet file with the per-edge, per-time-bucket speeds and their lag columns.
S3_FILENAME = "edge_time_aggregated_4_lags.parquet"
# Length of the experiment window in weeks and how it is divided: N_WEEKS_TRAINING
# weeks for training, then N_WEEKS_VALIDATION weeks each for validation and test.
N_WEEKS = 7
N_WEEKS_TRAINING = 5
N_WEEKS_VALIDATION = 1
# Fraction of snapshots handed to temporal_signal_split as the training share.
TRAIN_RATIO = N_WEEKS_TRAINING / N_WEEKS
DATA_SPLITS = ["train", "valid", "test"]

In [None]:
# Fetch the pickled collection of edges of interest from S3 and load it.
# NOTE(review): pickle.load can execute arbitrary code; this is only safe as long as
# the bucket contents are trusted.
S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/unique_edges.pickle", "unique_edges.pickle")
with open("unique_edges.pickle", "rb") as f:
    UNIQUE_EDGES = pickle.load(f)
# Display how many edges were loaded.
len(UNIQUE_EDGES)

In [None]:
# MODEL_NAME = f"gnn_2_gats_{EPOCHS}_hidden_channels_{HIDDEN_CHANNELS}_epochs_{len(UNIQUE_EDGES)}_edges_{N_WEEKS}_weeks"
# Base name of the cached imputed dataset; it encodes the edge count and the window
# length so that caches built with different settings do not collide.
GNN_DATASET_NAME = f"gnn_dataset_{len(UNIQUE_EDGES)}_edges_{N_WEEKS}_weeks_normalised"

In [None]:
def compute_adjacency_matrix():
    """Build the edge-to-edge adjacency of the road graph.

    Two entries of ``UNIQUE_EDGES`` are adjacent when they share at least one
    node. Every non-empty edge shares its nodes with itself, so the diagonal is
    set as well.

    Returns:
        A tuple ``(adjacency_matrix, edge_index)`` where ``adjacency_matrix`` is a
        square ``float32`` array of zeros and ones with one row/column per entry
        of ``UNIQUE_EDGES``, and ``edge_index`` is the ``(rows, cols)`` tuple
        returned by ``nonzero()`` on that matrix.
    """
    edge_count = len(UNIQUE_EDGES)
    adjacency_matrix = np.zeros((edge_count, edge_count), dtype=np.float32)

    # Group edge positions by the nodes they touch. Edges are adjacent exactly
    # when they appear in a common group, which avoids comparing every pair of
    # edges (and rebuilding two sets per comparison).
    positions_by_node = {}
    for position, edge in enumerate(UNIQUE_EDGES):
        for node in set(edge):
            positions_by_node.setdefault(node, []).append(position)

    # Mark every pair inside a group; np.ix_ addresses the full sub-block.
    for positions in positions_by_node.values():
        adjacency_matrix[np.ix_(positions, positions)] = 1

    edge_index = (adjacency_matrix > 0).nonzero()
    return adjacency_matrix, edge_index

# Data imputation methods

In [None]:
def fallback_to_past(edge, minute_bucket, fallback_horizon, unit='m'):
    """Look up the speed recorded for ``edge`` a fixed interval before ``minute_bucket``.

    Args:
        edge: Edge key as used in ``DATASET_DICT``.
        minute_bucket: Reference timestamp.
        fallback_horizon: How far to step back, expressed in ``unit``.
        unit: ``pd.Timedelta`` unit of ``fallback_horizon`` (minutes by default).

    Returns:
        The value stored in ``DATASET_DICT`` for the earlier bucket, or ``None``
        when that (edge, timestamp) pair is absent.
    """
    lookback = pd.Timedelta(fallback_horizon, unit=unit)
    earlier_bucket = minute_bucket - lookback
    return DATASET_DICT.get((edge, earlier_bucket))


def neighbour_average(edge, minute_bucket):
    """Average the observed speeds of the edges adjacent to ``edge`` at ``minute_bucket``.

    The adjacent edges are the non-zero entries of the row of ``ADJACENCY_MATRIX``
    that belongs to ``edge``; each one is translated back to its edge key through
    ``IDX_EDGE_MAP`` before its speed is looked up in ``DATASET_DICT``. Speeds
    that are absent or NaN are skipped.

    Args:
        edge: Edge key present in ``EDGE_IDX_MAP``.
        minute_bucket: Timestamp at which the neighbouring speeds are read.

    Returns:
        The mean of the available neighbouring speeds, or NaN when none of the
        adjacent edges has an observation at that timestamp.
    """
    neighbour_indices = np.nonzero(ADJACENCY_MATRIX[EDGE_IDX_MAP[edge]])[0]
    neighbour_speeds = []
    for idx in neighbour_indices:
        # Look up the neighbour's own speed, not the speed of the queried edge.
        speed = DATASET_DICT.get((IDX_EDGE_MAP[int(idx)], minute_bucket))
        if speed is None or math.isnan(speed):
            continue
        neighbour_speeds.append(speed)
    if not neighbour_speeds:
        # Callers test the result with math.isnan, so keep NaN as the "no data"
        # marker but avoid NumPy's mean-of-empty-slice warning.
        return float("nan")
    return np.mean(neighbour_speeds)


def expand_edge_time_series(edge_df):
    """Align one edge's observations with every timestamp in ``DATASET_RANGE_DF``.

    The frame is right-joined onto ``DATASET_RANGE_DF`` by ``minute_bucket``, so
    timestamps without an observation show up as rows whose values are NaN. The
    ``edge`` column is then filled forwards and backwards so those rows still
    carry the edge key.

    Args:
        edge_df: Rows of a single edge with at least ``edge`` and
            ``minute_bucket`` columns.

    Returns:
        A frame with ``minute_bucket`` as a regular column and one row per
        timestamp of ``DATASET_RANGE_DF``.
    """
    by_bucket = edge_df.reset_index().set_index("minute_bucket")
    aligned = by_bucket.join(DATASET_RANGE_DF, how="right", lsuffix='l')
    # Both frames bring an "index" column (the left one gets the 'l' suffix);
    # neither carries information that is needed afterwards.
    aligned = aligned.drop(columns=["index", "indexl"])
    aligned["edge"] = aligned["edge"].ffill().bfill()
    return aligned.reset_index()
    

def compute_rolling_mean(speeds_df, window):
    """Compute per-edge rolling mean speeds over a time window.

    Each edge's series is first expanded onto the full timestamp grid with
    ``expand_edge_time_series``, then a rolling mean of width ``window`` is taken
    per edge. Entries whose rolling mean is NaN are dropped.

    Args:
        speeds_df: Frame with ``edge``, ``minute_bucket`` and ``speed_kmh`` columns.
        window: Rolling window passed to ``rolling`` (e.g. ``"1h"``).

    Returns:
        Dict mapping the ``(edge, minute_bucket)`` index entries of the rolled
        frame to the rolling mean of ``speed_kmh``.
    """
    # Use the frame that was passed in rather than a module-level global, so the
    # function works for any input.
    per_edge_groups = speeds_df[["edge", "minute_bucket", "speed_kmh"]].groupby("edge")
    expanded = pd.concat([expand_edge_time_series(group) for _, group in per_edge_groups])
    rolling_means = (
        expanded.set_index("minute_bucket")
        .groupby("edge")
        .rolling(window)
        .mean()
        .dropna()
    )
    return rolling_means.to_dict()["speed_kmh"]


def impute_nan(edge, minute_bucket):
    """Return a substitute speed for ``edge`` at ``minute_bucket``.

    Candidates are evaluated lazily in the order below; the first one that is
    neither ``None`` nor NaN is returned:
        1. Speed on the same edge at the same time 1 week ago
        2. Speed on the same edge at the same time 2 weeks ago
        3. Average neighbour speed at the same time 1 week ago
        4. Average neighbour speed at the same time 2 weeks ago
        5. Speed on the same edge 15 minutes ago
        6. Average neighbour speed 15 minutes ago
        7. Rolling mean of the edge over the last 1, 2, 3 and 4 hours
        8. Expanding mean of the edge up to the current timestamp
        9. Training mean of the edge for the same weekday / hour / minute slot
        10. Training mean of the edge
        11. Global mean speed

    Args:
        edge: Edge key.
        minute_bucket: Timestamp whose speed is missing (a ``pd.Timestamp``).

    Returns:
        The first available candidate, falling back to ``MEAN_SPEED``.
    """
    one_week = pd.Timedelta(1, unit='W')
    two_weeks = pd.Timedelta(2, unit='W')
    quarter_hour = pd.Timedelta(15, unit='m')

    def candidates():
        # A generator keeps the cascade lazy: later (more expensive or coarser)
        # fallbacks are only computed when every earlier one was missing.
        yield fallback_to_past(edge, minute_bucket, 1, 'W')
        yield fallback_to_past(edge, minute_bucket, 2, 'W')
        yield neighbour_average(edge, minute_bucket - one_week)
        yield neighbour_average(edge, minute_bucket - two_weeks)
        yield fallback_to_past(edge, minute_bucket, 15, 'm')
        yield neighbour_average(edge, minute_bucket - quarter_hour)
        for time_indexed_means in (
            ROLLING_1H_WINDOW_EDGE_TIME_AVG_DICT,
            ROLLING_2H_WINDOW_EDGE_TIME_AVG_DICT,
            ROLLING_3H_WINDOW_EDGE_TIME_AVG_DICT,
            ROLLING_4H_WINDOW_EDGE_TIME_AVG_DICT,
            ROLLING_EDGE_TIME_AVG_DICT,
        ):
            yield time_indexed_means.get((edge, minute_bucket))
        # These two tables are keyed by (edge, weekday, hour, minute) and by edge
        # respectively, not by (edge, timestamp).
        yield EDGE_15_MIN_BUCKET_DICT.get(
            (edge, minute_bucket.weekday(), minute_bucket.hour, minute_bucket.minute)
        )
        yield EDGE_AVG_DICT.get(edge)

    for speed in candidates():
        # NaN is treated as missing everywhere, including the weekly fallbacks.
        if speed is not None and not math.isnan(speed):
            return speed
    return MEAN_SPEED


def impute_dataset(speeds_df, imputation_method):
    """Iterate over a speeds data frame in 15-minute interval groups, fill missing values, collect into a list of snapshots.

    Args:
        speeds_df: Frame with ``minute_bucket``, ``edge``, ``speed_kmh`` and the
            lag columns listed in ``SPEED_FEATURES``.
        imputation_method: Callable ``(edge, timestamp) -> speed`` used whenever a
            feature or target value is missing or NaN.

    Returns:
        ``(xs, ys, target_mask)``: ``xs`` is a float32 array with, per timestamp
        group and per edge, the four lagged speeds (15, 30, 45 and 60 minutes);
        ``ys`` is a float32 array with the target speed per timestamp group and
        edge; ``target_mask`` holds 1 where the target was observed and 0 where
        it had to be imputed.
    """
    xs = []
    ys = []
    feature_imputation_count = 0
    target_imputation_count = 0
    # NOTE(review): the mask has one row per entry of DATASET_DATE_RANGE but is indexed
    # by the position of the groupby group below; this assumes speeds_df contains exactly
    # those timestamps, in the same order — confirm.
    target_mask = np.ones((len(DATASET_DATE_RANGE), len(UNIQUE_EDGES)), dtype=int)
    for i, (minute_bucket, minute_bucket_group) in enumerate(tqdm(speeds_df.groupby("minute_bucket"))):
        # Column name -> {edge: value} for the rows of this timestamp.
        edge_dict = minute_bucket_group[["edge", "speed_kmh"] + SPEED_FEATURES].set_index("edge").to_dict()
        measurements = []
        targets = []
        # (lag in minutes, timestamp that lag refers to) for the four lag features.
        past_hour = [(minute, minute_bucket - pd.to_timedelta(minute, unit='m')) for minute in [15, 30, 45, 60]]
        next_15 = minute_bucket + pd.to_timedelta(15, unit='m')
        for j, edge in enumerate(UNIQUE_EDGES):
            row = []
            for minute, quarter in past_hour:
                speed = edge_dict[f"speed_kmh_lag_{minute}_m"].get(edge)
                if speed is None or math.isnan(speed):
                    # Impute the lag feature at the timestamp the lag points to.
                    speed = imputation_method(edge, quarter)
                    feature_imputation_count += 1
                row.append(speed)
            measurements.append(row)
            speed = edge_dict["speed_kmh"].get(edge)
            if speed is None or math.isnan(speed):
                # TODO: not the most efficient way of skipping unpopular segments
                # These are the segments that linear regression couldn't be trained on due to insufficient amount of data
                # NOTE(review): the target is read from this group's speed_kmh but imputed at
                # minute_bucket + 15 minutes — confirm this offset matches how the target is defined.
                speed = imputation_method(edge, next_15)
                target_imputation_count += 1
                # Imputed targets are masked out so they can be excluded from the metrics.
                target_mask[i, j] = 0
            targets.append(speed)
        xs.append(measurements)
        ys.append(targets)
    xs = np.array(xs, dtype=np.float32)
    ys = np.array(ys, dtype=np.float32)

    print(f"Feature imputation count: {feature_imputation_count}")
    print(f"Target imputation count: {target_imputation_count}")
    # Five values per (timestamp, edge): four lag features plus one target.
    print(f"Total number of values: {len(UNIQUE_EDGES) * len(DATASET_DATE_RANGE) * 5}")
    print()

    return xs, ys, target_mask

# Baselines

In [None]:
def evaluate_global_mean_baseline(dataset):
    """Score the constant predictor that always outputs ``MEAN_SPEED``.

    For every snapshot the masked squared and absolute errors are summed and
    divided by the number of unmasked targets; the per-snapshot scores are then
    averaged over ``dataset.snapshot_count``.

    Args:
        dataset: Iterable of snapshots exposing ``y`` and ``mask``, with a
            ``snapshot_count`` attribute.

    Returns:
        ``(mse, mae)`` averaged over the snapshots.
    """
    total_mse = 0
    total_mae = 0
    for snapshot in dataset:
        # Masked-out targets contribute a zero error.
        masked_error = (MEAN_SPEED - snapshot.y) * snapshot.mask
        observed = snapshot.mask.sum()
        total_mse += (masked_error**2).sum() / observed
        total_mae += (np.abs(masked_error)).sum() / observed
    total_mse /= dataset.snapshot_count
    total_mae /= dataset.snapshot_count
    return total_mse, total_mae


def evaluate_edge_average_baseline(dataset):
    """Score the predictor that outputs each edge's entry in ``EDGE_AVG_DICT``.

    Edges without an entry fall back to ``MEAN_SPEED``. Errors are weighted by the
    snapshot mask, normalised by the number of unmasked targets per snapshot and
    averaged over ``dataset.snapshot_count``.

    Args:
        dataset: Iterable of snapshots exposing ``y`` and ``mask`` (indexed in the
            order of ``UNIQUE_EDGES``), with a ``snapshot_count`` attribute.

    Returns:
        ``(mse, mae)`` averaged over the snapshots.
    """
    # The prediction depends only on the edge, so resolve it once up front.
    edge_predictions = [EDGE_AVG_DICT.get(edge, MEAN_SPEED) for edge in UNIQUE_EDGES]

    total_mse = 0
    total_mae = 0
    for snapshot in dataset:
        squared_error = 0
        absolute_error = 0
        for position, prediction in enumerate(edge_predictions):
            weight = snapshot.mask[position]
            residual = prediction - snapshot.y[position]
            squared_error += weight * residual**2
            absolute_error += weight * np.abs(residual)
        observed = snapshot.mask.sum()
        squared_error /= observed
        absolute_error /= observed
        total_mse += squared_error
        total_mae += absolute_error
    total_mse /= dataset.snapshot_count
    total_mae /= dataset.snapshot_count
    return total_mse, total_mae


def edge_time_naive(edge, timestamp):
    """Predict the speed of ``edge`` from its weekday / hour / minute slot average.

    Args:
        edge: Edge key.
        timestamp: Object exposing ``weekday()``, ``hour`` and ``minute``.

    Returns:
        The ``EDGE_15_MIN_BUCKET_DICT`` entry for the slot; if absent, the edge's
        entry in ``EDGE_AVG_DICT``; if that is absent too, ``MEAN_SPEED``.
    """
    edge_level_fallback = EDGE_AVG_DICT.get(edge, MEAN_SPEED)
    slot_key = (edge, timestamp.weekday(), timestamp.hour, timestamp.minute)
    return EDGE_15_MIN_BUCKET_DICT.get(slot_key, edge_level_fallback)


def rolling_edge_time_avg_naive(edge, minute_bucket):
    """Predict the speed of ``edge`` from its entry in ``ROLLING_EDGE_TIME_AVG_DICT``.

    Args:
        edge: Edge key.
        minute_bucket: Timestamp of the prediction.

    Returns:
        The ``ROLLING_EDGE_TIME_AVG_DICT`` value for ``(edge, minute_bucket)``; if
        absent, the ``MINUTE_BUCKET_AVG_DICT`` value for the bucket 15 minutes
        earlier; if that is absent too, ``MEAN_SPEED``.
    """
    previous_bucket = minute_bucket - pd.Timedelta(15, unit='m')
    network_fallback = MINUTE_BUCKET_AVG_DICT.get(previous_bucket, MEAN_SPEED)
    return ROLLING_EDGE_TIME_AVG_DICT.get((edge, minute_bucket), network_fallback)


def evaluate_edge_time_average_baseline(dataset, date_range, naive):
    """Score a per-edge, per-timestamp naive predictor against a dataset.

    Snapshots are paired with timestamps positionally via ``zip``. Errors are
    weighted by the snapshot mask, normalised by the number of unmasked targets
    per snapshot and averaged over ``dataset.snapshot_count``.

    Args:
        dataset: Iterable of snapshots exposing ``y`` and ``mask`` (indexed in the
            order of ``UNIQUE_EDGES``), with a ``snapshot_count`` attribute.
        date_range: Timestamps, one per snapshot, in the same order.
        naive: Callable ``(edge, timestamp) -> predicted speed``.

    Returns:
        ``(mse, mae)`` averaged over the snapshots.
    """
    mse = 0
    mae = 0
    for timestamp, snapshot in zip(date_range, dataset):
        snapshot_mse = 0
        snapshot_mae = 0
        for i, edge in enumerate(UNIQUE_EDGES):
            # Evaluate the predictor once per edge and reuse the residual for both
            # metrics instead of calling it twice.
            residual = naive(edge, timestamp) - snapshot.y[i]
            weight = snapshot.mask[i]
            snapshot_mse += weight * residual**2
            snapshot_mae += weight * np.abs(residual)
        observed = snapshot.mask.sum()
        snapshot_mse /= observed
        snapshot_mae /= observed
        mse += snapshot_mse
        mae += snapshot_mae
    mse /= dataset.snapshot_count
    mae /= dataset.snapshot_count
    return mse, mae

# Data preprocessing

In [None]:
def normalise(x):
    """Standardise ``x`` by subtracting ``MEAN`` and dividing by ``STD``."""
    centred = x - MEAN
    return centred / STD


def denormalise(x):
    """Map a standardised value back to the original scale using ``STD`` and ``MEAN``."""
    rescaled = x * STD
    return rescaled + MEAN
    

def extract_city_graph():
    """Load the city road network and the speed observations for the edges of interest.

    The OSM extract is downloaded from S3 on every call; the speeds parquet file
    is downloaded only when it is not already present locally. Speed rows are
    restricted to edges in ``UNIQUE_EDGES``, enriched with weekday / hour /
    minute columns and sorted by edge and timestamp.

    Returns:
        ``(speeds_df, nodes, edges)``: the filtered speeds frame and the node and
        edge frames returned by ``pyrosm`` for the "driving+service" network.
    """
    local_map_path = "bucharest.pbf"
    S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{CITY_ID}-latest.osm.pbf", local_map_path)

    osm = pyrosm.OSM(local_map_path)
    nodes, edges = osm.get_network(nodes=True, network_type="driving+service")
    edges["edge"] = list(zip(edges.u, edges.v))
    print(f"Unique OSM nodes: {nodes.id.nunique()}, unique OSM edges: {edges.id.nunique()}")

    if not os.path.isfile(S3_FILENAME):
        S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{S3_DATA}/{S3_FILENAME}", S3_FILENAME)

    speeds_df = pd.read_parquet(S3_FILENAME)

    print(f"Dataset time boundaries: {speeds_df.minute_bucket.min(), speeds_df.minute_bucket.max()}")
    print(f"Initial dataset shape: {speeds_df.shape}")

    # Edges are identified by their (start node, end node) pair.
    speeds_df["edge"] = list(zip(speeds_df.start_node, speeds_df.end_node))

    # Take an explicit copy: the columns added below would otherwise be assigned
    # on a filtered slice of the original frame (SettingWithCopy hazard).
    speeds_df = speeds_df[speeds_df.edge.isin(UNIQUE_EDGES)].copy()

    print(f"Dataset shape after filtering edges of interest: {speeds_df.shape}")

    speeds_df["day"] = speeds_df.minute_bucket.dt.weekday
    speeds_df["hour"] = speeds_df.minute_bucket.dt.hour
    speeds_df["minute"] = speeds_df.minute_bucket.dt.minute
    speeds_df = speeds_df.sort_values(["edge", "minute_bucket"])

    return speeds_df, nodes, edges

# Experimentation setup

In [None]:
def prepare_dataset(speeds_df, imputation_method):
    """Return the imputed temporal graph dataset, building and caching it if needed.

    When ``{GNN_DATASET_NAME}.pickle`` exists locally it is loaded as is.
    Otherwise the speeds are imputed, wrapped in a ``StaticGraphTemporalSignal``
    (with the target mask attached as ``mask``), pickled locally and uploaded
    to S3.

    Args:
        speeds_df: Speeds frame passed on to ``impute_dataset``.
        imputation_method: Callable ``(edge, timestamp) -> speed`` passed on to
            ``impute_dataset``.

    Returns:
        The loaded or freshly built dataset.
    """
    cache_path = f"{GNN_DATASET_NAME}.pickle"

    if os.path.isfile(cache_path):
        # NOTE: unpickling executes arbitrary code; only load cache files that were
        # produced by this notebook or come from a trusted bucket.
        with open(cache_path, "rb") as f:
            dataset = pickle.load(f)
        print("Loaded imputed data")
        return dataset

    print("Running data imputation ...")
    xs, ys, target_mask = impute_dataset(speeds_df, imputation_method)
    # Edge weights are the non-zero adjacency entries, in the same order as EDGE_INDEX.
    dataset = StaticGraphTemporalSignal(EDGE_INDEX, ADJACENCY_MATRIX[ADJACENCY_MATRIX>0], xs, ys, mask=target_mask)
    with open(cache_path, "wb") as f:
        pickle.dump(dataset, f)
    S3.upload_file(cache_path, S3_BUCKET, f"{S3_SUBDIR}/{S3_DATA}/gnn/{GNN_DATASET_NAME}.pickle")
    return dataset


def evaluate_baselines(train_dataset, valid_dataset, test_dataset):
    """Print MSE, RMSE and MAE of every baseline on the train, valid and test splits.

    The global-mean and edge-mean baselines only need the dataset; the two
    time-aware baselines additionally need the timestamps of each split, so each
    split is paired with its own date range.

    Args:
        train_dataset: Training split.
        valid_dataset: Validation split.
        test_dataset: Test split.
    """
    def report(naive_name, split, mse, mae):
        print(f"\t {naive_name} {split} MSE {mse:.2f}")
        print(f"\t {naive_name} {split} RMSE {np.sqrt(mse):.2f}")
        print(f"\t {naive_name} {split} MAE {mae:.2f}")

    split_names = ["train", "valid", "test"]
    datasets = [train_dataset, valid_dataset, test_dataset]
    # The test split must be scored against the test timestamps, not the
    # validation ones.
    date_ranges = [TRAIN_DATE_RANGE, VALID_DATE_RANGE, TEST_DATE_RANGE]

    for naive_name, naive_method in zip(["Global mean", "Edge mean"], [evaluate_global_mean_baseline, evaluate_edge_average_baseline]):
        for split, ds in zip(split_names, datasets):
            mse, mae = naive_method(ds)
            report(naive_name, split, mse, mae)

    for naive_name, naive_method in zip(["Edge time naive", "Edge time rolling"], [edge_time_naive, rolling_edge_time_avg_naive]):
        for split, date_range, ds in zip(split_names, date_ranges, datasets):
            mse, mae = evaluate_edge_time_average_baseline(ds, date_range, naive_method)
            report(naive_name, split, mse, mae)


def split_dataset(dataset):
    """Split a temporal signal chronologically into train, validation and test parts.

    The first ``TRAIN_RATIO`` share of the snapshots becomes the training part;
    the remainder is halved into validation and test.

    Args:
        dataset: Temporal signal accepted by ``temporal_signal_split``.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)``.
    """
    train_part, held_out = temporal_signal_split(dataset, train_ratio=TRAIN_RATIO)
    # Assume valid and test dataset are of equal length
    valid_part, test_part = temporal_signal_split(held_out, train_ratio=1/2)
    return train_part, valid_part, test_part

# Experiments

In [None]:
# Load the speed observations for the edges of interest together with the OSM nodes and edges.
subgraph_speeds_df, nodes, edges = extract_city_graph()

In [None]:
# The experiment window is the last N_WEEKS weeks of the available data.
DATASET_END_DATE = subgraph_speeds_df.minute_bucket.max()
DATASET_START_DATE = DATASET_END_DATE - pd.Timedelta(N_WEEKS, 'W')
# Consecutive, non-overlapping 15-minute grids: training is closed on the left, while
# validation and test each start one step after the previous range's last timestamp
# (inclusive="right" drops the shared boundary).
# NOTE(review): with these boundaries the last test timestamp is DATASET_END_DATE minus
# 15 minutes, so the DATASET_END_DATE bucket itself is not part of any range — confirm
# this is intended, since the data frame is only filtered from below.
TRAIN_DATE_RANGE = pd.date_range(DATASET_START_DATE, DATASET_START_DATE + pd.Timedelta(N_WEEKS_TRAINING, 'W'), freq="15min", inclusive="left")
VALID_DATE_RANGE = pd.date_range(TRAIN_DATE_RANGE[-1], TRAIN_DATE_RANGE[-1] + pd.Timedelta(N_WEEKS_VALIDATION, 'W'), freq="15min", inclusive="right")
TEST_DATE_RANGE = pd.date_range(VALID_DATE_RANGE[-1], VALID_DATE_RANGE[-1] + pd.Timedelta(N_WEEKS_VALIDATION, 'W'), freq="15min", inclusive="right")
# All timestamps of the three splits, in chronological order.
DATASET_DATE_RANGE = pd.concat([TRAIN_DATE_RANGE.to_series(), VALID_DATE_RANGE.to_series(), TEST_DATE_RANGE.to_series()])
# Same timestamps as a frame indexed by minute_bucket; expand_edge_time_series joins against it.
DATASET_RANGE_DF = pd.DataFrame(DATASET_DATE_RANGE, columns=["minute_bucket"]).reset_index().set_index("minute_bucket")

# Names of the lagged-speed columns (every column whose name contains "lag").
SPEED_FEATURES = [col_name for col_name in subgraph_speeds_df.columns if "lag" in col_name]

In [None]:
# Display the boundaries of the experiment window.
DATASET_START_DATE, DATASET_END_DATE

In [None]:
# Drop observations older than the window start; there is no upper bound, so the
# DATASET_END_DATE bucket is kept.
subgraph_speeds_df = subgraph_speeds_df[subgraph_speeds_df.minute_bucket >= DATASET_START_DATE]

In [None]:
# Display the frame size after filtering.
subgraph_speeds_df.shape

In [None]:
# Statistics are computed on the training period only (up to the last training timestamp).
train_subgraph_speeds_df = subgraph_speeds_df[subgraph_speeds_df.minute_bucket <= TRAIN_DATE_RANGE[-1]]
# Mean and standard deviation of the training speeds, used by normalise / denormalise.
MEAN = train_subgraph_speeds_df.speed_kmh.mean()
STD = train_subgraph_speeds_df.speed_kmh.std()
MEAN, STD

In [None]:
# Bidirectional mapping between an edge and its position in UNIQUE_EDGES
# (the row/column order of the adjacency matrix).
EDGE_IDX_MAP = {edge: i for i, edge in enumerate(UNIQUE_EDGES)}
IDX_EDGE_MAP = {i: edge for i, edge in enumerate(UNIQUE_EDGES)}

# Training-period statistics used as baselines and imputation fallbacks.
MEAN_SPEED = train_subgraph_speeds_df.speed_kmh.mean()
# edge -> mean training speed, truncated to an integer by astype(int).
EDGE_AVG_DICT = train_subgraph_speeds_df[["speed_kmh", "edge"]].groupby("edge").mean().astype(int).to_dict()["speed_kmh"]
# (edge, day, hour, minute) -> mean training speed for that weekly 15-minute slot.
EDGE_15_MIN_BUCKET_DICT = train_subgraph_speeds_df.groupby(["edge", "day", "hour", "minute"])["speed_kmh"].mean().to_dict()

# Persist the slot averages locally and upload them next to the models.
with open("edge_15min_dict.pickle", "wb") as f:
    pickle.dump(EDGE_15_MIN_BUCKET_DICT, f)

S3.upload_file("edge_15min_dict.pickle", S3_BUCKET, f"{S3_SUBDIR}/models/edge_15min_dict.pickle")

ADJACENCY_MATRIX, EDGE_INDEX = compute_adjacency_matrix()

# Despite the "rolling" name this is an expanding (cumulative) mean per edge, computed
# over the whole window rather than the training period only.
# NOTE(review): the expanding mean at a timestamp includes that timestamp's own
# observation; when the dict is used as a baseline prediction for the same timestamp
# this may leak the target — confirm.
rolling_speed_avg_df = (pd.concat([expand_edge_time_series(g)
    for _, g in subgraph_speeds_df[["edge", "minute_bucket", "speed_kmh"]]
    .groupby("edge")]).set_index("minute_bucket").groupby("edge").expanding().mean())
rolling_speed_avg_df.dropna(inplace=True)
ROLLING_EDGE_TIME_AVG_DICT = rolling_speed_avg_df.to_dict()["speed_kmh"]
# TODO: Move these to data imputation methods
# Per-edge rolling means over 1 to 4 hour windows, used as imputation fallbacks.
ROLLING_1H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "1h")
ROLLING_2H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "2h")
ROLLING_3H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "3h")
ROLLING_4H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "4h")

# (edge, minute_bucket) -> observed speed, for direct lookups during imputation.
DATASET_DICT = subgraph_speeds_df[["edge", "minute_bucket", "speed_kmh"]].set_index(["edge", "minute_bucket"]).to_dict()["speed_kmh"]
# minute_bucket -> mean observed speed across all edges at that timestamp.
MINUTE_BUCKET_AVG_DICT = subgraph_speeds_df[["minute_bucket", "speed_kmh"]].groupby("minute_bucket").mean().to_dict()["speed_kmh"]

In [None]:
# S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{S3_DATA}/gnn/{GNN_DATASET_NAME}.pickle", f"{GNN_DATASET_NAME}.pickle")

In [None]:
# rm "{GNN_DATASET_NAME}.pickle"

In [None]:
# Plot the distribution of the observed speeds.
subgraph_speeds_df.speed_kmh.hist()

In [None]:
# Build the imputed temporal graph dataset, or load it from the local pickle cache.
dataset = prepare_dataset(subgraph_speeds_df, impute_nan)

In [None]:
# Chronological train / validation / test split.
train_dataset, valid_dataset, test_dataset = split_dataset(dataset)

In [None]:
# Print the error metrics of every naive baseline on each split.
evaluate_baselines(train_dataset, valid_dataset, test_dataset)