# Import libraries and define globals

In [None]:
%pip install pyrosm tqdm # folium

In [None]:
import os
import pickle
import random

import boto3
# import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import pyrosm
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm import tqdm

In [None]:
# --- Data location -----------------------------------------------------------
CITY_ID = 1_000_000
# File name of the city's OSM extract.
MAP_FILE = f"{CITY_ID}-latest.osm.pbf"
S3 = boto3.client('s3')
S3_BUCKET = "some_bucket"
# Key prefix under which the edge list, map extract, data and predictions live.
S3_SUBDIR = "subdir_path"
# Sub-folder (below S3_SUBDIR) that holds S3_FILENAME.
S3_DATA = "data_path"
S3_FILENAME = "edge_time_aggregated_4_lags.parquet"

# --- Experiment setup --------------------------------------------------------
# N_WEEKS only appears in the S3 key of the uploaded predictions;
# N_WEEKS_TRAINING / N_WEEKS_VALIDATION size the chronological splits.
N_WEEKS = 7
N_WEEKS_TRAINING = 5
N_WEEKS_VALIDATION = 1
# NOTE(review): SUBGRAPH_K, CENTRAL_NODE_ID and TRAIN_RATIO are not referenced
# in the visible part of the notebook - presumably used when the edge subgraph
# is built elsewhere; confirm before removing.
SUBGRAPH_K = 20
CENTRAL_NODE_ID = 2389982923 # Some popular node in the city centre of Bucharest
TRAIN_RATIO = 1/2

In [None]:
# Fetch the pickled collection of edges of interest from S3 and load it.
S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/unique_edges.pickle", "unique_edges.pickle")
# NOTE(review): pickle.load can execute arbitrary code from a tampered file;
# this is only safe while the bucket contents are trusted.
with open("unique_edges.pickle", "rb") as f:
    UNIQUE_EDGES = pickle.load(f)
# Display how many edges were loaded.
len(UNIQUE_EDGES)

In [None]:
def compute_adjacency_matrix(unique_edges=None):
    """Build the edge-to-edge adjacency matrix of the road graph.

    Two edges are adjacent when they share at least one node. Every non-empty
    edge is therefore adjacent to itself, so the diagonal is set as well.

    Args:
        unique_edges: Sequence of edges, each an iterable of hashable node
            ids (e.g. ``(start_node, end_node)``). Defaults to the global
            ``UNIQUE_EDGES``.

    Returns:
        A tuple ``(adjacency_matrix, edge_index)`` where ``adjacency_matrix``
        is a ``float32`` array of shape ``(n_edges, n_edges)`` holding 0/1 and
        ``edge_index`` is the ``(rows, cols)`` tuple of its non-zero positions.
    """
    if unique_edges is None:
        unique_edges = UNIQUE_EDGES

    n_edges = len(unique_edges)
    adjacency_matrix = np.zeros((n_edges, n_edges), dtype=np.float32)

    # Group edge positions by the nodes they touch; this replaces the
    # pairwise Python loop that built two sets for every pair of edges.
    node_to_edge_idxs = {}
    for idx, edge in enumerate(unique_edges):
        for node in set(edge):
            node_to_edge_idxs.setdefault(node, []).append(idx)

    # All edges incident to the same node are mutually adjacent.
    for edge_idxs in node_to_edge_idxs.values():
        idxs = np.asarray(edge_idxs)
        adjacency_matrix[np.ix_(idxs, idxs)] = 1

    edge_index = (adjacency_matrix > 0).nonzero()
    return adjacency_matrix, edge_index

# Visualisation code

In [None]:
def plot_random_edge_and_neighbours_time_series(speeds_df, ys, dataset, model, nodes):
    """Plot the time series of a randomly chosen edge and of its neighbours.

    Picks one edge at random from ``UNIQUE_EDGES`` and delegates to
    ``plot_edge_and_neighbours_time_series`` so that both entry points share
    one implementation instead of two copies of the same body.

    Args:
        speeds_df, ys, dataset, model, nodes: Forwarded unchanged to
            ``plot_edge_and_neighbours_time_series``.

    Returns:
        Whatever ``plot_edge_and_neighbours_time_series`` returns (the result
        of ``plot_edges`` for the edge and its neighbours).
    """
    edge = random.choice(UNIQUE_EDGES)
    return plot_edge_and_neighbours_time_series(edge, speeds_df, ys, dataset, model, nodes)
    

def plot_edge_and_neighbours_time_series(edge, speeds_df, ys, dataset, model, nodes):
    """Plot the time series of ``edge`` and of every edge adjacent to it.

    Args:
        edge: Edge key present in ``EDGE_IDX_MAP``.
        speeds_df, ys, dataset, model: Forwarded to ``plot_edge_time_series``.
        nodes: Forwarded to ``plot_edges``.

    Returns:
        The value returned by ``plot_edges`` for the edge and its neighbours.
    """
    # Non-zero entries of the edge's adjacency row are its neighbours.
    neighbours = [IDX_EDGE_MAP[idx] for idx in np.nonzero(ADJACENCY_MATRIX[EDGE_IDX_MAP[edge]])[0]]
    # The adjacency row includes the edge itself; list.remove raises ValueError if it does not.
    neighbours.remove(edge)
    for e in [edge] + neighbours:
        # NOTE(review): plot_edge_time_series is defined in this notebook as
        # (predictions_df, model_name); this five-argument call does not match
        # that signature and looks stale - confirm the intended arguments.
        plot_edge_time_series(e, speeds_df, ys, dataset, model)
    return plot_edges(nodes, [edge] + neighbours)
    

def plot_edge_time_series(predictions_df, model_name):
    """Plot ground truth, model predictions and naive predictions for one edge.

    Args:
        predictions_df: Rows of a single edge with the columns
            ``minute_bucket``, ``speed_kmh`` and ``preds``, plus either an
            ``edge`` column or ``start_node``/``end_node`` columns. The edge
            is read from the first row.
        model_name: Legend and dropdown label of the ``preds`` trace.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    if "edge" in predictions_df.columns:
        edge = predictions_df.edge.iloc[0]
    else:
        # The frames returned by apply_lin_reg_to_edge only carry the node
        # ids, so rebuild the (start_node, end_node) key from them.
        edge = (int(predictions_df.start_node.iloc[0]), int(predictions_df.end_node.iloc[0]))

    fig = go.Figure()

    # Trace order matters: the dropdown masks below address traces by index
    # (0 = ground truth, 1 = model, 2 = naive).
    fig.add_trace(
        go.Scatter(
        x=predictions_df.minute_bucket,
        y=predictions_df.speed_kmh,
        mode='markers',
        name='Ground Truth'
    ))
    fig.add_trace(
        go.Scatter(
        x=predictions_df.minute_bucket,
        y=predictions_df.preds,
        mode='markers',
        name=model_name
    ))
    fig.add_trace(go.Scatter(
        x=DATASET_DATE_RANGE,
        y=[edge_time_naive(edge, ts) for ts in DATASET_DATE_RANGE],
        mode='markers',
        name='Naive predictions'
    ))
    # fig.add_trace(go.Scatter(
    #     x=DATASET_DATE_RANGE,
    #     y=[rolling_edge_time_avg_naive(edge, ts) for ts in DATASET_DATE_RANGE],
    #     mode='markers',
    #     name='Rolling edge-time naive predictions'
    # ))

    # Dropdown that shows a single trace or all of them. Each "visible" mask
    # has one entry per trace, in the order the traces were added.
    fig.update_layout(
        title=f"Time series for edge {edge}",
        title_x=0.5,
        xaxis=dict(
            title="Time [15-minute bucket]"
        ),
        yaxis=dict(
            title="Speed [km/h]"
        ),
        updatemenus=[
            dict(
                buttons=list([
                    dict(label="Ground Truth",
                        method="update",
                        args=[{"visible": [True, False, False]},
                            {"title": "Trace 1"}]),
                    dict(label=model_name,
                        method="update",
                        args=[{"visible": [False, True, False]},
                            {"title": "Trace 2"}]),
                    dict(label="Naive predictions",
                        method="update",
                        args=[{"visible": [False, False, True]},
                            {"title": "Trace 4"}]),
                    # dict(label="Rolling edge-time naive predictions",
                    #     method="update",
                    #     args=[{"visible": [False, True, False]},
                    #         {"title": "Trace 5"}]),
                    dict(label="All",
                        method="update",
                        args=[{"visible": [True, True, True]},
                            {"title": "All Traces"}])
                ]),
                direction="down",
                showactive=True,
                x=0.1,
                xanchor="left",
                y=1.1,
                yanchor="top"
            ),
        ]
    )

    # Horizontal legend above the plot area.
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    fig.show()



def plot_edges(nodes, edges):
    """Draw the given edges and their end nodes on an interactive map.

    Args:
        nodes: DataFrame with ``id``, ``lat`` and ``lon`` columns; every node
            id used by ``edges`` must be present.
        edges: Re-iterable collection of ``(u, v)`` node-id pairs.

    Returns:
        The ``folium.Map`` with one blue line per edge and one red marker per
        distinct node.

    Raises:
        ImportError: If the optional ``folium`` package is not installed.
    """
    # folium is not imported at the top of the notebook (the import is
    # commented out), so load it here instead of failing with a NameError.
    try:
        import folium
    except ImportError as err:
        raise ImportError(
            "plot_edges requires the optional 'folium' package (pip install folium)"
        ) from err

    def _lat_lon(node_id):
        # First matching row wins, as a (lat, lon) Series.
        return nodes[nodes["id"] == node_id][["lat", "lon"]].iloc[0]

    m = folium.Map(location=[44.435608, 26.102297], zoom_start=15)

    # Nodes shared by several edges get a single marker (order preserved).
    node_ids = list(dict.fromkeys(n for edge in edges for n in edge))

    # Add edges to the map
    for u, v in edges:
        x0, y0 = _lat_lon(u)
        x1, y1 = _lat_lon(v)
        folium.PolyLine(locations=[(x0, y0), (x1, y1)], color='blue', weight=5, tooltip=f"{u, v}").add_to(m)

    # Add nodes to the map
    for node in node_ids:
        x, y = _lat_lon(node)
        folium.CircleMarker(location=(x, y), radius=5, color='red', fill=True, fill_color='red').add_to(m)

    return m

# Data preprocessing

In [None]:
def extract_city_graph():
    """Load the city road network and the speed observations of the edges of interest.

    Downloads the OSM extract from S3 (on every call) and the speed parquet
    (only when it is not already on disk), prints a few dataset statistics,
    shows a histogram of the speeds and keeps only rows whose
    ``(start_node, end_node)`` pair is in ``UNIQUE_EDGES``.

    Returns:
        A tuple ``(speeds_df, nodes, edges)``:
        ``speeds_df`` is the filtered speed data with added ``edge``, ``day``,
        ``hour`` and ``minute`` columns, sorted by edge and ``minute_bucket``;
        ``nodes`` and ``edges`` are the pyrosm network frames, ``edges``
        carrying an added ``edge`` column of ``(u, v)`` tuples.
    """
    # MAP_FILE is the same "<CITY_ID>-latest.osm.pbf" name, defined once.
    S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{MAP_FILE}", "bucharest.pbf")

    osm = pyrosm.OSM("bucharest.pbf")
    nodes, edges = osm.get_network(nodes=True, network_type="driving+service")
    edges["edge"] = list(zip(edges.u, edges.v))
    print(f"Unique OSM nodes: {nodes.id.nunique()}, unique OSM edges: {edges.id.nunique()}")

    if not os.path.isfile(S3_FILENAME):
        S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{S3_DATA}/{S3_FILENAME}", S3_FILENAME)

    speeds_df = pd.read_parquet(S3_FILENAME)

    print(f"Dataset time boundaries: {speeds_df.minute_bucket.min(), speeds_df.minute_bucket.max()}")
    print(f"Initial dataset shape: {speeds_df.shape}")

    speeds_df.speed_kmh.hist()
    plt.title("Dataset speed distribution")
    plt.xlabel("Speed [km/h]")
    plt.ylabel("Count")
    plt.show()

    speeds_df["edge"] = list(zip(speeds_df.start_node, speeds_df.end_node))

    # Take an explicit copy: the columns added below would otherwise be
    # written to a slice of the full frame (SettingWithCopyWarning).
    speeds_df = speeds_df[speeds_df.edge.isin(UNIQUE_EDGES)].copy()

    print(f"Dataset shape after filtering edges of interest: {speeds_df.shape}")

    # Calendar features derived from the bucket timestamp.
    speeds_df["day"] = speeds_df.minute_bucket.dt.weekday
    speeds_df["hour"] = speeds_df.minute_bucket.dt.hour
    speeds_df["minute"] = speeds_df.minute_bucket.dt.minute
    speeds_df.sort_values(["edge", "minute_bucket"], inplace=True)

    return speeds_df, nodes, edges

# Experiments

In [None]:
# Load the speed data restricted to UNIQUE_EDGES together with the OSM nodes and edges.
subgraph_speeds_df, nodes, edges = extract_city_graph()

In [None]:
# Preview the first rows of the filtered speed data.
subgraph_speeds_df.head()

In [None]:
# Time span of the speed data, as a regular grid of 15-minute buckets.
DATASET_START_DATE = subgraph_speeds_df["minute_bucket"].min()
DATASET_END_DATE = subgraph_speeds_df["minute_bucket"].max()
DATASET_DATE_RANGE = pd.date_range(DATASET_START_DATE, DATASET_END_DATE, freq="15min")
# Maps every bucket timestamp (index) to its position in the grid ("index" column).
DATASET_RANGE_DF = (
    pd.DataFrame(DATASET_DATE_RANGE, columns=["minute_bucket"])
    .reset_index()
    .set_index("minute_bucket")
)

# Two-way lookup between an edge and its position in UNIQUE_EDGES.
EDGE_IDX_MAP = {edge: position for position, edge in enumerate(UNIQUE_EDGES)}
IDX_EDGE_MAP = dict(enumerate(UNIQUE_EDGES))

# Every column whose name contains "lag" is treated as a speed feature.
SPEED_FEATURES = [column for column in subgraph_speeds_df.columns if "lag" in column]

# Baseline statistics are computed on the training window only.
TRAINING_SPAN = pd.Timedelta(N_WEEKS_TRAINING, 'W')
VALIDATION_SPAN = pd.Timedelta(N_WEEKS_VALIDATION, 'W')
train_subgraph_speeds_df = subgraph_speeds_df[
    subgraph_speeds_df.minute_bucket < DATASET_START_DATE + TRAINING_SPAN
]
MEAN_SPEED = train_subgraph_speeds_df.speed_kmh.mean()
EDGE_AVG_DICT = train_subgraph_speeds_df[["speed_kmh", "edge"]].groupby("edge").mean().astype(int).to_dict()["speed_kmh"]
EDGE_15_MIN_BUCKET_DICT = (
    train_subgraph_speeds_df
    .groupby(["edge", "day", "hour", "minute"])["speed_kmh"]
    .mean()
    .to_dict()
)

ADJACENCY_MATRIX, EDGE_INDEX = compute_adjacency_matrix()

# Consecutive, non-overlapping bucket ranges: each range starts right after
# the last bucket of the previous one.
TRAIN_DATE_RANGE = pd.date_range(
    DATASET_START_DATE, DATASET_START_DATE + TRAINING_SPAN, freq="15min", inclusive="left"
)
LAST_TRAIN_BUCKET = TRAIN_DATE_RANGE[-1]
VALID_DATE_RANGE = pd.date_range(
    LAST_TRAIN_BUCKET, LAST_TRAIN_BUCKET + VALIDATION_SPAN, freq="15min", inclusive="right"
)
LAST_VALID_BUCKET = VALID_DATE_RANGE[-1]
# The test range reuses the validation length (N_WEEKS_VALIDATION).
TEST_DATE_RANGE = pd.date_range(
    LAST_VALID_BUCKET, LAST_VALID_BUCKET + VALIDATION_SPAN, freq="15min", inclusive="right"
)

# Linear regression on all edges

In [None]:
def extract_edge_neighbour_df(speeds_df, edge):
    """Collect the lag features of an edge and of its adjacent edges per time bucket.

    Args:
        speeds_df: Speed data with ``edge``, ``minute_bucket``, ``speed_kmh``,
            ``start_node``, ``end_node`` and the ``SPEED_FEATURES`` columns.
        edge: Edge key present in ``EDGE_IDX_MAP``.

    Returns:
        DataFrame indexed by ``minute_bucket`` with the edge's own
        ``SPEED_FEATURES``, ``speed_kmh``, ``start_node`` and ``end_node``,
        plus the ``SPEED_FEATURES`` of every neighbour suffixed with
        ``_neighbour_<i>``. Only buckets where the edge itself has a speed
        are kept.
    """
    # Neighbours in adjacency-matrix order, excluding the edge itself (the
    # diagonal of the adjacency matrix is set). The previous
    # `set.difference(edge)` iterated the tuple's node ids and therefore
    # never removed the edge, so its own lags were joined again as a
    # "neighbour".
    adjacent_idxs = np.nonzero(ADJACENCY_MATRIX[EDGE_IDX_MAP[edge]])[0]
    neighbours = [IDX_EDGE_MAP[idx] for idx in adjacent_idxs if IDX_EDGE_MAP[idx] != edge]

    edge_neighbour_df = speeds_df[speeds_df.edge == edge][SPEED_FEATURES + ["speed_kmh", "start_node", "end_node", "minute_bucket"]].set_index("minute_bucket")
    for i, neighbour in enumerate(neighbours):
        neighbour_df = speeds_df[speeds_df.edge == neighbour][SPEED_FEATURES + ["minute_bucket"]].set_index("minute_bucket")
        edge_neighbour_df = edge_neighbour_df.join(neighbour_df, how="outer", rsuffix=f"_neighbour_{i}")

    # The outer joins add buckets that only neighbours have; drop those rows.
    edge_neighbour_df.dropna(subset="speed_kmh", axis=0, inplace=True)
    # Node ids turn into floats when the joins introduce NaN rows.
    edge_neighbour_df.start_node = edge_neighbour_df.start_node.astype(int)
    edge_neighbour_df.end_node = edge_neighbour_df.end_node.astype(int)
    return edge_neighbour_df

In [None]:
def apply_lin_reg_to_edge(speeds_df, edge):
    """Fit a Ridge regression for one edge and predict its speed on every split.

    Features are the standardised lag columns of the edge and of its
    neighbours plus one-hot encoded weekday/hour/minute of the time bucket.
    The data is split chronologically: the first ``N_WEEKS_TRAINING`` weeks
    for training, the next ``N_WEEKS_VALIDATION`` weeks for validation and
    everything after that for testing.

    Args:
        speeds_df: Speed data accepted by ``extract_edge_neighbour_df``.
        edge: Edge key present in ``EDGE_IDX_MAP``.

    Returns:
        ``None`` when the edge has no rows in one of the splits or no usable
        lag feature in the training window. Otherwise a tuple
        ``(train_preds_df, valid_preds_df, test_preds_df)``; every frame holds
        ``start_node``, ``end_node``, ``minute_bucket``, ``speed_kmh`` and
        ``preds``.
    """
    edge_and_neighbour_df = extract_edge_neighbour_df(speeds_df, edge).reset_index()
    if edge_and_neighbour_df.empty:
        # No observation at all: report it like the other "no data" cases
        # instead of failing inside the encoder.
        return None

    edge_and_neighbour_df["weekday"] = edge_and_neighbour_df.minute_bucket.dt.weekday
    edge_and_neighbour_df["hour"] = edge_and_neighbour_df.minute_bucket.dt.hour
    edge_and_neighbour_df["minute"] = edge_and_neighbour_df.minute_bucket.dt.minute

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later
    # removed; support both spellings.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoded = encoder.fit_transform(edge_and_neighbour_df[["weekday", "hour", "minute"]])
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(["weekday", "hour", "minute"]))
    edge_and_neighbour_df = pd.concat([edge_and_neighbour_df.drop(["weekday", "hour", "minute"], axis=1), encoded_df], axis=1)

    train_end_date = DATASET_START_DATE + pd.Timedelta(N_WEEKS_TRAINING, unit='W')
    train_df = edge_and_neighbour_df[edge_and_neighbour_df.minute_bucket < train_end_date].copy()
    # Columns with no value in the training window cannot be learnt from;
    # remaining gaps are filled with the global training mean speed.
    train_df.dropna(axis=1, how="all", inplace=True)
    train_df.fillna(MEAN_SPEED, inplace=True)

    if train_df.shape[0] == 0:
        return None

    valid_end_date = train_end_date + pd.Timedelta(N_WEEKS_VALIDATION, unit='W')
    valid_df = edge_and_neighbour_df.loc[(edge_and_neighbour_df.minute_bucket >= train_end_date) & (edge_and_neighbour_df.minute_bucket < valid_end_date), train_df.columns].copy()
    if valid_df.shape[0] == 0:
        return None
    valid_df.fillna(MEAN_SPEED, inplace=True)

    test_df = edge_and_neighbour_df.loc[edge_and_neighbour_df.minute_bucket >= valid_end_date, train_df.columns].copy()
    if test_df.shape[0] == 0:
        return None
    test_df.fillna(MEAN_SPEED, inplace=True)

    # Only the lag columns are standardised. The "minute" prefix also matches
    # `minute_bucket`, which keeps the timestamp out of the scaler.
    scaler = StandardScaler()
    drop_cols = ["speed_kmh", "start_node", "end_node"] + [col for col in train_df.columns if col.startswith(("weekday", "hour", "minute"))]
    X = train_df.drop(drop_cols, axis=1)

    if X.shape[1] == 0:
        return None

    train_normalized = scaler.fit_transform(X)

    # The scaled arrays get a fresh 0..n-1 index, so the pass-through columns
    # are re-indexed the same way before concatenating (train included, so it
    # no longer depends on the training rows being the first rows).
    train_normalized_df = pd.DataFrame(train_normalized, columns=X.columns)
    train_normalized_df = pd.concat([train_normalized_df, train_df[drop_cols].reset_index(drop=True)], axis=1)

    valid_normalized_df = pd.DataFrame(scaler.transform(valid_df.drop(drop_cols, axis=1)), columns=X.columns)
    valid_normalized_df = pd.concat([valid_normalized_df, valid_df[drop_cols].reset_index(drop=True)], axis=1)

    test_normalized_df = pd.DataFrame(scaler.transform(test_df.drop(drop_cols, axis=1)), columns=X.columns)
    test_normalized_df = pd.concat([test_normalized_df, test_df[drop_cols].reset_index(drop=True)], axis=1)

    lin_reg = Ridge()
    non_features = ["start_node", "end_node", "minute_bucket", "speed_kmh"]
    lin_reg.fit(train_normalized_df.drop(non_features, axis=1), train_normalized_df.speed_kmh)

    # Predictions are written back positionally onto the unscaled frames.
    for df, normalized_df in zip([train_df, valid_df, test_df], [train_normalized_df, valid_normalized_df, test_normalized_df]):
        df["preds"] = lin_reg.predict(normalized_df.drop(non_features, axis=1))

    train_preds_df = train_df[non_features + ["preds"]].reset_index(drop=True)
    valid_preds_df = valid_df[non_features + ["preds"]].reset_index(drop=True)
    test_preds_df = test_df[non_features + ["preds"]].reset_index(drop=True)

    return train_preds_df, valid_preds_df, test_preds_df

In [None]:
# Fit one model per edge; edges for which no model could be fitted are
# recorded in `skipped_edges`, the rest contribute one frame per split.
skipped_edges = []
train_dfs, valid_dfs, test_dfs = [], [], []
for edge in tqdm(UNIQUE_EDGES):
    dfs = apply_lin_reg_to_edge(subgraph_speeds_df, edge)
    if dfs is None:
        skipped_edges.append(edge)
        continue
    for collected, split_df in zip((train_dfs, valid_dfs, test_dfs), (dfs[0], dfs[1], dfs[2])):
        collected.append(split_df)

# Stack the per-edge predictions of each split into one frame.
train_df, valid_df, test_df = (
    pd.concat(train_dfs),
    pd.concat(valid_dfs),
    pd.concat(test_dfs),
)

In [None]:
# Preview the stacked training predictions.
train_df.head()

In [None]:
# Number of edges for which apply_lin_reg_to_edge returned None.
len(skipped_edges)

In [None]:
# Display the skipped edges themselves.
skipped_edges

In [None]:
# Drop the edges no model could be fitted for. The original order of
# UNIQUE_EDGES is preserved (a set round-trip would scramble it), because
# edge positions are used as indices elsewhere (EDGE_IDX_MAP, adjacency
# matrix) and the list is persisted for later runs. Duplicates are removed,
# as the set-based version did.
# NOTE: EDGE_IDX_MAP, IDX_EDGE_MAP and ADJACENCY_MATRIX were built from the
# unfiltered list and are not recomputed here.
_skipped_edges = set(skipped_edges)
UNIQUE_EDGES = [edge for edge in dict.fromkeys(UNIQUE_EDGES) if edge not in _skipped_edges]
with open("unique_edges.pickle", "wb") as f:
    pickle.dump(UNIQUE_EDGES, f)

In [None]:
# Overwrites the S3 object the edge list was originally downloaded from with the filtered list.
S3.upload_file("unique_edges.pickle", S3_BUCKET, f"{S3_SUBDIR}/unique_edges.pickle")

In [None]:
# Row/column counts of the stacked prediction frames per split.
train_df.shape, valid_df.shape, test_df.shape

In [None]:
# Root-mean-squared error of the predictions, per split, over all edges.
for split, df in (("train", train_df), ("valid", valid_df), ("test", test_df)):
    rmse = np.sqrt(mean_squared_error(df.speed_kmh, df.preds))
    print(split, rmse)

In [None]:
# Persist the predictions of every split locally and upload them to S3 under
# a folder named after the number of edges and weeks of this run.
predictions_prefix = f"{S3_SUBDIR}/model_predictions/ridge_{len(UNIQUE_EDGES)}_edges_{N_WEEKS}_weeks"
for split, df in (("train", train_df), ("valid", valid_df), ("test", test_df)):
    # Node ids are stored as integers.
    for node_col in ("start_node", "end_node"):
        df[node_col] = df[node_col].astype(int)
    local_path = f"{split}.parquet"
    df.to_parquet(local_path)
    S3.upload_file(local_path, S3_BUCKET, f"{predictions_prefix}/{split}.parquet")

# Regression on one edge

In [None]:
# apply_lin_reg_to_edge returns the (train, valid, test) prediction frames, or
# None when the edge has too little data - it does not return errors, so the
# per-split mean squared errors are computed here.
dfs = apply_lin_reg_to_edge(subgraph_speeds_df, (248729663, 9890593730))
if dfs is None:
    raise ValueError("apply_lin_reg_to_edge returned no predictions for this edge")
mses = [mean_squared_error(split_df.speed_kmh, split_df.preds) for split_df in dfs]

In [None]:
# Display the `mses` value assigned by the single-edge fit.
mses

In [None]:
# Preview the first frame in `dfs`.
dfs[0].head()

In [None]:
# The prediction frames carry start_node/end_node but no "edge" column, which
# plot_edge_time_series reads, so rebuild it before plotting.
single_edge_preds_df = pd.concat(dfs).reset_index()
single_edge_preds_df["edge"] = list(zip(single_edge_preds_df.start_node, single_edge_preds_df.end_node))
plot_edge_time_series(single_edge_preds_df, "Ridge regression")

In [None]:
# Fit and plot in one cell. apply_lin_reg_to_edge returns the (train, valid,
# test) prediction frames or None, so the errors are computed here.
dfs = apply_lin_reg_to_edge(subgraph_speeds_df, (248729663, 9890593730))
if dfs is None:
    raise ValueError("apply_lin_reg_to_edge returned no predictions for this edge")
mses = [mean_squared_error(split_df.speed_kmh, split_df.preds) for split_df in dfs]
# plot_edge_time_series reads an "edge" column the prediction frames lack.
single_edge_preds_df = pd.concat(dfs).reset_index()
single_edge_preds_df["edge"] = list(zip(single_edge_preds_df.start_node, single_edge_preds_df.end_node))
plot_edge_time_series(single_edge_preds_df, "Ridge regression")

In [None]:
# Preview the first frame in `dfs`.
dfs[0].head()

In [None]:
# Stack the frames in `dfs` into one frame for inspection.
pd.concat(dfs)

In [None]:
def apply_lin_reg_to_a_random_edge(subgraph_speeds_df, k):
    """Fit and plot the per-edge Ridge regression for ``k`` random edges.

    For every sampled edge the mean squared error of each split
    (train, valid, test - in that order) is printed and the predictions are
    plotted. Edges for which no model can be fitted are reported and skipped.

    Args:
        subgraph_speeds_df: Speed data passed on to ``apply_lin_reg_to_edge``.
        k: Number of distinct edges to sample from ``UNIQUE_EDGES``.

    Raises:
        ValueError: If ``k`` exceeds the number of edges (from ``random.sample``).
    """
    for edge in random.sample(UNIQUE_EDGES, k=k):
        # The fit returns the three prediction frames or None - never errors.
        dfs = apply_lin_reg_to_edge(subgraph_speeds_df, edge)
        if dfs is None:
            print(f"Skipping edge {edge}: not enough data to fit a model")
            continue
        mses = [mean_squared_error(split_df.speed_kmh, split_df.preds) for split_df in dfs]
        print(mses)
        df = pd.concat(dfs).reset_index()
        # plot_edge_time_series reads the edge key from an "edge" column.
        df["edge"] = list(zip(df.start_node, df.end_node))
        plot_edge_time_series(df, "Ridge regression, edge")

In [None]:
# Fit and plot the regression for five randomly sampled edges.
apply_lin_reg_to_a_random_edge(subgraph_speeds_df, 5)