# Import libraries and define globals

In [None]:
%pip install lightgbm pyrosm networkx tqdm contextily folium

In [None]:
from collections import defaultdict
import os
import pickle
import random

import boto3
import folium
import matplotlib.pyplot as plt
import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import pyrosm
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

In [None]:
import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)

In [None]:
# Identifier of the city; used to build the name of the OSM extract.
CITY_ID = 1_000_000
MAP_FILE = f"{CITY_ID}-latest.osm.pbf"
# Name of the target column handed to LightGBM as the label.
LABEL = "speed_kmh"
# Shared S3 client plus the bucket/key layout used by the cells below.
S3 = boto3.client('s3')
S3_BUCKET = "some_bucket"
S3_SUBDIR = "subdir_path"  # plain string: there is nothing to interpolate
S3_DATA = "data_path"
S3_PREDS = f"{S3_SUBDIR}/model_predictions"
S3_FILENAME = "edge_time_aggregated_4_lags.parquet"
# Split sizes, in weeks. NOTE(review): N_WEEKS is presumably the total span
# (training + validation + test) - confirm against the dataset.
N_WEEKS = 7
N_WEEKS_TRAINING = 5
N_WEEKS_VALIDATION = 1
DATA_SPLITS = ["train", "valid", "test"]

In [None]:
# Fetch the pickled collection of edges of interest and load it into UNIQUE_EDGES.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# use it on objects from a bucket you trust.
S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/unique_edges.pickle", "unique_edges.pickle")
with open("unique_edges.pickle", "rb") as f:
    UNIQUE_EDGES = pickle.load(f)
# Display the number of edges as the cell output.
len(UNIQUE_EDGES)

In [None]:
def compute_adjacency_matrix(unique_edges=None):
    """Build the edge-to-edge adjacency matrix of the road graph.

    Two edges are adjacent when they share at least one node, so every edge
    that has at least one node is adjacent to itself (the diagonal is 1).

    Args:
        unique_edges: Sequence of edges, each an iterable of hashable node
            ids. Defaults to the module-level ``UNIQUE_EDGES``.

    Returns:
        A tuple ``(adjacency_matrix, edge_index)``: a float32 array of shape
        ``(n_edges, n_edges)`` holding 0/1, and the tuple of row/column index
        arrays of its non-zero entries as returned by ``ndarray.nonzero()``.
    """
    if unique_edges is None:
        unique_edges = UNIQUE_EDGES

    n_edges = len(unique_edges)
    # Allocate float32 directly instead of building a float64 matrix and
    # copying it afterwards.
    adjacency_matrix = np.zeros((n_edges, n_edges), dtype=np.float32)

    # Group edge indices by the nodes they touch; all edges meeting at a node
    # are pairwise adjacent. This avoids comparing every pair of edges.
    edges_by_node = defaultdict(list)
    for idx, edge in enumerate(unique_edges):
        for node in set(edge):
            edges_by_node[node].append(idx)

    for indices in edges_by_node.values():
        adjacency_matrix[np.ix_(indices, indices)] = 1

    edge_index = adjacency_matrix.nonzero()
    return adjacency_matrix, edge_index

# Data imputation methods

In [None]:
def expand_edge_time_series(edge_df):
    """Spread one edge's rows over the full dataset time grid.

    The frame is indexed by ``minute_bucket`` and right-joined against
    ``DATASET_RANGE_DF`` so that every bucket of the grid gets a row. The
    ``edge`` column is then filled forward and backward so the inserted rows
    carry the edge id.

    Returns:
        The expanded frame with ``minute_bucket`` back as a regular column.
    """
    by_bucket = edge_df.reset_index().set_index("minute_bucket")
    # Both sides carry a helper "index" column; the left one gets the 'l'
    # suffix in the join, and neither is needed afterwards.
    on_grid = by_bucket.join(DATASET_RANGE_DF, how="right", lsuffix='l')
    on_grid = on_grid.drop(["index", "indexl"], axis=1)
    on_grid["edge"] = on_grid.edge.ffill().bfill()
    return on_grid.reset_index()

# Visualisation code

In [None]:
def plot_random_edge_and_neighbours_time_series(speeds_df, ys, dataset, model, nodes):
    """Plot the time series of a randomly chosen edge and of its neighbours.

    Picks one edge from ``UNIQUE_EDGES`` with ``random.choice`` and delegates
    to ``plot_edge_and_neighbours_time_series`` so the two entry points cannot
    drift apart.

    Returns:
        Whatever ``plot_edge_and_neighbours_time_series`` returns (the map
        produced by ``plot_edges``).
    """
    edge = random.choice(UNIQUE_EDGES)
    return plot_edge_and_neighbours_time_series(edge, speeds_df, ys, dataset, model, nodes)
    

def plot_edge_and_neighbours_time_series(edge, speeds_df, ys, dataset, model, nodes):
    """Plot the time series of ``edge`` and of every edge adjacent to it.

    Neighbours are the edges with a non-zero entry in the ``ADJACENCY_MATRIX``
    row of ``edge``. One time-series figure is shown per edge (the given edge
    first), then all of them are drawn on a map.

    Returns:
        The map returned by ``plot_edges``.
    """
    adjacency_row = ADJACENCY_MATRIX[EDGE_IDX_MAP[edge]]
    neighbours = [IDX_EDGE_MAP[idx] for idx in np.nonzero(adjacency_row)[0]]
    # The row also flags the edge itself; keep it only once, at the front.
    neighbours.remove(edge)
    edges_to_plot = [edge] + neighbours
    for current in edges_to_plot:
        plot_edge_time_series(current, speeds_df, ys, dataset, model)
    return plot_edges(nodes, edges_to_plot)
    

def plot_edge_time_series(edge, speeds_df, ys, dataset, model):
    """Show a Plotly figure comparing speed series for a single edge.

    Four marker traces are drawn: the observed speeds from ``speeds_df``, the
    values read from ``ys`` at the edge's index, the output of
    ``model_predict`` and the output of ``edge_time_naive``. The last three
    are plotted against ``DATASET_DATE_RANGE``.

    NOTE(review): ``model_predict`` and ``edge_time_naive`` must already be
    defined in the notebook namespace; ``ys`` is presumably one per-edge
    vector per time bucket - confirm against the caller.
    """
    is_edge = speeds_df.edge == edge
    observed = speeds_df.loc[is_edge, ["minute_bucket", "speed_kmh"]].sort_values("minute_bucket")

    # (x values, y values, legend label) for each trace, in drawing order.
    series = [
        (observed.minute_bucket, observed.speed_kmh, 'Ground Truth'),
        (DATASET_DATE_RANGE, [y[EDGE_IDX_MAP[edge]] for y in ys], 'Imputed'),
        (DATASET_DATE_RANGE, model_predict(model, dataset, edge), 'GNN predictions'),
        (DATASET_DATE_RANGE, [edge_time_naive(edge, ts) for ts in DATASET_DATE_RANGE], 'Naive predictions'),
    ]

    fig = go.Figure()
    for xs, values, label in series:
        fig.add_trace(go.Scatter(x=xs, y=values, mode='markers', name=label))

    # Title, axis labels and a horizontal legend anchored above the plot.
    fig.update_layout(
        title=f"Time series for edge {edge}",
        title_x=0.5,
        xaxis=dict(title="Time [15-minute bucket]"),
        yaxis=dict(title="Speed [km/h]"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
    )
    fig.show()



def plot_edges(nodes, edges, location=(44.435608, 26.102297), zoom_start=15):
    """Draw edges and their end nodes on a folium map.

    Args:
        nodes: DataFrame with ``id``, ``lat`` and ``lon`` columns. When an id
            occurs more than once, its first row is used.
        edges: Iterable of ``(u, v)`` node-id pairs.
        location: ``(lat, lon)`` the map is centred on. Defaults to the
            previously hard-coded centre.
        zoom_start: Initial zoom level of the map.

    Returns:
        The ``folium.Map`` with one blue polyline per edge and one red circle
        marker per distinct node.

    Raises:
        IndexError: If an edge references a node id missing from ``nodes``
            (same exception type as the positional lookup used before).
    """
    edges = list(edges)
    # Distinct node ids in first-seen order, so shared nodes get one marker.
    node_ids = list(dict.fromkeys(n for edge in edges for n in edge))

    # Build the id -> (lat, lon) table once instead of scanning the frame for
    # every lookup.
    wanted = nodes[nodes["id"].isin(node_ids)]
    coords = {}
    for node_id, lat, lon in zip(wanted["id"], wanted["lat"], wanted["lon"]):
        coords.setdefault(node_id, (lat, lon))

    missing = [n for n in node_ids if n not in coords]
    if missing:
        raise IndexError(f"Node ids not found in nodes: {missing}")

    m = folium.Map(location=list(location), zoom_start=zoom_start)

    # Add edges to the map
    for u, v in edges:
        folium.PolyLine(locations=[coords[u], coords[v]], color='blue', weight=5, tooltip=f"{u, v}").add_to(m)

    # Add nodes to the map
    for node in node_ids:
        folium.CircleMarker(location=coords[node], radius=5, color='red', fill=True, fill_color='red').add_to(m)

    return m

# Data preprocessing

In [None]:
def extract_city_graph():
    """Download the city map and speed data and keep the edges of interest.

    Downloads the OSM extract from S3 to ``bucharest.pbf``, reads its
    ``driving+service`` network with pyrosm, loads the speed parquet
    (downloading it only if it is not already on disk), plots the speed
    histogram and filters the speed rows to the edges in ``UNIQUE_EDGES``.

    Returns:
        A tuple ``(speeds_df, nodes, edges)``: the filtered speed rows with an
        added ``edge`` column of ``(start_node, end_node)`` tuples, and the
        node and edge frames returned by pyrosm (``edges`` gets an ``edge``
        column of ``(u, v)`` tuples).
    """
    S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{CITY_ID}-latest.osm.pbf", "bucharest.pbf")

    osm = pyrosm.OSM("bucharest.pbf")
    nodes, edges = osm.get_network(nodes=True, network_type="driving+service")
    edges["edge"] = list(zip(edges.u, edges.v))
    print(f"Unique OSM nodes: {nodes.id.nunique()}, unique OSM edges: {edges.id.nunique()}")

    if not os.path.isfile(S3_FILENAME):
        S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{S3_DATA}/{S3_FILENAME}", S3_FILENAME)

    speeds_df = pd.read_parquet(S3_FILENAME)

    print(f"Dataset time boundaries: {speeds_df.minute_bucket.min(), speeds_df.minute_bucket.max()}")
    print(f"Initial dataset shape: {speeds_df.shape}")

    speeds_df.speed_kmh.hist()
    plt.title("Dataset speed distribution")
    plt.xlabel("Speed [km/h]")
    plt.ylabel("Count")
    plt.show()

    speeds_df["edge"] = list(zip(speeds_df.start_node, speeds_df.end_node))

    # Return an independent frame: callers add columns to it and sort it in
    # place, which would otherwise act on a filtered selection and trigger
    # pandas' SettingWithCopyWarning.
    speeds_df = speeds_df[speeds_df.edge.isin(UNIQUE_EDGES)].copy()

    print(f"Dataset shape after filtering edges of interest: {speeds_df.shape}")

    return speeds_df, nodes, edges

In [None]:
def load_road_network(map_path):
    """Read the OSM road network and index its static attributes by edge.

    Args:
        map_path: Path of the ``.osm.pbf`` file to read with pyrosm.

    Returns:
        Dict mapping each ``(u, v)`` node-id tuple to a dict of way
        attributes (``highway``, ``way_id``, ``length``, ``surface``,
        ``service``, ``maxspeed``, ``oneway``, ``lit``, ``access``,
        ``road_centroid_lat``, ``road_centroid_lng``). When several rows
        share the same ``(u, v)``, the last one wins.
    """
    osm = pyrosm.OSM(map_path)
    _nodes, edges = osm.get_network(nodes=True, network_type="driving+service")
    edges["edge"] = list(zip(edges["u"], edges["v"]))
    centroids = [g.centroid for g in edges["geometry"]]
    edges["centroid_lng"] = [c.x for c in centroids]
    edges["centroid_lat"] = [c.y for c in centroids]

    # Columns are read with bracket access on purpose: on a GeoDataFrame the
    # attribute ``edges.length`` is the geometry-length property (in CRS
    # units), not the "length" column of the OSM data.
    columns = [
        "edge", "id", "highway", "length", "surface", "service",
        "maxspeed", "oneway", "lit", "access", "centroid_lat", "centroid_lng",
    ]
    rows = zip(*(edges[col] for col in columns))

    edge_dict = {}
    # zip() has no length, so pass the total to let tqdm show real progress.
    for (edge, way_id, highway, length, surface, service, maxspeed, oneway,
         lit, access, centroid_lat, centroid_lng) in tqdm(rows, total=len(edges)):
        edge_dict[edge] = {
            "highway": highway,
            "way_id": way_id,
            "length": length,
            "surface": surface,
            "service": service,
            "maxspeed": clean_maxspeed(maxspeed),
            "oneway": oneway,
            "lit": lit,
            "access": access,
            "road_centroid_lat": centroid_lat,
            "road_centroid_lng": centroid_lng,
        }
    return edge_dict


def clean_maxspeed(maxspeed):
    """Map known non-numeric OSM ``maxspeed`` values to numeric strings.

    Values that are not in the replacement table (including ``None`` and
    already numeric values) are returned unchanged.
    """
    # NOTE(review): the "mph" entries keep the number without converting it
    # to km/h, and the "a|b" entries map to ad-hoc values - confirm both are
    # intended.
    replacements = {
        "BY:urban": "60",
        "BY:rural": "90",
        "BY:living_street": "20",
        "ES:motorway": "120",
        "ES:urban": "30",
        "50|30": "49",
        "50|30|50": "48",
        "50|50|30": "51",
        "30|50": "31",
        "100km/h": "100",
        "RO:urban": "50",
        "RO:rural": "80",
        "50 mph": "50",
        "5 mph": "5",
    }
    return replacements.get(maxspeed, maxspeed)


def get_way_features(df: pd.DataFrame, edge_dict: dict) -> pd.DataFrame:
    """Attach the static way attributes of each row's edge to ``df``.

    Every value of ``df["edge"]`` is looked up in ``edge_dict``; if the edge
    is absent, the reversed tuple is tried. Rows whose edge is found in
    neither direction get ``None`` for every feature.

    Args:
        df: Frame with an ``edge`` column of node-id tuples. It is modified
            in place (one column is added or overwritten per way feature).
        edge_dict: Mapping from edge tuple to a dict holding every way
            feature, as built by ``load_road_network``.

    Returns:
        The same ``df`` object, for chaining.
    """
    way_features = [
        "highway",
        "way_id",
        "length",
        "surface",
        "service",
        "maxspeed",
        "oneway",
        "lit",
        "access",
        "road_centroid_lat",
        "road_centroid_lng",
    ]

    # One shared all-None record for unknown edges, instead of building a
    # fresh defaultdict (and a reversed tuple) for every row.
    missing_record = dict.fromkeys(way_features)

    columns = {name: [] for name in way_features}
    for edge in df["edge"]:
        record = edge_dict.get(edge)
        if record is None:
            # Fall back to the opposite direction only when needed.
            record = edge_dict.get(tuple(reversed(edge)), missing_record)
        for name in way_features:
            columns[name].append(record[name])

    for name in way_features:
        df[name] = columns[name]

    return df

In [None]:
def prepare_lgb_dataset(df, edge_dict):
    """Turn a speed frame into a constructed LightGBM dataset.

    Adds the ``edge`` tuple, calendar features and the static way features
    to ``df`` (in place), integer-encodes the categorical way features and
    builds an ``lgb.Dataset`` labelled with ``LABEL``.

    Categorical codes are derived from the category values present in
    ``edge_dict`` rather than from the rows of ``df``. The same ``edge_dict``
    therefore gives the same integer for a category in the training,
    validation and test datasets. Encoding each split on its own does not
    guarantee that, because the codes depend on which categories happen to
    occur in the split.

    Args:
        df: Frame with ``start_node``, ``end_node``, a datetime
            ``minute_bucket`` column, the ``SPEED_FEATURES`` columns and the
            ``LABEL`` column. It is modified in place.
        edge_dict: Mapping from edge tuple to way attributes, as built by
            ``load_road_network``.

    Returns:
        The constructed ``lgb.Dataset`` (raw data retained).

    Raises:
        ValueError: If a numeric feature holds a value that cannot be cast
            to float.
    """
    df["edge"] = list(zip(df.start_node, df.end_node))
    df["weekday"] = df.minute_bucket.dt.weekday
    df["hour"] = df.minute_bucket.dt.hour
    df["minute"] = df.minute_bucket.dt.minute

    # Static road network features (road surface, road type, etc.)
    df = get_way_features(df, edge_dict)

    # Putting features together. The road centroid coordinates are attached
    # by get_way_features but are not used as model inputs.
    cat_features = ["highway", "surface", "service", "oneway"]
    num_features = ["length", "maxspeed"]

    # Collect every non-missing category value of the road network in a
    # single pass over edge_dict.
    category_values = {feat: set() for feat in cat_features}
    for payload in edge_dict.values():
        for feat in cat_features:
            value = payload[feat]
            if pd.notna(value):
                category_values[feat].add(value)

    cat_feat_list = []
    for feat in cat_features:
        # Sorting gives a stable value -> code assignment; values missing
        # from the category list (including None/NaN) are encoded as -1.
        categories = sorted(category_values[feat], key=str)
        encoded = pd.Categorical(df[feat], categories=categories)
        df[feat + "_cat"] = encoded.codes.astype("int64")
        cat_feat_list.append(feat + "_cat")

    for num_feat in num_features:
        df[num_feat] = df[num_feat].astype('float')

    train_features = ["weekday", "hour", "minute"] + SPEED_FEATURES + num_features + cat_feat_list

    lgb_dataset = lgb.Dataset(
        df[train_features],
        label=df[LABEL],
        categorical_feature=cat_feat_list,
        free_raw_data=False,
    ).construct()
    return lgb_dataset

# Experiments

In [None]:
subgraph_speeds_df, nodes, edges = extract_city_graph()

In [None]:
# Time grid: one entry every 15 minutes between the first and last observed bucket.
DATASET_START_DATE = subgraph_speeds_df.minute_bucket.min()
DATASET_END_DATE = subgraph_speeds_df.minute_bucket.max()
DATASET_DATE_RANGE = pd.date_range(DATASET_START_DATE, DATASET_END_DATE, freq="15min")
# Same grid as a frame indexed by minute_bucket (with a positional "index" column),
# used as the right side of the join in expand_edge_time_series.
DATASET_RANGE_DF = pd.DataFrame(DATASET_DATE_RANGE, columns=["minute_bucket"]).reset_index().set_index("minute_bucket")

# Bidirectional lookup between an edge and its position in UNIQUE_EDGES.
EDGE_IDX_MAP = {edge: i for i, edge in enumerate(UNIQUE_EDGES)}
IDX_EDGE_MAP = {i: edge for i, edge in enumerate(UNIQUE_EDGES)}

# Calendar features; "day" holds the weekday number.
subgraph_speeds_df["day"] = subgraph_speeds_df.minute_bucket.dt.weekday
subgraph_speeds_df["hour"] = subgraph_speeds_df.minute_bucket.dt.hour
subgraph_speeds_df["minute"] = subgraph_speeds_df.minute_bucket.dt.minute

subgraph_speeds_df.sort_values(["edge", "minute_bucket"], inplace=True)

# Every column whose name contains "lag" is treated as a speed feature.
SPEED_FEATURES = [col_name for col_name in subgraph_speeds_df.columns if "lag" in col_name]

# Baseline statistics are computed on the training weeks only.
train_subgraph_speeds_df = subgraph_speeds_df[subgraph_speeds_df.minute_bucket < DATASET_START_DATE + pd.Timedelta(N_WEEKS_TRAINING, 'W')]
MEAN_SPEED = train_subgraph_speeds_df.speed_kmh.mean()
# Per-edge mean speed, cast to int (the fractional part is dropped).
EDGE_AVG_DICT = train_subgraph_speeds_df[["speed_kmh", "edge"]].groupby("edge").mean().astype(int).to_dict()["speed_kmh"]
# Mean speed keyed by (edge, day, hour, minute).
EDGE_15_MIN_BUCKET_DICT = train_subgraph_speeds_df.groupby(["edge", "day", "hour", "minute"])["speed_kmh"].mean().to_dict()

ADJACENCY_MATRIX, EDGE_INDEX = compute_adjacency_matrix()

# Consecutive, non-overlapping bucket ranges: the training range excludes its end
# point, and each following range starts right after the last bucket of the previous one.
# NOTE(review): the test range length also uses N_WEEKS_VALIDATION; N_WEEKS is not
# used here - confirm the test span is meant to equal the validation span.
TRAIN_DATE_RANGE = pd.date_range(DATASET_START_DATE, DATASET_START_DATE + pd.Timedelta(N_WEEKS_TRAINING, 'W'), freq="15min", inclusive="left")
VALID_DATE_RANGE = pd.date_range(TRAIN_DATE_RANGE[-1], TRAIN_DATE_RANGE[-1] + pd.Timedelta(N_WEEKS_VALIDATION, 'W'), freq="15min", inclusive="right")
TEST_DATE_RANGE = pd.date_range(VALID_DATE_RANGE[-1], VALID_DATE_RANGE[-1] + pd.Timedelta(N_WEEKS_VALIDATION, 'W'), freq="15min", inclusive="right")

## LightGBM

In [None]:
# Chronological split on minute_bucket: training up to the last training bucket,
# validation up to the last validation bucket, test for everything after it.
# Explicit copies are taken because prepare_lgb_dataset adds columns to these
# frames in place; on a plain selection that triggers SettingWithCopyWarning.
train_df = subgraph_speeds_df[subgraph_speeds_df.minute_bucket <= TRAIN_DATE_RANGE.max()].copy()
valid_df = subgraph_speeds_df[(subgraph_speeds_df.minute_bucket > TRAIN_DATE_RANGE.max()) & (subgraph_speeds_df.minute_bucket <= VALID_DATE_RANGE.max())].copy()
test_df = subgraph_speeds_df[subgraph_speeds_df.minute_bucket > VALID_DATE_RANGE.max()].copy()

In [None]:
train_df.shape, valid_df.shape, test_df.shape

In [None]:
train_df.head()

In [None]:
edge_dict = load_road_network("bucharest.pbf")

In [None]:
edge_dict

In [None]:
train_lgb_dataset = prepare_lgb_dataset(train_df, edge_dict)
valid_lgb_dataset = prepare_lgb_dataset(valid_df, edge_dict)
test_lgb_dataset = prepare_lgb_dataset(test_df, edge_dict)

In [None]:
train_lgb_dataset.num_feature()

In [None]:
train_lgb_dataset.get_feature_name()

In [None]:
# LightGBM training configuration: gradient-boosted trees for regression,
# evaluated with mean squared error.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    # Upper bound on boosting rounds; training below also uses early stopping.
    'num_iterations': 1000,
    # Also report the metric on the training data.
    'is_training_metric': True,
    'num_leaves': 100,
    'learning_rate': 0.1,
    'verbose': 1,
    'min_data_in_leaf': 100,
    'min_data_in_bin': 100,
}

In [None]:
bst = lgb.train(lgb_params, train_lgb_dataset, valid_sets=valid_lgb_dataset, callbacks=[lgb.early_stopping(300)])

In [None]:
lgb.plot_importance(bst)

## Saving results

In [None]:
# Name the model after the number of edges it was trained on, pickle the
# booster locally and upload it under the models/ prefix.
# NOTE(review): the pickle must only be loaded back from a trusted location,
# since unpickling can execute arbitrary code.
MODEL_NAME = f"lgbm_{len(UNIQUE_EDGES)}_unique_edges"
with open(f"{MODEL_NAME}.pickle", "wb") as file:
    pickle.dump(bst, file)
S3.upload_file(f"{MODEL_NAME}.pickle", S3_BUCKET, f"{S3_SUBDIR}/models/{MODEL_NAME}.pickle")

In [None]:
train_lgb_dataset.get_data().to_parquet(f"{MODEL_NAME}_train.parquet")
S3.upload_file(f"{MODEL_NAME}_train.parquet", S3_BUCKET, f"{S3_SUBDIR}/data/{MODEL_NAME}_train.parquet")

In [None]:
# Predictions on the training split, stored next to the ground truth.
train_preds_df = train_df[["minute_bucket", "start_node", "end_node", "speed_kmh"]].copy()
train_preds_df["preds"] = bst.predict(train_lgb_dataset.get_data())
# Arguments follow the documented (y_true, y_pred) order; the MSE value is
# the same either way, but the call now reads correctly.
mean_squared_error(train_preds_df.speed_kmh, train_preds_df.preds)

In [None]:
train_preds_df.to_parquet(f"{MODEL_NAME}_train_preds.parquet")

In [None]:
S3.upload_file(f"{MODEL_NAME}_train_preds.parquet", S3_BUCKET, f"{S3_PREDS}/lgbm/{MODEL_NAME}_train_preds.parquet")

In [None]:
# Predictions on the validation split: saved locally, uploaded, then scored.
valid_preds_df = valid_df[["minute_bucket", "start_node", "end_node", "speed_kmh"]].copy()
valid_preds_df["preds"] = bst.predict(valid_lgb_dataset.get_data())
valid_preds_df.to_parquet(f"{MODEL_NAME}_valid_preds.parquet")
S3.upload_file(f"{MODEL_NAME}_valid_preds.parquet", S3_BUCKET, f"{S3_PREDS}/lgbm/{MODEL_NAME}_valid_preds.parquet")
# Arguments follow the documented (y_true, y_pred) order; the MSE value is
# the same either way, but the call now reads correctly.
mean_squared_error(valid_preds_df.speed_kmh, valid_preds_df.preds)

In [None]:
# Predictions on the test split: saved locally, uploaded, then scored.
test_preds_df = test_df[["minute_bucket", "start_node", "end_node", "speed_kmh"]].copy()
test_preds_df["preds"] = bst.predict(test_lgb_dataset.get_data())
test_preds_df.to_parquet(f"{MODEL_NAME}_test_preds.parquet")
S3.upload_file(f"{MODEL_NAME}_test_preds.parquet", S3_BUCKET, f"{S3_PREDS}/lgbm/{MODEL_NAME}_test_preds.parquet")
# Arguments follow the documented (y_true, y_pred) order; the MSE value is
# the same either way, but the call now reads correctly.
mean_squared_error(test_preds_df.speed_kmh, test_preds_df.preds)