# Installing libs and defining constants

In [None]:
%pip install tqdm folium

In [None]:
from collections import Counter
import math
import random

import boto3
import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from tqdm import tqdm

In [None]:
CITY_ID = 1_000_000
MAP_FILE = f"{
    
}-latest.osm.pbf"
S3 = boto3.client('s3')
S3_BUCKET = "some_bucket"
S3_SUBDIR = f"subdir_path"
S3_DATA = "data_path"
S3_FILENAME = "edge_time_aggregated_4_lags.parquet"
DATASET_START_DATE = pd.Timestamp(2023, 7, 3, 0, 0, 0)
N_WEEKS = 4
N_WEEKS_TRAINING = 2
N_WEEKS_VALIDATION = 1
DATASET_END_DATE = DATASET_START_DATE + pd.Timedelta(N_WEEKS, 'W')
DATASET_DATE_RANGE = pd.date_range(DATASET_START_DATE, DATASET_END_DATE, freq="15min", closed="left")
DATASET_RANGE_DF = pd.DataFrame(DATASET_DATE_RANGE, columns=["minute_bucket"]).reset_index().set_index("minute_bucket")
SUBGRAPH_K = 20
CENTRAL_NODE_ID = 2389982923 # Some popular node in the city centre of Bucharest
TRAIN_RATIO = 1/2
EPOCHS = 200

In [None]:
def compute_adjacency_matrix():
    """Build the edge-to-edge adjacency matrix of ``UNIQUE_EDGES``.

    Two edges are adjacent when their node tuples have at least one element
    in common.

    Returns:
        A tuple ``(adjacency_matrix, edge_index)``: a square ``float32``
        matrix holding 1 for adjacent edge pairs and 0 otherwise, and the
        ``nonzero()`` index arrays of that matrix.
    """
    node_sets = [set(edge) for edge in UNIQUE_EDGES]
    n_edges = len(node_sets)
    adjacency_matrix = np.zeros((n_edges, n_edges))

    for row, row_nodes in enumerate(node_sets):
        for col, col_nodes in enumerate(node_sets):
            if row_nodes & col_nodes:
                adjacency_matrix[row, col] = adjacency_matrix[col, row] = 1

    adjacency_matrix = adjacency_matrix.astype(np.float32)
    edge_index = np.nonzero(adjacency_matrix > 0)
    return adjacency_matrix, edge_index

# Data imputation code

In [None]:
def fallback_to_past(edge, minute_bucket, fallback_horizon, unit='m'):
    """Look up the speed stored for ``edge`` a given horizon before ``minute_bucket``.

    Returns whatever ``DATASET_DICT`` holds under ``(edge, shifted timestamp)``,
    or ``None`` when that key is absent.
    """
    lookback = pd.Timedelta(fallback_horizon, unit=unit)
    past_bucket = minute_bucket - lookback
    return DATASET_DICT.get((edge, past_bucket))


def neighbour_average(edge, minute_bucket):
    """Return the mean stored speed of the edges adjacent to ``edge`` at ``minute_bucket``.

    Adjacent edges are the non-zero entries of the ``ADJACENCY_MATRIX`` row
    of ``edge``. Neighbours without a value (missing key or NaN) are ignored.

    Returns:
        The mean of the usable neighbour speeds, or NaN when there are none
        (callers test the result with ``math.isnan``).
    """
    neighbour_indices = np.nonzero(ADJACENCY_MATRIX[EDGE_IDX_MAP[edge]])[0]
    neighbour_speeds = []
    for idx in neighbour_indices:
        # Look up the neighbour's own measurement. The previous version looked
        # up `edge` on every iteration and never read a neighbour.
        # NOTE(review): if the matrix has a non-zero diagonal, `edge` itself
        # is among the indices and contributes to the mean.
        speed = DATASET_DICT.get((IDX_EDGE_MAP[idx], minute_bucket))
        if speed is None or math.isnan(speed):
            continue
        neighbour_speeds.append(speed)
    if not neighbour_speeds:
        # np.mean([]) also yields NaN but emits a RuntimeWarning on every call.
        return float("nan")
    return np.mean(neighbour_speeds)


def expand_edge_time_series(edge_df):
    """Re-index one edge's rows onto every bucket of ``DATASET_RANGE_DF``.

    The right join keeps one row per bucket of the full range; the helper
    index columns produced by the join are dropped and the ``edge`` column is
    filled forwards, then backwards. ``minute_bucket`` is returned as a
    regular column.
    """
    by_bucket = edge_df.reset_index().set_index("minute_bucket")
    expanded = by_bucket.join(DATASET_RANGE_DF, how="right", lsuffix='l')
    expanded = expanded.drop(["index", "indexl"], axis=1)
    expanded["edge"] = expanded["edge"].ffill().bfill()
    return expanded.reset_index()
    

def neighbour_based_impute_nan(edge, minute_bucket):
    """Impute the speed of ``edge`` at ``minute_bucket`` from the first usable fallback.

    Fallbacks are tried in order and the position of the one that succeeded is
    returned as the imputation code. With ``FALLBACK_HORIZONS`` as defined in
    this notebook the codes are:
        0. Speed on the same edge 15 minutes ago
        1. Speed on the same edge 30 minutes ago
        2. Speed on the same edge 45 minutes ago
        3. Speed on the same edge 60 minutes ago
        4. Speed on the same edge at the same time 1 week ago
        5. Speed on the same edge at the same time 2 weeks ago
        6. Average neighbour speed 15 minutes ago
        7. Average over all past values before current timestamp for the current edge
        8. Average across all edges 15 minutes ago
        9. Global mean speed

    A candidate is usable when it is neither ``None`` nor NaN. The previous
    version accepted NaN values found in the past and returned them as
    imputed speeds.

    Returns:
        A ``(speed, code)`` tuple.
    """
    quarter_ago = minute_bucket - pd.Timedelta(15, unit='m')

    def candidates():
        # Generator: a fallback is only evaluated once all earlier ones failed.
        for horizon, unit in FALLBACK_HORIZONS:
            yield fallback_to_past(edge, minute_bucket, horizon, unit)
        yield neighbour_average(edge, quarter_ago)
        yield ROLLING_EDGE_TIME_AVG_DICT.get((edge, minute_bucket))
        yield MINUTE_BUCKET_AVG_DICT.get(quarter_ago)

    code = -1
    for code, speed in enumerate(candidates()):
        if speed is not None and not math.isnan(speed):
            return speed, code
    # Every fallback failed: the global mean gets the next free code.
    return MEAN_SPEED, code + 1


def reranked_neighbour_based_impute_nan(edge, minute_bucket):
    """Impute the speed of ``edge`` at ``minute_bucket``, preferring weekly history.

    Fallbacks are tried in order and the position of the one that succeeded is
    returned as the imputation code:
        0. Speed on the same edge at the same time 1 week ago
        1. Speed on the same edge at the same time 2 weeks ago
        2. Average neighbour speed at the current timestamp a week ago
        3. Average neighbour speed at the current timestamp 2 weeks ago
        4. Average over all past values before current timestamp for the current edge
        5. Average across all edges 15 minutes ago
        6. Global mean speed

    A candidate is usable when it is neither ``None`` nor NaN. The previous
    version reported the last two fallbacks with the stale loop index (codes
    1 and 2 instead of 5 and 6) and could return NaN values as imputations.

    Returns:
        A ``(speed, code)`` tuple.
    """
    one_week = pd.Timedelta(1, unit='W')
    two_weeks = pd.Timedelta(2, unit='W')

    def candidates():
        # Generator: a fallback is only evaluated once all earlier ones failed.
        yield fallback_to_past(edge, minute_bucket, 1, 'W')
        yield fallback_to_past(edge, minute_bucket, 2, 'W')
        yield neighbour_average(edge, minute_bucket - one_week)
        yield neighbour_average(edge, minute_bucket - two_weeks)
        yield ROLLING_EDGE_TIME_AVG_DICT.get((edge, minute_bucket))
        yield MINUTE_BUCKET_AVG_DICT.get(minute_bucket - pd.Timedelta(15, unit='m'))

    code = -1
    for code, speed in enumerate(candidates()):
        if speed is not None and not math.isnan(speed):
            return speed, code
    # Every fallback failed: the global mean gets the next free code.
    return MEAN_SPEED, code + 1


def imputation_3(edge, minute_bucket):
    """Impute the speed of ``edge`` at ``minute_bucket`` (third fallback scheme).

    Fallbacks are tried in order and the position of the one that succeeded is
    returned as the imputation code:
        0. Speed on the same edge at the same time 1 week ago
        1. Speed on the same edge at the same time 2 weeks ago
        2. Average neighbour speed at the current timestamp a week ago
        3. Average neighbour speed at the current timestamp 2 weeks ago
        4. Speed on the same edge 15 minutes ago
        5. Average neighbour speed 15 minutes ago
        6. Rolling "1h" window average for the current edge
        7. Average over all past values before current timestamp for the current edge
        8. Average across all edges 15 minutes ago
        9. Global mean speed

    A candidate is usable when it is neither ``None`` nor NaN; the previous
    version could return NaN values found in the past as imputations.

    Returns:
        A ``(speed, code)`` tuple.
    """
    quarter_ago = minute_bucket - pd.Timedelta(15, unit='m')

    def candidates():
        # Generator: a fallback is only evaluated once all earlier ones failed.
        yield fallback_to_past(edge, minute_bucket, 1, 'W')
        yield fallback_to_past(edge, minute_bucket, 2, 'W')
        yield neighbour_average(edge, minute_bucket - pd.Timedelta(1, unit='W'))
        yield neighbour_average(edge, minute_bucket - pd.Timedelta(2, unit='W'))
        yield fallback_to_past(edge, minute_bucket, 15, 'm')
        yield neighbour_average(edge, quarter_ago)
        # The previous name, ROLLING_WINDOW_EDGE_TIME_AVG_DICT, is not defined
        # anywhere in this notebook. The code table labels this step
        # "past_hour_avg", so the 1-hour rolling dictionary is used.
        yield ROLLING_1H_WINDOW_EDGE_TIME_AVG_DICT.get((edge, minute_bucket))
        yield ROLLING_EDGE_TIME_AVG_DICT.get((edge, minute_bucket))
        yield MINUTE_BUCKET_AVG_DICT.get(quarter_ago)

    code = -1
    for code, speed in enumerate(candidates()):
        if speed is not None and not math.isnan(speed):
            return speed, code
    # Every fallback failed: the global mean gets the next free code.
    return MEAN_SPEED, code + 1


def imputation_4(edge, minute_bucket):
    """Impute the speed of ``edge`` at ``minute_bucket`` (fourth fallback scheme).

    Fallbacks are tried in order and the position of the one that succeeded is
    returned as the imputation code:
        0. Speed on the same edge at the same time 1 week ago
        1. Speed on the same edge at the same time 2 weeks ago
        2. Average neighbour speed at the current timestamp a week ago
        3. Average neighbour speed at the current timestamp 2 weeks ago
        4. Speed on the same edge 15 minutes ago
        5. Average neighbour speed 15 minutes ago
        6. Rolling "1h" window average for the current edge
        7. Average over all past values before current timestamp for the current edge
        8. Training-set average for the edge at the same weekday, hour and minute
        9. Training-set average for the edge
        10. Global mean speed

    A candidate is usable when it is neither ``None`` nor NaN; the previous
    version could return NaN values found in the past as imputations.

    Returns:
        A ``(speed, code)`` tuple.
    """
    quarter_ago = minute_bucket - pd.Timedelta(15, unit='m')

    def candidates():
        # Generator: a fallback is only evaluated once all earlier ones failed.
        yield fallback_to_past(edge, minute_bucket, 1, 'W')
        yield fallback_to_past(edge, minute_bucket, 2, 'W')
        yield neighbour_average(edge, minute_bucket - pd.Timedelta(1, unit='W'))
        yield neighbour_average(edge, minute_bucket - pd.Timedelta(2, unit='W'))
        yield fallback_to_past(edge, minute_bucket, 15, 'm')
        yield neighbour_average(edge, quarter_ago)
        yield ROLLING_1H_WINDOW_EDGE_TIME_AVG_DICT.get((edge, minute_bucket))
        yield ROLLING_EDGE_TIME_AVG_DICT.get((edge, minute_bucket))
        # EDGE_15_MIN_BUCKET_DICT is built by grouping on (edge, day, hour,
        # minute) with day = weekday, so it must be queried with that key.
        # The previous (edge, minute_bucket) key could never match.
        yield EDGE_15_MIN_BUCKET_DICT.get(
            (edge, minute_bucket.weekday(), minute_bucket.hour, minute_bucket.minute)
        )
        # EDGE_AVG_DICT is grouped by edge only.
        yield EDGE_AVG_DICT.get(edge)

    code = -1
    for code, speed in enumerate(candidates()):
        if speed is not None and not math.isnan(speed):
            return speed, code
    # Every fallback failed: the global mean gets the next free code.
    return MEAN_SPEED, code + 1



def imputation_5(edge, minute_bucket):
    """Impute the speed of ``edge`` at ``minute_bucket`` (fifth fallback scheme).

    Fallbacks are tried in order and the position of the one that succeeded is
    returned as the imputation code:
        0. Speed on the same edge at the same time 1 week ago
        1. Speed on the same edge at the same time 2 weeks ago
        2. Average neighbour speed at the current timestamp a week ago
        3. Average neighbour speed at the current timestamp 2 weeks ago
        4. Speed on the same edge 15 minutes ago
        5. Average neighbour speed 15 minutes ago
        6. Rolling "1h" window average for the current edge
        7. Rolling "2h" window average for the current edge
        8. Rolling "3h" window average for the current edge
        9. Rolling "4h" window average for the current edge
        10. Average over all past values before current timestamp for the current edge
        11. Training-set average for the edge at the same weekday, hour and minute
        12. Training-set average for the edge
        13. Global mean speed

    A candidate is usable when it is neither ``None`` nor NaN; the previous
    version could return NaN values found in the past as imputations.

    Returns:
        A ``(speed, code)`` tuple.
    """
    quarter_ago = minute_bucket - pd.Timedelta(15, unit='m')
    rolling_windows = (
        ROLLING_1H_WINDOW_EDGE_TIME_AVG_DICT,
        ROLLING_2H_WINDOW_EDGE_TIME_AVG_DICT,
        ROLLING_3H_WINDOW_EDGE_TIME_AVG_DICT,
        ROLLING_4H_WINDOW_EDGE_TIME_AVG_DICT,
    )

    def candidates():
        # Generator: a fallback is only evaluated once all earlier ones failed.
        yield fallback_to_past(edge, minute_bucket, 1, 'W')
        yield fallback_to_past(edge, minute_bucket, 2, 'W')
        yield neighbour_average(edge, minute_bucket - pd.Timedelta(1, unit='W'))
        yield neighbour_average(edge, minute_bucket - pd.Timedelta(2, unit='W'))
        yield fallback_to_past(edge, minute_bucket, 15, 'm')
        yield neighbour_average(edge, quarter_ago)
        for window_dict in rolling_windows:
            yield window_dict.get((edge, minute_bucket))
        yield ROLLING_EDGE_TIME_AVG_DICT.get((edge, minute_bucket))
        # EDGE_15_MIN_BUCKET_DICT is built by grouping on (edge, day, hour,
        # minute) with day = weekday, so it must be queried with that key.
        # The previous (edge, minute_bucket) key could never match.
        yield EDGE_15_MIN_BUCKET_DICT.get(
            (edge, minute_bucket.weekday(), minute_bucket.hour, minute_bucket.minute)
        )
        # EDGE_AVG_DICT is grouped by edge only.
        yield EDGE_AVG_DICT.get(edge)

    code = -1
    for code, speed in enumerate(candidates()):
        if speed is not None and not math.isnan(speed):
            return speed, code
    # Every fallback failed: the global mean gets the next free code.
    return MEAN_SPEED, code + 1

def impute_dataset(speeds_df, imputation_method):
    """Turn ``speeds_df`` into dense feature/target arrays, imputing every missing value.

    The frame is processed one ``minute_bucket`` group at a time. For each
    edge of ``UNIQUE_EDGES`` the four lag columns form the feature row and
    ``speed_kmh`` is the target. A value that is absent or NaN is replaced by
    ``imputation_method(edge, timestamp)``, which returns ``(speed, code)``.

    Returns:
        ``(xs, ys, imputation_stats)`` where ``xs`` and ``ys`` are ``float32``
        arrays and ``imputation_stats`` is ``(number of imputed features,
        number of imputed targets, Counter of imputation codes)``.
    """
    lag_minutes = [15, 30, 45, 60]
    selected_columns = ["edge", "speed_kmh"] + SPEED_FEATURES
    n_feature_fills = 0
    n_target_fills = 0
    code_counter = Counter()
    all_features = []
    all_targets = []

    for minute_bucket, bucket_rows in tqdm(speeds_df.groupby("minute_bucket")):
        # {column name: {edge: value}} for the rows of this bucket.
        per_column = bucket_rows[selected_columns].set_index("edge").to_dict()
        lagged_buckets = [(lag, minute_bucket - pd.to_timedelta(lag, unit='m')) for lag in lag_minutes]
        # NOTE(review): missing targets are imputed for the bucket *after*
        # minute_bucket -- confirm this offset is intended.
        following_bucket = minute_bucket + pd.to_timedelta(15, unit='m')

        bucket_features = []
        bucket_targets = []
        for edge in UNIQUE_EDGES:
            edge_features = []
            for lag, lagged_bucket in lagged_buckets:
                value = per_column[f"speed_kmh_lag_{lag}_m"].get(edge)
                if value is None or math.isnan(value):
                    value, code = imputation_method(edge, lagged_bucket)
                    code_counter[code] += 1
                    n_feature_fills += 1
                edge_features.append(value)
            bucket_features.append(edge_features)

            target = per_column["speed_kmh"].get(edge)
            if target is None or math.isnan(target):
                target, code = imputation_method(edge, following_bucket)
                code_counter[code] += 1
                n_target_fills += 1
            bucket_targets.append(target)

        all_features.append(bucket_features)
        all_targets.append(bucket_targets)

    xs = np.array(all_features, dtype=np.float32)
    ys = np.array(all_targets, dtype=np.float32)
    return xs, ys, (n_feature_fills, n_target_fills, code_counter)

# Visualisation code

In [None]:
def plot_edge_and_neighbours_time_series(speeds_df, ys, imputation_method=None, imputation_codes=None):
    """Plot the time series of a random edge and of every edge adjacent to it.

    The previous version called ``plot_edge_time_series(e, speeds_df, ys)``,
    which does not match that function's
    ``(speeds_df, imputation_method, imputation_codes, edge)`` signature and
    failed on first use.

    Args:
        speeds_df: Speeds frame forwarded to ``plot_edge_time_series``.
        ys: Unused; kept so existing two-argument calls keep working.
        imputation_method: Imputation function to visualise. Defaults to
            ``neighbour_based_impute_nan``.
        imputation_codes: Code-to-label mapping matching ``imputation_method``.
            Defaults to ``imputation_codes_1``.

    Raises:
        ValueError: If only one of ``imputation_method`` and
            ``imputation_codes`` is given, since the two must match.
    """
    if (imputation_method is None) != (imputation_codes is None):
        raise ValueError("imputation_method and imputation_codes must be passed together")
    if imputation_method is None:
        imputation_method = neighbour_based_impute_nan
        imputation_codes = imputation_codes_1

    edge = random.choice(UNIQUE_EDGES)
    adjacent_indices = np.nonzero(ADJACENCY_MATRIX[EDGE_IDX_MAP[edge]])[0]
    # Drop the chosen edge from its own neighbour list; it is plotted first.
    neighbours = [IDX_EDGE_MAP[idx] for idx in adjacent_indices if IDX_EDGE_MAP[idx] != edge]
    for e in [edge] + neighbours:
        plot_edge_time_series(speeds_df, imputation_method, imputation_codes, e)
    

def plot_one_column(col_name, edge_speeds_df, edge_imputed_speeds, mask, imputation_codes):
    """Plot one speed column of an edge together with its imputed values.

    The first trace shows the observed values of ``col_name``; one further
    trace is added per imputation code present in ``mask``. Timestamps of lag
    columns are shifted back by the lag encoded in the column name.

    The previous version hard-coded three-element ``visible`` lists in the
    dropdown, so the buttons did not match the real number of traces and every
    per-code button showed the ground truth only.

    Args:
        col_name: ``"speed_kmh"`` or a lag column whose name contains the lag
            in minutes as an underscore-separated number.
        edge_speeds_df: Rows of a single edge.
        edge_imputed_speeds: Imputed series aligned with ``DATASET_DATE_RANGE``.
        mask: Per-bucket imputation code, ``None`` where nothing was imputed.
        imputation_codes: Mapping from imputation code to display label.
    """
    edge = edge_speeds_df.edge.iloc[0]
    if col_name == "speed_kmh":
        lag_minutes = 0
    else:
        lag_minutes = int([s for s in col_name.split('_') if s.isdecimal()][0])
    offset = pd.Timedelta(lag_minutes, unit='m')

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=edge_speeds_df.minute_bucket - offset,
            y=edge_speeds_df[col_name],
            mode='markers',
            name=f'Ground Truth {col_name}',
        )
    )

    # Fixed ordering so trace positions and dropdown buttons stay in sync.
    unique_codes = sorted(code for code in set(mask) if code is not None)

    for c in unique_codes:
        points = [
            (minute_bucket - offset, edge_imputed_speeds[j])
            for j, (code, minute_bucket) in enumerate(zip(mask, DATASET_DATE_RANGE))
            if code == c
        ]
        fig.add_trace(
            go.Scatter(
                x=[bucket for bucket, _ in points],
                y=[speed for _, speed in points],
                mode='markers',
                name=imputation_codes[c],
                marker=dict(color=COLOURS[c]),
            )
        )

    n_traces = 1 + len(unique_codes)

    def visibility(*shown):
        # One boolean per trace, in the order the traces were added.
        return [position in shown for position in range(n_traces)]

    buttons = [
        dict(label="Ground Truth",
             method="update",
             args=[{"visible": visibility(0)},
                   {"title": "Trace 1"}]),
    ]
    # Each per-code button shows the ground truth plus that code's own trace.
    buttons += [
        dict(label=imputation_codes[c],
             method="update",
             args=[{"visible": visibility(0, position)}])
        for position, c in enumerate(unique_codes, start=1)
    ]
    buttons.append(
        dict(label="All",
             method="update",
             args=[{"visible": [True] * n_traces},
                   {"title": "All Traces"}])
    )

    # Titles, axis labels and the trace-selection dropdown.
    fig.update_layout(
        title=f"Time series for edge {edge}",
        title_x=0.5,
        xaxis=dict(
            title="Time [15-minute bucket]"
        ),
        yaxis=dict(
            title="Speed [km/h]"
        ),
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.1,
                xanchor="left",
                y=1.1,
                yanchor="top"
            ),
        ]
    )

    # Horizontal legend above the plot area.
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    fig.show()


def plot_edge_features_and_targets_time_series(speeds_df, imputation_method, imputation_codes, edge=None):
    """Plot the target column and the four lag columns of one edge with their imputations.

    A random edge of ``UNIQUE_EDGES`` is used when ``edge`` is ``None``. The
    edge is imputed with ``impute_edge`` and one figure is drawn per column
    via ``plot_one_column``.
    """
    if edge is None:
        edge = random.choice(UNIQUE_EDGES)
    rows_for_edge = speeds_df[speeds_df.edge == edge]

    features, targets, _, masks = impute_edge(speeds_df, edge, imputation_method)
    features_mask, target_mask = masks

    plot_one_column("speed_kmh", rows_for_edge, targets, target_mask, imputation_codes)

    lag_columns = ('speed_kmh_lag_15_m', 'speed_kmh_lag_30_m', 'speed_kmh_lag_45_m', 'speed_kmh_lag_60_m')
    for position, column in enumerate(lag_columns):
        plot_one_column(
            column,
            rows_for_edge,
            features[:, position],
            features_mask[:, position],
            imputation_codes,
        )



def plot_edge_time_series(speeds_df, imputation_method, imputation_codes, edge=None):
    """Plot the observed and imputed target speeds of one edge.

    The first trace shows the observed ``speed_kmh`` values; one further trace
    is added per imputation code that ``impute_edge`` reports for the targets.
    A random edge of ``UNIQUE_EDGES`` is used when ``edge`` is ``None``.

    The previous version hard-coded three-element ``visible`` lists in the
    dropdown, so the buttons did not match the real number of traces and every
    per-code button showed the ground truth only.

    Args:
        speeds_df: Speeds frame with ``edge``, ``minute_bucket`` and
            ``speed_kmh`` columns.
        imputation_method: Function passed on to ``impute_edge``.
        imputation_codes: Mapping from imputation code to display label.
        edge: Edge to plot, or ``None`` for a random one.
    """
    edge = random.choice(UNIQUE_EDGES) if edge is None else edge
    edge_df = speeds_df[speeds_df.edge == edge]

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=edge_df.minute_bucket,
            y=edge_df.speed_kmh,
            mode='markers',
            name='Ground Truth',
        )
    )

    _, ys, _, (_, target_mask) = impute_edge(speeds_df, edge, imputation_method)
    # Fixed ordering so trace positions and dropdown buttons stay in sync.
    unique_codes = sorted(code for code in set(target_mask) if code is not None)

    for c in unique_codes:
        points = [
            (minute_bucket, ys[j])
            for j, (code, minute_bucket) in enumerate(zip(target_mask, DATASET_DATE_RANGE))
            if code == c
        ]
        fig.add_trace(
            go.Scatter(
                x=[bucket for bucket, _ in points],
                y=[speed for _, speed in points],
                mode='markers',
                name=imputation_codes[c],
                marker=dict(color=COLOURS[c]),
            )
        )

    n_traces = 1 + len(unique_codes)

    def visibility(*shown):
        # One boolean per trace, in the order the traces were added.
        return [position in shown for position in range(n_traces)]

    buttons = [
        dict(label="Ground Truth",
             method="update",
             args=[{"visible": visibility(0)},
                   {"title": "Trace 1"}]),
    ]
    # Each per-code button shows the ground truth plus that code's own trace.
    buttons += [
        dict(label=imputation_codes[c],
             method="update",
             args=[{"visible": visibility(0, position)}])
        for position, c in enumerate(unique_codes, start=1)
    ]
    buttons.append(
        dict(label="All",
             method="update",
             args=[{"visible": [True] * n_traces},
                   {"title": "All Traces"}])
    )

    # Titles, axis labels and the trace-selection dropdown.
    fig.update_layout(
        title=f"Time series for edge {edge}",
        title_x=0.5,
        xaxis=dict(
            title="Time [15-minute bucket]"
        ),
        yaxis=dict(
            title="Speed [km/h]"
        ),
        updatemenus=[
            dict(
                buttons=buttons,
                direction="down",
                showactive=True,
                x=0.1,
                xanchor="left",
                y=1.1,
                yanchor="top"
            ),
        ]
    )

    # Horizontal legend above the plot area.
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )
    fig.show()


def plot_edges(nodes, edges):
    """Draw the given edges and their end nodes on a folium map.

    Args:
        nodes: Frame with ``id``, ``lat`` and ``lon`` columns.
        edges: Sequence of ``(u, v)`` node-id pairs.

    Returns:
        The folium map with one blue polyline per edge and one red circle
        marker per edge end point.
    """
    def coordinates(node_id):
        # First matching row of the nodes frame, as a (lat, lon) pair.
        lat, lon = nodes[nodes["id"] == node_id][["lat", "lon"]].iloc[0]
        return lat, lon

    street_map = folium.Map(location=[44.435608, 26.102297], zoom_start=15)
    end_points = [node_id for edge in edges for node_id in edge]

    # Edges first, so the node markers are added on top of the lines.
    for u, v in edges:
        line = folium.PolyLine(
            locations=[coordinates(u), coordinates(v)],
            color='blue',
            weight=5,
            tooltip=f"{u, v}",
        )
        line.add_to(street_map)

    for node_id in end_points:
        marker = folium.CircleMarker(
            location=coordinates(node_id),
            radius=5,
            color='red',
            fill=True,
            fill_color='red',
        )
        marker.add_to(street_map)

    return street_map

# Calculating imputation stats

In [None]:
# Fetch the subgraph speeds parquet file from S3 into the working directory.
S3.download_file(S3_BUCKET, f"{S3_SUBDIR}/{S3_DATA}/subgraph_speeds_df.parquet", "subgraph_speeds_df.parquet")

In [None]:
# Load the downloaded speeds and identify every row's edge by its
# (start_node, end_node) tuple.
subgraph_speeds_df = pd.read_parquet("subgraph_speeds_df.parquet")
subgraph_speeds_df["edge"] = list(zip(subgraph_speeds_df.start_node, subgraph_speeds_df.end_node))

In [None]:
# Distinct edges plus lookup tables between an edge and its position in
# UNIQUE_EDGES (the same position indexes the adjacency matrix).
UNIQUE_EDGES = subgraph_speeds_df.edge.unique()
EDGE_IDX_MAP = {edge: i for i, edge in enumerate(UNIQUE_EDGES)}
IDX_EDGE_MAP = {i: edge for i, edge in enumerate(UNIQUE_EDGES)}

# Calendar components of each bucket; "day" is the weekday number.
subgraph_speeds_df["day"] = subgraph_speeds_df.minute_bucket.dt.weekday
subgraph_speeds_df["hour"] = subgraph_speeds_df.minute_bucket.dt.hour
subgraph_speeds_df["minute"] = subgraph_speeds_df.minute_bucket.dt.minute

subgraph_speeds_df.sort_values(["edge", "minute_bucket"], inplace=True)

# Statistics below are computed on the training weeks only.
train_subgraph_speeds_df = subgraph_speeds_df[subgraph_speeds_df.minute_bucket < DATASET_START_DATE + pd.Timedelta(N_WEEKS_TRAINING, 'W')]
MEAN_SPEED = train_subgraph_speeds_df.speed_kmh.mean()
# Per-edge mean speed, cast to int; keyed by edge.
EDGE_AVG_DICT = train_subgraph_speeds_df[["speed_kmh", "edge"]].groupby("edge").mean().astype(int).to_dict()["speed_kmh"]
# Mean speed keyed by (edge, day, hour, minute).
EDGE_15_MIN_BUCKET_DICT = train_subgraph_speeds_df.groupby(["edge", "day", "hour", "minute"])["speed_kmh"].mean().to_dict()
# Every column whose name contains "speed" -- this includes "speed_kmh" itself.
SPEED_FEATURES = [col_name for col_name in subgraph_speeds_df.columns if "speed" in col_name]

ADJACENCY_MATRIX, EDGE_INDEX = compute_adjacency_matrix()

# Expanding (cumulative) mean of speed_kmh per edge over the full bucket range;
# rows whose mean is NaN are dropped before the dictionary is built.
rolling_speed_avg_df = (pd.concat([expand_edge_time_series(g)
    for _, g in subgraph_speeds_df[["edge", "minute_bucket", "speed_kmh"]]
    .groupby("edge")]).set_index("minute_bucket").groupby("edge").expanding().mean())
rolling_speed_avg_df.dropna(inplace=True)
ROLLING_EDGE_TIME_AVG_DICT = rolling_speed_avg_df.to_dict()["speed_kmh"]
# Histogram of the expanding means.
rolling_speed_avg_df.hist()
plt.xlabel("Speed [km/h]")
plt.ylabel("Count")
plt.title("Distribution of speeds after rolling mean")


def compute_rolling_mean(speeds_df, window):
    """Return the per-edge rolling mean of ``speed_kmh`` as a lookup dictionary.

    Each edge's series is first expanded onto the full bucket range with
    ``expand_edge_time_series``, then a rolling mean over ``window`` is taken
    within each edge. Entries whose mean is NaN are dropped.

    The previous version ignored ``speeds_df`` and always read the global
    ``subgraph_speeds_df``.

    Args:
        speeds_df: Frame with ``edge``, ``minute_bucket`` and ``speed_kmh`` columns.
        window: Rolling window passed to ``DataFrameGroupBy.rolling`` (e.g. ``"1h"``).

    Returns:
        Dictionary mapping the ``(edge, minute_bucket)`` index entries of the
        rolling frame to the rolling mean speed.
    """
    speed_columns = speeds_df[["edge", "minute_bucket", "speed_kmh"]]
    expanded = pd.concat(
        [expand_edge_time_series(group) for _, group in speed_columns.groupby("edge")]
    )
    rolling_mean_df = (
        expanded.set_index("minute_bucket").groupby("edge").rolling(window).mean()
    )
    rolling_mean_df = rolling_mean_df.dropna()
    return rolling_mean_df.to_dict()["speed_kmh"]


# Per-edge rolling means for four window lengths, used as imputation fallbacks.
ROLLING_1H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "1h")
ROLLING_2H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "2h")
ROLLING_3H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "3h")
ROLLING_4H_WINDOW_EDGE_TIME_AVG_DICT = compute_rolling_mean(subgraph_speeds_df, "4h")

# Speed keyed by (edge, minute_bucket), and mean speed over all rows of a bucket.
DATASET_DICT = subgraph_speeds_df[["edge", "minute_bucket", "speed_kmh"]].set_index(["edge", "minute_bucket"]).to_dict()["speed_kmh"]
MINUTE_BUCKET_AVG_DICT = subgraph_speeds_df[["minute_bucket", "speed_kmh"]].groupby("minute_bucket").mean().to_dict()["speed_kmh"]
# (amount, unit) look-back steps tried in order by neighbour_based_impute_nan.
FALLBACK_HORIZONS = [(15, 'm'), (30, 'm'), (45, 'm'), (60, 'm'), (1, 'W'), (2, 'W')]

# Consecutive train / validation / test bucket ranges. inclusive="right" drops
# the start point, which is the last bucket of the preceding range.
# NOTE(review): the test range length also uses N_WEEKS_VALIDATION -- confirm intended.
TRAIN_DATE_RANGE = pd.date_range(DATASET_START_DATE, DATASET_START_DATE + pd.Timedelta(N_WEEKS_TRAINING, 'W'), freq="15min", inclusive="left")
VALID_DATE_RANGE = pd.date_range(TRAIN_DATE_RANGE[-1], TRAIN_DATE_RANGE[-1] + pd.Timedelta(N_WEEKS_VALIDATION, 'W'), freq="15min", inclusive="right")
TEST_DATE_RANGE = pd.date_range(VALID_DATE_RANGE[-1], VALID_DATE_RANGE[-1] + pd.Timedelta(N_WEEKS_VALIDATION, 'W'), freq="15min", inclusive="right")

In [None]:
# Impute the whole dataset with the first scheme and keep its statistics.
xs, ys, imputation_stats = impute_dataset(subgraph_speeds_df, neighbour_based_impute_nan)

In [None]:
# Display labels of the imputation codes of each scheme. The position of a
# label in its list is the code returned by the matching imputation function.
imputation_codes_1 = dict(enumerate([
    "15min_ago",
    "30min_ago",
    "45min_ago",
    "60min_ago",
    "week_ago",
    "2_weeks_ago",
    "avg_neighbour_15min_ago",
    "avg_of_all_past",
    "avg_of_all_edges_15min_ago",
    "global_avg",
]))

imputation_codes_2 = dict(enumerate([
    "week_ago",
    "2_weeks_ago",
    "avg_neighbour_week_ago",
    "avg_neighbour_2_weeks_ago",
    "avg_of_all_past",
    "avg_of_all_edges_15min_ago",
    "global_avg",
]))

imputation_codes_3 = dict(enumerate([
    "week_ago",
    "2_weeks_ago",
    "neighbour_avg_week_ago",
    "neighbour_avg_2_weeks_ago",
    "15_mins_ago",
    "neighbour_avg_15_mins_ago",
    "past_hour_avg",
    "avg_of_all_past",
    "avg_of_all_edges_15min_ago",
    "global_avg",
]))

imputation_codes_4 = dict(enumerate([
    "week_ago",
    "2_weeks_ago",
    "neighbour_avg_week_ago",
    "neighbour_avg_2_weeks_ago",
    "15_mins_ago",
    "neighbour_avg_15_mins_ago",
    "past_hour_avg",
    "avg_of_all_past",
    "train_edge_15min_bucket_avg",
    "train_edge_avg",
    "global_avg",
]))

imputation_codes_5 = dict(enumerate([
    "week_ago",
    "2_weeks_ago",
    "neighbour_avg_week_ago",
    "neighbour_avg_2_weeks_ago",
    "15_mins_ago",
    "neighbour_avg_15_mins_ago",
    "past_hour_avg",
    "past_2hours_avg",
    "past_3hours_avg",
    "past_4hours_avg",
    "avg_of_all_past",
    "train_edge_15min_bucket_avg",
    "train_edge_avg",
    "global_avg",
]))

In [None]:
# Print how often each fallback of the first scheme supplied a value; the
# last element of imputation_stats is the Counter of imputation codes.
for code, cnt in imputation_stats[-1].items():
    print(imputation_codes_1[code], cnt)

In [None]:
# Impute the whole dataset again with the re-ranked scheme; this overwrites
# xs, ys and imputation_stats from the first run.
xs, ys, imputation_stats = impute_dataset(subgraph_speeds_df, reranked_neighbour_based_impute_nan)

In [None]:
# Display (imputed feature count, imputed target count, Counter of codes).
imputation_stats

In [None]:
# Print how often each fallback of the re-ranked scheme supplied a value.
for code, cnt in imputation_stats[-1].items():
    print(imputation_codes_2[code], cnt)

# Visualising edge imputation

In [None]:
def impute_edge(speeds_df, edge, imputation_method):
    """Build the imputed feature and target series of one edge over ``DATASET_DATE_RANGE``.

    For every bucket of the range the four lag columns form the feature row
    and ``speed_kmh`` is the target. A value that is absent or NaN is replaced
    by ``imputation_method(edge, timestamp)``, which returns ``(speed, code)``;
    the code is also written into the matching mask.

    Returns:
        ``(xs, ys, imputation_stats, masks)`` where ``xs`` and ``ys`` are
        ``float32`` arrays, ``imputation_stats`` is ``(number of imputed
        features, number of imputed targets, Counter of imputation codes)``
        and ``masks`` is ``(feature mask, target mask)``: object arrays that
        hold the imputation code of each imputed value and ``None`` elsewhere.
    """
    n_buckets = len(DATASET_DATE_RANGE)
    feature_mask = np.full((n_buckets, len(SPEED_FEATURES)), None)
    target_mask = np.full(n_buckets, None)
    n_feature_fills = 0
    n_target_fills = 0
    code_counter = Counter()

    edge_rows = speeds_df[speeds_df.edge == edge]
    # {column name: {minute_bucket: value}} for the rows of this edge.
    per_column = (
        edge_rows[["minute_bucket", "speed_kmh"] + SPEED_FEATURES]
        .set_index("minute_bucket")
        .to_dict()
    )

    features = []
    targets = []
    for row_idx, minute_bucket in enumerate(DATASET_DATE_RANGE):
        # NOTE(review): missing targets are imputed for the bucket *after*
        # minute_bucket -- confirm this offset is intended.
        following_bucket = minute_bucket + pd.to_timedelta(15, unit='m')

        lag_values = []
        for col_idx, lag in enumerate([15, 30, 45, 60]):
            lagged_bucket = minute_bucket - pd.to_timedelta(lag, unit='m')
            value = per_column[f"speed_kmh_lag_{lag}_m"].get(minute_bucket)
            if value is None or math.isnan(value):
                value, code = imputation_method(edge, lagged_bucket)
                code_counter[code] += 1
                n_feature_fills += 1
                feature_mask[row_idx, col_idx] = code
            lag_values.append(value)
        features.append(lag_values)

        target = per_column["speed_kmh"].get(minute_bucket)
        if target is None or math.isnan(target):
            target, code = imputation_method(edge, following_bucket)
            code_counter[code] += 1
            n_target_fills += 1
            target_mask[row_idx] = code
        targets.append(target)

    xs = np.array(features, dtype=np.float32)
    ys = np.array(targets, dtype=np.float32)
    stats = (n_feature_fills, n_target_fills, code_counter)
    return xs, ys, stats, (feature_mask, target_mask)

In [None]:
# Marker colour per imputation code: the plotting functions index this list
# with the code, so it needs at least as many entries as the longest code table.
COLOURS = ["red", "green", "yellow", "blue", "brown", "black", "pink", "orange", "violet", "gray", "magenta", "cyan", "coral", "aqua"]

## Target imputations only

In [None]:
# Compare the target imputations of all five schemes on one edge.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
    (imputation_3, imputation_codes_3),
    (imputation_4, imputation_codes_4),
    (imputation_5, imputation_codes_5),
):
    plot_edge_time_series(subgraph_speeds_df, method, codes, (3924023215, 6258431111))

In [None]:
# Draw five edges at random (with replacement) as candidates for plotting.
random.choices(UNIQUE_EDGES, k=5)

In [None]:
# Compare the target imputations of all five schemes on one edge.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
    (imputation_3, imputation_codes_3),
    (imputation_4, imputation_codes_4),
    (imputation_5, imputation_codes_5),
):
    plot_edge_time_series(subgraph_speeds_df, method, codes, (1390254321, 245958409))

In [None]:
# Compare the target imputations of all five schemes on one edge.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
    (imputation_3, imputation_codes_3),
    (imputation_4, imputation_codes_4),
    (imputation_5, imputation_codes_5),
):
    plot_edge_time_series(subgraph_speeds_df, method, codes, (248728917, 248728918))

In [None]:
# Compare the target imputations of all five schemes on one edge.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
    (imputation_3, imputation_codes_3),
    (imputation_4, imputation_codes_4),
    (imputation_5, imputation_codes_5),
):
    plot_edge_time_series(subgraph_speeds_df, method, codes, (2351320121, 2351320128))

In [None]:
# Compare the target imputations of all five schemes on one edge.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
    (imputation_3, imputation_codes_3),
    (imputation_4, imputation_codes_4),
    (imputation_5, imputation_codes_5),
):
    plot_edge_time_series(subgraph_speeds_df, method, codes, (4490165323, 4490165315))

In [None]:
# Compare the target imputations of all five schemes on one edge.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
    (imputation_3, imputation_codes_3),
    (imputation_4, imputation_codes_4),
    (imputation_5, imputation_codes_5),
):
    plot_edge_time_series(subgraph_speeds_df, method, codes, (1754330932, 672078070))

In [None]:
# Compare the target imputations of all five schemes on one edge.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
    (imputation_3, imputation_codes_3),
    (imputation_4, imputation_codes_4),
    (imputation_5, imputation_codes_5),
):
    plot_edge_time_series(subgraph_speeds_df, method, codes, (254373201, 6169982511))

## Features and targets imputations

In [None]:
# Plot the target and all four lag columns of one edge with the first scheme.
plot_edge_features_and_targets_time_series(subgraph_speeds_df, neighbour_based_impute_nan, imputation_codes_1,  (1390254321, 245958409))

In [None]:
# Plot the target and lag columns of one edge with the first two schemes.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
):
    plot_edge_features_and_targets_time_series(subgraph_speeds_df, method, codes, (248728917, 248728918))

In [None]:
# Plot the target and lag columns of one edge with the first two schemes.
for method, codes in (
    (neighbour_based_impute_nan, imputation_codes_1),
    (reranked_neighbour_based_impute_nan, imputation_codes_2),
):
    plot_edge_features_and_targets_time_series(subgraph_speeds_df, method, codes, (4490165323, 4490165315))