In [1]:
import sys
import os

# Make the parent directory importable so that the `src` package can be
# resolved from this notebook.
sys.path.append(os.pardir)

In [2]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to the project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from src.utils.seed import set_random_seed

# Fix the random seed up front so that later stochastic steps are repeatable.
# Which libraries get seeded is decided inside `set_random_seed` (project
# helper, not shown in this notebook).
SEED = 42
set_random_seed(SEED)

In [4]:
import torch

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'The selected device is: "{DEVICE}"')

The selected device is: "cuda"


# Loading the Data

In [5]:
import os

# Dataset directory, expressed relative to the notebook's working directory.
BASE_DATA_DIR = os.path.join(os.pardir, 'data', 'pems-bay')

In [6]:
import pickle
# Restore the scaler object stored next to the processed data.
# NOTE: unpickling can execute arbitrary code, so only load files produced
# by this project.
scaler_path = os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl')
with open(scaler_path, 'rb') as f:
    scaler = pickle.load(f)

In [7]:
from src.spatial_temporal_gnn.model import SpatialTemporalGNN
from src.data.data_extraction import get_adjacency_matrix

# Load the adjacency matrix file; the helper returns a triple that unpacks
# into the header, the node-id lookup dictionary and the matrix itself.
adj_matrix_structure = get_adjacency_matrix(
    os.path.join(BASE_DATA_DIR, 'raw', 'adj_mx_pems_bay.pkl'))
header, node_ids_dict, adj_matrix = adj_matrix_structure

# Instantiate the STGNN. The arguments must describe the same architecture
# as the saved checkpoint, otherwise `load_state_dict` below will fail.
spatial_temporal_gnn = SpatialTemporalGNN(9, 1, 12, 12, adj_matrix, DEVICE, 64)

stgnn_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                      'st_gnn_pems_bay.pth')

# `map_location` remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU run can still be loaded when DEVICE is 'cpu'.
stgnn_checkpoints = torch.load(stgnn_checkpoints_path, map_location=DEVICE)
spatial_temporal_gnn.load_state_dict(stgnn_checkpoints['model_state_dict'])

# Set the model in evaluation mode (the trailing `;` hides the module repr).
spatial_temporal_gnn.eval();

In [8]:
from src.data.data_extraction import get_locations_dataframe

# Read the sensor locations file (no header row) into a dataframe holding
# the latitude and longitude of each sensor.
locations_csv_path = os.path.join(
    BASE_DATA_DIR, 'raw', 'graph_sensor_locations_pems_bay.csv')
locations_df = get_locations_dataframe(locations_csv_path, has_header=False)

In [9]:
# Invert `node_ids_dict` so that its values become the keys (presumably
# matrix index -> sensor id; confirm against `get_adjacency_matrix`).
node_pos_dict = {
    index: node_id for node_id, index in node_ids_dict.items()}

In [10]:
import os
import numpy as np
from src.spatial_temporal_gnn.prediction import predict

# Load the train / validation / test arrays of the explained dataset and
# keep only the first entry of the last axis.
explained_dir = os.path.join(BASE_DATA_DIR, 'explained')
x_train = np.load(os.path.join(explained_dir, 'x_train.npy'))[..., :1]
x_val = np.load(os.path.join(explained_dir, 'x_val.npy'))[..., :1]
x_test = np.load(os.path.join(explained_dir, 'x_test.npy'))[..., :1]

In [11]:
from src.utils.config import MPH_TO_KMH_FACTOR

# Convert every split from miles per hour to kilometers per hour.
# NOTE: the names are rebound to the scaled arrays, so running this cell
# twice applies the factor twice; reload the data before re-running.
x_train, x_val, x_test = (
    split * MPH_TO_KMH_FACTOR for split in (x_train, x_val, x_test))

In [12]:
# Read the number of timesteps and nodes from axes 1 and 2 of `x_train`.
# Unpacking into four names still enforces that the array is 4-D. Named
# placeholders are used instead of `_` because assigning `_` in IPython
# stops the shell from updating its last-output variable.
_first_dim, n_timesteps, n_nodes, _last_dim = x_train.shape

# Adjacency Distance Matrix

In [13]:
from src.explanation.clustering.clustering import (
    get_adjacency_distance_matrix)

# Build the adjacency-based distance matrix from the adjacency matrix and the
# number of timesteps. The exact definition lives in the project helper
# `get_adjacency_distance_matrix`.
adj_distance_matrix = get_adjacency_distance_matrix(adj_matrix, n_timesteps)

In [14]:
# Sanity check: report the dimensions of the adjacency distance matrix.
print(f'Shape of the Adjacency Distance Matrix: {adj_distance_matrix.shape}')

Shape of the Adjacency Distance Matrix: (2484, 2484)


# Temporal Distance Matrix

In [15]:
from src.explanation.clustering.clustering import (
    get_temporal_distance_matrix)

# Build the temporal distance matrix from the number of nodes and timesteps.
# The exact definition lives in the project helper
# `get_temporal_distance_matrix`.
temporal_distance_matrix = get_temporal_distance_matrix(n_nodes, n_timesteps)

In [16]:
# Sanity check: report the dimensions of the temporal distance matrix.
print('Shape of the Temporal Distance Matrix:',
      f'{temporal_distance_matrix.shape}')

Shape of the Temporal Distance Matrix: (2484, 2484)


# Clustering Function

In [50]:
from src.explanation.clustering.evaluation import (
    apply_grid_search_on_explanation_dataset)

# Grid-search the clustering hyperparameters on every 10th training sample
# (a subset, presumably to limit the run time).
grid_search_samples = x_train[::10]
speed_distance_weight_candidates = [2, 3]
n_clusters_candidates = [3, 4, 5]

apply_grid_search_on_explanation_dataset(
    x=grid_search_samples,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight_list=speed_distance_weight_candidates,
    n_clusters_list=n_clusters_candidates)

Testing: n_clusters: 3 speed_distance_weight: 2
[99/99] - 381s - Within Cluster Variance: 3.12 - Connected Clusters Dissimilarity: 15.5 - Average time: 3.84s              
Testing: n_clusters: 3 speed_distance_weight: 3
[99/99] - 363s - Within Cluster Variance: 2.8 - Connected Clusters Dissimilarity: 16.3 - Average time: 3.67s               
Testing: n_clusters: 4 speed_distance_weight: 2
[99/99] - 522s - Within Cluster Variance: 2.38 - Connected Clusters Dissimilarity: 17 - Average time: 5.27s                
Testing: n_clusters: 4 speed_distance_weight: 3
[99/99] - 521s - Within Cluster Variance: 2.44 - Connected Clusters Dissimilarity: 17.6 - Average time: 5.27s              
Testing: n_clusters: 5 speed_distance_weight: 2
[99/99] - 650s - Within Cluster Variance: 1.84 - Connected Clusters Dissimilarity: 17.9 - Average time: 6.57s              
Testing: n_clusters: 5 speed_distance_weight: 3
[99/99] - 556s - Within Cluster Variance: 1.9 - Connected Clusters Dissimilarity: 17.7 - Ave

In [51]:
# Best configuration from the recorded grid-search run: it reached the lowest
# Within Cluster Variance (1.84) and the highest Connected Clusters
# Dissimilarity (17.9) among the tested combinations.
SPEED_DISTANCE_WEIGHT, N_CLUSTERS = 2, 5

In [52]:
from src.explanation.clustering.evaluation import (
    get_explanation_dataset_clustering_scores)

# Score the clustering on the full training split using the selected
# speed-distance weight and number of clusters.
# NOTE(review): the saved output stops at 755/995 samples, which looks like an
# interrupted run; re-run to completion before quoting these numbers.
get_explanation_dataset_clustering_scores(
    x_train,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=SPEED_DISTANCE_WEIGHT,
    n_clusters=N_CLUSTERS)

[755/995] - 4573s - Within Cluster Variance: 2.01 - Connected Clusters Dissimilarity: 18.9 - Average time: 6.06s              

In [None]:
from src.explanation.clustering.evaluation import (
    get_explanation_dataset_clustering_scores)

# Score the clustering on the validation split using the selected
# speed-distance weight and number of clusters.
# NOTE(review): this cell has no execution count and its saved output is
# formatted differently from the training run, so the output looks stale;
# re-run to confirm.
get_explanation_dataset_clustering_scores(
    x_val,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=SPEED_DISTANCE_WEIGHT,
    n_clusters=N_CLUSTERS)

Sample Within Cluster Variance: 0.0421 Sample Connected Cluster Dissimilarity: 18 Sample Noise Ratio: 0.00966


In [None]:
from src.explanation.clustering.evaluation import (
    get_explanation_dataset_clustering_scores)

# Score the clustering on the test split using the selected speed-distance
# weight and number of clusters.
# NOTE(review): this cell has no execution count in the saved notebook; run it
# to obtain the test scores.
get_explanation_dataset_clustering_scores(
    x_test,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=SPEED_DISTANCE_WEIGHT,
    n_clusters=N_CLUSTERS)