In [1]:
import sys
import os

# Make the parent of the working directory importable, so that the project's
# `src` package can be resolved by the imports in the following cells.
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate entries to sys.path.
project_root = os.path.join('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Load IPython's autoreload extension and select mode 2, which reloads the
# imported modules before each cell is executed, so that edits to the local
# `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from src.utils.seed import set_random_seed

# Fix the random seed up front so that the stochastic steps executed later in
# the notebook are repeatable across runs.
# NOTE(review): which generators set_random_seed covers (random / NumPy /
# torch) is not visible here — confirm in src.utils.seed.
SEED = 42
set_random_seed(SEED)

In [4]:
import torch

# Pick the device used to run the model: the GPU when CUDA is available,
# the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'The selected device is: "{DEVICE}"')

The selected device is: "cuda"


# Loading the Data

In [7]:
import os

# Folder holding the PeMS-Bay data, expressed relative to the current
# working directory.
DATASET_NAME = 'pems-bay'
BASE_DATA_DIR = os.path.join('..', 'data', DATASET_NAME)

In [8]:
import pickle
# Load the pickled scaler object stored in the 'processed' data folder.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only open a scaler.pkl that comes from a trusted source.
scaler_path = os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl')
with open(scaler_path, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [13]:
from src.spatial_temporal_gnn.model import SpatialTemporalGNN
from src.data.data_extraction import get_adjacency_matrix

# Get the adjacency matrix structure stored with the raw PeMS-Bay data.
adj_matrix_structure = get_adjacency_matrix(
    os.path.join(BASE_DATA_DIR, 'raw', 'adj_mx_pems_bay.pkl'))

# Unpack the header of the adjacency matrix, the dictionary mapping the node
# IDs to their indices and the matrix itself.
header, node_ids_dict, adj_matrix = adj_matrix_structure

# Build the STGNN.
# NOTE(review): the positional values 9, 1, 12, 12 and 64 appear to match the
# (…, 12, 325, 9) input and (…, 12, 325, 1) output shapes printed at the end
# of the notebook plus a hidden size — confirm against the constructor
# signature of SpatialTemporalGNN.
spatial_temporal_gnn = SpatialTemporalGNN(9, 1, 12, 12, adj_matrix, DEVICE, 64)

stgnn_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                      'st_gnn_pems_bay.pth')

# Load the checkpoints. `map_location` remaps the stored tensors onto DEVICE,
# so a checkpoint saved on a GPU can still be opened when the notebook falls
# back to the CPU.
stgnn_checkpoints = torch.load(stgnn_checkpoints_path, map_location=DEVICE)
spatial_temporal_gnn.load_state_dict(stgnn_checkpoints['model_state_dict'])

# Set the model in evaluation mode (the trailing semicolon hides the cell
# output).
spatial_temporal_gnn.eval();

In [14]:
from src.data.data_extraction import get_locations_dataframe

# Read the sensor locations file (it has no header row) into a dataframe
# holding the latitude and longitude of each sensor.
locations_csv_path = os.path.join(
    BASE_DATA_DIR, 'raw', 'graph_sensor_locations_pems_bay.csv')
locations_df = get_locations_dataframe(locations_csv_path, has_header=False)

In [15]:
# Invert node_ids_dict, so that each value of the original dictionary maps
# back to the node ID that was its key.
node_pos_dict = {
    position: node_id for node_id, position in node_ids_dict.items()}

In [16]:
import os
import numpy as np
from src.spatial_temporal_gnn.prediction import predict

def _load_split_array(folder, file_name):
    """Load `BASE_DATA_DIR/<folder>/<file_name>` as a NumPy array."""
    return np.load(os.path.join(BASE_DATA_DIR, folder, file_name))


# Get the data and the values predicted by the STGNN, stored in the
# 'predicted' folder.
x_train = _load_split_array('predicted', 'x_train.npy')
y_train = _load_split_array('predicted', 'y_train.npy')
x_val = _load_split_array('predicted', 'x_val.npy')
y_val = _load_split_array('predicted', 'y_val.npy')
x_test = _load_split_array('predicted', 'x_test.npy')
y_test = _load_split_array('predicted', 'y_test.npy')

# Get the time information of the train, validation and test sets, stored in
# the 'processed' folder.
x_train_time = _load_split_array('processed', 'x_train_time.npy')
y_train_time = _load_split_array('processed', 'y_train_time.npy')
x_val_time = _load_split_array('processed', 'x_val_time.npy')
y_val_time = _load_split_array('processed', 'y_val_time.npy')
x_test_time = _load_split_array('processed', 'x_test_time.npy')
y_test_time = _load_split_array('processed', 'y_test_time.npy')

In [17]:
# Convert the target values from miles per hour to kilometers per hour.
# NOTE(review): the arrays are rebound to their scaled versions, so executing
# this cell a second time without reloading the data applies the factor
# twice.
# NOTE(review): only the y_* arrays are converted, the x_* arrays are left
# untouched — confirm that this is intended.
MPH_TO_KMH_FACTOR = 1.609344

y_train, y_val, y_test = (
    speeds * MPH_TO_KMH_FACTOR for speeds in (y_train, y_val, y_test))

In [18]:
# Read the number of timesteps and nodes from the 4-dimensional target array
# (the unpacking fails if y_train does not have exactly four axes).
# The unused axes get explicit throwaway names instead of `_`, because
# assigning `_` at the top level of a notebook overrides IPython's
# "last output" variable.
# NOTE(review): the first axis indexes the instances; the last one is
# presumably the feature axis — confirm.
_n_instances, n_timesteps, n_nodes, _n_features = y_train.shape

# Adjacency Distance Matrix

In [22]:
from src.explanation.clustering.clustering import (
    get_adjacency_distance_matrix)

# Build the distance matrix derived from the graph adjacency matrix for the
# n_timesteps predicted steps. Its printed shape is (3900, 3900), which equals
# n_nodes * n_timesteps (325 * 12) per side, so presumably each row/column is
# one (node, timestep) pair — confirm the ordering in
# get_adjacency_distance_matrix.
adj_distance_matrix = get_adjacency_distance_matrix(adj_matrix, n_timesteps)

In [23]:
# Show the size of the adjacency distance matrix as a sanity check.
print('Shape of the Adjacency Distance Matrix:', adj_distance_matrix.shape)

Shape of the Adjacency Distance Matrix: (3900, 3900)


# Temporal Distance Matrix

In [24]:
from src.explanation.clustering.clustering import (
    get_temporal_distance_matrix)

# Build the temporal distance matrix from the number of nodes and timesteps
# alone. Its printed shape is (3900, 3900), the same as the adjacency distance
# matrix, so presumably both are indexed by the same (node, timestep) pairs —
# confirm in get_temporal_distance_matrix.
temporal_distance_matrix = get_temporal_distance_matrix(n_nodes, n_timesteps)

In [25]:
# Show the size of the temporal distance matrix as a sanity check.
print('Shape of the Temporal Distance Matrix:',
      temporal_distance_matrix.shape)

Shape of the Temporal Distance Matrix: (3900, 3900)


# Clustering Function

In [27]:
from src.explanation.clustering.evaluation import apply_grid_search

# Candidate values explored by the grid search.
eps_candidates = [.1, .15, .2, .25, .3, .35, .4, .45, .5]
min_samples_candidates = [5, 7, 10, 12, 15, 17, 20]

# Only the first 200 instances of the training set are used for the search.
grid_search_instances = y_train[:200]

# Apply the grid search over every (eps, min_samples) combination.
apply_grid_search(
    instances=grid_search_instances,
    eps_list=eps_candidates,
    min_samples_list=min_samples_candidates,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix)

eps: 0.1 min_samples: 5
	Within-Cluster Variance: 0.982 Connected Cluster Dissimilarity: 4.29 Noise points ratio: 0.947

eps: 0.1 min_samples: 7
	Within-Cluster Variance: 0.997 Connected Cluster Dissimilarity: 4.5 Noise points ratio: 0.99

eps: 0.1 min_samples: 10
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0.979 Noise points ratio: 0.999

eps: 0.1 min_samples: 12
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0.0745 Noise points ratio: 1

eps: 0.1 min_samples: 15
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 17
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 20
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.15 min_samples: 5
	Within-Cluster Variance: 0.838 Connected Cluster Dissimilarity: 6.13 Noise points ratio: 0.676

eps: 0.15 min_samples: 7
	Within-Cluster Variance: 0.962 Connected Clu

In [28]:
# Clustering parameters chosen from the results of the grid search above.
EPS = 0.35
MIN_SAMPLES = 5

In [29]:
from src.explanation.clustering.evaluation import get_dataset_clustering_scores

# Compute the dataset-level clustering scores on the test set with the
# selected parameters.
test_set_scores = get_dataset_clustering_scores(
    y_test, adj_distance_matrix, temporal_distance_matrix, EPS, MIN_SAMPLES)
(avg_within_cluster_variance,
 avg_connected_cluster_dissimilarity,
 avg_noise_ratio) = test_set_scores

# Report the three scores on a single line, with 3 significant digits each.
print(
    'Within-Cluster Variance on the test set:',
    f'{avg_within_cluster_variance:.3g}',
    'Connected Cluster Dissimilarity on the test set:',
    f'{avg_connected_cluster_dissimilarity:.3g}',
    'Noise points ratio on the test set:',
    f'{avg_noise_ratio:.3g}')

Within-Cluster Variance on the test set: 0.156 Connected Cluster Dissimilarity on the test set: 9.61 Noise points ratio on the test set: 0.00614


In [38]:
import os

# Output folder of the datasets for explainability: the 'explainable'
# sub-folder of the PeMS-Bay data folder.
DATA_DIR = os.path.join(
    os.path.join('..', 'data', 'pems-bay'), 'explainable')

In [39]:
from numpy import save
from src.explanation.clustering.clustering import (
    get_dataset_for_explainability)

def _export_explainability_split(
        split_name, x, y, x_time, y_time, total_samples):
    """Build the explainability subset of one split and save it in DATA_DIR.

    The subset is obtained through `get_dataset_for_explainability`, using
    the notebook-level `EPS`, `MIN_SAMPLES`, `adj_distance_matrix` and
    `temporal_distance_matrix`. The four resulting arrays are saved as
    `x_<split_name>.npy`, `y_<split_name>.npy`, `x_<split_name>_time.npy`
    and `y_<split_name>_time.npy`.

    Parameters
    ----------
    split_name : str
        Name of the split used in the file names ('train', 'val', 'test').
    x, y : array-like
        Input and target data of the split.
    x_time, y_time : array-like
        Time information of the input and target data of the split.
    total_samples : int
        Number of samples requested for the subset.
        NOTE(review): the shapes printed in the next cell (999, 168 and 399
        for 1000, 170 and 400 requested) suggest that slightly fewer
        instances may be returned — confirm in
        get_dataset_for_explainability.

    Returns
    -------
    tuple
        The selected `(x, y, x_time, y_time)` arrays.
    """
    x_expl, y_expl, x_time_expl, y_time_expl = get_dataset_for_explainability(
        x,
        y,
        x_time,
        y_time,
        EPS,
        MIN_SAMPLES,
        adj_distance_matrix,
        temporal_distance_matrix,
        total_samples=total_samples)

    arrays_by_file_name = {
        f'x_{split_name}.npy': x_expl,
        f'y_{split_name}.npy': y_expl,
        f'x_{split_name}_time.npy': x_time_expl,
        f'y_{split_name}_time.npy': y_time_expl,
    }
    for file_name, array in arrays_by_file_name.items():
        save(os.path.join(DATA_DIR, file_name), array)

    return x_expl, y_expl, x_time_expl, y_time_expl


# Make sure that the output folder exists before saving the arrays.
os.makedirs(DATA_DIR, exist_ok=True)

# The splits are processed in the order train, validation, test.
(x_train_expl, y_train_expl,
 x_train_time_expl, y_train_time_expl) = _export_explainability_split(
    'train', x_train, y_train, x_train_time, y_train_time,
    total_samples=1_000)

(x_val_expl, y_val_expl,
 x_val_time_expl, y_val_time_expl) = _export_explainability_split(
    'val', x_val, y_val, x_val_time, y_val_time,
    total_samples=170)

(x_test_expl, y_test_expl,
 x_test_time_expl, y_test_time_expl) = _export_explainability_split(
    'test', x_test, y_test, x_test_time, y_test_time,
    total_samples=400)

In [40]:
# Report the shapes of the input and target arrays selected for each split.
shape_report = (
    ('Train dataset for explainability shapes:',
     x_train_expl, y_train_expl),
    ('Validation dataset for explainability shapes:',
     x_val_expl, y_val_expl),
    ('Test dataset for explainability shapes:',
     x_test_expl, y_test_expl),
)
for report_label, reported_x, reported_y in shape_report:
    print(report_label, reported_x.shape, reported_y.shape)

Train dataset for explainability shapes: (999, 12, 325, 9) (999, 12, 325, 1)
Validation dataset for explainability shapes: (168, 12, 325, 9) (168, 12, 325, 1)
Test dataset for explainability shapes: (399, 12, 325, 9) (399, 12, 325, 1)
