In [1]:
import sys
import os

# Set the main path in the root folder of the project.
sys.path.append(os.path.join('..'))

In [2]:
# Settings for autoreloading.
%load_ext autoreload
%autoreload 2

In [3]:
from src.utils.seed import set_random_seed

# Set the random seed for deterministic operations.
SEED = 42
set_random_seed(SEED)

In [4]:
import torch

# Set the device for training and querying the model.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'The selected device is: "{DEVICE}"')

The selected device is: "cuda"


# Loading the Data

In [5]:
import os

BASE_DATA_DIR = os.path.join('..', 'data', 'metr-la')

In [6]:
import pickle
with open(os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl'), 'rb') as f:
    scaler = pickle.load(f)

In [7]:
from src.spatial_temporal_gnn.model import SpatialTemporalGNN
from src.data.data_extraction import get_adjacency_matrix

# Get the adjacency matrix
adj_matrix_structure = get_adjacency_matrix(
    os.path.join(BASE_DATA_DIR, 'raw', 'adj_mx_metr_la.pkl'))

# Get the header of the adjacency matrix, the node indices and the
# matrix itself.
header, node_ids_dict, adj_matrix = adj_matrix_structure

# Get the STGNN and load the checkpoints.
spatial_temporal_gnn = SpatialTemporalGNN(9, 1, 12, 12, adj_matrix, DEVICE, 64)

stgnn_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                      'st_gnn_metr_la.pth')

stgnn_checkpoints = torch.load(stgnn_checkpoints_path)
spatial_temporal_gnn.load_state_dict(stgnn_checkpoints['model_state_dict'])

# Set the model in evaluation mode.
spatial_temporal_gnn.eval();

In [8]:
from src.data.data_extraction import get_locations_dataframe

# Get the dataframe containing the latitude and longitude of each sensor.
locations_df = get_locations_dataframe(
    os.path.join(BASE_DATA_DIR, 'raw', 'graph_sensor_locations_metr_la.csv'),
    has_header=True)

In [9]:
# Get the node positions dictionary.
node_pos_dict = { i: id for id, i in node_ids_dict.items() }

In [10]:
import os
import numpy as np

# Get the data and the values predicted by the STGNN.
x_train = np.load(os.path.join(BASE_DATA_DIR, 'explained', 'x_train.npy'))[..., :1]
x_val = np.load(os.path.join(BASE_DATA_DIR, 'explained', 'x_val.npy'))[..., :1]
x_test = np.load(os.path.join(BASE_DATA_DIR, 'explained', 'x_test.npy'))[..., :1]

In [11]:
from src.utils.config import MPH_TO_KMH_FACTOR


# Turn the dataset in kilometers per hour.
x_train = x_train * MPH_TO_KMH_FACTOR
x_val = x_val * MPH_TO_KMH_FACTOR
x_test = x_test * MPH_TO_KMH_FACTOR

In [12]:
_, n_timesteps, n_nodes, _ = x_train.shape

# Adjacency Distance Matrix

In [13]:
from src.explanation.clustering.clustering import (
    get_adjacency_distance_matrix)

adj_distance_matrix = get_adjacency_distance_matrix(adj_matrix, n_timesteps)

In [14]:
print(f'Shape of the Adjacency Distance Matrix: {adj_distance_matrix.shape}')

Shape of the Adjacency Distance Matrix: (2484, 2484)


# Temporal Distance Matrix

In [15]:
from src.explanation.clustering.clustering import (
    get_temporal_distance_matrix)

temporal_distance_matrix = get_temporal_distance_matrix(n_nodes, n_timesteps)

In [16]:
print('Shape of the Temporal Distance Matrix:',
      f'{temporal_distance_matrix.shape}')

Shape of the Temporal Distance Matrix: (2484, 2484)


# Clustering Function

In [420]:
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import cdist

def get_explanation_clusters(
    x: np.ndarray,
    adj_distance_matrix: np.ndarray,
    temporal_distance_matrix: np.ndarray,
    speed_distance_weight: float = 3,
    n_clusters: int = 4,
    ) -> np.ndarray:
    """
    Get the clusters of the given explanation instance using the
    k-medoids algorithm.

    Parameters
    ----------
    instance : ndarray
        The spatial-temporal graph instance to cluster.
    adj_distance_matrix : ndarray
        The adjacency matrix of the nodes in the graph measured in distance
        between 0 and 1.
    temporal_distance_matrix : ndarray
        The matrix measuring the distance between the time steps
        of the nodes in the graph between 0 and 1.
    speed_distance_weight : float, optional
        The weight of the speed distance in the clustering process,
        by default 3.
    n_clusters : int, optional
        The number of clusters to find, by default 4.

    Returns
    -------
    ndarray
        The clusters of the given instance.
    """
    n_timesteps, n_nodes, _ = x.shape

    # Reshape the instance to be a column vector.
    reshaped_instance = x.reshape(-2, 1)

    # Compute the distance matrix between the speed of the nodes in the graph.
    speed_distance_matrix = cdist(
        reshaped_instance,
        reshaped_instance,
        'euclidean')
    # Normalize the distance matrix between 0 and 1.
    speed_distance_matrix /= np.max(speed_distance_matrix)

    # Compute the weighted distance matrix between the nodes in the graph
    # in terms of speed and the spatial and temporal distances.
    distance_matrix = speed_distance_matrix * speed_distance_weight +\
        adj_distance_matrix + temporal_distance_matrix

    # Set the distance between nodes that are not connected to an
    # unreachable value.
    distance_matrix[adj_distance_matrix == 1] = 1_000

    # Get the non-zero indices of the reshaped instance.
    non_zeros = np.where(reshaped_instance != 0)[0]
    # Reduce distance matrix by solely considering the nodes that are
    # present in the instance.
    distance_matrix = distance_matrix[non_zeros, :][:, non_zeros]

    # Compute the clusters of the given instance using the k-medoids
    # algorithm.
    agglomerative_clustering = AgglomerativeClustering(
        n_clusters=n_clusters,
        metric='precomputed',
        linkage='single')

    clusters = agglomerative_clustering.fit_predict(distance_matrix)

    # Add a dummy dimension to the clusters array.
    clusters = np.expand_dims(clusters, axis=1)

    # Create a cluster vector with dummy -1 values.
    clusters_vector = np.full_like(reshaped_instance, -1)
    # Set the non-zero values of the cluster vector to the clusters
    clusters_vector[non_zeros] = clusters[:]

    # Reshape the clusters array to have the same shape as the instance.
    clusters_vector = clusters_vector.reshape(n_timesteps, n_nodes, 1)

    # Set the cluster IDs as integers.
    clusters_vector = clusters_vector.astype(int)

    return clusters_vector

In [432]:
def get_mean_clusters_variance(
    x: np.ndarray,
    clusters: np.ndarray,
    ignore_noise: bool = False
    ) -> float:
    """
    Get the Within-Cluster Variance metric of the clusters
    obtained on the given instance in terms of speed.

    Parameters
    ----------
    x : ndarray
        The spatial-temporal graph instance on which the clusters
        are evaluated.
    clusters : ndarray
        The clusters obtained on the given instance.

    Returns
    -------
    float
        The Within-Cluster Variance metric result.
    """
    # Set the intial value of the numerator sum to 0.
    numerator_sum = 0.
    # Set the initial value of the total number of nodes to 0.
    total_node_number = 0.

    if ignore_noise:
        cluster_types = [c for c in np.unique(clusters) if c >= 0]
    else:
        cluster_types = np.unique(clusters)
    for c in cluster_types:
        # Get the sub-sample of the nodes in the graph that belong to the
        # current cluster.
        sub_sample = x[clusters == c]
        # Get the length of the sub-sample.
        len_sub_sample = len(sub_sample)
        # Update the total nominator sum.
        numerator_sum += np.var(sub_sample) #* len_sub_sample
        # Update the total number of nodes with the length of the sub-sample.
        total_node_number += len_sub_sample
    return numerator_sum / (total_node_number)

In [504]:
i = 0

cl = get_explanation_clusters(
    x=x_train[i],
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=3,
    n_clusters=3)

print('Variance within clusters:', get_mean_clusters_variance(
    x_train[i],
    cl,
    ignore_noise=True))

Variance within clusters: 5.452483550334905


In [58]:
from src.explanation.clustering.metrics import (
    get_within_clusters_variance,
    get_connected_cluster_dissimilarity)

cl = get_explanation_clusters(
    x=x_train[0],
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=3,
    distance_threshold=.5
    )

print(np.unique(cl))


print('Variance within clusters:', get_within_clusters_variance(
    x_train[0],
    cl,
    ignore_noise=True))
print('Dissimilarity between clusters:',get_connected_cluster_dissimilarity(
    x_train[0],
    cl, ignore_noise=True))

[-1  0  1  2  3  4  5  6  7  8  9]
Variance within clusters: 0.0970757680408836
Dissimilarity between clusters: 30.283116096489607


In [61]:
from src.explanation.clustering.metrics import (
    get_within_clusters_variance,
    get_connected_cluster_dissimilarity)

cl = get_explanation_clusters(
    x=x_train[0],
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=3,
    distance_threshold=1.
    )

print(np.unique(cl))


print('Variance within clusters:', get_within_clusters_variance(
    x_train[0],
    cl,
    ignore_noise=True))
print('Dissimilarity between clusters:',get_connected_cluster_dissimilarity(
    x_train[0],
    cl, ignore_noise=True))

[-1  0  1  2  3  4  5  6]
Variance within clusters: 0.28257857846696
Dissimilarity between clusters: 28.15513314273189


In [59]:
from src.explanation.clustering.metrics import (
    get_within_clusters_variance,
    get_connected_cluster_dissimilarity)

cl = get_explanation_clusters(
    x=x_train[0],
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=3,
    distance_threshold=3.
    )

print(np.unique(cl))


print('Variance within clusters:', get_within_clusters_variance(
    x_train[0],
    cl,
    ignore_noise=True))
print('Dissimilarity between clusters:',get_connected_cluster_dissimilarity(
    x_train[0],
    cl, ignore_noise=True))

[-1  0  1  2  3  4]
Variance within clusters: 4.083315684095293
Dissimilarity between clusters: 22.777141667383862


In [83]:
i = 10

from src.explanation.clustering.metrics import (
    get_within_clusters_variance,
    get_connected_cluster_dissimilarity)

cl = get_explanation_clusters(
    x=x_train[i],
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=3,
    n_clusters=4
    )

print(np.unique(cl))

print('Variance within clusters:', get_within_clusters_variance(
    x_train[i],
    cl,
    ignore_noise=True))
print('Dissimilarity between clusters:',get_connected_cluster_dissimilarity(
    x_train[i],
    cl, ignore_noise=True))

[-1  0  1  2  3  4  5]
Variance within clusters: 0.364716434831156
Dissimilarity between clusters: 35.33421425279196


AttributeError: 'AgglomerativeClustering' object has no attribute 'bic'

In [39]:
f.bic(distance_matrix)

In [36]:
np.unique(cl)

array([-1,  0,  1,  2,  3,  4])

In [33]:
# import sklearn
# sklearn.__version__

'1.3.2'

In [20]:
#from src.explanation.clustering.evaluation import (
#    apply_grid_search_on_explanation_dataset)

# Apply the grid search on a subset of the training set.
apply_grid_search_on_explanation_dataset(
    x=x_train[::10],
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight_list=[2, 3],
    n_clusters_list=[3, 4, 5])

Testing: n_clusters: 3 speed_distance_weight: 2
[100/100] - 317s - Within Cluster Variance: 2.71 - Connected Clusters Dissimilarity: 17.5 - Average time: 3.17s 
Testing: n_clusters: 3 speed_distance_weight: 3
[100/100] - 308s - Within Cluster Variance: 2.78 - Connected Clusters Dissimilarity: 17.4 - Average time: 3.08s 
Testing: n_clusters: 4 speed_distance_weight: 2
[100/100] - 343s - Within Cluster Variance: 1.9 - Connected Clusters Dissimilarity: 18.2 - Average time: 3.42s 
Testing: n_clusters: 4 speed_distance_weight: 3
[100/100] - 375s - Within Cluster Variance: 1.67 - Connected Clusters Dissimilarity: 18.9 - Average time: 3.75s 
Testing: n_clusters: 5 speed_distance_weight: 2
[100/100] - 434s - Within Cluster Variance: 1.41 - Connected Clusters Dissimilarity: 18.2 - Average time: 4.34s 
Testing: n_clusters: 5 speed_distance_weight: 3
[100/100] - 446s - Within Cluster Variance: 1.44 - Connected Clusters Dissimilarity: 18.4 - Average time: 4.45s 


In [21]:
# Set the best parameters based on the results of the grid search.

SPEED_DISTANCE_WEIGHT = 3
N_CLUSTERS = 4

In [22]:
from src.explanation.clustering.evaluation import (
    get_explanation_dataset_clustering_scores)

get_explanation_dataset_clustering_scores(
    x_train,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=SPEED_DISTANCE_WEIGHT,
    n_clusters=N_CLUSTERS)

[995/995] - 3500s - Within Cluster Variance: 2.1 - Connected Clusters Dissimilarity: 17.4 - Average time: 3.52s 


In [23]:
from src.explanation.clustering.evaluation import (
    get_explanation_dataset_clustering_scores)

get_explanation_dataset_clustering_scores(
    x_val,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=SPEED_DISTANCE_WEIGHT,
    n_clusters=N_CLUSTERS)

[197/197] - 773s - Within Cluster Variance: 2.39 - Connected Clusters Dissimilarity: 17.4 - Average time: 3.92s 


In [24]:
from src.explanation.clustering.evaluation import (
    get_explanation_dataset_clustering_scores)

get_explanation_dataset_clustering_scores(
    x_test,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix,
    speed_distance_weight=SPEED_DISTANCE_WEIGHT,
    n_clusters=N_CLUSTERS)

[299/299] - 1235s - Within Cluster Variance: 2.31 - Connected Clusters Dissimilarity: 18.4 - Average time: 4.13s 
