In [1]:
import sys
import os

# Set the main path in the root folder of the project.
sys.path.append(os.path.join('..'))

In [2]:
# Settings for autoreloading.
%load_ext autoreload
%autoreload 2

In [3]:
from src.utils.seed import set_random_seed

# Set the random seed for deterministic operations.
SEED = 42
set_random_seed(SEED)

In [4]:
import torch

# Set the device for training and querying the model.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'The selected device is: "{DEVICE}"')

The selected device is: "cuda"


# Loading the Data

In [5]:
import os

BASE_DATA_DIR = os.path.join('..', 'data', 'metr-la')

In [6]:
import pickle
with open(os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl'), 'rb') as f:
    scaler = pickle.load(f)

In [7]:
from src.spatial_temporal_gnn.model import SpatialTemporalGNN
from src.data.data_extraction import get_adjacency_matrix

# Get the adjacency matrix
adj_matrix_structure = get_adjacency_matrix(
    os.path.join(BASE_DATA_DIR, 'adj_mx_metr_la.pkl'))

# Get the header of the adjacency matrix, the node indices and the
# matrix itself.
header, node_ids_dict, adj_matrix = adj_matrix_structure

# Get the STGNN and load the checkpoints.
spatial_temporal_gnn = SpatialTemporalGNN(9, 1, 12, 12, adj_matrix, DEVICE, 64)

stgnn_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                      'st_gnn_metr_la.pth')

stgnn_checkpoints = torch.load(stgnn_checkpoints_path)
spatial_temporal_gnn.load_state_dict(stgnn_checkpoints['model_state_dict'])

# Set the model in evaluation mode.
spatial_temporal_gnn.eval();

SpatialTemporalGNN(
  (encoder): Linear(in_features=9, out_features=64, bias=False)
  (s_gnns): ModuleList(
    (0): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=64, out_features=64, bias=False)
        (1): Linear(in_features=64, out_features=32, bias=False)
      )
      (linear): Linear(in_features=64, out_features=64, bias=False)
    )
    (1): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=64, out_features=64, bias=False)
        (1): Linear(in_features=64, out_features=32, bias=False)
      )
      (linear): Linear(in_features=64, out_features=64, bias=False)
    )
    (2): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=64, out_features=64, bias=False)
        (1): Linear(in_features=64, out_features=32, bias=False)
      )
      (linear): Linear(in_features=64, out_features=64, bias=False)
    )
    (3): S_GNN(
      (latent_encoder): Sequential(
        (0): Linear(in_features=64, out_features

In [8]:
from src.data.data_extraction import get_locations_dataframe

# Get the dataframe containing the latitude and longitude of each sensor.
locations_df = get_locations_dataframe(
    os.path.join(BASE_DATA_DIR, 'graph_sensor_locations_metr_la.csv'),
    has_header=True)

In [9]:
# Get the node positions dictionary.
node_pos_dict = { i: id for id, i in node_ids_dict.items() }

In [10]:
import os
import numpy as np
from src.spatial_temporal_gnn.prediction import predict

# Get the data and the values predicted by the STGNN.
x_train = np.load(os.path.join(BASE_DATA_DIR, 'processed', 'x_train.npy'))
y_train = predict(spatial_temporal_gnn, x_train, scaler, DEVICE)
x_val = np.load(os.path.join(BASE_DATA_DIR, 'processed', 'x_val.npy'))
y_val = predict(spatial_temporal_gnn, x_val, scaler, DEVICE)
x_test = np.load(os.path.join(BASE_DATA_DIR, 'processed', 'x_test.npy'))
y_test = predict(spatial_temporal_gnn, x_test, scaler, DEVICE)

In [11]:
# Turn the results in kilometers per hour.
MPH_TO_KMH_FACTOR = 1.609344

y_train = y_train * MPH_TO_KMH_FACTOR
y_val = y_val * MPH_TO_KMH_FACTOR
y_test = y_test * MPH_TO_KMH_FACTOR

In [12]:
# MPH_TO_KMH_FACTOR = 1.609344
# sample = y_test[100] * MPH_TO_KMH_FACTOR

In [13]:
_, n_timesteps, n_nodes, _ = y_train.shape

In [14]:
# sample_reshaped = sample.reshape(-2, 1)

In [15]:
# n_total_nodes = sample_reshaped.shape[0]

In [16]:
# import numpy as np

In [17]:
# np.std(adj_matrix)

# Adjacency Distance Matrix

In [18]:
adj_matrix_expanded = np.concatenate(
    [np.concatenate([adj_matrix] * n_timesteps, axis=0)] * n_timesteps, axis=1)

distance_adj_matrix = 1 - adj_matrix_expanded

In [19]:
distance_adj_matrix

array([[0.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 0.6090446 , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.28256208, 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        0.        ]], dtype=float32)

In [20]:
distance_adj_matrix.shape

(2484, 2484)

# Temporal Distance Matrix

In [21]:
from scipy.spatial.distance import cdist

# Line-space the time steps between 0 and 1.
linespaced_time_steps = np.linspace(0, 1, n_timesteps)

# Repeat each time step for each node.
extended_time_steps = np.repeat(linespaced_time_steps, n_nodes)

# Add dummy dimension to the array.
extended_time_steps = np.expand_dims(extended_time_steps, axis=1)

distance_time_matrix = cdist(extended_time_steps, extended_time_steps, 'euclidean')

In [22]:
#distance_time_matrix[1, 1]

In [23]:
distance_time_matrix.shape

(2484, 2484)

# Clustering Function

In [24]:
from sklearn.cluster import DBSCAN

def get_clusters(
    instance: np.ndarray, adj_distance_matrix: np.ndarray,
    time_distance_matrix: np.ndarray, eps: float,
    min_samples: int, speed_distance_weight: float = 3) -> np.ndarray:
    """
    Get the clusters of the given instance using the DBSCAN algorithm.

    Parameters
    ----------
    instance : ndarray
        The spatial-temporal graph instance to cluster.
    adj_distance_matrix : ndarray
        The adjacency matrix of the nodes in the graph measured in distance
        between 0 and 1.
    time_distance_matrix : ndarray
        The matrix measuring the distance between the time steps
        of the nodes in the graph between 0 and 1.
    eps : float
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other.
    min_samples : int
        The number of samples in a neighborhood for a point for it to be
        considered as a core point.
    speed_distance_weight : float, optional
        The weight of the speed distance in the clustering process,
        by default 3.

    Returns
    -------
    ndarray
        The clusters of the given instance.
    """
    n_timesteps, n_nodes, _ = instance.shape

    # Reshape the instance to be a column vector.
    instance = instance.reshape(-2, 1)

    # Compute the distance matrix between the speed of the nodes in the graph.
    speed_distance_matrix = cdist(instance, instance, 'euclidean')
    # Normalize the distance matrix between 0 and 1.
    speed_distance_matrix /= np.max(speed_distance_matrix)

    # Compute the weighted distance matrix between the nodes in the graph
    # in terms of speed and the spatial and temporal distances.
    distance_matrix = speed_distance_matrix * speed_distance_weight +\
        adj_distance_matrix + time_distance_matrix

    # Normalize the distance matrix between 0 and 1.
    #distance_matrix /= np.max(distance_matrix)
    # Set the distance between nodes that are not connected to the maximum.
    distance_matrix[distance_adj_matrix == 1] = 1_000

    # Compute the clusters of the given instance using the DBSCAN algorithm.
    dbscan = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples,
                    n_jobs=-1)
    clusters = dbscan.fit_predict(distance_matrix)

    # Add a dummy dimension to the clusters array.
    clusters = np.expand_dims(clusters, axis=1)

    # Reshape the clusters array to have the same shape as the instance.
    clusters = clusters.reshape(n_timesteps, n_nodes, 1)

    return clusters

In [25]:
import numpy as np

def get_within_clusters_variance(
    instance: np.ndarray, clusters: np.ndarray) -> float:
    """Get the Within-Cluster Variance metric of the clusters
    obtained on the given instance in terms of speed.

    Parameters
    ----------
    instance : ndarray
        The spatial-temporal graph instance on which the clusters
        are evaluated.
    clusters : ndarray
        The clusters obtained on the given instance.

    Returns
    -------
    float
        The Within-Cluster Variance metric result.
    """
    # Set the intial value of the numerator sum to 0.
    numerator_sum = 0.
    # Set the initial value of the total number of nodes to 0.
    total_node_number = 0.

    for c in np.unique(clusters):
        # Get the sub-sample of the nodes in the graph that belong to the
        # current cluster.
        sub_sample = instance[clusters == c]
        # Get the length of the sub-sample.
        len_sub_sample = len(sub_sample)
        # Update the total nominator sum.
        numerator_sum += np.var(sub_sample) * len_sub_sample
        # Update the total number of nodes with the length of the sub-sample.
        total_node_number += len_sub_sample

    return numerator_sum / (total_node_number * np.var(instance))

In [26]:
import numpy as np

def are_clusters_connected(
    clusters: np.ndarray, cluster_1: int, cluster_2: int,
    adj_matrix: np.ndarray) -> bool:
    """
    Check whether the given clusters are connected or not, by
    observing whethere there are node spatially or temporally
    connected between the two clusters.

    Parameters
    ----------
    clusters : ndarray
        The clustered instance.
    cluster_1 : int
        The ID of the first cluster.
    cluster_2 : int
        The ID of the second cluster.
    adj_matrix : ndarray
        The adjacency matrix of the nodes in the graph.

    Returns
    -------
    bool
        Whether the given clusters are connected or not.
    """
    # Get the indices of the nodes that belong to the two clusters.
    # indices = (list of timesteps, list of nodes).
    indices_cluster_1 = np.where(clusters == cluster_1)[:-1]
    indices_cluster_2 = np.where(clusters == cluster_2)[:-1]

    # Zip the indices of the nodes that belong to the two clusters.
    zip_indices_cluster_1 = zip(indices_cluster_1[0], indices_cluster_1[1])
    zip_indices_cluster_2 = zip(indices_cluster_2[0], indices_cluster_2[1])

    for indices_cluster_1 in zip_indices_cluster_1:
        for indices_cluster_2 in zip_indices_cluster_2:
            # Get the indices of the timestep and the nodes.
            timestep_0, node_0 = indices_cluster_1
            timestep_1, node_1 = indices_cluster_2
            # Check if the nodes are spatially connected in the same timestep.
            if timestep_0 == timestep_1 and (adj_matrix[node_0, node_1] > 0 or adj_matrix[node_1, node_0] > 0):
                return True
            # Check if the nodes are the same and temporally connected.
            if node_0 == node_1 and np.abs(timestep_0 - timestep_1) == 1:
                return True
    # If no connection is found, return False.
    return False

In [27]:
import numpy as np

def get_connected_cluster_dissimilarity(
    instance: np.ndarray, clusters: np.ndarray,
    adj_matrix: np.ndarray) -> float:
    """Get the Connected Cluster Dissimilarity metric of the clusters
    obtained on the given instance in terms of speed.

    Parameters
    ----------
    instance : ndarray
        The spatial-temporal graph instance on which the clusters
        are evaluated.
    clusters : ndarray
        The clusters obtained on the given instance.
    adj_matrix : ndarray
        The adjacency matrix of the nodes in the graph.

    Returns
    -------
    float
        The Connected Cluster Dissimilarity metric result.
    """
    # Get the total unique cluster IDs.
    total_clusters = np.unique(clusters)

    # Set the initial value of the denominator sum to 0.
    denominator_sum = 0.
    # Set the initial value of the nominator sum to 0.
    nominator_sum = 0.

    for i, c1 in enumerate(total_clusters):
        for c2 in total_clusters[i+1:]:
            # If the two clusters are not connected, continue the loop.
            #if not are_clusters_connected(clusters, c1, c2, adj_matrix):
            #    continue
            # Get the sub-samples of the nodes in the graph that belong to the
            # current clusters.
            sub_sample1 = instance[clusters == c1]
            sub_sample2 = instance[clusters == c2]
            # Get the length of the sub-samples.
            len_sub_sample1 = len(sub_sample1)
            len_sub_sample2 = len(sub_sample2)
            # Compute the square root of the product of the lengths.
            sqrt_lens = np.sqrt(len_sub_sample1 * len_sub_sample2)
            # Compute the absolute difference between the means.
            abs_mean_diff = np.abs(np.mean(sub_sample1) - np.mean(sub_sample2))
            # Update the nominator sum.
            nominator_sum += sqrt_lens * abs_mean_diff
            # Update the denominator sum.
            denominator_sum += sqrt_lens

    return nominator_sum / denominator_sum if denominator_sum > 0 else 0

In [28]:
sample = y_test[100]
 
clusters = get_clusters(sample, distance_adj_matrix, distance_time_matrix,
                        eps=.1, min_samples=6)

print('Within-Cluster Variance:', get_within_clusters_variance(sample, clusters))
print('Connected Cluster Dissimilarity:', get_connected_cluster_dissimilarity(sample, clusters, adj_matrix))

Within-Cluster Variance: 1.0
Connected Cluster Dissimilarity: 0


In [29]:
#from tqdm import tqdm
from sklearn.model_selection import ParameterGrid

parameter_grid = ParameterGrid({
    'eps': [.1, .15, .2, .25, .3, .35, .4, .45, .5],
    'min_samples': [5, 7, 10, 12, 15, 17, 20]
    })

for p in parameter_grid:
    total_within_cluster_variance = 0.
    total_connected_cluster_dissimilarity = 0.
    total_noise_ratio = 0.
    print('eps:', p['eps'], 'min_samples:', p['min_samples'])
    for instance in y_train[:200]:
        clusters = get_clusters(instance, distance_adj_matrix, distance_time_matrix,
                                eps=p['eps'], min_samples=p['min_samples'])
        within_cluster_variance = get_within_clusters_variance(
            instance, clusters)
        connected_cluster_dissimilarity = get_connected_cluster_dissimilarity(
            instance, clusters, adj_matrix)
        total_within_cluster_variance += within_cluster_variance
        total_connected_cluster_dissimilarity += connected_cluster_dissimilarity
        noise = clusters[clusters == -1]
        total_noise_ratio += len(noise) / len(instance.flatten())
        
        

    avg_within_cluster_variance = total_within_cluster_variance / len(y_train[:200])
    avg_connected_cluster_dissimilarity = total_connected_cluster_dissimilarity / len(y_train[:200])
    avg_noise_ratio = total_noise_ratio / len(y_train[:200])
    
    noise = clusters[clusters == -1]
    
    print('\tWithin-Cluster Variance:', f'{avg_within_cluster_variance:.3g}', 
          'Connected Cluster Dissimilarity:', f'{avg_connected_cluster_dissimilarity:.3g}',
          'Noise points ratio:', f'{avg_noise_ratio:.3g}')
    print()

eps: 0.1 min_samples: 5
	Within-Cluster Variance: 0.999 Connected Cluster Dissimilarity: 6.07 Noise points ratio: 0.997

eps: 0.1 min_samples: 7
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 10
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 12
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 15
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 17
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 20
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.15 min_samples: 5
	Within-Cluster Variance: 0.946 Connected Cluster Dissimilarity: 12.1 Noise points ratio: 0.913

eps: 0.15 min_samples: 7
	Within-Cluster Variance: 0.993 Connected Cluster Dissimilarity: 8.

In [30]:
from sklearn.model_selection import ParameterGrid

'''parameter_grid = ParameterGrid({
    'eps': [.05, .1, .13, .15, .17],
    'min_samples': [4, 5, 6, 7, 8, 9, 10]
    })''';


'''parameter_grid = ParameterGrid({
    'eps': [.5, .6, .7, .8, .9, 1.],
    'min_samples': [5, 7, 10, 12, 15, 17, 20]
    })''';

In [31]:
len(parameter_grid)

63

In [32]:
#from tqdm import tqdm

for p in parameter_grid:
    total_within_cluster_variance = 0.
    total_connected_cluster_dissimilarity = 0.
    total_noise_ratio = 0.
    print('eps:', p['eps'], 'min_samples:', p['min_samples'])
    for instance in y_train[:200]:
        clusters = get_clusters(instance, distance_adj_matrix, distance_time_matrix,
                                eps=p['eps'], min_samples=p['min_samples'])
        within_cluster_variance = get_within_clusters_variance(
            instance, clusters)
        connected_cluster_dissimilarity = get_connected_cluster_dissimilarity(
            instance, clusters, adj_matrix)
        total_within_cluster_variance += within_cluster_variance
        total_connected_cluster_dissimilarity += connected_cluster_dissimilarity
        noise = clusters[clusters == -1]
        total_noise_ratio += len(noise) / len(instance.flatten())
        
        

    avg_within_cluster_variance = total_within_cluster_variance / len(y_train[:200])
    avg_connected_cluster_dissimilarity = total_connected_cluster_dissimilarity / len(y_train[:200])
    avg_noise_ratio = total_noise_ratio / len(y_train[:200])
    
    noise = clusters[clusters == -1]
    
    print('\tWithin-Cluster Variance:', f'{avg_within_cluster_variance:.3g}', 
          'Connected Cluster Dissimilarity:', f'{avg_connected_cluster_dissimilarity:.3g}',
          'Noise points ratio:', f'{avg_noise_ratio:.3g}')
    print()

eps: 0.1 min_samples: 5
	Within-Cluster Variance: 0.999 Connected Cluster Dissimilarity: 6.07 Noise points ratio: 0.997

eps: 0.1 min_samples: 7
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 10
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 12
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 15
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 17
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 20
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.15 min_samples: 5
	Within-Cluster Variance: 0.946 Connected Cluster Dissimilarity: 12.1 Noise points ratio: 0.913

eps: 0.15 min_samples: 7
	Within-Cluster Variance: 0.993 Connected Cluster Dissimilarity: 8.

KeyboardInterrupt: 

In [33]:
EPS = .5
MIN_SAMPLES = 5

#EPS = .1
#MIN_SAMPLES = 6

In [34]:
total_within_cluster_variance = 0.
total_connected_cluster_dissimilarity = 0.
print('Test set evaluation with eps:', EPS, 'min_samples:', MIN_SAMPLES)
for instance in y_test:
    clusters = get_clusters(instance, distance_adj_matrix, distance_time_matrix,
                            eps=EPS, min_samples=MIN_SAMPLES)
    within_cluster_variance = get_within_clusters_variance(instance, clusters)
    connected_cluster_dissimilarity = get_connected_cluster_dissimilarity(
        instance, clusters, adj_matrix)
    total_within_cluster_variance += within_cluster_variance
    total_connected_cluster_dissimilarity += connected_cluster_dissimilarity

avg_within_cluster_variance = total_within_cluster_variance / len(y_train)
avg_connected_cluster_dissimilarity = total_connected_cluster_dissimilarity / len(y_train)
print('\tWithin-Cluster Variance:', f'{avg_within_cluster_variance:.3g}', 
        'Connected Cluster Dissimilarity:', f'{avg_connected_cluster_dissimilarity:.3g}')

Test set evaluation with eps: 0.5 min_samples: 5
	Within-Cluster Variance: 0.033 Connected Cluster Dissimilarity: 4.43


In [35]:
sample = y_test[100]

In [36]:
clusters = get_clusters(instance, distance_adj_matrix, distance_time_matrix,
                        eps=EPS, min_samples=MIN_SAMPLES)

print(get_within_clusters_variance(instance, clusters))
print(get_connected_cluster_dissimilarity(instance, clusters, adj_matrix))

0.15982233576406152
10.59636877024353


In [37]:
print(np.unique(clusters.squeeze()))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79 80]


In [38]:
print('Number of clusters found:', len(np.unique(clusters)))

Number of clusters found: 82


In [39]:
sample = np.concatenate((sample, clusters), axis=2)

In [40]:
import pandas as pd
from typing import Dict

def get_node_values_with_location_dataframe(
    node_values: np.ndarray, node_pos_dict: Dict[int, str],
    locations_df: pd.DataFrame) -> pd.DataFrame:
    """Get a pandas dataframe from a pandas dataframe of node speed values
    and a pandas dataframe of node locations. The resulting dataframe
    has for each timestamp the value of the metric for each node and
    the location of the node in the form of latitude and longitude.

    Parameters
    ----------
    node_values : ndarray
        The numpy array containing the values of the cluster and speed of
        each node for each timestamp.
    locations_df : DataFrame
        The dataframe containing the location of each node.
    metric_name : str
        The name of the metric that will be used in the resulting dataframe.
    turn_datetimes_to_timestamp : bool
        Whether to turn the datetimes to timestamp or not.

    Returns
    -------
    DataFrame
        The resulting dataframe containing the values of the metric for
        each node and the location of the node in the form of latitude
        and longitude.
    """
    nodes_information = []
    
    for time_idx, node_matrix in enumerate(node_values):
        for node_idx, features in enumerate(node_matrix):
            node_id = node_pos_dict[node_idx]

            latitude = locations_df.loc[
                locations_df['sensor_id'] == node_id].latitude.values[0]
            longitude = locations_df.loc[
                locations_df['sensor_id'] == node_id].longitude.values[0]
    
            nodes_information.append(
                [node_id,
                 latitude,
                 longitude,
                 features[1],
                 features[0],
                 time_idx])

    df = pd.DataFrame({
        'sensor_id': [n[0] for n in nodes_information],
        'latitude': [n[1] for n in nodes_information],
        'longitude': [n[2] for n in nodes_information],
        'cluster': [n[3] for n in nodes_information],
        'speed': [n[4] for n in nodes_information],
        'datetime': [n[5] for n in nodes_information]
    })
    df['cluster'] = df['cluster'].astype(int)
    return df


In [41]:
location_df_with_clusters = get_node_values_with_location_dataframe(
    sample, node_pos_dict, locations_df)


In [42]:
location_df_with_clusters.head()

Unnamed: 0,sensor_id,latitude,longitude,cluster,speed,datetime
0,773869,34.15497,-118.31829,0,105.516815,0
1,767541,34.11621,-118.23799,1,106.30835,0
2,767542,34.11641,-118.23819,2,110.634773,0
3,717447,34.07248,-118.26772,3,81.586983,0
4,717446,34.07142,-118.26572,70,44.713619,0


In [43]:
from keplergl.keplergl import KeplerGl

m = KeplerGl(height=800, show_docs=False, data={'data': location_df_with_clusters})

In [44]:
'''from src.data.data_analysis import show_kepler_map

print('Metr-LA speed clusters on the first Monday:')
show_kepler_map(location_df_with_clusters, None)''';

In [45]:
m

KeplerGl(data={'data':      sensor_id  latitude  longitude  cluster       speed  datetime
0       773869  34.1…

In [None]:
def are_clusters_connected(clusters, c1, c2, adj_matrix):
    cluster_nodes_0 = np.where(clusters == c1)[:-1]
    cluster_nodes_1 = np.where(clusters == c2)[:-1]

    for i in zip(cluster_nodes_0[0], cluster_nodes_0[1], cluster_nodes_1[0], cluster_nodes_1[1]):
        # The nodes are spatially connected.
        if adj_matrix[i[1], i[3]] > 0:
            return True
        # The nodes are the same ones and there is a temporal distance of 1.
        if i[1] == i[3] and np.abs(i[0] - i[2]) == 1:
            return True
    return False

In [None]:
total_clusters = np.unique(clusters)

denominator_sum = 0.
nominator_sum = 0.

for i, c1 in enumerate(total_clusters):
    for cluster_2 in total_clusters[i+1:]:
        if not are_clusters_connected(clusters, c1, cluster_2, adj_matrix):
            continue
        sub_sample1 = sample[clusters == c1]
        sub_sample2 = sample[clusters == cluster_2]
        len_sub_sample1 = len(sub_sample1)
        len_sub_sample2 = len(sub_sample2)
        sqrt_lens = np.sqrt(len_sub_sample1 * len_sub_sample2)
        nominator_sum += sqrt_lens * np.abs(np.mean(sub_sample1) - np.mean(sub_sample2))
        denominator_sum += sqrt_lens

nominator_sum / denominator_sum

IndexError: boolean index did not match indexed array along dimension 2; dimension is 2 but corresponding boolean dimension is 1

In [None]:
print(sample_.shape)

(12, 207, 2)


In [None]:
import pandas as pd
from typing import Dict

def get_node_values_with_location_dataframe(
    node_values: np.ndarray, node_pos_dict: Dict[int, str],
    locations_df: pd.DataFrame) -> pd.DataFrame:
    """Get a pandas dataframe from a pandas dataframe of node speed values
    and a pandas dataframe of node locations. The resulting dataframe
    has for each timestamp the value of the metric for each node and
    the location of the node in the form of latitude and longitude.

    Parameters
    ----------
    node_values : ndarray
        The numpy array containing the values of the cluster and speed of
        each node for each timestamp.
    locations_df : DataFrame
        The dataframe containing the location of each node.
    metric_name : str
        The name of the metric that will be used in the resulting dataframe.
    turn_datetimes_to_timestamp : bool
        Whether to turn the datetimes to timestamp or not.

    Returns
    -------
    DataFrame
        The resulting dataframe containing the values of the metric for
        each node and the location of the node in the form of latitude
        and longitude.
    """
    nodes_information = []
    
    for time_idx, node_matrix in enumerate(node_values):
        for node_idx, features in enumerate(node_matrix):
            node_id = node_pos_dict[node_idx]

            latitude = locations_df.loc[
                locations_df['sensor_id'] == node_id].latitude.values[0]
            longitude = locations_df.loc[
                locations_df['sensor_id'] == node_id].longitude.values[0]
    
            nodes_information.append(
                [node_id,
                 latitude,
                 longitude,
                 features[1],
                 features[0],
                 time_idx])

    df = pd.DataFrame({
        'sensor_id': [n[0] for n in nodes_information],
        'latitude': [n[1] for n in nodes_information],
        'longitude': [n[2] for n in nodes_information],
        'cluster': [n[3] for n in nodes_information],
        'speed': [n[4] for n in nodes_information],
        'datetime': [n[5] for n in nodes_information]
    })
    df['cluster'] = df['cluster'].astype(int)
    return df


In [None]:
location_df_with_clusters = get_node_values_with_location_dataframe(
    sample_, node_pos_dict, locations_df)


In [None]:
location_df_with_clusters['cluster'].unique()

array([  0,   1,   2,   3,  48,  28,  63,   4,  64,  65,  66,  67,  -1,
         5,   6,  50,   7,   8,  14,  68,   9,  10,  69,  25,  51,  70,
        61,  71,  72,  27,  11,  73,  12,  13,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  52,  15,  16,  18,  17,  53,  83,  84,  85,
        32,  19,  44,  20,  86,  87,  21,  22,  23,  54,  55,  24,  88,
        26,  89,  90,  91,  92,  29,  39,  93,  94,  30,  56,  95,  31,
        96,  57,  97,  33,  58,  34,  35,  98,  36,  37,  99,  59,  47,
        38,  60,  40, 100,  41, 101, 102, 103,  42, 104, 105, 106, 107,
       108,  43, 109, 110, 111,  45, 112, 113, 114, 115,  46, 116, 117,
        62, 118, 119, 120, 121, 122, 123,  49, 124, 125])

In [None]:
#location_df_with_clusters['cluster'] = location_df_with_clusters['cluster'].astype(int)

In [None]:
location_df_with_clusters

Unnamed: 0,sensor_id,latitude,longitude,cluster,speed,datetime
0,773869,34.15497,-118.31829,0,105.516815,0
1,767541,34.11621,-118.23799,1,106.308350,0
2,767542,34.11641,-118.23819,2,110.634773,0
3,717447,34.07248,-118.26772,3,81.586983,0
4,717446,34.07142,-118.26572,48,44.713619,0
...,...,...,...,...,...,...
2479,717592,34.14604,-118.22430,121,100.792572,11
2480,717595,34.14163,-118.18290,122,109.369804,11
2481,772168,34.16542,-118.47985,123,95.773941,11
2482,718141,34.15133,-118.37456,19,106.050285,11


In [None]:
from keplergl.keplergl import KeplerGl

m = KeplerGl(height=800, show_docs=False, data={'data': location_df_with_clusters})

In [None]:
'''from src.data.data_analysis import show_kepler_map

print('Metr-LA speed clusters on the first Monday:')
show_kepler_map(location_df_with_clusters, None)''';

In [None]:
m

KeplerGl(data={'data':      sensor_id  latitude  longitude  cluster       speed  datetime
0       773869  34.1…

In [None]:
config = m.config

In [None]:
#np.expand_dims(np.repeat(np.linspace(0, 1, n_timesteps), n_nodes), axis=1).shape

In [None]:
#np.concatenate([np.repeat(np.linspace(0, 1, n_timesteps), n_nodes)] * n_nodes, axis=1).shape

In [None]:
# np.repeat(adj_matrix, 12).reshape(speed_distance_matrix.shape)

In [None]:
#speed_distance_matrix

In [None]:
similarity_matrix = np.zeros((n_total_nodes, n_total_nodes))

for i in range(similarity_matrix.shape[0]):
    for j in range(similarity_matrix.shape[1]):
        i_timestep = i // n_nodes
        j_timestep = j // n_nodes
        
        i_id = i % n_nodes
        j_id = j % n_nodes
        
        time_difference = abs(i_timestep - j_timestep)
        #speed_difference = np.linalg.norm(sample_reshaped[i] - sample_reshaped[j])
        
        if (adj_matrix[i_id][j_id] > .5) * (time_difference <= 2):
            speed_difference = np.linalg.norm(sample_reshaped[i] - sample_reshaped[j])
        else:
            speed_difference = float('inf')
        
        similarity_matrix[i][j] = speed_difference

In [None]:
#similarity_matrix

In [None]:
max_value = np.max(similarity_matrix[similarity_matrix != float('inf')])

In [None]:
similarity_matrix[similarity_matrix == float('inf')] = max_value

In [None]:
print(np.unique(similarity_matrix[1]))

[0.00000000e+00 3.14331055e-03 3.24172974e-02 4.27729034e+00
 4.36017609e+00 4.36315918e+00 5.13887024e+00 5.16516113e+00
 5.19464874e+00 8.32704544e+01]


In [None]:
similarity_matrix.std()

5.796418782568813

In [None]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(metric='precomputed', eps=10., min_samples=5, n_jobs=-1)

In [None]:
clusters = dbscan.fit_predict(similarity_matrix)

In [None]:
len(np.unique(clusters))

62

In [None]:
clusters = np.expand_dims(clusters, axis=1)

In [None]:
clusters.shape

(2484, 1)

In [None]:
clusters = clusters.reshape(n_timesteps, n_nodes, 1)

In [None]:
clusters.shape

(12, 207, 1)

In [None]:
sample_ = np.concatenate((sample, clusters), axis=2)

In [None]:
print(sample_.shape)

(12, 207, 2)


In [None]:
import pandas as pd
from typing import Dict

def get_node_values_with_location_dataframe(
    node_values: np.ndarray, node_pos_dict: Dict[int, str],
    locations_df: pd.DataFrame) -> pd.DataFrame:
    """Get a pandas dataframe from a pandas dataframe of node speed values
    and a pandas dataframe of node locations. The resulting dataframe
    has for each timestamp the value of the metric for each node and
    the location of the node in the form of latitude and longitude.

    Parameters
    ----------
    node_values : ndarray
        The numpy array containing the values of the cluster and speed of
        each node for each timestamp.
    locations_df : DataFrame
        The dataframe containing the location of each node.
    metric_name : str
        The name of the metric that will be used in the resulting dataframe.
    turn_datetimes_to_timestamp : bool
        Whether to turn the datetimes to timestamp or not.

    Returns
    -------
    DataFrame
        The resulting dataframe containing the values of the metric for
        each node and the location of the node in the form of latitude
        and longitude.
    """
    nodes_information = []
    
    for time_idx, node_matrix in enumerate(node_values):
        for node_idx, features in enumerate(node_matrix):
            node_id = node_pos_dict[node_idx]

            latitude = locations_df.loc[
                locations_df['sensor_id'] == node_id].latitude.values[0]
            longitude = locations_df.loc[
                locations_df['sensor_id'] == node_id].longitude.values[0]
    
            nodes_information.append(
                [node_id,
                 latitude,
                 longitude,
                 features[1],
                 features[0],
                 time_idx])

    df = pd.DataFrame({
        'sensor_id': [n[0] for n in nodes_information],
        'latitude': [n[1] for n in nodes_information],
        'longitude': [n[2] for n in nodes_information],
        'cluster': [n[3] for n in nodes_information],
        'speed': [n[4] for n in nodes_information],
        'datetime': [n[5] for n in nodes_information]
    })
    df['cluster'] = df['cluster'].astype(int)
    return df


In [None]:
location_df_with_clusters = get_node_values_with_location_dataframe(
    sample_, node_pos_dict, locations_df)


In [None]:
location_df_with_clusters['cluster'].unique()

array([ 0,  1,  2,  3,  4, 13,  5,  6, 37,  7, 41,  8,  9, 10, 42, 43, 11,
       44, 12, 14, 15, -1, 16, 45, 46, 17, 18, 47, 19, 20, 21, 48, 49, 50,
       22, 23, 24, 25, 51, 52, 26, 27, 28, 29, 30, 53, 54, 31, 32, 38, 55,
       33, 56, 57, 34, 58, 40, 35, 36, 59, 60, 39])

In [None]:
#location_df_with_clusters['cluster'] = location_df_with_clusters['cluster'].astype(int)

In [None]:
location_df_with_clusters

Unnamed: 0,sensor_id,latitude,longitude,cluster,speed,datetime
0,773869,34.15497,-118.31829,0,105.516815,0
1,767541,34.11621,-118.23799,1,106.308350,0
2,767542,34.11641,-118.23819,1,110.634773,0
3,717447,34.07248,-118.26772,2,81.586983,0
4,717446,34.07142,-118.26572,3,44.713619,0
...,...,...,...,...,...,...
2479,717592,34.14604,-118.22430,0,100.792572,11
2480,717595,34.14163,-118.18290,1,109.369804,11
2481,772168,34.16542,-118.47985,39,95.773941,11
2482,718141,34.15133,-118.37456,2,106.050285,11


In [None]:
from keplergl.keplergl import KeplerGl

m = KeplerGl(height=800, show_docs=False, data={'data': location_df_with_clusters})

In [None]:
'''from src.data.data_analysis import show_kepler_map

print('Metr-LA speed clusters on the first Monday:')
show_kepler_map(location_df_with_clusters, None)''';

In [None]:
m

KeplerGl(data={'data':      sensor_id  latitude  longitude  cluster       speed  datetime
0       773869  34.1…

In [None]:
config = m.config