In [1]:
import sys
import os

# Set the main path in the root folder of the project.
sys.path.append(os.path.join('..'))

In [2]:
# Settings for autoreloading.
%load_ext autoreload
%autoreload 2

In [3]:
from src.utils.seed import set_random_seed

# Set the random seed for deterministic operations.
SEED = 42
set_random_seed(SEED)

In [4]:
import torch

# Set the device for training and querying the model.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'The selected device is: "{DEVICE}"')

The selected device is: "cuda"


# Loading the Data

In [5]:
import os

BASE_DATA_DIR = os.path.join('..', 'data', 'metr-la')

In [6]:
import pickle
with open(os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl'), 'rb') as f:
    scaler = pickle.load(f)

In [7]:
from src.spatial_temporal_gnn.model import SpatialTemporalGNN
from src.explanation.navigator.model import Navigator
from src.data.data_extraction import get_adjacency_matrix

# Get the adjacency matrix
adj_matrix_structure = get_adjacency_matrix(
    os.path.join(BASE_DATA_DIR, 'raw', 'adj_mx_metr_la.pkl'))

# Get the header of the adjacency matrix, the node indices and the
# matrix itself.
header, node_ids_dict, adj_matrix = adj_matrix_structure

# Get the STGNN and load the checkpoints.
spatial_temporal_gnn = SpatialTemporalGNN(9, 1, 12, 12, adj_matrix, DEVICE, 64)

stgnn_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                      'st_gnn_metr_la.pth')

stgnn_checkpoints = torch.load(stgnn_checkpoints_path)
spatial_temporal_gnn.load_state_dict(stgnn_checkpoints['model_state_dict'])

# Set the STGNN in evaluation mode.
spatial_temporal_gnn.eval();

# Get the Navigator and load the checkpoints.
navigator = Navigator(DEVICE)

navigator_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                          'navigator_metr_la.pth')

navigator_checkpoints = torch.load(navigator_checkpoints_path)
navigator.load_state_dict(navigator_checkpoints['model_state_dict'])

# Set the Navigator in evaluation mode.
navigator.eval();



In [8]:
from src.data.data_extraction import get_locations_dataframe

# Get the dataframe containing the latitude and longitude of each sensor.
locations_df = get_locations_dataframe(
    os.path.join(BASE_DATA_DIR, 'raw', 'graph_sensor_locations_metr_la.csv'),
    has_header=True)

In [9]:
# Get the node positions dictionary.
node_pos_dict = { i: id for id, i in node_ids_dict.items() }

In [10]:
import pickle

# Get the data scaler.
with open(os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl'), 'rb') as f:
    scaler = pickle.load(f)

In [11]:
import os
import numpy as np

# Get the explained data.
x_test = np.load(os.path.join(BASE_DATA_DIR, 'explained', 'x_test.npy'))
y_test = np.load(os.path.join(BASE_DATA_DIR, 'explained', 'y_test.npy'))

# Get the time information of the explained data.
x_test_time = np.load(os.path.join(BASE_DATA_DIR, 'explained', 'x_test_time.npy'))
y_test_time = np.load(os.path.join(BASE_DATA_DIR, 'explained', 'y_test_time.npy'))

In [12]:
import pickle

with open(os.path.join(BASE_DATA_DIR, 'structured', 'node_locations.pkl'), 'rb') as f:
    node_info = pickle.load(f)

In [34]:
list(node_info.items())[:5]

[('773869', ('Ventura Freeway', 14.54715370560186)),
 ('767541', ('Glendale Freeway', 10.345751960044197)),
 ('767542', ('Glendale Freeway', 10.327808458197385)),
 ('717447', ('Hollywood Freeway', 13.112986186569106)),
 ('717446', ('Hollywood Freeway', 13.329055658812624))]

In [13]:
sample_x, sample_y, sample_x_time, sample_y_time = x_test[0], y_test[0], x_test_time[0], y_test_time[0]

In [14]:
_, n_timesteps, n_nodes, _ = y_test.shape

In [15]:
from src.explanation.clustering.clustering import (
    get_adjacency_distance_matrix)

adj_distance_matrix = get_adjacency_distance_matrix(adj_matrix, n_timesteps)

In [16]:
from src.explanation.clustering.clustering import (
    get_temporal_distance_matrix)

temporal_distance_matrix = get_temporal_distance_matrix(n_nodes, n_timesteps)

In [17]:
# Set the best parameters based on the results of the grid search.

EPS = .35
MIN_SAMPLES = 5

In [18]:
sample_x[..., 0:]

array([[[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [ 0.  ],
        [ 0.  ],
        [ 0.  ]],

       [[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [ 0.  ],
        [ 0.  ],
        [ 0.  ]],

       [[ 0.  ],
        [ 0.  ],
        [ 0.  ],
        ...,
        [ 0.  ],
        [ 0.  ],
        [ 0.  ]],

       ...,

       [[ 0.  ],
        [62.  ],
        [ 0.  ],
        ...,
        [ 0.  ],
        [ 0.  ],
        [ 0.  ]],

       [[63.  ],
        [63.  ],
        [ 0.  ],
        ...,
        [ 0.  ],
        [ 0.  ],
        [ 0.  ]],

       [[63.75],
        [59.25],
        [ 0.  ],
        ...,
        [ 0.  ],
        [ 0.  ],
        [ 0.  ]]])

In [19]:
from src.explanation.clustering.clustering import get_clusters

clusters = get_clusters(
    sample_x[..., :1],
    adj_distance_matrix,
    temporal_distance_matrix,
    eps=EPS,
    min_samples=MIN_SAMPLES,
    remove_zeros=True)

(2484, 1)


In [20]:
# print(np.sum(sample_x != 0.))

In [21]:
'''
import numpy as np

print(np.unique(clusters))
''';

In [22]:
#print(np.unique(sample_x[clusters == -2]))

In [23]:
np.nonzero(sample_y)

(array([ 0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,
         8,  9,  9, 10, 10, 11, 11], dtype=int64),
 array([ 16, 196,  16, 196,  16, 196,  16, 196,  16, 196,  16, 196,  16,
        196,  16, 196,  16, 196,  16, 196,  16, 196,  16, 196], dtype=int64),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0], dtype=int64))

In [71]:
from typing import Dict, List, Tuple

import numpy as np


def _get_time(date: np.datetime64) -> Tuple[str, str, str]:
    days = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
            4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

    Y, M, D, h, m = [date.astype('datetime64[%s]' % kind) for kind in 'YMDhm']

    year = Y.astype(int) + 1970
    month = M.astype(int) % 12 + 1
    day = (D - M).astype(int) + 1
    day_of_week = days[((D - M).astype(int) - 1) % 7]
    hour = (h - D).astype(int)
    minute = (m - h).astype(int)

    return day_of_week, f'{day:02d}/{month:02d}/{year}', f'{hour:02d}:{minute:02d}'

def _get_cluster_type(
    values: np.ndarray,
    congestion_max_speed: float = 60,
    free_flow_min_speed: float = 110
    ) -> str:
    if np.all(values) <= congestion_max_speed:
        return 'congestion'
    elif np.all(values) >= free_flow_min_speed:
        return 'free flow'
    else:
        return 'group of nodes'

def _set_cluster_location_info(
    knowledge_graph: List[Tuple[str, str, str]],
    reference_cluster_name: str,
    node_info: Dict[str, Tuple[str, int]],
    node_indices: np.ndarray
    ) -> None:
    # Get the unique node indices.
    node_indices = np.unique(node_indices)
    # Get the IDs of the nodes by their indices.
    node_ids = [ node_pos_dict[idx] for idx in node_indices ]

    # Get a dictionary containing the street and kilometrage of each node.
    streets = {}
    for node_id in node_ids:
        # Get the street and kilometrage of the node.
        street, km = node_info[node_id]
        # Add the street and kilometrage to the dictionary.
        if not street in streets.keys():
            streets[street] = [km]
        else:
            streets[street].append(km)

    for street, kms in streets.items():
        # Add the street and its kilometrages to the knowledge graph.
        knowledge_graph.append((reference_cluster_name, 'in highway', street))
        for km in kms:
            knowledge_graph.append((street, 'at km', f'{km:.2g}'))

def _set_cluster_time_info(
    knowledge_graph: List[Tuple[str, str, str]],
    reference_cluster_name: str,
    time_info: np.ndarray,
    time_indices: np.ndarray
    ) -> None:
    # Get the minimum and maximum timestep of the target nodes.
    min_timestep, max_timestep = np.min(time_indices), np.max(time_indices)
    y_min_time, y_max_time = time_info[min_timestep][0], time_info[max_timestep][0]
    beginning_day, beginning_date, beginning_hour = _get_time(y_min_time)
    end_day, end_date, end_hour = _get_time(y_max_time)


    # Put the date and day information of the target nodes in the
    # knowledge graph.
    if beginning_date == end_date:
        knowledge_graph.append(
            (reference_cluster_name, 'on date', beginning_date))
        knowledge_graph.append(
            (reference_cluster_name, 'on day', beginning_day))
    else:
        knowledge_graph.append(
            (reference_cluster_name, 'from date', beginning_date))
        knowledge_graph.append(
            (reference_cluster_name, 'to date', end_date))

        knowledge_graph.append(
            (reference_cluster_name, 'from day', beginning_day))
        knowledge_graph.append(
            (reference_cluster_name, 'to day', end_day))

    # Put the time information of the target nodes in the knowledge graph.
    knowledge_graph.append(
        (reference_cluster_name, 'from time', beginning_hour))
    knowledge_graph.append(
        (reference_cluster_name, 'to time', end_hour))

def get_knowledge_graph(
    x: np.ndarray,
    x_times: np.ndarray,
    x_clusters: np.ndarray,
    y: np.ndarray,
    y_times: np.ndarray) -> List[Tuple[str, str, str]]:
    knowledge_graph = []

    # Get the values of the selected target nodes.
    target_node_values = y[y > 0]

    # Get the type of the target nodes cluster (eg.: congestion, free flow).
    target_type = _get_cluster_type(target_node_values)

    # Get the name of the target nodes cluster.
    target_name = f'target {target_type}' 
    # Put the average speed of the target nodes cluster in the knowledge graph.
    knowledge_graph.append(
        (target_name, 'has average speed',
         f'{target_node_values.mean():.3g} km/h'))

    # Get the indices of the non-null values of the target nodes.
    y_indices = np.nonzero(y)

    _set_cluster_time_info(knowledge_graph, target_name, y_times, y_indices[0])

    _set_cluster_location_info(
        knowledge_graph, target_name, node_info, y_indices[1])

    for i, c in enumerate([c for c in np.unique(x_clusters) if c != -1]):
        # Get the values of the nodes of the cluster.
        cluster_node_values = x[x_clusters == c]

        # Get the type of the target nodes cluster (eg.: congestion, free flow).
        if c == -2:
            cluster_type = 'group of nodes'
        else:
            cluster_type = _get_cluster_type(cluster_node_values)

        # Get the name of the target nodes cluster.
        cluster_name = f'{cluster_type}{{{i}}}'
        
        # Add the causation information to the knowledge graph.
        knowledge_graph.append((target_name, 'caused by', cluster_name))
        
        # Put the average speed of the target nodes cluster in the knowledge graph.
        if c != -2:
            knowledge_graph.append(
                (cluster_name, 
                'has average speed', 
                f'{cluster_node_values.mean():.3g} km/h'))

        x_indices = np.where(x_clusters == c)
        _set_cluster_time_info(knowledge_graph, cluster_name, x_times, x_indices[0])
        _set_cluster_location_info(knowledge_graph, cluster_name, node_info, x_indices[1])

    print(knowledge_graph)

In [72]:
sample_y[sample_y != 0.]

array([35.352768, 35.112137, 34.866005, 34.5799  , 34.926838, 34.584217,
       35.600292, 35.142715, 34.550823, 34.035618, 32.767227, 32.132698,
       33.677856, 33.246357, 33.76597 , 33.834396, 33.604694, 34.004375,
       33.706665, 33.565376, 34.097023, 33.589268, 33.505253, 33.066093],
      dtype=float32)

In [73]:
get_knowledge_graph(sample_x, sample_x_time, clusters, sample_y, sample_y_time)

[('target congestion', 'has average speed', '34.1 km/h'), ('target congestion', 'on date', '04/06/2012'), ('target congestion', 'on day', 'Wednesday'), ('target congestion', 'from time', '05:45'), ('target congestion', 'to time', '06:40'), ('target congestion', 'in highway', 'Arroyo Seco Parkway'), ('Arroyo Seco Parkway', 'at km', '1.8'), ('Arroyo Seco Parkway', 'at km', '2.5'), ('target congestion', 'caused by', 'group of nodes{0}'), ('group of nodes{0}', 'on date', '04/06/2012'), ('group of nodes{0}', 'on day', 'Wednesday'), ('group of nodes{0}', 'from time', '04:45'), ('group of nodes{0}', 'to time', '05:40'), ('group of nodes{0}', 'in highway', 'Ventura Freeway'), ('Ventura Freeway', 'at km', '15'), ('group of nodes{0}', 'in highway', 'Glendale Freeway'), ('Glendale Freeway', 'at km', '10'), ('Glendale Freeway', 'at km', '8.2'), ('group of nodes{0}', 'in highway', 'Hollywood Freeway'), ('Hollywood Freeway', 'at km', '14'), ('Hollywood Freeway', 'at km', '1.6'), ('Hollywood Freeway'

In [53]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id, max_length=1000)

In [54]:
text = '''Translate this tuple set representing a knowledge graph into a textual output. Note, numbers between curly brackets are used to aid you to identify the entities of the graph, although they shouldn\'t be considered in the textual output:
"
(target congestion, from, 10:00)
(target_congestion, to, 11:00)
(target_congestion, caused by, congestion{0})
(congestion{0}, from, 09:00)
(congestion{0}, to, 10:00)
(target_congestion, caused by, congestion{1})
(congestion{1}, from, 07:00)
(congestion{1}, to, 08:00)
"'''
encoded_input = tokenizer(text, return_tensors='pt')
output = model.generate(**encoded_input)



In [59]:
#output.shape

In [60]:
tokenizer.decode(output[0], skip_special_tokens=True)

'Translate this tuple set representing a knowledge graph into a textual output. Note, numbers between curly brackets are used to aid you to identify the entities of the graph, although they shouldn\'t be considered in the textual output:\n"\n(target congestion, from, 10:00)\n(target_congestion, to, 11:00)\n(target_congestion, caused by, congestion{0})\n(congestion{0}, from, 09:00)\n(congestion{0}, to, 10:00)\n(target_congestion, caused by, congestion{1})\n(congestion{1}, from, 07:00)\n(congestion{1}, to, 08:00)\n"\n(target_congestion, from, 10:00)\n(congestion{1}, caused by, congestion{2})\n(congestion{2}, from, 09:00)\n(congestion{2}, caused by, congestion{3})\n(congestion{3}, from, 07:00)\n"\n(target_congestion, from, 10:00)\n(congestion{3}, caused by, congestion{4})\n(congestion{4}, from, 09:00)\n"\n(target_congestion, from, 10:00)\n(congestion{4}, caused by, congestion{5})\n(congestion{5}, from, 07:00)\n"\n(target_congestion, from, 10:00)\n(congestion{5}, caused by, congestion{6})\