# Computing distances for graph neural networks

## Part 1 - data preparation

Imports

In [2]:
import math
import os
import numpy as np
import pandas as pd
import requests
import torch
import copy
import plotly.graph_objects as go
import multiprocessing
import time
from os import listdir
from os.path import join,isdir
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import Adam
from tqdm import tqdm
from ray import tune
from ray.tune.schedulers.async_hyperband import ASHAScheduler
from datetime import datetime
from pathlib import Path
from random import sample
from typing import Tuple
from geopy.distance import geodesic
from glob import glob
from sklearn.preprocessing import normalize
from enum import Enum
from sklearn.linear_model import LinearRegression
from torch_geometric_temporal.nn.attention.stgcn import STConv
from torch_geometric_temporal.nn.recurrent import GCLSTM, DCRNN
from torch_geometric.nn import GCNConv
from torch.nn import ReLU, Linear, Module, BatchNorm1d, Dropout
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *
from sklearn.metrics import r2_score
init_notebook_mode(connected=True) 

In [3]:
DATA_FOLDER = "Data"
GRAPH_INFO_TXT = "d07_text_meta_2021_03_27.txt"
current_directory = os.getcwd()
path_raw_data = os.path.join(current_directory, DATA_FOLDER)
path_raw_data_graph_info_txt = os.path.join(path_raw_data, GRAPH_INFO_TXT)


Set total number of nodes (nb_days) from Metadata (may contain empty nodes).

In [4]:
def DataReader_get_number_of_nodes(path_raw_data_graph_info_txt):
    nodes_location = []
    skip = True
    with open(path_raw_data_graph_info_txt) as f:
        content = f.readlines()
        for line in content:
            if skip:
                skip = False
            else:
                line = line.split('\t')
                line = line[:-1]         # ID     #LAT     #LONG
                nodes_location.append([line[0], line[8], line[9]])

    return len(nodes_location)


In [5]:
total_num_nodes = DataReader_get_number_of_nodes(path_raw_data_graph_info_txt)
print(f'The total number of nodes are : {total_num_nodes}')

The total number of nodes are : 4904


Get nodes that have data as good_nodes and nodes that have empty data as empty_nodes from the data.

In [6]:
def DataReader_get_good_empty_nodes(path_raw_data,total_num_nodes):
    index = total_num_nodes
    empty_nodes = []
    good_nodes = []
    txtFiles = os.path.join(path_raw_data, "*", "*.txt")
    for file in glob(txtFiles):
        with open(file) as f:
            content = f.readlines()
            for line in content:
                line = line.split(',')
                line = [line1.replace("\n", "") for line1 in line]
                if not (line[9] == '' or line[10] == '' or line[11] == ''):
                    good_nodes.append((int)(line[1]))
                else:
                    empty_nodes.append((int)(line[1]))
                index -= 1
                if index == 0:
                    return (good_nodes,empty_nodes)

In [7]:
(good_nodes, empty_nodes) = DataReader_get_good_empty_nodes(
    path_raw_data, total_num_nodes)
print(f'Five examples of nodes that contain data: {good_nodes[:5]}')
print(f'Five examples of nodes that do not have data: {empty_nodes[:5]}')


Five examples of nodes that contain data: [715898, 715918, 715920, 715929, 715930]
Five examples of nodes that do not have data: [715900, 715901, 715903, 715904, 715905]


Set data and labels as X and Y variables from Data. They will contain good nodes and empty nodes as well

In [8]:
def DataReader_read_data(path_raw_data):
    X = []
    Y = []
    txtFiles = os.path.join(path_raw_data, "*", "*.txt")
    nb_days = 0
    for file in glob(txtFiles):
        with open(file) as f:
            print(f'Reading day {nb_days + 1}')
            content = f.readlines()
            for line in content:
                line = line.split(',')
                line = [line1.replace("\n", "") for line1 in line]
                if not (line[9] == '' or line[10] == '' or line[11] == ''):
                    Y.append((float)(line[11]))
                    X.append([(float)(line[9]), (float)(line[10])])
        nb_days += 1
        # TODO : code for debugging, delete when finished
        # ------------------------------------------------
        # if nb_days == 2:
        #     break
        # ------------------------------------------------
    X = normalize(np.array(X))
    Y = Y
    return X,Y,nb_days

In [9]:
X_data,Y_data,nb_days = DataReader_read_data(path_raw_data)

Reading day 1
Reading day 2
Reading day 3
Reading day 4
Reading day 5
Reading day 6
Reading day 7
Reading day 8
Reading day 9
Reading day 10
Reading day 11
Reading day 12
Reading day 13
Reading day 14
Reading day 15
Reading day 16
Reading day 17
Reading day 18
Reading day 19
Reading day 20
Reading day 21
Reading day 22


In [10]:
print(f'Total number of days read: {nb_days}')
print(f'Five examples of normalized input data: {X_data[:5]}')
print(f'Five examples of input data label: {Y_data[:5]}')

Total number of days read: 22
Five examples of normalized input data: [[9.99999984e-01 1.76732671e-04]
 [9.99999978e-01 2.08088231e-04]
 [9.99999974e-01 2.29906536e-04]
 [9.99999979e-01 2.03723400e-04]
 [9.99999976e-01 2.17045449e-04]]
Five examples of input data label: [70.4, 69.2, 66.0, 72.5, 69.8]


Get nodes geo location from Metadata. They will contain good nodes only

In [11]:
def DataReader_read_nodes_data(path_raw_data_graph_info_txt, good_nodes):

    nodes_location = []
    skip = True
    with open(path_raw_data_graph_info_txt) as f:
        content = f.readlines()
        for line in content:
            if skip:
                skip = False
            else:
                line = line.split('\t')
                line = line[:-1]
                if (int)(line[0]) in good_nodes:  # ID  #LAT    #LONG
                    nodes_location.append([line[0], line[8], line[9]])
    return nodes_location


In [12]:
nodes_location = DataReader_read_nodes_data(
    path_raw_data_graph_info_txt, good_nodes)
print(
    f'Five examples of nodes geolocation as well as their ID\'s {nodes_location[:5]}')

Five examples of nodes geolocation as well as their ID's [['715898', '33.880183', '-118.021787'], ['715918', '33.93311', '-118.091005'], ['715920', '33.938544', '-118.094941'], ['715929', '33.971707', '-118.123095'], ['715930', '33.971763', '-118.122905']]


In [13]:
def visualization(path_raw_data,graph_info_txt):
        columnsInfo = [
            'Timestamp', 'Station', 'District', 'Freeway', 'DirOfTravel',
            'LaneType', 'Length', 'Samples', 'Observed', 'Flow', 'Occupancy',
            'Speed'
        ]
        for i in range(1, 9):
            columnsInfo.extend([
                str(i) + '_Samples',
                str(i) + '_Flow',
                str(i) + '_Occupancy',
                str(i) + '_Speed',
                str(i) + '_Observed'
            ])
        columnsMetadata = [
            'ID', 'Fwy', 'Dir', 'District', 'County', 'City', 'State_PM',
            'Abs_PM', 'Latitude', 'Longitude', 'Length', 'Type', 'Lanes',
            'Name', 'User_ID_1', 'User_ID_2', 'User_ID_3', 'User_ID_4'
        ]
        txtFiles = os.path.join(path_raw_data, "*", "*.txt")
        print("Reading Metadata")
        dataframeMetadata = pd.read_csv(path_raw_data_graph_info_txt,
                                        sep='\t',
                                        skiprows=1,
                                        header=None,
                                        names=columnsMetadata)
        print("Finished Reading Metadata")
        print("Reading Information")
        nb_days = 0
        dataframeInfo = pd.DataFrame(columns=columnsInfo)

        for file in glob(txtFiles):
            print("Reading day {0}".format(nb_days + 1))
            with open(file) as f:
                dataframeInfo = dataframeInfo.append(pd.read_csv(
                    file, sep=',', header=None, names=columnsInfo),
                                                     ignore_index=True)
                day = None
                nb_days += 1

            if nb_days == 5:
                break

        print("Finished Reading Information")
        return dataframeInfo, dataframeMetadata

In [14]:
path_plots = os.path.join(current_directory,"Plots")
path_plots_general = os.path.join(path_plots,"General")
dfInfo,dfMetaData = visualization(path_raw_data,path_raw_data_graph_info_txt)
print(dfInfo[:5])
print(dfMetaData[:5])

Reading Metadata
Finished Reading Metadata
Reading Information
Reading day 1
Reading day 2
Reading day 3
Reading day 4
Reading day 5
Finished Reading Information
             Timestamp Station District Freeway DirOfTravel LaneType  Length  \
0  06/01/2021 00:00:00  715898        7       5           S       ML    0.43   
1  06/01/2021 00:00:00  715900        7       5           S       OR     NaN   
2  06/01/2021 00:00:00  715901        7       5           N       OR     NaN   
3  06/01/2021 00:00:00  715903        7       5           N       OR     NaN   
4  06/01/2021 00:00:00  715904        7       5           S       OR     NaN   

  Samples Observed   Flow  ...  7_Samples  7_Flow  7_Occupancy  7_Speed  \
0       0        0  202.0  ...        NaN     NaN          NaN      NaN   
1       0        0    NaN  ...        NaN     NaN          NaN      NaN   
2       0        0    NaN  ...        NaN     NaN          NaN      NaN   
3       0        0    NaN  ...        NaN     NaN        

In [15]:
def PieChartRoadwayType(dfMeta):

    df = dfMeta
    series = df['Type'].value_counts(ascending=False, dropna=True)
    dataframe = pd.DataFrame({
        'Type': series.index,
        'count': series.values
    })
    dataframe = dataframe.replace({
        'CD': 'Coll/Dist',
        'CH': 'Conventional Highway',
        'FF': 'Freeway-Freeway connector',
        'FR': 'Off Ramp',
        'HV': 'HOV',
        'ML': 'Mainline',
        'OR': 'On Ramp'
    })
    fig = px.pie(dataframe,
                    values='count',
                    names='Type',
                    title='Types of roads')
                    
    iplot(fig, filename=f'PieChartRoadwayType')


In [16]:
PieChartRoadwayType(dfMetaData)

In [17]:
def BoxPlotSpeed(dfInfo) -> None:

    df = dfInfo
    df = df[df['Speed'].notna()]
    df['Day'] = df['Timestamp'].apply(lambda x: datetime.strptime(
        x, '%m/%d/%Y %H:%M:%S').weekday())
    fig = px.box(df, x='Day', y='Speed')
    iplot(fig, filename='BoxPlotSpeed')

In [18]:
# BoxPlotSpeed(dfInfo)

In [19]:
def MapHeatmapSpeed(dfInfo,dfMeta,hour) -> None:

    dfInfoFunc = dfInfo
    dfMetaFunc = dfMeta

    dfInfo2 = dfInfoFunc[dfInfoFunc['Speed'].notna()]
    dfInfo2['Hour'] = dfInfo2['Timestamp'].apply(
        lambda x: datetime.strptime(x, '%m/%d/%Y %H:%M:%S').hour)
    dfInfo2 = dfInfo2.loc[dfInfo2['Hour'] == hour]
    dfInfo2 = dfInfo2[['Hour', 'Station', 'Speed']]
    dfInfo2 = dfInfo2.groupby(['Hour', 'Station']).mean()
    df = pd.merge(dfMetaFunc, dfInfo2, left_on='ID', right_on='Station')
    fig = px.scatter_mapbox(
        df,
        lat="Latitude",
        lon="Longitude",
        color="Speed",
        color_continuous_scale=px.colors.sequential.Bluered,
        zoom=8,
        mapbox_style="open-street-map",
        title='Traffic speed at {0}:00'.format(str(hour)))
    iplot(fig, filename=f'MapHeatmapSpeed_{hour}')

In [20]:
MapHeatmapSpeed(dfInfo,dfMetaData,9)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [21]:
MapHeatmapSpeed(dfInfo,dfMetaData,15)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
MapHeatmapSpeed(dfInfo,dfMetaData,18)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
MapHeatmapSpeed(dfInfo,dfMetaData,22)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Enumartion used for Dataset sizes

In [24]:
class Dataset(Enum):
    Experimental = 0
    ExperimentalManual = 1
    ExperimentalLR = 2
    Tiny = 3
    TinyManual = 4
    TinyLR = 5


In [25]:
class DatasetNodes(Enum):
    Experimental = [ 718292, 769496, 718291, 718290, 764567, 774279, 774278, 764671 ]
    Tiny = [
        775637, 718165, 776986, 759289, 774672, 760643, 774671, 717046, 718419,
        769105, 764026, 759280, 775636, 759385, 760635, 718166, 774685, 774658,
        716938, 776177, 763453, 718421, 717045, 768598, 717043, 716063, 717041,
        717040, 717039, 737184, 717042, 718335, 763458, 776981, 737158, 737313,
        769118, 772501, 718173, 764037, 763447, 763246, 718041, 763251, 763424,
        763429, 763434, 763439, 764032, 764418
    ]

In [26]:
class DatasetSize(Enum):
    Experimental = len(DatasetNodes.Experimental.value)
    Tiny = len(DatasetNodes.Tiny.value)

Function to set the nodes for the datasets such that they are the same throughout the project

More constants defined, as well as the nodes id's for the Experimental and Tiny dataset 

In [27]:
def Graph_get_nodes_for_dataset(dataset):
    experimental_all = [Dataset.Experimental,
                        Dataset.ExperimentalManual, Dataset.ExperimentalLR]
    tiny_all = [Dataset.Tiny, Dataset.TinyManual, Dataset.TinyLR]
    if dataset in experimental_all:
        return DatasetNodes.Experimental.value
    elif dataset in tiny_all:
        return DatasetNodes.Tiny.value


In [28]:
print(Graph_get_nodes_for_dataset(Dataset.Experimental))


[718292, 769496, 718291, 718290, 764567, 774279, 774278, 764671]


In [29]:
def Graph_get_number_of_nodes_for_dataset(dataset):
    experimental_all = [Dataset.Experimental,
                        Dataset.ExperimentalManual, Dataset.ExperimentalLR]
    tiny_all = [Dataset.Tiny, Dataset.TinyManual, Dataset.TinyLR]
    if dataset in experimental_all:
        return DatasetSize.Experimental.value
    elif dataset in tiny_all:
        return DatasetSize.Tiny.value


In [30]:
def MapPlotSensors(dataset,dfMeta):

    df = dfMeta
    zoom = 8
    datanodes = Graph_get_nodes_for_dataset(dataset)
    df = df[df['ID'].isin(datanodes)]
    if dataset == Dataset.Experimental:
        zoom = 14
    if dataset == Dataset.Tiny:
        zoom = 12
    fig = px.scatter_mapbox(df,
                            lat="Latitude",
                            lon="Longitude",
                            hover_name="ID",
                            hover_data=["Type", "Lanes"],
                            color_discrete_sequence=["black"],
                            zoom=zoom,
                            size_max=15,
                            mapbox_style="open-street-map")

    iplot(fig, filename=f'BoxPlotSpeed_{dataset.name}')

In [31]:
MapPlotSensors(Dataset.Experimental,dfMetaData)

In [32]:
MapPlotSensors(Dataset.Tiny,dfMetaData)

In [33]:
print(Graph_get_number_of_nodes_for_dataset(Dataset.Experimental))
print(Graph_get_number_of_nodes_for_dataset(Dataset.Tiny))

8
50


In [34]:
def Graph_extract_time(json):
    try:
        return float(json['routes']['distance'])
    except KeyError:
        return 0


In [35]:
def Graph_OSRM_loop(p1: tuple, p2: tuple) -> float:
    requestUrl = f'http://router.project-osrm.org/route/v1/driving/{p1[1]},{p1[0]};{p2[1]},{p2[0]}'

    try:
        response = requests.get(requestUrl)
    except:
        return -1
    if (response.status_code != 204
            and response.headers["content-type"].strip().startswith(
                "application/json")):
        try:
            responseJson = response.json()
        except:
            return 0
    else:
        return -1
    if responseJson['code'] == "Ok":
        routes = responseJson['routes']
        routes.sort(key=Graph_extract_time, reverse=True)
        shortest_distance = float(routes[0]['distance']) * (1 / 1000)
        return shortest_distance
    return


In [36]:
def Graph_OSRM(p1: tuple, p2: tuple) -> float:
    distance = Graph_OSRM_loop(p1, p2)
    while (distance == -1):
        distance = Graph_OSRM_loop(p1, p2)
    return distance


In [37]:
p1 = ('33.880183', '-118.021787')
p2 = ('33.93311', '-118.091005')
print(
    f'Geodesic distance between node id 715898 and node id 715918 is {geodesic(p1,p2)}')
print(
    f'Road distance between node id 715898 and node id 715918 is {Graph_OSRM(p1,p2)} km')


Geodesic distance between node id 715898 and node id 715918 is 8.68599891018443 km
Road distance between node id 715898 and node id 715918 is 8.849 km


In [38]:
def Graph_compute_all_OSRM_and_Geodesic(nodes_location, path_distances):
    for dataset in [Dataset.Experimental, Dataset.Tiny]:

        name_OSRM = os.path.join(
            path_distances, f'distances_OSRM_{dataset.name}.npy')
        name_geodesic = os.path.join(
            path_distances, f'distances_Geodesic_{dataset.name}.npy')

        if (os.path.exists(name_OSRM) and os.path.exists(name_geodesic)):
            print(f'Distances already computed for {dataset.name}')
            continue

        nodes_ids = Graph_get_nodes_for_dataset(dataset)
        nodes_location_dataset = [node for node in nodes_location if (int)(node[0]) in nodes_ids]

        matrix_size = len(nodes_location_dataset)

        OSRM_array = np.zeros((matrix_size, matrix_size))
        geodesic_array = np.zeros((matrix_size, matrix_size))

        for i in range(matrix_size - 1):
            for j in range(i + 1, matrix_size):
                p1 = ((float)(nodes_location_dataset[i][1]),
                      (float)(nodes_location_dataset[i][2]))
                p2 = ((float)(nodes_location_dataset[j][1]),
                      (float)(nodes_location_dataset[j][2]))

                id_1 = (int)(nodes_location_dataset[i][0])
                id_2 = (int)(nodes_location_dataset[j][0])

                print(f'Computing distances for Id\'s {id_1} and {id_2}')

                OSRM_array[i][j] = Graph_OSRM(p1, p2)
                geodesic_array[i][j] = geodesic(p1, p2).km
                OSRM_array[j][i] = Graph_OSRM(p2, p1)
                geodesic_array[j][i] = geodesic(p2, p1).km

        np.save(name_OSRM, OSRM_array)
        np.save(name_geodesic, geodesic_array)

    return


In [39]:
path_processed_data = os.path.join(current_directory, "Processed")
if not os.path.exists(path_processed_data):
    os.makedirs(path_processed_data)

path_distances = os.path.join(path_processed_data, "Distances")
if not os.path.exists(path_distances):
    os.makedirs(path_distances)


In [40]:
print("Started computing distances...")
Graph_compute_all_OSRM_and_Geodesic(nodes_location, path_distances)
print("Finished computing distances...")


Started computing distances...
Distances already computed for Experimental
Distances already computed for Tiny
Finished computing distances...


In [41]:
array_distances_OSRM_Experimental = np.load(os.path.join(
    path_distances, f'distances_OSRM_Experimental.npy'))
array_distances_Geodesic_Experimental = np.load(os.path.join(
    path_distances, f'distances_Geodesic_Experimental.npy'))
print(array_distances_OSRM_Experimental)
print(array_distances_Geodesic_Experimental)


[[0.     2.3406 3.5436 7.6358 7.1201 3.2997 3.6629 5.1336]
 [1.8412 0.     1.203  5.2952 4.7795 0.9591 1.3223 3.7805]
 [4.7693 7.1099 0.     4.0921 3.5765 8.069  6.1248 5.4476]
 [0.6772 3.0178 4.2208 0.     7.7973 3.9769 4.3401 5.8108]
 [1.1929 3.5335 0.7643 0.5157 0.     4.4926 2.5484 3.1371]
 [5.0133 7.3539 0.2439 4.3361 3.8204 0.     6.3688 5.6916]
 [3.573  5.9136 3.4803 5.0002 4.4845 6.8727 0.     2.4172]
 [3.1712 5.5117 2.5385 6.6306 6.1149 6.4709 3.2839 0.    ]]
[[0.         0.02477349 1.20259522 0.67630258 1.19130412 0.95912002
  1.15459812 1.1201794 ]
 [0.02477349 0.         1.20137815 0.67577782 1.19058306 0.95783532
  1.15965723 1.12529727]
 [1.20259522 1.20137815 0.         0.52659971 0.02639492 0.24354287
  0.30303407 0.30835885]
 [0.67630258 0.67577782 0.52659971 0.         0.51500154 0.28354955
  0.52112531 0.48918423]
 [1.19130412 1.19058306 0.02639492 0.51500154 0.         0.23375099
  0.27704067 0.28200812]
 [0.95912002 0.95783532 0.24354287 0.28354955 0.23375099 0.
  

In [42]:
class DistanceType(Enum):
    Geodesic = 0
    OSRM = 1


In [43]:
def Graph_get_adjency_matrix_weight(ID1_index, ID2_index, epsilon, sigma,distances_array) -> float:
    distance = distances_array[ID1_index][ID2_index]
    weight = math.exp(-((distance**2) / (sigma**2)))
    if weight >= epsilon:
        return weight
    else:
        return 0


In [44]:
edge_index_Experimental_manual = [[0, 1, 7, 4, 7, 5], [1, 2, 4, 3, 6, 4]]
edge_index_Tiny_manual = [[
    0, 0, 0, 1, 5, 5, 5, 9, 9, 9, 10, 10, 10, 6, 14, 15, 7, 13, 10, 11, 8,
    9, 4, 16, 5, 2, 20, 3, 22, 23, 24, 25, 26, 28, 29, 30, 31, 21, 32, 33,
    34, 36, 17, 38, 12, 39, 40, 41, 42, 44, 45, 46, 47, 48, 27, 35, 9
],
    [
    1, 2, 3, 4, 6, 7, 2, 4, 3, 12, 12, 11,
    4, 14, 15, 12, 13, 10, 11, 8, 2, 3, 16,
    17, 6, 20, 21, 22, 23, 24, 25, 26, 27,
    29, 30, 31, 7, 32, 33, 34, 35, 9, 37,
    5, 39, 40, 41, 42, 43, 45, 46, 47, 48,
    0, 19, 18, 36
]]


In [45]:
def Graph_get_info_for_Standard(nodes_location, epsilon, sigma, dataset,array_distances):
    nodes = Graph_get_nodes_for_dataset(dataset)
    edge_index = []
    edge_weight = []
    nodes_location_manual = [node for node in nodes_location if (int)(node[0]) in nodes]
    num_nodes = len(nodes_location_manual)
    for i in range(num_nodes):
        for j in range(num_nodes):
            if i != j:
                weight = Graph_get_adjency_matrix_weight(i,j,epsilon, sigma,array_distances)
                if weight > 0:
                    edge_index.append([i, j])
                    edge_weight.append(weight)
    edge_index = np.transpose(edge_index)
    return edge_index, edge_weight

In [46]:
def Graph_get_info_for_Manual(nodes_location, epsilon, sigma, dataset,
            edge_index_Experimental_manual,edge_index_Tiny_manual,array_distances):
    
    edge_weight = []
    if dataset == Dataset.ExperimentalManual:
        edge_index = edge_index_Experimental_manual
    elif dataset == Dataset.TinyManual:
        edge_index = edge_index_Tiny_manual
    nodes = Graph_get_nodes_for_dataset(dataset)
    nodes_location_standard = [node for node in nodes_location if (int)(node[0]) in nodes]
    num_nodes = len(nodes_location_standard)
    nodes_info = np.zeros((num_nodes, 3))
    for i in range(num_nodes):
        np.where(nodes == (int)(nodes_location_standard[i][0]))[0]
        nodes_info[np.where(nodes == (int)(nodes_location_standard[i][0]))[0]] = nodes_location_standard[i]
    nodes_location_standard = np.array(nodes_info)
    for i in range(len(edge_index[0])):
        weight = Graph_get_adjency_matrix_weight(
            edge_index[0][i],edge_index[1][i], epsilon, sigma,array_distances
        )
        edge_weight.append(weight)
    return edge_index, edge_weight


In [47]:
def Graph_get_top_3_nodes_for_node_with_LR(node, dataset, nodes_location, speed_vector):
    nodes_ids = Graph_get_nodes_for_dataset(dataset)
    ids_index = 0
    X_train = []
    Y_train = []
    nodes_used = []
    node_ids_order = [(int)(node[0]) for node in nodes_location]
    nodes_used_computed = False
    X_train_snapshot = []
    for speed in speed_vector:

        if node != node_ids_order[ids_index] and node_ids_order[ids_index] in nodes_ids:
            X_train_snapshot.append(speed)
            if not nodes_used_computed:
                nodes_used.append(node_ids_order[ids_index])

        if node == node_ids_order[ids_index]:
            Y_train.append(speed)

        ids_index += 1
        if ids_index == len(node_ids_order): 
            ids_index = 0
            X_train.append(X_train_snapshot)
            X_train_snapshot = []
            nodes_used_computed = True

    regression = LinearRegression(positive=True).fit(X_train, Y_train)
    coeffiecients = regression.coef_.tolist()
    results = zip(nodes_used, coeffiecients)
    sorted_results = sorted(results, key=lambda tup: tup[1], reverse=True)
    best = sorted_results[:3]
    return [result[0] for result in best]

In [48]:
print(Graph_get_top_3_nodes_for_node_with_LR(717042, Dataset.TinyLR, nodes_location, Y_data))

[759385, 718335, 764032]


In [49]:
def Graph_get_info_for_LR(nodes_location, epsilon, sigma, dataset ,array_distances, Y_data):
    edge_index = []
    edge_weight = []
    nodes_ids = Graph_get_nodes_for_dataset(dataset)
    for node in nodes_ids:
        nodes_relevant = Graph_get_top_3_nodes_for_node_with_LR(node, dataset, nodes_location, Y_data)
        for node_relevant in nodes_relevant:
            edge_index.append([nodes_ids.index(node_relevant),nodes_ids.index(node)])
            edge_weight.append(Graph_get_adjency_matrix_weight(
                    nodes_ids.index(node_relevant), nodes_ids.index(node),
                    epsilon, sigma,array_distances))
    edge_index = [list(x) for x in set(tuple(x) for x in edge_index)]
    edge_index = np.transpose(edge_index)
    return edge_index, edge_weight

In [50]:
def Graph_save_graph(nodes_location, epsilon, sigma, dataset, distanceType, path_processed_data,
                    edge_index_Experimental_manual,edge_index_Tiny_manual,path_distances,Y_data):

        nodes = Graph_get_nodes_for_dataset(dataset)
        name_folder_weight = os.path.join(path_processed_data,'EdgeWeight')
        name_folder_index = os.path.join(path_processed_data,'EdgeIndex')
        if dataset in [Dataset.Experimental,Dataset.ExperimentalLR,Dataset.ExperimentalManual]:
            dataset_name = Dataset.Experimental.name
        if dataset in [Dataset.Tiny,Dataset.TinyLR,Dataset.TinyManual]:
            dataset_name = Dataset.Tiny.name
        array_distances = np.load(os.path.join(path_distances,f'distances_{distanceType.name}_{dataset_name}.npy'))

        if not os.path.exists(name_folder_weight):
            os.makedirs(name_folder_weight)

        if not os.path.exists(name_folder_index):
            os.makedirs(name_folder_index)

        postfix = f'{distanceType.name}_{epsilon}_{sigma}_{dataset.name}'

        name_weight = os.path.join(name_folder_weight,f'weight_{postfix}.npy')
        name_index = os.path.join(name_folder_index,f'index_{postfix}.npy')

        if os.path.exists(name_weight) and os.path.exists(name_index): 
            print(f'Graph already saved with configuration : epsilon = {epsilon}, sigma = {sigma}, size = {dataset.name}, distance = {distanceType.name}')
            return

        print(f'Saving graph with configuration : epsilon = {epsilon}, sigma = {sigma}, size = {dataset.name}, distance = {distanceType.name}')
        if (dataset == Dataset.ExperimentalManual or dataset == Dataset.TinyManual):
            edge_index, edge_weight = Graph_get_info_for_Manual(nodes_location, epsilon, sigma, dataset,
                    edge_index_Experimental_manual,edge_index_Tiny_manual,array_distances)
        elif (dataset == Dataset.ExperimentalLR or dataset == Dataset.TinyLR):
            edge_index, edge_weight = Graph_get_info_for_LR(nodes_location, epsilon, sigma, dataset,array_distances,Y_data)
        else:
            edge_index, edge_weight = Graph_get_info_for_Standard(nodes_location, epsilon, sigma, dataset ,array_distances)
        
        np.save(name_index, edge_index)
        np.save(name_weight, edge_weight)

In [51]:
EPSILON_ARRAY = [0.1, 0.3, 0.5, 0.7]
SIGMA_ARRAY = [1, 3, 5, 10]


In [52]:
for epsilon in EPSILON_ARRAY:
    for sigma in SIGMA_ARRAY:
        for distanceType in DistanceType:
            for dataset in Dataset:
                Graph_save_graph(nodes_location, epsilon, sigma, dataset, distanceType, path_processed_data,
                    edge_index_Experimental_manual,edge_index_Tiny_manual,path_distances,Y_data) 

Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = Experimental, distance = Geodesic
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = ExperimentalManual, distance = Geodesic
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = ExperimentalLR, distance = Geodesic
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = Tiny, distance = Geodesic
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = TinyManual, distance = Geodesic
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = TinyLR, distance = Geodesic
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = Experimental, distance = OSRM
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = ExperimentalManual, distance = OSRM
Graph already saved with configuration : epsilon = 0.1, sigma = 1, size = ExperimentalLR, distance = OSRM
Graph already saved with configuration :

In [53]:
def GetGraphMatrixFromGraph(edge_index, edge_weight, dataset):
    nr_nodes = Graph_get_number_of_nodes_for_dataset(dataset)
    graph_matrix = np.zeros((nr_nodes, nr_nodes))
    for index, (edge_index, edge_weight) in enumerate(
            zip(np.transpose(edge_index), edge_weight)):
        graph_matrix[edge_index[0]][edge_index[1]] = edge_weight
    return graph_matrix

In [54]:
def GraphHeatmap(epsilon,sigma,distanceType,dataset,proccessed_data_path):
    name_index = os.path.join(proccessed_data_path, "EdgeIndex", 
        f"index_{distanceType.name}_{epsilon}_{sigma}_{dataset.name}.npy")
    name_weight = os.path.join(proccessed_data_path, "EdgeWeight", 
        f"weight_{distanceType.name}_{epsilon}_{sigma}_{dataset.name}.npy")
    edge_index = np.load(name_index)
    edge_weight = np.load(name_weight)
    graph_matrix = GetGraphMatrixFromGraph(edge_index, edge_weight, dataset)
    fig = px.imshow(
        graph_matrix,
        title='Graph Heatmap for epsilon {0} and sigma {1} with size {2}'
        .format(epsilon, sigma, dataset.name))
    
    iplot(fig, filename=f'GraphHeatmap_{dataset.name}')

In [55]:
GraphHeatmap(0.1,1,DistanceType.OSRM,Dataset.Tiny,path_processed_data)

In [56]:
GraphHeatmap(0.1,1,DistanceType.Geodesic,Dataset.Tiny,path_processed_data)

In [57]:
def arrange_data(data, num_nodes):
    r"""
        Function to arrange data. At a point it represents the temporal 
        state of the graph
    """
    New_Data = []
    for i in range((int)(len(data) / num_nodes)):
        Data = []
        for k in range(num_nodes):
            Data.append(data[(i * num_nodes) + k])
        New_Data.append(Data)
    return New_Data

In [58]:
def get_clean_data_by_nodes(dataset,nodes_location,X,Y):
    r"""
        Returns data for a specific datasize.
        Instance Function.
        Args:
            size : DatasetSize
        Returns a tuple of 2 lists with the Data and Labels
    """

    new_X = []
    new_Y = []
    nodes_index = 0
    nodes_ids = Graph_get_nodes_for_dataset(dataset)
    for _, tuple in enumerate(zip(X, Y)):
        if int(nodes_location[nodes_index][0]) in nodes_ids:
            new_X.append(tuple[0])
            new_Y.append([tuple[1]])
        nodes_index += 1
        if nodes_index == len(nodes_location):
            nodes_index = 0
    return new_X, new_Y

In [59]:
def save_data_for_train_LSTM(dataset,path_processed_data,nodes_location, X_data, Y_data):
    print(f"Saving data with configuration : size = {dataset.name}")

    X_all, Y_all = get_clean_data_by_nodes(dataset,nodes_location, X_data, Y_data)

    num_nodes = Graph_get_number_of_nodes_for_dataset(dataset)

    X_all = arrange_data(X_all, num_nodes)
    Y_all = arrange_data(Y_all, num_nodes)

    proccessed_data_path_model = os.path.join(path_processed_data,"LSTM")
    if not os.path.exists(proccessed_data_path_model):
        os.makedirs(proccessed_data_path_model)

    name_folder = os.path.join(proccessed_data_path_model,f'Data_{dataset.name}')
    if not os.path.exists(name_folder):
        os.makedirs(name_folder)

    for index, data in enumerate(X_all):
        name_x = os.path.join(name_folder, f'X_{index}.npy')
        if not os.path.exists(name_x):
            np.save(name_x, data)

    for index, data in enumerate(Y_all):
        name_y = os.path.join(name_folder, f'Y_{index}.npy')
        if not os.path.exists(name_y):
            np.save(name_y, data)

In [60]:
def save_proccess_data_STCONV(dataset,nb_days,path_processed_data,nodes_location, X_data, Y_data):
    batch_size = 8
    time_steps = 1
    interval_per_day = (int)(24 * 60 / 5)

    print(f"Saving data with configuration : size = {dataset.name}")

    Skip = (int)(time_steps / 2) * 2
    interval_per_day -= Skip

    X_all, Y_all = get_clean_data_by_nodes(dataset,nodes_location, X_data, Y_data)
    num_nodes = Graph_get_number_of_nodes_for_dataset(dataset)

    data_size_old = len(X_all)

    X_all = arrange_data(X_all, num_nodes)
    Y_all = arrange_data(Y_all, num_nodes)

    new_size = (int)(interval_per_day * nb_days / batch_size)
    data_size = len(X_all)
    if new_size * batch_size != data_size:
        difference = data_size - ((new_size - 1) * batch_size)
        X_all = X_all[:data_size - difference]
        Y_all = Y_all[:data_size - difference]
        new_size -= 1
    X_all = np.array(X_all).reshape(new_size, batch_size, time_steps, num_nodes,
                            2)
    Y_all = np.array(Y_all).reshape(new_size, batch_size, time_steps, num_nodes,
                            1)

    proccessed_data_path_model = os.path.join(path_processed_data,"STCONV")
    if not os.path.exists(proccessed_data_path_model):
        os.makedirs(proccessed_data_path_model)

    name_folder = os.path.join(proccessed_data_path_model,f'Data_{dataset.name}')
    if not os.path.exists(name_folder):
        os.makedirs(name_folder)

    for index, data in enumerate(X_all):
        name_x = os.path.join(name_folder, f'X_{index}.npy')
        if not os.path.exists(name_x):
            np.save(name_x, data)

    for index, data in enumerate(Y_all):
        name_y = os.path.join(name_folder, f'Y_{index}.npy')
        if not os.path.exists(name_y):
            np.save(name_y, data)

In [61]:
for dataset in [Dataset.Experimental,Dataset.Tiny]:
    save_proccess_data_STCONV(dataset,nb_days,path_processed_data,nodes_location,X_data,Y_data)
    save_data_for_train_LSTM(dataset,path_processed_data,nodes_location,X_data,Y_data)

Saving data with configuration : size = Experimental
Saving data with configuration : size = Experimental
Saving data with configuration : size = Tiny
Saving data with configuration : size = Tiny


Enumeration used for the folders name in which the processed data will be saved

In [62]:
class FoldersProcesedNames(Enum):
    STCONV = 0
    LSTM = 1
    EdgeWeight = 2
    EdgeIndex = 3
    Distances = 4

Function which will return true if the folders created after data processing exists

In [63]:
def Datareader_is_data_read(path_processed_data : str,folders_procesed_names : FoldersProcesedNames):
    for folder_name in folders_procesed_names:
        is_read_folder = os.path.isdir(os.path.join(path_processed_data, str(folder_name.name)))
        if not is_read_folder:
            return False
    return True

In [64]:
print(f'Data is read and stored: {Datareader_is_data_read(path_processed_data,FoldersProcesedNames)}')

Data is read and stored: True


In [65]:
class ModelType(Enum):
    r"""
        Enumeration for each model type.
            LSTM = 0
            STCONV = 1
            LinearRegression = 2
    """
    LSTM = 0
    DCRNN = 1
    STCONV = 2

In [66]:

class LossFunction():

    def Criterions():
        return [
            LossFunction.RMSE, LossFunction.MAPE, LossFunction.MAE,
            LossFunction.MSE, LossFunction.R2_SQUARED
        ]

    def RMSE(y_pred, y_true):
        return torch.sqrt(torch.mean((y_pred - y_true)**2))

    def MAPE(y_pred, y_true):
        return torch.mean(torch.abs((y_true - y_pred) / y_true))

    def MAE(y_pred, y_true):
        return torch.mean(torch.abs((y_true - y_pred)))

    def MSE(y_pred, y_true):
        return torch.mean((y_true - y_pred)**2)

    def R2_SQUARED(y_pred, y_true):
        true_mean = torch.mean(y_true)
        ss_tot = torch.sum((y_true - true_mean) ** 2)
        ss_res = torch.sum((y_true - y_pred) ** 2)
        r2 = 1 - ss_res / ss_tot
        return r2
    #endregion

In [67]:
class STConvModel(Module):

    def __init__(self, node_features, num_nodes, hidden_channels, kernel_size,
                 ):
        super(STConvModel, self).__init__()

        self.STCONV = STConv(num_nodes=num_nodes,
                             in_channels=node_features,
                             hidden_channels=hidden_channels,
                             out_channels=node_features,
                             kernel_size=kernel_size,
                             K=1)
        self.linear = Linear(node_features, 1)
        self.ReLU = ReLU()

    def forward(self, x, edge_index, edge_weight):
        x = self.STCONV(x, edge_index, edge_weight)
        x = self.ReLU(x)
        x = self.STCONV(x, edge_index, edge_weight)
        x = self.ReLU(x)
        x = self.STCONV(x, edge_index, edge_weight)
        x = self.ReLU(x)
        x = self.linear(x)
        return x

In [68]:

class DCRNNModel(Module):

    def __init__(self, node_features, hidden_channels):
        super(DCRNNModel, self).__init__()

        self.DCRNN = DCRNN(in_channels=node_features,
                           out_channels=hidden_channels,
                           K=1)
        self.Conv1 = GCNConv(in_channels=hidden_channels,
                             out_channels=hidden_channels)
        self.BatchNorm1 = BatchNorm1d(num_features=hidden_channels)
        self.ReLU = ReLU()
        self.Dropout = Dropout()
        self.linear = Linear(hidden_channels, 1)

    def forward(self, x, edge_index, edge_weight):
        x = self.DCRNN(x, edge_index, edge_weight)
        x = self.Conv1(x, edge_index)
        x = self.ReLU(x)
        x = self.BatchNorm1(x)
        x = self.Dropout(x)
        x = self.linear(x)
        return x

In [69]:

class LSTMModel(Module):

    def __init__(self, node_features, hidden_channels):
        super(LSTMModel, self).__init__()

        self.GCLSTM1 = GCLSTM(in_channels=node_features,
                              out_channels=hidden_channels,
                              K=1)
        self.Conv1 = GCNConv(in_channels=hidden_channels,
                             out_channels=hidden_channels)
        self.BatchNorm1 = BatchNorm1d(num_features=hidden_channels)
        self.ReLU = ReLU()
        self.Dropout = Dropout()
        self.linear = Linear(hidden_channels, 1)

    def forward(self, x, edge_index, edge_weight):
        x = self.GCLSTM1(x, edge_index, edge_weight)
        x = self.Conv1(x[0], edge_index)
        x = self.ReLU(x)
        x = self.BatchNorm1(x)
        x = self.Dropout(x)
        x = self.linear(x)
        return x

In [70]:
class DatasetHelper(object):
    def __init__(self,
                 sigma: int,
                 epsilon: float,
                 dataset: Dataset,
                 distanceType: DistanceType,
                 proccessed_data_path,
                 folder_name,
                 device: str = 'cpu',
                 time_start: int = 0,
                 time_stop: float = -1):
                 
        self.proccessed_data_path = proccessed_data_path
        self.sigma = sigma
        self.epsilon = epsilon
        self.time_start = time_start
        self.time_stop = time_stop
        self.device = device
        self.dataset = dataset
        self.folder_name = folder_name
        self.distanceType = distanceType
        self.proccessed_data_path_model = os.path.join(self.proccessed_data_path, self.folder_name.name)
        self.__set_graph()
        self.__check_temporal_consistency()
        self.__set_snapshot_count()

    def __set_graph(self):
        name_weight = os.path.join(
            self.proccessed_data_path, 'EdgeWeight',
            f'weight_{self.distanceType.name}_{self.epsilon}_{self.sigma}_{self.dataset.name}.npy')
        self.edge_weight = np.load(name_weight, allow_pickle=True)

        name_index = os.path.join(
            self.proccessed_data_path, 'EdgeIndex',
            f'index_{self.distanceType.name}_{self.epsilon}_{self.sigma}_{self.dataset.name}.npy')
        self.edge_index = np.load(name_index, allow_pickle=True)

    def get_edge_index(self):
        if self.edge_index is None:
            return self.edge_index
        else:
            return torch.LongTensor(self.edge_index).to(self.device)

    def get_edge_weight(self):
        if self.edge_weight is None:
            return self.edge_weight
        else:
            return torch.FloatTensor(self.edge_weight).to(self.device)

    def __get_features(self, time_index: int):
        if (self.dataset in [Dataset.TinyManual, Dataset.TinyLR]):
            dataset = Dataset.Tiny
        elif (self.dataset in [Dataset.ExperimentalManual, Dataset.ExperimentalLR]):
            dataset = Dataset.Experimental
        else:
            dataset = self.dataset
        name_x = os.path.join(self.proccessed_data_path_model,
                              f"Data_{dataset.name}", f'X_{time_index}.npy')
        X = np.load(name_x)
        if X is None:
            return X
        else:
            return torch.FloatTensor(X).to(self.device)

    def __get_target(self, time_index: int):
        if (self.dataset in [Dataset.TinyManual, Dataset.TinyLR]):
            dataset = Dataset.Tiny
        elif (self.dataset in [Dataset.ExperimentalManual, Dataset.ExperimentalLR]):
            dataset = Dataset.Experimental
        else:
            dataset = self.dataset
        name_y = os.path.join(self.proccessed_data_path_model,
                              f"Data_{dataset.name}", f'Y_{time_index}.npy')
        Y = np.load(name_y)
        if Y is None:
            return Y
        else:
            if Y.dtype.kind == 'i':
                return torch.LongTensor(Y).to(self.device)
            elif Y.dtype.kind == 'f':
                return torch.FloatTensor(Y).to(self.device)

    def __getitem__(self, time_index: int):
        x = self.__get_features(time_index)
        y = self.__get_target(time_index)
        return x, y

    def __next__(self):
        if self.t < self.time_stop:
            snapshot = self.__getitem__(self.t)
            self.t = self.t + 1
            return snapshot
        else:
            self.t = self.time_start
            raise StopIteration

    def __iter__(self):
        self.t = self.time_start
        return self

    def __len__(self):
        return self.snapshot_count

    def __set_snapshot_count(self):
        if (self.dataset in [Dataset.TinyManual, Dataset.TinyLR]):
            dataset = Dataset.Tiny
        elif (self.dataset in [Dataset.ExperimentalManual, Dataset.ExperimentalLR]):
            dataset = Dataset.Tiny
        else:
            dataset = self.dataset
        self.snapshot_count = len(
            glob( os.path.join(self.proccessed_data_path_model, f"Data_{dataset.name}", "X_*.npy")))
    
    def __check_temporal_consistency(self):
        assert len(
            glob( os.path.join(self.proccessed_data_path_model, f"Data_{dataset.name}", "X_*.npy"))) == len(
            glob( os.path.join(self.proccessed_data_path_model, f"Data_{dataset.name}", "Y_*.npy"))), "Temporal dimension inconsistency."
    
    def get_dataset(self):
        train_ratio = 0.6
        val_ratio= 0.2
        test_ratio= 0.2

        time_train = int(train_ratio * self.snapshot_count)
        time_test = time_train + int(test_ratio * self.snapshot_count)

        train_iterator = DatasetHelper(self.sigma, self.epsilon, self.dataset,
                                       self.distanceType, self.proccessed_data_path, self.folder_name,
                                       self.device, 0, time_train)

        val_iterator = DatasetHelper(self.sigma, self.epsilon, self.dataset,
                                      self.distanceType,  self.proccessed_data_path, self.folder_name,
                                      self.device, time_train + 1, time_test)

        test_iterator = DatasetHelper(self.sigma, self.epsilon, self.dataset,
                                     self.distanceType,  self.proccessed_data_path, self.folder_name,
                                     self.device, time_test + 1,
                                     self.snapshot_count)

        return train_iterator, val_iterator, test_iterator

In [71]:
def test(best_model, dfResults, epoch, test_dataset,model_type,dataset,sigma,epsilon) -> None:
    best_model.eval()
    loss = 0
    edge_index = test_dataset.get_edge_index()
    edge_weight = test_dataset.get_edge_weight()
    MAE_loss = 0
    for criterion in LossFunction.Criterions():
        loss = 0
        for index, (X, Y) in enumerate(test_dataset):
            y_hat = best_model(X, edge_index, edge_weight)
            loss += criterion(y_hat, Y)
            if criterion == LossFunction.MAE:
                MAE_loss += criterion(y_hat, Y)

        loss = loss / (index + 1)
        loss = loss.item()

        results = {
            "Model": str(model_type.name),
            "Epsilon": str(epsilon),
            "Sigma": str(sigma),
            "Dataset": str(dataset.name),
            "Criterion": str(criterion.__name__),
            "Loss": str(loss),
            "Epoch": str(epoch),
            "TestOrVal": "Test",
            "Trial": tune.get_trial_id()
        }
        dfResults = dfResults.append(results, ignore_index=True)

        if criterion == LossFunction.MAE:
            MAE_loss = MAE_loss / (index + 1)
            MAE_loss = MAE_loss.item()

    print("Best trial test set loss: {}".format(MAE_loss))
    return dfResults


In [72]:
def validate(dfResults, epoch,validation_dataset,model,model_type,dataset,sigma,epsilon):
    model.eval()
    loss = 0
    edge_index = validation_dataset.get_edge_index()
    edge_weight = validation_dataset.get_edge_weight()
    MAE_loss = 0
    for criterion in LossFunction.Criterions():
        loss = 0
        for index, (X, Y) in enumerate(validation_dataset):
            y_hat = model(X, edge_index, edge_weight)
            loss += criterion(y_hat, Y)
            if criterion == LossFunction.MAE:
                MAE_loss += criterion(y_hat, Y)

        loss = loss / (index + 1)
        loss = loss.item()

        results = {
            "Model": str(model_type.name),
            "Epsilon": str(epsilon),
            "Sigma": str(sigma),
            "Dataset": str(dataset.name),
            "Criterion": str(criterion.__name__),
            "Loss": str(loss),
            "Epoch": str(epoch),
            "TestOrVal": "Validation",
            "Trial": tune.get_trial_id()
        }
        dfResults = dfResults.append(resu/lts, ignore_index=True)

        if criterion == LossFunction.MAE:
            MAE_loss = MAE_loss / (index + 1)
            MAE_loss = MAE_loss.item()

    return MAE_loss, dfResults

In [73]:
def train_validate_and_test(experiment_name,model,model_type,optimizer,
    EarlyStoppingPatience,train_dataset,validation_dataset,test_dataset,nb_epoch,
    results_path,dataset,distanceType,sigma,epsilon):
    
    dfResults = pd.DataFrame(columns=[
        "Model", "Epsilon", "Sigma", "Dataset", "Criterion", "Loss", "Epoch",  "TestOrVal", "Trial"
    ])
    best_val_loss = np.inf
    epoch_no_improvement = EarlyStoppingPatience
    edge_index = train_dataset.get_edge_index()
    edge_weight = train_dataset.get_edge_weight()
    for epoch in tqdm(range(1,nb_epoch)):
        train_loss = 0
        for index, (X, Y) in enumerate(train_dataset):
            optimizer.zero_grad()
            y_hat = model(X, edge_index, edge_weight)
            loss = LossFunction.MAE(y_hat, Y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / (index + 1)
        
        # Validation Step at epoch end
        val_loss, dfResults = validate(dfResults, epoch,validation_dataset,model,model_type,dataset,sigma,epsilon)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epoch_no_improvement = EarlyStoppingPatience
        else:
            epoch_no_improvement -= 1

        print("Epoch {0} : Validation loss {1} ; Train loss {2};".format(epoch, val_loss, train_loss))

        Log to tune
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=val_loss)

        if (epoch_no_improvement == 0):
            print("Early stopping at epoch: {0}".format(epoch))
            break

    dfResults = test(model, dfResults, epoch, test_dataset,model_type,dataset,sigma,epsilon)
    if not os.path.exists(
            os.path.join(results_path, experiment_name)):
        os.makedirs(
            os.path.join(results_path, experiment_name))
    file_save = os.path.join( results_path, experiment_name,
        f"{model_type.name}_{dataset.name}_{distanceType.name}_{tune.get_trial_id()}.csv")
    dfResults.to_csv(path_or_buf=file_save, index=False)

    return

In [74]:
def start_train(config, param, checkpoint_dir=None) -> None:
    folder_name = config["modelType"]
    if config["modelType"] == ModelType.DCRNN:
        folder_name = ModelType.LSTM

    train_dataset, validation_dataset, test_dataset = DatasetHelper(
                config["sigma"],
                config["epsilon"],
                config["dataset"],
                param["distanceType"],
                param["proccessed_data_path"],
                folder_name).get_dataset()
    if config["modelType"] == ModelType.STCONV:
        model = STConvModel(node_features=2,
            num_nodes=Graph_get_number_of_nodes_for_dataset(config["dataset"]),
            hidden_channels=32,
            kernel_size=1)
    elif config["modelType"] == ModelType.LSTM:
        model = LSTMModel(node_features=2, hidden_channels=32)
    elif config["modelType"] == ModelType.DCRNN:
        model = DCRNNModel(node_features=2, hidden_channels=32)
    optimizer = Adam(model.parameters(), lr=param["learning_rate"])
    train_validate_and_test(param["experiment_name"],model,config["modelType"],
        optimizer, param["EarlyStoppingPatience"],train_dataset,
        validation_dataset,test_dataset,param["nb_epoch"],
        param["results_path"],config["dataset"],param["distanceType"],config["sigma"],config["epsilon"])

In [75]:
def trail_dirname_creator(trial):
    return f"{trial.config['modelType'].name}_{trial.config['dataset'].name}_{datetime.now().strftime('%d_%m_%Y-%H_%M_%S')}"

In [76]:
def hyperParameterTuning(dataset, model, distanceType, experiment_name,
    results_ray_path,epsilon_array,sigma_array,proccessed_data_path,results_path):

    nb_epoch = 30
    grace_period = 30
    reduction_factor = 3
    scheduler = ASHAScheduler(max_t=nb_epoch,
                                grace_period=grace_period,
                                reduction_factor=reduction_factor)
    learning_rate = 0.01
    num_features = 2
    EarlyStoppingPatience = 5
    param = {
        "learning_rate": learning_rate,
        "num_features": num_features,
        "EarlyStoppingPatience": EarlyStoppingPatience,
        "nb_epoch": nb_epoch,
        "experiment_name": experiment_name,
        "distanceType": distanceType,
        "proccessed_data_path": proccessed_data_path,
        "results_path": results_path
    }
    
    config = {
        "epsilon": tune.choice(epsilon_array),
        "sigma": tune.choice(sigma_array),
        "modelType": tune.choice([model]),
        "dataset" : tune.choice([dataset])
    }


    directory_experiment_ray = os.path.join(results_ray_path, experiment_name)

    num_samples = 16
    start_train.__name__ = f"{model.name}_{dataset.name}_{distanceType.name}"

    result = tune.run(tune.with_parameters(start_train, param=param),
                        local_dir=directory_experiment_ray,
                        trial_dirname_creator=trail_dirname_creator,
                        resources_per_trial={
                            "cpu": 8,
                            "gpu": 1
                        },
                        config=config,
                        metric="loss",
                        mode="min",
                        num_samples=num_samples,
                        scheduler=scheduler,
                        verbose=0)

    best_trial = result.get_best_trial("loss", "min", "last")

    print("Best trial config: {}".format(best_trial.config))
    print("Best trial for {} model final validation loss: {}".format(
        model.name, best_trial.last_result["loss"]))

In [77]:
results_path = os.path.join(current_directory,"Results")
if not os.path.exists(results_path):
    os.makedirs(results_path)

results_ray_path = os.path.join(current_directory,"Results-RAY")
if not os.path.exists(results_ray_path):
    os.makedirs(results_ray_path)

In [78]:
def start_learn(results_ray_path,path_processed_data,results_path):
    experiment_name = 'Experiment'
    directory_experiment_ray = os.path.join(results_ray_path, experiment_name)
    if not os.path.exists(directory_experiment_ray):
        os.makedirs(directory_experiment_ray)

    models_to_train = []
    for dataset in Dataset:
        for distanceType in DistanceType:
            for model in ModelType:
                models_to_train.append([dataset,distanceType,model])

    models_trained = []
    for models in models_to_train:
        if len(glob(os.path.join(directory_experiment_ray,f'{models[2].name}_{models[0].name}_{models[1].name}*'))) >= 1:
            models_trained.append(models)
    print(f"{len(models_trained)} trained models out of {len(models_to_train)}")
    models_list = [model for model in models_to_train if model not in models_trained][:1]
    for models in models_list:
        dataset = models[0]
        distanceType = models[1]
        model = models[2]
        hyperParameterTuning(dataset, model, distanceType, experiment_name,
            results_ray_path,EPSILON_ARRAY,SIGMA_ARRAY,path_processed_data,results_path)

In [79]:
# Just for testing
# nb_epoch = 30
# grace_period = 30
# reduction_factor = 3
# learning_rate = 0.01
# num_features = 2
# EarlyStoppingPatience = 5

# param = {
#     "learning_rate": learning_rate,
#     "num_features": num_features,
#     "EarlyStoppingPatience": EarlyStoppingPatience,
#     "nb_epoch": nb_epoch,
#     "experiment_name": "Experiment",
#     "distanceType": distanceType.OSRM,
#     "proccessed_data_path": path_processed_data,
#     "results_path": results_path
# }

# config = {
#     "epsilon": EPSILON_ARRAY[0],
#     "sigma": SIGMA_ARRAY[1],
#     "modelType": ModelType.LSTM,
#     "dataset" : Dataset.Experimental
# }

# start_train(config, param)

  3%|▎         | 1/29 [01:06<31:00, 66.46s/it]

Epoch 1 : Validation loss 13.249163627624512 ; Train loss 15.618491208293005;


  7%|▋         | 2/29 [01:55<25:13, 56.06s/it]

Epoch 2 : Validation loss 14.618284225463867 ; Train loss 10.59150454823013;


 10%|█         | 3/29 [02:43<22:40, 52.34s/it]

Epoch 3 : Validation loss 14.513259887695312 ; Train loss 10.779033658756013;


 14%|█▍        | 4/29 [03:31<21:13, 50.92s/it]

Epoch 4 : Validation loss 14.204203605651855 ; Train loss 10.919414944787993;


 17%|█▋        | 5/29 [04:22<20:17, 50.72s/it]

Epoch 5 : Validation loss 14.305279731750488 ; Train loss 10.842278817364242;


 17%|█▋        | 5/29 [05:11<24:57, 62.38s/it]

Epoch 6 : Validation loss 14.468317985534668 ; Train loss 10.884538313427338;
Early stopping at epoch: 6





Best trial test set loss: 13.473071098327637


Session not detected. You should not be calling `get_trial_id` outside `tune.run` or while using the class API. 
  File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\site-packages\traitlets\config\application.py", line 976, in launch_instance
    app.start()
  File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
    self.io_loop.start()
  File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
    self.asyncio_loop.run_forever()
  File "c:\Users\Sebi\anaconda3\envs\Dissertati

In [78]:
start_learn(results_ray_path,path_processed_data,results_path)

0 trained models out of 36


2022-12-06 15:04:14,500	ERROR syncer.py:73 -- Log sync requires rsync to be installed.
2022-12-06 15:04:14,501	INFO trial.py:179 -- Creating a new dirname LSTM_Experimental_06_12_2022-15_04_14_7772 because trial dirname 'LSTM_Experimental_06_12_2022-15_04_14' already exists.
  0%|          | 0/29 [00:00<?, ?it/s]


 pid=2392)[0m Epoch 1 : Validation loss 14.31080436706543 ; Train loss 15.528481005995062;


  3%|▎         | 1/29 [01:11<33:15, 71.26s/it]


 pid=2392)[0m Epoch 2 : Validation loss 14.158637046813965 ; Train loss 13.821869167708247;


  7%|▋         | 2/29 [01:55<25:01, 55.61s/it]


 pid=2392)[0m Epoch 3 : Validation loss 21.80392074584961 ; Train loss 13.330895982896363;


 10%|█         | 3/29 [02:39<21:48, 50.35s/it]


 pid=2392)[0m Epoch 4 : Validation loss 32.129634857177734 ; Train loss 12.972156256570592;


 14%|█▍        | 4/29 [03:22<19:44, 47.37s/it]


 pid=2392)[0m Epoch 5 : Validation loss 42.82939147949219 ; Train loss 12.680379162396735;


 17%|█▋        | 5/29 [04:08<18:43, 46.82s/it]


 pid=2392)[0m Epoch 6 : Validation loss 21.641616821289062 ; Train loss 12.40716669428383;


 21%|██        | 6/29 [04:54<17:51, 46.59s/it]


 pid=2392)[0m Epoch 7 : Validation loss 58.90500259399414 ; Train loss 12.301731071984007;
 pid=2392)[0m Early stopping at epoch: 7


 21%|██        | 6/29 [05:35<21:25, 55.89s/it]
2022-12-06 15:10:15,729	INFO logger.py:699 -- Removed the following hyperparameter values when logging to tensorboard: {'modelType': <ModelType.LSTM: 0>, 'dataset': <Dataset.Experimental: 0>}
2022-12-06 15:10:15,751	INFO trial.py:179 -- Creating a new dirname LSTM_Experimental_06_12_2022-15_04_14_e335 because trial dirname 'LSTM_Experimental_06_12_2022-15_04_14' already exists.
[2m[36m(pid=2392)[0m Windows fatal exception: access violation
[2m[36m(pid=2392)[0m 


 pid=2392)[0m Best trial test set loss: 57.72837448120117


  0%|          | 0/29 [00:00<?, ?it/s]


 pid=9304)[0m Epoch 1 : Validation loss 13.297567367553711 ; Train loss 15.501836538252094;


  3%|▎         | 1/29 [00:37<17:32, 37.60s/it]


 pid=9304)[0m Epoch 2 : Validation loss 19.950044631958008 ; Train loss 13.89164796160322;


  7%|▋         | 2/29 [01:15<16:57, 37.67s/it]


 pid=9304)[0m Epoch 3 : Validation loss 15.595378875732422 ; Train loss 13.422550701085155;


 10%|█         | 3/29 [01:53<16:20, 37.71s/it]


 pid=9304)[0m Epoch 4 : Validation loss 32.61343765258789 ; Train loss 13.00577686799573;


 14%|█▍        | 4/29 [02:31<15:47, 37.89s/it]


 pid=9304)[0m Epoch 5 : Validation loss 112.78544616699219 ; Train loss 12.618612649847853;


 17%|█▋        | 5/29 [03:08<15:05, 37.75s/it]


 pid=9304)[0m Epoch 6 : Validation loss 74.66780090332031 ; Train loss 12.470630383685965;
 pid=9304)[0m Early stopping at epoch: 6


 17%|█▋        | 5/29 [03:46<18:06, 45.29s/it]
2022-12-06 15:14:24,932	INFO logger.py:699 -- Removed the following hyperparameter values when logging to tensorboard: {'modelType': <ModelType.LSTM: 0>, 'dataset': <Dataset.Experimental: 0>}
2022-12-06 15:14:24,950	INFO trial.py:179 -- Creating a new dirname LSTM_Experimental_06_12_2022-15_04_14_c84e because trial dirname 'LSTM_Experimental_06_12_2022-15_04_14' already exists.
[2m[36m(pid=9304)[0m Windows fatal exception: access violation
[2m[36m(pid=9304)[0m 


 pid=9304)[0m Best trial test set loss: 73.51013946533203


  0%|          | 0/29 [00:00<?, ?it/s]


 pid=7748)[0m Epoch 1 : Validation loss 205.0992431640625 ; Train loss 15.512043147048207;


  3%|▎         | 1/29 [00:37<17:38, 37.82s/it]


 pid=7748)[0m Epoch 2 : Validation loss 37.683746337890625 ; Train loss 13.900689426142113;


  7%|▋         | 2/29 [01:15<17:02, 37.87s/it]
[2m[36m(pid=6232)[0m [*** LOG ERROR #0001 ***] [2022-12-06 15:16:14] [ray_log_sink] {wincolor_sink: write_to_file_ failed. GetLastError(): 6}
[2m[36m(pid=6232)[0m Windows fatal exception: access violation
[2m[36m(pid=6232)[0m 
[2m[36m(pid=6232)[0m Stack (most recent call first):
[2m[36m(pid=6232)[0m   File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\site-packages\ray\worker.py", line 428 in main_loop
[2m[36m(pid=6232)[0m   File "c:\Users\Sebi\anaconda3\envs\Dissertation\lib\site-packages\ray\workers/default_worker.py", line 212 in <module>
2022-12-06 15:16:14,886	ERROR tune.py:613 -- Trials did not complete: [LSTM_Experimental_Geodesic_7bc27_00002, LSTM_Experimental_Geodesic_7bc27_00003, LSTM_Experimental_Geodesic_7bc27_00004, LSTM_Experimental_Geodesic_7bc27_00005, LSTM_Experimental_Geodesic_7bc27_00006, LSTM_Experimental_Geodesic_7bc27_00007, LSTM_Experimental_Geodesic_7bc27_00008, LSTM_Experimental_Geodesic_7bc27_00

Best trial config: {'epsilon': 0.7, 'sigma': 3, 'modelType': <ModelType.LSTM: 0>, 'dataset': <Dataset.Experimental: 0>}
Best trial for LSTM model final validation loss: 37.683746337890625


In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [None]:
start_learn(results_ray_path,path_processed_data,results_path)

In [103]:
COLUMNS_INFO = [
        "Model", "Epsilon", "Sigma", "Dataset", "Criterion", "Loss", "Epoch","TestOrVal", "Trial"
    ]

In [104]:
def makeOneFile(experiment_path, columnsInfo, type):
    File = os.path.join(experiment_path, f"{type}.csv")
    if not (os.path.exists(File)):
        dataframeInfo = pd.DataFrame(columns=columnsInfo)
        Files = os.path.join(experiment_path, f"{type}*_*.csv")
        for file in glob(Files):
            distanceType = Path(file).stem.split("_")[2]
            with open(file) as f:
                dataframeRead = pd.read_csv(
                    file,
                    sep=',',
                    header=None,
                    names=columnsInfo,
                    skiprows=1)
                dataframeRead["DistanceType"] = distanceType
                dataframeInfo = dataframeInfo.append(dataframeRead,ignore_index=True)

        dataframeInfo.to_csv(File)
    return pd.read_csv(os.path.join(experiment_path, f"{type}.csv"))

In [105]:
def read_results(experiment_path):
    dfSTCONV = makeOneFile(experiment_path,COLUMNS_INFO, "STCONV")
    dfLSTM = makeOneFile(experiment_path, COLUMNS_INFO,
                                    "LSTM")
    dfDCRNN = makeOneFile(experiment_path,
                                        COLUMNS_INFO, "DCRNN")
    return dfSTCONV, dfLSTM, dfDCRNN

In [106]:
all_experiments = [os.path.join(results_path,d) for d in os.listdir(results_path)]
# latest_experiment = max(all_experiments, key=os.path.getmtime)
latest_experiment = "Experiment"
path_save_plots = os.path.join(current_directory,"Plots")
if not(os.path.exists(path_save_plots)):
    os.makedirs(path_save_plots)
path_save_plots_Experiment = os.path.join(path_save_plots,latest_experiment)
if not(os.path.exists(path_save_plots_Experiment)):
    os.makedirs(path_save_plots_Experiment)
experiment_path = os.path.join(results_path, latest_experiment)
dfSTCONV, dfLSTM, dfDCRNN = read_results(experiment_path)

In [107]:
dfSTCONV[:5]

Unnamed: 0.1,Unnamed: 0,Model,Epsilon,Sigma,Dataset,Criterion,Loss,Epoch,TestOrVal,Trial


In [108]:
dfLSTM[:5]

Unnamed: 0.1,Unnamed: 0,Model,Epsilon,Sigma,Dataset,Criterion,Loss,Epoch,TestOrVal,Trial,DistanceType
0,0,LSTM,0.5,3,Experimental,RMSE,18.822023,1,Validation,7bc27_00000,Geodesic
1,1,LSTM,0.5,3,Experimental,MAPE,0.58625,1,Validation,7bc27_00000,Geodesic
2,2,LSTM,0.5,3,Experimental,MAE,14.310804,1,Validation,7bc27_00000,Geodesic
3,3,LSTM,0.5,3,Experimental,MSE,540.186951,1,Validation,7bc27_00000,Geodesic
4,4,LSTM,0.5,3,Experimental,R2_SQUARED,-1.463044,1,Validation,7bc27_00000,Geodesic


In [109]:
dfDCRNN[:5]

Unnamed: 0.1,Unnamed: 0,Model,Epsilon,Sigma,Dataset,Criterion,Loss,Epoch,TestOrVal,Trial


In [110]:
def calc_table_height(df, base=208, height_per_row=20, char_limit=30, height_padding=16.5):
    '''
    df: The dataframe with only the columns you want to plot
    base: The base height of the table (header without any rows)
    height_per_row: The height that one row requires
    char_limit: If the length of a value crosses this limit, the row's height needs to be expanded to fit the value
    height_padding: Extra height in a row when a length of value exceeds char_limit
    '''
    total_height = 0 + base
    for x in range(df.shape[0]):
        total_height += height_per_row
    for y in range(df.shape[1]):
        if len(str(df.iloc[x][y])) > char_limit:
            total_height += height_padding
    return total_height

In [111]:
def TableFinalResults(dfSTCONV,dfLSTM,dfDCRNN) -> None:

    dfResults = dfSTCONV.append(dfDCRNN, ignore_index=True)
    dfResults = dfResults.append(dfLSTM, ignore_index=True)

    df = pd.DataFrame(
        columns=["Model", "Size", "Distance", "Generation", "RMSE", "MAPE", "MAE", "MSE", "R2_SQUARED"])

    
    for model in [ModelType.DCRNN, ModelType.LSTM, ModelType.STCONV]:
        for dataset in [Dataset.Experimental, Dataset.Tiny]:
            for GenerationType in ["Generative", "Manual", "LR"]:
                for distanceType in DistanceType:
                    if not (GenerationType == "Generative"):
                        datasetName = f"{dataset.name}{GenerationType}"
                    else:
                        datasetName = dataset.name
                    if (model == ModelType.LSTM):
                        modelName = "GCLSTM"
                    else:
                        modelName = model.name
                    dfResultsTemp = dfResults[dfResults["Model"]
                                                == model.name]
                    dfResultsTemp = dfResultsTemp[dfResultsTemp["Dataset"]
                                                    == datasetName]
                    dfResultsTemp = dfResultsTemp[dfResultsTemp["DistanceType"]
                                                    == distanceType.name]
                    dfResultsTemp = dfResultsTemp[dfResultsTemp["TestOrVal"] == "Test"]

                    dfResultsTemp = dfResultsTemp[["Criterion", "Loss"]]
                    # dfResultsR2 = dfResultsTemp.loc[int(dfResultsTemp["Criterion"]) > 0, "Criterion"].min()
                    dfResultsTemp = dfResultsTemp.groupby(["Criterion"]).min().T
                    df = df.append(
                        {
                            "Model": modelName,
                            "Size": datasetName,
                            "Distance": distanceType.name,
                            "Generation": GenerationType,
                            "RMSE": format(
                                (float)(dfResultsTemp["RMSE"].iloc[0]), '.2f'),
                            "MAPE": format(
                                (float)(dfResultsTemp["MAPE"].iloc[0]), '.2f'),
                            "MAE": format((float)(dfResultsTemp["MAE"].iloc[0]), '.2f'),
                            "MSE": format((float)(dfResultsTemp["MSE"].iloc[0]), '.2f'),
                            # "R2_SQUARED": format((float)(dfResultsTemp["R2_SQUARED"].iloc[0]), '.2f')
                            "R2_SQUARED": 0
                        },
                        ignore_index=True)

    layout = dict(height=calc_table_height(df) + 300)
    fig = go.Figure(data=[
        go.Table(header=dict(values=list(df.columns),
                                fill_color='paleturquoise',
                                align='left'),
                    cells=dict(values=[
                        df.Model, df.Size, df.Distance, df.Generation, df.RMSE, df.MAPE, df.MAE, df.MSE, df.R2_SQUARED
                    ],
            fill_color='lavender',
            align='left'))
    ], layout=layout)

    iplot(fig, filename=f'TableFinalResults')

In [112]:
TableFinalResults(dfSTCONV,dfLSTM,dfDCRNN)

KeyError: 'RMSE'

In [113]:
def BoxPlotResults(dfSTCONV,dfLSTM,dfDCRNN,distanceType) -> None:
    dfSTCONV["Type"] = "STCONV"
    dfLSTM["Type"] = "GCLSTM"
    dfDCRNN["Type"] = "DCRNN"
    df = dfSTCONV.append(dfLSTM, ignore_index=True)
    df = df.append(dfDCRNN, ignore_index=True)
    df = df[df["DistanceType"] == distanceType.name]
    df = df[df["Loss"] < 20]
    dfTemp = df[df["Criterion"] == LossFunction.MAE.__name__]
    fig = px.box(dfTemp, x="Dataset", y="Loss", color="Type")
    
    iplot(fig, filename=f'BoxPlotResults')


In [114]:
BoxPlotResults(dfSTCONV,dfLSTM,dfDCRNN,DistanceType.OSRM)

In [115]:
BoxPlotResults(dfSTCONV,dfLSTM,dfDCRNN,DistanceType.Geodesic)

In [116]:
def SigmaEpsilonTable(dfSTCONV, dfDCRNN, dfLSTM, SIGMA_ARRAY,EPSILON_ARRAY) -> None:


    dfResults = dfSTCONV.append(dfDCRNN, ignore_index=True)
    dfResults = dfResults.append(dfLSTM, ignore_index=True)

    df = pd.DataFrame(columns=SIGMA_ARRAY,
                        index=EPSILON_ARRAY)
    for sigma in SIGMA_ARRAY:
        for epsilon in EPSILON_ARRAY:
            dfResultsTemp = dfResults[dfResults["Sigma"] == sigma]
            dfResultsTemp = dfResultsTemp[dfResultsTemp["Epsilon"] == epsilon]
            dfResultsTemp = dfResultsTemp.groupby(["Criterion"]).min()
            df.loc[epsilon][sigma] = format(
                (float)(dfResultsTemp["Loss"].iloc[0]), '.2f')
    headers = list(df.columns)
    headers.insert(0, "")
    fig = go.Figure(data=[
        go.Table(header=dict(values=headers,
                                fill_color='paleturquoise',
                                align='left'),
                    cells=dict(values=[
                        EPSILON_ARRAY, df[1], df[3], df[5], df[10]
                    ],
            fill_color='lavender',
            align='left'))
    ])
    iplot(fig, filename=f'SigmaEpsilonTable')

In [117]:
SigmaEpsilonTable(dfSTCONV, dfDCRNN, dfLSTM, SIGMA_ARRAY,EPSILON_ARRAY)

IndexError: single positional indexer is out-of-bounds

In [118]:
def get_correct_trial_dir(experiment_dir,trial_id):
    experiment_dir_glob = os.path.join(experiment_dir,"*")
    for directory in glob(experiment_dir_glob):
        if not os.path.isdir(directory): continue
        progress_csv = os.path.join(directory,"progress.csv")
        progress_df = pd.read_csv(progress_csv)
        if progress_df["trial_id"].iloc[0] == trial_id: return directory
    return ""

In [128]:
def RegresionForDatasetNodes(dfSTCONV, dfDCRNN, dfLSTM, dataset, distanceType, modelType, results_ray_path, experiment_name,proccessed_data_path,path_save_plots) :
    if modelType == ModelType.STCONV:
        dfResults = dfSTCONV
        model = STConvModel(node_features=2,
            num_nodes=Graph_get_number_of_nodes_for_dataset(dataset),
            hidden_channels=32,
            kernel_size=1)
    elif modelType == ModelType.DCRNN:
        dfResults = dfDCRNN
        model = DCRNNModel(node_features=2, hidden_channels=32)
    elif modelType == ModelType.LSTM:
        dfResults = dfLSTM
        model = LSTMModel(node_features=2, hidden_channels=32)

    dfResultsTemp = dfResults[dfResults["Model"] == modelType.name]
    dfResultsTemp = dfResultsTemp[dfResultsTemp["Dataset"] == dataset.name]
    dfResultsTemp = dfResultsTemp[dfResultsTemp["DistanceType"] == distanceType.name]
    dfResultsTemp = dfResultsTemp[dfResultsTemp["TestOrVal"] == "Validation"]
    dfResultsTemp = dfResultsTemp[["Criterion", "Loss"]]
    dfResultsTemp = dfResultsTemp.groupby(["Criterion"]).min().T
    min_loss = dfResultsTemp["MAE"].iloc[0]

    dfBestModel = dfResults[dfResults["Criterion"] == "MAE"]
    dfBestModel = dfBestModel[dfBestModel["Loss"] == min_loss]
    dfBestModel = dfBestModel[dfBestModel["TestOrVal"] == "Validation"]

    checkpoint_directory = os.path.join(results_ray_path,experiment_name)

    ray_experiment = os.path.join(checkpoint_directory, f"{modelType.name}_{dataset.name}_{distanceType.name}_*")
    ray_model_dir = glob(ray_experiment)[0]

    checkpoint_directory = os.path.join(checkpoint_directory,ray_model_dir)
    trial_directory = get_correct_trial_dir(checkpoint_directory,dfBestModel["Trial"].iloc[0])
    if trial_directory == "": 
        print("Trial id not found")
        return
    checkpoint_directory = os.path.join(checkpoint_directory,trial_directory)
    checkpoint_number =  f"checkpoint_00000{dfBestModel['Epoch'].iloc[0]}" if int(dfBestModel['Epoch'].iloc[0]) < 10 else f"checkpoint_0000{dfBestModel['Epoch'].iloc[0]}"
    checkpoint_directory = os.path.join(checkpoint_directory,checkpoint_number)
    path = os.path.join(checkpoint_directory, "checkpoint")
    model.to("cpu")
    model_state,optimizer_state = torch.load(path)
    model.load_state_dict(model_state)
    model.eval()

    folder_name = modelType
    if modelType == ModelType.DCRNN:
        folder_name = ModelType.LSTM

    train_dataset, validation_dataset, test_dataset = DatasetHelper(
                dfBestModel["Sigma"].iloc[0],
                dfBestModel["Epsilon"].iloc[0],
                dataset,
                distanceType,
                proccessed_data_path,
                folder_name).get_dataset()
    edge_index = test_dataset.get_edge_index()
    edge_weight = test_dataset.get_edge_weight()
    nodes_ids = Graph_get_nodes_for_dataset(dataset)
    Y_true_All = []
    Y_pred_All = []
    loss = 0
    for index,(X_test, Y_true) in enumerate(test_dataset):
        Y_pred = model(X_test, edge_index, edge_weight)
        loss += LossFunction.MAE(Y_pred, Y_true)

        Y_true = [item for sublist in Y_true.tolist() for item in sublist]
        Y_pred = [item for sublist in Y_pred.tolist() for item in sublist]
        Y_true_All.append(Y_true)
        Y_pred_All.append(Y_pred)
    loss = loss / (index + 1)
    loss = loss.item()
    print(loss)
    Y_true_All = np.array(Y_true_All).T.tolist()
    Y_pred_All = np.array(Y_pred_All).T.tolist()
    
    for index,(Y_true,Y_pred) in enumerate(zip(Y_true_All,Y_pred_All)):
        node_id = nodes_ids[index]
        dict1 = dict(Time=np.arange(len(Y_pred)),
                        Actual=Y_true,
                        Predicted=Y_pred)
        df = pd.DataFrame(dict1)
        fig = px.line(df,
                        x='Time',
                        y=["Actual", "Predicted"],
                        title="Prediction for node {0}".format(node_id))
        dirname = os.path.join(path_save_plots,f"Regression_{modelType.name}_{dataset.name}_{distanceType.name}")
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        filename = os.path.join(dirname,f"Node_{node_id}.png")
        fig.write_image(filename)



In [129]:
RegresionForDatasetNodes(dfSTCONV, dfDCRNN, dfLSTM, Dataset.Experimental, DistanceType.Geodesic, ModelType.LSTM,results_ray_path, "Experiment", path_processed_data, path_save_plots_Experiment)

12.659317016601562
