In [1]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.loader import DataLoader
from torch_geometric.data import InMemoryDataset, Data
from math import cos, asin, sqrt, pi

In [2]:
i8_vds_list = ['1111514 E',
'1108728 E',
'1114540 E',
'1114546 W',
'1113486 E',
'1116574 W',
'1115356 E',
'1115528 W',
'1115413 E',
'1115517 W',
'1115432 E',
'1115555 W',
'1115438 E',
'1115565 W',
'1111551 E',
'1111552 W',
'1115420 E',
'1116165 W',
'1115426 E',
'1115548 W',
'1111566 E',
'1111540 W',
'1115444 E',
'1115572 W',
'1116783 E',
'1111541 W',
'1111565 E',
'1115578 W',
'1111549 E',
'1111550 W',
'1122623 W',
'1115450 E',
'1115584 W',
'1108366 E',
'1115592 W',
'1127052 W',
'1118752 E',
'1118760 W',
'1108333 W',
'1111564 E',
'1114573 E',
'1127026 W',
'1111535 E',
'1108341 W',
'1108343 W',
'1111534 E',
'1108345 W',
'1108423 W',
'1108347 W',
'1115463 E',
'1115608 W',
'1111530 E',
'1108385 W',
'1111532 E',
'1108351 W',
'1111531 E',
'1111561 W',
'1122637 E',
'1122646 E',
'1108387 W',
'1108353 W',
'1111563 W',
'1115612 W',
'1115616 W',
'1111569 W',
'1115477 E',
'1116593 E',
'1111575 W',
'1115624 E',
'1115628 E',
'1113364 W'
]

In [3]:
data = []
for vds in i8_vds_list:
    vds_id = vds.split()[0] # Sensor ID
    vds_dir = vds.split()[1] # Sensor Direction
    
    # Filepath for each week
    w1_file = vds_id + '_I8' + vds_dir + '_W1' + '.csv' 
    w2_file = vds_id + '_I8' + vds_dir + '_W2' + '.csv'
    
    # Load in dataset for each week
    w1_df = pd.read_csv('../datasets/sensor_speeds/PEMS_I8/'+w1_file)
    w2_df = pd.read_csv('../datasets/sensor_speeds/PEMS_I8/'+w2_file)
    
    # Check that both datasets contain 1 weeks worth of 5 min intervals (1 day = 288 intervals * 7 days = 2016)
    if (len(w1_df) != 2016) or (len(w2_df) != 2016):
        print(vds_id + ' does not contain all times')
        continue
    
    # Create row representing all speeds for one sensor
    row = [int(vds_id)] + list(w1_df['Speed (mph)']) + list(w2_df['Speed (mph)'])
    
    data.append(row)

time_ints = list(w1_df['5 Minutes']) + list(w2_df['5 Minutes']) # Get all time intervals to use as columns
cols = ['vds_id'] + time_ints

sensor_speed = pd.DataFrame(data, columns=cols).set_index('vds_id')

In [4]:
sensor_speed

Unnamed: 0_level_0,1/1/2024 0:00,1/1/2024 0:05,1/1/2024 0:10,1/1/2024 0:15,1/1/2024 0:20,1/1/2024 0:25,1/1/2024 0:30,1/1/2024 0:35,1/1/2024 0:40,1/1/2024 0:45,...,1/14/2024 23:10,1/14/2024 23:15,1/14/2024 23:20,1/14/2024 23:25,1/14/2024 23:30,1/14/2024 23:35,1/14/2024 23:40,1/14/2024 23:45,1/14/2024 23:50,1/14/2024 23:55
vds_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1111514,69.8,67.4,65.2,65.0,63.5,65.6,61.4,62.5,64.3,64.8,...,66.2,66.3,67.5,66.7,67.3,68.3,66.9,61.9,65.2,66.5
1108728,69.7,67.6,67.4,67.3,66.4,67.0,66.2,66.0,65.8,66.3,...,67.9,67.5,67.3,67.4,68.2,67.8,67.4,67.9,67.5,67.9
1114540,69.5,67.2,67.8,68.0,68.4,69.1,68.4,69.2,68.2,69.0,...,68.4,68.8,69.4,68.5,67.6,68.5,69.0,68.1,67.6,68.2
1114546,67.7,67.0,68.7,68.0,69.1,69.3,68.3,68.1,69.3,67.7,...,69.8,68.0,67.9,68.0,68.7,69.2,67.8,68.2,68.1,68.0
1113486,67.9,68.1,67.5,67.6,67.2,67.6,67.7,68.0,67.5,67.9,...,68.3,68.0,68.0,68.4,67.7,68.2,67.5,67.9,67.9,68.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116593,65.9,65.7,66.3,66.2,66.7,65.7,64.5,65.3,65.8,65.8,...,64.2,65.0,65.8,65.9,63.8,64.9,65.6,65.7,66.0,65.5
1111575,67.0,66.3,67.6,69.1,68.3,68.9,68.5,68.9,68.7,69.2,...,60.8,62.6,63.7,64.2,62.8,62.8,63.6,63.3,62.9,63.8
1115624,67.2,66.9,66.6,66.5,66.7,66.5,66.5,66.3,66.3,66.5,...,66.7,66.5,66.5,66.5,66.6,66.5,66.6,66.7,66.5,66.6
1115628,67.2,66.9,66.6,66.5,66.7,66.5,66.5,66.3,66.3,66.5,...,66.7,66.5,66.5,66.5,66.6,66.5,66.6,66.7,66.5,66.6


In [5]:
sensor_pos = pd.read_csv('../datasets/sensor_positions/I8_vds_pos.csv').set_index('vds_id')

In [6]:
sensor_pos

Unnamed: 0_level_0,Lat,Lng
vds_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1111514,32.759902,-117.198808
1108728,32.759348,-117.183573
1114540,32.759183,-117.178777
1114546,32.759391,-117.178628
1113486,32.760765,-117.168352
...,...,...
1116593,32.839935,-116.887743
1111575,32.844154,-116.882891
1115624,32.845664,-116.876283
1115628,32.850581,-116.870112


In [7]:
# Find distance (in miles) between two coordinates 
# Source: https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula
def distance(lat1, lon1, lat2, lon2):
    r = 3956 # miles
    p = pi / 180

    a = 0.5 - cos((lat2-lat1)*p)/2 + cos(lat1*p) * cos(lat2*p) * (1-cos((lon2-lon1)*p))/2
    return 2 * r * asin(sqrt(a))

In [8]:
sensor_list = list(sensor_pos.index)
sensor_dist = pd.DataFrame(index=sensor_list, columns=sensor_list) # Empty dataframe with sensors as index and columns
# Find distances (miles) for all pairs of sensors
for sen1 in sensor_list:
    for sen2 in sensor_list:
        if sen1 == sen2:
            sensor_dist.loc[sen1, sen2] = 0.0
            continue
            
        sen1_lat = sensor_pos.loc[sen1, 'Lat']
        sen1_lon = sensor_pos.loc[sen1, 'Lng']
        sen2_lat = sensor_pos.loc[sen2, 'Lat']
        sen2_lon = sensor_pos.loc[sen2, 'Lng']
        
        sensor_dist.loc[sen1, sen2] = distance(sen1_lat, sen1_lon, sen2_lat, sen2_lon)

In [9]:
sensor_dist

Unnamed: 0,1111514,1108728,1114540,1114546,1113486,1116574,1115356,1115528,1115413,1115517,...,1111563,1115612,1115616,1111569,1115477,1116593,1111575,1115624,1115628,1113364
1111514,0.0,0.885423,1.164129,1.172251,1.76937,1.767786,2.033846,2.040105,2.188741,2.186428,...,17.015535,17.369983,17.715828,18.073822,18.497598,18.880093,19.235202,19.632129,20.076641,21.090337
1108728,0.885423,0.0,0.278706,0.28714,0.889178,0.888571,1.160234,1.16797,1.320574,1.319376,...,16.165172,16.524823,16.87367,17.234458,17.661997,18.048281,18.406559,18.803174,19.251102,20.262943
1114540,1.164129,0.278706,0.0,0.016766,0.615086,0.615353,0.889902,0.89864,1.053049,1.05258,...,15.898099,16.259469,16.609298,16.970995,17.399761,17.787281,18.146589,18.543078,18.992111,20.003283
1114546,1.172251,0.28714,0.016766,0.0,0.604152,0.604166,0.87816,0.886705,1.040847,1.04026,...,15.88615,16.247275,16.596975,16.958557,17.387174,17.774553,18.133747,18.53025,18.979166,19.99041
1113486,1.76937,0.889178,0.615086,0.604152,0.0,0.012253,0.278597,0.289408,0.445252,0.446356,...,15.284715,15.647199,15.997748,16.360158,16.78995,17.178574,17.538846,17.935201,18.385305,19.395791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116593,18.880093,18.048281,17.787281,17.774553,17.178574,17.176644,16.90036,16.890562,16.735215,16.73526,...,2.214129,1.729949,1.333483,0.934426,0.449083,0.0,0.405065,0.773568,1.259478,2.23004
1111575,19.235202,18.406559,18.146589,18.133747,17.538846,17.536787,17.260491,17.250545,17.095069,17.094994,...,2.61904,2.134948,1.738513,1.339477,0.853993,0.405065,0.0,0.397241,0.863936,1.857479
1115624,19.632129,18.803174,18.543078,18.53025,17.935201,17.93316,17.656865,17.646939,17.491479,17.49142,...,2.962677,2.487427,2.091711,1.691898,1.20955,0.773568,0.397241,0.0,0.493343,1.462409
1115628,20.076641,19.251102,18.992111,18.979166,18.385305,18.383129,18.10685,18.096771,17.941194,17.941011,...,3.456018,2.980206,2.584248,2.184274,1.700739,1.259478,0.863936,0.493343,0.0,1.019624


In [10]:
sensor_conn = pd.DataFrame(index=sensor_list, columns=sensor_list)
# Find connectivity between all pairs of sensors
for sen1 in i8_vds_list:
    for sen2 in i8_vds_list:
        sen1_vds = int(sen1.split()[0])
        sen1_dir = sen1.split()[1]
        sen2_vds = int(sen2.split()[0])
        sen2_dir = sen2.split()[1]
        
        if sen1_dir == sen2_dir:
            sensor_conn.loc[sen1_vds, sen2_vds] = 1
        
        else:
            sensor_conn.loc[sen1_vds, sen2_vds] = 0

In [11]:
sensor_conn

Unnamed: 0,1111514,1108728,1114540,1114546,1113486,1116574,1115356,1115528,1115413,1115517,...,1111563,1115612,1115616,1111569,1115477,1116593,1111575,1115624,1115628,1113364
1111514,1,1,1,0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,1,0
1108728,1,1,1,0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,1,0
1114540,1,1,1,0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,1,0
1114546,0,0,0,1,0,1,0,1,0,1,...,1,1,1,1,0,0,1,0,0,1
1113486,1,1,1,0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1116593,1,1,1,0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,1,0
1111575,0,0,0,1,0,1,0,1,0,1,...,1,1,1,1,0,0,1,0,0,1
1115624,1,1,1,0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,1,0
1115628,1,1,1,0,1,0,1,0,1,0,...,0,0,0,0,1,1,0,1,1,0


# Helper Functions

In [12]:
def z_score(x, mean, std):
    """
    Z-score normalization function: $z = (X - \mu) / \sigma $,
    where z is the z-score, X is the value of the element,
    $\mu$ is the population mean, and $\sigma$ is the standard deviation.
    :param x: torch array, input array to be normalized.
    :param mean: float, the value of mean.
    :param std: float, the value of standard deviation.
    :return: torch array, z-score normalized array.
    """
    return (x - mean) / std

def un_z_score(x_normed, mean, std):
    """
    Undo the Z-score calculation
    :param x_normed: torch array, input array to be un-normalized.
    :param mean: float, the value of mean.
    :param std: float, the value of standard deviation.
    """
    return x_normed * std  + mean


def MAPE(v, v_):
    """
    Mean absolute percentage error, given as a % (e.g. 99 -> 99%)
    :param v: torch array, ground truth.
    :param v_: torch array, prediction.
    :return: torch scalar, MAPE averages on all elements of input.
    """
    return torch.mean(torch.abs((v_ - v)) /(v + 1e-15) * 100)


def RMSE(v, v_):
    """
    Mean squared error.
    :param v: torch array, ground truth.
    :param v_: torch array, prediction.
    :return: torch scalar, RMSE averages on all elements of input.
    """
    return torch.sqrt(torch.mean((v_ - v) ** 2))


def MAE(v, v_):
    """
    Mean absolute error.
    :param v: torch array, ground truth.
    :param v_: torch array, prediction.
    :return: torch scalar, MAE averages on all elements of input.
    """
    return torch.mean(torch.abs(v_ - v))

In [13]:
def distance_to_weights(dist_df, conn_df):
    # Inverse transform distances
    dist_array = dist_df.values
    dist_array = np.where(dist_array == 0, np.nan, dist_array)
    dist_array_inv = 1 / dist_array
    dist_array_inv = pd.DataFrame(dist_array_inv).fillna(0).values
    
    # Mask with directional connectivity
    conn_array = conn_df.values
    W = dist_array_inv * conn_array
    
    # Mask with nearest sensor connectivity
    near_sen = np.zeros((W.shape[0], W.shape[0]))
    for sen in range(W.shape[0]-1):
        no_neigh = False
        count = 1
        while W[sen][sen+count] == 0:
            if count == (W.shape[0]-sen-1):
                no_neigh = True
                break            
            count+=1

        if no_neigh:
            near_sen[sen][sen+count] = 0

        else:
            near_sen[sen][sen+count] = 1
    
    near_sen_sym = np.triu(near_sen) + np.triu(near_sen, 1).T # Make symmetric  
    W = W * near_sen_sym
    
    return W

In [14]:
W = distance_to_weights(sensor_dist, sensor_conn)
W

array([[0.0, 1.129403272735138, 0.0, ..., 0.0, 0.0, 0.0],
       [1.129403272735138, 0.0, 3.5880054045242775, ..., 0.0, 0.0, 0.0],
       [0.0, 3.5880054045242775, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       [0.0, 0.0, 0.0, ..., 0.0, 2.026986637233521, 0.0],
       [0.0, 0.0, 0.0, ..., 2.026986637233521, 0.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [15]:
class BaselineDataset(InMemoryDataset):
    def __init__(self, config, W, root='', transform=None, pre_transform=None):
        self.config = config
        self.W = W
        super().__init__(root, transform, pre_transform)
        self.process()
    
    def process(self):
        data = sensor_speed.T.values
        mean = np.mean(data)
        std_dev = np.std(data)
        data = z_score(data, mean, std_dev)
        
        n_node = data.shape[1]
        n_window = self.config['N_PRED'] + self.config['N_HIST']
        
        edge_index = torch.zeros((2, n_node**2), dtype=torch.long)
        edge_attr = torch.zeros((n_node**2, 1))
        num_edges = 0
        for i in range(n_node):
            for j in range(n_node):
                if self.W[i, j] != 0:
                    edge_index[0, num_edges] = i
                    edge_index[1, num_edges] = j
                    edge_attr[num_edges] = self.W[i, j]
                    num_edges += 1
        
        # Resize to keep first num_edges entries
        edge_ind_aslst = edge_index.tolist()
        for i in range(len(edge_ind_aslst[0])):
            if (edge_ind_aslst[0][i] == 0) and (edge_ind_aslst[1][i] == 0):
                first = edge_ind_aslst[0][:i]
                second = edge_ind_aslst[1][:i]
                edge_index = torch.tensor([first, second], dtype=torch.long)
                break
        edge_attr = edge_attr.resize_(num_edges, 1)
        
        sequences = []
        # T x F x N
        for i in range(self.config['N_DAYS']):
            for j in range(self.config['N_SLOT']):
                # for each time point construct a different graph with data object
                
                g = Data()
                g.__num_nodes__ = n_node

                g.edge_index = edge_index
                g.edge_attr  = edge_attr

                # (F,N) switched to (N,F)
                sta = i * self.config['N_DAY_SLOT'] + j
                end = sta + n_window
                # [21, 228]
                full_window = np.swapaxes(data[sta:end, :], 0, 1)
                g.x = torch.FloatTensor(full_window[:, 0:self.config['N_HIST']])
                g.y = torch.FloatTensor(full_window[:, self.config['N_HIST']::])
                sequences += [g]   
        
        data, slices = self.collate(sequences)
        self.data, self.slices = data, slices
        self.n_node, self.mean, self.std_dev = n_node, mean, std_dev
        
    @property
    def processed_file_names(self):
        return []

In [16]:
# Constant config to use throughout
config = {
    'N_PRED': 2,
    'N_HIST': 2,
    # number of possible 5 minute measurements per day
    'N_DAY_SLOT': 288,
    # number of days worth of data in the dataset
    'N_DAYS': 14,
    'N_NODE': 71,
}
# Number of possible windows in a day
config['N_SLOT']= config['N_DAY_SLOT'] - (config['N_PRED']+config['N_HIST']) + 1

W = distance_to_weights(sensor_dist, sensor_conn)
dataset = BaselineDataset(config, W)

Processing...
Done!


In [17]:
dataset

BaselineDataset(3990)

In [18]:
sensor_speed.T

vds_id,1111514,1108728,1114540,1114546,1113486,1116574,1115356,1115528,1115413,1115517,...,1111563,1115612,1115616,1111569,1115477,1116593,1111575,1115624,1115628,1113364
1/1/2024 0:00,69.8,69.7,69.5,67.7,67.9,66.5,70.9,70.4,67.5,68.5,...,68.1,68.2,64.3,66.7,67.0,65.9,67.0,67.2,67.2,68.1
1/1/2024 0:05,67.4,67.6,67.2,67.0,68.1,65.6,69.9,70.0,67.1,69.0,...,68.4,68.2,64.2,67.2,66.6,65.7,66.3,66.9,66.9,67.5
1/1/2024 0:10,65.2,67.4,67.8,68.7,67.5,66.2,69.2,70.2,66.9,68.6,...,68.6,68.1,64.4,67.1,67.4,66.3,67.6,66.6,66.6,67.5
1/1/2024 0:15,65.0,67.3,68.0,68.0,67.6,65.5,70.2,71.3,66.6,67.8,...,68.2,68.0,64.7,67.5,67.4,66.2,69.1,66.5,66.5,68.8
1/1/2024 0:20,63.5,66.4,68.4,69.1,67.2,67.3,69.1,70.7,65.8,67.7,...,68.1,68.0,64.7,66.6,67.9,66.7,68.3,66.7,66.7,67.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1/14/2024 23:35,68.3,67.8,68.5,69.2,68.2,66.6,69.0,71.1,66.4,68.5,...,68.0,68.0,68.0,66.5,66.7,64.9,62.8,66.5,66.5,68.9
1/14/2024 23:40,66.9,67.4,69.0,67.8,67.5,66.5,69.1,70.7,67.5,68.1,...,68.6,68.0,68.0,66.6,68.0,65.6,63.6,66.6,66.6,69.2
1/14/2024 23:45,61.9,67.9,68.1,68.2,67.9,67.2,70.3,70.3,68.3,68.4,...,69.3,68.1,68.1,66.7,67.5,65.7,63.3,66.7,66.7,68.3
1/14/2024 23:50,65.2,67.5,67.6,68.1,67.9,67.0,70.1,71.2,67.7,68.7,...,68.5,68.0,68.0,66.5,67.5,66.0,62.9,66.5,66.5,68.8
