One graph dataset contains:
1. 12 sets of node features (num_nodes, node_features)*12
2. static edge indices (2, num_edges)
3. 12 sets of edge attributes (num_edges, edge feature dim)*12
4. 12 sets of time attributes (1, num_nodes)*12

In [None]:
import numpy as np
import pandas as pd
from geopy.distance import geodesic
from sklearn.preprocessing import LabelEncoder

import torch
import json

In [None]:
# Load the bus dataset; the "datetime" column is parsed into timestamps on read.
dataset = pd.read_csv('dataset_final.csv', parse_dates=["datetime"])
# 'Unnamed: 0' is presumably a stale index column from an earlier export - verify.
dataset = dataset.drop(columns='Unnamed: 0')
dataset

In [None]:
# Load the stop dataset (pipe-delimited) and keep only the lines present in the bus dataset.
stops = pd.read_csv('../bus_stops_rio_de_janeiro.csv', delimiter="|")
# Keep rows whose line number is a plain (optionally signed / decimal) number;
# missing values count as non-matching.
is_numeric_line = stops['line_number'].str.match(r'^-?\d+(\.\d+)?$', na=False)
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the raw frame (SettingWithCopyWarning).
stops = stops[is_numeric_line].copy()
stops['line_number'] = stops['line_number'].astype(float)
# Restrict to lines that occur in the bus dataset; copy again because later
# cells add columns to `stops`.
stops = stops[stops["line_number"].isin(dataset["line"].unique())].copy()
stops

In [None]:
# Number of distinct lines listed for each stop name, broadcast back to every row.
routes_per_stop = stops.groupby("stop_name")["line_number"].transform("nunique")
stops["num_routes_at_stop"] = routes_per_stop
# A stop counts as shared when more than one line is listed for its name.
stops["is_shared_stop"] = routes_per_stop > 1

In [None]:
# Restrict both frames to a small set of lines.
# The list is defined once so the two filters cannot drift apart.
selected_lines = [847, 935, 334]
# .copy() detaches the filtered frames: later cells assign new columns to both,
# which would otherwise raise SettingWithCopyWarning on a slice.
stops = stops[stops['line_number'].isin(selected_lines)].copy()
dataset = dataset[dataset['line'].isin(selected_lines)].copy()


In [None]:
# Attach the shared-stop flag to every bus record via its nearest stop name.
shared_flags = stops[['stop_name', 'is_shared_stop']].drop_duplicates()
stop_shared_dict = shared_flags.set_index('stop_name')['is_shared_stop'].to_dict()
# Names missing from the lookup map to NaN.
dataset['is_shared_stop'] = dataset['nearest_stop_name'].map(stop_shared_dict)

In [None]:
# Stop features: coordinates of the preceding row within the same line.
# The first row of each line has no predecessor and therefore gets NaN.
prev_coords = stops.groupby("line_number")[["stop_latitude", "stop_longitude"]].shift(1)
stops["prev_latitude"] = prev_coords["stop_latitude"]
stops["prev_longitude"] = prev_coords["stop_longitude"]

def haversine(row):
    """Return the distance in meters from the previous stop to this stop.

    Despite the name, the distance is computed with geopy's ``geodesic``.

    Args:
        row: mapping/Series with "prev_latitude", "prev_longitude",
            "stop_latitude" and "stop_longitude".

    Returns:
        0 when either previous coordinate is missing, otherwise the
        geodesic distance in meters.
    """
    no_previous_stop = pd.isna(row["prev_latitude"]) or pd.isna(row["prev_longitude"])
    if no_previous_stop:
        return 0
    origin = (row["prev_latitude"], row["prev_longitude"])
    destination = (row["stop_latitude"], row["stop_longitude"])
    return geodesic(origin, destination).meters
# Row-wise apply: rows without a previous coordinate (the first row of each line) get 0.
stops["distance_from_previous_stop"] = stops.apply(haversine, axis=1)

In [None]:
# create features : bus dataset
dataset["hour"] = dataset["datetime"].dt.hour
dataset["minute"] = dataset["datetime"].dt.minute
dataset["is_weekday"] = dataset["datetime"].dt.weekday < 5 
dataset["is_weekday"] = dataset["is_weekday"].astype(int)
dataset["travel_time"] = dataset["distance_to_next_stop"] / dataset["speed"].replace(0, np.nan)
dataset["travel_time"] = dataset["travel_time"].fillna(0)

In [None]:
# edge features: distance, route_multiplicity (1) static
# node features: distance_to_next_stop, is_at_stop, speed , hour , minute , is_shared_stop (1) dynamic
# time feature: is_weekday (1) dynamic
# target: travel_time (1)

In [None]:
# create edge dataset
edges = []
for line in stops['line_number'].unique():
    line_data = stops[stops['line_number'] == line]
    
    for i in range(len(line_data) - 1):
        stop1 = line_data.iloc[i]
        stop2 = line_data.iloc[i + 1]
        distance = stop2['distance_from_previous_stop']
        # (start stop, end stop, distance)
        edge = {
            'start_stop': stop1['stop_name'],
            'end_stop': stop2['stop_name'],
            'distance': distance,
            'route_multiplicity': None
        }
        edges.append(edge)
edges_df = pd.DataFrame(edges)

In [None]:
# Preview the first rows of the edge table.
edges_df.head()

In [None]:
# Preview the first two bus records.
dataset.head(2)

In [None]:
# Fill route_multiplicity: an edge is flagged when both of its endpoints are shared stops.
shared_stops = dataset.set_index("nearest_stop_name")["is_shared_stop"].to_dict()


def _both_ends_shared(edge):
    """Combine the shared flags of an edge's two endpoints (absent stops count as False)."""
    return shared_stops.get(edge["start_stop"], False) and shared_stops.get(edge["end_stop"], False)


edges_df["route_multiplicity"] = edges_df.apply(_both_ends_shared, axis=1)

In [None]:
# Encode the node dataset: integer-encode the categorical columns in place and
# keep each fitted encoder so the same mapping can be reused later.
encode_columns = ["order", "nearest_stop_name", "is_at_stop", "is_shared_stop"]
encoders = {col: LabelEncoder() for col in encode_columns}

for col, le in encoders.items():
    dataset[col] = le.fit_transform(dataset[col])

In [None]:
# Show the fitted encoders, keyed by column name.
encoders

In [None]:
# Encode the edge dataset with the same stop ids as the node dataset.
encoded_edge_df = edges_df.copy()
encode_columns = ["start_stop", "end_stop"]
# LabelEncoder encodes each class as its position in classes_, so a dict built
# once gives the same ids as transform() without a transform call and a linear
# membership scan for every cell.
stop_to_id = {name: idx for idx, name in enumerate(encoders['nearest_stop_name'].classes_)}
for col in encode_columns:
    if col in encoded_edge_df:
        # Stops the encoder never saw are marked with the sentinel -1.
        encoded_edge_df[col] = encoded_edge_df[col].map(lambda x: stop_to_id.get(x, -1))

# Booleans to 0/1; any value that is neither True nor False maps to NaN.
binary_mapping = {True: 1, False: 0}
encoded_edge_df["route_multiplicity"] = encoded_edge_df["route_multiplicity"].map(binary_mapping)

In [None]:
# Inspect the encoded edge table.
encoded_edge_df

In [None]:
# Create the edge index tensor of shape (2, num_edges): row 0 = source ids, row 1 = target ids.
encoded_edge_df = encoded_edge_df.drop_duplicates(subset=['start_stop', 'end_stop'])
# -1 is the "unknown stop" sentinel from the encoding step; such an edge points
# at no node, so it must not reach the edge index.
has_known_endpoints = (encoded_edge_df[['start_stop', 'end_stop']] >= 0).all(axis=1)
encoded_edge_df = encoded_edge_df[has_known_endpoints]
# Stack into a single ndarray first: torch.tensor on a list of ndarrays is slow
# and emits a warning.
edge_indices = torch.tensor(
    np.stack([
        encoded_edge_df["start_stop"].to_numpy(),
        encoded_edge_df["end_stop"].to_numpy(),
    ]),
    dtype=torch.long,
)


In [None]:
# Edge feature matrix: one row per edge, columns ordered [distance, route_multiplicity].
edge_feature_columns = ['distance', 'route_multiplicity']
edge_features = encoded_edge_df[edge_feature_columns].values
edge_features

In [None]:
# Number of edges (rows of the edge feature matrix).
len(edge_features)

In [None]:
# Node features: per stop, the chronologically ordered list of feature rows.
non_feature_columns = ['date', 'time', 'latitude', 'longitude', 'datetime', 'is_weekday', 'travel_time']
node_dataset = dataset.copy().sort_values(by=['datetime']).drop(non_feature_columns, axis=1)


def _rows_without_stop(group):
    """Return the group's rows as plain lists, without the grouping column."""
    return group.drop(columns=['nearest_stop_name']).values.tolist()


node_features = node_dataset.groupby('nearest_stop_name').apply(_rows_without_stop).to_dict()

In [None]:

# Build one snapshot per time step, holding the current feature row of every stop.
total_snapshots = sum(len(v) for v in node_features.values())

# There are as many steps as the longest per-stop sequence. A stop whose
# sequence is exhausted keeps repeating its last row (forward fill). Indexing
# with min(step, last) gives the same result as draining per-stop queues, but
# avoids list.pop(0), which is O(n) per call.
num_steps = max((len(rows) for rows in node_features.values()), default=0)
node_snapshots = [
    [rows[min(step, len(rows) - 1)] for rows in node_features.values()]
    for step in range(num_steps)
]

node_snapshots

In [None]:
# Time features: per stop, the chronologically ordered is_weekday values (one-element rows).
time_dataset = dataset.copy().sort_values(by=['datetime'])[['nearest_stop_name', 'is_weekday']]


def _rows_without_stop(group):
    """Return the group's rows as plain lists, without the grouping column."""
    return group.drop(columns=['nearest_stop_name']).values.tolist()


time_features = time_dataset.groupby('nearest_stop_name').apply(_rows_without_stop).to_dict()

In [None]:
# Unwrap the one-element rows so each stop maps to a flat list of values.
time_features = {
    stop: [row[0] for row in rows]
    for stop, rows in time_features.items()
}

In [None]:
# Build one snapshot per time step, holding the current time feature of every stop.
# There are as many steps as the longest per-stop sequence. A stop whose
# sequence is exhausted keeps repeating its last value (forward fill). Indexing
# with min(step, last) gives the same result as draining per-stop queues, but
# avoids list.pop(0), which is O(n) per call.
num_steps = max((len(values) for values in time_features.values()), default=0)
time_snapshots = [
    [values[min(step, len(values) - 1)] for values in time_features.values()]
    for step in range(num_steps)
]

time_snapshots

In [None]:
# Target: per stop, the chronologically ordered travel_time values (one-element rows).
target_dataset = dataset.copy().sort_values(by=['datetime'])[['nearest_stop_name', 'travel_time']]


def _rows_without_stop(group):
    """Return the group's rows as plain lists, without the grouping column."""
    return group.drop(columns=['nearest_stop_name']).values.tolist()


target_features = target_dataset.groupby('nearest_stop_name').apply(_rows_without_stop).to_dict()

In [None]:
# Unwrap the one-element rows so each stop maps to a flat list of travel times.
target_features = {
    stop: [row[0] for row in rows]
    for stop, rows in target_features.items()
}
target_features

In [None]:
# Number of target values recorded under key 1 (presumably an encoded stop id - verify).
len(target_features[1])

In [None]:
# node_snapshots(12, num_nodes, node_feature_dim),
# edge_indices(2, edge_count),
# edge_features(12, edge_count, edge_feature_dim),
# time_snapshots(12, num_nodes),
# target_features(num_nodes, 12)


In [None]:
# Cut every stop's target series into overlapping windows of `batch_size` values.
batch_size = 12
num_nodes = len(target_features)
max_length = max(len(v) for v in target_features.values())  # longest series

# Pad shorter series in place by repeating their last value up to max_length.
for series in target_features.values():
    shortfall = max_length - len(series)
    if shortfall > 0:
        series.extend([series[-1]] * shortfall)

# Sliding windows with stride 1: window k covers positions k .. k + batch_size - 1.
num_batches = max_length - batch_size + 1
target_batches = {
    node: [values[start:start + batch_size] for start in range(num_batches)]
    for node, values in target_features.items()
}

target_batches

In [None]:
# List the keys (one per stop) of the windowed targets.
target_batches.keys()

In [None]:
# Sliding windows (stride 1) of `batch_size` consecutive time snapshots.
batch_size = 12
num_snapshots = len(time_snapshots)

window_starts = range(num_snapshots - batch_size + 1)
time_snapshot_batches = [
    time_snapshots[start:start + batch_size] for start in window_starts
]
time_snapshot_batches

In [None]:
# Edge features do not change over time: repeat the same matrix once per step
# of a window. The literal 12 equals the batch_size used for the other windows.
edge_features_batches = [edge_features]*12

In [None]:
# Sliding windows (stride 1) of `batch_size` consecutive node snapshots.
batch_size = 12
num_snapshots = len(node_snapshots)

window_starts = range(num_snapshots - batch_size + 1)
node_snapshot_batches = [
    node_snapshots[start:start + batch_size] for start in window_starts
]
node_snapshot_batches

In [None]:
# Length of one node's feature row (first window, first snapshot, first node).
len(node_snapshot_batches[0][0][0])

In [None]:
# Sanity check: show the sizes of the four windowed collections side by side.
display(len(target_batches[0]))  # windows stored under key 0
display(len(node_snapshot_batches))
display(len(edge_features_batches))
display(len(time_snapshot_batches))

In [None]:
# Assemble one sample per window:
# [node features, edge indices, edge features, time features].
# The edge feature stack is identical for every window, so it is converted once
# here instead of once per iteration.
edge_feature_tensor = torch.tensor(np.asarray(edge_features_batches, dtype=np.float32))
graph_seq = []
for i, node_window in enumerate(node_snapshot_batches):
    seq = [
        torch.tensor(node_window),
        # clone() gives each sample its own copy without the copy-construct
        # warning that torch.tensor(<tensor>) raises.
        edge_indices.clone(),
        edge_feature_tensor.clone(),
        torch.tensor(time_snapshot_batches[i]),
    ]
    graph_seq.append(seq)
graph_seq

In [None]:
# Inspect the first assembled sample.
graph_seq[0]

In [None]:
# Save the input data: the list of per-window samples (graph_seq) is written with torch.save.
torch.save(graph_seq, "../train_data/rio_data/rio_data.pth")

In [None]:
# Reload the saved input file and show its first sample.
loaded_tensor = torch.load("../train_data/rio_data/rio_data.pth")
loaded_tensor[0]

In [None]:
# Regroup the per-stop target windows so the window index comes first.
nodes = list(target_batches.keys())
num_snapshots = len(next(iter(target_batches.values())))  # 4300

# For window i, collect that window from every stop, in the order of `nodes`.
target_snapshot_batches = [
    [target_batches[node][i] for node in nodes]
    for i in range(num_snapshots)
]

snapshot_batches_tensor = torch.tensor(target_snapshot_batches, dtype=torch.float32)

In [None]:
# Save the target data: the windowed target tensor is written with torch.save.
torch.save(snapshot_batches_tensor, "../train_data/rio_data/rio_data_target.pth")

In [None]:
# Inspect the last target window. Index -1 replaces the hard-coded 4299, which
# only works when there are exactly 4300 windows and raises IndexError otherwise.
snapshot_batches_tensor[-1]