### Preprocessing

In [5]:
# read dataset and select routes having less missing values
import pandas as pd

nyc_06 = pd.read_csv("../NYC-Bus Data/cleaned_mta_1706.csv")
nyc_08 = pd.read_csv("../NYC-Bus Data/cleaned_mta_1708.csv")
nyc_10 = pd.read_csv("../NYC-Bus Data/cleaned_mta_1710.csv")
nyc_12 = pd.read_csv("../NYC-Bus Data/cleaned_mta_1712.csv")

In [6]:
nyc_06.dtypes

RecordedAtTime                object
DirectionRef                   int64
PublishedLineName             object
OriginName                    object
OriginLat                    float64
OriginLong                   float64
DestinationName               object
DestinationLat               float64
DestinationLong              float64
VehicleRef                    object
VehicleLocation.Latitude     float64
VehicleLocation.Longitude    float64
NextStopPointName             object
ArrivalProximityText          object
DistanceFromStop             float64
ExpectedArrivalTime           object
ScheduledArrivalTime          object
dtype: object

In [7]:
# combine the datasets
nyc = pd.concat([nyc_06, nyc_08, nyc_10, nyc_12] ,ignore_index=True)
display(nyc.dtypes)
display(nyc.head())

RecordedAtTime                object
DirectionRef                 float64
PublishedLineName             object
OriginName                    object
OriginLat                    float64
OriginLong                   float64
DestinationName               object
DestinationLat               float64
DestinationLong              float64
VehicleRef                    object
VehicleLocation.Latitude     float64
VehicleLocation.Longitude    float64
NextStopPointName             object
ArrivalProximityText          object
DistanceFromStop             float64
ExpectedArrivalTime           object
ScheduledArrivalTime          object
dtype: object

Unnamed: 0,RecordedAtTime,DirectionRef,PublishedLineName,OriginName,OriginLat,OriginLong,DestinationName,DestinationLat,DestinationLong,VehicleRef,VehicleLocation.Latitude,VehicleLocation.Longitude,NextStopPointName,ArrivalProximityText,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:34,0.0,B8,4 AV/95 ST,40.616104,-74.031143,BROWNSVILLE ROCKAWAY AV,40.656048,-73.907379,NYCT_430,40.63517,-73.960803,FOSTER AV/E 18 ST,approaching,76.0,2017-06-01 00:03:59,24:06:14
1,2017-06-01 00:03:43,1.0,S61,ST GEORGE FERRY/S61 & S91,40.643169,-74.073494,S I MALL YUKON AV,40.575935,-74.167686,NYCT_8263,40.590802,-74.15834,MERRYMOUNT ST/TRAVIS AV,approaching,62.0,2017-06-01 00:03:56,23:58:02
2,2017-06-01 00:03:49,0.0,Bx10,E 206 ST/BAINBRIDGE AV,40.875008,-73.880142,RIVERDALE 263 ST,40.912376,-73.902534,NYCT_4223,40.88601,-73.912647,HENRY HUDSON PKY E/W 235 ST,at stop,5.0,2017-06-01 00:03:56,24:00:53
3,2017-06-01 00:03:31,0.0,Q5,TEARDROP/LAYOVER,40.701748,-73.802399,ROSEDALE LIRR STA via MERRICK,40.666012,-73.735939,NYCT_8422,40.668002,-73.729348,HOOK CREEK BL/SUNRISE HY,< 1 stop away,267.0,2017-06-01 00:04:03,24:03:00
4,2017-06-01 00:03:22,1.0,Bx1,RIVERDALE AV/W 231 ST,40.881187,-73.90934,MOTT HAVEN 136 ST via CONCOURSE,40.809654,-73.92836,NYCT_4710,40.868134,-73.893032,GRAND CONCOURSE/E 196 ST,at stop,11.0,2017-06-01 00:03:56,23:59:38


In [8]:
# create vehicle id
vehicle_ref_map = {vehicle: idx for idx, vehicle in enumerate(nyc_06['VehicleRef'].unique())}

for df2 in [nyc_08,nyc_10,nyc_12]:
    new_vehicle_refs = set(df2['VehicleRef'].unique()) - set(vehicle_ref_map.keys())
    # Update the map by adding new VehicleRef values with new encoded values
    for vehicle_ref in new_vehicle_refs:
        vehicle_ref_map[vehicle_ref] = len(vehicle_ref_map)

vehicle_ref_map

{'NYCT_430': 0,
 'NYCT_8263': 1,
 'NYCT_4223': 2,
 'NYCT_8422': 3,
 'NYCT_4710': 4,
 'NYCT_3831': 5,
 'NYCT_4611': 6,
 'NYCT_4841': 7,
 'NYCT_6592': 8,
 'NYCT_8279': 9,
 'NYCT_8334': 10,
 'NYCT_7141': 11,
 'NYCT_6503': 12,
 'NYCT_4723': 13,
 'NYCT_5131': 14,
 'NYCT_274': 15,
 'NYCT_7766': 16,
 'NYCT_6678': 17,
 'NYCT_8408': 18,
 'NYCT_3988': 19,
 'NYCT_7158': 20,
 'NYCT_445': 21,
 'NYCT_5642': 22,
 'NYCT_1280': 23,
 'NYCT_7379': 24,
 'NYCT_342': 25,
 'NYCT_4394': 26,
 'NYCT_6372': 27,
 'NYCT_251': 28,
 'NYCT_3870': 29,
 'NYCT_5139': 30,
 'NYCT_5851': 31,
 'NYCT_399': 32,
 'NYCT_7260': 33,
 'NYCT_714': 34,
 'NYCT_2656': 35,
 'NYCT_7094': 36,
 'NYCT_4548': 37,
 'NYCT_7738': 38,
 'NYCT_4671': 39,
 'NYCT_4792': 40,
 'NYCT_5716': 41,
 'NYCT_4122': 42,
 'NYCT_7104': 43,
 'NYCT_5693': 44,
 'NYCT_5568': 45,
 'NYCT_5685': 46,
 'NYCT_5942': 47,
 'NYCT_6756': 48,
 'NYCT_7366': 49,
 'NYCT_4718': 50,
 'NYCT_5999': 51,
 'NYCT_5673': 52,
 'NYCT_4062': 53,
 'NYCT_8134': 54,
 'NYCT_6597': 55,
 'NYCT_23

In [9]:
# take s61,s62,s66 route with atstop records only
s61_06 = nyc_06[(nyc_06['PublishedLineName'] == 'S61') & (nyc_06['DistanceFromStop'] <= 50)]
s62_06 = nyc_06[(nyc_06['PublishedLineName'] == 'S62') & (nyc_06['DistanceFromStop'] <= 50)]
s66_06 = nyc_06[(nyc_06['PublishedLineName'] == 'S66') & (nyc_06['DistanceFromStop'] <= 50)]

#### s61

In [10]:
# consider S61 route
s61_order_0 = s61_06[(s61_06['DirectionRef'] == 0)]
s61_order_1 = s61_06[s61_06['DirectionRef'] == 1]

# order it according to Recorded time and VehicleRef
s61_order_0 = s61_order_0.sort_values(by=['RecordedAtTime','VehicleRef'])
s61_order_1 = s61_order_1.sort_values(by=['RecordedAtTime','VehicleRef'])

# get unique values of DestinationName of each direction dataset 
display(s61_order_0['DestinationName'].unique())
display(s61_order_1['DestinationName'].unique())

array(['ST GEORGE FERRY'], dtype=object)

array(['S I MALL YUKON AV', 'MARSH AVENUE RICHMOND HILL RD'], dtype=object)

In [11]:
# get only needed destinations
s61_order_0 = s61_order_0[(s61_order_0['DestinationName'] == 'ST GEORGE FERRY')]
s61_order_1 = s61_order_1[(s61_order_1['DestinationName'] == 'S I MALL YUKON AV')]
display(s61_order_0.shape)
display(s61_order_1.shape)

(3115, 17)

(2369, 17)

In [12]:
# get unique NextStopPointName in dataset and compare it with actual Stops Names
actual_stop_names_st_gr_ferry = [
    'YUKON AV/WATCHMANS OFFICE',
# 'YUKON AV/TARGET DRIVEWAY',
'INDEPENDENCE AV/PATHMARK',
'RICHMOND AV/YUKON AV',
'PLATINUM AV/TOYS R US',
'MARSH AV/WINDHAM LOOP',
'MARSH AV/ELMWOOD PK DR',
# 'MARSH AV/RING RD',
'MARSH AV/RICHMOND HILL RD',
'RICHMOND HILL RD/VASSAR ST',
'MERRYMOUNT ST/SHILOH ST',
'MERRYMOUNT ST/TRAVIS AV',
'TRAVIS AV/NEHRING AV',
'TRAVIS AV/DENKER PL',
'TRAVIS AV/EVANS ST',
'TRAVIS AV/LEWISTON ST',
'FOREST HILL RD/ROCKLAND AV',
'FOREST HILL RD/FIELD ST',
'FOREST HILL RD/STEERS ST',
'FOREST HILL RD/DOROTHY ST',
'HAROLD ST/FOREST HILL RD',
'HAROLD ST/WASHINGTON AV',
'HAROLD ST/SUNSET AV',
'HAROLD ST/BRADLEY AV',
'BRADLEY AV/HOLDEN BL',
'BRADLEY AV/WESTWOOD AV',
'BRADLEY AV/PURDY AV',
'BRADLEY AV/VICTORY BL',
'BRADLEY AV/S GANNON AV',
'VICTORY BL/PERRY AV',
'VICTORY BL/MOUNTAINVIEW AV',
'VICTORY BL/LESTER ST',
'VICTORY BL/MANOR RD',
'VICTORY BL/TODT HILL RD',
'VICTORY BL/SLOSSON AV',
'VICTORY BL/LITTLE CLOVE RD',
'VICTORY BL/ALBERT ST',
'VICTORY BL/RENWICK AV',
'VICTORY BL/SENECA AV',
'VICTORY BL/CLOVE RD',
'VICTORY BL/GRAND AV',
'VICTORY BL/HIGHLAND AV',
'VICTORY BL/SILVER LAKE APTS',
'VICTORY BL/SILVER MT CEM',
'VICTORY BL/THERESA PL',
'VICTORY BL/EDDY ST',
'VICTORY BL/FOREST AV',
'VICTORY BL/AUSTIN PL',
'VICTORY BL/CEBRA AV',
'VICTORY BL/JERSEY ST',
'VICTORY BL/ BROOK ST',
'VICTORY BL/VAN DUZER ST',
'VICTORY BL / BAY ST',
'BAY ST/SLOSSON TE',
'BAY ST/BOROUGH PL',
'ST GEORGE FERRY/ST GEORGE FERRY',
'ST GEORGE FERRY/S61 & S91'
]

unique_next_point_names_in_s61_order_0 = s61_order_0.NextStopPointName.unique()

list1 = actual_stop_names_st_gr_ferry
list2 = unique_next_point_names_in_s61_order_0

# compare 2 list to see different names in our dataset
difference = [item for item in list2 if item not in list1]  # Find values in list2 not in list1
print("Values in list2 but not in list1:", difference)

Values in list2 but not in list1: []


In [13]:
# encode stop list
encoded_stops_map = {stop_name: idx for idx, stop_name in enumerate(actual_stop_names_st_gr_ferry)}
print("Encoding Map:", encoded_stops_map)

Encoding Map: {'YUKON AV/WATCHMANS OFFICE': 0, 'INDEPENDENCE AV/PATHMARK': 1, 'RICHMOND AV/YUKON AV': 2, 'PLATINUM AV/TOYS R US': 3, 'MARSH AV/WINDHAM LOOP': 4, 'MARSH AV/ELMWOOD PK DR': 5, 'MARSH AV/RICHMOND HILL RD': 6, 'RICHMOND HILL RD/VASSAR ST': 7, 'MERRYMOUNT ST/SHILOH ST': 8, 'MERRYMOUNT ST/TRAVIS AV': 9, 'TRAVIS AV/NEHRING AV': 10, 'TRAVIS AV/DENKER PL': 11, 'TRAVIS AV/EVANS ST': 12, 'TRAVIS AV/LEWISTON ST': 13, 'FOREST HILL RD/ROCKLAND AV': 14, 'FOREST HILL RD/FIELD ST': 15, 'FOREST HILL RD/STEERS ST': 16, 'FOREST HILL RD/DOROTHY ST': 17, 'HAROLD ST/FOREST HILL RD': 18, 'HAROLD ST/WASHINGTON AV': 19, 'HAROLD ST/SUNSET AV': 20, 'HAROLD ST/BRADLEY AV': 21, 'BRADLEY AV/HOLDEN BL': 22, 'BRADLEY AV/WESTWOOD AV': 23, 'BRADLEY AV/PURDY AV': 24, 'BRADLEY AV/VICTORY BL': 25, 'BRADLEY AV/S GANNON AV': 26, 'VICTORY BL/PERRY AV': 27, 'VICTORY BL/MOUNTAINVIEW AV': 28, 'VICTORY BL/LESTER ST': 29, 'VICTORY BL/MANOR RD': 30, 'VICTORY BL/TODT HILL RD': 31, 'VICTORY BL/SLOSSON AV': 32, 'VICTOR

In [14]:
from geopy.distance import geodesic
# order the dataset
s61_order_0_new = s61_order_0[['RecordedAtTime','NextStopPointName','DirectionRef','PublishedLineName','VehicleRef','VehicleLocation.Latitude','VehicleLocation.Longitude','DistanceFromStop']] 
s61_order_0_new["RecordedAtTime"] = pd.to_datetime(s61_order_0_new["RecordedAtTime"])
s61_order_0_new = s61_order_0_new.sort_values(by=["VehicleRef", "RecordedAtTime"]).reset_index(drop=True)

# get location of 55 stops
Lon = []
Lat = []
for stop in actual_stop_names_st_gr_ferry:
    df = s61_order_0_new[s61_order_0_new['NextStopPointName'] == stop]
    Longitude = df['VehicleLocation.Longitude'].iloc[0]
    Lattitude = df['VehicleLocation.Latitude'].iloc[0]

    Lon.append(Longitude)
    Lat.append(Lattitude)

data = {
    "Stop": actual_stop_names_st_gr_ferry,
    "Latitude":Lat,
    "Longitude":Lon
}
temp_df = pd.DataFrame(data)

# calculate distance from prv_stop
distances_from_last_stop = []
last_stop = None

for index, row in temp_df.iterrows():
    current_stop = (row["Latitude"], row["Longitude"])
    if last_stop:
        distance = geodesic(last_stop, current_stop).kilometers*1000
    else:
        distance = 0  # For the first stop, there's no previous stop
    distances_from_last_stop.append(distance)
    last_stop = current_stop  # Update the last stop to current

temp_df["DistanceFromLastStop"] = distances_from_last_stop
temp_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s61_order_0_new["RecordedAtTime"] = pd.to_datetime(s61_order_0_new["RecordedAtTime"])


Unnamed: 0,Stop,Latitude,Longitude,DistanceFromLastStop
0,YUKON AV/WATCHMANS OFFICE,40.57494,-74.165408,0.0
1,INDEPENDENCE AV/PATHMARK,40.573856,-74.16826,269.82845
2,RICHMOND AV/YUKON AV,40.575581,-74.16967,225.713983
3,PLATINUM AV/TOYS R US,40.57847,-74.167575,366.586003
4,MARSH AV/WINDHAM LOOP,40.578058,-74.163652,335.293195


In [15]:
# create edge indexes
import torch
temp_df['StopId'] = temp_df['Stop'].map(encoded_stops_map)

edge_index = []
for i in range(len(temp_df) - 1):
    edge_index.append([temp_df['StopId'].iloc[i], temp_df['StopId'].iloc[i + 1]])
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
print(edge_index)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53],
        [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
         37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]])


In [16]:
# file_path = "Preprocess/edge_index_s61_1.pt"
# torch.save(edge_index, file_path)

In [17]:
# create edge attributes
edge_attr = torch.tensor(distances_from_last_stop[1:], dtype=torch.float).view(-1, 1)
edge_attr

tensor([[269.8285],
        [225.7140],
        [366.5860],
        [335.2932],
        [378.6680],
        [353.3776],
        [241.3157],
        [376.2319],
        [221.1701],
        [195.5168],
        [231.3532],
        [365.8459],
        [196.9879],
        [459.1738],
        [657.3147],
        [297.6126],
        [205.4578],
        [252.1765],
        [298.1562],
        [121.4185],
        [252.2964],
        [257.9235],
        [181.0103],
        [539.9190],
        [196.2313],
        [388.5688],
        [496.0692],
        [263.2419],
        [157.6568],
        [212.3649],
        [292.2908],
        [188.7126],
        [320.1454],
        [152.2356],
        [319.7011],
        [267.1190],
        [ 93.0674],
        [223.9962],
        [315.6412],
        [135.0781],
        [189.2886],
        [375.1378],
        [310.2053],
        [425.9579],
        [317.8258],
        [234.7634],
        [173.2364],
        [359.1135],
        [222.5727],
        [201.9707],


In [18]:
# file_path = "Preprocess/edge_attr_s61_0.pt"
# torch.save(edge_attr, file_path)

In [19]:
# encode stop points
s61_order_0_new['StopId'] = s61_order_0_new['NextStopPointName'].map(encoded_stops_map)

# add trip id
trip_id_df = s61_order_0_new.copy()
trip_id_df['TripId'] = 0
trip_id = 0

for i in range(1, len(trip_id_df)):
    if trip_id_df.loc[i, 'VehicleRef'] == trip_id_df.loc[i-1, 'VehicleRef']:
        if trip_id_df.loc[i, 'StopId'] < trip_id_df.loc[i-1, 'StopId']:
            trip_id += 1
    else:
        trip_id += 1
    
    trip_id_df.loc[i, 'TripId'] = trip_id

trip_id_df.loc[0, 'TripId'] = 0    # Adjust the first trip_id

# encode vehicle ref
trip_id_df['VehicleId'] = trip_id_df['VehicleRef'].map(vehicle_ref_map)

# encode line name
line_map = {
    "S61":0,
    "S62":1,
    "S66":2
}
trip_id_df['RouteId'] = trip_id_df['PublishedLineName'].map(line_map)
trip_id_df = trip_id_df.drop(['NextStopPointName','VehicleLocation.Latitude','VehicleLocation.Longitude','VehicleRef','PublishedLineName','DistanceFromStop'],axis=1)

trip_id_df.head(20)

Unnamed: 0,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId
0,2017-06-05 08:50:26,0,14,0,3210,0
1,2017-06-05 11:21:04,0,53,0,3210,0
2,2017-06-05 12:40:50,0,11,1,3210,0
3,2017-06-05 14:51:14,0,9,2,3210,0
4,2017-06-05 14:51:14,0,9,2,3210,0
5,2017-06-05 15:30:30,0,46,2,3210,0
6,2017-06-05 15:30:30,0,46,2,3210,0
7,2017-06-05 17:01:31,0,3,3,3210,0
8,2017-06-05 17:41:15,0,44,3,3210,0
9,2017-06-03 06:37:00,0,27,4,3155,0


In [20]:
# add all the stops to dataset (for each tour)
import pandas as pd
import numpy as np

all_stops = list(range(55))

def add_missing_stops(df, all_stops):
    # Sort by TripId and RecordedAtTime
    df['RecordedAtTime'] = pd.to_datetime(df['RecordedAtTime'])
    df = df.sort_values(by=['TripId', 'RecordedAtTime'])
    rows_to_add = []

    for trip_id, group in df.groupby('TripId'):
        existing_stops = group['StopId'].tolist()                   # Get the list of stops for the current trip
        missing_stops = list(set(all_stops) - set(existing_stops))  # Find the missing stops for the current trip

        # Add missing stops in order
        for stop_id in sorted(missing_stops):
            new_row = {
                'RecordedAtTime': None, 
                'DirectionRef': 0,
                'StopId': stop_id,
                'TripId': trip_id,
                'VehicleId': group['VehicleId'].unique()[0],  
                'RouteId': 0,  
                'DistanceFromLastStop':distances_from_last_stop[stop_id],
            }
            rows_to_add.append(new_row)
    
    df_missing_stops = pd.DataFrame(rows_to_add)    # Create a DataFrame for the new rows
    df = pd.concat([df, df_missing_stops], ignore_index=True)   # Append missing stops to the original dataframe
    df = df.sort_values(by=['TripId', 'StopId']).reset_index(drop=True) # Re-sort the dataframe
    
    return df

trip_id_df = trip_id_df.dropna(subset=['StopId'])
df_updated = add_missing_stops(trip_id_df, all_stops)
df_updated.head(55)


Unnamed: 0,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId,DistanceFromLastStop
0,NaT,0,0,0,3210,0,0.0
1,NaT,0,1,0,3210,0,269.82845
2,NaT,0,2,0,3210,0,225.713983
3,NaT,0,3,0,3210,0,366.586003
4,NaT,0,4,0,3210,0,335.293195
5,NaT,0,5,0,3210,0,378.667978
6,NaT,0,6,0,3210,0,353.377635
7,NaT,0,7,0,3210,0,241.315664
8,NaT,0,8,0,3210,0,376.231918
9,NaT,0,9,0,3210,0,221.170122


In [21]:
# drop duplicates
df_updated = df_updated.drop_duplicates()
df_updated = df_updated.loc[~df_updated.drop(columns=["RecordedAtTime"]).duplicated()]
df_updated.shape

(59345, 7)

In [22]:
# fill missing values in DistanceFromLastStop
def fill_missing(row):
    if pd.isna(row['DistanceFromLastStop']) and not pd.isna(row['StopId']):
        stop_id = int(row['StopId'])
        return distances_from_last_stop[stop_id]
    return row['DistanceFromLastStop']

df_updated['DistanceFromLastStop'] = df_updated.apply(fill_missing, axis=1)
df_updated.head(55)

Unnamed: 0,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId,DistanceFromLastStop
0,NaT,0,0,0,3210,0,0.0
1,NaT,0,1,0,3210,0,269.82845
2,NaT,0,2,0,3210,0,225.713983
3,NaT,0,3,0,3210,0,366.586003
4,NaT,0,4,0,3210,0,335.293195
5,NaT,0,5,0,3210,0,378.667978
6,NaT,0,6,0,3210,0,353.377635
7,NaT,0,7,0,3210,0,241.315664
8,NaT,0,8,0,3210,0,376.231918
9,NaT,0,9,0,3210,0,221.170122


In [23]:
# calculate the distance from start
df_feature_added = df_updated.copy().reset_index()

# fill every first record of TravelTime column with 0
df_feature_added['TravelTime'] = None
for idx in range(0, len(df_feature_added), 55):
    df_feature_added.at[idx, 'TravelTime'] = 0        


# calculate the propotion of distnce from start
def calculate_propotion(group):
    group['ProportionFromStart'] = group['DistanceFromStart'] / group['DistanceFromStart'].max()
    return group


distances = df_feature_added['DistanceFromLastStop'].iloc[:55].tolist()
distance_from_start = []
current_sum = 0
for value in distances:
    current_sum += value
    distance_from_start.append(current_sum)

result = []
for i in range(0, len(df_feature_added), 55):
    chunk = df_feature_added.iloc[i:i + 55].copy()  
    chunk['DistanceFromStart'] = distance_from_start
    result.append(chunk)

df_feature_added = pd.concat(result, ignore_index=True)

# df_feature_added = df_feature_added.groupby('TripId', group_keys=False).apply(calculate_travel_time)
df_feature_added_propotion = df_feature_added.groupby('TripId', group_keys=False).apply(calculate_propotion)

df_feature_added_propotion.head(55)

Unnamed: 0,index,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId,DistanceFromLastStop,TravelTime,DistanceFromStart,ProportionFromStart
0,0,NaT,0,0,0,3210,0,0.0,0.0,0.0,0.0
1,1,NaT,0,1,0,3210,0,269.82845,,269.82845,0.018185
2,2,NaT,0,2,0,3210,0,225.713983,,495.542433,0.033397
3,3,NaT,0,3,0,3210,0,366.586003,,862.128436,0.058103
4,4,NaT,0,4,0,3210,0,335.293195,,1197.421631,0.0807
5,5,NaT,0,5,0,3210,0,378.667978,,1576.089608,0.10622
6,6,NaT,0,6,0,3210,0,353.377635,,1929.467243,0.130035
7,7,NaT,0,7,0,3210,0,241.315664,,2170.782907,0.146299
8,8,NaT,0,8,0,3210,0,376.231918,,2547.014825,0.171655
9,9,NaT,0,9,0,3210,0,221.170122,,2768.184947,0.18656


In [24]:
# remove if there is only a single value for RecordedAtTime
df_feature_added_propotion = df_feature_added_propotion.groupby('TripId').filter(
    lambda group: group['RecordedAtTime'].nunique() > 1
)
df_feature_added_propotion.iloc[55:110]

Unnamed: 0,index,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId,DistanceFromLastStop,TravelTime,DistanceFromStart,ProportionFromStart
110,110,NaT,0,0,2,3210,0,0.0,0.0,0.0,0.0
111,111,NaT,0,1,2,3210,0,269.82845,,269.82845,0.018185
112,112,NaT,0,2,2,3210,0,225.713983,,495.542433,0.033397
113,113,NaT,0,3,2,3210,0,366.586003,,862.128436,0.058103
114,114,NaT,0,4,2,3210,0,335.293195,,1197.421631,0.0807
115,115,NaT,0,5,2,3210,0,378.667978,,1576.089608,0.10622
116,116,NaT,0,6,2,3210,0,353.377635,,1929.467243,0.130035
117,117,NaT,0,7,2,3210,0,241.315664,,2170.782907,0.146299
118,118,NaT,0,8,2,3210,0,376.231918,,2547.014825,0.171655
119,119,2017-06-05 14:51:14,0,9,2,3210,0,221.170122,,2768.184947,0.18656


In [25]:
df_feature_added_propotion.head(55)

Unnamed: 0,index,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId,DistanceFromLastStop,TravelTime,DistanceFromStart,ProportionFromStart
0,0,NaT,0,0,0,3210,0,0.0,0.0,0.0,0.0
1,1,NaT,0,1,0,3210,0,269.82845,,269.82845,0.018185
2,2,NaT,0,2,0,3210,0,225.713983,,495.542433,0.033397
3,3,NaT,0,3,0,3210,0,366.586003,,862.128436,0.058103
4,4,NaT,0,4,0,3210,0,335.293195,,1197.421631,0.0807
5,5,NaT,0,5,0,3210,0,378.667978,,1576.089608,0.10622
6,6,NaT,0,6,0,3210,0,353.377635,,1929.467243,0.130035
7,7,NaT,0,7,0,3210,0,241.315664,,2170.782907,0.146299
8,8,NaT,0,8,0,3210,0,376.231918,,2547.014825,0.171655
9,9,NaT,0,9,0,3210,0,221.170122,,2768.184947,0.18656


In [26]:
travl_df = df_feature_added_propotion.copy().drop('index',axis=1)
def fill_travel_time(group):
    group = group.reset_index()
    for i in range(len(group) - 1):
        if pd.notna(group.loc[i, 'RecordedAtTime']) and pd.notna(group.loc[i + 1, 'RecordedAtTime']):
            time_diff = (group.loc[i + 1, 'RecordedAtTime'] - group.loc[i, 'RecordedAtTime']).total_seconds()
            group.loc[i + 1, 'TravelTime'] = time_diff

    for i in range(len(group) - 1):
        start_idx = i
        while start_idx >= 0 and pd.isna(group.loc[start_idx, 'RecordedAtTime']):
            start_idx -= 1
        end_idx = i + 1
        while end_idx < len(group) and pd.isna(group.loc[end_idx, 'RecordedAtTime']):
            end_idx += 1
        
        if start_idx >= 0 and end_idx < len(group):
            total_time = (group.loc[end_idx, 'RecordedAtTime'] - group.loc[start_idx, 'RecordedAtTime']).total_seconds()
            total_distance = group.loc[start_idx + 1:end_idx, 'DistanceFromLastStop'].sum()
            
            # Proportionally distribute the travel time
            cumulative_distance = 0
            for j in range(start_idx + 1, end_idx + 1):
                if pd.isna(group.loc[j, 'TravelTime']):
                    distance = group.loc[j, 'DistanceFromLastStop']
                    group.loc[j, 'TravelTime'] = (distance / total_distance) * total_time

    # Handle records before the first available RecordedAtTime
    first_valid_idx = group[~group['RecordedAtTime'].isna()].index[0]
    first_valid_distance = group.loc[first_valid_idx+1, 'DistanceFromLastStop']
    first_valid_travel_time = group.loc[first_valid_idx+1, 'TravelTime']

    for i in range(first_valid_idx, -1, -1):
        if pd.isna(group.loc[i, 'TravelTime']):
            distance = group.loc[i, 'DistanceFromLastStop']
            group.loc[i, 'TravelTime'] = (distance / first_valid_distance) * first_valid_travel_time
    
    # Handle records after the last available RecordedAtTime
    last_valid_idx = group[~group['RecordedAtTime'].isna()].index[-1]
    last_valid_distance = group.loc[last_valid_idx, 'DistanceFromLastStop']
    last_valid_travel_time = group.loc[last_valid_idx, 'TravelTime']

    for i in range(last_valid_idx + 1, len(group)):
        if pd.isna(group.loc[i, 'RecordedAtTime']):
            distance = group.loc[i, 'DistanceFromLastStop']
            group.loc[i, 'TravelTime'] = (distance / last_valid_distance) * last_valid_travel_time

    return group

travl_df = travl_df.groupby('TripId', group_keys=False).apply(fill_travel_time).reset_index().drop(['level_0','index'],axis=1)

In [27]:
def fill_missing_recorded_at_time(group):
    group = group.reset_index()
    first_valid_idx = group[~group['RecordedAtTime'].isna()].index[0]
    first_valid_time = group.loc[first_valid_idx, 'RecordedAtTime']
    
    date = first_valid_time
    for i in range(first_valid_idx - 1, -1, -1):
        if pd.isna(group.loc[i, 'RecordedAtTime']):
            group.loc[i, 'RecordedAtTime'] = date - pd.to_timedelta(group.loc[i+1, 'TravelTime'], unit='s')
            date = group.loc[i, 'RecordedAtTime']
        else:
            date = group.loc[i, 'RecordedAtTime']
    
    date = first_valid_time
    for i in range(first_valid_idx + 1, len(group)):
        if pd.isna(group.loc[i, 'RecordedAtTime']):
            group.loc[i, 'RecordedAtTime'] = date + pd.to_timedelta(group.loc[i, 'TravelTime'], unit='s')
            date = group.loc[i, 'RecordedAtTime']
        else:
            date = group.loc[i, 'RecordedAtTime']
    
    return group


travl_df = travl_df.groupby('TripId', group_keys=False).apply(fill_missing_recorded_at_time).reset_index().drop(['level_0','index'],axis=1)
travl_df

Unnamed: 0,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId,DistanceFromLastStop,TravelTime,DistanceFromStart,ProportionFromStart
0,2017-06-05 07:50:37.448295315,0,0,0,3210,0,0.000000,0,0.000000,0.000000
1,2017-06-05 07:54:27.061523656,0,1,0,3210,0,269.828450,229.613228,269.828450,0.018185
2,2017-06-05 07:57:39.135103452,0,2,0,3210,0,225.713983,192.07358,495.542433,0.033397
3,2017-06-05 08:02:51.085143378,0,3,0,3210,0,366.586003,311.95004,862.128436,0.058103
4,2017-06-05 08:07:36.406253885,0,4,0,3210,0,335.293195,285.321111,1197.421631,0.080700
...,...,...,...,...,...,...,...,...,...,...
49055,2017-06-25 20:23:38.000000000,0,50,1078,343,0,201.970692,33.936743,14112.540456,0.951107
49056,2017-06-25 20:24:09.110956695,0,51,1078,343,0,185.153346,31.110957,14297.693803,0.963586
49057,2017-06-25 20:24:50.468402132,0,52,1078,343,0,246.134167,41.357445,14543.827970,0.980174
49058,2017-06-25 20:25:39.898865990,0,53,1078,343,0,294.179825,49.430464,14838.007795,1.000000


In [28]:
# remove wrongly assigned trips 
df_filtered = travl_df.copy()
trip_total_time = (
    df_filtered.groupby('TripId')['TravelTime']
    .sum()
    .reset_index()
    .rename(columns={'TravelTime': 'TotalTravelTime'})
)

invalid_trip_ids = trip_total_time[trip_total_time['TotalTravelTime'] > 7200]['TripId'].tolist()
df_filtered = df_filtered[~df_filtered['TripId'].isin(invalid_trip_ids)]

print(f"Removed TripIds: {invalid_trip_ids}")
display(df_filtered)


Removed TripIds: [0, 4, 6, 12, 13, 18, 24, 29, 36, 40, 53, 56, 57, 59, 60, 65, 68, 78, 84, 90, 99, 102, 107, 109, 110, 118, 125, 142, 143, 145, 150, 158, 163, 168, 174, 180, 181, 187, 188, 193, 203, 205, 209, 216, 217, 222, 229, 233, 234, 243, 247, 259, 271, 274, 277, 278, 279, 284, 299, 301, 303, 307, 321, 328, 329, 330, 336, 339, 355, 356, 362, 368, 379, 386, 387, 388, 400, 401, 402, 404, 406, 408, 418, 422, 424, 426, 429, 436, 442, 449, 461, 462, 465, 477, 478, 490, 499, 504, 505, 507, 514, 517, 521, 523, 525, 528, 535, 536, 540, 543, 546, 551, 554, 561, 564, 568, 576, 583, 585, 598, 599, 605, 609, 611, 620, 622, 623, 626, 627, 634, 637, 642, 643, 647, 655, 657, 658, 662, 663, 669, 671, 675, 678, 689, 711, 713, 723, 737, 740, 744, 758, 769, 773, 774, 785, 787, 789, 794, 805, 806, 807, 821, 829, 831, 835, 838, 844, 850, 851, 858, 859, 870, 872, 876, 877, 879, 887, 893, 909, 921, 927, 939, 945, 947, 949, 950, 953, 955, 960, 964, 965, 966, 988, 999, 1008, 1011, 1012, 1029, 1042, 1044, 

Unnamed: 0,RecordedAtTime,DirectionRef,StopId,TripId,VehicleId,RouteId,DistanceFromLastStop,TravelTime,DistanceFromStart,ProportionFromStart
55,2017-06-05 14:40:46.142730084,0,0,2,3210,0,0.000000,0,0.000000,0.000000
56,2017-06-05 14:41:47.343031733,0,1,2,3210,0,269.828450,61.200302,269.828450,0.018185
57,2017-06-05 14:42:38.537647597,0,2,2,3210,0,225.713983,51.194616,495.542433,0.033397
58,2017-06-05 14:44:01.683713182,0,3,2,3210,0,366.586003,83.146066,862.128436,0.058103
59,2017-06-05 14:45:17.732197653,0,4,2,3210,0,335.293195,76.048484,1197.421631,0.080700
...,...,...,...,...,...,...,...,...,...,...
49055,2017-06-25 20:23:38.000000000,0,50,1078,343,0,201.970692,33.936743,14112.540456,0.951107
49056,2017-06-25 20:24:09.110956695,0,51,1078,343,0,185.153346,31.110957,14297.693803,0.963586
49057,2017-06-25 20:24:50.468402132,0,52,1078,343,0,246.134167,41.357445,14543.827970,0.980174
49058,2017-06-25 20:25:39.898865990,0,53,1078,343,0,294.179825,49.430464,14838.007795,1.000000


In [29]:
df_filtered.isna().sum()

RecordedAtTime          0
DirectionRef            0
StopId                  0
TripId                  0
VehicleId               0
RouteId                 0
DistanceFromLastStop    0
TravelTime              0
DistanceFromStart       0
ProportionFromStart     0
dtype: int64

In [30]:
# create node features (DirectionRef, VehicleId, RouteId, DistanceFromStart)
normalized_df = df_filtered.copy()

normalized_df['DistanceFromStart'] = (normalized_df['DistanceFromStart'] - normalized_df['DistanceFromStart'].min()) / \
                          (normalized_df['DistanceFromStart'].max() - normalized_df['DistanceFromStart'].min())

features = normalized_df[['DirectionRef', 'VehicleId', 'RouteId', 'DistanceFromStart']].values    # Combine features

num_nodes = 55
num_snapshots = len(normalized_df) // num_nodes
node_features_snapshots = features.reshape(num_snapshots, num_nodes, -1)

In [31]:
# file_path = "Preprocess/node_features_s61_0.pt"
# torch.save(node_features_snapshots, file_path)

In [32]:
# create target
normalized_df['TravelTime'] = pd.to_numeric(normalized_df['TravelTime'], errors='coerce')
target = normalized_df['TravelTime'].values.reshape(num_snapshots, num_nodes)

In [33]:
# file_path = "Preprocess/target_s61_0.pt"
# torch.save(target, file_path)

In [34]:
# view inputs and attributes
display(edge_index.shape)
display(edge_attr.shape)
display(node_features_snapshots.shape)
display(target.shape)

torch.Size([2, 54])

torch.Size([54, 1])

(684, 55, 4)

(684, 55)

In [35]:
# creating data objects
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

def create_graphs(node_features, edge_index, edge_attr, target):
    graphs = []
    for t in range(node_features_snapshots.shape[0]):
        graph = Data(x=torch.tensor(node_features_snapshots[t]), 
             edge_index=torch.tensor(edge_index), 
             edge_attr=torch.tensor(edge_attr), 
             y=torch.tensor(target[t]))
        graphs.append(graph)
    return graphs

graphs = create_graphs(node_features=node_features_snapshots, edge_index=edge_index, edge_attr=edge_attr, target=target)
dataloader = DataLoader(graphs, batch_size=1, shuffle=False)

  edge_index=torch.tensor(edge_index),
  edge_attr=torch.tensor(edge_attr),


In [36]:
# create a json dataset


In [37]:
# train , test , val split
from torch.utils.data import Subset

dataset_size = len(graphs)

train_size = int(0.7 * dataset_size)  # 70% training
val_size = int(0.15 * dataset_size)   # 15% validation
test_size = dataset_size - train_size - val_size  # Remaining 15% for testing

# Create indices for each split
train_indices = range(0, train_size)
val_indices = range(train_size, train_size + val_size)
test_indices = range(train_size + val_size, dataset_size)

train_dataset = Subset(graphs, train_indices)
val_dataset = Subset(graphs, val_indices)
test_dataset = Subset(graphs, test_indices)


batch_size = 12

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [38]:
from torch_geometric_temporal.dataset import WikiMathsDatasetLoader
from torch_geometric_temporal.signal import temporal_signal_split

import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import GConvGRU
from tqdm import tqdm

loader = WikiMathsDatasetLoader()
dataset = loader.get_dataset(lags=14)
train_dataset, test_dataset = temporal_signal_split(dataset, train_ratio=0.5)

class RecurrentGCN(torch.nn.Module):
    def __init__(self, node_features, filters):
        super(RecurrentGCN, self).__init__()
        self.recurrent = GConvGRU(node_features, filters, 2)
        self.linear = torch.nn.Linear(filters, 1)

    def forward(self, x, edge_index, edge_weight):
        h = self.recurrent(x, edge_index, edge_weight)
        h = F.relu(h)
        h = self.linear(h)
        return h

model = RecurrentGCN(node_features=14, filters=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
model.train()

for epoch in tqdm(range(50)):
    for time, snapshot in enumerate(train_dataset):
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        cost = torch.mean((y_hat-snapshot.y)**2)
        cost.backward()
        optimizer.step()
        optimizer.zero_grad()

model.eval()
cost = 0
for time, snapshot in enumerate(test_dataset):
    y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
    cost = cost + torch.mean((y_hat-snapshot.y)**2)
cost = cost / (time+1)
cost = cost.item()
print("MSE: {:.4f}".format(cost))

100%|██████████| 50/50 [10:06<00:00, 12.12s/it]


MSE: 0.7885


In [44]:
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv

class GCNLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(GCNLayer, self).__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index, edge_attr=None):
        # x is the node feature matrix, edge_index is the graph connectivity, and edge_attr is the edge features
        return self.conv(x, edge_index, edge_attr=edge_attr)  # Include edge_attr in the convolution

class MTGNN_GRU(nn.Module):
    def __init__(self, num_nodes, node_feature_dim, gcn_out_channels, gru_hidden_channels, gru_out_channels, seq_length, dropout=0.3):
        super(MTGNN_GRU, self).__init__()

        # Spatial Component (GCN Layers)
        self.gcn1 = GCNLayer(node_feature_dim, gcn_out_channels)
        self.gcn2 = GCNLayer(gcn_out_channels, gcn_out_channels)
        
        # Temporal Component (GRU Layer)
        self.gru = nn.GRU(input_size=gcn_out_channels, hidden_size=gru_hidden_channels, num_layers=1, batch_first=True)

        # Fully connected layer for output prediction
        self.fc = nn.Linear(gru_hidden_channels, gru_out_channels)

        self.seq_length = seq_length
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index, edge_attr=None):
        # x: (batch_size, seq_length, num_nodes, node_feature_dim)
        # edge_index: (2, num_edges)
        # edge_attr: (num_edges, feature_dim) -- if you have edge features

        # Process the spatial information through GCN
        batch_size, seq_length, num_nodes, node_feature_dim = x.size()

        # Flatten the input to match GCN input (batch_size * seq_length, num_nodes, node_feature_dim)
        x = x.view(batch_size * seq_length, num_nodes, node_feature_dim)
        
        # First GCN layer
        x = self.gcn1(x, edge_index, edge_attr)
        x = self.dropout(x)
        
        # Second GCN layer
        x = self.gcn2(x, edge_index, edge_attr)
        x = self.dropout(x)

        # Pass the output through GRU
        x = x.view(batch_size, seq_length, num_nodes, -1)  # Reshape for GRU input
        x = x.mean(dim=2)  # Aggregate node features (mean across nodes)
        
        # GRU Layer
        out, _ = self.gru(x)

        # Fully connected layer for output prediction (next step travel time)
        out = self.fc(out[:, -1, :])  # Use the last timestep output

        return out  # (batch_size, num_nodes)



In [45]:
# Example with edge features
num_nodes = 10
num_edges = 20  # Assume we have 20 edges
node_feature_dim = 5
edge_feature_dim = 3  # Edge features could be travel time, distance, etc.
seq_length = 12
batch_size = 2

# Node features (batch_size, seq_length, num_nodes, node_feature_dim)
node_features = torch.randn(batch_size, seq_length, num_nodes, node_feature_dim)

# Edge index (2, num_edges)
edge_index = torch.tensor([[0, 1, 2, 3, 4],
                           [1, 2, 3, 4, 0]], dtype=torch.long)  # Example edge indices

# Edge features (num_edges, edge_feature_dim)
edge_features = torch.randn(num_edges, edge_feature_dim)

# Target (batch_size, num_nodes) - Travel time predictions
target = torch.randn(batch_size, num_nodes)

# Initialize model
model = MTGNN_GRU(
    num_nodes=num_nodes,
    node_feature_dim=node_feature_dim,
    gcn_out_channels=64,
    gru_hidden_channels=128,
    gru_out_channels=1,
    seq_length=seq_length,
    dropout=0.3
)

# Forward pass (prediction)
predictions = model(node_features, edge_index, edge_attr=edge_features)

print(f"Predictions shape: {predictions.shape}")


TypeError: GCNConv.forward() got an unexpected keyword argument 'edge_attr'

In [None]:
from torch_geometric.nn import Edge

In [39]:
import torch.optim as optim

in_channels = node_features_snapshots.shape[2] 
hidden_channels = 64
out_channels = target.shape[2] if target.ndim == 3 else 1 
num_nodes = node_features_snapshots.shape[1] 
edge_attr_dim = edge_attr.shape[1]

model = GCNGRUModel(in_channels, hidden_channels, out_channels, num_nodes, edge_attr_dim)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

h = torch.zeros(1, 1, hidden_channels, device='cuda' if torch.cuda.is_available() else 'cpu')

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for data in train_loader:
        # Move data to device
        data = data.to('cuda' if torch.cuda.is_available() else 'cpu')
        x = data.x.view(1, data.num_nodes, -1)  
        edge_index = data.edge_index  
        edge_attr = data.edge_attr.view(1, -1, edge_attr.shape[-1])
        y = data.y.view(1, data.num_nodes, -1)

        # Forward pass
        optimizer.zero_grad()
        out, h = model(x.float(), edge_index, edge_attr.float(), h.detach())  # Detach h to avoid backprop through time

        # Compute loss
        loss = criterion(out.float().squeeze(), y.float().squeeze())          # Remove batch dimension for loss calculation
        loss.backward()

        # Update parameters
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}")

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        h = torch.zeros(1, 1, hidden_channels, device='cuda' if torch.cuda.is_available() else 'cpu')
        for data in val_loader:
            # Move data to device
            data = data.to('cuda' if torch.cuda.is_available() else 'cpu')
            x = data.x.view(1, data.num_nodes, -1)
            edge_index = data.edge_index
            edge_attr = data.edge_attr.view(1, -1, edge_attr.shape[-1])
            y = data.y.view(1, data.num_nodes, -1)

            # Forward pass
            out, h = model(x.float(), edge_index, edge_attr.float(), h)
            loss = criterion(out.float().squeeze(), y.float().squeeze())
            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss / len(val_loader):.4f}")

    patience = 5
    best_val_loss = float('inf')
    no_improve_epochs = 0

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve_epochs = 0
        torch.save(model.state_dict(), "best_model.pth")  # Save the best model
    else:
        no_improve_epochs += 1

    if no_improve_epochs >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs.")
        break

print("Training complete!")


NameError: name 'GCNGRUModel' is not defined

In [None]:
# Test Phase
from torchmetrics.functional import mean_absolute_error, mean_squared_error

model.eval()
test_loss_mse = 0
test_loss_mae = 0
test_loss_mape = 0
n_samples = 0

with torch.no_grad():
    h = torch.zeros(1, 1, hidden_channels, device='cuda' if torch.cuda.is_available() else 'cpu')
    for data in test_loader:
        data = data.to('cuda' if torch.cuda.is_available() else 'cpu')
        x = data.x.view(1, data.num_nodes, -1)
        edge_index = data.edge_index
        edge_attr = data.edge_attr.view(1, -1, edge_attr.shape[-1])
        y = data.y.view(1, data.num_nodes, -1)

        # Forward pass
        out, h = model(x.float(), edge_index, edge_attr.float(), h)

        # Convert to appropriate shape
        y_pred = out.float().squeeze()
        y_true = y.float().squeeze()

        # MSE
        test_loss_mse += mean_squared_error(y_pred, y_true).item()

        # MAE
        test_loss_mae += mean_absolute_error(y_pred, y_true).item()

        # MAPE
        # Avoid division by zero by filtering out small true values
        valid_mask = y_true != 0
        mape = (torch.abs((y_pred[valid_mask] - y_true[valid_mask]) / y_true[valid_mask])).mean() * 100
        test_loss_mape += mape.item()

        n_samples += 1

# Average across all batches
test_loss_mse /= n_samples
test_loss_mae /= n_samples
test_loss_mape /= n_samples

print(f"Test MSE: {test_loss_mse:.4f}")
print(f"Test MAE: {test_loss_mae:.4f}")
print(f"Test MAPE: {test_loss_mape:.2f}%")


Test MSE: 845.3625
Test MAE: 20.8357
Test MAPE: 134.44%


In [43]:
# import torch
# torch.cuda.is_available()
!nvidia-smi

Sat Feb  8 11:17:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   45C    P3              8W /   50W |     188MiB /   6141MiB |     21%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
valid_mask

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True])

In [None]:
y_pred

tensor([773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844,
        773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844,
        773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844,
        773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844,
        773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844,
        773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844,
        773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844,
        773.1844, 773.1844, 773.1844, 773.1844, 773.1844, 773.1844])

In [None]:
y_true

tensor([ 714.6747,  714.6868,  714.6968,  714.7133,  714.7282,  714.7451,
         714.7609,  714.7717,  714.7885,  714.7984,  714.8071,  714.8174,
         714.8338,  714.8426,  714.8632,  714.8925,  714.9058,  714.9150,
         714.9263,  714.9396,  714.9450,  714.9562,  714.9678,  714.9759,
         715.0000, 1147.5242, 1147.5520, 1147.5876, 1147.6064, 1147.6178,
        1147.6331, 1147.6539, 1147.6675, 1147.6904, 1147.7014, 1147.7242,
        1147.7434, 1147.7501, 1147.7662, 1147.7888, 1147.7985, 1147.8120,
        1147.8390, 1147.8612, 1147.8917, 1147.9146, 1147.9314, 1147.9438,
        1147.9696, 1147.9855, 1148.0000, 1148.0133, 1148.0309, 1148.0520,
        1148.0520])

#### functions for model training

In [None]:
# create edge indexes, node features, edge attributes, targets
def preprocess(dataframe):
    return None

In [None]:
# create data loaders
def create_dataset():

    return train_dataloader, test_dataloader, validation_dataloader

In [None]:
# train the model
def train():
    return model

In [None]:
# evaluate the model
def evaluate():
    return