# Initialisation

In [1]:
import json
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
import sys
import os

py_file_location = '../'
home_directory = '../'

sys.path.append(os.path.abspath(py_file_location))
from model.model_class.environment import *

from model.model_class import GNN

SEED = 42

# Data Preparation

In [3]:
gnn_train_data = pd.read_parquet('../data/curated/ML_data/gnn_train_data.parquet')
gnn_val_data = pd.read_parquet('../data/curated/ML_data/gnn_val_data.parquet')
gnn_test_data = pd.read_parquet('../data/curated/ML_data/gnn_test_data.parquet')

SA2_gnn_data = pd.read_parquet('../data/curated/ML_data/SA2_gnn_data.parquet')
SA2_gnn_data['Station Name'] = SA2_gnn_data['Station Name'].apply(lambda x: '(SA2)'+x)
station_inference_gnn_data = pd.read_parquet('../data/curated/ML_data/station_inference_gnn_data.parquet')

station_inference_gnn_data = station_inference_gnn_data.rename({'Station_Name': 'Station Name'}, axis=1)
inference_data = pd.concat([SA2_gnn_data, station_inference_gnn_data], axis=0)
inference_data.drop(columns=['Station_Na'], axis=1, inplace=True)
inference_data = inference_data.rename({'Station Name': 'Station_Name'}, axis=1)

In [4]:
# open npy
station_weights_matrix = np.load('../data/curated/ML_data/station_weights_matrix.npy')
SA2_weights_matrix = np.load('../data/curated/ML_data/station_weights_withSA2_matrix.npy')

with open('../data/curated/ML_features/station_weights_withSA2.json', 'r') as f:
    station_weights_withSA2 = json.load(f)

with open('../data/curated/ML_features/station_weights.json', 'r') as f:
    station_weights = json.load(f)

In [5]:
# edit SA2_weights_matrix matrix: columns after 223 masked out to 0
SA2_weights_matrix[:, 223:] = 0

In [6]:
geospatial_features = ['log_Total_Demand']
non_geospatial_features = ['Weekday', 'mean_rainfall_value', 'has_school',
       'has_sport_facility', 'has_shopping_centre', 'has_hospital',
       'total_population', ' med_rent_weekly_c2021',
       ' med_mortg_rep_mon_c2021', ' med_person_inc_we_c2021',
       ' med_famly_inc_we_c2021']
label_columns = ['log_Total_Demand']

In [7]:
def DataFactory(raw_dataset, geospatial_features, non_geospatial_features, label_columns, stations_index, inference = False):

    """ Data Factory of GNN """
    
    geospatial_x_batches = []
    non_geospatial_x_batches = []
    y_batches = []
    masks = []

    if inference:
        groupby_column = 'Weekday'
    else:
        groupby_column = 'Business_Date'

    for day, daily_df in tqdm(raw_dataset.groupby(groupby_column)):

        geospatial_x = np.zeros([len(stations_index), len(geospatial_features)])
        y = np.zeros([len(stations_index), len(label_columns)])
        mask = np.zeros([len(stations_index), 1])
        non_geospatial_x = np.zeros([len(stations_index), len(non_geospatial_features)])

        daily_df.set_index('Station_Name', inplace=True)

        for station in daily_df.index:

            geospatial_x[stations_index[station]] = daily_df.loc[station][geospatial_features] # todo inference. 
            if not inference:
                y[stations_index[station]] = daily_df.loc[station][label_columns]
            mask[stations_index[station]] = 1
            non_geospatial_x[stations_index[station]] = daily_df.loc[station][non_geospatial_features]
                
        geospatial_x = np.nan_to_num(geospatial_x)
        geospatial_x_batches.append(geospatial_x)
        y_batches.append(y)
        masks.append(mask.flatten())

        non_geospatial_x_batches.append(non_geospatial_x)

        
    return geospatial_x_batches, non_geospatial_x_batches, y_batches, masks

In [7]:
train_geospatial_X_batches, train_non_geospatial_X_batches, train_y_batches, train_masks = DataFactory(gnn_train_data, geospatial_features, non_geospatial_features, label_columns, station_weights)
val_geospatial_X_batches, val_non_geospatial_X_batches, val_y_batches, val_masks = DataFactory(gnn_val_data, geospatial_features, non_geospatial_features, label_columns, station_weights)
test_geospatial_X_batches, test_non_geospatial_X_batches, test_y_batches, test_masks = DataFactory(gnn_test_data, geospatial_features, non_geospatial_features, label_columns, station_weights)

  0%|          | 0/382 [00:00<?, ?it/s]

100%|██████████| 382/382 [01:11<00:00,  5.34it/s]
100%|██████████| 82/82 [00:15<00:00,  5.42it/s]
100%|██████████| 82/82 [00:14<00:00,  5.59it/s]


In [8]:
all_inference_geospatial_X_batches, all_inference_non_geospatial_X_batches, all_inference_y_batches, all_inference_masks = DataFactory(inference_data, geospatial_features, non_geospatial_features, label_columns, station_weights_withSA2, inference = True)

100%|██████████| 2/2 [00:00<00:00,  2.94it/s]


# Training

In [10]:
class GNN_config:
    # ----------------- architectual hyperparameters ----------------- #
    d_model = 256
    n_heads = 8
    dropout = 0.1
    n_gnn_layers = 2
    activation = nn.ReLU()
    res_learning = False
    bottleneck = True
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    epochs = 32
    lr = 1e-3
    patience = 5
    loss = nn.MSELoss()
    validation_loss = nn.MSELoss()
    alpha = 0.1
    scheduler = True
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    spatial_input_dim = 1
    nonspatial_input_dim = 11
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory
    name = f'AGNN_2layer'

model2 = GNN(GNN_config) # initialise the model

# train the model (all cells except this one will print training log and evaluation at each batch)
best_epoch = model2.fit(train_geospatial_X_batches, train_non_geospatial_X_batches, train_y_batches, train_masks, val_geospatial_X_batches, val_non_geospatial_X_batches, val_y_batches, val_masks, station_weights_matrix)
print('\n\n')

# as model automatically saves best epoch, will now load the best epoch and evaluate on test set
model2.load()
model2.eval(val_geospatial_X_batches, val_non_geospatial_X_batches, val_y_batches, val_masks, station_weights_matrix, best_epoch, evaluation_mode = True)
model2.eval(test_geospatial_X_batches, test_non_geospatial_X_batches, test_y_batches, test_masks, station_weights_matrix, best_epoch, evaluation_mode = True)

  0%|          | 0/382 [00:00<?, ?it/s]

100%|██████████| 382/382 [00:23<00:00, 16.44it/s]


 Epoch 1 Train | Loss:  0.1195 | R2:  0.8701| MSE:  0.1191 | RMSE:  0.3452 | MAE:  0.2217 


100%|██████████| 82/82 [00:01<00:00, 63.24it/s]


Epoch 1 Val | Loss:  0.0248 | R2:  0.9751| MSE:  0.0248 | RMSE:  0.1576 | MAE:  0.1200 


100%|██████████| 382/382 [00:27<00:00, 14.03it/s]


 Epoch 2 Train | Loss:  0.0350 | R2:  0.9617| MSE:  0.0348 | RMSE:  0.1866 | MAE:  0.1311 


100%|██████████| 82/82 [00:01<00:00, 52.87it/s]


Epoch 2 Val | Loss:  0.0204 | R2:  0.9796| MSE:  0.0204 | RMSE:  0.1428 | MAE:  0.0976 


100%|██████████| 382/382 [00:27<00:00, 13.79it/s]


 Epoch 3 Train | Loss:  0.0253 | R2:  0.9722| MSE:  0.0254 | RMSE:  0.1593 | MAE:  0.1069 


100%|██████████| 82/82 [00:02<00:00, 40.03it/s]


Epoch 3 Val | Loss:  0.0078 | R2:  0.9922| MSE:  0.0078 | RMSE:  0.0884 | MAE:  0.0666 


100%|██████████| 382/382 [00:27<00:00, 14.13it/s]


 Epoch 4 Train | Loss:  0.0211 | R2:  0.9768| MSE:  0.0211 | RMSE:  0.1453 | MAE:  0.0951 


100%|██████████| 82/82 [00:01<00:00, 50.27it/s]


Epoch 4 Val | Loss:  0.0182 | R2:  0.9818| MSE:  0.0182 | RMSE:  0.1348 | MAE:  0.1128 


100%|██████████| 382/382 [00:27<00:00, 13.82it/s]


 Epoch 5 Train | Loss:  0.0191 | R2:  0.9788| MSE:  0.0192 | RMSE:  0.1387 | MAE:  0.0897 


100%|██████████| 82/82 [00:01<00:00, 55.20it/s]


Epoch 5 Val | Loss:  0.0077 | R2:  0.9923| MSE:  0.0077 | RMSE:  0.0875 | MAE:  0.0636 


100%|██████████| 382/382 [00:25<00:00, 15.11it/s]


 Epoch 6 Train | Loss:  0.0180 | R2:  0.9801| MSE:  0.0180 | RMSE:  0.1343 | MAE:  0.0859 


100%|██████████| 82/82 [00:01<00:00, 49.40it/s]


Epoch 6 Val | Loss:  0.0072 | R2:  0.9928| MSE:  0.0072 | RMSE:  0.0848 | MAE:  0.0589 


100%|██████████| 382/382 [00:25<00:00, 15.00it/s]


 Epoch 7 Train | Loss:  0.0241 | R2:  0.9733| MSE:  0.0242 | RMSE:  0.1554 | MAE:  0.1012 


100%|██████████| 82/82 [00:01<00:00, 55.66it/s]


Epoch 7 Val | Loss:  0.0091 | R2:  0.9909| MSE:  0.0091 | RMSE:  0.0954 | MAE:  0.0670 


100%|██████████| 382/382 [00:25<00:00, 14.96it/s]


 Epoch 8 Train | Loss:  0.0181 | R2:  0.9801| MSE:  0.0182 | RMSE:  0.1348 | MAE:  0.0870 


100%|██████████| 82/82 [00:01<00:00, 44.77it/s]


Epoch 8 Val | Loss:  0.0109 | R2:  0.9890| MSE:  0.0109 | RMSE:  0.1045 | MAE:  0.0707 


100%|██████████| 382/382 [00:27<00:00, 14.12it/s]


 Epoch 9 Train | Loss:  0.0173 | R2:  0.9809| MSE:  0.0174 | RMSE:  0.1318 | MAE:  0.0841 


100%|██████████| 82/82 [00:01<00:00, 52.93it/s]


Epoch 9 Val | Loss:  0.0094 | R2:  0.9906| MSE:  0.0094 | RMSE:  0.0967 | MAE:  0.0719 


100%|██████████| 382/382 [00:24<00:00, 15.71it/s]


 Epoch 10 Train | Loss:  0.0156 | R2:  0.9829| MSE:  0.0156 | RMSE:  0.1250 | MAE:  0.0767 


100%|██████████| 82/82 [00:01<00:00, 57.44it/s]


Epoch 10 Val | Loss:  0.0071 | R2:  0.9929| MSE:  0.0071 | RMSE:  0.0841 | MAE:  0.0635 


100%|██████████| 382/382 [00:26<00:00, 14.59it/s]


 Epoch 11 Train | Loss:  0.0142 | R2:  0.9843| MSE:  0.0143 | RMSE:  0.1194 | MAE:  0.0741 


100%|██████████| 82/82 [00:01<00:00, 51.40it/s]


Epoch 11 Val | Loss:  0.0051 | R2:  0.9948| MSE:  0.0051 | RMSE:  0.0717 | MAE:  0.0536 


100%|██████████| 382/382 [00:24<00:00, 15.57it/s]


 Epoch 12 Train | Loss:  0.0141 | R2:  0.9845| MSE:  0.0142 | RMSE:  0.1190 | MAE:  0.0726 


100%|██████████| 82/82 [00:01<00:00, 56.94it/s]


Epoch 12 Val | Loss:  0.0047 | R2:  0.9952| MSE:  0.0047 | RMSE:  0.0689 | MAE:  0.0515 


100%|██████████| 382/382 [00:25<00:00, 15.16it/s]


 Epoch 13 Train | Loss:  0.0139 | R2:  0.9846| MSE:  0.0140 | RMSE:  0.1183 | MAE:  0.0716 


100%|██████████| 82/82 [00:01<00:00, 54.81it/s]


Epoch 13 Val | Loss:  0.0044 | R2:  0.9956| MSE:  0.0044 | RMSE:  0.0663 | MAE:  0.0538 


100%|██████████| 382/382 [00:24<00:00, 15.53it/s]


 Epoch 14 Train | Loss:  0.0144 | R2:  0.9842| MSE:  0.0145 | RMSE:  0.1202 | MAE:  0.0739 


100%|██████████| 82/82 [00:01<00:00, 50.30it/s]


Epoch 14 Val | Loss:  0.0057 | R2:  0.9943| MSE:  0.0057 | RMSE:  0.0753 | MAE:  0.0541 





100%|██████████| 82/82 [00:01<00:00, 53.20it/s]


Epoch 13 Val | Loss:  0.0044 | R2:  0.9956| MSE:  0.0044 | RMSE:  0.0663 | MAE:  0.0538 


100%|██████████| 82/82 [00:01<00:00, 56.99it/s]


Epoch 13 Val | Loss:  0.0043 | R2:  0.9952| MSE:  0.0043 | RMSE:  0.0659 | MAE:  0.0533 


# Inference

In [171]:
os.makedirs('../output', exist_ok=True)

# read in y scale
with open('../data/curated/ML_data/y_scaler_gnn.pickle', 'rb') as f:
    y_scaler_gnn = pickle.load(f)

In [281]:
all_predictions = model2.predict(all_inference_geospatial_X_batches, all_inference_non_geospatial_X_batches, all_inference_masks, SA2_weights_matrix)
all_predictions = np.array(all_predictions).flatten()

all_predictions_df = pd.DataFrame({'Station Name': list(station_weights_withSA2.keys()) * 2,
              'Predicted_Log_Total_Demand': all_predictions,
              'Weekday': [0 for _ in range(len(station_weights_withSA2))] + [1 for _ in range(len(station_weights_withSA2))]})
all_predictions_df['Unscaled_Predicted_Log_Total_Demand'] = y_scaler_gnn.inverse_transform(all_predictions_df['Predicted_Log_Total_Demand'].values.reshape(-1, 1))
all_predictions_df['Unscaled_Predicted_Total_Demand'] = np.exp(all_predictions_df['Unscaled_Predicted_Log_Total_Demand'])
all_predictions_df.to_csv('../output/agnn2_predictions.csv', index=False)

all_predictions_df

100%|██████████| 2/2 [00:00<00:00, 29.04it/s]


Unnamed: 0,Station Name,Predicted_Log_Total_Demand,Weekday,Unscaled_Predicted_Log_Total_Demand,Unscaled_Predicted_Total_Demand
0,Aircraft,2.885838,0,12.258663,210799.561393
1,Alamein,2.880375,0,12.248887,208748.728791
2,Albion,2.902173,0,12.287896,217052.903606
3,Alphington,2.916082,0,12.312788,222523.576722
4,Altona,2.900282,0,12.284512,216319.642428
...,...,...,...,...,...
879,(SA2)Whittlesea,0.019191,1,7.128467,1246.964479
880,(SA2)Wollert,-0.413111,1,6.354812,575.254399
881,(SA2)Wyndham Vale - North,0.147060,1,7.357303,1567.602840
882,(SA2)Wyndham Vale - South,0.052260,1,7.187648,1322.987988


In [286]:
# want 5/7 * weekday = 1 + 2/7 * weekend = 0
weekday_weekend_scaled_predictions = all_predictions_df[['Station Name', 'Weekday', 'Unscaled_Predicted_Total_Demand']].groupby('Station Name').apply(lambda x: 5/7 * x[x['Weekday'] == 1]['Unscaled_Predicted_Total_Demand'].values[0] + 2/7 * x[x['Weekday'] == 0]['Unscaled_Predicted_Total_Demand'].values[0]).reset_index().rename({0: 'Predicted_Total_Demand'}, axis=1).loc[:218]
weekday_weekend_scaled_predictions.to_csv('../output/agnn2_predictions_weekday_weekend_scaled.csv', index=False)

  weekday_weekend_scaled_predictions = all_predictions_df[['Station Name', 'Weekday', 'Unscaled_Predicted_Total_Demand']].groupby('Station Name').apply(lambda x: 5/7 * x[x['Weekday'] == 1]['Unscaled_Predicted_Total_Demand'].values[0] + 2/7 * x[x['Weekday'] == 0]['Unscaled_Predicted_Total_Demand'].values[0]).reset_index().rename({0: 'Predicted_Total_Demand'}, axis=1).loc[:218]


# Feature Analysis

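The last two parameter tensors printed below are a 1×12 weight tensor and a single bias term. The 12 weights correspond to: 'Nearby Train Demand Aggregate', 'Weekday', 'mean_rainfall_value', 'has_school',
       'has_sport_facility', 'has_shopping_centre', 'has_hospital',
       'total_population', ' med_rent_weekly_c2021',
       ' med_mortg_rep_mon_c2021', ' med_person_inc_we_c2021',
       ' med_famly_inc_we_c2021'

(Note: 'PublicHoliday' is not among the 11 non-geospatial features used in this notebook, so it has no weight here.)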

In [6]:
# A_GNN 2 layers
class GNN_config:
    # ----------------- architectual hyperparameters ----------------- #
    d_model = 256
    n_heads = 8
    dropout = 0.1
    n_gnn_layers = 2
    activation = nn.ReLU()
    res_learning = False
    bottleneck = True
    # ----------------- optimisation hyperparameters ----------------- #
    random_state = SEED
    epochs = 32
    lr = 1e-3
    patience = 5
    loss = nn.MSELoss()
    validation_loss = nn.MSELoss()
    alpha = 0.1
    scheduler = True
    grad_clip = False
    # ----------------- operation hyperparameters ----------------- #
    spatial_input_dim = 1
    nonspatial_input_dim = 11
    # ----------------- saving hyperparameters ----------------- #
    rootpath = home_directory
    name = f'AGNN_2layer'

model2 = GNN(GNN_config) # initialise the model

params = list(model2.model.parameters())

for param in params[-2:]:
    print(param)

Parameter containing:
tensor([[ 0.2207,  0.2396, -0.0676,  0.2652, -0.0632,  0.0583, -0.1405,  0.1695,
          0.2545, -0.2118,  0.2509,  0.0540]], requires_grad=True)
Parameter containing:
tensor([0.2133], requires_grad=True)
