# GNN Training

## Data Preparation

In [1]:
import os
import pandas as pd

import torch
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader
from torch_geometric.nn import to_hetero

from GNN_model.GCN_model import GCN
from GNN_model.trainer import Trainer

from sklearn.model_selection import train_test_split

In [2]:
def load_data(data_dir):
    data = {}
    for file in os.scandir(data_dir):
        if not file.name.endswith('.csv'):
            continue
        with open(file, 'r') as f:
            file_name = file.name[:file.name.find('.')]
            data[file_name] = pd.read_csv(f)
    return data

In [3]:
def split_data_to_separate_graphs(data):
    max_timestep = data['req_features']['timestep'].max()
    split_data = [HeteroData() for _ in range(max_timestep+1)]
    for name, add_feats_fn in zip(['req_features', 'veh_features', 'rr_graph', 'vr_graph'], [add_r_features, add_v_features, add_rr_edge_data, add_vr_edge_data]):
        grouped = data[name].groupby('timestep')
        for timestep, rows in grouped:
            add_feats_fn(rows, split_data[timestep])
    return split_data

def add_r_features(rows, hdata):
    new_rows = pd.get_dummies(columns=['status'], data=rows, dtype=int)
    hdata['request'].x = torch.tensor(new_rows.values)

def add_v_features(rows, hdata):
    new_rows = pd.get_dummies(columns=['type'], data=rows, dtype=int)
    hdata['vehicle'].x = torch.tensor(new_rows.values)

def add_rr_edge_data(rows, hdata):
    hdata['request', 'connects', 'request'].edge_index = [torch.tensor(rows['source'].values), torch.tensor(rows['target'].values)]
    hdata['request', 'connects', 'request'].y = torch.tensor(rows['label'].values)
    hdata['request', 'connects', 'request'].edge_attr = torch.tensor(rows.drop(columns=['source', 'target', 'label']).values)

def add_vr_edge_data(rows, hdata):
    hdata['vehicle', 'connects', 'request'].edge_index = [torch.tensor(rows['source'].values), torch.tensor(rows['target'].values)]
    hdata['vehicle', 'connects', 'request'].y = torch.tensor(rows['label'].values)
    hdata['vehicle', 'connects', 'request'].edge_attr = torch.tensor(rows.drop(columns=['source', 'target', 'label']).values)


In [4]:
base_data_dir = 'data/example_pool_sc_1/'
data = load_data(f'{base_data_dir}processed/')
data = split_data_to_separate_graphs(data)
# todo normalize features

In [5]:
# TODO split data into training and testing
def get_tensor(size, flag):
    if flag:
        return torch.ones(size, dtype=torch.bool)
    else:
        return torch.zeros(size, dtype=torch.bool)

indices = list(range(len(data)))
train_idx, test_idx = train_test_split(indices, test_size=.2)
train_idx, val_idx = train_test_split(train_idx, test_size=.1)
for i in range(len(data)):
    size = len(data[i].edge_index_dict)
    data[i].train_mask = get_tensor(size, i in train_idx)
    data[i].val_mask = get_tensor(size, i in val_idx)
    data[i].test_mask = get_tensor(size, i in test_idx)

KeyError: "Tried to collect 'edge_index' but did not find any occurrences of it in any node and/or edge type"

In [17]:
loader = DataLoader(data, batch_size=32, shuffle=True)

In [18]:
torch.save(data, f'{base_data_dir}processed/graph_data.pt')

## Model

In [19]:
num_classes = 2
hidden_channels = 4
epochs = 200

In [20]:
# TODO device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
model = GCN(hidden_channels=hidden_channels, out_channels=num_classes)  #.to(device=device)

In [32]:
model = to_hetero(model, data[10].metadata(), aggr='sum')



ValueError: 'add_self_loops' attribute set to 'True' on module 'GCNConv(-1, 4)' for use with edge type(s) '[('request', 'connects', 'request'), ('vehicle', 'connects', 'request')]'. This will lead to incorrect message passing results.

In [None]:
criterion = torch.nn.BCELoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Define optimizer.

In [None]:
with torch.no_grad():  # Initialize lazy modules.
    out = model(data.x_dict, data.edge_index_dict)

In [None]:
trainer = Trainer(f'{base_data_dir}/models/')
trainer.train(model, data, criterion, optimizer)

In [None]:
# TODO hyperparameter tuning
# TODO visualize results