# Outcome Prediction with RGAT Model

## 1. Load data from "processed_data" folder

In [1]:
import time
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.nn import RGATConv
from torch_geometric.utils import to_undirected

In [2]:
num_patients = 1000
inverse_triples = True
embed_dim = 32
num_heads = 2

entity = pd.read_csv('processed_data/sphn_entities_noOutcome.tsv', sep='\t', index_col=0, header=None)
entity = entity.to_dict()[1]

indices = []
for i in range(num_patients):
    idx = f'<http://nvasc.org/synth_patient_{i}>'
    indices.append(entity[idx])

events = pd.read_csv('processed_data/sphn_events_noOutcome.tsv', sep='\t', header=None)
y = joblib.load('../Data Generation/outcomes_1000_0.joblib')

skf = StratifiedKFold(n_splits=5)
train_idx, test_idx = next(iter(skf.split(indices, y)))
indices = np.asarray(indices)
y = np.asarray(y)

edge_index = torch.vstack((torch.Tensor(events[0]).long(),torch.Tensor(events[2]).long()))
edge_type = torch.Tensor(events[1]).long()
train_x = torch.Tensor(indices[train_idx]).long()
train_y = torch.Tensor(y[train_idx]).long()
test_x = torch.Tensor(indices[test_idx]).long()
test_y = torch.Tensor(y[test_idx]).long()
num_nodes = len(entity)

if inverse_triples == True:
    edge_index = to_undirected(edge_index)
    edge_type = torch.cat((edge_type, edge_type))

data = Data(
    edge_index=edge_index,
    edge_type=edge_type,
    train_idx=train_x,
    train_y=train_y,
    test_idx=test_x,
    test_y=test_y,
    num_nodes=num_nodes,
)
embedding = torch.nn.Embedding(data.num_nodes, embed_dim)
data.x = embedding.weight
data.num_relations = data.num_edge_types
data.num_classes = 3
print(data)

Data(edge_index=[2, 223868], edge_type=[223868], train_idx=[800], train_y=[800], test_idx=[200], test_y=[200], num_nodes=32564, x=[32564, 32], num_relations=8, num_classes=3)


In [3]:
pd.DataFrame(y).value_counts()

0
0    456
1    429
2    115
Name: count, dtype: int64

## 2. Training the model

In [4]:
class RGAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_relations):
        super().__init__()
        self.conv1 = RGATConv(in_channels, hidden_channels, num_relations, heads=num_heads)
        self.conv2 = RGATConv(hidden_channels*num_heads, hidden_channels, num_relations, heads=num_heads)
        self.lin = torch.nn.Linear(hidden_channels*num_heads, out_channels)

    def forward(self, x, edge_index, edge_type):
        x = self.conv1(x, edge_index, edge_type)
        x = F.leaky_relu(x)
        x = F.dropout(x, p=0.3)
        x = self.conv2(x, edge_index, edge_type)
        x = F.leaky_relu(x)
        x = F.dropout(x, p=0.3)
        x = self.lin(x)
        return F.log_softmax(x, dim=-1)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)

num_classes = 3
num_relations = data.num_edge_types
model = RGAT(embed_dim, 32, num_classes, num_relations).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)
class_weight = torch.Tensor([456/456, 456/429, 456/115]).to(device)

In [5]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index, data.edge_type)
    loss = F.nll_loss(out[data.train_idx], data.train_y, weight=class_weight)
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test():
    model.eval()
    pred = model(data.x, data.edge_index, data.edge_type).argmax(dim=-1)
    train_acc = float((pred[data.train_idx] == data.train_y).float().mean())
    test_acc = float((pred[data.test_idx] == data.test_y).float().mean())
    return train_acc, test_acc

In [7]:
times = []
for epoch in range(1, 101):
    start = time.time()
    loss = train()
    if epoch % 5 ==0:
        train_acc, test_acc = test()
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {train_acc:.4f} '
            f'Test: {test_acc:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

Epoch: 05, Loss: 0.0529, Train: 0.9825 Test: 0.4200
Epoch: 10, Loss: 0.0364, Train: 0.9862 Test: 0.4650
Epoch: 15, Loss: 0.0778, Train: 0.9800 Test: 0.4400
Epoch: 20, Loss: 0.0383, Train: 0.9762 Test: 0.4000
Epoch: 25, Loss: 0.0353, Train: 0.9925 Test: 0.4250
Epoch: 30, Loss: 0.0513, Train: 0.9825 Test: 0.4400
Epoch: 35, Loss: 0.0344, Train: 0.9887 Test: 0.4600
Epoch: 40, Loss: 0.0328, Train: 0.9837 Test: 0.4300
Epoch: 45, Loss: 0.0269, Train: 0.9887 Test: 0.4400
Epoch: 50, Loss: 0.0231, Train: 0.9887 Test: 0.4900
Epoch: 55, Loss: 0.0231, Train: 0.9925 Test: 0.4300
Epoch: 60, Loss: 0.0208, Train: 0.9925 Test: 0.4150
Epoch: 65, Loss: 0.0219, Train: 0.9925 Test: 0.4050
Epoch: 70, Loss: 0.0297, Train: 0.9937 Test: 0.4150
Epoch: 75, Loss: 0.0413, Train: 0.9862 Test: 0.4000
Epoch: 80, Loss: 0.0201, Train: 0.9875 Test: 0.4000
Epoch: 85, Loss: 0.0308, Train: 0.9850 Test: 0.4250
Epoch: 90, Loss: 0.0157, Train: 0.9900 Test: 0.4650
Epoch: 95, Loss: 0.0373, Train: 0.9900 Test: 0.4400
Epoch: 100, 

In [8]:
from sklearn.metrics import roc_auc_score

model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index, data.edge_type).cpu()
    prob = F.softmax(out, dim=1)
auc = roc_auc_score(data.test_y.cpu(), prob[data.test_idx.cpu()], multi_class='ovr')
print(f'ROC AUC score: {auc:.4f}')
# print(prob.shape)

ROC AUC score: 0.5110
