# Outcome Prediction with RGCN Model

## 1. Load data from "processed_data" folder

In [1]:
import os
import time
import joblib
import pandas as pd

import torch
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.nn import RGCNConv
from torch_geometric.utils import to_undirected

In [2]:
dataset = '../Data Generation/sphn_transductive_1000_0.nt'
inverse_triples = True

def preprocess(data_name):
    s_list, r_list, d_list = [], [], []
    
    with open(data_name) as f:
        s = next(f)
        print(s)
        for idx, line in enumerate(f):
            e = line.strip().split(' ')
            s = e[0]
            r = e[1]
            d = e[2]            
            
            s_list.append(s)
            r_list.append(r)
            d_list.append(d)            
            
    return pd.DataFrame({
        's':s_list, 
        'r':r_list, 
        'd':d_list,                          
        })

df = preprocess(dataset)
print(df.head())

<http://nvasc.org/9580e5bd-472d-49e1-9bed-c2b6d0029253> <http://sphn.org/hasSubjectPseudoIdentifier> <http://nvasc.org/synth_patient_843> .

                                                   s  \
0  <http://nvasc.org/2c77cf90-594b-49bd-a5eb-a9b5...   
1  <http://nvasc.org/66c76a7b-87bb-4341-bbae-93dd...   
2  <http://nvasc.org/a44c2a60-b5a5-49c3-b230-09dd...   
3  <http://nvasc.org/016c2574-c8d5-4e5d-b523-c5b9...   
4  <http://nvasc.org/14848ddf-f39b-4126-8f5f-79b9...   

                                              r  \
0  <http://sphn.org/hasSubjectPseudoIdentifier>   
1           <http://sphn.org/hasRecordDateTime>   
2  <http://sphn.org/hasSubjectPseudoIdentifier>   
3  <http://sphn.org/hasSubjectPseudoIdentifier>   
4  <http://sphn.org/hasSubjectPseudoIdentifier>   

                                                   d  
0              <http://nvasc.org/synth_patient_2238>  
1  "2022-05-27T14:30:00"^^<http://www.w3.org/2001...  
2              <http://nvasc.org/synth_patient_512

In [12]:
num_patients = 1000

outcome = df['d'].str.contains('outcome_0.0|outcome_1.0|outcome_2.0')
node_df = df[~outcome]
node_df = node_df.reset_index(drop=True)
outcome = node_df['s'].str.contains('outcome_0.0|outcome_1.0|outcome_2.0')
node_df = node_df[~outcome]
node_df = node_df.reset_index(drop=True)

ent_to_id = {k: v for v, k in enumerate(set(node_df['s']).union(set(node_df['d'])), start=0)}
rel_to_id = {k: v for v, k in enumerate(set(node_df['r']), start=0)}

patients = [f"<http://nvasc.org/synth_patient_{i}>" for i in range(num_patients)]
patient_id = []
for patient in patients:
    patient_id.append(ent_to_id[patient])

num_nodes = max(ent_to_id.values()) + 1
num_rels = max(rel_to_id.values()) + 1

events = node_df.copy()
events["s"] = node_df.s.map(ent_to_id)
events["d"] = node_df.d.map(ent_to_id)
events["r"] = node_df.r.map(rel_to_id)

ent_to_id = pd.DataFrame.from_dict(ent_to_id, orient='index')
rel_to_id = pd.DataFrame.from_dict(rel_to_id, orient='index')

path = 'processed_data'
if not os.path.exists(path):
    os.makedirs(path)

events.to_csv(f'{path}/sphn_events_noOutcome.tsv', sep='\t', index=False, header=None)
ent_to_id.to_csv(f'{path}/sphn_entities_noOutcome.tsv', sep='\t', header=None)
rel_to_id.to_csv(f'{path}/sphn_relations_noOutcome.tsv', sep='\t', header=None)

In [3]:
num_patients = 1000
inverse_triples=True

entity = pd.read_csv('processed_data/sphn_entities_noOutcome.tsv', sep='\t', index_col=0, header=None)
entity = entity.to_dict()[1]

indices = []
for i in range(num_patients):
    idx = f'<http://nvasc.org/synth_patient_{i}>'
    indices.append(entity[idx])
# events = pd.read_csv('../data/SPHN_events_noOutcome.tsv', sep='\t', header=None)
events = pd.read_csv('processed_data/sphn_events_noOutcome.tsv', sep='\t', header=None)
y = joblib.load('../Data Generation/outcomes_1000_0.joblib')

edge_index = torch.vstack((torch.Tensor(events[0]).long(),torch.Tensor(events[2]).long()))
edge_type = torch.Tensor(events[1]).long()
train_idx = torch.Tensor(indices[:int(num_patients*0.8)]).long()
train_y = torch.Tensor(y[:int(num_patients*0.8)]).long()
test_idx = torch.Tensor(indices[int(num_patients*0.8):]).long()
test_y = torch.Tensor(y[int(num_patients*0.8):]).long()
num_nodes = len(entity)

if inverse_triples == True:
    edge_index = to_undirected(edge_index)
    edge_type = torch.cat((edge_type, edge_type))

data = Data(
    edge_index=edge_index,
    edge_type=edge_type,
    train_idx=train_idx,
    train_y=train_y,
    test_idx=test_idx,
    test_y=test_y,
    num_nodes=num_nodes,
)
embedding = torch.nn.Embedding(data.num_nodes, 128)
data.x = embedding.weight
data.num_relations = data.num_edge_types
data.num_classes = 3

In [4]:
print(data)

Data(edge_index=[2, 223868], edge_type=[223868], train_idx=[800], train_y=[800], test_idx=[200], test_y=[200], num_nodes=32564, x=[32564, 128], num_relations=8, num_classes=3)


## 2. Training the model

In [5]:
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = RGCNConv(data.num_nodes, 32, data.num_relations,
                          num_bases=4)
        self.conv2 = RGCNConv(32, data.num_classes, data.num_relations,
                          num_bases=4)

    def forward(self, edge_index, edge_type):
        x = F.relu(self.conv1(None, edge_index, edge_type))
        x = self.conv2(x, edge_index, edge_type)
        return F.log_softmax(x, dim=1)


if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0005)
class_weight = torch.Tensor([1000/456, 1000/429, 1000/115]).to(device)

In [6]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.edge_index, data.edge_type)
    loss = F.nll_loss(out[data.train_idx], data.train_y, weight=class_weight)
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test():
    model.eval()
    pred = model(data.edge_index, data.edge_type).argmax(dim=-1)
    train_acc = float((pred[data.train_idx] == data.train_y).float().mean())
    test_acc = float((pred[data.test_idx] == data.test_y).float().mean())
    return train_acc, test_acc

In [7]:
times = []
for epoch in range(1, 1001):
    start = time.time()
    loss = train()
    train_acc, test_acc = test()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {train_acc:.4f} '
          f'Test: {test_acc:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

Epoch: 01, Loss: 1.0995, Train: 0.9800 Test: 0.3450
Epoch: 02, Loss: 1.0516, Train: 1.0000 Test: 0.3650
Epoch: 03, Loss: 1.0087, Train: 1.0000 Test: 0.3750
Epoch: 04, Loss: 0.9672, Train: 1.0000 Test: 0.4000
Epoch: 05, Loss: 0.9252, Train: 1.0000 Test: 0.4050
Epoch: 06, Loss: 0.8817, Train: 1.0000 Test: 0.4100
Epoch: 07, Loss: 0.8366, Train: 1.0000 Test: 0.4200
Epoch: 08, Loss: 0.7898, Train: 1.0000 Test: 0.4100
Epoch: 09, Loss: 0.7416, Train: 1.0000 Test: 0.4200
Epoch: 10, Loss: 0.6924, Train: 1.0000 Test: 0.4200
Epoch: 11, Loss: 0.6434, Train: 1.0000 Test: 0.4250
Epoch: 12, Loss: 0.5953, Train: 1.0000 Test: 0.4100
Epoch: 13, Loss: 0.5488, Train: 1.0000 Test: 0.4150
Epoch: 14, Loss: 0.5045, Train: 1.0000 Test: 0.4100
Epoch: 15, Loss: 0.4625, Train: 1.0000 Test: 0.4150
Epoch: 16, Loss: 0.4233, Train: 1.0000 Test: 0.4100
Epoch: 17, Loss: 0.3868, Train: 1.0000 Test: 0.4150
Epoch: 18, Loss: 0.3531, Train: 1.0000 Test: 0.4200
Epoch: 19, Loss: 0.3223, Train: 1.0000 Test: 0.4100
Epoch: 20, L

In [8]:
from sklearn.metrics import roc_auc_score

model.eval()
with torch.no_grad():
    out = model(data.edge_index, data.edge_type).cpu()
    prob = F.softmax(out, dim=1)
auc = roc_auc_score(data.test_y.cpu(), prob[data.test_idx.cpu()], multi_class='ovr')
print(f'ROC AUC score: {auc:.4f}')
# print(prob.shape)

ROC AUC score: 0.6354
