# Outcome Prediction with RGCN Model

## 1. Load data from "processed_data" folder

In [69]:
import os
import time
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

import torch
import torch.nn.functional as F

from torch_geometric.data import Data
from torch_geometric.nn import RGCNConv
from torch_geometric.utils import to_undirected

In [2]:
dataset = '../Data Generation/sphn_transductive_1000_0.nt'
inverse_triples = True

def preprocess(data_name):
    s_list, r_list, d_list = [], [], []
    
    with open(data_name) as f:
        s = next(f)
        print(s)
        for idx, line in enumerate(f):
            e = line.strip().split(' ')
            s = e[0]
            r = e[1]
            d = e[2]            
            
            s_list.append(s)
            r_list.append(r)
            d_list.append(d)            
            
    return pd.DataFrame({
        's':s_list, 
        'r':r_list, 
        'd':d_list,                          
        })

df = preprocess(dataset)
print(df.head())

<http://nvasc.org/9580e5bd-472d-49e1-9bed-c2b6d0029253> <http://sphn.org/hasSubjectPseudoIdentifier> <http://nvasc.org/synth_patient_843> .

                                                   s  \
0  <http://nvasc.org/2c77cf90-594b-49bd-a5eb-a9b5...   
1  <http://nvasc.org/66c76a7b-87bb-4341-bbae-93dd...   
2  <http://nvasc.org/a44c2a60-b5a5-49c3-b230-09dd...   
3  <http://nvasc.org/016c2574-c8d5-4e5d-b523-c5b9...   
4  <http://nvasc.org/14848ddf-f39b-4126-8f5f-79b9...   

                                              r  \
0  <http://sphn.org/hasSubjectPseudoIdentifier>   
1           <http://sphn.org/hasRecordDateTime>   
2  <http://sphn.org/hasSubjectPseudoIdentifier>   
3  <http://sphn.org/hasSubjectPseudoIdentifier>   
4  <http://sphn.org/hasSubjectPseudoIdentifier>   

                                                   d  
0              <http://nvasc.org/synth_patient_2238>  
1  "2022-05-27T14:30:00"^^<http://www.w3.org/2001...  
2              <http://nvasc.org/synth_patient_512

In [12]:
num_patients = 1000

outcome = df['d'].str.contains('outcome_0.0|outcome_1.0|outcome_2.0')
node_df = df[~outcome]
node_df = node_df.reset_index(drop=True)
outcome = node_df['s'].str.contains('outcome_0.0|outcome_1.0|outcome_2.0')
node_df = node_df[~outcome]
node_df = node_df.reset_index(drop=True)

ent_to_id = {k: v for v, k in enumerate(set(node_df['s']).union(set(node_df['d'])), start=0)}
rel_to_id = {k: v for v, k in enumerate(set(node_df['r']), start=0)}

patients = [f"<http://nvasc.org/synth_patient_{i}>" for i in range(num_patients)]
patient_id = []
for patient in patients:
    patient_id.append(ent_to_id[patient])

num_nodes = max(ent_to_id.values()) + 1
num_rels = max(rel_to_id.values()) + 1

events = node_df.copy()
events["s"] = node_df.s.map(ent_to_id)
events["d"] = node_df.d.map(ent_to_id)
events["r"] = node_df.r.map(rel_to_id)

ent_to_id = pd.DataFrame.from_dict(ent_to_id, orient='index')
rel_to_id = pd.DataFrame.from_dict(rel_to_id, orient='index')

path = 'processed_data'
if not os.path.exists(path):
    os.makedirs(path)

events.to_csv(f'{path}/sphn_events_noOutcome.tsv', sep='\t', index=False, header=None)
ent_to_id.to_csv(f'{path}/sphn_entities_noOutcome.tsv', sep='\t', header=None)
rel_to_id.to_csv(f'{path}/sphn_relations_noOutcome.tsv', sep='\t', header=None)

In [97]:
num_patients = 1000
inverse_triples=True
embed_dim = 32

entity = pd.read_csv('processed_data/sphn_entities_noOutcome.tsv', sep='\t', index_col=0, header=None)
entity = entity.to_dict()[1]

indices = []
for i in range(num_patients):
    idx = f'<http://nvasc.org/synth_patient_{i}>'
    indices.append(entity[idx])
# events = pd.read_csv('../data/SPHN_events_noOutcome.tsv', sep='\t', header=None)
events = pd.read_csv('processed_data/sphn_events_noOutcome.tsv', sep='\t', header=None)
y = joblib.load('../Data Generation/outcomes_1000_0.joblib')

non_valid_X, valid_X, non_valid_y, valid_y = train_test_split(indices, y, stratify=y, test_size=0.2)
train_X, testing_X, train_y, testing_y = train_test_split(non_valid_X, non_valid_y, stratify=non_valid_y, test_size=1./8)

# rus = RandomUnderSampler(random_state=0)
# under_idx, under_y = rus.fit_resample(np.asarray(train_X).reshape(-1, 1), np.asarray(train_y).reshape(-1,1))
# indices = np.asarray(indices)
# y = np.asarray(y)

edge_index = torch.vstack((torch.Tensor(events[0]).long(),torch.Tensor(events[2]).long()))
edge_type = torch.Tensor(events[1]).long()
train_idx = torch.Tensor(train_X).long().squeeze()
train_y = torch.Tensor(train_y).long()
val_idx = torch.Tensor(valid_X).long()
val_y = torch.Tensor(valid_y).long()
test_idx = torch.Tensor(testing_X).long()
test_y = torch.Tensor(testing_y).long()
num_nodes = len(entity)

if inverse_triples == True:
    edge_index = to_undirected(edge_index)
    edge_type = torch.cat((edge_type, edge_type))

data = Data(
    edge_index=edge_index,
    edge_type=edge_type,
    train_idx=train_idx,
    train_y=train_y,
    val_idx=val_idx,
    val_y=val_y,
    test_idx=test_idx,
    test_y=test_y,
    num_nodes=num_nodes,
)
embedding = torch.nn.Embedding(data.num_nodes, embed_dim)
data.x = embedding.weight
data.num_relations = data.num_edge_types
data.num_classes = 3

In [98]:
print(data)

Data(edge_index=[2, 223868], edge_type=[223868], train_idx=[700], train_y=[700], val_idx=[200], val_y=[200], test_idx=[100], test_y=[100], num_nodes=32563, x=[32563, 32], num_relations=8, num_classes=3)


## 2. Training the model

In [99]:
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = RGCNConv(embed_dim, embed_dim, data.num_relations,
                          num_bases=10)
        self.conv2 = RGCNConv(embed_dim, data.num_classes, data.num_relations,
                          num_bases=10)

    def forward(self, edge_index, edge_type):
        x = F.relu(self.conv1(data.x, edge_index, edge_type))
        x = F.dropout(x, p=0.5)
        x = self.conv2(x, edge_index, edge_type)
        return F.log_softmax(x, dim=1)


if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
# class_weight = torch.bincount(train_y).to(device)
# class_weight = class_weight / class_weight.max()

In [100]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.edge_index, data.edge_type)
    loss = F.nll_loss(out[data.train_idx], data.train_y, 
                    #   weight=class_weight,
                      )
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test():
    model.eval()
    pred = model(data.edge_index, data.edge_type).argmax(dim=-1)
    train_acc = float((pred[data.train_idx] == data.train_y).float().mean())
    val_acc = float((pred[data.val_idx] == data.val_y).float().mean())
    test_acc = float((pred[data.test_idx] == data.test_y).float().mean())
    return train_acc, val_acc, test_acc

In [101]:
best_acc = 0.
times = []
for epoch in range(1, 10001):
    start = time.time()
    loss = train()
    if epoch % 50 == 0:
        train_acc, val_acc, test_acc = test()
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f} | ' 
            f'Train: {train_acc:.4f} '
            f'Valid: {val_acc:.4f} '
            f'Test: {test_acc:.4f}')    
        times.append(time.time() - start)
        # # Early stopping
        # if val_acc > best_acc:
        #     best_acc = val_acc
        #     patience = 10  # Reset patience counter
        # else:
        #     patience -= 1
        #     if patience == 0:
        #         print("Early stopping...")
        #         break
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

Epoch: 50, Loss: 1.6164 | Train: 0.6100 Valid: 0.4200 Test: 0.4800
Epoch: 100, Loss: 0.9528 | Train: 0.6443 Valid: 0.3900 Test: 0.4700
Epoch: 150, Loss: 0.7092 | Train: 0.7171 Valid: 0.4200 Test: 0.4700
Epoch: 200, Loss: 0.4994 | Train: 0.7857 Valid: 0.4700 Test: 0.3600
Epoch: 250, Loss: 0.4057 | Train: 0.8271 Valid: 0.4450 Test: 0.4200
Epoch: 300, Loss: 0.3339 | Train: 0.8514 Valid: 0.3800 Test: 0.4600
Epoch: 350, Loss: 0.3049 | Train: 0.8814 Valid: 0.4200 Test: 0.4800
Epoch: 400, Loss: 0.2538 | Train: 0.9057 Valid: 0.4150 Test: 0.4600
Epoch: 450, Loss: 0.2359 | Train: 0.9086 Valid: 0.4200 Test: 0.4700
Epoch: 500, Loss: 0.2058 | Train: 0.9400 Valid: 0.4300 Test: 0.4500
Epoch: 550, Loss: 0.1604 | Train: 0.9429 Valid: 0.4200 Test: 0.4900
Epoch: 600, Loss: 0.1717 | Train: 0.9500 Valid: 0.4650 Test: 0.4800
Epoch: 650, Loss: 0.1321 | Train: 0.9443 Valid: 0.4250 Test: 0.4600
Epoch: 700, Loss: 0.1227 | Train: 0.9729 Valid: 0.4350 Test: 0.4600
Epoch: 750, Loss: 0.1106 | Train: 0.9686 Valid: 0

KeyboardInterrupt: 

In [54]:
from sklearn.metrics import roc_auc_score

model.eval()
with torch.no_grad():
    out = model(data.edge_index, data.edge_type).cpu()
    prob = F.softmax(out, dim=1)
auc = roc_auc_score(data.test_y.cpu(), prob[data.test_idx.cpu()], multi_class='ovr')
print(f'ROC AUC score: {auc:.4f}')
# print(prob.shape)

ROC AUC score: 0.4968
