In [1]:
from ogb.linkproppred import PygLinkPropPredDataset

dataset = PygLinkPropPredDataset(name = "ogbl-biokg") 

split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0] # pyg graph object containing only training edges

  backends.update(_get_backends("networkx.backends"))


In [2]:
e = graph.edge_index_dict

In [3]:
rels = {}
for i in e.keys():
    rels[i[1]] = len(rels)
rels

{'disease-protein': 0,
 'drug-disease': 1,
 'drug-drug_acquired_metabolic_disease': 2,
 'drug-drug_bacterial_infectious_disease': 3,
 'drug-drug_benign_neoplasm': 4,
 'drug-drug_cancer': 5,
 'drug-drug_cardiovascular_system_disease': 6,
 'drug-drug_chromosomal_disease': 7,
 'drug-drug_cognitive_disorder': 8,
 'drug-drug_cryptorchidism': 9,
 'drug-drug_developmental_disorder_of_mental_health': 10,
 'drug-drug_endocrine_system_disease': 11,
 'drug-drug_fungal_infectious_disease': 12,
 'drug-drug_gastrointestinal_system_disease': 13,
 'drug-drug_hematopoietic_system_disease': 14,
 'drug-drug_hematopoietic_system_diseases': 15,
 'drug-drug_hypospadias': 16,
 'drug-drug_immune_system_disease': 17,
 'drug-drug_inherited_metabolic_disorder': 18,
 'drug-drug_integumentary_system_disease': 19,
 'drug-drug_irritable_bowel_syndrome': 20,
 'drug-drug_monogenic_disease': 21,
 'drug-drug_musculoskeletal_system_disease': 22,
 'drug-drug_nervous_system_disease': 23,
 'drug-drug_orofacial_cleft': 24,
 

In [3]:
import numpy as np, torch

In [5]:
node_mat = np.zeros((10687+10533+45085+17499+9969, 5))
node_dict = {'drug': 0, 'disease': 1, 'protein': 2, 'function': 3, 'sideeffect': 4}
node_lab_adder = {'drug': 0, 'disease': 10533, 'protein': 10687+10533, 'function': 10687+17499+10533, 'sideeffect': 10687+17499+45085+10533}

In [6]:
for i in e:
    edge_ind = e[i]
    adder_1 = [node_lab_adder[i[0]]]*edge_ind.shape[1]
    adder_2 = [node_lab_adder[i[2]]]*edge_ind.shape[1]
    tens_add = torch.tensor([adder_1, adder_2])
    edge_ind += tens_add

In [7]:
for i in e:
    edge_ind = e[i]
    for f in edge_ind[0]:
        node_mat[f][node_dict[i[0]]] = 1
    for f in edge_ind[1]:
        node_mat[f][node_dict[i[2]]] = 1

In [8]:
edge_attr = torch.empty((0))
edge_index = torch.tensor([[], []])
attr_index = 0
for i in e:
    edge_index = torch.cat((edge_index, e[i]), 1)
    attr_vec = np.zeros(51)
    attr_vec[rels[i[1]]] = 1
    a_m = torch.tile(torch.tensor(attr_vec), (e[i].shape[1], 1))
    if edge_attr.shape == (0):
        edge_attr = a_m
    else:
        edge_attr = torch.cat((edge_attr, a_m), 0)


In [9]:
full_graph = torch.sparse_coo_tensor(edge_index, edge_attr).coalesce()

In [10]:
ind_table = set()
table = []
for i in e:
    if rels[i[1]] < 44:
        continue
    batch = []
    for j in range(e[i].shape[1]):
        tup = (e[i][0][j], e[i][1][j])
        tup_r = (e[i][1][j], e[i][0][j])
        if tup in ind_table or tup_r in ind_table:
            continue
        ind_table.add(tup)
        batch.append((e[i][0][j], e[i][1][j], rels[i[1]]))
        if len(batch) >= 200:
            break
    table.extend(batch)
        

In [11]:
import pickle, random
from torch_geometric.utils import k_hop_subgraph
from scipy.sparse.csgraph import shortest_path
from torch_geometric.utils import to_scipy_sparse_matrix

In [12]:
node_mat = torch.tensor(node_mat)

In [13]:
#SUBGRAPH EXTRACTION TESTING
#Get Subgraph, DRNL

def sg_gen(index):
    n1 = table[index][0]
    n2 = table[index][1]
    subset, sg, new_targets, mask = k_hop_subgraph((n1, n2), 2, full_graph.indices(), relabel_nodes=True)
    n_edge_atts = full_graph.values()[mask]
    #Node attribute matrix gen
    d_n1 = shortest_path(to_scipy_sparse_matrix(sg, num_nodes=len(subset)), directed=False, indices=new_targets[0])
    d_n2 = shortest_path(to_scipy_sparse_matrix(sg, num_nodes=len(subset)), directed=False, indices=new_targets[1])
    labels = torch.zeros(len(subset), dtype=torch.int64)
    for i in range(len(subset)):
        if d_n1[i] == 1 and d_n2[i] == 1:
            labels[i] = 1
        elif (d_n1[i] == 1 and d_n2[i] == 2) or (d_n1[i] == 2 and d_n2[i] == 1):
            labels[i] = 2
        elif d_n1[i] == 2 and d_n2[i] == 2:
            labels[i] = 3
        else:
            labels[i] = 4
    labels[new_targets[0]] = 0
    labels[new_targets[1]] = 0
    labels = torch.nn.functional.one_hot(labels, num_classes=5)
    node_attr = torch.index_select(node_mat, 0, subset)
    node_attr = torch.cat((node_attr, labels), 1)
    sg = torch.sparse_coo_tensor(sg, n_edge_atts, size=(len(subset), len(subset), 51)).coalesce()

    #sg = torch.sparse_coo_tensor(sg, torch.ones(len(sg[0])), size=(len(subset), len(subset))).coalesce()
    #sg = sg.to_dense()
    #sg[new_targets[0]][new_targets[1]] = 0
    #sg[new_targets[1]][new_targets[0]] = 0
    #sg = sg.to_sparse_coo()

    inds = torch.tensor([[new_targets[0], new_targets[1]]]).transpose(1, 0)
    del_tens = torch.sparse_coo_tensor(inds, torch.nn.functional.one_hot(torch.tensor([table[index][2]]), num_classes=51), size=sg.shape).float().coalesce()
    sg -= del_tens
    return sg, node_attr



In [14]:
batch = []
ind = 1
inds = list(range(len(table)))
random.shuffle(inds)
for i in inds:
    sg, x = sg_gen(i)
    batch.append((table[i][2]-44, sg, x))
    if len(batch)%100 == 0:
        print("Added 100 subgraphs")
    if len(batch) == 1300:
        break
    '''if len(batch) >= 200:
        pickle.dump(batch, open(f"BioKG_Batch_{ind}.pickle", 'wb'))
        batch = []
        ind += 1
        print(f"Added Batch {ind-1}")
if len(batch) > 0:
    pickle.dump(batch, open(f"BioKG_Batch_{ind}.pickle", 'wb'))'''

Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs
Added 100 subgraphs


In [15]:
from torch_geometric.nn import GATConv, aggr, GCNConv
device = torch.device("cuda")

In [5]:
torch.cuda.is_available()

False

In [16]:
class AMDGCNN(torch.nn.Module):
    def __init__(self, dim_in):
        super().__init__()
        self.gcn1 = GATConv(dim_in, 32, edge_dim=51)
        self.gcn2 = GATConv(32, 32, edge_dim=51)
        self.gcn3 = GATConv(32, 32, edge_dim=51)
        self.gcn4 = GATConv(32, 1, edge_dim=51)
        self.global_pool = aggr.SortAggregation(k=30)
        self.conv1 = torch.nn.Conv1d(1, 16, 97, 97)
        self.conv2 = torch.nn.Conv1d(16, 32, 5, 1)
        self.maxpool = torch.nn.MaxPool1d(2, 2)
        self.linear1 = torch.nn.Linear(352, 128)
        self.linear2 = torch.nn.Linear(128, 7)
        self.dropout = torch.nn.Dropout(0.5)
    def forward(self, mat, edge_index):
        h1 = self.gcn1(mat, edge_index).tanh()
        h2 = self.gcn2(h1, edge_index).tanh()
        h3 = self.gcn3(h2, edge_index).tanh()
        h4 = self.gcn4(h3, edge_index).tanh()
        h = torch.cat([h1, h2, h3, h4], dim=-1)
        h = self.global_pool(h)
        h = h.view(h.size(0), 1, h.size(-1))
        h = self.conv1(h).relu()
        h = self.maxpool(h)
        h = self.conv2(h).relu()
        h = h.view(h.size(0), -1)
        h = self.linear1(h).relu()
        h = self.dropout(h)
        h = self.linear2(h)
        return h

In [17]:
model = AMDGCNN(10)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.00055)
criterion = torch.nn.CrossEntropyLoss()
model.train(True)
model.to(device)

RuntimeError: No CUDA GPUs are available

In [None]:
#ADGCNN
def train_one_epoch():
    torch.enable_grad()
    l = 0
    p = 0
    for i in batch[0:1000]:
        label = torch.nn.functional.one_hot(torch.tensor(i[0]), num_classes=7).view(1, -1).float()
        g = i[1].coalesce().float()
        x = i[2].float()
        optimizer.zero_grad()
        out = model(x.to(device), g.to(device))
        loss = criterion(out, label.to(device))
        loss.backward()
        optimizer.step()
        p += 1
        l += loss.item()
        if p % 100 == 0:
            print(f"Loss: {l}")
            l = 0

In [None]:
for i in range(1):
    train_one_epoch()

Loss: 157.89881592988968
Loss: 155.990369617939
Loss: 162.08416163921356
Loss: 157.4449891448021
Loss: 180.31131994724274
Loss: 160.04609733819962
Loss: 158.67898684740067
Loss: 166.98347693681717
Loss: 159.89664870500565
Loss: 168.99259996414185
Loss: 153.63431864976883
Loss: 152.45419251918793
Loss: 175.26148879528046
Loss: 153.39883935451508
Loss: 168.10096210241318
Loss: 154.82251408696175
Loss: 157.4013516306877
Loss: 164.74410432577133
Loss: 162.1094577908516
Loss: 165.691240131855
Loss: 156.7703014612198
Loss: 149.98195579648018
Loss: 174.4571948647499
Loss: 155.6932834982872
Loss: 170.439120054245
Loss: 160.54885318875313
Loss: 162.85599356889725
Loss: 164.68665331602097
Loss: 160.38610470294952
Loss: 173.49579071998596
Loss: 158.36921721696854
Loss: 156.43072420358658
Loss: 176.78861433267593
Loss: 166.65884673595428
Loss: 169.12505966424942
Loss: 156.14981651306152
Loss: 157.65793257951736
Loss: 165.05918353796005
Loss: 163.56419587135315
Loss: 169.02613192796707
Loss: 153.31

In [None]:
torch.no_grad()
labels = []
preds = []
for i in batch[1000:]:
        labels.append(i[0])
        g = i[1].coalesce().float()
        x = i[2].float()
        out = model(x.to(device), g.to(device)).softmax(1)
        preds.append(out.detach())

In [None]:
from torcheval.metrics import MulticlassAUROC
metric = MulticlassAUROC(num_classes=7)
metric.update(torch.stack(preds).view(len(batch[1000:]), 7).to(device), torch.tensor(labels).to(device))
metric.compute()

tensor(0.7273)