In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Tab-separated table with one row per drug: its id and its type.
drug_mapping = pd.read_csv(
    "../input/dataset/edges/drug_mapping.txt",
    names=["drug_id", "type"],
    sep="\t",
)

In [3]:
# Space-separated edge list: each row is one drug-drug pair.
drug_edges = pd.read_csv(
    "../input/dataset/edges/drug_drug.txt",
    names=["drug_i", "drug_j"],
    sep=" ",
)

In [4]:
# Number of distinct drug types present in the mapping table.
drug_mapping.loc[:, "type"].nunique()

12

In [5]:
# Give every distinct type an integer code, numbered in order of first
# appearance in the table, and attach that code to each drug.
mapping_type = {
    type_name: code
    for code, type_name in enumerate(pd.unique(drug_mapping["type"]).tolist())
}
print(mapping_type)
drug_mapping["type_code"] = drug_mapping["type"].map(mapping_type)

{'Whole body': 0, 'Nervous system': 1, 'Lymphatic system': 2, 'Musculoskeletal system': 3, 'Reproductive system': 4, 'Integumentary system': 5, 'Circulatory system': 6, 'Respiratory system': 7, 'Digestive system': 8, 'Urinary system': 9, 'Endocrine system': 10, 'Immune system': 11}


In [6]:
# Attach to every edge the type code of each endpoint.
#
# The lookup is keyed by the positional index of `drug_mapping`, not by its
# "drug_id" column.
# NOTE(review): this assumes the ids in the edge list are row positions of
# drug_mapping -- confirm against the data files.
drug_map = drug_mapping["type_code"].to_dict()

for endpoint_col, label_col in (("drug_i", "label1"), ("drug_j", "label2")):
    # Vectorized lookup instead of a Python-level loop over every edge.
    codes = drug_edges[endpoint_col].map(drug_map)

    # A plain dict lookup would raise on an unknown id; keep that strictness
    # rather than silently writing NaN labels.
    unknown = codes.isna()
    if unknown.any():
        missing_ids = drug_edges.loc[unknown, endpoint_col].unique().tolist()
        raise KeyError(
            f"{endpoint_col} contains ids without a type code: {missing_ids[:10]}"
        )

    drug_edges[label_col] = codes.astype("int64")

In [7]:
# Inspect the edge list together with the type codes of both endpoints.
drug_edges

Unnamed: 0,drug_i,drug_j,label1,label2
0,0,31,0,1
1,0,38,0,6
2,0,284,0,1
3,0,216,0,1
4,0,384,0,10
...,...,...,...,...
165797,840,433,1,1
165798,840,637,1,0
165799,840,832,1,1
165800,840,235,1,1


In [8]:
l = 0
# Keep the edges where at least one endpoint carries type code `l`.
drug_edges[(drug_edges["label1"] == l) | (drug_edges["label2"] == l)]

Unnamed: 0,drug_i,drug_j,label1,label2
0,0,31,0,1
1,0,38,0,6
2,0,284,0,1
3,0,216,0,1
4,0,384,0,10
...,...,...,...,...
165769,840,413,1,0
165781,840,421,1,0
165782,840,829,1,0
165796,840,666,1,0


In [9]:
# Persist the labelled edge list; the positional index is not written.
drug_edges.to_csv(
    "../input/dataset/edges/edge_with_type.txt",
    index=False,
)

# Body Part

In [10]:
# Load the edge list that carries the type code of each endpoint.
edge_type = pd.read_csv(
    filepath_or_buffer="../input/dataset/edges/edge_with_type.txt",
)

In [21]:
import torch
from torch_geometric.utils import to_undirected, coalesce
from ogb.io.read_graph_pyg import read_graph_pyg
from torch_geometric.data import Data
from torch_geometric.transforms import ToUndirected, RandomLinkSplit

def train_test_split_edges(data, sr = 1):
    """Split edges into train/test positives by label and sample negatives.

    The split is driven by ``data.edge_attr`` rather than by chance: an edge
    labelled 0 becomes a training edge and an edge labelled 1 becomes a test
    edge. Only edges stored with ``row < col`` are considered. The training
    positives are symmetrized with ``to_undirected``; the test positives are
    kept in ``row < col`` form. Negative edges are drawn at random from the
    node pairs ``(i, j)`` with ``i < j`` that are not positive edges.

    Args:
        data: Graph object exposing ``num_nodes``, ``edge_index`` (``[2, E]``)
            and ``edge_attr`` holding exactly one 0/1 label per edge (shape
            ``[E]`` or ``[E, 1]``).
        sr: Number of negative edges requested per positive edge.

    Returns:
        The same ``data`` object, with ``train_pos_edge_index``,
        ``test_pos_edge_index``, ``train_neg_edge_index`` and
        ``test_neg_edge_index`` set. If fewer negative pairs exist than
        requested, the negative sets are truncated (test negatives are
        filled first).

    Raises:
        ValueError: If ``edge_attr`` is missing or does not contain exactly
            one label per edge.
    """
    assert data.num_nodes is not None
    assert data.edge_index is not None
    if data.edge_attr is None:
        raise ValueError("edge_attr with one 0/1 split label per edge is required")

    num_nodes = data.num_nodes
    row, col = data.edge_index

    # Labels may arrive as an [E, 1] column. Flatten them so that nonzero()
    # below yields edge positions only, without an interleaved column index.
    labels = data.edge_attr.flatten()
    if labels.numel() != row.numel():
        raise ValueError(
            "edge_attr must hold exactly one label per edge, got "
            f"{labels.numel()} values for {row.numel()} edges"
        )

    # Keep the upper triangular portion only.
    mask = row < col
    row, col, labels = row[mask], col[mask], labels[mask]

    # Positive edges.
    train_idx = (labels == 0).nonzero(as_tuple=True)[0]
    test_idx = (labels == 1).nonzero(as_tuple=True)[0]

    data.test_pos_edge_index = torch.stack([row[test_idx], col[test_idx]], dim=0)

    train_pos = torch.stack([row[train_idx], col[train_idx]], dim=0)
    # Symmetrize the training edges; the returned attributes are not needed.
    data.train_pos_edge_index, _ = to_undirected(train_pos, labels[train_idx])

    # Negative edges.
    n_t = test_idx.size(0) * sr
    n_tr = train_idx.size(0) * sr

    # Candidate negatives: every pair i < j that is not a positive edge.
    neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
    neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
    neg_adj_mask[row, col] = 0

    neg_row, neg_col = neg_adj_mask.nonzero(as_tuple=False).t()
    # Shuffle the candidates. The permutation must span exactly the candidate
    # count, otherwise it indexes out of range whenever sr > 1.
    perm = torch.randperm(neg_row.size(0))
    neg_row, neg_col = neg_row[perm], neg_col[perm]

    # Disjoint slices: test negatives first, then training negatives.
    data.test_neg_edge_index = torch.stack([neg_row[:n_t], neg_col[:n_t]], dim=0)
    data.train_neg_edge_index = torch.stack(
        [neg_row[n_t: n_t + n_tr], neg_col[n_t: n_t + n_tr]], dim=0
    )

    return data

# Build a label-driven split per type code: every edge that touches a drug of
# type `fold` is labelled 1 (test); every other edge is labelled 0 (train).
# The labelled edge list is written in the on-disk layout read by
# read_graph_pyg, loaded back as a graph, split, and saved.
for fold in range(12):
    # NOTE(review): fold 1 is skipped on purpose by this loop; the reason is
    # not visible here -- confirm.
    if fold == 1:
        continue

    touches_fold = (edge_type["label1"] == fold) | (edge_type["label2"] == fold)
    edge_type["label"] = touches_fold.astype("int64")
    # print(fold, "\n", edge_type.label.value_counts())
    edge_type["label"].to_csv("../input/dataset/ogbl_dkp/ddi/edge-feat.csv.gz", header = None, compression = "gzip", index = False)
    edge_type[["drug_i", "drug_j"]].to_csv("../input/dataset/ogbl_dkp/ddi/edge.csv.gz", header = None, compression = "gzip", index = False)
    num_edges = pd.DataFrame({"data": [edge_type.shape[0]]})
    num_edges.to_csv("../input/dataset/ogbl_dkp/ddi/num-edge-list.csv.gz", header = None, compression = "gzip", index = False)
    print(f"fold{fold} created")

    data = read_graph_pyg(
            "../input/dataset/ogbl_dkp/ddi/",
            add_inverse_edge=False,
            additional_node_files=[],
            additional_edge_files=[],
            binary=False,
    )[0]
    data = train_test_split_edges(data)

    train = {"edge": data["train_pos_edge_index"], "edge_neg": data["train_neg_edge_index"]}
    test = {"edge": data["test_pos_edge_index"], "edge_neg": data["test_neg_edge_index"]}

    base_path = f"../input/dataset/ogbl_dkp/split/fold{fold}"
    # torch.save does not create missing directories.
    os.makedirs(base_path, exist_ok=True)
    torch.save(train, f"{base_path}/train.pt")
    torch.save(test, f"{base_path}/test.pt")
    # NOTE(review): this stops after the first processed fold, so only fold 0
    # is generated -- confirm whether that is intended.
    break

fold0 created
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13617.87it/s]


Converting graphs into PyG objects...


100%|███████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 7626.01it/s]


In [None]:
# subset of 11K
# store the data

  from .autonotebook import tqdm as notebook_tqdm


Loading necessary files...
This might take a while.
Processing graphs...


100%|█████████████████████████████| 1/1 [00:00<00:00, 21290.88it/s]


Converting graphs into PyG objects...


100%|██████████████████████████████| 1/1 [00:00<00:00, 5275.85it/s]


In [11]:
# Show the summary of the graph object currently bound to `data`.
data

Data(num_nodes=841, edge_index=[2, 165802], edge_attr=[165802, 1])

In [77]:
def train_test_split_edges(data, sr = 1):
    """Split edges into train/test positives by label and sample negatives.

    The split is driven by ``data.edge_attr`` rather than by chance: an edge
    labelled 0 becomes a training edge and an edge labelled 1 becomes a test
    edge. Only edges stored with ``row < col`` are considered. The training
    positives are symmetrized with ``to_undirected``; the test positives are
    kept in ``row < col`` form. Negative edges are drawn at random from the
    node pairs ``(i, j)`` with ``i < j`` that are not positive edges.

    Args:
        data: Graph object exposing ``num_nodes``, ``edge_index`` (``[2, E]``)
            and ``edge_attr`` holding exactly one 0/1 label per edge (shape
            ``[E]`` or ``[E, 1]``).
        sr: Number of negative edges requested per positive edge.

    Returns:
        The same ``data`` object, with ``train_pos_edge_index``,
        ``test_pos_edge_index``, ``train_neg_edge_index`` and
        ``test_neg_edge_index`` set. If fewer negative pairs exist than
        requested, the negative sets are truncated (test negatives are
        filled first).

    Raises:
        ValueError: If ``edge_attr`` is missing or does not contain exactly
            one label per edge.
    """
    assert data.num_nodes is not None
    assert data.edge_index is not None
    if data.edge_attr is None:
        raise ValueError("edge_attr with one 0/1 split label per edge is required")

    num_nodes = data.num_nodes
    row, col = data.edge_index

    # Labels may arrive as an [E, 1] column. Flatten them so that nonzero()
    # below yields edge positions only, without an interleaved column index.
    labels = data.edge_attr.flatten()
    if labels.numel() != row.numel():
        raise ValueError(
            "edge_attr must hold exactly one label per edge, got "
            f"{labels.numel()} values for {row.numel()} edges"
        )

    # Keep the upper triangular portion only.
    mask = row < col
    row, col, labels = row[mask], col[mask], labels[mask]

    # Positive edges.
    train_idx = (labels == 0).nonzero(as_tuple=True)[0]
    test_idx = (labels == 1).nonzero(as_tuple=True)[0]

    data.test_pos_edge_index = torch.stack([row[test_idx], col[test_idx]], dim=0)

    train_pos = torch.stack([row[train_idx], col[train_idx]], dim=0)
    # Symmetrize the training edges; the returned attributes are not needed.
    data.train_pos_edge_index, _ = to_undirected(train_pos, labels[train_idx])

    # Negative edges.
    n_t = test_idx.size(0) * sr
    n_tr = train_idx.size(0) * sr
    # Report how many test / train negatives are requested.
    print(n_t, n_tr)

    # Candidate negatives: every pair i < j that is not a positive edge.
    neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
    neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
    neg_adj_mask[row, col] = 0

    neg_row, neg_col = neg_adj_mask.nonzero(as_tuple=False).t()
    # Shuffle the candidates. The permutation must span exactly the candidate
    # count, otherwise it indexes out of range whenever sr > 1.
    perm = torch.randperm(neg_row.size(0))
    neg_row, neg_col = neg_row[perm], neg_col[perm]

    # Disjoint slices: test negatives first, then training negatives.
    data.test_neg_edge_index = torch.stack([neg_row[:n_t], neg_col[:n_t]], dim=0)
    data.train_neg_edge_index = torch.stack(
        [neg_row[n_t: n_t + n_tr], neg_col[n_t: n_t + n_tr]], dim=0
    )

    return data

In [81]:
# Split the graph currently bound to `data` and store the result under the
# "body" split directory.
# NOTE(review): `data` comes from an earlier cell and may already carry split
# attributes from a previous call -- confirm that re-splitting it is intended.
data = train_test_split_edges(data)

train = {"edge": data["train_pos_edge_index"], "edge_neg": data["train_neg_edge_index"]}
test = {"edge": data["test_pos_edge_index"], "edge_neg": data["test_neg_edge_index"]}

base_path = "../input/dataset/ogbl_dkp/split/body"
# torch.save does not create missing directories.
os.makedirs(base_path, exist_ok=True)
torch.save(train, f"{base_path}/train.pt")
torch.save(test, f"{base_path}/test.pt")