In [1]:
import pandas as pd
import numpy as np
import torch
import math
import torch
import torch_geometric
from torch_geometric.utils import to_undirected
from ogb.io.read_graph_pyg import read_graph_pyg
from torch_geometric.data import Data
from torch_geometric.transforms import ToUndirected, RandomLinkSplit

  from .autonotebook import tqdm as notebook_tqdm


# Body Part

In [2]:
edge_type = pd.read_csv("../input/dataset/edges/edge_type.txt", names = ["type"], sep = " ")

In [3]:
edge_type.to_csv("../input/dataset/ogbl_ddi/raw/edge-feat.csv.gz", header = None, compression = "gzip", index = False)

In [2]:
data = read_graph_pyg(
        "../input/dataset/ogbl_ddi/raw/",
        add_inverse_edge=True,
        additional_node_files=[],
        additional_edge_files=[],
        binary=False,
)[0]

Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 132.44it/s]


Converting graphs into PyG objects...


100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3771.86it/s]


In [15]:
def train_test_split_edges(data, sr = 1):
    assert data.num_nodes is not None
    assert data.edge_index is not None

    num_nodes = data.num_nodes
    row, col = data.edge_index
    edge_attr = data.edge_attr

    # Return upper triangular portion.
    mask = row < col
    row, col = row[mask], col[mask]

    if edge_attr is not None:
        edge_attr = edge_attr[mask]

    # Positive edges.
    zero_indices = (edge_attr == 0).nonzero()
    one_indices = (edge_attr == 1).nonzero()
    two_indices = (edge_attr == 2).nonzero()

    train_tensor = zero_indices.flatten()
    val_tensor = one_indices.flatten()
    test_tensor = two_indices.flatten()
    
    r, c = row[val_tensor], col[val_tensor]
    data.val_pos_edge_index = torch.stack([r, c], dim=0)

    r, c = row[test_tensor], col[test_tensor]
    data.test_pos_edge_index = torch.stack([r, c], dim=0)

    r, c = row[train_tensor], col[train_tensor]
    data.train_pos_edge_index = torch.stack([r, c], dim=0)
    if edge_attr is not None:
        out = to_undirected(data.train_pos_edge_index, edge_attr[train_tensor])
        data.train_pos_edge_index, _ = out

    # Negative edges.
    n_v = val_tensor.size(0) * sr
    n_t = test_tensor.size(0) * sr
    neg_adj_mask = torch.ones(num_nodes, num_nodes, dtype=torch.uint8)
    neg_adj_mask = neg_adj_mask.triu(diagonal=1).to(torch.bool)
    neg_adj_mask[row, col] = 0

    neg_row, neg_col = neg_adj_mask.nonzero(as_tuple=False).t()
    perm = torch.randperm(neg_row.size(0) * sr)
    neg_row, neg_col = neg_row[perm], neg_col[perm]

    row, col = neg_row[:n_v], neg_col[:n_v]
    data.val_neg_edge_index = torch.stack([row, col], dim=0)

    row, col = neg_row[n_v:n_v + n_t], neg_col[n_v:n_v + n_t]
    data.test_neg_edge_index = torch.stack([row, col], dim=0)

    row, col = neg_row[n_v + n_t:], neg_col[n_v + n_t:]
    data.train_neg_edge_index = torch.stack([row, col], dim=0)
    
    return data

In [16]:
data = train_test_split_edges(data)

In [17]:
data

Data(num_nodes=841, edge_index=[2, 331604], edge_attr=[331604, 1], val_pos_edge_index=[2, 23544], test_pos_edge_index=[2, 33028], train_pos_edge_index=[2, 137516], train_pos_edge_attr=[137516, 1], val_neg_edge_index=[2, 23544], test_neg_edge_index=[2, 33028], train_neg_edge_index=[2, 213747])

In [18]:
train = {"edge": data["train_pos_edge_index"], "edge_neg": data["train_neg_edge_index"]}
val = {"edge": data["val_pos_edge_index"], "edge_neg": data["val_neg_edge_index"]}
test = {"edge": data["test_pos_edge_index"], "edge_neg": data["test_neg_edge_index"]}

In [14]:
train

{'edge': tensor([[  0,   0,   0,  ..., 840, 840, 840],
         [ 31,  35,  38,  ..., 831, 832, 836]]),
 'edge_neg': tensor([], size=(2, 0), dtype=torch.int64)}

In [19]:
base_path = "../input/dataset/ogbl_ddi/split/body"
torch.save(train, f"{base_path}/train.pt")
torch.save(val, f"{base_path}/val.pt")
torch.save(test, f"{base_path}/test.pt")

# Random

In [2]:
data = read_graph_pyg(
        "../input/ogbl_ddi/raw2/",
        add_inverse_edge=True,
        additional_node_files=[],
        additional_edge_files=[],
        binary=False,
)[0]

Loading necessary files...
This might take a while.
Processing graphs...


100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 115.43it/s]


Converting graphs into PyG objects...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6553.60it/s]


In [3]:
data["edge_index"]

tensor([[   1, 3005,    1,  ...,  466, 3006,  345],
        [3005,    1, 2958,  ..., 3005,  345, 3006]])

In [5]:
drug = read_graph_pyg(
        "../input/dataset/ogbl_ddi/raw",
        add_inverse_edge=False,
        additional_node_files=[],
        additional_edge_files=[],
        binary=False,
)[0]

Loading necessary files...
This might take a while.
Processing graphs...


100%|█████████████████████████████| 1/1 [00:00<00:00, 24528.09it/s]


Converting graphs into PyG objects...


100%|██████████████████████████████| 1/1 [00:00<00:00, 5983.32it/s]


In [6]:
drug.validate()

True

In [8]:
for ratio in [1,2,3,4,5]:
    transform = RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        is_undirected=True,
        neg_sampling_ratio=ratio,
        add_negative_train_samples=True,
        split_labels=True,
    )
    train_data, val_data, test_data = transform(drug)
    base_path = f"../input/dataset/ogbl_ddi/split/sr{ratio}"
    train = {"edge": train_data["pos_edge_label_index"], "edge_neg": train_data["neg_edge_label_index"]}
    val = {"edge": val_data["pos_edge_label_index"], "edge_neg": val_data["neg_edge_label_index"]}
    test = {"edge": test_data["pos_edge_label_index"], "edge_neg": test_data["neg_edge_label_index"]}
    torch.save(train, f"{base_path}/train.pt")
    torch.save(val, f"{base_path}/val.pt")
    torch.save(test, f"{base_path}/test.pt")

