<a href="https://colab.research.google.com/github/SheidaMajouni/CT-scan-classifier---R-language---torch/blob/main/2_transductiveEdgeSplitHeteroData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.0


In [3]:
import torch
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from torch_geometric.data import Data
import pandas as pd

In [4]:
# Read the pre-built heterogeneous graph from disk.
# NOTE(security): the file stores a pickled HeteroData object, so loading it
# executes pickle code -- only load 'data.pt' files that come from a trusted source.
# `weights_only=False` is passed explicitly because PyTorch >= 2.6 defaults to
# `weights_only=True`, which refuses to unpickle non-tensor objects such as HeteroData.
data = torch.load('data.pt', weights_only=False)

print(data)

HeteroData(
  donor={ x=[6, 3] },
  recipient={ x=[6, 3] },
  (recipient, match, donor)={
    edge_index=[2, 21],
    edge_weight=[21],
    edge_label=[21],
  }
)


In [None]:
# We can now convert `data` into an appropriate format for training a
# graph-based machine learning model:

# # 2. Perform a link-level split into training, validation, and test edges.
# transform = RandomLinkSplit(
#     num_val=0.05,
#     num_test=0.1,
#     neg_sampling_ratio=0.0,
#     edge_types=[('recipient', 'match', 'donor')],
#     rev_edge_types=[('donor', 'rev_match', 'recipient')],
# )
# train_data, val_data, test_data = transform(data)
# print(train_data)
# print(train_data['recipient', 'match', 'donor'].edge_index)
# print(train_data['recipient', 'match', 'donor'].edge_label)
# print(train_data['recipient', 'match', 'donor'].edge_attr)
# print(val_data)
# print(test_data)

In [None]:
"""
The problem with RandomLinkSplit is that it treats every existing edge as a positive example.
We instead want to use the second edge_attr column to determine whether an edge is positive or negative.
First, we have to filter the edges (pairs of nodes) that have weight = 3, because these are the
only ones for which we have a survival label, and they are the only edges we care about predicting.
These weight-3 edges are split into train, val, and test sets. All the other edges should be
available for message passing (transductive link prediction split):
    * all edges with weight 1 and 2: blue, message passing
    * weight-3 train edges: red, training supervision
    * weight-3 val edges: yellow, validation
    * weight-3 test edges: green, test
"""

In [5]:
# Pull the connectivity and the per-edge weights of the
# ('recipient', 'match', 'donor') relation out of the loaded graph.
match_edges = data['recipient', 'match', 'donor']
edge_index, edge_weight = match_edges.edge_index, match_edges.edge_weight


In [6]:
# Partition the edge columns by weight: weights up to 2 are used for message
# passing only, weight 3 marks the edges that are used for supervision.
supervision_mask = edge_weight == 3
message_mask = edge_weight <= 2
supervised_edges = edge_index[:, supervision_mask]
message_passing_edges = edge_index[:, message_mask]


In [7]:
# Randomly permute the supervised edges.
# A dedicated, seeded generator is used so that the train/val/test assignment
# is reproducible after a kernel restart (an unseeded torch.randperm produces
# a different split on every run).
SPLIT_SEED = 42
TRAIN_END_FRAC = 0.8  # first 80% of the shuffled edges -> training
VAL_END_FRAC = 0.9    # next 10% -> validation, the remainder -> test

num_supervised = supervised_edges.size(1)
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
perm = torch.randperm(num_supervised, generator=split_generator)

# Split the supervised edges into training, validation, and test sets
train_end = int(TRAIN_END_FRAC * num_supervised)
val_end = int(VAL_END_FRAC * num_supervised)
train_edges = supervised_edges[:, perm[:train_end]]
val_edges = supervised_edges[:, perm[train_end:val_end]]
test_edges = supervised_edges[:, perm[val_end:]]


In [8]:
# Column positions (within edge_index) of every edge subset.
#
# The positions are derived directly from the weight masks and from the split
# permutation instead of searching edge_index for matching (src, dst) pairs.
# For unique edges this gives the same indices in the same order, but it also
#   * works when the same (src, dst) pair occurs more than once
#     (the per-edge search + .item() raises in that case),
#   * always yields integer index tensors, even for an empty subset,
#   * avoids comparing every edge against the whole edge list.
mp_indices = torch.nonzero(edge_weight <= 2).view(-1)

# Position of each supervised edge, in the same column order as supervised_edges.
supervised_indices = torch.nonzero(edge_weight == 3).view(-1)

# Re-use the sizes of the already split edge tensors so the index ranges line
# up with train_edges / val_edges / test_edges whatever split ratios were used.
num_train = train_edges.size(1)
num_val = val_edges.size(1)
train_indices = supervised_indices[perm[:num_train]]
val_indices = supervised_indices[perm[num_train:num_train + num_val]]
test_indices = supervised_indices[perm[num_train + num_val:]]


In [9]:
# Wrap each index tensor in a {edge_type: indices} mapping, one per edge subset.
edge_type = ('recipient', 'match', 'donor')
message_passing_dict, train_edge_dict, val_edge_dict, test_edge_dict = (
    {edge_type: subset_indices}
    for subset_indices in (mp_indices, train_indices, val_indices, test_indices)
)


In [10]:
# Build one HeteroData sub-graph per edge subset: the message passing edges
# and the train / validation / test supervision edges.
# Which edges end up in which supervision subset depends on the random split.
mp_data, train_super_data, val_super_data, test_super_data = (
    data.edge_subgraph(subset_dict)
    for subset_dict in (message_passing_dict, train_edge_dict, val_edge_dict, test_edge_dict)
)


In [11]:
# Training graph: message passing edges as connectivity, with the training
# supervision edges attached as edge_label / edge_label_index.
train_data = mp_data.__copy__()
train_store = train_data['recipient', 'match', 'donor']
train_supervision = train_super_data['recipient', 'match', 'donor']
train_store.edge_label = train_supervision.edge_label
train_store.edge_label_index = train_supervision.edge_index


In [12]:
# Validation graph: connectivity = message passing edges + training supervision
# edges; the validation supervision edges become edge_label / edge_label_index.
val_data = mp_data.__copy__()
val_target_store = val_data['recipient', 'match', 'donor']
mp_store = mp_data['recipient', 'match', 'donor']
train_sup_store = train_super_data['recipient', 'match', 'donor']
val_sup_store = val_super_data['recipient', 'match', 'donor']

val_target_store.edge_index = torch.cat(
    [mp_store.edge_index, train_sup_store.edge_index], dim=1
)
val_target_store.edge_weight = torch.cat(
    [mp_store.edge_weight, train_sup_store.edge_weight], dim=0
)
val_target_store.edge_label = val_sup_store.edge_label
val_target_store.edge_label_index = val_sup_store.edge_index


In [13]:
# Test graph: connectivity = validation connectivity (message passing + training
# supervision edges) + validation supervision edges; the test supervision edges
# become edge_label / edge_label_index.
test_data = val_data.__copy__()
test_target_store = test_data['recipient', 'match', 'donor']
val_graph_store = val_data['recipient', 'match', 'donor']
val_held_out_store = val_super_data['recipient', 'match', 'donor']
test_held_out_store = test_super_data['recipient', 'match', 'donor']

test_target_store.edge_index = torch.cat(
    [val_graph_store.edge_index, val_held_out_store.edge_index], dim=1
)
test_target_store.edge_weight = torch.cat(
    [val_graph_store.edge_weight, val_held_out_store.edge_weight], dim=0
)
test_target_store.edge_label = test_held_out_store.edge_label
test_target_store.edge_label_index = test_held_out_store.edge_index


In [14]:
def add_reverse_match_edges(split_name, split_data):
    """Add the reverse relation to one split and print a short report.

    Applies ``ToUndirected`` to ``split_data``, which adds a
    ('donor', 'rev_match', 'recipient') relation for message passing, removes
    ``edge_label`` from that reverse relation, and prints the resulting graph
    together with its message passing and supervision edges.

    Args:
        split_name: Prefix used in the printed report ("train", "val" or "test").
        split_data: Graph of the split, holding the ('recipient', 'match', 'donor')
            relation with ``edge_index`` and ``edge_label_index``.

    Returns:
        The graph returned by ``ToUndirected`` with the reverse label removed.
    """
    undirected = ToUndirected()(split_data)
    # Labels belong to the forward relation only; remove the "reverse" label.
    del undirected['donor', 'rev_match', 'recipient'].edge_label

    match_store = undirected['recipient', 'match', 'donor']
    print(f"{split_name}_data")
    print(undirected)
    print(f"{split_name}_ms_edges:", match_store.edge_index)
    print(f"{split_name}_super_edges:", match_store.edge_label_index)
    return undirected


# 1. Add a reverse ('donor', 'rev_match', 'recipient') relation for message passing.
train_data = add_reverse_match_edges("train", train_data)
val_data = add_reverse_match_edges("val", val_data)
test_data = add_reverse_match_edges("test", test_data)


train_data
HeteroData(
  donor={ x=[6, 3] },
  recipient={ x=[6, 3] },
  (recipient, match, donor)={
    edge_index=[2, 15],
    edge_weight=[15],
    edge_label=[4],
    edge_label_index=[2, 4],
  },
  (donor, rev_match, recipient)={
    edge_index=[2, 15],
    edge_weight=[15],
  }
)
train_ms_edges: tensor([[1, 4, 4, 1, 4, 0, 1, 2, 4, 5, 0, 1, 1, 2, 4],
        [0, 0, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5]])
train_super_edges: tensor([[4, 2, 3, 5],
        [4, 2, 3, 5]])
val_data
HeteroData(
  donor={ x=[6, 3] },
  recipient={ x=[6, 3] },
  (recipient, match, donor)={
    edge_index=[2, 19],
    edge_weight=[19],
    edge_label=[1],
    edge_label_index=[2, 1],
  },
  (donor, rev_match, recipient)={
    edge_index=[2, 19],
    edge_weight=[19],
  }
)
val_ms_edges: tensor([[1, 4, 4, 1, 4, 0, 1, 2, 4, 5, 0, 1, 1, 2, 4, 4, 2, 3, 5],
        [0, 0, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 4, 2, 3, 5]])
val_super_edges: tensor([[1],
        [1]])
test_data
HeteroData(
  donor={ x=[6, 3] },
