In [2]:
!pip install torch_geometric > /dev/null

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv, HeteroConv

In [4]:
from torch_geometric.data import HeteroData

In [6]:
from torch_geometric.datasets import AMiner
dataset = AMiner(root="/content/")


Downloading https://www.dropbox.com/s/1bnz8r7mofx0osf/net_aminer.zip?dl=1
Extracting /content/net_aminer.zip
Downloading https://www.dropbox.com/s/nkocx16rpl4ydde/label.zip?dl=1
Extracting /content/raw/label.zip
Processing...
Done!


In [22]:
data = dataset[0]
print(data)
print("-----")
homogeneous_data = data.to_homogeneous()
hd = homogeneous_data # for short
print(homogeneous_data)
# this issue with the data rn is that there are 3441217 nodes in total, and y vector is storing a value for each of those, even though only 246812 of them have labels.
# this means that only 246812/3441217 (7%) of the entries in the y vector are real labels
# we need to filter out the unlabeled author nodes, the unlabeled venue nodes, and all of the paper nodes (since no paper nodes have labels) in order to form a suitable y vector
# for the loss comparison

HeteroData(
  author={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531,
  },
  venue={
    y=[134],
    y_index=[134],
    num_nodes=3883,
  },
  paper={ num_nodes=3194405 },
  (paper, written_by, author)={ edge_index=[2, 9323605] },
  (author, writes, paper)={ edge_index=[2, 9323605] },
  (paper, published_in, venue)={ edge_index=[2, 3194405] },
  (venue, publishes, paper)={ edge_index=[2, 3194405] }
)
-----
Data(edge_index=[2, 25036020], y=[3441217], y_index=[3441217], node_type=[4891819], edge_type=[25036020])


In [27]:
# Getting suitable y vector for loss calculation: remove the placeholder labels (-1) for unlabeled nodes
print(f"homogeneous y started out with size: {hd.y.shape}")
processed_y = hd.y[hd.y != -1]
print(f"After removing the placeholder labels, homogeneous y now has size: {processed_y.shape}")

homogeneous y started out with size: torch.Size([3441217])
After removing the placeholder labels, homogeneous y now has size: torch.Size([246812])


In [43]:
# Assuming we already have the logits from the model
logits = torch.rand(hd.y.shape[0], hd.y.max() + 1) # using hd.y.max() + 1 to include all labels up to hd.y.max() and the 0 label
print(f"shape of logits before removing logits without labels: {logits.shape}")

# reorder the logits to match hd.y and remove the logits that match to placeholder labels
logits = logits[hd.y_index][hd.y != -1]

print(f"shape of logits after masking out the ones without labels: {logits.shape}")
print(f"now the logits with shape: {logits.shape} are aligned with the y labels with shape: {processed_y.shape}")

shape of logits before removing logits without labels: torch.Size([3441217, 8])
shape of logits after masking out the ones without labels: torch.Size([246812, 8])
now the logits with shape: torch.Size([246812, 8]) are aligned with the y labels with shape: torch.Size([246812])
