In [97]:
!pip install torch_geometric > /dev/null

In [98]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv, HeteroConv

In [99]:
from torch_geometric.data import HeteroData

In [100]:
from torch_geometric.datasets import AMiner
dataset = AMiner(root="/content/")


In [101]:
data = dataset[0]
print(data)
print("-----")
homogeneous_data = data.to_homogeneous()
hd = homogeneous_data # for short
print(homogeneous_data)
# this issue with the data rn is that there are 3441217 nodes in total, and y vector is storing a value for each of those, even though only 246812 of them have labels.
# this means that only 246812/3441217 (7%) of the entries in the y vector are real labels
# we need to filter out the unlabeled author nodes, the unlabeled venue nodes, and all of the paper nodes (since no paper nodes have labels) in order to form a suitable y vector
# for the loss comparison

HeteroData(
  author={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531,
  },
  venue={
    y=[134],
    y_index=[134],
    num_nodes=3883,
  },
  paper={ num_nodes=3194405 },
  (paper, written_by, author)={ edge_index=[2, 9323605] },
  (author, writes, paper)={ edge_index=[2, 9323605] },
  (paper, published_in, venue)={ edge_index=[2, 3194405] },
  (venue, publishes, paper)={ edge_index=[2, 3194405] }
)
-----
Data(edge_index=[2, 25036020], y=[3441217], y_index=[3441217], node_type=[4891819], edge_type=[25036020])


In [102]:
# Getting suitable y vector for loss calculation: remove the placeholder labels (-1) for unlabeled nodes
print(f"homogeneous y started out with size: {hd.y.shape}")
processed_y = hd.y[hd.y != -1]
print(f"After removing the placeholder labels, homogeneous y now has size: {processed_y.shape}")
data.y = processed_y

homogeneous y started out with size: torch.Size([3441217])
After removing the placeholder labels, homogeneous y now has size: torch.Size([246812])


In [103]:
# Assuming we already have the logits from the model
logits = torch.rand(hd.y.shape[0], hd.y.max() + 1) # using hd.y.max() + 1 to include all labels up to hd.y.max() and the 0 label
print(f"shape of logits before removing logits without labels: {logits.shape}")

# reorder the logits to match hd.y and remove the logits that match to placeholder labels
logits = logits[hd.y_index][hd.y != -1]

print(f"shape of logits after masking out the ones without labels: {logits.shape}")
print(f"now the logits with shape: {logits.shape} are aligned with the y labels with shape: {processed_y.shape}")

shape of logits before removing logits without labels: torch.Size([3441217, 8])
shape of logits after masking out the ones without labels: torch.Size([246812, 8])
now the logits with shape: torch.Size([246812, 8]) are aligned with the y labels with shape: torch.Size([246812])


In [104]:
# Separate nodes by type
author_nodes = torch.nonzero(hd.node_type == 0).squeeze()
venue_nodes = torch.nonzero(hd.node_type == 1).squeeze()
paper_nodes = torch.nonzero(hd.node_type == 2).squeeze()

print(f"shape of author nodes: {author_nodes.shape}")
print(f"shape of venue nodes: {venue_nodes.shape}")
print(f"shape of paper nodes: {paper_nodes.shape}")

shape of author nodes: torch.Size([1693531])
shape of venue nodes: torch.Size([3883])
shape of paper nodes: torch.Size([3194405])


In [105]:
# Create feature vectors as one-hot encodings for each of the nodes
# For now they are stored in sparse format to conserve memory
import scipy
import numpy as np
author_nodes_features = scipy.sparse.eye(author_nodes.shape[0], format='coo')
venue_nodes_features = scipy.sparse.eye(venue_nodes.shape[0], format='coo')
paper_nodes_features = scipy.sparse.eye(paper_nodes.shape[0], format='coo')

# move these to torch
def scipy_coo_to_torch_sparse(scipy_coo):
    values = torch.FloatTensor(scipy_coo.data)
    indices = torch.LongTensor(np.vstack((scipy_coo.row, scipy_coo.col)))
    shape = torch.Size(scipy_coo.shape)
    return torch.sparse_coo_tensor(indices, values, shape)

# Convert SciPy sparse matrices to PyTorch sparse tensors
author_nodes_features_torch = scipy_coo_to_torch_sparse(author_nodes_features)
venue_nodes_features_torch = scipy_coo_to_torch_sparse(venue_nodes_features)
paper_nodes_features_torch = scipy_coo_to_torch_sparse(paper_nodes_features)

# checking
print(f"shape of author nodes features: {author_nodes_features_torch.shape}")
print(f"shape of venue nodes features: {venue_nodes_features_torch.shape}")
print(f"shape of paper nodes features: {paper_nodes_features_torch.shape}")



# pooling all nodes together
all_nodes_features = scipy.sparse.eye(hd.node_type.shape[0], format='coo')
all_nodes_features_torch = scipy_coo_to_torch_sparse(all_nodes_features)
print(all_nodes_features_torch.shape)
data.x = all_nodes_features_torch


shape of author nodes features: torch.Size([1693531, 1693531])
shape of venue nodes features: torch.Size([3883, 3883])
shape of paper nodes features: torch.Size([3194405, 3194405])
torch.Size([4891819, 4891819])


In [106]:

def make_training_test_validation_masks(y, train_ratio=0.1, validation_ratio=0.1, test_ratio=0.8):
  num_train = int(y.shape[0] * train_ratio)
  num_val = int(y.shape[0] * validation_ratio)
  num_test = int(y.shape[0] * test_ratio)
  num_test += 1 # the sum of the train/test/validation splits would otherwise be less than the number of y values

  train_counts = [0]*(torch.max(y).item() + 1) # each index of the list represents the number of that index's class label
  val_counts = [0]*(torch.max(y).item() +1 )
  test_counts = [0]*(torch.max(y).item() + 1)

  train_mask = []
  val_mask = []
  test_mask = []
  for y_value in y:
    y_value = y_value.item()
    ratios = {"train" : train_counts[y_value]/num_train, "val" : val_counts[y_value]/num_val, "test" : test_counts[y_value]/num_test}
    priority = min(ratios, key=ratios.get)

    train_mask.append(False)
    val_mask.append(False)
    test_mask.append(False)


    if priority == "train":
      train_mask[-1] = True
      train_counts[y_value] += 1

    elif priority == "val":
      val_mask[-1] = True
      val_counts[y_value] += 1

    elif priority == "test":
      test_mask[-1] = True
      test_counts[y_value] +=1

  # this demonstarates that the classes labels are split evenly over the masks
  index = 0
  for train, val, test in zip(train_counts, val_counts, test_counts):
    print(f"number of label [{index}] in train: {train}")
    print(f"number of label [{index}] in val: {val}")
    print(f"number of label [{index}] in test: {test}")
    print("------")
    index += 1

  return train_mask, val_mask, test_mask


train_mask, val_mask, test_mask = make_training_test_validation_masks(processed_y)

train_mask = torch.tensor(train_mask, dtype=torch.bool)
val_mask = torch.tensor(val_mask, dtype=torch.bool)
test_mask = torch.tensor(test_mask, dtype=torch.bool)

print(f"number of train mask labels: {train_mask.nonzero().shape[0]}")
print(f"number of validation mask labels: {val_mask.nonzero().shape[0]}")
print(f"number of test mask labels: {test_mask.nonzero().shape[0]}")
# this results in 246812 total true mask values

number of label [0] in train: 2893
number of label [0] in val: 2893
number of label [0] in test: 23143
------
number of label [1] in train: 6879
number of label [1] in val: 6879
number of label [1] in test: 55033
------
number of label [2] in train: 1800
number of label [2] in val: 1800
number of label [2] in test: 14393
------
number of label [3] in train: 1593
number of label [3] in val: 1593
number of label [3] in test: 12742
------
number of label [4] in train: 2615
number of label [4] in val: 2615
number of label [4] in test: 20914
------
number of label [5] in train: 1757
number of label [5] in val: 1757
number of label [5] in test: 14057
------
number of label [6] in train: 4573
number of label [6] in val: 4573
number of label [6] in test: 36583
------
number of label [7] in train: 2573
number of label [7] in val: 2573
number of label [7] in test: 20581
------
number of train mask labels: 24683
number of validation mask labels: 24683
number of test mask labels: 197446


In [107]:
### arguments.py File ###
### Modified for colab since colab does not seem to like argparse ###

# Defined custom class to hold arguments
class Args:
  def __init__(self):
    self.root_dir = "/content"
    self.data_dir = "/content/data"
    self.epochs = 300
    self.runs = 5
    self.droput = 0.4
    self.lr = 0.001
    self.wd = 0.001
    self.num_layers = 2
    self.num_hidden = 256
    self.num_features = 0 # placeholder
    self.num_classes = 0 # placeholder

def add_data_features(args, data):
  args.num_features = data.x.shape[1]
  args.num_classes = data.y.shape[0]
  return args

In [108]:
"""

# Display data that was originally used for GCN to get an idea of how to convert new data to its format
from torch_geometric.datasets import Planetoid

args = Args()
dataset = Planetoid(root=args.root_dir, name="Cora")
print(dataset[0])
"""

'\n\n# Display data that was originally used for GCN to get an idea of how to convert new data to its format\nfrom torch_geometric.datasets import Planetoid\n\nargs = Args()\ndataset = Planetoid(root=args.root_dir, name="Cora")\nprint(dataset[0])\n'

In [109]:
### model.py File ###

def make_layers(self):
    layers = []
    # initialize layers in a loop that uses conditionals to determine the input and output dimensions of the feature vectors
    for i in range(self.num_layers):
        if i == 0:  # first layer
            # dimensions in = input data size
            # dimensions out = hidden layer size
            layer = GCNConv(self.num_features, self.num_hidden)

        elif i < self.num_layers - 1: # hidden layer(s)
            # dimensions in = hidden layer size
            # dimensions out = hidden layer size
            layer = GCNConv(self.num_hidden, self.num_hidden)

        else:  # output layer
            # dimensions in = hidden layer size
            # dimensions out = output size
            layer = GCNConv(self.num_hidden, self.num_classes)

        layers.append(layer)

    return nn.ModuleList(layers)

class GCN_model(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.num_features = args.num_features
        self.num_layers = args.num_layers
        self.num_hidden = args.num_hidden
        self.num_classes = args.num_classes
        self.wd = args.wd
        self.lr = args.lr
        self.layers = make_layers(self)

    def forward(self, x, edge_idx):
        for i, layer in enumerate(self.layers):
            # apply the convolutional layer
            x = layer(x, edge_idx)

            # Since I did not apply the activation function in the Layers array, I apply it using conditionals (to decide relu or softmax) here
            if i != len(self.layers) - 1:
                x = F.relu(x)
            else:
                x = F.log_softmax(x, dim = 1)

        return x

In [111]:
### main.py File ###

def train(model, X, Y, data):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr = model.lr, weight_decay = model.wd)
    optimizer.zero_grad()
    activations = model(X, data.edge_index)

    # only calculate loss on train labels!!
    loss = F.nll_loss(activations[train_mask], Y[train_mask])
    loss.backward()
    optimizer.step()

def get_masked_acc(activations, y_true, mask):
    length = activations[mask].shape[0]
    correct = 0
    for yhat, y in zip(activations[mask], y_true[mask]):
        if torch.argmax(yhat) == y:
            correct += 1

    return correct / length

def get_accuracy(activations, y_true, data):
    train_acc = get_masked_acc(activations, y_true, data.train_mask)
    test_acc = get_masked_acc(activations, y_true, data.test_mask)
    val_acc = get_masked_acc(activations, y_true, data.val_mask)
    return train_acc, test_acc, val_acc

def main():
    # use gpu if possible (works most of the time here on colab)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Device: {device}")

    # get data
    data = hd.to(device)
    y = processed_y

    # get preferences
    args = Args()
    args = add_data_features(args, data)


    for run in range(args.runs):
        # initialize model
        model = GCN_model(args).to(device)
        print("\n------------ new model ------------\n")
        for epoch in range(args.epochs):
          # log loss every 50 steps
            if epoch % 50 == 0 or epoch == args.epochs - 1:
                model.eval()
                activations = model(x, hd.edge_index)
                loss = F.nll_loss(activations, y)
                train_acc, test_acc, val_acc = get_accuracy(activations, y, data)
                print(f" Epoch: {epoch} | Total Loss: {loss} | Train Accuracy: {train_acc} | Test Accuracy: {test_acc} | Val Accuracy: {val_acc}")

            # backprop & update
            train(model, x, y, data)

if __name__ == '__main__':
    main()

Device: cpu


AttributeError: 'NoneType' object has no attribute 'shape'