In [104]:
!pip install torch_geometric > /dev/null
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from torch_geometric.datasets import DBLP
import numpy as np
import scipy.sparse as sp
from torch_geometric.utils import to_torch_sparse_tensor

In [105]:
### arguments.py File ###
### Modified for colab since colab does not seem to like argparse ###

# Defined custom class to hold arguments
class Args:
  def __init__(self):
    self.root_dir = "/content"
    self.data_dir = "/content/data"
    self.epochs = 300
    self.runs = 5
    self.droput = 0.4
    self.lr = 0.001
    self.wd = 0.001
    self.num_layers = 2
    self.num_hidden = 256
    self.num_features = 0 # placeholder
    self.num_classes = 0 # placeholder

def add_data_features(args, data):
  args.num_features = data.x.shape[1]
  args.num_classes = data.y.shape[0]
  return args

In [106]:
### data.py File ###
args = Args()
dataset = DBLP(root=args.root_dir)
data = dataset[0]
print(data)

HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={ num_nodes=20 },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] }
)


In [107]:
# Printing the data's node and edge types
for item in data.metadata():
  print(item)

['author', 'paper', 'term', 'conference']
[('author', 'to', 'paper'), ('paper', 'to', 'author'), ('paper', 'to', 'term'), ('paper', 'to', 'conference'), ('term', 'to', 'paper'), ('conference', 'to', 'paper')]


# Creating Three Sets of Metapaths

In [108]:
# Creating (Author -> Paper -> Author) Metapaths
"""
1. Collect edge indices for author->paper and paper->author
2. Convert edge indices to torch sparse tensors
3. Matrix multiply ap_dense and pa_dense to get apa_dense
4. Bring apa_dense back down to sparse and set all nonzero values to 1 to represent connections
"""
# 1
num_authors = data["author"].x.shape[0]
num_papers = data["paper"].x.shape[0]
ap = data[('author', 'to', 'paper')].edge_index
pa = ap.T
# 2
ap_dense_adj = to_torch_sparse_tensor(ap, size=(num_authors, num_papers))
pa_dense_adj = ap_dense_adj.T
print(type(ap), ap)
print()
print(type(ap_dense_adj), ap_dense_adj)
print()
print(f"matrices to be multiplied: {list(ap_dense_adj.shape)}, {list(pa_dense_adj.shape)}") # will end up being a 4057 x 4057 matrix bc it will have all mps starting and ending with author, that go through a paper
# 3
apa_dense_adj = ap_dense_adj @ pa_dense_adj
apa_dense_adj
# 4
apa_sparse_adj = apa_dense_adj.indices().to_sparse().coalesce()
apa_sparse_adj.indices()

<class 'torch.Tensor'> tensor([[    0,     0,     1,  ...,  4054,  4055,  4056],
        [ 2364,  6457,  2365,  ..., 13891, 13891, 13892]])

<class 'torch.Tensor'> tensor(indices=tensor([[    0,     0,     1,  ...,  4054,  4055,  4056],
                       [ 2364,  6457,  2365,  ..., 13891, 13891, 13892]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       size=(4057, 14328), nnz=19645, layout=torch.sparse_coo)

matrices to be multiplied: [4057, 14328], [14328, 4057]


tensor([[    0,     0,     0,  ...,     1,     1,     1],
        [    1,     2,     3,  ..., 11110, 11111, 11112]])

In [160]:
# Creating (Author -> Paper -> Conference -> Paper -> Author) Metapaths
"""
1. Collect edge indices for (author->paper) (paper->conference) (conference->paper) (paper->author)
2. Convert edge indices to torch sparse tensors
3. Multiply edge indices from (Author -> Paper -> Conference) and (Conference -> Paper -> Author) -- this should be author x author in size
4. Bring apcpa back down to sparse and set all nonzero values to 1 to represent connections
"""
# 1.
num_conferences = data["conference"].num_nodes
pc = data[('paper', 'to', 'conference')].edge_index
# 2.
pc_dense_adj = to_torch_sparse_tensor(pc, size=(num_papers, num_conferences))
# 3.
print(f"Multiplying: apc = {list(ap_dense_adj.shape)}, {list(pc_dense_adj.shape)} which yields a num_authors ({num_authors}) by num_conferences ({num_conferences}) matrix")
apc_dense_adj = ap_dense_adj @ pc_dense_adj
print(f"multiplying apc with its transpose to obtain apcpa which is shape (author x author) ie ({num_authors} x {num_authors})")
apcpa_dense_adj = apc_dense_adj @ apc_dense_adj.T
# 4.
apcpa_sparse_adj = apcpa_dense_adj.indices().to_sparse().coalesce()
apcpa_sparse_adj.indices()

Multiplying: apc = [4057, 14328], [14328, 20] which yields a num_authors (4057) by num_conferences (20) matrix
multiplying apc with its transpose to obtain apcpa which is shape (author x author) ie (4057 x 4057)


tensor([[      0,       0,       0,  ...,       1,       1,       1],
        [   1037,    1038,    1039,  ..., 5000492, 5000493, 5000494]])

In [161]:
# Creating (Author -> Paper -> Term -> Paper -> Author) Metapaths
"""
1. Collect edge indices for (author->paper) (paper->term) (term->paper) (paper->author)
2. Convert edge indices to torch sparse tensors
3. Multiply edge indices from (Author -> Paper -> Term) and (Term -> Paper -> Author) -- this should be author x author in size
4. Bring aptpa back down to sparse and set all nonzero values to 1 to represent connections
"""
# 1.
num_terms = data["term"].x.shape[0]
pt = data[('paper', 'to', 'term')].edge_index
# 2.
pt_dense_adj = to_torch_sparse_tensor(pt, size=(num_papers, num_terms))
# 3.
print(f"Multiplying: apt = {list(ap_dense_adj.shape)}, {list(pt_dense_adj.shape)} which yields a num_authors ({num_authors}) by num_conferences ({num_terms}) matrix")
apt_dense_adj = ap_dense_adj @ pt_dense_adj
print(f"multiplying apc with its transpose to obtain apcpa which is shape (author x author) ie ({num_authors} x {num_authors})")
aptpa_dense_adj = apt_dense_adj @ apt_dense_adj.T
# 4.
aptpa_sparse_adj = aptpa_dense_adj.indices().to_sparse().coalesce()
aptpa_sparse_adj

Multiplying: apt = [4057, 14328], [14328, 7723] which yields a num_authors (4057) by num_conferences (7723) matrix
multiplying apc with its transpose to obtain apcpa which is shape (author x author) ie (4057 x 4057)


tensor(indices=tensor([[      0,       0,       0,  ...,       1,       1,
                              1],
                       [   1339,    1340,    1341,  ..., 7043568, 7043569,
                        7043570]]),
       values=tensor([   1,    1,    1,  ..., 4043, 4045, 4056]),
       size=(2, 7043571), nnz=14084464, layout=torch.sparse_coo)

In [155]:
print(aptpa_sparse_adj.indices()[0].max())

tensor(1)


In [136]:
# Exploring the metapaths
num_apa = (apa_dense_adj.to_dense() == 1).sum().item()
num_apcpa = (apcpa_dense_adj.to_dense() == 1).sum().item()
num_aptpa = (aptpa_dense_adj.to_dense() == 1).sum().item()
print(num_apa, num_apcpa, num_aptpa)

5615 1568211 2527692


# Converting the Metapaths into Usable Datasets

In [145]:
# Creating the apa dataset

apa_data = torch_geometric.data.HeteroData()
apa_data["author"].x = [author for author in data["author"].x if author in ]

{}

In [138]:
### Data Modification (heterogeneous --> homogeneous)
print("Original (heterogeneous) dataset summary: ")
print(data)
print("\n --------------- \n")
hd = data.to_homogeneous()
print("New (homogeneous) dataset summary: ")
print(hd)

Original (heterogeneous) dataset summary: 
HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={ num_nodes=20 },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] }
)

 --------------- 

New (homogeneous) dataset summary: 
Data(edge_index=[2, 239566], y=[26128], train_mask=[26128], val_mask=[26128], test_mask=[26128], node_type=[26128], edge_type=[239566])


In [None]:
### model.py File ###

def make_layers(self):
    layers = []
    # initialize layers in a loop that uses conditionals to determine the input and output dimensions of the feature vectors
    for i in range(self.num_layers):
        if i == 0:  # first layer
            # dimensions in = input data size
            # dimensions out = hidden layer size
            layer = GCNConv(self.num_features, self.num_hidden)

        elif i < self.num_layers - 1: # hidden layer(s)
            # dimensions in = hidden layer size
            # dimensions out = hidden layer size
            layer = GCNConv(self.num_hidden, self.num_hidden)

        else:  # output layer
            # dimensions in = hidden layer size
            # dimensions out = output size
            layer = GCNConv(self.num_hidden, self.num_classes)

        layers.append(layer)

    return nn.ModuleList(layers)

class GCN_model(nn.Module):
    def __init__(self, args):
        super().__init__()
        self.num_features = args.num_features
        self.num_layers = args.num_layers
        self.num_hidden = args.num_hidden
        self.num_classes = args.num_classes
        self.wd = args.wd
        self.lr = args.lr
        self.layers = make_layers(self)

    def forward(self, x, edge_idx):
        for i, layer in enumerate(self.layers):
            # apply the convolutional layer
            x = layer(x, edge_idx)

            # Since I did not apply the activation function in the Layers array, I apply it using conditionals (to decide relu or softmax) here
            if i != len(self.layers) - 1:
                x = F.relu(x)
            else:
                x = F.log_softmax(x, dim = 1)

        return x


In [None]:
### main.py File ###

def train(model, X, Y, data):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr = model.lr, weight_decay = model.wd)
    optimizer.zero_grad()
    activations = model(X, data.edge_index)

    # only calculate loss on train labels!!
    loss = F.nll_loss(activations[data.train_mask], Y[data.train_mask])
    loss.backward()
    optimizer.step()

def get_masked_acc(activations, y_true, mask):
    length = activations[mask].shape[0]
    correct = 0
    for yhat, y in zip(activations[mask], y_true[mask]):
        if torch.argmax(yhat) == y:
            correct += 1

    return correct / length

def get_accuracy(activations, y_true, data):
    train_acc = get_masked_acc(activations, y_true, data.train_mask)
    test_acc = get_masked_acc(activations, y_true, data.test_mask)
    val_acc = get_masked_acc(activations, y_true, data.val_mask)
    return train_acc, test_acc, val_acc

def main():
    # use gpu if possible (works most of the time here on colab)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Device: {device}")

    # get data
    data = dataset[0].to(device)
    x = data.x
    y = data.y

    # get preferences
    args = Args()
    args = add_data_features(args, data)


    for run in range(args.runs):
        # initialize model
        model = GCN_model(args).to(device)
        print("\n------------ new model ------------\n")
        for epoch in range(args.epochs):
          # log loss every 50 steps
            if epoch % 50 == 0 or epoch == args.epochs - 1:
                model.eval()
                activations = model(x, data.edge_index)
                loss = F.nll_loss(activations, y)
                train_acc, test_acc, val_acc = get_accuracy(activations, y, data)
                print(f" Epoch: {epoch} | Total Loss: {loss} | Train Accuracy: {train_acc} | Test Accuracy: {test_acc} | Val Accuracy: {val_acc}")

            # backprop & update
            train(model, x, y, data)

if __name__ == '__main__':
    main()