In [2]:
import os
import time
import logging
import yaml
import ast
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import torch

from pprgo import utils, ppr
from pprgo.pprgo import PPRGo
from pprgo.train import train
from pprgo.predict import predict
from pprgo.dataset import PPRDataset
from pprgo.pytorch_utils import matrix_to_torch

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
# Set up logging
logger = logging.getLogger()
logger.handlers = []
ch = logging.StreamHandler()
formatter = logging.Formatter(
        fmt='%(asctime)s (%(levelname)s): %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.setLevel('INFO')

# Download dataset

In [4]:
#!wget --show-progress -O data/reddit.npz https://ndownloader.figshare.com/files/23742119

# Load config

In [5]:
with open('config_demo.yaml', 'r') as c:
    config = yaml.safe_load(c)

In [6]:
# For strings that yaml doesn't parse (e.g. None)
for key, val in config.items():
    if type(val) is str:
        try:
            config[key] = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            pass

In [7]:
data_file           = config['data_file']           # Path to the .npz data file
split_seed          = config['split_seed']        # Seed for splitting the dataset into train/val/test
ntrain_div_classes  = config['ntrain_div_classes']  # Number of training nodes divided by number of classes
attr_normalization  = config['attr_normalization']  # Attribute normalization. Not used in the paper

alpha               = config['alpha']               # PPR teleport probability
eps                 = config['eps']                 # Stopping threshold for ACL's ApproximatePR
topk                = config['topk']                # Number of PPR neighbors for each node
ppr_normalization   = config['ppr_normalization']   # Adjacency matrix normalization for weighting neighbors

hidden_size         = config['hidden_size']         # Size of the MLP's hidden layer
nlayers             = config['nlayers']            # Number of MLP layers
weight_decay        = config['weight_decay']        # Weight decay used for training the MLP
dropout             = config['dropout']             # Dropout used for training

lr                  = config['lr']                  # Learning rate
max_epochs          = config['max_epochs']          # Maximum number of epochs (exact number if no early stopping)
batch_size          = config['batch_size']          # Batch size for training
batch_mult_val      = config['batch_mult_val']      # Multiplier for validation batch size

eval_step           = config['eval_step']           # Accuracy is evaluated after every this number of steps
run_val             = config['run_val']             # Evaluate accuracy on validation set during training

early_stop          = config['early_stop']          # Use early stopping
patience            = config['patience']            # Patience for early stopping

nprop_inference     = config['nprop_inference']     # Number of propagation steps during inference
inf_fraction        = config['inf_fraction']        # Fraction of nodes for which local predictions are computed during inference

# Load the data

In [8]:
start = time.time()
(adj_matrix, attr_matrix, labels,
 train_idx, val_idx, test_idx) = utils.get_data(
        f"{data_file}",
        seed=split_seed,
        ntrain_div_classes=ntrain_div_classes,
        normalize_attr=attr_normalization
)
try:
    d = attr_matrix.n_columns
except AttributeError:
    d = attr_matrix.shape[1]
nc = labels.max() + 1
time_loading = time.time() - start
print(f"Runtime: {time_loading:.2f}s")

Runtime: 0.02s


In [9]:
len(train_idx)

140

# Preprocessing: Calculate PPR scores

In [10]:
# compute the ppr vectors for train/val nodes using ACL's ApproximatePR
start = time.time()
topk_train = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, train_idx, topk,
                                 normalization=ppr_normalization)
train_set = PPRDataset(attr_matrix_all=attr_matrix, ppr_matrix=topk_train, indices=train_idx, labels_all=labels)
if run_val:
    topk_val = ppr.topk_ppr_matrix(adj_matrix, alpha, eps, val_idx, topk,
                                   normalization=ppr_normalization)
    val_set = PPRDataset(attr_matrix_all=attr_matrix, ppr_matrix=topk_val, indices=val_idx, labels_all=labels)
else:
    val_set = None
time_preprocessing = time.time() - start
print(f"Runtime: {time_preprocessing:.2f}s")

Runtime: 0.12s


SparseTensor(row=tensor([   0,    0,    0,  ..., 2993, 2993, 2993]),
             col=tensor([1636, 1638, 2357,  ...,  200,  745, 1865]),
             val=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
             size=(2995, 2995), nnz=8416, density=0.09%)

<2995x2879 sparse matrix of type '<class 'numpy.float32'>'
	with 151171 stored elements in Compressed Sparse Row format>

In [14]:
train_loader = torch.utils.data.DataLoader(
        dataset=train_set,
        sampler=torch.utils.data.BatchSampler(
            torch.utils.data.SequentialSampler(train_set),
            batch_size=512, drop_last=False
        ),
        batch_size=None,
        num_workers=0,
    )

In [15]:
xbs, yb = next(iter(train_loader))

In [47]:
attr_matrix_batch, ppr_scores, ppr_idx = xbs

In [48]:
# original node attributes/embeddings
matrix_to_torch(attr_matrix[list(range(2995))])

SparseTensor(row=tensor([   0,    0,    0,  ..., 2994, 2994, 2994]),
             col=tensor([  49,   66,  107,  ..., 2327, 2561, 2573]),
             val=tensor([0.1060, 0.0626, 0.0916,  ..., 0.1141, 0.1413, 0.1413]),
             size=(2995, 2879), nnz=151171, density=1.75%)

In [49]:
# full adjacency matrix
matrix_to_torch(adj_matrix)

SparseTensor(row=tensor([   0,    0,    0,  ..., 2993, 2993, 2993]),
             col=tensor([1636, 1638, 2357,  ...,  200,  745, 1865]),
             val=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
             size=(2995, 2995), nnz=8416, density=0.09%)

In [50]:
# 
matrix_to_torch(topk_train)

SparseTensor(row=tensor([  0,   0,   0,  ..., 139, 139, 139]),
             col=tensor([  22, 1243, 1298,  ..., 2978, 2981, 2983]),
             val=tensor([5.0000e-01, 1.7678e-01, 8.8388e+04,  ..., 7.2921e-02, 5.0592e-01,
                           1.3832e-01]),
             size=(140, 2995), nnz=2027, density=0.48%)

In [51]:
ppr_scores, ppr_scores.shape

(tensor([5.0000e-01, 1.7678e-01, 8.8388e+04,  ..., 7.2921e-02, 5.0592e-01,
         1.3832e-01]),
 torch.Size([2027]))

In [52]:
 ppr_idx, ppr_idx.shape,

(tensor([  0,   0,   0,  ..., 139, 139, 139]), torch.Size([2027]))

In [66]:
torch.tensor(attr_matrix_batch)

TypeError: object of type 'SparseTensor' has no len()

In [56]:
source_idx, neighbor_idx = topk_train.nonzero()
len(source_idx), len(neighbor_idx)

(2027, 2027)

In [73]:
topk_train_batch = matrix_to_torch(topk_train[:, neighbor_idx]).to_torch_sparse_coo_tensor()
topk_train_batch

tensor(indices=tensor([[   0,    0,    0,  ...,  139,  139,  139],
                       [   0,    1,   93,  ..., 2024, 2025, 2026]]),
       values=tensor([5.0000e-01, 1.7678e-01, 8.8388e+04,  ...,
                      7.2921e-02, 5.0592e-01, 1.3832e-01]),
       size=(140, 2027), nnz=15921, layout=torch.sparse_coo)

In [77]:
topk_train_batch, attr_matrix_batch

(tensor(indices=tensor([[   0,    0,    0,  ...,  139,  139,  139],
                        [   0,    1,   93,  ..., 2024, 2025, 2026]]),
        values=tensor([5.0000e-01, 1.7678e-01, 8.8388e+04,  ...,
                       7.2921e-02, 5.0592e-01, 1.3832e-01]),
        size=(140, 2027), nnz=15921, layout=torch.sparse_coo),
 SparseTensor(row=tensor([   0,    0,    0,  ..., 2026, 2026, 2026]),
              col=tensor([  50,   54,   67,  ..., 2409, 2781, 2849]),
              val=tensor([0.1323, 0.0629, 0.0720,  ..., 0.1317, 0.0632, 0.1135]),
              size=(2027, 2879), nnz=106446, density=1.82%))

In [78]:
from rgnn_at_scale.aggregation import ROBUST_MEANS, chunked_message_and_aggregate
soft_medoid = ROBUST_MEANS["soft_k_medoid"]
soft_medoid

<function rgnn_at_scale.aggregation.soft_weighted_medoid_k_neighborhood(A: torch.sparse.FloatTensor, x: torch.Tensor, k: int = 32, temperature: float = 1.0, with_weight_correction: bool = True, threshold_for_dense_if_cpu: int = 5000, **kwargs) -> torch.Tensor>

In [79]:
soft_medoid(topk_train_batch, attr_matrix_batch.to_dense())

RuntimeError: The expanded size of the tensor (2027) must match the existing size (140) at non-singleton dimension 0.  Target sizes: [2027, 32, 32].  Tensor sizes: [140, 1, 32]

''

# Training: Set up model and train

In [11]:
start = time.time()
model = PPRGo(d, nc, hidden_size, nlayers, dropout, aggr="mean")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

nepochs, _, _ = train(
        model=model, train_set=train_set, val_set=val_set,
        lr=lr, weight_decay=weight_decay,
        max_epochs=max_epochs, batch_size=batch_size, batch_mult_val=batch_mult_val,
        eval_step=eval_step, early_stop=early_stop, patience=patience)
time_training = time.time() - start
logging.info('Training done.')
print(f"Runtime: {time_training:.2f}s")

2020-12-21 14:56:24 (INFO): Epoch 19, step 20: train 0.12009
2020-12-21 14:56:24 (INFO): Epoch 39, step 40: train 0.07238
2020-12-21 14:56:24 (INFO): Epoch 59, step 60: train 0.05558
2020-12-21 14:56:24 (INFO): Epoch 79, step 80: train 0.04384
2020-12-21 14:56:24 (INFO): Epoch 99, step 100: train 0.03783
2020-12-21 14:56:25 (INFO): Epoch 119, step 120: train 0.03370
2020-12-21 14:56:25 (INFO): Epoch 139, step 140: train 0.03027
2020-12-21 14:56:25 (INFO): Epoch 159, step 160: train 0.02800
2020-12-21 14:56:25 (INFO): Epoch 179, step 180: train 0.03056
2020-12-21 14:56:25 (INFO): Epoch 199, step 200: train 0.03036
2020-12-21 14:56:25 (INFO): Training done.


Runtime: 2.92s


# Inference (val and test)

In [12]:
start = time.time()
predictions, time_logits, time_propagation = predict(
        model=model, adj_matrix=adj_matrix, attr_matrix=attr_matrix, alpha=alpha,
        nprop=nprop_inference, inf_fraction=inf_fraction,
        ppr_normalization=ppr_normalization)
time_inference = time.time() - start
print(f"Runtime: {time_inference:.2f}s")

Runtime: 0.01s


# Collect and print results

In [13]:
acc_train = 100 * accuracy_score(labels[train_idx], predictions[train_idx])
acc_val = 100 * accuracy_score(labels[val_idx], predictions[val_idx])
acc_test = 100 * accuracy_score(labels[test_idx], predictions[test_idx])
f1_train = f1_score(labels[train_idx], predictions[train_idx], average='macro')
f1_val = f1_score(labels[val_idx], predictions[val_idx], average='macro')
f1_test = f1_score(labels[test_idx], predictions[test_idx], average='macro')

gpu_memory = torch.cuda.max_memory_allocated()
memory = utils.get_max_memory_bytes()

time_total = time_preprocessing + time_training + time_inference

In [14]:
print(f'''
Accuracy: Train: {acc_train:.1f}%, val: {acc_val:.1f}%, test: {acc_test:.1f}%
F1 score: Train: {f1_train:.3f}, val: {f1_val:.3f}, test: {f1_test:.3f}

Runtime: Preprocessing: {time_preprocessing:.2f}s, training: {time_training:.2f}s, inference: {time_inference:.2f}s -> total: {time_total:.2f}s
Memory: Main: {memory / 2**30:.2f}GB, GPU: {gpu_memory / 2**30:.3f}GB
''')


Accuracy: Train: 81.4%, val: 56.1%, test: 52.9%
F1 score: Train: 0.820, val: 0.528, test: 0.495

Runtime: Preprocessing: 0.12s, training: 2.92s, inference: 0.01s -> total: 3.05s
Memory: Main: 1.95GB, GPU: 0.009GB

