In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os
import custom_graphgym  # noqa, register custom modules
import torch
from torch_geometric import seed_everything
from torch_geometric.graphgym.cmd_args import parse_args
from torch_geometric.graphgym.config import (
    cfg,
    dump_cfg,
    load_cfg,
    set_out_dir,
    set_run_dir,
)
from torch_geometric.graphgym.logger import set_printing
from torch_geometric.graphgym.model_builder import create_model
from torch_geometric.graphgym.train import GraphGymDataModule, train
from torch_geometric.graphgym.utils.agg_runs import agg_runs
from torch_geometric.graphgym.utils.comp_budget import params_count
from torch_geometric.graphgym.utils.device import auto_select_device

In [3]:
from torch_geometric.graphgym import register

In [4]:
# List the names currently present in GraphGym's head registry
# (the captured output includes a 'custom' entry alongside the others).
register.head_dict.keys()

dict_keys(['node', 'link_pred', 'edge', 'graph', 'head', 'custom'])

In [5]:
import argparse

class NotebookArgParser:
    """Parse GraphGym-style command-line arguments from inside a notebook.

    Exposes the ``--cfg`` / ``--repeat`` / ``--mark_done`` / ``opts``
    interface so a notebook cell can emulate a script invocation.

    Attributes:
        args: The ``argparse.Namespace`` produced by :meth:`parse_args`.
    """

    def __init__(self, args_str):
        """Parse ``args_str`` immediately and store the result in ``self.args``.

        Args:
            args_str: Either a list of already-split argument tokens, or a
                single command-line string (split with shell-like quoting).
        """
        self.args = self.parse_args(args_str)

    def parse_args(self, args_str):
        """Build the GraphGym argument parser and parse ``args_str``.

        Args:
            args_str: A list of argument tokens such as
                ``['--cfg', 'a.yaml', '--repeat', '3']``, or one string such
                as ``"--cfg a.yaml --repeat 3"``.

        Returns:
            argparse.Namespace with the fields ``cfg_file`` (str),
            ``repeat`` (int, default 1), ``mark_done`` (bool) and ``opts``
            (list of the remaining tokens).

        Raises:
            SystemExit: Raised by argparse when the arguments are invalid,
                e.g. when the required ``--cfg`` option is missing.
        """
        # Local import keeps this class self-contained within its cell.
        import shlex

        # argparse iterates over whatever it is given; a bare string would be
        # consumed character by character, so tokenize it first.
        if isinstance(args_str, str):
            args_str = shlex.split(args_str)

        parser = argparse.ArgumentParser(description='GraphGym')

        # Add command-line arguments
        parser.add_argument('--cfg',
                            dest='cfg_file',
                            type=str,
                            required=True,
                            help='The configuration file path.')
        parser.add_argument('--repeat',
                            type=int,
                            default=1,
                            help='The number of repeated jobs.')
        parser.add_argument('--mark_done',
                            action='store_true',
                            help='Mark yaml as done after a job has finished.')
        # Everything after the recognised options is collected verbatim.
        parser.add_argument('opts',
                            default=None,
                            nargs=argparse.REMAINDER,
                            help='See graphgym/config.py for remaining options.')

        # Parse the command-line arguments
        return parser.parse_args(args_str)

In [6]:
# Stand-in for the shell command that would normally launch the script.
command = "python main_pyg.py --cfg ./configs/pyg/example_link_debug.yaml --repeat 3"
tokens = command.split()
# Skip the first two tokens (interpreter and script name); keep the arguments.
args_str = tokens[2:]
args_str


['--cfg', './configs/pyg/example_link_debug.yaml', '--repeat', '3']

In [7]:
# Parse the emulated command line and keep the resulting namespace as `args`.
notebook_parser = NotebookArgParser(args_str)
args = notebook_parser.args

# Echo every parsed field, one per line.
print(
    "Parsed Arguments:",
    f"Configuration File: {args.cfg_file}",
    f"Repeat: {args.repeat}",
    f"Mark Done: {args.mark_done}",
    f"Remaining Options: {args.opts}",
    sep="\n",
)

Parsed Arguments:
Configuration File: ./configs/pyg/example_link_debug.yaml
Repeat: 3
Mark Done: False
Remaining Options: []


In [8]:
# Load the config file named by --cfg into the global `cfg`, then set the
# output directory from cfg.out_dir and the config file path.
load_cfg(cfg, args)
set_out_dir(cfg.out_dir, args.cfg_file)
# Set Pytorch environment
torch.set_num_threads(cfg.num_threads)
dump_cfg(cfg)

# One full train/eval pass per requested repeat (--repeat).
for i in range(args.repeat):
    set_run_dir(cfg.out_dir, i)
    set_printing()
    # Set configurations for each run.
    # cfg.seed is bumped in place so each repeat uses a different seed.
    # NOTE(review): whether re-running this cell restarts from the same seed
    # depends on load_cfg resetting cfg.seed -- confirm.
    cfg.seed = cfg.seed + 1
    seed_everything(cfg.seed)
    # Per the original author: if not set in the yaml config, this selects
    # the cuda accelerator when available (single device).
    auto_select_device()
    # Set machine learning pipeline: build the data module registered under
    # "CustomGraphGymDataModule" (presumably registered by the
    # custom_graphgym import -- confirm).
    datamodule = register.train_dict["CustomGraphGymDataModule"]()
    # TODO: fix this bug, which happens in set_dataset_info because
    # dataset._data.y might hold node labels (not edge labels).
    cfg.share.dim_out = 1
    # TODO: fix this bug (num_splits is hard-coded as a workaround).
    cfg.share.num_splits = 3
    model = create_model()
    # Print model info
    logging.info(model)
    logging.info(cfg)
    cfg.params = params_count(model)
    logging.info('Num parameters: %s', cfg.params)
    # Call the custom training function registered under "train_pl".
    register.train_dict["train_pl"](model, datamodule, logger=True)
    # Stock GraphGym alternative, currently disabled:
    # train(model, datamodule, logger=True)

# NOTE(review): `agg_runs` is imported by this notebook but never called, and
# `args.mark_done` is parsed but unused here -- confirm whether aggregating the
# repeated runs / marking the config as done after the loop is intended.

DOWNLOADING CUSTOM DATASET LOADER
INITIALIZING NEW CUSTOM EDGE HEAD




GraphGymModule(
  (model): GNN(
    (encoder): FeatureEncoder()
    (mp): GNNStackStage(
      (layer0): GeneralLayer(
        (layer): GCNConv(
          (model): GCNConv(1433, 64)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
      (layer1): GeneralLayer(
        (layer): GCNConv(
          (model): GCNConv(64, 64)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
    )
    (post_mp): ExampleGNNEdgeHead(
      (layer_post_mp): MLP(
        (model): Sequential(
          (0): Linear(
            (model): Linear(64, 64, bias=True)
          )
        )
      )
    )
  )
)
accelerator: cuda
benchmark: False
bn:
  eps: 1e-05
  mom: 0.1
cfg_dest: config.yaml
custom_metrics: []
dataset:
  cache_load: False
  cache_save: False
  dir: ./datasets
  edge_dim: 128
  edge_encoder: False
  edge_encoder_bn: True
  edge_encoder_name: Bond
  edge_message_ratio: 0.8
  edge_negative_sampling_ratio: 1.0
  edge_train_mode: all
  enc

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: results/link_predict_debug/example_link_debug/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]

  | Name  | Type | Params
-------------------------------
0 | model | GNN  | 100 K 
-------------------------------
100 K     Trainable params
0         Non-trainable params
100 K     Total params
0.400     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  4.23it/s]val: {'epoch': 0, 'loss': 0.743, 'lr': 0.01, 'params': 100096, 'time_iter': 0.2553, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.6852}
                                                                           

/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] torch.Size([2, 4224])
Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  8.57it/s, v_num=0]val: {'epoch': 0, 'loss': 0.6974, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0178, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.6695}
Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  5.70it/s, v_num=0]train: {'epoch': 0, 'eta': 46.0412, 'loss': 0.7352, 'lr': 0.01, 'params': 100096, 'time_iter': 0.1154, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.8527}
Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s, v_num=0]        torch.Size([2, 4224])
Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 31.59it/s, v_num=0]val: {'epoch': 1, 'loss': 0.6908, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0136, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.7427}
Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 12.06it/s, v_num=0]train: {'epoch': 1, 'eta': 29.025, 'loss': 0.6946, 'lr': 0.01, 'params': 100096, 'ti

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 399: 100%|██████████| 1/1 [00:00<00:00,  5.46it/s, v_num=0]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]
/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 104.48it/s]test: {'epoch': 400, 'loss': 0.4788, 'lr': 0.0, 'params': 100096, 'time_iter': 0.0167, 'accuracy': 0.8207, 'precision': 0.7752, 'recall': 0.9032, 'f1': 0.8344, 'auc': 0.9056}
Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.65it/s] 




DOWNLOADING CUSTOM DATASET LOADER
INITIALIZING NEW CUSTOM EDGE HEAD
GraphGymModule(
  (model): GNN(
    (encoder): FeatureEncoder()
    (mp): GNNStackStage(
      (layer0): GeneralLayer(
        (layer): GCNConv(
          (model): GCNConv(1433, 64)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
      (layer1): GeneralLayer(
        (layer): GCNConv(
          (model): GCNConv(64, 64)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
    )
    (post_mp): ExampleGNNEdgeHead(
      (layer_post_mp): MLP(
        (model): Sequential(
          (0): Linear(
            (model): Linear(64, 64, bias=True)
          )
        )
      )
    )
  )
)
accelerator: cuda
benchmark: False
bn:
  eps: 1e-05
  mom: 0.1
cfg_dest: config.yaml
custom_metrics: []
dataset:
  cache_load: False
  cache_save: False
  dir: ./datasets
  edge_dim: 128
  edge_encoder: False
  edge_encoder_bn: True
  edge_encoder_name: Bond
  edge_message_ratio:

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]

  | Name  | Type | Params
-------------------------------
0 | model | GNN  | 100 K 
-------------------------------
100 K     Trainable params
0         Non-trainable params
100 K     Total params
0.400     Total estimated model params size (MB)


Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 139.05it/s]val: {'epoch': 0, 'loss': 0.7091, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0157, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.684}
                                                                            

/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] torch.Size([2, 4224])
Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 33.95it/s, v_num=1]val: {'epoch': 0, 'loss': 0.6948, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0137, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.6855}
Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 13.20it/s, v_num=1]train: {'epoch': 0, 'eta': 11.0838, 'loss': 0.6968, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0278, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.8856}
Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s, v_num=1]        torch.Size([2, 4224])
Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 30.70it/s, v_num=1]val: {'epoch': 1, 'loss': 0.6877, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0127, 'accuracy': 0.5019, 'precision': 0.501, 'recall': 1.0, 'f1': 0.6675, 'auc': 0.7337}
Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 13.15it/s, v_num=1]train: {'epoch': 1, 'eta': 11.7798, 'loss': 0.6915, 'lr': 0.01, 'params': 10009

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 399: 100%|██████████| 1/1 [00:00<00:00,  5.48it/s, v_num=1]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]
/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 158.28it/s]test: {'epoch': 400, 'loss': 0.4859, 'lr': 0.0, 'params': 100096, 'time_iter': 0.0122, 'accuracy': 0.8121, 'precision': 0.7719, 'recall': 0.8861, 'f1': 0.8251, 'auc': 0.8941}
Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 33.00it/s] 
DOWNLOADING CUSTOM DATASET LOADER
INITIALIZING NEW CUSTOM EDGE HEAD
GraphGymModule(
  (model): GNN(
    (encoder): FeatureEncoder()
    (mp): GNNStackStage(
      (layer0): GeneralLayer(
        (layer): GCNConv(
          (model): GCNConv(1433, 64)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
      (layer1): GeneralLayer(
        (layer): GCNConv(
          (model): GCNConv(64, 64)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
    )
    (post_mp): ExampleGNNEdgeHead(
      (layer_post_mp): MLP(
        (model): Sequential(
          (0): Linear(
            (model): Linear(64, 64, bias=True)
          )

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]

  | Name  | Type | Params
-------------------------------
0 | model | GNN  | 100 K 
-------------------------------
100 K     Trainable params
0         Non-trainable params
100 K     Total params
0.400     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 130.17it/s]val: {'epoch': 0, 'loss': 0.7201, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0141, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.6514}
                                                                            

/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] torch.Size([2, 4224])
Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 27.79it/s, v_num=2]val: {'epoch': 0, 'loss': 0.7052, 'lr': 0.01, 'params': 100096, 'time_iter': 0.013, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.6085}
Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 10.55it/s, v_num=2]train: {'epoch': 0, 'eta': 13.8385, 'loss': 0.71, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0347, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.8394}
Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s, v_num=2]        torch.Size([2, 4224])
Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 33.02it/s, v_num=2]val: {'epoch': 1, 'loss': 0.692, 'lr': 0.01, 'params': 100096, 'time_iter': 0.0139, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.7263}
Epoch 1: 100%|██████████| 1/1 [00:00<00:00, 12.75it/s, v_num=2]train: {'epoch': 1, 'eta': 12.7054, 'loss': 0.7016, 'lr': 0.01, 'params': 100096, 'time_

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 399: 100%|██████████| 1/1 [00:00<00:00,  2.50it/s, v_num=2]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]
/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 161.67it/s]test: {'epoch': 400, 'loss': 0.4945, 'lr': 0.0, 'params': 100096, 'time_iter': 0.013, 'accuracy': 0.8207, 'precision': 0.7761, 'recall': 0.9013, 'f1': 0.8341, 'auc': 0.8948}
Testing DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 19.43it/s] 


### Previous working code

In [None]:
# Load the config file named by --cfg into the global `cfg`, then set the
# output directory from cfg.out_dir and the config file path.
load_cfg(cfg, args)
set_out_dir(cfg.out_dir, args.cfg_file)

# Set Pytorch environment
torch.set_num_threads(cfg.num_threads)
dump_cfg(cfg)
# The cells below walk through a single run step by step
# (no loop over repeats/seeds in this section).

In [None]:
# NOTE(review): the loop cell earlier in this notebook calls
# set_run_dir(cfg.out_dir, i) with a run index, while this call passes only
# the output directory -- confirm against the installed signature.
set_run_dir(cfg.out_dir)
set_printing()
# Set configurations for this run: bump the seed in place and seed all RNGs.
cfg.seed = cfg.seed + 1
seed_everything(cfg.seed)
auto_select_device()
# Set machine learning pipeline
# (the custom GraphGymDataModule is loaded in a later cell).

In [None]:
# Manual device override, applied after auto_select_device().
# NOTE(review): this sets `cfg.device` (singular) while a later cell sets
# `cfg.devices` -- confirm which key the trainer actually reads.
cfg.device = 2
cfg.accelerator = 'cuda'

In [None]:
# Inspect the run directory currently stored on the config.
cfg.run_dir

In [None]:
# Look up the entry registered as "CustomGraphGymDataModule" in
# register.train_dict and instantiate it with no arguments.
datamodule = register.train_dict["CustomGraphGymDataModule"]()

In [None]:
# Inspect the dataset held by the data module.
datamodule.dataset

In [None]:
# Inspect the data module's private training-loader attribute (debugging aid).
datamodule._train_dataloader

In [None]:
# Inspect the dataset's edge count (used as the batch size in the next cell).
datamodule.dataset.num_edges

In [None]:
# Use the dataset's edge count as the training batch size.
# NOTE(review): this is set after the data module was constructed; whether
# its existing loaders pick up the new value is not visible here -- confirm.
cfg.train.batch_size = datamodule.dataset.num_edges

In [None]:
# Dump every batch produced by the data module's training loader.
for batch in datamodule._train_dataloader:
    print(batch)

In [None]:
# Inspect the shared input dimension stored on the config.
cfg.share.dim_in

In [None]:
# Manual override of the device setting on the config.
# NOTE(review): presumably the number of devices handed to the trainer --
# confirm; an earlier cell sets `cfg.device` (singular) instead.
cfg.devices = 8

In [None]:
# Override the accelerator with a device-indexed string.
# NOTE(review): an earlier cell sets cfg.accelerator = 'cuda'; confirm that
# the training code accepts the indexed form 'cuda:2' for this key.
cfg.accelerator = 'cuda:2'

In [None]:
# TODO: fix this bug, which happens in set_dataset_info because
# dataset._data.y might hold node labels (not edge labels).
cfg.share.dim_out = 1

In [None]:
# Build the model from the current global config.
model = create_model()
# Log the model structure and the full config, then record and log the
# parameter count on the config.
logging.info(model)
logging.info(cfg)
cfg.params = params_count(model)
logging.info("Num parameters: %s", cfg.params)

In [None]:
# for b in datamodule.train_dataloader():
#     print(b)


In [None]:
# b = next(iter(datamodule.test_dataloader()))

In [None]:
# from torch_geometric.loader import LinkNeighborLoader

# lb = LinkNeighborLoader(data=datamodule.splits['train'],
#             num_neighbors=[-1],
#             batch_size=8448,
#             edge_label_index= datamodule.splits['train'].edge_label_index,
#             edge_label= datamodule.splits['train'].edge_label,
#             shuffle=True,
#             num_workers=cfg.num_workers,
#             pin_memory=True
#         )

In [None]:
# for b in lb:
#     print(b)

In [None]:
# datamodule.splits['test'].edge_label

In [None]:
# TODO: fix this -- num_splits is hard-coded as a workaround.
cfg.share.num_splits = 3

In [None]:
# Call the custom training function registered under "train_pl" with the
# model and data module built in the cells above.
register.train_dict["train_pl"](model, datamodule, logger=True)