In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os
import custom_graphgym  # noqa, register custom modules
import torch
from pytorch_lightning import seed_everything
from torch_geometric.graphgym.cmd_args import parse_args
from torch_geometric.graphgym.config import (
    cfg,
    dump_cfg,
    load_cfg,
    set_out_dir,
    set_run_dir,
)
from torch_geometric.graphgym.logger import set_printing
from torch_geometric.graphgym.model_builder import create_model
from torch_geometric.graphgym.train import GraphGymDataModule, train
from torch_geometric.graphgym.utils.agg_runs import agg_runs
from torch_geometric.graphgym.utils.comp_budget import params_count
from torch_geometric.graphgym.utils.device import auto_select_device
import pandas as pd
import numpy as np
from torch_geometric.graphgym import register

This notebook covers dataset creation, configuration loading, and the training procedure.

In [3]:
# Parse the input arguments for the script

import argparse

class NotebookArgParser:
    """Parse GraphGym-style command-line arguments from inside a notebook.

    The parsed ``argparse.Namespace`` is stored on ``self.args`` and exposes
    ``cfg_file``, ``repeat``, ``mark_done`` and ``opts``.

    Args:
        args_str: The arguments to parse. Either a sequence of tokens
            (e.g. ``['--cfg', 'a.yaml']``) or a single command-line string
            (e.g. ``"--cfg a.yaml --repeat 2"``), which is split with
            shell-like quoting rules.
    """

    def __init__(self, args_str):
        self.args = self.parse_args(args_str)

    def parse_args(self, args_str):
        """Build the parser and parse ``args_str``.

        Args:
            args_str: Sequence of argument tokens, or one string holding the
                whole argument list.

        Returns:
            argparse.Namespace: The parsed arguments.

        Raises:
            SystemExit: Raised by argparse when the arguments are invalid,
                e.g. when the required ``--cfg`` option is missing.
        """
        # Local import so this cell does not depend on edits to the import cell.
        import shlex

        # argparse iterates over whatever it is given; a plain string would be
        # consumed one character at a time, so tokenize it first.
        if isinstance(args_str, str):
            args_str = shlex.split(args_str)

        parser = argparse.ArgumentParser(description='GraphGym')

        # Add command-line arguments
        parser.add_argument('--cfg',
                            dest='cfg_file',
                            type=str,
                            required=True,
                            help='The configuration file path.')
        parser.add_argument('--repeat',
                            type=int,
                            default=1,
                            help='The number of repeated jobs.')
        parser.add_argument('--mark_done',
                            action='store_true',
                            help='Mark yaml as done after a job has finished.')
        # Everything after the recognised options is collected verbatim.
        parser.add_argument('opts',
                            default=None,
                            nargs=argparse.REMAINDER,
                            help='See graphgym/config.py for remaining options.')

        # Parse the command-line arguments
        return parser.parse_args(args_str)

In [4]:
# Stem of the YAML config to train with; the path passed to --cfg is built
# from it as ./configs/pyg/<config_name>.yaml.
config_name = 'yeast_static'

In [5]:
# Write the command exactly as it would be typed in a shell, then keep only
# the tokens after "python main_pyg.py" as the argument list.
command = f"python main_pyg.py --cfg ./configs/pyg/{config_name}.yaml --repeat 1"
args_str = command.split()[2:]

# Parse the emulated arguments and keep the resulting namespace
notebook_parser = NotebookArgParser(args_str)
args = notebook_parser.args

# Echo the parsed values, one per line
print(
    "Parsed Arguments:",
    f"Configuration File: {args.cfg_file}",
    f"Repeat: {args.repeat}",
    f"Mark Done: {args.mark_done}",
    f"Remaining Options: {args.opts}",
    sep="\n",
)

Parsed Arguments:
Configuration File: ./configs/pyg/yeast_static.yaml
Repeat: 1
Mark Done: False
Remaining Options: []


In [6]:
# NOTE(review): this repeats the parser construction from the previous cell
# with the same `args_str`, so `args` ends up holding the same values; the
# cell looks redundant.
notebook_parser = NotebookArgParser(args_str)

# Access parsed arguments
args = notebook_parser.args

# Explore datasets

In [7]:
# Directory that holds the YAML config files
path_to_configs = './configs/pyg'

# Config file stems to inspect
configs = [
    'yeast_temporal',
    'yeast_static',
    'ecoli_temporal',
    'ecoli_static',
]
# Dataset family of each config: the part of its name before the underscore
data_type = [config.split('_')[0] for config in configs]
# Receives one statistics object per config
stats = []

In [8]:
# Collect the split statistics of every config listed in `configs`.
# Start from an empty list in this cell so that re-running it does not append
# a second copy of every entry.
stats = []
# `dataset_type` (not `type`) so the builtin `type` is not shadowed in the
# notebook namespace after the loop finishes.
for dataset, dataset_type in zip(configs, data_type):
    config_path = f'{path_to_configs}/{dataset}.yaml'
    # NOTE(review): `cfg` is the shared GraphGym config object, so settings
    # merged from one file presumably stay in effect for later iterations
    # unless the next file overrides them - confirm this is intended.
    cfg.merge_from_file(config_path)
    print(cfg.dataset.split_type)
    print(cfg.dataset.name)
    # Yeast configs use the BioGRID datamodule; everything else uses the custom one.
    if dataset_type == 'yeast':
        datamodule = register.train_dict["BioGridGraphGymDataModule"](split_type=cfg.dataset.split_type)
    else:
        datamodule = register.train_dict["CustomGraphGymDataModule"](split_type=cfg.dataset.split_type)
    statistics = datamodule._get_split_statistics()
    stats.append(statistics)


temporal
yeast-ppi
DOWNLOADING CUSTOM DATASET LOADER
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.197/BIOGRID-ALL-4.4.197.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.198/BIOGRID-ALL-4.4.198.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.199/BIOGRID-ALL-4.4.199.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.200/BIOGRID-ALL-4.4.200.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.201/BIOGRID-ALL-4.4.201.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.202/BIOGRID-ALL-4.4.202.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Do

Processing...
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t', header=0, usecols=columns_needed)
  df = pd.read_csv(filepath, sep='\t'

The number of splits is defined by the user
[0, 12, 29, 33, 36]
Positive edges 17244
Positive edges 7129
Negative edges 7129
Positive edges 14316
Negative edges 14316
Positive edges 2428
Negative edges 2428
resetting share dim in for GNN model to: 5955
static
yeast-ppi
DOWNLOADING CUSTOM DATASET LOADER
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.197/BIOGRID-ALL-4.4.197.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.198/BIOGRID-ALL-4.4.198.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.199/BIOGRID-ALL-4.4.199.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.200/BIOGRID-ALL-4.4.200.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.201/



resetting share dim in for GNN model to: 5955
temporal
Ecoli
DOWNLOADING CUSTOM DATASET LOADER




The number of splits is defined by the user
Positive edges 373
Positive edges 307
Negative edges 307
Positive edges 180
Negative edges 180
Positive edges 172
Negative edges 172
resetting share dim in for GNN model to: 2608
static
Ecoli
DOWNLOADING CUSTOM DATASET LOADER
The number of splits is defined by the user
resetting share dim in for GNN model to: 2608


In [9]:
# Display the split statistics collected for each config, in the order of `configs`.
stats

[[            # MPP edges  # label pos edges  # label neg edges
  train            102863              17244                  0
  val              120107               7129               7129
  test             127236              14316              14316
  final_test       127236               2428               2428],
 [            # MPP edges  # label pos edges  # label neg edges
  train            105910              18690                  0
  val              124600              15574              15574
  test             140174              15574              15574
  final_test       140174               2428               2428],
 [            # MPP edges  # label pos edges  # label neg edges
  train              6647                373                  0
  val                6788                307                307
  test               7065                180                180
  final_test         7065                172                172],
 [            # MPP edges  # label

In [11]:
# NOTE: when the cells are run top to bottom, `datamodule` is the object left
# over from the last iteration of the exploration loop, i.e. the one built for
# the final entry of `configs`.
datamodule.dataset.compute_statistics()

Unnamed: 0,year,num_nodes,num_edges,num_self_loops,num_connected_components,avg_degree,validated
0,2003,2608,1498,49,20,0.574387,True
1,2005,2608,3300,65,2,1.265337,True
2,2006,2608,4589,87,2,1.759586,True
3,2011,2608,6034,111,4,2.31365,True
4,2013,2608,6502,117,3,2.493098,True
5,2014,2608,6647,119,4,2.548696,True
6,2015,2608,6559,120,5,2.514954,True
7,2017,2608,6788,125,5,2.602761,True
8,2018,2608,7065,125,5,2.708972,True
9,2020,2608,7187,128,5,2.755752,True


# Train a model with a chosen config

In [12]:
# Merge the chosen config file (and any extra options) into the global config
load_cfg(cfg, args)
set_out_dir(cfg.out_dir, args.cfg_file)
# Configure the PyTorch environment and write out the effective config
torch.set_num_threads(cfg.num_threads)
dump_cfg(cfg)
# One training run per requested repetition
for i in range(args.repeat):
    set_run_dir(cfg.out_dir, i)
    set_printing()
    # Advance the seed so each repetition is seeded differently
    cfg.seed += 1
    seed_everything(cfg.seed, workers=True)
    # If not set in the yaml config, set to cuda accelerator if available and single device
    auto_select_device()
    # Choose the datamodule registered for the configured dataset
    if cfg.dataset.name == 'yeast-ppi':
        print('Loading BIOGRID')
        datamodule_key = "BioGridGraphGymDataModule"
    else:
        print('Loading grn-ecoli')
        datamodule_key = "CustomGraphGymDataModule"
    datamodule = register.train_dict[datamodule_key](split_type=cfg.dataset.split_type)
    # Shared settings fixed for this notebook before the model is built
    cfg.share.dim_out = 1
    cfg.share.num_splits = 3
    model = create_model()
    # Log the model, the full config and the parameter count
    logging.info(model)
    logging.info(cfg)
    cfg.params = params_count(model)
    logging.info('Num parameters: %s', cfg.params)
    # Hand over to the custom training function registered as "train_pl"
    register.train_dict["train_pl"](model, datamodule, logger=True)

Seed set to 2025


Loading BIOGRID
DOWNLOADING CUSTOM DATASET LOADER
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.197/BIOGRID-ALL-4.4.197.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.198/BIOGRID-ALL-4.4.198.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.199/BIOGRID-ALL-4.4.199.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.200/BIOGRID-ALL-4.4.200.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.201/BIOGRID-ALL-4.4.201.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.202/BIOGRID-ALL-4.4.202.tab3.zip
File already exists
Trying to access: https://downloads.thebiogrid.org/Downl



INITIALIZING NEW CUSTOM EDGE HEAD
GraphGymModule(
  (model): GNN(
    (encoder): FeatureEncoder()
    (pre_mp): GeneralMultiLayer(
      (Layer_0): GeneralLayer(
        (layer): Linear(
          (model): Linear(5955, 32, bias=True)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
    )
    (mp): GNNStackStage(
      (layer0): GeneralLayer(
        (layer): GATConv(
          (model): GATConv(32, 32, heads=1)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
      (layer1): GeneralLayer(
        (layer): GATConv(
          (model): GATConv(64, 32, heads=1)
        )
        (post_layer): Sequential(
          (0): ReLU()
        )
      )
    )
    (post_mp): ExampleGNNEdgeHead(
      (layer_post_mp): MLP(
        (model): Sequential(
          (0): Linear(
            (model): Linear(32, 32, bias=True)
          )
        )
      )
    )
  )
)
accelerator: cuda
benchmark: False
bn:
  eps: 1e-05
  mom: 0.1
cfg_dest: 

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: results/yeast/yeast_static/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]

  | Name  | Type | Params
-------------------------------
0 | model | GNN  | 194 K 
-------------------------------
194 K     Trainable params
0         Non-trainable params
194 K     Total params
0.780     Total estimated model params size (MB)


Sanity Checking DataLoader 0: 100%|██████████| 1/1 [00:00<00:00,  2.90it/s]val: {'epoch': 0, 'loss': 0.8252, 'lr': 0.01, 'params': 194912, 'time_iter': 0.5508, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.5625, 'aupr': 0.4982, 'mrr': 0.0002, 'hit_K': 12.0}
                                                                           

/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] Samping new training neg edges: torch.Size([2, 18690])
Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  2.70it/s, v_num=0]val: {'epoch': 0, 'loss': 0.7038, 'lr': 0.01, 'params': 194912, 'time_iter': 0.2293, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.6938, 'aupr': 0.649, 'mrr': 0.0003, 'hit_K': 20.0}
Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  1.13it/s, v_num=0, val_accuracy=0.500, val_precision=0.500, val_recall=1.000, val_f1=0.667, val_auc=0.694, val_aupr=0.649, val_mrr=0.0003, val_hit_K=20.00]train: {'epoch': 0, 'eta': 73.4572, 'loss': 0.8234, 'lr': 0.01, 'params': 194912, 'time_iter': 0.3691, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.5838, 'aupr': 0.5111, 'mrr': 0.0002, 'hit_K': 16.0}
Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s, v_num=0, val_accuracy=0.500, val_precision=0.500, val_recall=1.000, val_f1=0.667, val_auc=0.694, val_aupr=0.649, val_mrr=0.0003, val_hit_K=20.00]      

/home/stu13/s18/lb9849/miniconda3/envs/pyg/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7,8,9]


Testing DataLoader 1: 100%|██████████| 1/1 [00:00<00:00, 15.58it/s]test_split_0: {'epoch': 2, 'loss': 0.6906, 'lr': 0.01, 'params': 194912, 'time_iter': 0.2704, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.8273, 'aupr': 0.8312, 'mrr': 0.0006, 'hit_K': 97.0}
test_split_1: {'epoch': 2, 'loss': 0.6916, 'lr': 0.01, 'params': 194912, 'time_iter': 0.7494, 'accuracy': 0.5, 'precision': 0.5, 'recall': 1.0, 'f1': 0.6667, 'auc': 0.8107, 'aupr': 0.7763, 'mrr': 0.0028, 'hit_K': 93.0}
Testing DataLoader 1: 100%|██████████| 1/1 [00:00<00:00,  4.96it/s]
