# Downloading GraphAny

In [1]:
# Clone the upstream GraphAny repository into ./GraphAny.
!git clone https://github.com/DeepGraphLearning/GraphAny.git

Cloning into 'GraphAny'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 76 (delta 24), reused 40 (delta 13), pack-reused 0 (from 0)[K
Receiving objects: 100% (76/76), 576.89 KiB | 10.49 MiB/s, done.
Resolving deltas: 100% (24/24), done.


# Imports and Environment Setup

In [2]:
import sys
import os
import subprocess
import yaml

data = {
    'name': 'graphany',
    'channels': [
        'pytorch',
        'pyg',
        'nvidia',
        'conda-forge',
        'defaults',
        'dglteam/label/cu118'
    ],
    'dependencies': [
        'python=3.10',
        'cudatoolkit=11.8',
        'pyg',
        'pytorch=2.2.1',
        'torchvision',
        'torchaudio',
        'torchdata=0.7.1',
        'dgl',
        'lightning=2.*',
        'pydantic',
        'wandb',
        'rich',
        'hydra-core',
        'jupyter',
        'einops',
        'tensorboard',
        'pip',
        {
            'pip': [
                'ogb',
                'rootutils',
                'hydra_colorlog',
                # For time logging
                'codetiming',
                'humanfriendly',
                'torch_frame',
                'pytorch-frame[full]'
            ]
        }
    ]
}


sys.path.insert(0,'/content/GraphAny')


with open('GraphAny/environment.yaml', 'w') as file:
    yaml.dump(data, file)



!wget -O Miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!bash Miniconda.sh -b -p /usr/local/miniconda

os.environ['PATH'] = '/usr/local/miniconda/bin:' + os.environ['PATH']

!conda update conda -y -q
!source /usr/local/etc/profile.d/conda.sh
!conda init
!conda install -n root _license -y -q

!conda env create -f GraphAny/environment.yaml

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m



libcusparse-12.0.2.5 | 163.0 MB  | :  31% 0.3055809531023978/1 [00:22<00:04,  6.48s/it] [A[A[A[A[A[A
cudatoolkit-11.8.0   | 682.5 MB  | :  85% 0.8501937449740127/1 [00:22<00:03, 21.23s/it][A






pytorch-2.2.1        | 1.34 GB   | :  42% 0.41733512064691186/1 [00:22<00:25, 43.85s/it]





libcusparse-12.0.2.5 | 163.0 MB  | :  32% 0.32197704440472286/1 [00:22<00:04,  6.50s/it][A[A[A[A[A[A
cudatoolkit-11.8.0   | 682.5 MB  | :  86% 0.8565815312304824/1 [00:22<00:02, 19.72s/it][A






libnpp-12.0.2.50     | 139.8 MB  | :  39% 0.39140898779426947/1 [00:22<00:02,  4.61s/it][A[A[A[A[A[A[A





libcusparse-12.0.2.5 | 163.0 MB  | :  34% 0.3441261501990919/1 [00:22<00:03,  5.84s/it] [A[A[A[A[A[A
pytorch-2.2.1        | 1.34 GB   | :  42% 0.4197096351220488/1 [00:22<00:26, 45.98s/it] 






libnpp-12.0.2.50     | 139.8 MB  | :  41% 0.4142160239296682/1 [00:22<00:02,  5.06s/it] [A[A[A[A[A[A[A






In [3]:
%%bash
source activate graphany

# Hand the script to the environment's interpreter through a quoted heredoc,
# so the Python input is delimited explicitly and the shell expands nothing.
python - <<'PY'
import sys
import os
import subprocess
# some simple python commands
sys.path.append('/usr/local/lib/python3.10/site-packages')
print(sys.path)

print("Python version")
print(sys.version)
PY

['', '/env/python', '/usr/local/miniconda/envs/graphany/lib/python310.zip', '/usr/local/miniconda/envs/graphany/lib/python3.10', '/usr/local/miniconda/envs/graphany/lib/python3.10/lib-dynload', '/usr/local/miniconda/envs/graphany/lib/python3.10/site-packages', '/usr/local/lib/python3.10/site-packages']
Python version
3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]


In [4]:
!source activate graphany && conda config --add channels pytorch
!source activate graphany && conda config --add channels pyg
!source activate graphany && conda config --add channels nvidia
!source activate graphany && conda config --add channels conda-forge
!source activate graphany && conda config --add channels dglteam/label/cu118

In [5]:
# Install pytorch_frame with pip inside the graphany environment.
!source activate graphany && pip install pytorch_frame



In [6]:
# List the packages of the graphany environment whose name matches "torch".
!source activate graphany && conda list torch

# packages in environment at /usr/local/miniconda/envs/graphany:
#
# Name                    Version                   Build  Channel
pytorch                   2.2.1           py3.10_cuda12.1_cudnn8.9.2_0    pytorch
pytorch-cuda              12.1                 ha16c6d3_6    pytorch
pytorch-frame             0.2.5                    pypi_0    pypi
pytorch-lightning         2.5.0.post0        pyh101cb37_0    conda-forge
pytorch-mutex             1.0                        cuda    pytorch
torch-frame               1.7.5                    pypi_0    pypi
torchaudio                2.2.1               py310_cu121    pytorch
torchdata                 0.7.1                     py310    pytorch
torchmetrics              1.6.1              pyhd8ed1ab_0    conda-forge
torchtriton               2.2.0                     py310    pytorch
torchvision               0.17.1              py310_cu121    pytorch


# F1 and H&M Datasets Implementation

## Update `configs/data.yaml`

In [7]:
# Read the whole dataset configuration file
file_path = 'GraphAny/configs/data.yaml'

with open(file_path, 'r') as yaml_file:
    data = yaml.safe_load(yaml_file)

# Register the F1 and H&M datasets; each value is "<data source>, <dataset alias>"
data['_ds_meta_data'].update({
    'F1': 'relbench, f1_9_classes3',
    'HM': 'relbench, hm_3_classes',
})


# Write the configuration back, keeping block style and the existing key order
with open(file_path, 'w') as yaml_file:
    yaml.dump(data, yaml_file, default_flow_style=False, sort_keys=False)

In [8]:
with open(file_path, 'r') as yaml_file:
    data = yaml.safe_load(yaml_file)


# Add one lookup entry per new dataset: train on Wisconsin, evaluate on the
# new dataset. The dict literal is rebuilt on every iteration so the two
# entries never share a list object.
for lookup_name, eval_dataset in (('F1Debug', 'F1'), ('HMDebug', 'HM')):
    data['_dataset_lookup'][lookup_name] = {
        'train': ['Wisconsin'],
        'eval': [eval_dataset],
    }


with open(file_path, 'w') as yaml_file:
    yaml.dump(data, yaml_file, default_flow_style=False, sort_keys=False)

## Implement the dataset interface and update `GraphDataset` class in `data.py`

In [9]:
new_code = """

import logging
import os
import os.path
import os.path as osp
import re
import ssl
import sys
import urllib

import pandas as pd
import pickle
#from torch_frame import TensorFrame

import dgl
import dgl.function as fn
import numpy as np
import pytorch_lightning as pl
import torch
from hydra.utils import instantiate
from omegaconf import OmegaConf
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold._utils import (
    _binary_search_perplexity as sklearn_binary_search_perplexity,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from graphany.utils import logger, timer


def get_entropy_normed_cond_gaussian_prob(X, entropy, metric="euclidean"):
    '''Conditional probabilities computed from squared pairwise distances.

    Parameters
    ----------
    X:        Matrix whose rows are compared pairwise with scipy's pdist.
    entropy:  Target entropy; turned into a perplexity with np.exp2.
    metric:   Distance metric forwarded to pdist.

    Returns
    -------
    The result of sklearn's _binary_search_perplexity for the float32
    matrix of squared distances and the derived perplexity.
    '''
    target_perplexity = np.exp2(entropy)

    # Condensed distances -> square matrix -> squared -> float32
    pairwise = squareform(pdist(X, metric=metric))
    squared = (pairwise ** 2).astype(np.float32)

    return sklearn_binary_search_perplexity(squared, target_perplexity, verbose=0)


def sample_k_nodes_per_label(label, visible_nodes, k, num_class):
    '''Randomly pick up to k visible nodes for every class.

    label:          tensor of node labels, indexed by node id.
    visible_nodes:  tensor of node ids that may be sampled.
    k:              maximum number of nodes kept per class.
    num_class:      classes 0 .. num_class - 1 are visited in order.

    Returns the sampled node ids, grouped by class in increasing class order.
    '''
    visible_labels = label[visible_nodes]
    picked = []
    for cls in range(num_class):
        # Positions inside visible_nodes that carry this class
        positions = (visible_labels == cls).nonzero().view(-1)
        shuffled = torch.randperm(len(positions))
        picked.append(positions[shuffled[:k]])
    return visible_nodes[torch.cat(picked)]


def get_data_split_masks(n_nodes, labels, num_train_nodes, label_idx=None, seed=42):
    '''Build stratified boolean train/val/test masks over n_nodes nodes.

    num_train_nodes nodes go to the training set; the remaining nodes are
    split in half, stratified by label, into validation and test sets.

    Returns (train_mask, val_mask, test_mask), each a bool tensor of
    length n_nodes.
    '''
    # NOTE(review): the label_idx argument is overwritten here, so whatever
    # the caller passes is ignored - confirm whether that is intended.
    label_idx = np.arange(n_nodes)

    n_labeled = len(labels)
    holdout_fraction = (n_labeled - num_train_nodes) / n_labeled

    train_idx, holdout_idx = train_test_split(
        label_idx,
        test_size=holdout_fraction,
        random_state=seed,
        shuffle=True,
        stratify=labels,
    )
    valid_idx, test_idx = train_test_split(
        holdout_idx,
        test_size=0.5,
        random_state=seed,
        shuffle=True,
        stratify=labels[holdout_idx],
    )

    def _mask_of(indices):
        # Boolean mask that is True exactly at the given node ids
        mask = torch.zeros(n_nodes, dtype=torch.bool)
        mask[indices] = True
        return mask

    return _mask_of(train_idx), _mask_of(valid_idx), _mask_of(test_idx)


def download_url(url: str, folder: str, log: bool = True, filename=None):
    '''Download the content of a URL into a folder and return the local path.

    Modified from torch_geometric.data.download_url.

    Args:
        url (str): The URL.
        folder (str): The folder.
        log (bool, optional): If False, will not print anything to the
            console. (default: True)
        filename (str, optional): Name of the local file. When None, the
            last path segment of the URL is used, without its query string.

    Returns:
        str: Path of the local file. An already existing file is reused
        and nothing is downloaded.

    Raises:
        ValueError: If no file name can be derived from the URL.
    '''
    # A bare "import urllib" does not load the request submodule.
    import urllib.request

    if filename is None:
        filename = url.rpartition("/")[2]
        if not filename:
            raise ValueError(f"Cannot derive a file name from URL {url!r}")
        filename = filename if filename[0] == "?" else filename.split("?")[0]

    path = osp.join(folder, filename)

    if osp.exists(path):  # pragma: no cover
        if log and "pytest" not in sys.modules:
            print(f"Using existing file {filename}", file=sys.stderr)
        return path

    if log and "pytest" not in sys.modules:
        print(f"Downloading {url}", file=sys.stderr)

    os.makedirs(osp.expanduser(osp.normpath(folder)), exist_ok=True)

    # NOTE(security): certificate verification is disabled, so the payload
    # is not authenticated. Callers must treat the file as untrusted.
    context = ssl._create_unverified_context()

    # Download to a temporary name and rename only on success; otherwise an
    # interrupted transfer would leave a truncated file that the
    # "existing file" shortcut above would silently reuse.
    partial_path = path + ".part"
    try:
        with urllib.request.urlopen(url, context=context) as response, open(
            partial_path, "wb"
        ) as f:
            # workaround for https://bugs.python.org/issue42853
            while True:
                chunk = response.read(10 * 1024 * 1024)
                if not chunk:
                    break
                f.write(chunk)
        os.replace(partial_path, path)
    except BaseException:
        if osp.exists(partial_path):
            os.remove(partial_path)
        raise

    return path


def load_relbench_dataset(url, raw_dir):
    '''Download a pickled relbench export and convert it to DGL graph format.

    The unpickled object is indexed with the keys 'node_features', 'labels',
    'edges', 'train_mask', 'val_mask' and 'test_mask'; 'edges' is read as
    two columns (source, destination).

    Returns:
        (graph, labels, num_classes, node_features,
         train_mask, val_mask, test_mask)
    '''
    download_path = download_url(url, raw_dir)

    # SECURITY: pickle.load can execute arbitrary code from the file, and
    # download_url fetches it without TLS certificate verification. Only use
    # this loader with sources you trust.
    with open(download_path, 'rb') as f:
        data = pickle.load(f)

    node_features = torch.tensor(data['node_features'])
    labels = torch.tensor(data['labels'])
    edges = torch.tensor(data['edges'])

    graph = dgl.graph((edges[:, 0], edges[:, 1]),
                      num_nodes=len(node_features), idtype=torch.int32)
    num_classes = len(labels.unique())

    train_mask = torch.tensor(data['train_mask'])
    val_mask = torch.tensor(data['val_mask'])
    test_mask = torch.tensor(data['test_mask'])

    return graph, labels, num_classes, node_features, train_mask, val_mask, test_mask


def load_heterophilous_dataset(url, raw_dir):
    '''Wrap a heterophilous dataset (https://arxiv.org/pdf/2302.11640.pdf) as a DGL graph.

    The downloaded npz archive is read through the keys 'node_features',
    'node_labels', 'edges', 'train_masks', 'val_masks' and 'test_masks'.

    Returns:
        (graph, labels, num_classes, node_features,
         train_masks, val_masks, test_masks), with the masks transposed.
    '''
    download_path = download_url(url, raw_dir)
    data = np.load(download_path)
    node_features = torch.tensor(data["node_features"])
    labels = torch.tensor(data["node_labels"])
    edges = torch.tensor(data["edges"])

    graph = dgl.graph(
        (edges[:, 0], edges[:, 1]), num_nodes=len(node_features), idtype=torch.int
    )
    num_classes = len(labels.unique())
    # Two-class datasets use a single target, so their labels become floats
    if num_classes == 2:
        labels = labels.float()

    # Transposed so that splits lie along dim 1, which is how
    # GraphDataset.load_dataset selects a split.
    train_masks = torch.tensor(data["train_masks"]).T
    val_masks = torch.tensor(data["val_masks"]).T
    test_masks = torch.tensor(data["test_masks"]).T

    return graph, labels, num_classes, node_features, train_masks, val_masks, test_masks


class CombinedDataset(pl.LightningDataModule):
    '''Data module that bundles several per-dataset modules.

    train_ds_dict feeds the training loader; eval_ds_dict feeds both the
    validation and the test loaders. Each value must provide to(),
    train_dataloader(), val_dataloader() and test_dataloader() as used below.
    '''

    def __init__(self, train_ds_dict, eval_ds_dict, cfg):
        super().__init__()
        self.train_ds_dict = train_ds_dict
        self.eval_ds_dict = eval_ds_dict
        # Training datasets first, then evaluation datasets
        self.all_ds = [*self.train_ds_dict.values(), *self.eval_ds_dict.values()]
        self.cfg = cfg

    def to(self, device):
        '''Move every wrapped dataset to the given device.'''
        for dataset in self.all_ds:
            dataset.to(device)

    @staticmethod
    def _combine(ds_dict, loader_method, mode):
        # One loader per dataset name, wrapped in a CombinedLoader with the given mode
        loaders = {
            name: getattr(dataset, loader_method)() for name, dataset in ds_dict.items()
        }
        return pl.utilities.combined_loader.CombinedLoader(loaders, mode)

    def train_dataloader(self):
        return self._combine(self.train_ds_dict, "train_dataloader", "min_size")

    def val_dataloader(self):
        # Use max_size instead of max_size_cycle to avoid repeated evaluation on small datasets
        return self._combine(self.eval_ds_dict, "val_dataloader", "max_size")

    def test_dataloader(self):
        # Use max_size instead of max_size_cycle to avoid repeated evaluation on small datasets
        return self._combine(self.eval_ds_dict, "test_dataloader", "max_size")


class GraphDataset(pl.LightningDataModule):
    def __init__(
            self,
            cfg,
            ds_name,
            cache_dir,
            train_batch_size=256,
            val_test_batch_size=256,
            n_hops=1,
            preprocess_device=torch.device("cpu"),
            permute_label=False,
    ):
        '''Load one dataset and precompute the features, logits and distances.

        Args:
            cfg: Configuration; read below for _ds_meta_data, dirs.data_storage,
                feat_chn, pred_chn, add_self_loop, to_bidirected, entropy and
                n_hops.
            ds_name: Key into cfg["_ds_meta_data"], whose value has the form
                "<data source>, <dataset alias>".
            cache_dir: Directory used for the two cache file names.
            train_batch_size: Batch size of the training loader.
            val_test_batch_size: Batch size of the validation/test loaders.
            n_hops: Lower bound for the hop count used in cache file names.
            preprocess_device: Device used while precomputing features.
            permute_label: Stored on the instance only.

        Raises:
            NotImplementedError: If the data source is not one of pyg, dgl,
                ogb, heterophilous or relbench.
        '''
        super().__init__()
        self.cfg = cfg
        self.name = ds_name
        self.train_batch_size = train_batch_size
        self.permute_label = permute_label  # For checking label equivariance
        self.val_test_batch_size = val_test_batch_size
        self.preprocess_device = preprocess_device

        self.n_hops = n_hops

        # Metadata entry is "<data source>, <dataset alias>"
        self.data_source, ds_alias = cfg["_ds_meta_data"][ds_name].split(", ")
        self.gidtype = None
        self.dist = None
        self.unmasked_pred = None
        # Build the hydra instantiation arguments for the selected data source
        if self.data_source == "pyg":
            components = ds_alias.split(".")
            ds_init_args = {
                "_target_": f"torch_geometric.datasets.{ds_alias}",
                "root": f"{cfg.dirs.data_storage}{self.data_source}/{ds_alias}/",
            }
            if len(components) == 2:  # If sub-dataset
                ds_init_args["_target_"] = f"torch_geometric.datasets.{components[0]}"
                ds_init_args["name"] = components[1]
        elif self.data_source == "dgl":
            ds_init_args = {
                "_target_": f"dgl.data.{ds_alias}",
                "raw_dir": f"{cfg.dirs.data_storage}{self.data_source}/",
            }
        elif self.data_source == "ogb":
            ds_init_args = {
                "_target_": f"ogb.nodeproppred.DglNodePropPredDataset",
                "root": f"{cfg.dirs.data_storage}{self.data_source}/",
                "name": ds_alias,
            }
        elif self.data_source == "heterophilous":
            target = "graphany.data.load_heterophilous_dataset"
            url = f"https://raw.githubusercontent.com/yandex-research/heterophilous-graphs/main/data/{ds_alias}.npz"
            ds_init_args = {
                "_target_": target,
                "raw_dir": f"{cfg.dirs.data_storage}{self.data_source}/",
                "url": url,
            }
        elif self.data_source == "relbench":
            # Pickled datasets are fetched from the project's public data repository
            target = "graphany.data.load_relbench_dataset"
            url = f"https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/GraphAny_datasets/{ds_alias}.pkl"
            ds_init_args = {
                "_target_": target,
                "raw_dir": f"{cfg.dirs.data_storage}{self.data_source}/",
                "url": url,
            }
        else:
            raise NotImplementedError(f"Unsupported {self.data_source=}")
        self.data_init_args = OmegaConf.create(ds_init_args)
        # Raise the local hop count to the largest number found in the
        # configured feature/prediction channel names.
        # NOTE(review): this local n_hops only ends up in the cache file
        # names; the propagation call below uses cfg.n_hops, and self.n_hops
        # keeps the constructor argument - confirm this is intended.
        if cfg.get("feat_chn"):
            all_channels = "+".join([cfg.feat_chn, cfg.pred_chn])
            all_hops = re.findall(r"\d+", all_channels)
            n_hops = max(max([int(_) for _ in all_hops]), n_hops)

        # load_dataset may overwrite split_index, so set the default first
        self.split_index = 0
        (
            self.g,
            self.label,
            self.feat,
            self.train_mask,
            self.val_mask,
            self.test_mask,
            self.num_class,
        ) = self.load_dataset(self.data_init_args)
        self.n_nodes, self.n_edges = self.g.num_nodes(), self.g.num_edges()
        # Cache of the propagated features and unmasked predictions
        self.cache_f_name = osp.join(
            cache_dir,
            f"{self.name}_{n_hops}hop_selfloop={cfg.add_self_loop}_bidirected={cfg.to_bidirected}_split="
            f"{self.split_index}.pt",
        )

        # Cache of the distance features; additionally keyed by the feature
        # channels and the entropy setting
        self.dist_f_name = osp.join(
            cache_dir,
            f"{self.name}_{n_hops}hop_selfloop={cfg.add_self_loop}_bidirected={cfg.to_bidirected}_split="
            f"{self.split_index}_{cfg.feat_chn}_entropy={cfg.entropy}_dist.pt",
        )

        self.gidtype = self.g.idtype
        self.train_indices = self.train_mask.nonzero().view(-1)

        (
            self.features,
            self.unmasked_pred,
            self.dist,
        ) = self.prepare_prop_features_logits_and_dist_features(
            self.g, self.feat, n_hops=cfg.n_hops
        )
        # Remove the graph, as GraphAny doesn't use it in training
        del self.g
        del self.feat
        torch.cuda.empty_cache()

    def to(self, device):  # Supports nested dictionary
        '''Move the tensor-holding attributes of this dataset to device.

        Dicts and lists are traversed recursively; None and objects without
        a to() method are left unchanged. Attributes that do not exist on
        the instance are skipped.
        '''

        def _move(obj):
            if obj is None:
                return None
            if isinstance(obj, dict):
                return {key: _move(value) for key, value in obj.items()}
            if isinstance(obj, list):
                return [_move(item) for item in obj]
            if hasattr(obj, "to"):
                return obj.to(device)
            # Not a tensor or any nested structure: return as is
            return obj

        # Attributes that may contain tensors
        for name in (
            "label",
            "feat",
            "train_mask",
            "val_mask",
            "test_mask",
            "train_indices",
            "unmasked_pred",
        ):
            if hasattr(self, name):
                setattr(self, name, _move(getattr(self, name)))

    def load_dataset(self, data_init_args):
        '''Instantiate the dataset and bring it into a common representation.

        Args:
            data_init_args: Hydra instantiation arguments built in __init__.

        Returns:
            (g, label, feat, train_mask, val_mask, test_mask, num_class),
            where the masks are 1-D and g has had its self loops added or
            removed, was optionally made bidirected, and has no duplicate
            edges.

        Raises:
            NotImplementedError: For an unsupported data source, or for a
                PyG dataset without predefined masks and multi-dimensional
                labels.
            ValueError: If the train mask has more than two dimensions.

        Side effects:
            Sets self.split_index when a split is selected or generated, and
            prints a one-line dataset summary.
        '''
        dataset = instantiate(data_init_args)

        if self.data_source == "ogb":
            split_idx = dataset.get_idx_split()
            train_indices, valid_indices, test_indices = (
                split_idx["train"],
                split_idx["valid"],
                split_idx["test"],
            )
            # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
            g, label = dataset[0]
            label = label.view(-1)

            def to_mask(indices):
                # Boolean node mask that is True at the given indices
                mask = torch.zeros(g.num_nodes(), dtype=torch.bool)
                mask[indices] = True
                return mask

            train_mask, val_mask, test_mask = map(
                to_mask, (train_indices, valid_indices, test_indices)
            )

            num_class = label.max().item() + 1

            feat = g.ndata["feat"]
        elif self.data_source in ("heterophilous", "relbench"):
            # Both loader functions return the same 7-tuple
            g, label, num_class, feat, train_mask, val_mask, test_mask = dataset
        elif self.data_source == "dgl":
            g = dataset[0]
            num_class = dataset.num_classes

            # get node feature
            feat = g.ndata["feat"]

            # get data split
            train_mask = g.ndata["train_mask"]
            val_mask = g.ndata["val_mask"]
            test_mask = g.ndata["test_mask"]

            label = g.ndata["label"]
        elif self.data_source == "pyg":
            n_nodes = dataset.x.shape[0]
            # Pass num_nodes explicitly: without it the node count is
            # inferred from the largest id in edge_index, so isolated nodes
            # with the highest ids would be missing from the graph while
            # still having feature rows.
            g = dgl.graph(
                (dataset.edge_index[0], dataset.edge_index[1]), num_nodes=n_nodes
            )
            num_class = dataset.num_classes
            # get node feature
            feat = dataset.x
            label = dataset.y

            if (
                    hasattr(dataset, "train_mask")
                    and hasattr(dataset, "val_mask")
                    and hasattr(dataset, "test_mask")
            ):
                train_mask, val_mask, test_mask = (
                    dataset.train_mask,
                    dataset.val_mask,
                    dataset.test_mask,
                )
            else:
                if label.ndim > 1:
                    raise NotImplementedError(
                        "Multi-Label classification currently unsupported."
                    )
                logging.warning(
                    f"No dataset split found for {self.name}, splitting with semi-supervised settings!!"
                )
                train_mask, val_mask, test_mask = get_data_split_masks(
                    n_nodes, label, 20 * num_class, seed=self.cfg.seed
                )

                self.split_index = self.cfg.seed
        else:
            raise NotImplementedError(f"Unsupported {self.data_source=}")
        if train_mask.ndim == 1:
            pass  # only one train/val/test split
        elif train_mask.ndim == 2:
            # ! Multiple splits
            # Modified: Use the ${seed} split if not specified!
            split_index = self.data_init_args.get("split", self.cfg.seed)
            # Avoid invalid split index: wrap around the number of available
            # splits (columns). Using train_mask.ndim here would always wrap
            # around 2 and could only ever select split 0 or 1.
            n_splits = train_mask.shape[1]
            self.split_index = split_index = split_index % n_splits
            train_mask = train_mask[:, split_index].squeeze()
            val_mask = val_mask[:, split_index].squeeze()
            if test_mask.ndim == 2:
                test_mask = test_mask[:, split_index].squeeze()
        else:
            raise ValueError("train/val/test masks have more than 2 dimensions")
        print(
            f"{self.name} {g.num_nodes()} {g.num_edges()} {feat.shape[1]} {num_class} {len(train_mask.nonzero())}"
        )

        if self.cfg.add_self_loop:
            g = dgl.add_self_loop(g)
        else:
            g = dgl.remove_self_loop(g)
        if self.cfg.to_bidirected:
            g = dgl.to_bidirected(g)
        g = dgl.to_simple(g)  # Remove duplicate edges.
        return g, label, feat, train_mask, val_mask, test_mask, num_class

    def compute_linear_gnn_logits(
            self, features, n_per_label_examples, visible_nodes, bootstrap=False
    ):
        '''Fit one least-squares linear model per channel and return its logits.

        Args:
            features: Mapping channel name -> feature tensor.
            n_per_label_examples: Nodes sampled per class when bootstrap is set.
            visible_nodes: Node ids whose labels may be used for fitting.
            bootstrap: If True, fit each channel on a fresh per-class sample
                of visible_nodes instead of on all of them.

        Returns:
            Dict channel name -> logits (features @ W) for all nodes.

        Note the computation is on CPU as torch does not support the gelss
        driver on GPU currently.
        '''
        cpu = torch.device("cpu")
        num_class = self.num_class
        label = self.label.to(cpu)
        visible_nodes = visible_nodes.to(cpu)

        preds = {}
        for channel, F in features.items():
            F = F.to(cpu)
            ref_nodes = (
                sample_k_nodes_per_label(
                    label, visible_nodes, n_per_label_examples, num_class
                )
                if bootstrap
                else visible_nodes
            )
            # One-hot targets of the reference nodes
            Y_L = torch.nn.functional.one_hot(label[ref_nodes], num_class).float()
            with timer(
                    f"Solving with CPU driver (N={len(ref_nodes)}, d={F.shape[1]}, k={num_class})",
                    logger.debug,
            ):
                lstsq_result = torch.linalg.lstsq(
                    F[ref_nodes.cpu()].cpu(), Y_L.cpu(), driver="gelss"
                )
                W = lstsq_result[0]
            preds[channel] = F @ W

        return preds

    def compute_channel_logits(self, features, visible_nodes, sample, device):
        '''Compute LinearGNN logits for every configured channel.

        Only the channels listed in cfg.feat_channels or cfg.pred_channels
        are used. The resulting logits are moved to device.
        '''
        wanted_channels = set(self.cfg.feat_channels + self.cfg.pred_channels)
        selected = {c: features[c] for c in wanted_channels}
        logits_by_channel = self.compute_linear_gnn_logits(
            selected,
            self.cfg.n_per_label_examples,
            visible_nodes,
            bootstrap=sample,
        )
        return {c: logits.to(device) for c, logits in logits_by_channel.items()}

    def prepare_prop_features_logits_and_dist_features(self, g, input_feats, n_hops):
        '''Compute (or load from cache) propagated features, logits and distances.

        Args:
            g: DGL graph used for message passing.
            input_feats: Node feature matrix; its second dimension is the
                feature size.
            n_hops: Number of propagation steps.

        Returns:
            (features, unmasked_pred, dist):
            features maps "X", "L1".."L{n_hops}" and "H1".."H{n_hops}" to
            feature tensors; unmasked_pred maps channel names to logits
            fitted on the training nodes; dist is a float32 tensor with
            n_channel * (n_channel - 1) columns per node.

        Side effects:
            Writes self.cache_f_name and self.dist_f_name when they do not
            exist yet; otherwise both results are loaded from those files.
        '''
        # Calculate Low-pass features containing AX, A^2X and High-pass features
        # (I-A)X, and (I-A)^2X
        if not os.path.exists(self.cache_f_name):
            g = g.to(self.preprocess_device)
            with timer(
                    f"Computing {self.name} message passing and normalized predictions to file {self.cache_f_name}",
                    logger.info,
            ):
                dim = input_feats.size(1)
                # One slice per hop for the low-pass and high-pass features
                LP = torch.zeros(n_hops, g.number_of_nodes(), dim).to(
                    self.preprocess_device
                )
                HP = torch.zeros(n_hops, g.number_of_nodes(), dim).to(
                    self.preprocess_device
                )

                g.ndata["LP"] = input_feats.to(self.preprocess_device)
                g.ndata["HP"] = input_feats.to(self.preprocess_device)
                for hop_idx in range(n_hops):
                    # D^-1 A filter
                    g.update_all(fn.copy_u("LP", "temp"), fn.mean("temp", "LP"))

                    # (I - D^-1A) filter
                    g.update_all(fn.copy_u("HP", "temp"), fn.mean("temp", "HP_out"))
                    g.ndata["HP"] = g.ndata["HP"] - g.ndata["HP_out"]

                    LP[hop_idx] = g.ndata["LP"].clone()
                    HP[hop_idx] = g.ndata["HP"].clone()
                # Channel names are 1-based: L1, L2, ... and H1, H2, ...
                lp_feat_dict = {f"L{l + 1}": x for l, x in enumerate(LP)}
                hp_feat_dict = {f"H{l + 1}": x for l, x in enumerate(HP)}

                features = {"X": input_feats, **lp_feat_dict, **hp_feat_dict}
                # Logits fitted on all training nodes (no bootstrap sampling)
                unmasked_pred = self.compute_channel_logits(
                    features,
                    self.train_indices,
                    sample=False,
                    device=self.preprocess_device,
                )
                torch.save((features, unmasked_pred), self.cache_f_name)
        else:
            features, unmasked_pred = torch.load(self.cache_f_name, map_location="cpu")
        if not os.path.exists(self.dist_f_name):
            with timer(
                    f"Computing {self.name} conditional gaussian distances "
                    f"and save to {self.dist_f_name}",
                    logger.info,
            ):
                # y_feat: n_nodes, n_channels, n_labels
                y_feat = np.stack(
                    [unmasked_pred[c].cpu().numpy() for c in self.cfg.feat_channels],
                    axis=1,
                )
                bsz, n_channel, n_class = y_feat.shape
                # One feature per ordered pair of distinct channels
                dist_feat_dim = n_channel * (n_channel - 1)
                # Conditional gaussian probability, computed node by node
                cond_gaussian_prob = np.zeros((bsz, n_channel, n_channel))
                for i in range(bsz):
                    cond_gaussian_prob[i, :, :] = get_entropy_normed_cond_gaussian_prob(
                        y_feat[i, :, :], self.cfg.entropy
                    )
                dist = np.zeros((bsz, dist_feat_dim), dtype=np.float32)

                # Flatten the off-diagonal entries: n_channel * (n_channel - 1)
                # features in total, one per ordered channel pair (c, c_prime)
                pair_index = 0
                for c in range(n_channel):
                    for c_prime in range(n_channel):
                        if c != c_prime:  # Diagonal distances are useless
                            dist[:, pair_index] = cond_gaussian_prob[:, c, c_prime]
                            pair_index += 1

                dist = torch.from_numpy(dist)
                torch.save(dist, self.dist_f_name)
        else:
            dist = torch.load(self.dist_f_name, map_location="cpu")
        return features, unmasked_pred, dist

    def train_dataloader(self):
        '''Shuffled batches of the node ids selected by train_mask.'''
        train_node_ids = self.train_mask.nonzero().view(-1)
        return DataLoader(
            train_node_ids, batch_size=self.train_batch_size, shuffle=True
        )

    def val_dataloader(self):
        '''Batches of the node ids selected by val_mask.'''
        val_node_ids = self.val_mask.nonzero().view(-1)
        return DataLoader(val_node_ids, batch_size=self.val_test_batch_size)

    def test_dataloader(self):
        '''Batches of the node ids selected by test_mask.'''
        test_node_ids = self.test_mask.nonzero().view(-1)
        return DataLoader(test_node_ids, batch_size=self.val_test_batch_size)

"""

In [10]:
# Overwrite the cloned repository's data module with the source text held in
# new_code (defined in the previous cell).
path_name = 'GraphAny/graphany/data.py'
with open(path_name, 'w') as file:
    file.write(new_code)

# Testing GraphAny on F1 Dataset

In [19]:
# Zero-shot evaluation on F1: start from the Wisconsin checkpoint and run
# zero training steps.
checkpoint_path = "GraphAny/checkpoints/graph_any_wisconsin.pt"
script_path = "GraphAny/graphany/run.py"
dataset = "F1Debug"  # lookup entry that evaluates on the F1 dataset
steps = 0  # no training steps

In [20]:
# Set HYDRA_FULL_ERROR so that Hydra reports complete stack traces, then
# launch the run script inside the graphany environment with the parameters
# defined in the previous cell.
os.environ['HYDRA_FULL_ERROR'] = '1'
!source activate graphany && python {script_path} prev_ckpt={checkpoint_path} dataset={dataset} total_steps={steps}

DGL backend not selected or invalid.  Assuming PyTorch for now.
Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)
[2;36m[17:01:23][0m[2;36m [0m[34mINFO    [0m Logger initialized.                                                                                                                 [2mlogging.py:53[0m
[2;36m          [0m[2;36m [0m[34mINFO    [0m [33mLocal_rank[0m=[1;36m0[0m, [33mworking_dir[0m=[35m/content/temp/working_dir/[0m[95mFeb24-17[0m:[1;36m01[0m-c00a66bd/                                                        [2mexperiment.py:56[0m
Downloading /content/data/dgl/wisconsin.zip from https://data.dgl.ai/dataset/wisconsin.zip...
/content/data/dgl/wisconsin.zip: 100% 41.2k/41.2k [00:00<00:00, 1.82MB/s]
Extracting file to /content/data/dgl/wisconsin_5bfc48b0
Done saving data into cached files.
Wiscons

# Testing GraphAny on H&M Dataset

In [None]:
# Zero-shot evaluation on H&M: start from the Wisconsin checkpoint and run
# zero training steps.
checkpoint_path = "GraphAny/checkpoints/graph_any_wisconsin.pt"
script_path = "GraphAny/graphany/run.py"
dataset = "HMDebug"  # lookup entry that evaluates on the H&M dataset
steps = 0  # no training steps

In [None]:
# Set HYDRA_FULL_ERROR so that Hydra reports complete stack traces, then
# launch the run script inside the graphany environment with the parameters
# defined in the previous cell.
os.environ['HYDRA_FULL_ERROR'] = '1'
!source activate graphany && python {script_path} prev_ckpt={checkpoint_path} dataset={dataset} total_steps={steps}

DGL backend not selected or invalid.  Assuming PyTorch for now.
Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)
[2;36m[11:29:00][0m[2;36m [0m[34mINFO    [0m Logger initialized.                                                                                                                 [2mlogging.py:53[0m
[2;36m          [0m[2;36m [0m[34mINFO    [0m [33mLocal_rank[0m=[1;36m0[0m, [33mworking_dir[0m=[35m/content/temp/working_dir/[0m[95mFeb20-11[0m:[1;36m29[0m-67b5ea93/                                                        [2mexperiment.py:56[0m
Downloading /content/data/dgl/wisconsin.zip from https://data.dgl.ai/dataset/wisconsin.zip...
/content/data/dgl/wisconsin.zip: 100% 41.2k/41.2k [00:00<00:00, 8.90MB/s]
Extracting file to /content/data/dgl/wisconsin_5bfc48b0
Done saving data into cached files.
Wiscons

# Testing GraphAny on F1 Dataset prediction files

In [21]:
new_code = """

import pytorch_lightning as pl
import rootutils

root = rootutils.setup_root(__file__, dotenv=True, pythonpath=True, cwd=False)
from graphany.utils import logger, timer
from graphany.utils.experiment import init_experiment
from graphany.data import GraphDataset, CombinedDataset
from graphany.model import GraphAny

import torch
import hydra
from omegaconf import DictConfig
import wandb
import numpy as np
import torchmetrics
from rich.pretty import pretty_repr

import os

def mean(input):
    # Arithmetic mean of the given values as a Python scalar, rounded to two decimals.
    average = np.mean(input).item()
    return np.round(average, 2)


class InductiveNodeClassification(pl.LightningModule):
    # Lightning module that trains and evaluates a GraphAny model on the datasets
    # held by a CombinedDataset, tracking one accuracy metric per evaluation
    # dataset and split. Every prediction batch is also appended to
    # "predizioni.txt" (see predict).
    #
    # Documentation in this class uses `#` comments on purpose: the notebook
    # embeds this source in a triple-double-quoted string, so a docstring using
    # the same quotes would terminate that string early.

    def __init__(self, cfg, combined_dataset, checkpoint=None):
        # cfg: experiment configuration (reads graph_any, _all_datasets,
        #      _trans_datasets and, in other methods, optimizer settings and dirs).
        # combined_dataset: object exposing train_ds_dict / eval_ds_dict.
        # checkpoint: optional path to a checkpoint written by save_checkpoint.
        super().__init__()
        self.cfg = cfg
        if checkpoint:
            # Initialize from previous checkpoint using previous graphany config
            # NOTE(review): torch.load unpickles arbitrary objects; only load
            # checkpoints from a trusted source.
            ckpt = torch.load(checkpoint, map_location="cpu")
            logger.critical(f"Loaded checkpoint at {checkpoint}")
            self.gnn_model = GraphAny(**ckpt["graph_any_config"])
            self.load_state_dict(ckpt["state_dict"])
        else:
            self.gnn_model = GraphAny(**cfg.graph_any)
        self.combined_dataset = combined_dataset
        self.attn_dict, self.loss_dict, self.res_dict = {}, {}, {}
        # Initialize accuracy metrics for validation and testing
        self.metrics = {}
        held_out_datasets = list(
            set(self.cfg._all_datasets) - set(self.cfg._trans_datasets)
        )  # 27 datasets in total
        self.heldout_metrics = [
            f"{setting}/{d.lower()[:4]}_{split}_acc"
            for split in ["val", "test"]
            for d in held_out_datasets
            for setting in ["trans", "ind"]
        ]
        for split in ("val", "test"):
            self.metrics[split] = {
                k: torchmetrics.Accuracy(task="multiclass", num_classes=v.num_class)
                for k, v in combined_dataset.eval_ds_dict.items()
            }

        self.criterion = torch.nn.CrossEntropyLoss()

    def on_train_end(self):
        # Save a checkpoint named after the dataset and the last recorded val_acc.
        checkpoint_path = f"{self.cfg.dirs.output}{self.cfg.dataset}_val_acc={self.res_dict['val_acc']}.pt"
        self.save_checkpoint(checkpoint_path)

    def save_checkpoint(self, file_path):
        # Store model weights, optimizer states and the GraphAny config so that
        # __init__ can rebuild the model from the file alone.
        checkpoint = {
            "state_dict": self.state_dict(),
            "optimizer_state_dict": [
                opt.state_dict() for opt in self.trainer.optimizers
            ],
            "graph_any_config": self.cfg.graph_any,
        }
        torch.save(checkpoint, file_path)
        logger.critical(f"Checkpoint saved to {file_path}")

    def get_metric_name(self, ds_name, split):
        # Metric key: "trans/" prefix for datasets listed in cfg.train_datasets,
        # "ind/" otherwise, followed by the first four lowercase letters of the
        # dataset name and the split.
        if ds_name in self.cfg.train_datasets:
            return f"trans/{ds_name.lower()[:4]}_{split}_acc"
        else:
            return f"ind/{ds_name.lower()[:4]}_{split}_acc"

    def configure_optimizers(self):
        # Build the optimizer selected by cfg.optimizer ("adam" or, for any other
        # value, AdamW with per-group weight decay).
        # start with all the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {"params": decay_params, "weight_decay": self.cfg.weight_decay},
            {"params": nodecay_params, "weight_decay": 0.0},
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        logger.info(
            f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
        )
        logger.info(
            f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
        )

        if self.cfg.optimizer == "adam":
            # Plain Adam over all parameters; the decay groups above are not used here.
            optimizer = torch.optim.Adam(self.parameters(), lr=self.cfg.lr)
        else:  # AdamW
            optimizer = torch.optim.AdamW(
                optim_groups,
                lr=self.cfg.lr,
                weight_decay=self.cfg.weight_decay,
            )
        return optimizer

    def on_fit_start(self):
        super().on_fit_start()
        # move all datasets to the correct GPU device
        print(f"moving train and eval datasets to {self.device}")
        self.combined_dataset.to(self.device)
        self.move_metrics_to_device()

    def move_metrics_to_device(self):
        # The metrics live in plain dicts, so they are moved to the module's
        # device explicitly here.
        for metrics_dict in self.metrics.values():
            for metric in metrics_dict.values():
                metric.to(self.device)

    def predict(self, ds, nodes, input, is_training=False):
        # Run the GraphAny model on `nodes` and return its predictions.
        #   ds:    dataset object; ds.dist and ds.name are read here.
        #   nodes: tensor of node indices used to index each channel prediction.
        #   input: mapping channel name -> per-node channel predictions.
        # Side effects: refreshes self.attn_dict and appends one line per node
        # to "predizioni.txt".

        # Use preprocessed distance during evaluation
        dist = ds.dist if not is_training else None
        dist = dist.to(nodes.device)[nodes] if dist is not None else dist

        preds, attn = self.gnn_model(
            {c: chn_pred[nodes] for c, chn_pred in input.items()}, dist=dist
        )

        self.attn_dict.update(
            {
                f"Attention/{ds.name}-{c}": v
                for c, v in zip(self.cfg.feat_channels, attn)
            }
        )

        # detach() is required: when called from training_step the predictions
        # still carry gradients and Tensor.numpy() refuses such tensors.
        node_ids = nodes.cpu().numpy()
        scores = preds.detach().cpu().numpy()

        # Append the predictions to the dump file (append mode keeps the lines
        # written by earlier batches). The class id is the argmax of the raw
        # scores, the same rule evaluation_step uses for the accuracy metric.
        # os.linesep is used rather than a newline escape because this source is
        # embedded in a non-raw string by the notebook, which would expand the
        # escape before the file is written.
        with open("predizioni.txt", "a") as file:
            for node, pred in zip(node_ids, scores):
                line = f"Nodo:{node}\tPredizione:{pred}\tClasse:{np.argmax(pred)}"
                file.write(line + os.linesep)

        return preds

    def training_step(self, batch, batch_idx):
        # batch maps dataset name -> tensor of target node indices. Returns the
        # sum of the per-dataset losses and records detached copies in loss_dict.

        loss = {}
        for ds_name, batch_nodes in batch.items():
            ds = self.combined_dataset.train_ds_dict[ds_name]
            train_target_idx = batch_nodes
            # Batch nodes are not visible to avoid trivial solution and overfitting
            visible_nodes = list(
                set(ds.train_indices.tolist()) - set(batch_nodes.tolist())
            )
            ref_nodes = torch.tensor(visible_nodes, dtype=torch.long).to(self.device)
            ds_too_small = len(visible_nodes) < len(batch_nodes)
            if ds_too_small:
                # Visible nodes are too few, add first half of the batch to visible nodes
                ref_nodes = torch.cat((ref_nodes, batch_nodes[: len(batch_nodes) // 2]))

            input = ds.compute_channel_logits(
                ds.features, ref_nodes, sample=True, device=self.device
            )

            preds = self.predict(ds, train_target_idx, input, is_training=True)
            loss[f"loss/{ds_name}_loss"] = self.criterion(
                preds, ds.label[train_target_idx]
            )

        detached_loss = {k: v.detach().cpu() for k, v in loss.items()}
        avg_loss = mean(list(detached_loss.values()))
        self.loss_dict.update({"loss/avg_loss": avg_loss, **detached_loss})
        return sum(loss.values())

    def evaluation_step(self, split, batch, batch_idx):
        # Shared body of validation_step / test_step: update the accuracy metric
        # of every dataset present in the batch for the given split.
        self.move_metrics_to_device()
        for ds_name, eval_idx in batch.items():
            if eval_idx is None:  # Skip if dataset is already evaluated (empty batch)
                continue
            ds = self.combined_dataset.eval_ds_dict[ds_name]
            ds.to(self.device)
            # Tensor.to returns a new tensor instead of moving in place, so the
            # result has to be rebound for the move to take effect.
            eval_idx = eval_idx.to(self.device)
            # Use unmasked feature for evaluation
            processed_feat = ds.unmasked_pred
            preds = self.predict(
                ds, eval_idx, processed_feat, is_training=False
            ).argmax(-1)
            self.metrics[split][ds_name].update(preds, ds.label[eval_idx])

    def validation_step(self, batch, batch_idx):
        self.evaluation_step("val", batch, batch_idx)

    def test_step(self, batch, batch_idx):
        self.evaluation_step("test", batch, batch_idx)

    def compute_and_log_metrics(self, split):
        # Compute metrics from collected outputs
        res = {}
        for ds_name, metric in self.metrics[split].items():
            metric_name = self.get_metric_name(ds_name, split)
            accuracy = metric.compute().cpu().numpy()
            res[metric_name] = np.round(accuracy * 100, 2)
            metric.reset()  # Reset metrics for the next epoch

        # Aggregates: overall average plus averages over the "trans", "ind" and
        # held-out subsets of the per-dataset accuracies.
        combined_res = {f"{split}_acc": np.round(sum(res.values()) / len(res), 2)}
        combined_res[f"trans_{split}_acc"] = mean(
            [v for k, v in res.items() if k.startswith("trans")]
        )
        combined_res[f"ind_{split}_acc"] = mean(
            [v for k, v in res.items() if k.startswith("ind")]
        )

        combined_res[f"heldout_{split}_acc"] = mean(
            [v for k, v in res.items() if k in self.heldout_metrics]
        )
        self.log_dict(res, prog_bar=False, logger=True, add_dataloader_idx=False)
        self.log_dict(
            combined_res, prog_bar=True, logger=True, add_dataloader_idx=False
        )
        self.res_dict.update({**res, **combined_res})

    def on_train_epoch_end(self):
        # Log the most recent losses and, when present, the attention values.
        self.log_dict(self.loss_dict, on_epoch=True, prog_bar=True, logger=True)
        if len(self.attn_dict):
            self.log_dict(self.attn_dict, on_epoch=True, prog_bar=False, logger=True)

    def on_validation_epoch_end(self):
        self.compute_and_log_metrics("val")

    def on_test_epoch_end(self):
        self.compute_and_log_metrics("test")


@timer()
@hydra.main(config_path=f"{root}/configs", config_name="main", version_base=None)
def main(cfg: DictConfig):
    # Entry point: build the datasets and the model from the Hydra config,
    # optionally train, then validate, test and report the collected results.
    cfg, logger = init_experiment(cfg)
    # Define the default step metric for all metrics
    wandb.define_metric("*", step_metric="epoch")

    # Preprocess on CUDA only when it is both available and requested.
    gpu_preprocessing = torch.cuda.is_available() and cfg.preprocess_device == "gpu"
    preprocess_device = torch.device("cuda" if gpu_preprocessing else "cpu")

    def construct_ds_dict(datasets):
        # Accept either a single dataset name or a collection of names.
        names = [datasets] if isinstance(datasets, str) else datasets
        ds_dict = {}
        for name in names:
            ds_dict[name] = GraphDataset(
                cfg,
                name,
                cfg.dirs.data_cache,
                cfg.train_batch_size,
                cfg.val_test_batch_size,
                cfg.n_hops,
                preprocess_device,
            )
        return ds_dict

    train_ds_dict = construct_ds_dict(cfg.train_datasets)
    eval_ds_dict = construct_ds_dict(cfg.eval_datasets)
    combined_dataset = CombinedDataset(train_ds_dict, eval_ds_dict, cfg)

    model = InductiveNodeClassification(cfg, combined_dataset, cfg.get("prev_ckpt"))

    # Set up the checkpoint callback to save only at the end of training
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        dirpath=cfg.dirs.output,  # specify where to save
        filename="final_checkpoint.pt",  # set a filename
        save_top_k=0,  # do not save based on metric, just save last
        save_last=True,  # ensures only the last checkpoint is kept
        save_on_train_epoch_end=True,  # save at the end of training epoch
    )
    accelerator = "gpu" if torch.cuda.is_available() and cfg.gpus > 0 else "cpu"
    trainer = pl.Trainer(
        max_epochs=cfg.total_steps,
        callbacks=[checkpoint_callback],
        limit_train_batches=cfg.limit_train_batches,
        check_val_every_n_epoch=cfg.eval_freq,
        logger=logger,
        accelerator=accelerator,
        default_root_dir=cfg.dirs.lightning_root,
    )

    # Build all three loaders up front, in train / val / test order.
    train_loader = combined_dataset.train_dataloader()
    val_loader = combined_dataset.val_dataloader()
    test_loader = combined_dataset.test_dataloader()

    # total_steps == 0 skips fitting entirely (pure evaluation of the loaded model).
    if cfg.total_steps > 0:
        trainer.fit(
            model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader,
        )
    trainer.validate(model, dataloaders=val_loader)
    trainer.test(model, dataloaders=test_loader)

    final_results = model.res_dict
    logger.critical(pretty_repr(final_results))
    logger.wandb_summary_update(final_results, finish_wandb=True)


# Run the experiment only when this file is executed as a script; Hydra supplies
# the cfg argument from the command-line overrides.
if __name__ == "__main__":
    main()

"""

In [23]:
# Overwrite the repository's run.py with the patched source held in `new_code`.
path_name = 'GraphAny/graphany/run.py'
# Python reads source files as UTF-8 by default, so write with that encoding
# explicitly instead of relying on the platform locale.
with open(path_name, 'w', encoding='utf-8') as file:
    file.write(new_code)

In [24]:
# Run configuration for the F1 experiment; these values are interpolated into
# the shell command of the next cell.
script_path = "GraphAny/graphany/run.py"  # the run.py rewritten by the previous cell
dataset = "F1Debug" # we want to use the F1 dataset
# dataset = "Debug"
steps = 0 # we want to perform zero-shot, thus we impose zero training epochs
checkpoint_path = "GraphAny/checkpoints/graph_any_wisconsin.pt"  # passed to run.py as prev_ckpt

In [25]:
# Ask Hydra for complete stack traces instead of its shortened error output.
os.environ['HYDRA_FULL_ERROR'] = '1'
# Run the script inside the `graphany` conda environment; the {name} placeholders
# are filled in by IPython from the notebook variables defined in the previous cell.
!source activate graphany && python {script_path} prev_ckpt={checkpoint_path} dataset={dataset} total_steps={steps}

[2;36m[09:20:23][0m[2;36m [0m[34mINFO    [0m Logger initialized.                                                                                                                 [2mlogging.py:53[0m
[2;36m          [0m[2;36m [0m[34mINFO    [0m [33mLocal_rank[0m=[1;36m0[0m, [33mworking_dir[0m=[35m/content/temp/working_dir/[0m[95mFeb25-9[0m:[1;36m20[0m-13783a79/                                                         [2mexperiment.py:56[0m
Done loading data from cached files.
Wisconsin 251 515 1703 5 120
Using existing file f1_9_classes3.pkl
F1 12553 11362 300 9 9720
[2;36m          [0m[2;36m [0m[1;7;31mCRITICAL[0m Loaded checkpoint at GraphAny/checkpoints/graph_any_wisconsin.pt                                                                        [2mrun.py:32[0m
INFO: GPU available: True (cuda), used: True
[2;36m          [0m[2;36m [0m[34mINFO    [0m GPU available: [3;92mTrue[0m [1m([0mcuda[1m)[0m, used: [3;92mTrue[0m                     

In [29]:
import pickle
import torch


def build_comparison_file(download_path, output_file):
    """Write the ground-truth class of every test-set node to a CSV-style file.

    Args:
        download_path: Path of a pickle file holding a dict with the keys
            'node_features', 'labels', 'train_mask', 'val_mask' and 'test_mask'.
        output_file: Path of the text file to create. It receives the header
            "Nodo,Classe" followed by one "<node>,<label>" line per node that
            is selected by the test mask.

    Returns:
        Tuple ``(labels, node_features, train_mask, val_mask, test_mask)`` of
        tensors built from the pickled data.
    """
    # NOTE(review): pickle.load can execute arbitrary code while loading; only
    # use this on files from a trusted source.
    with open(download_path, 'rb') as f:
        data = pickle.load(f)

    node_features = torch.tensor(data['node_features'])
    labels = torch.tensor(data['labels'])
    train_mask = torch.tensor(data['train_mask'])
    val_mask = torch.tensor(data['val_mask'])
    test_mask = torch.tensor(data['test_mask'])

    # Indices of the nodes selected by the test mask, in ascending order.
    test_nodes = torch.nonzero(test_mask).flatten().tolist()

    with open(output_file, 'w') as f:
        f.write("Nodo,Classe\n")  # file header
        for node in test_nodes:
            f.write(f"{node},{labels[node].item()}\n")

    return labels, node_features, train_mask, val_mask, test_mask

In [30]:
import requests

output_file = "true_labels.txt"

# URL of the pickled F1 graph to download
download_url = 'https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/GraphAny_datasets/f1_9_classes3.pkl'
local_file_path = 'f1_9_classes3.pkl'

# A bounded timeout keeps a stalled connection from hanging the notebook.
response = requests.get(download_url, timeout=60)

if response.status_code != 200:
    # Stop here: carrying on would either fail later on a missing file or
    # silently reuse a stale local copy from an earlier run.
    raise RuntimeError(f"Errore nel download del file: {response.status_code}")

with open(local_file_path, 'wb') as f:
    f.write(response.content)
print("File scaricato e salvato come:", local_file_path)

labels, node_features, train_mask, val_mask, test_mask = build_comparison_file(local_file_path, output_file)

File scaricato e salvato come: f1_9_classes3.pkl
