In [1]:
# we install all the required for RelBench
!pip install relbench[full]

Collecting relbench[full]
  Downloading relbench-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting pytorch_frame>=0.2.3 (from relbench[full])
  Downloading pytorch_frame-0.2.5-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric (from relbench[full])
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2

In [2]:
import relbench
from relbench.datasets import get_dataset_names, get_dataset
from relbench.modeling.utils import get_stype_proposal
from relbench.modeling.graph import make_pkey_fkey_graph


import torch
from torch_geometric.seed import seed_everything
from torch import Tensor
from torch_frame import stype
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.data.multi_embedding_tensor import MultiEmbeddingTensor


from typing import List, Optional


from sentence_transformers import SentenceTransformer


import os


import pandas as pd

import numpy as np

import random

import pickle

import requests

# Environment Checking

In [3]:
class GloveTextEmbedding:
    """Callable text embedder backed by a pretrained sentence-transformers model.

    The wrapped model is ``average_word_embeddings_glove.6B.300d``. Calling an
    instance forwards the input to ``SentenceTransformer.encode`` and returns
    the result as a tensor.
    """

    def __init__(self, device: Optional[torch.device] = None):
        """Load the pretrained model, forwarding ``device`` to SentenceTransformer."""
        model_name = "sentence-transformers/average_word_embeddings_glove.6B.300d"
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, sentences: List[str]) -> Tensor:
        """Encode ``sentences`` and return the embeddings as a tensor."""
        embeddings = self.model.encode(sentences, convert_to_tensor=True)
        return embeddings

In [4]:
# Use the GPU whenever one is available; on CPU the notebook takes much longer to run.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
if use_cuda:
    torch.set_num_threads(1)
print(device)

# Seed the random number generators so that runs are reproducible
seed_everything(42)

# Directory used to cache the graph data
root_dir = "./data"

# Text encoder configuration handed to the graph builder
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256,
)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

(…)ddings%2Fwhitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)rdEmbeddings%2Fwordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Report the installed RelBench version and the dataset names it exposes.
print(f"The RelBench version is {relbench.__version__}")
print(f"The RelBench datasets are {get_dataset_names()}")

The RelBench version is 1.1.0
The RelBench datasets are ['rel-amazon', 'rel-avito', 'rel-event', 'rel-f1', 'rel-hm', 'rel-stack', 'rel-trial']


# Useful functions

In [6]:
def modify_node_name(key):
    """Split a node key into ``(entity_name, node_index)``.

    When the key contains an underscore, the name is everything before the
    last underscore and the index is built from the digits found after it
    (e.g. ``'constructor_results_12'`` -> ``('constructor_results', 12)``).
    Otherwise the name is made of the key's alphabetic characters and the
    index of its digits (e.g. ``'standings5'`` -> ``('standings', 5)``).

    Raises:
        ValueError: if no digit is found where the index is expected.
    """
    head, underscore, tail = key.rpartition('_')

    if underscore:
        # Name part (e.g. 'standings') and numeric part (e.g. 3).
        suffix_digits = ''.join(ch for ch in tail if ch.isdigit())
        return head, int(suffix_digits)

    # No underscore: separate letters from digits over the whole key.
    letters = ''.join(ch for ch in key if ch.isalpha())
    digits = ''.join(ch for ch in key if ch.isdigit())
    return letters, int(digits)

In [7]:
def read_triplets_from_file(file_path, timeout=30):
    """Read tab-separated triplets from a local file or an HTTP(S) URL.

    Every line is stripped and split on tabs. Lines that yield exactly three
    fields become a ``(source, relation, target)`` tuple; any other line is
    reported on stdout and skipped.

    Args:
        file_path: Local path, or a URL starting with ``http://`` / ``https://``.
        timeout: Seconds to wait for the server when ``file_path`` is a URL.

    Returns:
        The list of triplets in file order. The list is empty when the
        download answers with a status code other than 200 (the code is printed).
    """
    triplets = []

    if file_path.startswith(("http://", "https://")):
        # requests waits forever unless a timeout is given, so always bound it.
        response = requests.get(file_path, timeout=timeout)
        if response.status_code != 200:
            print(f"Errore nel download del file: {response.status_code}")
            return triplets
        lines = response.text.splitlines()
    else:
        # Explicit encoding so the result does not depend on the machine's locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

    for line in lines:
        # Drop surrounding whitespace, then split the row on tabs.
        stripped = line.strip()
        parts = stripped.split('\t')
        if len(parts) == 3:  # keep only rows with exactly three fields
            triplets.append((parts[0], parts[1], parts[2]))
        else:
            print(f"Riga non valida: {stripped}")

    return triplets


In [8]:
def build_nodes_dictionary_from_triplets(node_dict, triplets, nodes_without_timestamp, split):
    """Register the nodes that appear in ``triplets`` into ``node_dict``.

    Each unseen source/target node key is mapped to ``[node_id, split_name]``,
    where ``node_id`` is the dictionary size at insertion time. Nodes whose
    entity name is listed in ``nodes_without_timestamp`` get the split
    ``'all'``; every other node gets ``split``. Nodes already present are left
    untouched, so the first split in which a node is seen wins.

    Args:
        node_dict: Dictionary to extend; it is modified in place.
        triplets: Iterable of ``(source, relation, target)`` node-key triplets.
        nodes_without_timestamp: Entity names that belong to every split.
        split: Split name assigned to the remaining new nodes.

    Returns:
        The same ``node_dict`` object, for convenient chaining.
    """
    for triplet in triplets:
        # Source first, then target, so ids follow first appearance order.
        for node in (triplet[0], triplet[2]):
            if node in node_dict:
                continue

            # modify_node_name returns (entity_name, index); only the name must
            # be compared with the entity list. Comparing the whole tuple (as
            # was done before) never matched, so no node was ever marked 'all'.
            node_label, _ = modify_node_name(node)

            node_split = 'all' if node_label in nodes_without_timestamp else split
            node_dict[node] = [len(node_dict), node_split]

    return node_dict

In [9]:
def build_GraphAny_dataset(KG_data, node_dict, triplets):
    """Assemble node features, labels, edges and split masks from a node dictionary.

    Args:
        KG_data: Mapping from entity name to a store whose ``tf`` attribute is
            indexed with the node's numeric index to obtain its features.
        node_dict: Maps a node key to ``[node_id, split_name]`` with
            ``split_name`` in ``{'train', 'val', 'test', 'all'}``.
        triplets: Iterable of ``(source, relation, target)`` node-key triplets.
            Every source and target must be a key of ``node_dict``.

    Returns:
        ``(node_features, labels, edges, train_mask, val_mask, test_mask)``.
        Features, labels and masks follow the iteration order of ``node_dict``.
        ``edges`` holds the unique ``[source_id, target_id]`` pairs in order of
        first appearance.

    Raises:
        ValueError: if a node has a split name outside the four known ones.
            Skipping it silently would leave the masks shorter than the
            feature list.
    """
    # (train, val, test) membership for each split name.
    mask_by_split = {
        "train": (True, False, False),
        "val": (False, True, False),
        "test": (False, False, True),
        "all": (True, True, True),
    }

    node_features = []
    labels = []
    train_mask = []
    val_mask = []
    test_mask = []

    for key, value in node_dict.items():
        # Recover the entity name and the row index from the node key.
        entity_label, entity_index = modify_node_name(key)

        split_string = value[1]
        if split_string not in mask_by_split:
            raise ValueError(f"Unknown split {split_string!r} for node {key!r}")

        node_features.append(KG_data[entity_label].tf[entity_index])
        labels.append(entity_label)

        in_train, in_val, in_test = mask_by_split[split_string]
        train_mask.append(in_train)
        val_mask.append(in_val)
        test_mask.append(in_test)

    # Deduplicate with a set (constant-time lookup) instead of scanning the
    # growing edge list; the list keeps the first-appearance order.
    edges = []
    seen_pairs = set()
    for triplet in triplets:
        pair = (node_dict[triplet[0]][0], node_dict[triplet[2]][0])
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            edges.append(list(pair))

    return node_features, labels, edges, train_mask, val_mask, test_mask

In [10]:
def flatten_multi_embedding(met: MultiEmbeddingTensor, device=None, flatten_extra_dims=True):
    """Turn a MultiEmbeddingTensor-like object into a single dense tensor.

    The strategies below are tried in order and the first that applies wins:

    1. a callable ``to_tensor`` attribute whose result is a ``torch.Tensor``;
    2. one of several candidate attributes holding a tensor (returned
       directly) or a dict of tensors (concatenated along dim 1);
    3. a callable ``values`` attribute.

    Every tensor is densified, moved to ``device`` (CPU when ``None``) and,
    if ``flatten_extra_dims`` is true and it has more than two dimensions,
    flattened from dim 1 onwards.

    Raises:
        ValueError: when none of the strategies applies.
        TypeError: when the dict of embeddings contains a non-tensor value.
    """
    target_device = torch.device("cpu") if device is None else device

    # Strategy 1: the object knows how to convert itself.
    to_tensor = getattr(met, 'to_tensor', None)
    if callable(to_tensor):
        converted = to_tensor()
        if isinstance(converted, torch.Tensor):
            return _process_tensor(converted, target_device, flatten_extra_dims)

    # Strategy 2: probe the attributes that may store the embeddings.
    embedding_dict = None
    for attr_name in ("_data", "embeddings", "_embeddings", "_tensor_dict", "values"):
        if not hasattr(met, attr_name):
            continue
        attr_value = getattr(met, attr_name)
        if isinstance(attr_value, torch.Tensor):
            return _process_tensor(attr_value, target_device, flatten_extra_dims)
        if isinstance(attr_value, dict):
            embedding_dict = attr_value
            break

    # Strategy 3: no dict was found, fall back to a callable `values`.
    if embedding_dict is None:
        if not (hasattr(met, 'values') and callable(met.values)):
            raise ValueError(
                f"Failed to unpack MultiEmbeddingTensor. Available attributes: {dir(met)}\n"
                "Consider inspecting the object structure with: "
                "print(dir(your_multi_embedding_tensor))"
            )
        return _process_tensor(met.values(), target_device, flatten_extra_dims)

    # A dict of embeddings: normalise each entry, then join them column-wise.
    dense_parts = []
    for emb in embedding_dict.values():
        if not isinstance(emb, torch.Tensor):
            raise TypeError(f"Unexpected embedding type: {type(emb)}")
        dense_parts.append(_process_tensor(emb, target_device, flatten_extra_dims))

    return torch.cat(dense_parts, dim=1)

def _process_tensor(tensor: torch.Tensor, device, flatten_extra_dims) -> torch.Tensor:
    """Helper for consistent tensor processing"""
    if tensor.layout != torch.strided:
        tensor = tensor.to_dense()
    tensor = tensor.to(device)
    if flatten_extra_dims and tensor.dim() > 2:
        tensor = tensor.flatten(start_dim=1)
    return tensor


def torchframe_to_tensor(tf, device=None, flatten_extra_dims=True):
    """Concatenate every feature of ``tf.feat_dict`` into one dense tensor.

    For each entry: a callable entry is invoked first to obtain the feature;
    a MultiEmbeddingTensor is delegated to ``flatten_multi_embedding``; any
    other feature is densified through its ``to_dense`` method when it has
    one, unwrapped through a callable ``values`` if it still is not a tensor,
    moved to ``device`` (CPU when ``None``) and flattened to two dimensions
    when ``flatten_extra_dims`` is true. The pieces are joined along dim 1.

    Raises:
        TypeError: when a feature cannot be turned into a ``torch.Tensor``.
    """
    target_device = device if device is not None else torch.device("cpu")

    columns = []
    for stype_key, typed_feat in tf.feat_dict.items():
        # Entries may be callables producing the actual feature.
        typed_feat = typed_feat() if callable(typed_feat) else typed_feat

        if isinstance(typed_feat, MultiEmbeddingTensor):
            columns.append(
                flatten_multi_embedding(
                    typed_feat,
                    device=target_device,
                    flatten_extra_dims=flatten_extra_dims,
                )
            )
            continue

        # Densify anything that offers a to_dense() method.
        to_dense = getattr(typed_feat, "to_dense", None)
        if callable(to_dense):
            typed_feat = to_dense()

        # Last resort for non-tensor containers: a callable `values`.
        if not isinstance(typed_feat, torch.Tensor):
            values = getattr(typed_feat, "values", None)
            if not callable(values):
                raise TypeError(
                    f"Feature {stype_key} is not a tensor. Got {type(typed_feat)}"
                )
            typed_feat = values()

        typed_feat = typed_feat.to(target_device)
        if flatten_extra_dims and typed_feat.dim() > 2:
            typed_feat = typed_feat.flatten(start_dim=1)
        columns.append(typed_feat)

    return torch.cat(columns, dim=1)

In [11]:
def remove_last_sep(s: str) -> str:
    """Cut ``s`` at its last ``[SEP]`` marker.

    The marker and everything after it are removed. A string without the
    marker is returned unchanged.
    """
    head, marker, _ = s.rpartition("[SEP]")
    return head if marker else s


def linearize_features(node_features: list, device=None) -> list:
    """Linearize node features as text.

    Each element of ``node_features`` becomes one string in the format
    ``<name_feature_1> <val_1> [SEP] <name_feature_2> <val2> [SEP] ...``, with
    the last ``[SEP]`` (and what follows it) removed.

    Args:
        node_features: Objects exposing ``to(device)``, ``feat_dict`` and
            ``col_names_dict``. Only row 0 is read for non-embedding features,
            so each element is presumably a single-row TensorFrame - TODO confirm.
        device: Device each element is moved to before reading (CPU when ``None``).

    Returns:
        One linearized string per input element, in input order.
    """
    if device is None:
        device = torch.device("cpu")

    linearized_tensors = []

    for tensor_frame in node_features:
        tf_on_device = tensor_frame.to(device)
        feats = []

        for stype_key, typed_feat in tf_on_device.feat_dict.items():
            col_names = tf_on_device.col_names_dict[stype_key]
            if len(col_names) == 0:
                continue

            if isinstance(typed_feat, MultiEmbeddingTensor):
                # Flatten and split once per feature group rather than once
                # per column: neither depends on the column being written.
                met_tensor = flatten_multi_embedding(
                    typed_feat, flatten_extra_dims=True
                )
                # Assumes all columns share the same embedding width.
                emb_dim = typed_feat.values.shape[1] // typed_feat.num_cols
                sub_tensors = torch.split(met_tensor, emb_dim, dim=1)

                for col_idx, feature_name in enumerate(col_names):
                    feature_value = sub_tensors[col_idx].tolist()
                    feats.append(f"{feature_name} {feature_value} [SEP] ")
            else:
                for col_idx, feature_name in enumerate(col_names):
                    feature_value = typed_feat[0][col_idx].tolist()
                    feats.append(f"{feature_name} {feature_value} [SEP] ")

        linearized_tensors.append(remove_last_sep(''.join(feats)))

    return linearized_tensors

In [12]:
def text_embedding(linearize_features: list, embedder_model, device=None) -> list:
    """Embed each linearized feature string with ``embedder_model``.

    The model is called once per element, and the results are returned in
    input order. ``device`` is only given a CPU default here; it is not
    forwarded to the model.
    """
    if device is None:
        device = torch.device("cpu")

    # One model call per linearized feature.
    return [embedder_model(feature) for feature in linearize_features]

In [None]:
def from_string_to_digit(unique_labels: list, node_labels: list) -> list:
    """Convert string labels into integer class ids.

    Args:
        unique_labels: The known labels; a label's id is its position in this
            list (the first position wins if a label is repeated).
        node_labels: The labels to convert.

    Returns:
        A list with exactly one id per element of ``node_labels``, in order.

    Raises:
        ValueError: if a node label is not in ``unique_labels``. Dropping it
            silently would misalign the labels with the nodes they describe.
    """
    # Build the lookup once instead of scanning unique_labels for every node.
    label_to_index = {}
    for index, unique_label in enumerate(unique_labels):
        label_to_index.setdefault(unique_label, index)

    try:
        return [label_to_index[node_label] for node_label in node_labels]
    except KeyError as err:
        raise ValueError(f"Unknown node label: {err.args[0]!r}") from None

# F1 Dataset Creation

In [13]:
# Download the rel-f1 dataset.
f1_dataset = get_dataset(name="rel-f1", download=True)

# Load the whole database: upto_test_timestamp=False also keeps the test part.
f1_db = f1_dataset.get_db(upto_test_timestamp=False)
f1_col_to_stype_dict = get_stype_proposal(f1_db)

# Build the graph data; the materialized graph is cached under root_dir.
f1_data, f1_col_stats_dict = make_pkey_fkey_graph(
    f1_db,
    col_to_stype_dict=f1_col_to_stype_dict,  # column types
    text_embedder_cfg=text_embedder_cfg,  # the text encoder configured earlier
    # Plain string literal: the previous f-string had no placeholders.
    cache_dir=os.path.join(root_dir, "rel-f1_materialized_cache"),
)

Downloading file 'rel-f1/db.zip' from 'https://relbench.stanford.edu/download/rel-f1/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 704k/704k [00:00<00:00, 212MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/db.zip' to '/root/.cache/relbench/rel-f1/.'


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.19 seconds.


Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00,  1.96it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 245.61it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 239.84it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 214.42it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 104.56it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 136.09it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 124.39it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 71.69it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 103.21it/s]
  ser = pd.to_datetime(ser, format=time_format)
Embedding raw data in mini-batch: 100%|██████████| 5/5 [00:00<00:00, 56.89it/s]
  ser = pd.to_datetime(ser, format=self.format, errors='coerce')
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 34.36it/s]


In [14]:
# Read the validation and test timestamps exposed by the dataset object.
f1_val_timestep = f1_dataset.val_timestamp
f1_test_timestep = f1_dataset.test_timestamp

print(f"The validation timestep is: {f1_val_timestep}")
print(f"The test timestep is: {f1_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


In [15]:
# Display the summary of the graph object built by make_pkey_fkey_graph.
f1_data

HeteroData(
  results={
    tf=TensorFrame([26080, 11]),
    time=[26080],
  },
  constructor_results={
    tf=TensorFrame([12290, 2]),
    time=[12290],
  },
  standings={
    tf=TensorFrame([34124, 4]),
    time=[34124],
  },
  circuits={ tf=TensorFrame([77, 7]) },
  drivers={ tf=TensorFrame([857, 6]) },
  races={
    tf=TensorFrame([1101, 5]),
    time=[1101],
  },
  constructors={ tf=TensorFrame([211, 3]) },
  constructor_standings={
    tf=TensorFrame([13051, 4]),
    time=[13051],
  },
  qualifying={
    tf=TensorFrame([9815, 3]),
    time=[9815],
  },
  (results, f2p_raceId, races)={ edge_index=[2, 26080] },
  (races, rev_f2p_raceId, results)={ edge_index=[2, 26080] },
  (results, f2p_driverId, drivers)={ edge_index=[2, 26080] },
  (drivers, rev_f2p_driverId, results)={ edge_index=[2, 26080] },
  (results, f2p_constructorId, constructors)={ edge_index=[2, 26080] },
  (constructors, rev_f2p_constructorId, results)={ edge_index=[2, 26080] },
  (constructor_results, f2p_raceId, rac

In [16]:
# Node types of the F1 graph. The position of a name in this list is used as
# its integer class id when the labels are digitized, so the order matters.
node_names = ['standings', 'drivers', 'results', 'constructor_results', 'circuits', 'qualifying', 'races', 'constructors', 'constructor_standings']
# Node types handled as having no timestamp; passed as `nodes_without_timestamp`
# when the node dictionary is built.
node_without_timestamp = ['drivers', 'circuits', 'constructors']
# (source type, relation, target type) triples; every foreign-key relation is
# listed together with its reverse ("rev_") relation.
edges_names = [('constructor_standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_standings'),
                ('constructor_standings', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_standings'),
                ('standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'standings'),
                ('standings', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'standings'),
                ('constructor_results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_results'),
                ('constructor_results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_results'),
                ('results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'results'),
                ('results', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'results'),
                ('results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'results'),
                ('qualifying', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'qualifying'),
                ('qualifying', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'qualifying'),
                ('qualifying', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'qualifying'),
                ('races', 'f2p_circuitId', 'circuits'),
                ('circuits', 'rev_f2p_circuitId', 'races')]

In [17]:
# URLs of the triplet files for the train, validation and test splits
# (F1-v2, inductive setting).
train_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/train.txt"
val_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/inference_valid.txt"
test_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/inference_test.txt"

In [18]:
# Fetch and parse the triplets of each split; each result is a list of
# (source, relation, target) tuples.
train_triplets = read_triplets_from_file(train_path)
val_triplets = read_triplets_from_file(val_path)
test_triplets = read_triplets_from_file(test_path)

In [19]:
# Register the nodes split by split. The order matters: a node keeps the split
# in which it is first encountered, and ids follow insertion order.
f1_dict = {}
for split_name, split_triplets in (
    ('train', train_triplets),
    ('val', val_triplets),
    ('test', test_triplets),
):
    f1_dict = build_nodes_dictionary_from_triplets(
        node_dict=f1_dict,
        triplets=split_triplets,
        nodes_without_timestamp=node_without_timestamp,
        split=split_name,
    )

In [20]:
# All triplets in a single list, in train -> val -> test order.
f1_triplets = train_triplets + val_triplets + test_triplets

In [34]:
# Collect per-node features, labels, unique edges and split masks.
(
    f1_node_features,
    f1_labels,
    f1_edges,
    f1_train_mask,
    f1_val_mask,
    f1_test_mask,
) = build_GraphAny_dataset(
    KG_data=f1_data,
    node_dict=f1_dict,
    triplets=f1_triplets,
)

In [35]:
# Linearize the features in the format
# <name_feature_1> <val_1> [SEP] <name_feature_2> <val2> [SEP] ...
f1_linearized_features = linearize_features(
    node_features=f1_node_features,
    device=device,
)

# Compute one text embedding per linearized feature string.
embedding_model = GloveTextEmbedding(device=device)
f1_emb_features = text_embedding(
    linearize_features=f1_linearized_features,
    embedder_model=embedding_model,
    device=device,
)

In [36]:
# Convert every piece of the dataset to a numpy array.
# NOTE: this cell rebinds its own inputs (f1_emb_features, f1_labels, ...), so
# it is not idempotent: run it exactly once after the cells above.
f1_emb_features = torch.stack(f1_emb_features).cpu().numpy()

# Map the string labels to integer class ids (position in node_names).
f1_labels = np.array(from_string_to_digit(node_names, f1_labels))

f1_edges = np.array(f1_edges)

f1_train_mask = np.array(f1_train_mask)
f1_val_mask = np.array(f1_val_mask)
f1_test_mask = np.array(f1_test_mask)

In [37]:
# Bundle the numpy arrays into one dictionary; note that 'node_features' holds
# the text-embedded features, not the raw per-node features.
data_to_save = {
    'node_features': f1_emb_features,
    'labels': f1_labels,
    'edges': f1_edges,
    'train_mask': f1_train_mask,
    'val_mask': f1_val_mask,
    'test_mask': f1_test_mask
}

# Serialize the bundle to f1_data.pkl in the current working directory.
with open('f1_data.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)

In [None]:
# TO READ THE DATASET BACK FROM THE PUBLISHED FILE
import requests
from torch_frame import stype

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


file_url = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/GraphAny_datasets/f1_9_classes.pkl"

# Download the file. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() fails loudly on HTTP errors.
response = requests.get(file_url, timeout=60)
response.raise_for_status()

# SECURITY: unpickling can execute arbitrary code embedded in the payload.
# Only load this file because its source is trusted; never point file_url at
# an untrusted location.
data_loaded = pickle.loads(response.content)

# Presumably the same layout as the dictionary saved above, in which
# 'node_features' holds the embedded features - verify against the file.
f1_node_features = data_loaded['node_features']
f1_labels = data_loaded['labels']
f1_edges = data_loaded['edges']
f1_train_mask = data_loaded['train_mask']
f1_val_mask = data_loaded['val_mask']
f1_test_mask = data_loaded['test_mask']