In [1]:
# we install all the required for RelBench
!pip install relbench[full]

Collecting relbench[full]
  Downloading relbench-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting pytorch_frame>=0.2.3 (from relbench[full])
  Downloading pytorch_frame-0.2.5-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric (from relbench[full])
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2

In [2]:
import relbench
from relbench.datasets import get_dataset_names, get_dataset
from relbench.modeling.utils import get_stype_proposal
from relbench.modeling.graph import make_pkey_fkey_graph


import torch
from torch_geometric.seed import seed_everything
from torch import Tensor
from torch_frame.config.text_embedder import TextEmbedderConfig


from typing import List, Optional


from sentence_transformers import SentenceTransformer


import os


import pandas as pd


import random

import pickle

import requests

In [3]:
class GloveTextEmbedding:
    def __init__(self, device: Optional[torch.device] = None):
        self.model = SentenceTransformer(
            "sentence-transformers/average_word_embeddings_glove.6B.300d",
            device=device,
        )

    def __call__(self, sentences: List[str]) -> Tensor:
        return self.model.encode(sentences, convert_to_tensor=True)

# Environment Checking

In [4]:
# Check that it's cuda if you want it to run in reasonable time!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
root_dir = "./data"

# Configure the text encoder
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256
)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

(…)ddings%2Fwhitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)rdEmbeddings%2Fwordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
print(f"The RelBench version is {relbench.__version__}")
print(f"The RelBench datasets are {get_dataset_names()}")

The RelBench version is 1.1.0
The RelBench datasets are ['rel-amazon', 'rel-avito', 'rel-event', 'rel-f1', 'rel-hm', 'rel-stack', 'rel-trial']


#Usefull functions

In [6]:
def modify_node_name(key):
    # Trova l'ultima posizione di '_' e rimuove la parte numerica
    last_underscore_index = key.rfind('_')

    if last_underscore_index != -1:
        # Parte letterale (es. 'standings')
        modified_name = key[:last_underscore_index]

        # Parte numerica (es. 3)
        node_index = int(''.join(filter(str.isdigit, key[last_underscore_index:])))

        return modified_name, node_index

    else:
        node_name = ''.join(filter(str.isalpha, key))  # Parte letterale (es. 'standings')
        node_index = int(''.join(filter(str.isdigit, key)))  # Parte numerica (es. 5)
        return node_name, node_index

In [7]:
def read_triplets_from_file(file_path):
    triplets = []

    # Controlla se il file_path è un URL
    if file_path.startswith("http://") or file_path.startswith("https://"):
        response = requests.get(file_path)
        if response.status_code == 200:
            lines = response.text.splitlines()
        else:
            print(f"Errore nel download del file: {response.status_code}")
            return triplets
    else:
        # Legge il file locale
        with open(file_path, 'r') as file:
            lines = file.readlines()

    for line in lines:
        # Rimuovi eventuali spazi bianchi e separa la riga in base ai tab
        parts = line.strip().split('\t')
        if len(parts) == 3:  # Assicurati che ci siano esattamente 3 elementi
            triplet = (parts[0], parts[1], parts[2])
            triplets.append(triplet)
        else:
            print(f"Riga non valida: {line.strip()}")

    return triplets


In [8]:
def build_nodes_dictionary_from_triplets(node_dict, triplets, nodes_without_timestamp, split):

    for triplet in triplets:
      source_node = triplet[0]
      edge_label = triplet[1]
      target_node = triplet[2]

      # se il source node e il target node non sono già nel vocabolario li aggiungo
      if source_node not in node_dict:
          source_node_label = modify_node_name(source_node)

          if source_node_label in nodes_without_timestamp:
            node_dict[source_node] = [len(node_dict), 'all']
          else:
            node_dict[source_node] = [len(node_dict), split]

      if target_node not in node_dict:
          target_node_label = modify_node_name(target_node)

          if target_node_label in nodes_without_timestamp:
            node_dict[target_node] = [len(node_dict), 'all']
          else:
            node_dict[target_node] = [len(node_dict), split]

    return node_dict

In [9]:
def build_GraphAny_dataset(KG_data, node_dict, triplets):
    node_features = []
    labels = []
    edges = []

    train_mask = []
    val_mask = []
    test_mask = []

    for key, value in node_dict.items():
      # dalla chiave ottengo il nome del nodo e l'indice
      entity_label, entity_index = modify_node_name(key)

      # da KG_data prendo le features di quell'entità e la aggiungo a node_features
      node_features.append(KG_data[entity_label].tf[entity_index])

      # aggiugo la label di quell'entità a labels
      labels.append(entity_label)

      # in base al valore di split aggiorno le maschere
      split_string = value[1]
      if split_string == "train":
        train_mask.append(True)
        val_mask.append(False)
        test_mask.append(False)
      elif split_string == "val":
        train_mask.append(False)
        val_mask.append(True)
        test_mask.append(False)
      elif split_string == "test":
        train_mask.append(False)
        val_mask.append(False)
        test_mask.append(True)
      elif split_string == 'all':
        train_mask.append(True)
        val_mask.append(True)
        test_mask.append(True)


    # per ogni tripletta passata ricavo source e target
    for triplet in triplets:
      source_node = triplet[0]
      edge_label = triplet[1]
      target_node = triplet[2]

      source_index = node_dict[source_node][0]
      target_index = node_dict[target_node][0]

      pair = [source_index, target_index]
      if pair not in edges:
        edges.append(pair)

    return node_features, labels, edges, train_mask, val_mask, test_mask

# F1 Dataset Creation

In [10]:
# We download the f1-dataset
f1_dataset = get_dataset(name="rel-f1", download=True)

# we download the entire database (also the test part)
f1_db = f1_dataset.get_db(upto_test_timestamp = False)
f1_col_to_stype_dict = get_stype_proposal(f1_db)

# Generate graph data
f1_data, f1_col_stats_dict = make_pkey_fkey_graph(
    f1_db,
    col_to_stype_dict = f1_col_to_stype_dict,  # Column types
    text_embedder_cfg = text_embedder_cfg,  # Our chosen text encoder
    cache_dir=os.path.join(
        root_dir, f"rel-f1_materialized_cache"
    ),  # Store materialized graph for convenience
)

Downloading file 'rel-f1/db.zip' from 'https://relbench.stanford.edu/download/rel-f1/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 704k/704k [00:00<00:00, 181MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/db.zip' to '/root/.cache/relbench/rel-f1/.'


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.17 seconds.


Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 152.85it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 167.64it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 222.50it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 184.37it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 210.21it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 212.49it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 192.54it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 223.15it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 269.02it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 250.45it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 221.53it/s]
  ser = pd.to_datetime(ser, f

In [11]:
f1_val_timestep = f1_dataset.val_timestamp
f1_test_timestep = f1_dataset.test_timestamp

print(f"The validation timestep is: {f1_val_timestep}")
print(f"The test timestep is: {f1_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


In [12]:
f1_data

HeteroData(
  constructors={ tf=TensorFrame([211, 3]) },
  drivers={ tf=TensorFrame([857, 6]) },
  circuits={ tf=TensorFrame([77, 7]) },
  qualifying={
    tf=TensorFrame([9815, 3]),
    time=[9815],
  },
  standings={
    tf=TensorFrame([34124, 4]),
    time=[34124],
  },
  results={
    tf=TensorFrame([26080, 11]),
    time=[26080],
  },
  races={
    tf=TensorFrame([1101, 5]),
    time=[1101],
  },
  constructor_standings={
    tf=TensorFrame([13051, 4]),
    time=[13051],
  },
  constructor_results={
    tf=TensorFrame([12290, 2]),
    time=[12290],
  },
  (qualifying, f2p_raceId, races)={ edge_index=[2, 9815] },
  (races, rev_f2p_raceId, qualifying)={ edge_index=[2, 9815] },
  (qualifying, f2p_driverId, drivers)={ edge_index=[2, 9815] },
  (drivers, rev_f2p_driverId, qualifying)={ edge_index=[2, 9815] },
  (qualifying, f2p_constructorId, constructors)={ edge_index=[2, 9815] },
  (constructors, rev_f2p_constructorId, qualifying)={ edge_index=[2, 9815] },
  (standings, f2p_raceId, r

In [13]:
node_names = ['standings', 'drivers', 'results', 'constructor_results', 'circuits', 'qualifying', 'races', 'constructors', 'constructor_standings']
node_without_timestamp = ['drivers', 'circuits', 'constructors']
edges_names = [('constructor_standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_standings'),
                ('constructor_standings', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_standings'),
                ('standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'standings'),
                ('standings', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'standings'),
                ('constructor_results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_results'),
                ('constructor_results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_results'),
                ('results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'results'),
                ('results', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'results'),
                ('results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'results'),
                ('qualifying', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'qualifying'),
                ('qualifying', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'qualifying'),
                ('qualifying', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'qualifying'),
                ('races', 'f2p_circuitId', 'circuits'),
                ('circuits', 'rev_f2p_circuitId', 'races')]

In [14]:
train_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/train.txt"
val_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/inference_valid.txt"
test_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/inference_test.txt"

In [15]:
train_triplets = read_triplets_from_file(train_path)
val_triplets = read_triplets_from_file(val_path)
test_triplets = read_triplets_from_file(test_path)

In [16]:
f1_dict = {}
f1_dict = build_nodes_dictionary_from_triplets(node_dict = f1_dict,
                                               triplets = train_triplets,
                                               nodes_without_timestamp = node_without_timestamp,
                                               split = 'train')
f1_dict = build_nodes_dictionary_from_triplets(node_dict = f1_dict,
                                               triplets = val_triplets,
                                               nodes_without_timestamp = node_without_timestamp,
                                               split = 'val')
f1_dict = build_nodes_dictionary_from_triplets(node_dict = f1_dict,
                                               triplets = test_triplets,
                                               nodes_without_timestamp = node_without_timestamp,
                                               split = 'test')

In [17]:
f1_triplets = train_triplets + val_triplets + test_triplets

In [18]:
f1_node_features, f1_labels, f1_edges, f1_train_mask, f1_val_mask, f1_test_mask = build_GraphAny_dataset(KG_data = f1_data,
                                                                                                          node_dict = f1_dict,
                                                                                                          triplets = f1_triplets)

In [19]:
data_to_save = {
    'node_features': f1_node_features,
    'labels': f1_labels,
    'edges': f1_edges,
    'train_mask': f1_train_mask,
    'val_mask': f1_val_mask,
    'test_mask': f1_test_mask
}

with open('f1_data.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)

In [20]:
# PER LEGGERE DAL FILE
import requests
from torch_frame import stype
from torch_frame.nn import (
    EmbeddingEncoder,
    LinearEmbeddingEncoder,
    LinearEncoder,
    TimestampEncoder,
    StypeWiseFeatureEncoder,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


file_url = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/GraphAny_datasets/f1_9_classes.pkl"

# Scarica il file
response = requests.get(file_url)
response.raise_for_status()  # Verifica che la richiesta sia andata a buon fine

# Carica i dati dal contenuto del file
data_loaded = pickle.loads(response.content)

f1_node_features = data_loaded['node_features']
f1_labels = data_loaded['labels']
f1_edges = data_loaded['edges']
f1_train_mask = data_loaded['train_mask']
f1_val_mask = data_loaded['val_mask']
f1_test_mask = data_loaded['test_mask']

In [21]:
print(f"La variabile f1_node_features è una lista di TensorFrame contenenti ognuno le feature di un nodo del nostro grafo. Quindi abbiamo in totale {len(f1_node_features)} nodi nel nostro grafo.\n")

print(f"La variabile f1_labels è una lista di stringhe che rappresentano il tipo di nodo (la sua label). Per esempio, il primo nodo del nostro grafo è un nodo di tipo: {f1_labels[0]}\n")

print(f"La variabile f1_edges è una lista di tuple che rappresentano le coppie di nodi connessi da un arco. Quindi abbiamo in totale {len(f1_edges)} archi nel nostro grafo.\n")

print(f"Le variabili 'mask' rappresentano le maschere. Si tratta di liste di booleani che indicano quali nodi fanno parte di quali set.")

La variabile f1_node_features è una lista di TensorFrame contenenti ognuno le feature di un nodo del nostro grafo. Quindi abbiamo in totale 12553 nodi nel nostro grafo.

La variabile f1_labels è una lista di stringhe che rappresentano il tipo di nodo (la sua label). Per esempio, il primo nodo del nostro grafo è un nodo di tipo: results

La variabile f1_edges è una lista di tuple che rappresentano le coppie di nodi connessi da un arco. Quindi abbiamo in totale 11362 archi nel nostro grafo.

Le variabili 'mask' rappresentano le maschere. Si tratta di liste di booleani che indicano quali nodi fanno parte di quali set.


# Comprendiamo cosa diamo in pasto a StypeWiseFeatureEncoder

Qui faccio vedere cosa in teoria diamo in pasto a StypeWiseFeatureEncoder, lo farò mostrando un solo elemento del ciclo e spiegando ogni passaggio.

In [22]:
# Noi vogliamo trasformare tutte le feature da formati TensorFrame a normali tensori. Per farlo dobbiamo usare un StypeWiseFeatureEncoder.
# Però, ogni nodo del grafo è  diverso, quindi creiamo un StypeWiseFeatureEncoder differente per ogni nodo, in modo da dare in pasto a StypeWiseFeatureEncoder i parametri giusti.
for i, tensor_frame in enumerate(f1_node_features):
  # Qui inserisco un if che normalmente non metterei, lo faccio solo per evidenziare il problema che abbiamo.
  # In particolare, i dati categorici sono un problema. Quindi, al primo dato di tipo categorico fermo il ciclo
  # Per ottenere i tipi dei dati (stype) uso col_names_dict del TensorFrame e ne seleziono le chiavi
  if stype.categorical in tensor_frame.col_names_dict.keys():

    print(f"Il TensorFrame incontrato ha un elemento di tipo categorico al suo interno: \n{tensor_frame}\n\n")

    #### out_channels ####
    '''
      Il primo valore da dare in pasto è out_channels, che rappresenta la dimensione di uscita del tensore.
      Ogni TensorFrame ha un parametro chiamato num_cols  che ho deciso di utilizzare come canale di output: out_channels = tensor_frame.num_cols
    '''
    print(f"Il numero di colonne di questo TensorFrame è: {tensor_frame.num_cols}, quindi out_channels sarà uguale a {tensor_frame.num_cols}\n\n")

    #### col_stats ####
    '''
      col_stats è un dizionario che contiene le statistiche per ogni colonna, ma sono le statistiche delle colonne di quello specifico TensorFrame.
      Per ottenere questi valori uso la variabile f1_col_stats_dict ottenuta tramite RelBench. Si tratta di un dizionario con tutte le statistiche delle colonne di ogni TensorFrame nel nostro grafo.
      Quindi avremo che col_stats = f1_col_stats_dict[f1_labels[i]]. Nota che ho usato f1_labels[i] per selezionare il giusto nome del TensorFrame.
      Infatti se l'i-esimo TensorFrame è di tipo 'standings' allora col_stats = f1_col_stats_dict['standings'].
    '''
    print(f"Questo TensorFrame è di tipo: {f1_labels[i]}")
    print(f"Le statistiche di questo TensorFrame sono: \n{f1_col_stats_dict[f1_labels[i]]}\n\n")

    #### col_names_dict ####
    '''
      Il terzo parametro da passare dentro StypeWiseFeatureEncoder è col_names_dict. Si tratta di un dizionario che indica il tipo di dato stype per ogni colonna del TensorFrame
    '''
    print(f"Il col_names_dict di questo tensore è: \n{tensor_frame.col_names_dict}\n\n")

    #### stype_encoder_dict ####
    '''
      L'ultimo parametro da passare è un dizionario che serve all'encoder per "decidere" quale encoder utilizzare per ogni tipo di dato (stype).
      Nel nostro caso abbiamo:
      stype_encoder_dict = {
                stype.categorical: EmbeddingEncoder(),
                stype.numerical: LinearEncoder(),
                stype.embedding: LinearEmbeddingEncoder(),
                stype.timestamp: TimestampEncoder()
            }
      Quindi, l'encoder utilizzerà:
      - EmbeddingEncoder per i dati categorici,
      - LinearEncoder per i dati numerici,
      - LinearEmbeddingEncoder per i dati embedding,
      - TimestampEncoder per i dati temporali.
    '''
    print(f"Il nostro stype_encoder_dict è:\n{stype_encoder_dict}")
    break

Il TensorFrame incontrato ha un elemento di tipo categorico al suo interno: 
TensorFrame(
  num_cols=5,
  num_rows=1,
  categorical (1): ['year'],
  numerical (1): ['round'],
  timestamp (2): ['date', 'time'],
  embedding (1): ['name'],
  has_target=False,
  device='cpu',
)


Il numero di colonne di questo TensorFrame è: 5, quindi out_channels sarà uguale a 5


Questo TensorFrame è di tipo: races
Le statistiche di questo TensorFrame sono: 
{'year': {<StatType.COUNT: 'COUNT'>: ([2023, 2021, 2022, 2016, 2018, 2019, 2017, 2012, 2014, 2013, 2010, 2011, 2005, 2015, 2006, 2004, 2008, 2007, 2000, 2009, 1997, 2001, 2020, 2002, 1995, 1977, 1999, 2003, 1978, 1976, 1998, 1996, 1994, 1991, 1992, 1993, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1982, 1979, 1983, 1981, 1974, 1973, 1975, 1980, 1970, 1968, 1972, 1967, 1971, 1969, 1958, 1965, 1963, 1964, 1960, 1966, 1953, 1954, 1959, 1962, 1952, 1956, 1951, 1961, 1957, 1955, 1950], [22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17,

NameError: name 'stype_encoder_dict' is not defined

In [38]:
from torch_frame.data.multi_embedding_tensor import MultiEmbeddingTensor
import torch

def flatten_multi_embedding(met: MultiEmbeddingTensor, device=None, flatten_extra_dims=True):
    """
    Convert MultiEmbeddingTensor to dense tensor with better error handling.
    """
    if device is None:
        device = torch.device("cpu")

    # 1. First check for direct tensor conversion methods
    if hasattr(met, 'to_tensor') and callable(met.to_tensor):
        tensor = met.to_tensor()
        if isinstance(tensor, torch.Tensor):
            if tensor.layout != torch.strided:
                tensor = tensor.to_dense()
            tensor = tensor.to(device)
            if flatten_extra_dims and tensor.dim() > 2:
                tensor = tensor.flatten(start_dim=1)
            return tensor

    # 2. Look for embedding storage in attributes
    dict_candidates = ["_data", "embeddings", "_embeddings", "_tensor_dict", "values"]
    embedding_dict = None

    for candidate in dict_candidates:
        if hasattr(met, candidate):
            candidate_val = getattr(met, candidate)
            # Handle both direct tensors and dictionaries
            if isinstance(candidate_val, torch.Tensor):
                return _process_tensor(candidate_val, device, flatten_extra_dims)
            elif isinstance(candidate_val, dict):
                embedding_dict = candidate_val
                break

    # 3. Handle case where MultiEmbeddingTensor wraps a single tensor
    if embedding_dict is None:
        if hasattr(met, 'values') and callable(met.values):
            tensor = met.values()
            return _process_tensor(tensor, device, flatten_extra_dims)
        else:
            raise ValueError(
                f"Failed to unpack MultiEmbeddingTensor. Available attributes: {dir(met)}\n"
                "Consider inspecting the object structure with: "
                "print(dir(your_multi_embedding_tensor))"
            )

    # 4. Process dictionary of embeddings
    sub_tensors = []
    for emb in embedding_dict.values():
        if isinstance(emb, torch.Tensor):
            if emb.layout != torch.strided:
                emb = emb.to_dense()
            emb = emb.to(device)
            if flatten_extra_dims and emb.dim() > 2:
                emb = emb.flatten(start_dim=1)
            sub_tensors.append(emb)
        else:
            raise TypeError(f"Unexpected embedding type: {type(emb)}")

    return torch.cat(sub_tensors, dim=1)

def _process_tensor(tensor: torch.Tensor, device, flatten_extra_dims) -> torch.Tensor:
    """Helper for consistent tensor processing"""
    if tensor.layout != torch.strided:
        tensor = tensor.to_dense()
    tensor = tensor.to(device)
    if flatten_extra_dims and tensor.dim() > 2:
        tensor = tensor.flatten(start_dim=1)
    return tensor


def torchframe_to_tensor(tf, device=None, flatten_extra_dims=True):
    """
    Robustly convert a TorchFrame to a dense torch.Tensor by handling
    MultiEmbeddingTensors and other column types.
    """
    if device is None:
        device = torch.device("cpu")

    feats = []
    for stype_key, typed_feat in tf.feat_dict.items():
        # Resolve potential lazy-loaded tensors
        if callable(typed_feat):
            typed_feat = typed_feat()

        # Handle MultiEmbeddingTensor
        if isinstance(typed_feat, MultiEmbeddingTensor):
            met_tensor = flatten_multi_embedding(
                typed_feat, device=device, flatten_extra_dims=flatten_extra_dims
            )
            feats.append(met_tensor)
            continue

        # Convert sparse tensors to dense
        if hasattr(typed_feat, "to_dense") and callable(typed_feat.to_dense):
            typed_feat = typed_feat.to_dense()

        # Fallback to .values() if not a tensor
        if not isinstance(typed_feat, torch.Tensor):
            if hasattr(typed_feat, "values") and callable(typed_feat.values):
                typed_feat = typed_feat.values()
            else:
                raise TypeError(
                    f"Feature {stype_key} is not a tensor. Got {type(typed_feat)}"
                )

        # Flatten
        typed_feat = typed_feat.to(device)
        if flatten_extra_dims and typed_feat.dim() > 2:
            typed_feat = typed_feat.flatten(start_dim=1)

        feats.append(typed_feat)

    return torch.cat(feats, dim=1)


f1_node_features_tensors = []
for i, tensor_frame in enumerate(f1_node_features):

    tf_on_device = tensor_frame.to(device)
    dense_tensor = torchframe_to_tensor(tf_on_device, device=device)
    print(i, dense_tensor.shape)
    f1_node_features_tensors.append(dense_tensor)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
7553 torch.Size([1, 17])
7554 torch.Size([1, 10])
7555 torch.Size([1, 10])
7556 torch.Size([1, 8])
7557 torch.Size([1, 17])
7558 torch.Size([1, 17])
7559 torch.Size([1, 17])
7560 torch.Size([1, 10])
7561 torch.Size([1, 10])
7562 torch.Size([1, 10])
7563 torch.Size([1, 10])
7564 torch.Size([1, 17])
7565 torch.Size([1, 10])
7566 torch.Size([1, 10])
7567 torch.Size([1, 9])
7568 torch.Size([1, 10])
7569 torch.Size([1, 10])
7570 torch.Size([1, 17])
7571 torch.Size([1, 10])
7572 torch.Size([1, 8])
7573 torch.Size([1, 10])
7574 torch.Size([1, 9])
7575 torch.Size([1, 17])
7576 torch.Size([1, 10])
7577 torch.Size([1, 17])
7578 torch.Size([1, 17])
7579 torch.Size([1, 1507])
7580 torch.Size([1, 10])
7581 torch.Size([1, 10])
7582 torch.Size([1, 17])
7583 torch.Size([1, 9])
7584 torch.Size([1, 17])
7585 torch.Size([1, 17])
7586 torch.Size([1, 9])
7587 torch.Size([1, 17])
7588 torch.Size([1, 10])
7589 torch.Size([1, 10])
7590 torch.Size

In [52]:
def flatten_multi_embedding(met: MultiEmbeddingTensor, device=None, flatten_extra_dims=True):
    """
    Convert MultiEmbeddingTensor to dense tensor with better error handling.
    """
    if device is None:
        device = torch.device("cpu")

    # 1. First check for direct tensor conversion methods
    if hasattr(met, 'to_tensor') and callable(met.to_tensor):
        tensor = met.to_tensor()
        if isinstance(tensor, torch.Tensor):
            if tensor.layout != torch.strided:
                tensor = tensor.to_dense()
            tensor = tensor.to(device)
            if flatten_extra_dims and tensor.dim() > 2:
                tensor = tensor.flatten(start_dim=1)
            return tensor

    # 2. Look for embedding storage in attributes
    dict_candidates = ["_data", "embeddings", "_embeddings", "_tensor_dict", "values"]
    embedding_dict = None

    for candidate in dict_candidates:
        if hasattr(met, candidate):
            candidate_val = getattr(met, candidate)
            # Handle both direct tensors and dictionaries
            if isinstance(candidate_val, torch.Tensor):
                return _process_tensor(candidate_val, device, flatten_extra_dims)
            elif isinstance(candidate_val, dict):
                embedding_dict = candidate_val
                break

    # 3. Handle case where MultiEmbeddingTensor wraps a single tensor
    if embedding_dict is None:
        if hasattr(met, 'values') and callable(met.values):
            tensor = met.values()
            return _process_tensor(tensor, device, flatten_extra_dims)
        else:
            raise ValueError(
                f"Failed to unpack MultiEmbeddingTensor. Available attributes: {dir(met)}\n"
                "Consider inspecting the object structure with: "
                "print(dir(your_multi_embedding_tensor))"
            )

    # 4. Process dictionary of embeddings
    sub_tensors = []
    for emb in embedding_dict.values():
        if isinstance(emb, torch.Tensor):
            if emb.layout != torch.strided:
                emb = emb.to_dense()
            emb = emb.to(device)
            if flatten_extra_dims and emb.dim() > 2:
                emb = emb.flatten(start_dim=1)
            sub_tensors.append(emb)
        else:
            raise TypeError(f"Unexpected embedding type: {type(emb)}")

    return torch.cat(sub_tensors, dim=1)

def _process_tensor(tensor: torch.Tensor, device, flatten_extra_dims) -> torch.Tensor:
    """Helper for consistent tensor processing"""
    if tensor.layout != torch.strided:
        tensor = tensor.to_dense()
    tensor = tensor.to(device)
    if flatten_extra_dims and tensor.dim() > 2:
        tensor = tensor.flatten(start_dim=1)
    return tensor



def torchframe_to_linearized_string(tf, flatten_extra_dims=True):
    """
    Convert a TorchFrame to a linearized string representation of features.
    """
    feats = []
    for stype_key, typed_feat in tf.feat_dict.items():
        feature_names = tf.col_names_dict[stype_key]

        # Risolvi eventuali tensori caricati pigramente
        if callable(typed_feat):
            typed_feat = typed_feat()

        # Gestisci MultiEmbeddingTensor
        if isinstance(typed_feat, MultiEmbeddingTensor):
            met_tensor = flatten_multi_embedding(
                typed_feat, flatten_extra_dims=flatten_extra_dims
            )
            # Converti il tensore in una stringa
            feats.append(f"{stype_key} {met_tensor.tolist()} [SEP]")
            continue

        # Controlla se è un tensore denso
        if hasattr(typed_feat, "to_dense") and callable(typed_feat.to_dense):
            typed_feat = typed_feat.to_dense()

        # Fallback a .values() se non è un tensore
        if not isinstance(typed_feat, torch.Tensor):
            if hasattr(typed_feat, "values") and callable(typed_feat.values):
                typed_feat = typed_feat.values()
            else:
                raise TypeError(
                    f"Feature {stype_key} is not a tensor. Got {type(typed_feat)}"
                )

        # Linearizza e aggiungi alla lista
        typed_feat = typed_feat.flatten(start_dim=0) if flatten_extra_dims and typed_feat.dim() > 1 else typed_feat

        for i in range(len(feature_names)):
          feature_name = feature_names[i]
          feature_value = typed_feat[i]
          feats.append(f"{feature_name} {feature_value} [SEP] ")

    # Unisci tutte le feature in una singola stringa
    return ''.join(feats)# Rimuovi l'ultimo [SEP]

# Utilizzo della funzione
linearized_features = []
for i, tensor_frame in enumerate(f1_node_features):
    if i == 10:
      break
    tf_on_device = tensor_frame.to(device)
    linearized_string = torchframe_to_linearized_string(tf_on_device)
    linearized_features.append(linearized_string)
    print(i, linearized_string)

0 fastestLap nan [SEP] grid 8.0 [SEP] laps 8.0 [SEP] milliseconds nan [SEP] number 8.0 [SEP] points 0.0 [SEP] position nan [SEP] positionOrder 13.0 [SEP] rank nan [SEP] statusId 44.0 [SEP] date 1971 [SEP] 
1 embedding [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [55]:
print(f1_node_features[2])

TensorFrame(
  num_cols=5,
  num_rows=1,
  categorical (1): ['year'],
  numerical (1): ['round'],
  timestamp (2): ['date', 'time'],
  embedding (1): ['name'],
  has_target=False,
  device='cpu',
)


In [89]:
def flatten_multi_embedding(met: MultiEmbeddingTensor, device=None, flatten_extra_dims=True):
    """
    Convert MultiEmbeddingTensor to a list of values with better error handling.
    """
    if device is None:
        device = torch.device("cpu")

    # 1. First check for direct tensor conversion methods
    if hasattr(met, 'to_tensor') and callable(met.to_tensor):
        tensor = met.to_tensor()
        if isinstance(tensor, torch.Tensor):
            if tensor.layout != torch.strided:
                tensor = tensor.to_dense()
            tensor = tensor.to(device)
            if flatten_extra_dims and tensor.dim() > 2:
                tensor = tensor.flatten(start_dim=1)
            return tensor.tolist()  # Restituisci come lista

    # 2. Look for embedding storage in attributes
    dict_candidates = ["_data", "embeddings", "_embeddings", "_tensor_dict", "values"]
    embedding_dict = None

    for candidate in dict_candidates:
        if hasattr(met, candidate):
            candidate_val = getattr(met, candidate)
            # Handle both direct tensors and dictionaries
            if isinstance(candidate_val, torch.Tensor):
                return _process_tensor(candidate_val, device, flatten_extra_dims)  # Restituisci come lista
            elif isinstance(candidate_val, dict):
                embedding_dict = candidate_val
                break

    # 3. Handle case where MultiEmbeddingTensor wraps a single tensor
    if embedding_dict is None:
        if hasattr(met, 'values') and callable(met.values):
            tensor = met.values()
            return _process_tensor(tensor, device, flatten_extra_dims)  # Restituisci come lista
        else:
            raise ValueError(
                f"Failed to unpack MultiEmbeddingTensor. Available attributes: {dir(met)}\n"
                "Consider inspecting the object structure with: "
                "print(dir(your_multi_embedding_tensor))"
            )

    # 4. Process dictionary of embeddings
    sub_tensors = []
    for emb in embedding_dict.values():
        if isinstance(emb, torch.Tensor):
            if emb.layout != torch.strided:
                emb = emb.to_dense()
            emb = emb.to(device)
            if flatten_extra_dims and emb.dim() > 2:
                emb = emb.flatten(start_dim=1)
            sub_tensors.append(emb.tolist())  # Aggiungi come lista
        else:
            raise TypeError(f"Unexpected embedding type: {type(emb)}")

    # Concatenate all sub_tensors and return as a list
    return sub_tensors  # Flatten e restituisci come lista

def _process_tensor(tensor: torch.Tensor, device, flatten_extra_dims) -> list:
    """Helper for consistent tensor processing, returning a list."""

    #if tensor.layout != torch.strided:
        #tensor = tensor.to_dense()
    tensor = tensor.to(device)
    # if flatten_extra_dims and tensor.dim() > 2:
        # tensor = tensor.flatten(start_dim=1)
    return tensor.tolist()  # Restituisci come lista




def torchframe_to_linearized_string(tf, flatten_extra_dims=True):
    """
    Convert a TorchFrame to a linearized string representation of features.
    """
    feats = []
    for stype_key, typed_feat in tf.feat_dict.items():
        print(stype_key)
        print(typed_feat)


        # Risolvi eventuali tensori caricati pigramente
        if callable(typed_feat):
            typed_feat = typed_feat()

        # Gestisci MultiEmbeddingTensor
        if isinstance(typed_feat, MultiEmbeddingTensor):
            met_tensor = flatten_multi_embedding(
                typed_feat, flatten_extra_dims=flatten_extra_dims
            )
            # Converti il tensore in una stringa
            feats.append(f"{stype_key} {met_tensor} [SEP]")
            continue

        # Controlla se è un tensore denso
        if hasattr(typed_feat, "to_dense") and callable(typed_feat.to_dense):
            typed_feat = typed_feat.to_dense()

        # Fallback a .values() se non è un tensore
        if not isinstance(typed_feat, torch.Tensor):
            if hasattr(typed_feat, "values") and callable(typed_feat.values):
                typed_feat = typed_feat.values()
            else:
                raise TypeError(
                    f"Feature {stype_key} is not a tensor. Got {type(typed_feat)}"
                )

        # Linearizza e aggiungi alla lista
        typed_feat = typed_feat.flatten(start_dim=0) if flatten_extra_dims and typed_feat.dim() > 1 else typed_feat
        feats.append(f"{stype_key} {typed_feat.tolist()} [SEP] ")

    # Unisci tutte le feature in una singola stringa
    return ''.join(feats)

# Utilizzo della funzione
linearized_features = []
for i, tensor_frame in enumerate(f1_node_features):
    if i == 5:
      break

    tf_on_device = tensor_frame.to(device)
    linearized_string = torchframe_to_linearized_string(tf_on_device)
    linearized_features.append(linearized_string)
    print(i, linearized_string)

numerical
tensor([[nan,  8.,  8., nan,  8.,  0., nan, 13., nan, 44.]], device='cuda:0')
timestamp
tensor([[[1971,    7,    0,    6,    0,    0,    0]]], device='cuda:0')
0 numerical [nan, 8.0, 8.0, nan, 8.0, 0.0, nan, 13.0, nan, 44.0] [SEP] timestamp [1971, 7, 0, 6, 0, 0, 0] [SEP] 
embedding
MultiEmbeddingTensor(num_rows=1, num_cols=3, device='cuda:0')
1 embedding [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [102]:
for i, tensor_frame in enumerate(f1_node_features):
    print("\n")
    if i == 5:
      break

    tf_on_device = tensor_frame.to(device)
    # print(tf_on_device.col_names_dict)



    # print(tf_on_device.get_col_feat)
    # print(tf_on_device.feat_dict)
    feats = []
    for stype_key, typed_feat in tf_on_device.feat_dict.items():
        # print(stype_key, typed_feat)
        # print(tf_on_device.col_names_dict[stype_key])


        for i in range(len(tf_on_device.col_names_dict[stype_key])):

          if isinstance(typed_feat, MultiEmbeddingTensor):
            met_tensor = flatten_multi_embedding(
                typed_feat, flatten_extra_dims = True
            )
            num_cols = typed_feat.num_cols
            emb_dim = typed_feat.values.shape[1] // num_cols
            for col in range(num_cols):
                feature_name = tf_on_device.col_names_dict[stype_key][i][col]
                feature_value
            feats.append(f"{stype_key} {met_tensor} [SEP]")
            print(typed_feat.num_cols)
            print(typed_feat.values.shape[1])
            continue


          feature_name = tf_on_device.col_names_dict[stype_key][i]
          feature_value = typed_feat[0][i].tolist()

          feats.append(f"{feature_name} {feature_value} [SEP] ")

    print(''.join(feats))




fastestLap nan [SEP] grid 8.0 [SEP] laps 8.0 [SEP] milliseconds nan [SEP] number 8.0 [SEP] points 0.0 [SEP] position nan [SEP] positionOrder 13.0 [SEP] rank nan [SEP] statusId 44.0 [SEP] date [1971, 7, 0, 6, 0, 0, 0] [SEP] 


3
900
3
900
3
900
embedding [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.