In [1]:
# we install all the required for RelBench
!pip install relbench[full]

Collecting relbench[full]
  Downloading relbench-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting pytorch_frame>=0.2.3 (from relbench[full])
  Downloading pytorch_frame-0.2.4-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric (from relbench[full])
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Downloading pytorch_frame-0.2.4-py3-none-any.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.5/144.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading relbench-1.1.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstall

In [2]:
import relbench
from relbench.datasets import get_dataset_names, get_dataset
from relbench.modeling.utils import get_stype_proposal
from relbench.modeling.graph import make_pkey_fkey_graph


import torch
from torch_geometric.seed import seed_everything
from torch import Tensor
from torch_frame.config.text_embedder import TextEmbedderConfig


from typing import List, Optional


from sentence_transformers import SentenceTransformer


import os


import pandas as pd


import random

In [3]:
class GloveTextEmbedding:
    def __init__(self, device: Optional[torch.device] = None):
        self.model = SentenceTransformer(
            "sentence-transformers/average_word_embeddings_glove.6B.300d",
            device=device,
        )

    def __call__(self, sentences: List[str]) -> Tensor:
        return self.model.encode(sentences, convert_to_tensor=True)

# Environment Checking

In [4]:
# Check that it's cuda if you want it to run in reasonable time!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
root_dir = "./data"

# Configure the text encoder
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256
)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

(…)WordEmbeddings/wordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)beddings/whitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
print(f"The RelBench version is {relbench.__version__}")
print(f"The RelBench datasets are {get_dataset_names()}")

The RelBench version is 1.1.0
The RelBench datasets are ['rel-amazon', 'rel-avito', 'rel-event', 'rel-f1', 'rel-hm', 'rel-stack', 'rel-trial']


#Usefull functions

In [6]:
def build_nodes_dictionary(KG_data, node_names, val_timestamp, test_timestamp):
    nodes_dict = {}
    seq_number = 0  # Inizializzazione del contatore sequenziale

    for node_name in node_names:
        # Estrazione dei dati del nodo
        node_data = KG_data[node_name]

        # Controllo se il campo 'time' esiste
        if 'time' in node_data:
            timestamps = node_data['time']  # Supponendo che i timestamp siano in un campo 'time'

            for index, time in enumerate(timestamps):
                time = pd.to_datetime(time.item(), unit='s')

                # Determina se il nodo è in train, validation o test
                if time < val_timestamp:
                    category = 'train'
                elif val_timestamp <= time < test_timestamp:
                    category = 'validation'
                else:
                    category = 'test'

                # Creazione della chiave e valore da inserire nel dizionario
                key = f"{node_name}_{index}"
                value = [seq_number, category]

                # Aggiunta al dizionario
                nodes_dict[key] = value
                seq_number += 1  # Incrementa il contatore per ogni nodo analizzato
        else:
            # Se il campo 'time' non esiste
            for index in range(len(node_data)):  # Itera sugli indici dei nodi
                key = f"{node_name}_{index}"
                value = [seq_number, 'all']  # Categoria 'all' se non c'è 'time'

                # Aggiunta al dizionario
                nodes_dict[key] = value
                seq_number += 1  # Incrementa il contatore

    return nodes_dict


In [7]:
def modify_node_name(key):
    # Trova l'ultima posizione di '_' e rimuove la parte numerica
    last_underscore_index = key.rfind('_')

    if last_underscore_index != -1:
        # Parte letterale (es. 'standings')
        modified_name = key[:last_underscore_index]

        # Parte numerica (es. 3)
        node_index = int(''.join(filter(str.isdigit, key[last_underscore_index:])))

        return modified_name, node_index

    else:
        node_name = ''.join(filter(str.isalpha, key))  # Parte letterale (es. 'standings')
        node_index = int(''.join(filter(str.isdigit, key)))  # Parte numerica (es. 5)
        return node_name, node_index

In [8]:
import requests

def read_triplets_from_file(file_path):
    triplets = []

    # Controlla se il file_path è un URL
    if file_path.startswith("http://") or file_path.startswith("https://"):
        response = requests.get(file_path)
        if response.status_code == 200:
            lines = response.text.splitlines()
        else:
            print(f"Errore nel download del file: {response.status_code}")
            return triplets
    else:
        # Legge il file locale
        with open(file_path, 'r') as file:
            lines = file.readlines()

    for line in lines:
        # Rimuovi eventuali spazi bianchi e separa la riga in base ai tab
        parts = line.strip().split('\t')
        if len(parts) == 3:  # Assicurati che ci siano esattamente 3 elementi
            triplet = (parts[0], parts[1], parts[2])
            triplets.append(triplet)
        else:
            print(f"Riga non valida: {line.strip()}")

    return triplets


In [9]:
def build_nodes_dictionary_from_triplets(node_dict, triplets, nodes_without_timestamp, split):

    for triplet in triplets:
      source_node = triplet[0]
      edge_label = triplet[1]
      target_node = triplet[2]

      # se il source node e il target node non sono già nel vocabolario li aggiungo
      if source_node not in node_dict:
          source_node_label = modify_node_name(source_node)

          if source_node_label in nodes_without_timestamp:
            node_dict[source_node] = [len(node_dict), 'all']
          else:
            node_dict[source_node] = [len(node_dict), split]

      if target_node not in node_dict:
          target_node_label = modify_node_name(target_node)

          if target_node_label in nodes_without_timestamp:
            node_dict[target_node] = [len(node_dict), 'all']
          else:
            node_dict[target_node] = [len(node_dict), split]

    return node_dict

In [10]:
def build_data_file(KG_data, node_names, nodes_dictionary, edges_names):
    node_features = []
    labels = []
    edges = []

    train_mask = []
    val_mask = []
    test_mask = []

    for key in nodes_dictionary:
        # Dividi la chiave in parte letterale e parte numerica
        # node_name = ''.join(filter(str.isalpha, key))  # Parte letterale (es. 'standings')
        # node_index = int(''.join(filter(str.isdigit, key)))  # Parte numerica (es. 5)
        node_name, node_index = modify_node_name(key)

        # Estrai il valore delle features usando i dati di KD_data
        print(node_name)
        feature_value = KG_data[node_name].tf[node_index]
        node_features.append(feature_value)

        # Inserisco anche la label del nodo in labels
        labels.append(node_name)

        # Aggiorno le maschere
        split_value = nodes_dictionary[key][1]
        if split_value == "train":
            train_mask.append(True)
            val_mask.append(False)
            test_mask.append(False)
        elif split_value == "validation":
            train_mask.append(False)
            val_mask.append(True)
            test_mask.append(False)
        elif split_value == "test":
            train_mask.append(False)
            val_mask.append(False)
            test_mask.append(True)
        else:
            train_mask.append(True)
            val_mask.append(True)
            test_mask.append(True)

    # return  node_features, labels, edges, train_mask, val_mask, test_mask

#'''
        # Ora devo inserire gli edges in cui è presente quel nodo
        for edge in edges_names:
            # controllo che quel tipo di nodo è sorgente di un possibile edge
            if node_name == edge[0]:
                # se l'edge ha dentro quel tipo di nodo allora
                edge_index = KG_data[edge].edge_index # retrieve indexes
                # prendo tutti gli indici dei nodi sorgente e dei nodi destinazione
                source_nodes = edge_index[0]  # source indexes
                target_nodes = edge_index[1]  # targer indexes

                source_dict_indices = []
                target_dict_indices = []

                # Creazione delle nuove chiavi
                for source_index in source_nodes:
                    source_key = f"{node_name}_{source_index.item()}"  # Concatenazione con il nome del nodo
                    source_dict_indices.append(nodes_dictionary[source_key][0])

                for target_index in target_nodes:
                    target_key = f"{node_name}_{target_index.item()}"  # Concatenazione
                    target_dict_indices.append(nodes_dictionary[target_key][0])


                # Creazione delle coppie e aggiunta alla lista edges
                for src_index in source_dict_indices:
                    for tgt_index in target_dict_indices:
                        edge_pair = [src_index, tgt_index]
                        if edge_pair not in edges:  # Controllo se la coppia non è già presente
                            edges.append(edge_pair)
        break

    return   node_features, labels, edges, train_mask, val_mask, test_mask
#'''

In [11]:
def build_GraphAny_dataset(KG_data, node_dict, triplets):
    node_features = []
    labels = []
    edges = []

    train_mask = []
    val_mask = []
    test_mask = []

    for key, value in node_dict.items():
      # dalla chiave ottengo il nome del nodo e l'indice
      entity_label, entity_index = modify_node_name(key)

      # da KG_data prendo le features di quell'entità e la aggiungo a node_features
      node_features.append(KG_data[entity_label].tf[entity_index])

      # aggiugo la label di quell'entità a labels
      labels.append(entity_label)

      # in base al valore di split aggiorno le maschere
      split_string = value[1]
      if split_string == "train":
        train_mask.append(True)
        val_mask.append(False)
        test_mask.append(False)
      elif split_string == "val":
        train_mask.append(False)
        val_mask.append(True)
        test_mask.append(False)
      elif split_string == "test":
        train_mask.append(False)
        val_mask.append(False)
        test_mask.append(True)
      elif split_string == 'all':
        train_mask.append(True)
        val_mask.append(True)
        test_mask.append(True)


    # per ogni tripletta passata ricavo source e target
    for triplet in triplets:
      source_node = triplet[0]
      edge_label = triplet[1]
      target_node = triplet[2]

      source_index = node_dict[source_node][0]
      target_index = node_dict[target_node][0]

      pair = [source_index, target_index]
      if pair not in edges:
        edges.append(pair)

    return node_features, labels, edges, train_mask, val_mask, test_mask

# F1 Dataset Creation

In [12]:
# We download the f1-dataset
f1_dataset = get_dataset(name="rel-f1", download=True)

# we download the entire database (also the test part)
f1_db = f1_dataset.get_db(upto_test_timestamp = False)
f1_col_to_stype_dict = get_stype_proposal(f1_db)

# Generate graph data
f1_data, f1_col_stats_dict = make_pkey_fkey_graph(
    f1_db,
    col_to_stype_dict = f1_col_to_stype_dict,  # Column types
    text_embedder_cfg = text_embedder_cfg,  # Our chosen text encoder
    cache_dir=os.path.join(
        root_dir, f"rel-f1_materialized_cache"
    ),  # Store materialized graph for convenience
)

Downloading file 'rel-f1/db.zip' from 'https://relbench.stanford.edu/download/rel-f1/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 704k/704k [00:00<00:00, 704MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/db.zip' to '/root/.cache/relbench/rel-f1/.'


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.15 seconds.


Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00,  5.19it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 218.03it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 204.66it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 213.23it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 191.79it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 47.46it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 158.18it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 102.56it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 276.38it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 246.58it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 243.25it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 172.35it/s]
  ser = pd.to_datetime(ser, fo

In [13]:
f1_val_timestep = f1_dataset.val_timestamp
f1_test_timestep = f1_dataset.test_timestamp

print(f"The validation timestep is: {f1_val_timestep}")
print(f"The test timestep is: {f1_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


In [14]:
f1_data

HeteroData(
  drivers={ tf=TensorFrame([857, 6]) },
  results={
    tf=TensorFrame([26080, 11]),
    time=[26080],
  },
  standings={
    tf=TensorFrame([34124, 4]),
    time=[34124],
  },
  constructor_results={
    tf=TensorFrame([12290, 2]),
    time=[12290],
  },
  constructors={ tf=TensorFrame([211, 3]) },
  circuits={ tf=TensorFrame([77, 7]) },
  qualifying={
    tf=TensorFrame([9815, 3]),
    time=[9815],
  },
  constructor_standings={
    tf=TensorFrame([13051, 4]),
    time=[13051],
  },
  races={
    tf=TensorFrame([1101, 5]),
    time=[1101],
  },
  (results, f2p_raceId, races)={ edge_index=[2, 26080] },
  (races, rev_f2p_raceId, results)={ edge_index=[2, 26080] },
  (results, f2p_driverId, drivers)={ edge_index=[2, 26080] },
  (drivers, rev_f2p_driverId, results)={ edge_index=[2, 26080] },
  (results, f2p_constructorId, constructors)={ edge_index=[2, 26080] },
  (constructors, rev_f2p_constructorId, results)={ edge_index=[2, 26080] },
  (standings, f2p_raceId, races)={ edge

In [15]:
node_names = ['standings', 'drivers', 'results', 'constructor_results', 'circuits', 'qualifying', 'races', 'constructors', 'constructor_standings']
node_without_timestamp = ['drivers', 'circuits', 'constructors']
edges_names = [('constructor_standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_standings'),
                ('constructor_standings', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_standings'),
                ('standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'standings'),
                ('standings', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'standings'),
                ('constructor_results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_results'),
                ('constructor_results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_results'),
                ('results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'results'),
                ('results', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'results'),
                ('results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'results'),
                ('qualifying', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'qualifying'),
                ('qualifying', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'qualifying'),
                ('qualifying', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'qualifying'),
                ('races', 'f2p_circuitId', 'circuits'),
                ('circuits', 'rev_f2p_circuitId', 'races')]

In [16]:
train_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/train.txt"
val_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/inference_valid.txt"
test_path = "https://raw.githubusercontent.com/RiccardoRomeo01/BDATM_project_public_data/main/datasets/F1-v2/inductive/inference_test.txt"

In [17]:
train_triplets = read_triplets_from_file(train_path)
val_triplets = read_triplets_from_file(val_path)
test_triplets = read_triplets_from_file(test_path)

In [18]:
f1_dict = {}
f1_dict = build_nodes_dictionary_from_triplets(node_dict = f1_dict,
                                               triplets = train_triplets,
                                               nodes_without_timestamp = node_without_timestamp,
                                               split = 'train')
f1_dict = build_nodes_dictionary_from_triplets(node_dict = f1_dict,
                                               triplets = val_triplets,
                                               nodes_without_timestamp = node_without_timestamp,
                                               split = 'val')
f1_dict = build_nodes_dictionary_from_triplets(node_dict = f1_dict,
                                               triplets = test_triplets,
                                               nodes_without_timestamp = node_without_timestamp,
                                               split = 'test')

In [19]:
f1_dict

{'results_4553': [0, 'train'],
 'constructors_177': [1, 'train'],
 'races_439': [2, 'train'],
 'constructor_results_4615': [3, 'train'],
 'drivers_7': [4, 'train'],
 'qualifying_6747': [5, 'train'],
 'drivers_77': [6, 'train'],
 'results_13175': [7, 'train'],
 'drivers_838': [8, 'train'],
 'results_25067': [9, 'train'],
 'constructors_17': [10, 'train'],
 'constructor_results_8620': [11, 'train'],
 'drivers_46': [12, 'train'],
 'qualifying_1809': [13, 'train'],
 'constructors_9': [14, 'train'],
 'results_20307': [15, 'train'],
 'constructors_24': [16, 'train'],
 'constructor_standings_8057': [17, 'train'],
 'constructors_2': [18, 'train'],
 'constructor_standings_3456': [19, 'train'],
 'constructors_1': [20, 'train'],
 'constructor_results_8984': [21, 'train'],
 'races_690': [22, 'train'],
 'standings_25265': [23, 'train'],
 'constructors_41': [24, 'train'],
 'constructor_standings_6232': [25, 'train'],
 'drivers_119': [26, 'train'],
 'results_14180': [27, 'train'],
 'drivers_152': [28

In [20]:
f1_triplets = train_triplets + val_triplets + test_triplets

In [51]:
f1_node_features, f1_labels, f1_edges, f1_train_mask, f1_val_mask, f1_test_mask = build_GraphAny_dataset(KG_data = f1_data,
                                                                                                          node_dict = f1_dict,
                                                                                                          triplets = f1_triplets)

In [53]:
!pip install h5py
import h5py



In [54]:
with h5py.File('f1_data.h5', 'w') as hf:
      hf.create_dataset('node_features', data=f1_node_features, compression='gzip')
      hf.create_dataset('labels', data=f1_labels, compression='gzip')
      hf.create_dataset('edges', data=f1_edges, compression='gzip')
      hf.create_dataset('train_mask', data=f1_train_mask, compression='gzip')
      hf.create_dataset('val_mask', data=f1_val_mask, compression='gzip')
      hf.create_dataset('test_mask', data=f1_test_mask, compression='gzip')

ValueError: setting an array element with a sequence. The requested array would exceed the maximum number of dimension of 32.

In [None]:
# PER LEGGERE DAL FILE

with h5py.File('data.h5', 'r') as hf:
    f1_node_features = hf['node_features'][:]
    f1_labels = hf['labels'][:]
    f1_edges = hf['edges'][:]
    f1_train_mask = hf['train_mask'][:]
    f1_val_mask = hf['val_mask'][:]
    f1_test_mask = hf['test_mask'][:]