In [1]:
# we install all the required for RelBench
!pip install relbench[full]

Collecting relbench[full]
  Downloading relbench-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting pytorch_frame>=0.2.3 (from relbench[full])
  Downloading pytorch_frame-0.2.4-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric (from relbench[full])
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading pytorch_frame-0.2.4-py3-none-any.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.5/144.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading relbench-1.1.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstall

In [2]:
import relbench
from relbench.datasets import get_dataset_names, get_dataset
from relbench.modeling.utils import get_stype_proposal
from relbench.modeling.graph import make_pkey_fkey_graph


import torch
from torch_geometric.seed import seed_everything
from torch import Tensor
from torch_frame.config.text_embedder import TextEmbedderConfig


from typing import List, Optional


from sentence_transformers import SentenceTransformer


import os


import pandas as pd


import random

In [3]:
class GloveTextEmbedding:
    def __init__(self, device: Optional[torch.device] = None):
        self.model = SentenceTransformer(
            "sentence-transformers/average_word_embeddings_glove.6B.300d",
            device=device,
        )

    def __call__(self, sentences: List[str]) -> Tensor:
        return self.model.encode(sentences, convert_to_tensor=True)

# Environment Checking

In [4]:
# Check that it's cuda if you want it to run in reasonable time!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
root_dir = "./data"

# Configure the text encoder
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256
)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)WordEmbeddings/wordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

(…)beddings/whitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
print(f"The RelBench version is {relbench.__version__}")
print(f"The RelBench datasets are {get_dataset_names()}")

The RelBench version is 1.1.0
The RelBench datasets are ['rel-amazon', 'rel-avito', 'rel-event', 'rel-f1', 'rel-hm', 'rel-stack', 'rel-trial']


#Usefull functions

In [7]:
def build_nodes_dictionary(KG_data, node_names, val_timestamp, test_timestamp):
    nodes_dict = {}
    seq_number = 0  # Inizializzazione del contatore sequenziale

    for node_name in node_names:
        # Estrazione dei dati del nodo
        node_data = KG_data[node_name]

        # Controllo se il campo 'time' esiste
        if 'time' in node_data:
            timestamps = node_data['time']  # Supponendo che i timestamp siano in un campo 'time'

            for index, time in enumerate(timestamps):
                time = pd.to_datetime(time.item(), unit='s')

                # Determina se il nodo è in train, validation o test
                if time < val_timestamp:
                    category = 'train'
                elif val_timestamp <= time < test_timestamp:
                    category = 'validation'
                else:
                    category = 'test'

                # Creazione della chiave e valore da inserire nel dizionario
                key = f"{node_name}_{index}"
                value = [seq_number, category]

                # Aggiunta al dizionario
                nodes_dict[key] = value
                seq_number += 1  # Incrementa il contatore per ogni nodo analizzato
        else:
            # Se il campo 'time' non esiste
            for index in range(len(node_data)):  # Itera sugli indici dei nodi
                key = f"{node_name}_{index}"
                value = [seq_number, 'all']  # Categoria 'all' se non c'è 'time'

                # Aggiunta al dizionario
                nodes_dict[key] = value
                seq_number += 1  # Incrementa il contatore

    return nodes_dict


In [8]:
def modify_node_name(key):
    # Trova l'ultima posizione di '_' e rimuove la parte numerica
    last_underscore_index = key.rfind('_')

    if last_underscore_index != -1:
        # Parte letterale (es. 'standings')
        modified_name = key[:last_underscore_index]

        # Parte numerica (es. 3)
        node_index = int(''.join(filter(str.isdigit, key[last_underscore_index:])))

        return modified_name, node_index

    else:
        node_name = ''.join(filter(str.isalpha, key))  # Parte letterale (es. 'standings')
        node_index = int(''.join(filter(str.isdigit, key)))  # Parte numerica (es. 5)
        return node_name, node_index

In [9]:
def build_data_file(KG_data, node_names, nodes_dictionary, edges_names):
    node_features = []
    labels = []
    edges = []

    train_mask = []
    val_mask = []
    test_mask = []

    for key in nodes_dictionary:
        # Dividi la chiave in parte letterale e parte numerica
        # node_name = ''.join(filter(str.isalpha, key))  # Parte letterale (es. 'standings')
        # node_index = int(''.join(filter(str.isdigit, key)))  # Parte numerica (es. 5)
        node_name, node_index = modify_node_name(key)

        # Estrai il valore delle features usando i dati di KD_data
        print(node_name)
        feature_value = KG_data[node_name].tf[node_index]
        node_features.append(feature_value)

        # Inserisco anche la label del nodo in labels
        labels.append(node_name)

        # Aggiorno le maschere
        split_value = nodes_dictionary[key][1]
        if split_value == "train":
            train_mask.append(True)
            val_mask.append(False)
            test_mask.append(False)
        elif split_value == "validation":
            train_mask.append(False)
            val_mask.append(True)
            test_mask.append(False)
        elif split_value == "test":
            train_mask.append(False)
            val_mask.append(False)
            test_mask.append(True)
        else:
            train_mask.append(True)
            val_mask.append(True)
            test_mask.append(True)

    # return  node_features, labels, edges, train_mask, val_mask, test_mask

#'''
        # Ora devo inserire gli edges in cui è presente quel nodo
        for edge in edges_names:
            # controllo che quel tipo di nodo è sorgente di un possibile edge
            if node_name == edge[0]:
                # se l'edge ha dentro quel tipo di nodo allora
                edge_index = KG_data[edge].edge_index # retrieve indexes
                # prendo tutti gli indici dei nodi sorgente e dei nodi destinazione
                source_nodes = edge_index[0]  # source indexes
                target_nodes = edge_index[1]  # targer indexes

                source_dict_indices = []
                target_dict_indices = []

                # Creazione delle nuove chiavi
                for source_index in source_nodes:
                    source_key = f"{node_name}_{source_index.item()}"  # Concatenazione con il nome del nodo
                    source_dict_indices.append(nodes_dictionary[source_key][0])

                for target_index in target_nodes:
                    target_key = f"{node_name}_{target_index.item()}"  # Concatenazione
                    target_dict_indices.append(nodes_dictionary[target_key][0])


                # Creazione delle coppie e aggiunta alla lista edges
                for src_index in source_dict_indices:
                    for tgt_index in target_dict_indices:
                        edge_pair = [src_index, tgt_index]
                        if edge_pair not in edges:  # Controllo se la coppia non è già presente
                            edges.append(edge_pair)
        break

    return   node_features, labels, edges, train_mask, val_mask, test_mask
#'''

# F1 Dataset Creation

In [10]:
# We download the f1-dataset
f1_dataset = get_dataset(name="rel-f1", download=True)

# we download the entire database (also the test part)
f1_db = f1_dataset.get_db(upto_test_timestamp = False)
f1_col_to_stype_dict = get_stype_proposal(f1_db)

# Generate graph data
f1_data, f1_col_stats_dict = make_pkey_fkey_graph(
    f1_db,
    col_to_stype_dict = f1_col_to_stype_dict,  # Column types
    text_embedder_cfg = text_embedder_cfg,  # Our chosen text encoder
    cache_dir=os.path.join(
        root_dir, f"rel-f1_materialized_cache"
    ),  # Store materialized graph for convenience
)

Downloading file 'rel-f1/db.zip' from 'https://relbench.stanford.edu/download/rel-f1/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 704k/704k [00:00<00:00, 561MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/db.zip' to '/root/.cache/relbench/rel-f1/.'


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.14 seconds.


Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:01<00:00,  2.47it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 34.31it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 43.83it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 55.42it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 34.02it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 169.11it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 223.24it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 256.77it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 255.75it/s]
  ser = pd.to_datetime(ser, format=time_format)
Embedding raw data in mini-batch: 100%|██████████| 5/5 [00:00<00:00, 123.18it/s]
  ser = pd.to_datetime(ser, format=self.format, errors='coerce')
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 43.00it/s]
Em

In [11]:
f1_val_timestep = f1_dataset.val_timestamp
f1_test_timestep = f1_dataset.test_timestamp

print(f"The validation timestep is: {f1_val_timestep}")
print(f"The test timestep is: {f1_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


In [12]:
f1_data

HeteroData(
  standings={
    tf=TensorFrame([34124, 4]),
    time=[34124],
  },
  drivers={ tf=TensorFrame([857, 6]) },
  results={
    tf=TensorFrame([26080, 11]),
    time=[26080],
  },
  constructor_results={
    tf=TensorFrame([12290, 2]),
    time=[12290],
  },
  circuits={ tf=TensorFrame([77, 7]) },
  qualifying={
    tf=TensorFrame([9815, 3]),
    time=[9815],
  },
  races={
    tf=TensorFrame([1101, 5]),
    time=[1101],
  },
  constructors={ tf=TensorFrame([211, 3]) },
  constructor_standings={
    tf=TensorFrame([13051, 4]),
    time=[13051],
  },
  (standings, f2p_raceId, races)={ edge_index=[2, 34124] },
  (races, rev_f2p_raceId, standings)={ edge_index=[2, 34124] },
  (standings, f2p_driverId, drivers)={ edge_index=[2, 34124] },
  (drivers, rev_f2p_driverId, standings)={ edge_index=[2, 34124] },
  (results, f2p_raceId, races)={ edge_index=[2, 26080] },
  (races, rev_f2p_raceId, results)={ edge_index=[2, 26080] },
  (results, f2p_driverId, drivers)={ edge_index=[2, 26080] 

In [None]:
node_names = ['standings', 'drivers', 'results', 'constructor_results', 'circuits', 'qualifying', 'races', 'constructors', 'constructor_standings']
edges_names = [('constructor_standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_standings'),
                ('constructor_standings', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_standings'),
                ('standings', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'standings'),
                ('standings', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'standings'),
                ('constructor_results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'constructor_results'),
                ('constructor_results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'constructor_results'),
                ('results', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'results'),
                ('results', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'results'),
                ('results', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'results'),
                ('qualifying', 'f2p_raceId', 'races'),
                ('races', 'rev_f2p_raceId', 'qualifying'),
                ('qualifying', 'f2p_driverId', 'drivers'),
                ('drivers', 'rev_f2p_driverId', 'qualifying'),
                ('qualifying', 'f2p_constructorId', 'constructors'),
                ('constructors', 'rev_f2p_constructorId', 'qualifying'),
                ('races', 'f2p_circuitId', 'circuits'),
                ('circuits', 'rev_f2p_circuitId', 'races')]

In [None]:
F1_dict = build_nodes_dictionary(KG_data = f1_data,
                                 node_names = node_names,
                                 val_timestamp = f1_val_timestep,
                                 test_timestamp = f1_test_timestep)

In [None]:
F1_dict

{'standings_0': [0, 'train'],
 'standings_1': [1, 'train'],
 'standings_2': [2, 'train'],
 'standings_3': [3, 'train'],
 'standings_4': [4, 'train'],
 'standings_5': [5, 'train'],
 'standings_6': [6, 'train'],
 'standings_7': [7, 'train'],
 'standings_8': [8, 'train'],
 'standings_9': [9, 'train'],
 'standings_10': [10, 'train'],
 'standings_11': [11, 'train'],
 'standings_12': [12, 'train'],
 'standings_13': [13, 'train'],
 'standings_14': [14, 'train'],
 'standings_15': [15, 'train'],
 'standings_16': [16, 'train'],
 'standings_17': [17, 'train'],
 'standings_18': [18, 'train'],
 'standings_19': [19, 'train'],
 'standings_20': [20, 'train'],
 'standings_21': [21, 'train'],
 'standings_22': [22, 'train'],
 'standings_23': [23, 'train'],
 'standings_24': [24, 'train'],
 'standings_25': [25, 'train'],
 'standings_26': [26, 'train'],
 'standings_27': [27, 'train'],
 'standings_28': [28, 'train'],
 'standings_29': [29, 'train'],
 'standings_30': [30, 'train'],
 'standings_31': [31, 'train

In [None]:
node_features, labels, edges, train_mask, val_mask, test_mask = build_data_file(KG_data = f1_data,
                                                                                node_names = node_names,
                                                                                nodes_dictionary = F1_dict,
                                                                                edges_names = edges_names)

standings


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-dc3adaa1e589>", line 1, in <cell line: 0>
    node_features, labels, edges, train_mask, val_mask, test_mask = build_data_file(KG_data = f1_data,
                                                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-8-70616300896c>", line None, in build_data_file
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
          ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Tra