# Setting Environment

In [None]:
!pip install relbench[full]

Collecting relbench[full]
  Downloading relbench-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting pytorch_frame>=0.2.3 (from relbench[full])
  Downloading pytorch_frame-0.2.5-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric (from relbench[full])
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2

In [None]:
import relbench
from relbench.datasets import get_dataset_names, get_dataset
from relbench.modeling.utils import get_stype_proposal
from relbench.modeling.graph import make_pkey_fkey_graph
from relbench.tasks import get_task_names, get_task
from relbench.base import TaskType


import torch
from torch_geometric.seed import seed_everything
from torch import Tensor
from torch_frame import stype
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.data.multi_embedding_tensor import MultiEmbeddingTensor


from typing import List, Optional


from sentence_transformers import SentenceTransformer


import os


import pandas as pd

import numpy as np

import random

import pickle

import requests

In [None]:
class GloveTextEmbedding:
    """Callable text encoder built on a GloVe sentence-transformers checkpoint.

    The instance loads ``sentence-transformers/average_word_embeddings_glove.6B.300d``
    once and can then be called like a function on a batch of sentences.
    """

    def __init__(self, device: Optional[torch.device] = None):
        # `device` is forwarded unchanged (including None) to SentenceTransformer.
        checkpoint_name = "sentence-transformers/average_word_embeddings_glove.6B.300d"
        self.model = SentenceTransformer(checkpoint_name, device=device)

    def __call__(self, sentences: List[str]) -> Tensor:
        """Encode ``sentences`` with the wrapped model and return a tensor."""
        encoded = self.model.encode(sentences, convert_to_tensor=True)
        return encoded

In [None]:
# Use the GPU when one is available; the printed device should be "cuda"
# if you want the notebook to run in reasonable time.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    # Limit PyTorch to a single CPU thread when the work runs on the GPU.
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
root_dir = "./data"

# Configure the text encoder used when the graph is materialized:
# text columns are embedded with GloVe in batches of 256.
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256
)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

wordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

whitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Record the installed RelBench version and the datasets it exposes.
print(f"The RelBench version is {relbench.__version__}")
print(f"The RelBench datasets are {get_dataset_names()}")

The RelBench version is 1.1.0
The RelBench datasets are ['rel-amazon', 'rel-avito', 'rel-event', 'rel-f1', 'rel-hm', 'rel-stack', 'rel-trial']


# Some Useful Functions

In [None]:
def pick_pairs(KG_data, edge):
    """
    Retrieve (source, target) node-index pairs for one edge type of the graph.

    Parameters:
    - KG_data: Graph container indexed by edge type; ``KG_data[edge]`` must expose an
      ``edge_index`` tensor whose row 0 holds source indexes and row 1 target indexes.
    - edge (tuple): The edge-type key, e.g. ``(source_node_type, relation, target_node_type)``.
      It is only used as a lookup key into ``KG_data``.

    Returns:
    - pairs_of_indexes (torch.Tensor): A tensor with one row per edge,
      where each row is a pair ``(source_node, target_node)``.
    """
    edge_index = KG_data[edge].edge_index
    source_nodes, target_nodes = edge_index[0], edge_index[1]

    # Pair up the two rows column-wise: result[i] == (source_nodes[i], target_nodes[i]).
    return torch.stack((source_nodes, target_nodes), dim=1)

In [None]:
def train_inference_split_pairs(data, pairs, time_node, val_timestamp, test_timestamp):
    """
    Split pairs of nodes into training, validation, and test sets based on timestamps.

    The timestamp of a pair is the timestamp of its *source* node, read from
    ``data[time_node].time`` and interpreted as seconds since the Unix epoch.

    Parameters:
    - data: Container indexed by node type; ``data[time_node].time`` holds one integer
      timestamp (in seconds) per node.
    - pairs (torch.Tensor): A tensor with one ``(source, target)`` index pair per row.
    - time_node (str): The node type whose ``time`` attribute provides the timestamps.
    - val_timestamp (pandas.Timestamp): Pairs strictly before it go to the training set.
    - test_timestamp (pandas.Timestamp): Pairs in ``[val_timestamp, test_timestamp)`` go to
      the validation set; everything else goes to the test set.

    Returns:
    - final_pairs_train (list): A list of tuples representing the training pairs.
    - final_pairs_val (list): A list of tuples representing the validation pairs.
    - final_pairs_test (list): A list of tuples representing the test pairs.
    """
    final_pairs_train = []
    final_pairs_val = []
    final_pairs_test = []

    if pairs.shape[0] == 0:
        return final_pairs_train, final_pairs_val, final_pairs_test

    # Pull everything out of the tensors once instead of calling .item() per row.
    source_nodes = pairs[:, 0].tolist()
    target_nodes = pairs[:, 1].tolist()
    time_ints = data[time_node].time[source_nodes].tolist()

    # One bulk conversion from integer seconds to pandas timestamps.
    transaction_times = pd.to_datetime(time_ints, unit='s')

    for source_node, target_node, transaction_time in zip(source_nodes, target_nodes, transaction_times):
        pair = (source_node, target_node)
        if transaction_time < val_timestamp:
            final_pairs_train.append(pair)
        elif val_timestamp <= transaction_time and transaction_time < test_timestamp:
            final_pairs_val.append(pair)
        else:
            final_pairs_test.append(pair)

    return final_pairs_train, final_pairs_val, final_pairs_test

In [None]:
def retrieve_edges_features(KG_data, edges_dict):
    """
    Collect the features of every node touched by the given edge types and
    re-express the edges with a single, global node numbering.

    Nodes are keyed as ``"<node_type>_<local_index>"`` and numbered in the order in
    which they are first seen (source before destination for each edge).

    Parameters:
    - KG_data: Graph container. ``KG_data[edge].edge_index`` holds the edges of an edge
      type (row 0 = source indexes, row 1 = destination indexes) and
      ``KG_data[node_type].tf[i]`` returns the features of node ``i`` of that type.
    - edges_dict (dict): Its keys are the edge types to process, as tuples whose first
      and third items are the source and destination node types. The values are not used.

    Returns:
    - node_features (dict): Maps each node key to its features; the insertion order
      defines the global node index.
    - output_edges (list): One ``(source_position, destination_position)`` tuple per edge,
      where positions refer to the order of ``node_features``.
    """
    node_features = {}   # node key -> features (insertion order = global index)
    node_positions = {}  # node key -> global index, for O(1) lookup
    output_edges = []

    def _register(node_type, local_index):
        """Return the global position of a node, adding it on first sight."""
        key = f"{node_type}_{local_index}"
        position = node_positions.get(key)
        if position is None:
            position = len(node_features)
            node_positions[key] = position
            # Features are only fetched the first time a node is seen.
            node_features[key] = KG_data[node_type].tf[local_index]
        return position

    for edge in edges_dict:
        src_name = edge[0]
        dst_name = edge[2]

        edge_index = KG_data[edge].edge_index
        src_locals = edge_index[0].tolist()
        dst_locals = edge_index[1].tolist()

        for src_local, dst_local in zip(src_locals, dst_locals):
            # Looking positions up in a dict (rather than scanning node_features while
            # overwriting the index being searched for) keeps the local index and the
            # global position strictly separate.
            src_position = _register(src_name, src_local)
            dst_position = _register(dst_name, dst_local)
            output_edges.append((src_position, dst_position))

    return node_features, output_edges


In [None]:
def build_masks_and_labels(KG_data, node_without_timestamp, node_features, train_table, val_table, test_table, class_value, val_timestamp, test_timestamp, task_column_name='did_not_finish'):
    """
    Build labels and masks for training, validation, and testing datasets based on node features and timestamps.

    Every key of ``node_features`` has the form ``"<node_type>_<index>"``. Three kinds of
    nodes are distinguished:

    * target nodes (the node type referenced by the first foreign key of ``train_table``):
      each mask says whether the node id occurs in the corresponding task table, and the
      label is taken from the first matching row of the first table (train, then val,
      then test) that contains the id; ``class_value`` is used if none does;
    * nodes whose type is listed in ``node_without_timestamp``: label ``class_value``,
      visible in all three splits;
    * all other nodes: label ``class_value``, assigned to exactly one split by comparing
      ``KG_data[node_type].time[index]`` (seconds since the epoch) with the timestamps.

    Parameters:
    - KG_data: Graph container; ``KG_data[node_type].time`` holds per-node timestamps.
    - node_without_timestamp (list): Node types that have no timestamps.
    - node_features (dict): Maps node keys to features; only the keys and their order are used.
    - train_table, val_table, test_table: Task tables exposing ``df`` (a DataFrame) and
      ``fkey_col_to_pkey_table`` (a dict from foreign-key column to node type).
    - class_value (any): Label for nodes that are not labelled by the task.
    - val_timestamp (pandas.Timestamp): Boundary between training and validation data.
    - test_timestamp (pandas.Timestamp): Boundary between validation and test data.
    - task_column_name (str): The name of the column containing task labels.

    Returns:
    - labels (list): A list of labels for the nodes.
    - train_mask (list): A list indicating which nodes are part of the training set.
    - val_mask (list): A list indicating which nodes are part of the validation set.
    - test_mask (list): A list indicating which nodes are part of the test set.
    """
    labels = []
    train_mask = []
    val_mask = []
    test_mask = []

    # The first foreign key of the task table identifies the target node type.
    # These values do not change per node, so resolve them once.
    id_column, target_node_type = next(
        iter(train_table.fkey_col_to_pkey_table.items()), (None, None)
    )
    split_tables = (train_table, val_table, test_table)
    split_ids = None  # built lazily, only if a target node is actually encountered

    def _first_label(table, node_id):
        """Return the task label of the first row of ``table`` that refers to ``node_id``."""
        df = table.df
        first_row = df[df[id_column] == node_id].index[0]
        return df.loc[first_row][task_column_name]

    for node_key in node_features:
        # Split the node key into its components: node type and index
        node_type, raw_index = node_key.rsplit('_', 1)
        node_index = int(raw_index)

        if node_type == target_node_type:
            if split_ids is None:
                split_ids = [set(table.df[id_column].tolist()) for table in split_tables]

            in_split = [node_index in ids for ids in split_ids]

            # The label comes from the first split (train, val, test) that knows the node.
            label = class_value
            for table, present in zip(split_tables, in_split):
                if present:
                    label = _first_label(table, node_index)
                    break

            labels.append(label)
            train_mask.append(in_split[0])
            val_mask.append(in_split[1])
            test_mask.append(in_split[2])

        elif node_type in node_without_timestamp:
            # Nodes without a timestamp are visible in all splits
            labels.append(class_value)
            train_mask.append(True)
            val_mask.append(True)
            test_mask.append(True)

        else:
            # Nodes with a timestamp belong to exactly one split
            labels.append(class_value)
            time_int = KG_data[node_type].time[node_index].item()
            time_value = pd.to_datetime(time_int, unit='s')

            in_train = bool(time_value < val_timestamp)
            in_val = (not in_train) and bool(val_timestamp <= time_value < test_timestamp)
            train_mask.append(in_train)
            val_mask.append(in_val)
            test_mask.append(not (in_train or in_val))

    return labels, train_mask, val_mask, test_mask


In [None]:
def flatten_multi_embedding(met: MultiEmbeddingTensor, device=None, flatten_extra_dims=True):
    """
    Convert a MultiEmbeddingTensor-like object to a single dense tensor.

    The object is probed in this order:

    1. a callable ``to_tensor()`` that returns a ``torch.Tensor``;
    2. one of the attributes ``_data``, ``embeddings``, ``_embeddings``, ``_tensor_dict``,
       ``values`` holding either a tensor (returned directly) or a dict of tensors;
    3. a callable ``values()``;
    4. if a dict was found in step 2, its tensors are concatenated along dim 1.

    Every tensor that is returned or concatenated is made dense, moved to ``device`` and,
    if ``flatten_extra_dims`` is true, flattened to 2-D (see ``_process_tensor``).

    Parameters:
    - met: The object to unpack.
    - device: Target device; defaults to CPU when ``None``.
    - flatten_extra_dims (bool): Whether tensors with more than 2 dims are flattened
      from dim 1 onwards.

    Returns:
    - torch.Tensor: The dense tensor.

    Raises:
    - ValueError: If no known storage attribute or accessor is found.
    - TypeError: If a dict of embeddings contains a non-tensor value.
    """
    if device is None:
        device = torch.device("cpu")

    # 1. First check for direct tensor conversion methods
    to_tensor = getattr(met, 'to_tensor', None)
    if callable(to_tensor):
        tensor = to_tensor()
        if isinstance(tensor, torch.Tensor):
            return _process_tensor(tensor, device, flatten_extra_dims)

    # 2. Look for embedding storage in attributes
    dict_candidates = ["_data", "embeddings", "_embeddings", "_tensor_dict", "values"]
    embedding_dict = None

    for candidate in dict_candidates:
        if hasattr(met, candidate):
            candidate_val = getattr(met, candidate)
            # Handle both direct tensors and dictionaries
            if isinstance(candidate_val, torch.Tensor):
                return _process_tensor(candidate_val, device, flatten_extra_dims)
            elif isinstance(candidate_val, dict):
                embedding_dict = candidate_val
                break

    # 3. Handle case where the object exposes its tensor through a values() method
    if embedding_dict is None:
        if hasattr(met, 'values') and callable(met.values):
            tensor = met.values()
            return _process_tensor(tensor, device, flatten_extra_dims)
        else:
            raise ValueError(
                f"Failed to unpack MultiEmbeddingTensor. Available attributes: {dir(met)}\n"
                "Consider inspecting the object structure with: "
                "print(dir(your_multi_embedding_tensor))"
            )

    # 4. Process dictionary of embeddings with the same helper used above
    sub_tensors = []
    for emb in embedding_dict.values():
        if not isinstance(emb, torch.Tensor):
            raise TypeError(f"Unexpected embedding type: {type(emb)}")
        sub_tensors.append(_process_tensor(emb, device, flatten_extra_dims))

    return torch.cat(sub_tensors, dim=1)

def _process_tensor(tensor: torch.Tensor, device, flatten_extra_dims) -> torch.Tensor:
    """Helper for consistent tensor processing"""
    if tensor.layout != torch.strided:
        tensor = tensor.to_dense()
    tensor = tensor.to(device)
    if flatten_extra_dims and tensor.dim() > 2:
        tensor = tensor.flatten(start_dim=1)
    return tensor


def torchframe_to_tensor(tf, device=None, flatten_extra_dims=True):
    """
    Concatenate all feature groups of a TensorFrame-like object into one dense tensor.

    Each value of ``tf.feat_dict`` is converted as follows: callables are called to
    obtain the value, ``MultiEmbeddingTensor`` values go through
    ``flatten_multi_embedding``, objects offering ``to_dense()`` are densified, and
    non-tensor objects must provide a callable ``values()``. The resulting tensors are
    moved to ``device`` (CPU by default), optionally flattened to 2-D, and concatenated
    along dim 1.

    Raises:
    - TypeError: If a feature is neither a tensor nor convertible through ``values()``.
    """
    target_device = torch.device("cpu") if device is None else device

    dense_columns = []
    for stype_key, column in tf.feat_dict.items():
        # Resolve potential lazy-loaded tensors
        if callable(column):
            column = column()

        # Embedding containers have their own unpacking routine
        if isinstance(column, MultiEmbeddingTensor):
            dense_columns.append(
                flatten_multi_embedding(
                    column, device=target_device, flatten_extra_dims=flatten_extra_dims
                )
            )
            continue

        # Convert sparse tensors to dense
        densify = getattr(column, "to_dense", None)
        if callable(densify):
            column = densify()

        # Fall back to .values() for anything that is still not a tensor
        if not isinstance(column, torch.Tensor):
            values_accessor = getattr(column, "values", None)
            if not callable(values_accessor):
                raise TypeError(
                    f"Feature {stype_key} is not a tensor. Got {type(column)}"
                )
            column = values_accessor()

        column = column.to(target_device)
        if flatten_extra_dims and column.dim() > 2:
            column = column.flatten(start_dim=1)
        dense_columns.append(column)

    return torch.cat(dense_columns, dim=1)

In [None]:
def remove_last_sep(s: str) -> str:
    """Cut ``s`` just before its last ``[SEP]`` marker.

    Everything from the last occurrence of ``[SEP]`` to the end of the string is
    dropped. A string without the marker is returned unchanged.
    """
    head, marker, _tail = s.rpartition("[SEP]")
    # rpartition yields an empty separator when the marker is absent.
    return head if marker else s


def linearize_features(node_features: list, device=None) -> list:
    """
    Linearizes a list of node features in the format: <name_feature_1> <val_1> [SEP] <name_feature_2> <val2> [SEP] ...

    Parameters:
    - node_features (list): TensorFrame-like objects exposing ``to(device)``,
      ``feat_dict`` (feature group -> tensor) and ``col_names_dict``
      (feature group -> column names).
    - device: Device the frames are moved to before reading; defaults to CPU.

    Returns:
    - list: One string per input frame, with the final ``[SEP]`` removed.

    Notes:
    - For regular feature groups only row 0 of the tensor is read, i.e. each frame is
      assumed to describe a single node.
    - For ``MultiEmbeddingTensor`` groups the flattened embedding is split into equal
      chunks of ``values.shape[1] // num_cols`` columns, i.e. all columns of the group
      are assumed to share the same embedding size (TODO confirm for other datasets).
    """
    if device is None:
        device = torch.device("cpu")

    linearized_tensors = []

    for tensor_frame in node_features:
        tf_on_device = tensor_frame.to(device)
        feats = []

        for stype_key, typed_feat in tf_on_device.feat_dict.items():
            col_names = tf_on_device.col_names_dict[stype_key]
            if not col_names:
                continue

            if isinstance(typed_feat, MultiEmbeddingTensor):
                # Flatten and split once per feature group, not once per column.
                met_tensor = flatten_multi_embedding(
                    typed_feat, flatten_extra_dims=True
                )
                emb_dim = typed_feat.values.shape[1] // typed_feat.num_cols
                sub_tensors = torch.split(met_tensor, emb_dim, dim=1)
                col_values = [sub_tensors[col].tolist() for col in range(len(col_names))]
            else:
                first_row = typed_feat[0]
                col_values = [first_row[col].tolist() for col in range(len(col_names))]

            for feature_name, feature_value in zip(col_names, col_values):
                feats.append(f"{feature_name} {feature_value} [SEP] ")

        linearized_tensors.append(remove_last_sep(''.join(feats)))

    return linearized_tensors

In [None]:
def text_embedding(linearize_features: list, embedder_model, device=None) -> list:
    """
    Embed every linearized feature string with ``embedder_model``.

    ``embedder_model`` is called once per element of ``linearize_features`` and the
    results are returned in the same order. ``device`` is accepted for symmetry with
    the other helpers but is not used for the embedding itself.
    """
    if device is None:
        device = torch.device("cpu")

    # One embedder call per linearized input feature
    return [embedder_model(text) for text in linearize_features]

In [None]:
def update_masks(labels, val_mask, test_mask, ignore_label=2):
    '''
        Exclude nodes carrying ``ignore_label`` from the validation and test masks.

        The masks are modified in place and also returned, so the returned objects
        are the same objects that were passed in.

        Parameters:
        - labels: sequence (list or numpy array) containing labels
        - val_mask: mutable boolean sequence for validation set
        - test_mask: mutable boolean sequence for test set
        - ignore_label: label whose nodes must not be evaluated (default: 2)

        Returns:
        - (val_mask, test_mask): the updated masks
    '''
    for i, label in enumerate(labels):
        if label == ignore_label:
            val_mask[i] = False
            test_mask[i] = False

    return val_mask, test_mask

# F1 Dataset Creation

We want to import one Knowledge Graph from Relbench in a format which is compatible with GraphAny. In particular, we are going to select a node classification task.

We can see all the possible tasks in the Relbench Formula 1 dataset from [this link](https://relbench.stanford.edu/datasets/rel-f1/).

## Downloading a node classification task

We are going to select the task `driver-dnf`, whose description is: "For each driver predict the if they will DNF (did not finish) a race in the next 1 month."

In [None]:
# NOTE(review): the result of get_task_names is discarded here; it is not the last
# expression of the cell, so it is not displayed either.
get_task_names("rel-f1")
# Fetch the `driver-dnf` task of the rel-f1 dataset, downloading it if necessary.
task = get_task("rel-f1", "driver-dnf", download=True)
# The rest of the notebook assumes a binary classification task.
assert task.task_type == TaskType.BINARY_CLASSIFICATION

Downloading file 'rel-f1/tasks/driver-dnf.zip' from 'https://relbench.stanford.edu/download/rel-f1/tasks/driver-dnf.zip' to '/root/.cache/relbench'.
100%|█████████████████████████████████████| 37.3k/37.3k [00:00<00:00, 14.5MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/tasks/driver-dnf.zip' to '/root/.cache/relbench/rel-f1/tasks/.'


Now, we are going to download the train, validation and test tables from our task.

In [None]:
# Load the three task tables (one per split).
train_table = task.get_table("train")
val_table = task.get_table("val")
# By default RelBench masks the test table's columns to prevent test leakage, but we
# need the test labels to build the dataset, hence mask_input_cols=False.
test_table = task.get_table("test", mask_input_cols=False)

In [None]:
test_table

Table(df=
          date  driverId  did_not_finish
0   2013-03-16       814               0
1   2012-11-16         9               1
2   2012-11-16        17               0
3   2012-10-17         0               1
4   2012-09-17       816               0
..         ...       ...             ...
697 2011-08-24        14               1
698 2011-05-26        14               1
699 2011-05-26       154               0
700 2010-09-28        14               1
701 2010-09-28       154               0

[702 rows x 3 columns],
  fkey_col_to_pkey_table={'driverId': 'drivers'},
  pkey_col=None,
  time_col=date)

## Downloading the relbench KG

At this point we also need to download the entire Knowledge Graph of the Formula 1 database.

In [None]:
# We download the f1-dataset
f1_dataset = get_dataset(name="rel-f1", download=True)

# we download the entire database (also the test part)
f1_db = f1_dataset.get_db(upto_test_timestamp = False)
f1_col_to_stype_dict = get_stype_proposal(f1_db)

# Generate graph data
f1_data, f1_col_stats_dict = make_pkey_fkey_graph(
    f1_db,
    col_to_stype_dict = f1_col_to_stype_dict,  # Column types
    text_embedder_cfg = text_embedder_cfg,  # Our chosen text encoder
    cache_dir=os.path.join(
        root_dir, f"rel-f1_materialized_cache"
    ),  # Store materialized graph for convenience
)

Downloading file 'rel-f1/db.zip' from 'https://relbench.stanford.edu/download/rel-f1/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 704k/704k [00:00<00:00, 663MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/db.zip' to '/root/.cache/relbench/rel-f1/.'


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.04 seconds.


Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00,  3.11it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 295.19it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 305.04it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 303.63it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 238.25it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 228.79it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 230.26it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 228.71it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 214.91it/s]
  ser = pd.to_datetime(ser, format=time_format)
Embedding raw data in mini-batch: 100%|██████████| 5/5 [00:00<00:00, 166.59it/s]
  ser = pd.to_datetime(ser, format=self.format, errors='coerce')
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 43.43it/s

In [None]:
# Timestamps that separate the train / validation / test periods of the dataset.
f1_val_timestep = f1_dataset.val_timestamp
f1_test_timestep = f1_dataset.test_timestamp

print(f"The validation timestep is: {f1_val_timestep}")
print(f"The test timestep is: {f1_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


Let's inspect our HeteroData graph composition.

In [None]:
f1_data

HeteroData(
  constructor_standings={
    tf=TensorFrame([13051, 4]),
    time=[13051],
  },
  results={
    tf=TensorFrame([26080, 11]),
    time=[26080],
  },
  circuits={ tf=TensorFrame([77, 7]) },
  drivers={ tf=TensorFrame([857, 6]) },
  races={
    tf=TensorFrame([1101, 5]),
    time=[1101],
  },
  standings={
    tf=TensorFrame([34124, 4]),
    time=[34124],
  },
  qualifying={
    tf=TensorFrame([9815, 3]),
    time=[9815],
  },
  constructors={ tf=TensorFrame([211, 3]) },
  constructor_results={
    tf=TensorFrame([12290, 2]),
    time=[12290],
  },
  (constructor_standings, f2p_raceId, races)={ edge_index=[2, 13051] },
  (races, rev_f2p_raceId, constructor_standings)={ edge_index=[2, 13051] },
  (constructor_standings, f2p_constructorId, constructors)={ edge_index=[2, 13051] },
  (constructors, rev_f2p_constructorId, constructor_standings)={ edge_index=[2, 13051] },
  (results, f2p_raceId, races)={ edge_index=[2, 26080] },
  (races, rev_f2p_raceId, results)={ edge_index=[2, 2

In [None]:
# Edge types of the F1 graph to export. Each key is an edge type
# (source node type, relation, destination node type); each value names the node type
# that carries the timestamp for that edge.
# NOTE(review): retrieve_edges_features only iterates over the keys, so the values
# are currently informational.
f1_edges_dict = {
        ('constructor_standings', 'f2p_raceId', 'races'): 'constructor_standings',
        ('races', 'rev_f2p_raceId', 'constructor_standings'): 'races',
        ('constructor_standings', 'f2p_constructorId', 'constructors'): 'constructor_standings',
        ('constructors', 'rev_f2p_constructorId', 'constructor_standings'): 'constructor_standings',
        ('standings', 'f2p_raceId', 'races'): 'standings',
        ('races', 'rev_f2p_raceId', 'standings'): 'races',
        ('standings', 'f2p_driverId', 'drivers'): 'standings',
        ('drivers', 'rev_f2p_driverId', 'standings'): 'standings',
        ('constructor_results', 'f2p_raceId', 'races'): 'constructor_results',
        ('races', 'rev_f2p_raceId', 'constructor_results'): 'races',
        ('constructor_results', 'f2p_constructorId', 'constructors'): 'constructor_results',
        ('constructors', 'rev_f2p_constructorId', 'constructor_results'): 'constructor_results',
        ('results', 'f2p_raceId', 'races'): 'results',
        ('races', 'rev_f2p_raceId', 'results'): 'races',
        ('results', 'f2p_driverId', 'drivers'): 'results',
        ('drivers', 'rev_f2p_driverId', 'results'): 'results',
        ('results', 'f2p_constructorId', 'constructors'): 'results',
        ('constructors', 'rev_f2p_constructorId', 'results'): 'results',
        ('qualifying', 'f2p_raceId', 'races'): 'qualifying',
        ('races', 'rev_f2p_raceId', 'qualifying'): 'races',
        ('qualifying', 'f2p_driverId', 'drivers'): 'qualifying',
        ('drivers', 'rev_f2p_driverId', 'qualifying'): 'qualifying',
        ('qualifying', 'f2p_constructorId', 'constructors'): 'qualifying',
        ('constructors', 'rev_f2p_constructorId', 'qualifying'): 'qualifying',
        ('races', 'f2p_circuitId', 'circuits'): 'races',
        ('circuits', 'rev_f2p_circuitId', 'races'): 'races'
    }
# Node types that have no `time` attribute in the graph (see the HeteroData summary).
node_without_timestamp = ['drivers', 'circuits', 'constructors']

Now, we retrieve all the `node_features` for each node in the Relbench Knowledge Graph.

We also retrieve all the edges in the KG.

In [None]:
# Collect the features of every node reached by the selected edge types and
# re-index the edges with a single global node numbering.
node_features, output_edges = retrieve_edges_features(f1_data, f1_edges_dict)

Then, we need to create the masks for the train, validation and test sets. Importantly, we also need to copy the labels for each node in our graph.

In this way we obtain the following structures:


*   The i-th element in `node_features` represents the features of node number i of our KG;
*   The i-th element in `labels` represents the label of node number i of our KG;
*   The i-th element in each of the `split masks` is a boolean value that specifies whether node number i must be considered part of that specific split.



In particular, regarding the `labels`, we are going to have only three labels in our graph. This is because we use the labels `[0,1]` for the nodes of type `drivers`, and the label `2` for all the others.

This approach drastically reduces the number of labels. We do so because we only need to represent the selected node classification task, which is defined on the `drivers` nodes.

In [None]:
# Build one label and three split flags per node; non-task nodes get the label 2.
labels, train_mask, val_mask, test_mask = build_masks_and_labels(f1_data,
                                                                  node_without_timestamp,
                                                                  node_features,
                                                                  train_table,
                                                                  val_table,
                                                                  test_table,
                                                                  2,
                                                                  f1_val_timestep,
                                                                  f1_test_timestep,
                                                                  task_column_name = 'did_not_finish')

# Sanity check: the number of nodes, labels and mask entries must all match.
# Printed in order: nodes, edges, labels, train mask, val mask, test mask.
print(f"{len(node_features)}")
print(f"{len(output_edges)}")
print(f"{len(labels)}")
print(f"{len(train_mask)}")
print(f"{len(val_mask)}")
print(f"{len(test_mask)}")

97605
455432
97605
97605
97605
97605


At this point we are going to linearize all the features of our Knowledge Graph and compute their embeddings. In this case we chose GloVe as the embedding model, but any other embedding technique could be used instead.

We need to perform this step because we are dealing with a heterogeneous graph whose nodes have features of different dimensions, whereas GraphAny only accepts graphs in which all nodes have the same number of feature dimensions.

In [None]:
# Keep only the feature objects, in the same order as the node keys.
node_features_list = list(node_features.values())

In [None]:
# we linearize the features in the format <name_feature_1> <val_1> [SEP] <name_feature_2> <val2> [SEP] ...
f1_linearized_features = linearize_features(node_features_list, device=device)

# now we compute the embedding for each linearized feature
embedding_model = GloveTextEmbedding(device)
f1_emb_features = text_embedding(f1_linearized_features, embedding_model, device)

Now we can save our data in a local `pickle` file which is the data file that we will use as input to GraphAny.

In [None]:
# Now we have to change the format of our input data
# All the data must appear in the format of numpy arrays
f1_emb_features = torch.stack(f1_emb_features).cpu().numpy()

# we digitalize labels
f1_labels = np.array(labels)

f1_edges = np.array(output_edges)

f1_train_mask = np.array(train_mask)
f1_val_mask = np.array(val_mask)
f1_test_mask = np.array(test_mask)


data_to_save = {
    'node_features': f1_emb_features,
    'labels': f1_labels,
    'edges': f1_edges,
    'train_mask': f1_train_mask,
    'val_mask': f1_val_mask,
    'test_mask': f1_test_mask
}

with open('f1_3_classes.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)

Now, note that the task we selected is a binary task, but we created our data considering three labels. To solve this issue we mask out all the nodes that are not targets of the classification task. In this way only the nodes of type `drivers` will be considered by GraphAny during the evaluation steps.

In [None]:
# Reload the file written by the previous cell.
# NOTE(review): pickle.load must only be used on trusted files; this one is produced
# by this notebook.
with open('f1_3_classes.pkl', 'rb') as f:
        data = pickle.load(f)

node_features = data['node_features']
labels = data['labels']
edges = data['edges']
train_mask, val_mask, test_mask = data['train_mask'], data['val_mask'], data['test_mask']

# Remove the nodes labelled 2 (non-task nodes) from the validation and test masks.
# update_masks modifies the arrays in place, so val_mask / test_mask are updated too.
updated_val_mask, updated_test_mask = update_masks(labels, val_mask, test_mask)



data_to_save = {
    'node_features': node_features,
    'labels': labels,
    'edges': edges,
    'train_mask': train_mask,
    'val_mask': updated_val_mask,
    'test_mask': updated_test_mask
}

# Persist the final dataset used as input to GraphAny.
with open('f1_3_classes_remastered.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)