# Setting Up the Environment

In [1]:
!pip install relbench[full]

Collecting relbench[full]
  Downloading relbench-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting pytorch_frame>=0.2.3 (from relbench[full])
  Downloading pytorch_frame-0.2.5-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric (from relbench[full])
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->pytorch_frame>=0.2.3->relbench[full])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2

In [2]:
import relbench
from relbench.datasets import get_dataset_names, get_dataset
from relbench.modeling.utils import get_stype_proposal
from relbench.modeling.graph import make_pkey_fkey_graph
from relbench.tasks import get_task_names, get_task
from relbench.base import TaskType


import torch
from torch_geometric.seed import seed_everything
from torch import Tensor
from torch_frame import stype
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.data.multi_embedding_tensor import MultiEmbeddingTensor


from typing import List, Optional


from sentence_transformers import SentenceTransformer


import os


import pandas as pd

import numpy as np

import random

import pickle

import requests

In [3]:
class GloveTextEmbedding:
    """Callable wrapper that turns a list of sentences into embeddings.

    The embeddings come from the SentenceTransformer checkpoint
    ``sentence-transformers/average_word_embeddings_glove.6B.300d``.
    """

    def __init__(self, device: Optional[torch.device] = None):
        """Load the checkpoint, placing the model on ``device`` if given."""
        checkpoint = "sentence-transformers/average_word_embeddings_glove.6B.300d"
        self.model = SentenceTransformer(checkpoint, device=device)

    def __call__(self, sentences: List[str]) -> Tensor:
        """Encode ``sentences`` with the wrapped model and return a tensor."""
        embeddings = self.model.encode(sentences, convert_to_tensor=True)
        return embeddings

In [4]:
# Select the compute device: CUDA when available, otherwise CPU.
# Check that it's cuda if you want it to run in reasonable time!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    # When running on the GPU, restrict PyTorch to a single CPU thread
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
root_dir = "./data"

# Configure the text encoder: the GloVe embedder defined above, placed on
# `device`, applied in batches of 256
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256
)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)rdEmbeddings%2Fwordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

(…)ddings%2Fwhitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Report the installed RelBench version and the datasets it exposes
relbench_version = relbench.__version__
available_datasets = get_dataset_names()
print(f"The RelBench version is {relbench_version}")
print(f"The RelBench datasets are {available_datasets}")

The RelBench version is 1.1.0
The RelBench datasets are ['rel-amazon', 'rel-avito', 'rel-event', 'rel-f1', 'rel-hm', 'rel-stack', 'rel-trial']


# F1 Dataset Creation

## Downloading a binary classification task (`driver-dnf`)

In [7]:
# Query the task names registered for rel-f1 (the return value is neither
# stored nor displayed, since this is not the last expression of the cell)
get_task_names("rel-f1")
# Fetch the "driver-dnf" task of rel-f1, allowing it to be downloaded
task = get_task("rel-f1", "driver-dnf", download=True)
# Guard: this task is expected to be a binary classification task
assert task.task_type == TaskType.BINARY_CLASSIFICATION

Downloading file 'rel-f1/tasks/driver-dnf.zip' from 'https://relbench.stanford.edu/download/rel-f1/tasks/driver-dnf.zip' to '/root/.cache/relbench'.
100%|█████████████████████████████████████| 37.3k/37.3k [00:00<00:00, 35.9MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/tasks/driver-dnf.zip' to '/root/.cache/relbench/rel-f1/tasks/.'


In [10]:
# Load the train / validation / test tables of the task
train_table = task.get_table("train")
val_table = task.get_table("val")
# By default RelBench masks values of the test table to prevent test leakage,
# but we need this information, hence mask_input_cols=False
test_table = task.get_table("test", mask_input_cols=False)

In [11]:
# Display the (unmasked) test table
test_table

Table(df=
          date  driverId  did_not_finish
0   2013-03-16       814               0
1   2012-11-16         9               1
2   2012-11-16        17               0
3   2012-10-17         0               1
4   2012-09-17       816               0
..         ...       ...             ...
697 2011-08-24        14               1
698 2011-05-26        14               1
699 2011-05-26       154               0
700 2010-09-28        14               1
701 2010-09-28       154               0

[702 rows x 3 columns],
  fkey_col_to_pkey_table={'driverId': 'drivers'},
  pkey_col=None,
  time_col=date)

## Downloading the relbench KG

In [13]:
# Fetch the rel-f1 dataset (download allowed)
f1_dataset = get_dataset(name="rel-f1", download=True)

# Load the entire database, i.e. without cutting it at the test timestamp
f1_db = f1_dataset.get_db(upto_test_timestamp=False)
f1_col_to_stype_dict = get_stype_proposal(f1_db)

# Directory where the materialized graph is stored for convenience
f1_cache_dir = os.path.join(root_dir, "rel-f1_materialized_cache")

# Generate the graph data from the database, using the proposed column
# types and our chosen text encoder
f1_data, f1_col_stats_dict = make_pkey_fkey_graph(
    f1_db,
    col_to_stype_dict=f1_col_to_stype_dict,
    text_embedder_cfg=text_embedder_cfg,
    cache_dir=f1_cache_dir,
)

Downloading file 'rel-f1/db.zip' from 'https://relbench.stanford.edu/download/rel-f1/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 704k/704k [00:00<00:00, 689MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/db.zip' to '/root/.cache/relbench/rel-f1/.'


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.05 seconds.


Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00,  3.07it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 237.10it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 251.58it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 267.31it/s]
  ser = pd.to_datetime(ser, format=time_format)
Embedding raw data in mini-batch: 100%|██████████| 5/5 [00:00<00:00, 197.33it/s]
  ser = pd.to_datetime(ser, format=self.format, errors='coerce')
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 45.69it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 169.28it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 190.96it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 205.11it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 229.85it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 217.09it/s

In [14]:
# Validation and test cut-off timestamps as exposed by the dataset object
f1_val_timestep = f1_dataset.val_timestamp
f1_test_timestep = f1_dataset.test_timestamp

print(f"The validation timestep is: {f1_val_timestep}")
print(f"The test timestep is: {f1_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


In [15]:
# Display the structure of the graph built by make_pkey_fkey_graph
f1_data

HeteroData(
  constructor_results={
    tf=TensorFrame([12290, 2]),
    time=[12290],
  },
  results={
    tf=TensorFrame([26080, 11]),
    time=[26080],
  },
  circuits={ tf=TensorFrame([77, 7]) },
  races={
    tf=TensorFrame([1101, 5]),
    time=[1101],
  },
  constructor_standings={
    tf=TensorFrame([13051, 4]),
    time=[13051],
  },
  constructors={ tf=TensorFrame([211, 3]) },
  standings={
    tf=TensorFrame([34124, 4]),
    time=[34124],
  },
  qualifying={
    tf=TensorFrame([9815, 3]),
    time=[9815],
  },
  drivers={ tf=TensorFrame([857, 6]) },
  (constructor_results, f2p_raceId, races)={ edge_index=[2, 12290] },
  (races, rev_f2p_raceId, constructor_results)={ edge_index=[2, 12290] },
  (constructor_results, f2p_constructorId, constructors)={ edge_index=[2, 12290] },
  (constructors, rev_f2p_constructorId, constructor_results)={ edge_index=[2, 12290] },
  (results, f2p_raceId, races)={ edge_index=[2, 26080] },
  (races, rev_f2p_raceId, results)={ edge_index=[2, 26080] },

In [16]:
# Maps every edge type (source node type, relation, destination node type)
# to the name of a node type. Presumably this is the node type whose `time`
# attribute is used to date the edge when splitting edges chronologically
# -- TODO confirm against the splitting code.
# NOTE(review): for reverse edges that start from a node type without a
# `time` attribute in the graph printout (constructors, drivers, circuits)
# the value is the *destination* type, so any time lookup must then use the
# destination index rather than the source index.
f1_edges_dict = {
        ('constructor_standings', 'f2p_raceId', 'races'): 'constructor_standings',
        ('races', 'rev_f2p_raceId', 'constructor_standings'): 'races',
        ('constructor_standings', 'f2p_constructorId', 'constructors'): 'constructor_standings',
        ('constructors', 'rev_f2p_constructorId', 'constructor_standings'): 'constructor_standings',
        ('standings', 'f2p_raceId', 'races'): 'standings',
        ('races', 'rev_f2p_raceId', 'standings'): 'races',
        ('standings', 'f2p_driverId', 'drivers'): 'standings',
        ('drivers', 'rev_f2p_driverId', 'standings'): 'standings',
        ('constructor_results', 'f2p_raceId', 'races'): 'constructor_results',
        ('races', 'rev_f2p_raceId', 'constructor_results'): 'races',
        ('constructor_results', 'f2p_constructorId', 'constructors'): 'constructor_results',
        ('constructors', 'rev_f2p_constructorId', 'constructor_results'): 'constructor_results',
        ('results', 'f2p_raceId', 'races'): 'results',
        ('races', 'rev_f2p_raceId', 'results'): 'races',
        ('results', 'f2p_driverId', 'drivers'): 'results',
        ('drivers', 'rev_f2p_driverId', 'results'): 'results',
        ('results', 'f2p_constructorId', 'constructors'): 'results',
        ('constructors', 'rev_f2p_constructorId', 'results'): 'results',
        ('qualifying', 'f2p_raceId', 'races'): 'qualifying',
        ('races', 'rev_f2p_raceId', 'qualifying'): 'races',
        ('qualifying', 'f2p_driverId', 'drivers'): 'qualifying',
        ('drivers', 'rev_f2p_driverId', 'qualifying'): 'qualifying',
        ('qualifying', 'f2p_constructorId', 'constructors'): 'qualifying',
        ('constructors', 'rev_f2p_constructorId', 'qualifying'): 'qualifying',
        ('races', 'f2p_circuitId', 'circuits'): 'races',
        ('circuits', 'rev_f2p_circuitId', 'races'): 'races'
    }

In [17]:
def pick_pairs(KG_data, edge):
    """Return the (source, target) node-index pairs of one edge type.

    Args:
        KG_data: Mapping indexed by edge type whose entries expose an
            ``edge_index`` tensor; row 0 is read as the source indexes and
            row 1 as the target indexes.
        edge: Key identifying the edge type inside ``KG_data``.

    Returns:
        A tensor with one row per edge: column 0 is the source index and
        column 1 is the target index.
    """
    edge_index = KG_data[edge].edge_index
    source_nodes = edge_index[0]  # source indexes
    target_nodes = edge_index[1]  # target indexes

    # Put the two index rows side by side so that every row is one edge
    return torch.stack((source_nodes, target_nodes), dim=1)

In [None]:
def train_inference_split_pairs(data, pairs, time_node, val_timestamp, test_timestamp,
                                time_from_target=False):
    """Split (source, target) index pairs into train / val / test by time.

    Every pair is dated with ``data[time_node].time`` looked up at one of its
    two endpoints, interpreted as a Unix timestamp in seconds. Pairs dated
    before ``val_timestamp`` go to train, pairs in
    ``[val_timestamp, test_timestamp)`` go to validation, all others to test.

    Args:
        data: Mapping indexed by node type whose entries expose a ``time``
            tensor.
        pairs: Integer tensor with one pair per row; column 0 is the source
            index, column 1 the target index.
        time_node: Node type whose ``time`` tensor dates the pairs.
        val_timestamp: Start of the validation period (``pandas.Timestamp``).
        test_timestamp: Start of the test period (``pandas.Timestamp``).
        time_from_target: When False (default, the historical behaviour) the
            time is looked up with the source index. Set it to True when
            ``time_node`` is the node type of the *target* endpoint;
            otherwise the source index would select an unrelated row.

    Returns:
        Three lists ``(train, val, test)`` of ``(source, target)`` tuples,
        each in the original pair order.
    """
    # Nothing to split: avoid indexing into an empty tensor
    if pairs.shape[0] == 0:
        return [], [], []

    # Work on CPU copies so the result does not depend on the tensors' device
    endpoints = pairs.detach().cpu()[:, :2]
    node_times = data[time_node].time.detach().cpu()

    # Column holding the index that is valid for data[time_node].time
    time_col = 1 if time_from_target else 0
    unix_seconds = node_times[endpoints[:, time_col]].numpy()

    # One vectorized conversion instead of one pd.to_datetime call per pair
    pair_times = pd.to_datetime(unix_seconds, unit='s')

    # Same decision order as an if / elif / else chain over each pair
    is_train = pair_times < val_timestamp
    is_val = ~is_train & (pair_times >= val_timestamp) & (pair_times < test_timestamp)
    is_test = ~(is_train | is_val)

    def _select(mask):
        # Keep the rows flagged by `mask` as plain (source, target) tuples
        rows = endpoints[torch.tensor(mask, dtype=torch.bool)]
        return [(source, target) for source, target in rows.tolist()]

    return _select(is_train), _select(is_val), _select(is_test)

In [None]:
# Chronological split of every edge type of the F1 graph.
# Maps each edge type to its (train_pairs, val_pairs, test_pairs) lists so
# that the result of one edge type is not overwritten by the next one.
f1_split_pairs = {}

# We iterate over the entire dictionary: edge type -> node type holding the time
for edge, time_node in f1_edges_dict.items():
    # For each edge we build the pairs of source nodes and destination nodes
    pairs_of_nodes = pick_pairs(f1_data, edge)

    # The split function looks up the time with the index stored in column 0.
    # When the time lives on the destination node type, swap the two columns
    # so the lookup uses the destination index, and swap back afterwards.
    time_on_target = time_node == edge[2] and time_node != edge[0]
    if time_on_target:
        pairs_of_nodes = pairs_of_nodes.flip(1)

    # We divide the pairs into train, validation and test based on timestamp
    train_pairs, val_pairs, test_pairs = train_inference_split_pairs(
        data=f1_data,
        pairs=pairs_of_nodes,
        time_node=time_node,
        val_timestamp=f1_val_timestep,
        test_timestamp=f1_test_timestep,
    )

    if time_on_target:
        # Restore the (source, destination) orientation
        train_pairs = [(src, dst) for dst, src in train_pairs]
        val_pairs = [(src, dst) for dst, src in val_pairs]
        test_pairs = [(src, dst) for dst, src in test_pairs]

    f1_split_pairs[edge] = (train_pairs, val_pairs, test_pairs)

In [20]:
edge = ('constructor_standings', 'f2p_raceId', 'races')

# Names of the source and destination node types
src_name = edge[0]
dst_name = edge[2]

# Accumulators, (re)initialised here so the cell runs on a fresh kernel
node_features = {}  # "<node type>_<index>" -> feature row of that node
node_ids = {}       # same keys -> position of the node in node_features
output_edges = []   # (source position, destination position) pairs

# For this edge type, take the (source, destination) index pairs
pairs_of_indexes = pick_pairs(f1_data, edge)

for src_index, dst_index in pairs_of_indexes.tolist():
    src_key = f"{src_name}_{src_index}"
    dst_key = f"{dst_name}_{dst_index}"

    # TODO: linearize the features and embed them with GloVe

    # Register a node (with its features from the KG) the first time it is
    # seen. Membership is tested on the full node key, not on the type name.
    if src_key not in node_features:
        node_ids[src_key] = len(node_features)
        node_features[src_key] = f1_data[src_name].tf[src_index]
    if dst_key not in node_features:
        node_ids[dst_key] = len(node_features)
        node_features[dst_key] = f1_data[dst_name].tf[dst_index]

    # Nodes are never removed, so the id recorded at insertion time equals
    # the node's position in node_features; no scan of the dict is needed.
    output_edges.append((node_ids[src_key], node_ids[dst_key]))

    # TODO: update the train, test and eval masks based on timestamp and task

    # TODO: update output_labels based on the task

In [21]:
# Display the (source, destination) index pairs computed by pick_pairs
pairs_of_indexes

tensor([[    0,    64],
        [    1,    64],
        [    2,    64],
        ...,
        [13048,  1090],
        [13049,  1090],
        [13050,  1090]])