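This notebook downloads RelBench datasets, builds their primary-key/foreign-key graphs, and exports the graph edges as knowledge-graph triple files (transductive and inductive splits).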

# Imports and dependency installation

In [1]:
# we install all the required for RelBench
!pip install relbench[full]

Collecting relbench[full]
  Downloading relbench-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting pytorch_frame>=0.2.3 (from relbench[full])
  Downloading pytorch_frame-0.2.4-py3-none-any.whl.metadata (20 kB)
Collecting torch_geometric (from relbench[full])
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading pytorch_frame-0.2.4-py3-none-any.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.5/144.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading relbench-1.1.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstall

In [2]:
import relbench
from relbench.datasets import get_dataset_names, get_dataset
from relbench.modeling.utils import get_stype_proposal
from relbench.modeling.graph import make_pkey_fkey_graph


import torch
from torch_geometric.seed import seed_everything
from torch import Tensor
from torch_frame.config.text_embedder import TextEmbedderConfig


from typing import List, Optional


from sentence_transformers import SentenceTransformer


import os


import pandas as pd


import random

# Useful functions

In [3]:
class GloveTextEmbedding:
    """Callable text encoder wrapping a pretrained sentence-transformers model.

    The wrapped model is ``sentence-transformers/average_word_embeddings_glove.6B.300d``.
    """

    def __init__(self, device: Optional[torch.device] = None):
        """Load the pretrained model, forwarding ``device`` to ``SentenceTransformer``."""
        model_name = "sentence-transformers/average_word_embeddings_glove.6B.300d"
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, sentences: List[str]) -> Tensor:
        """Encode ``sentences`` with the wrapped model and return the result as a tensor."""
        embeddings = self.model.encode(sentences, convert_to_tensor=True)
        return embeddings

In [4]:
def train_inference_split_database(data, pairs, time_node, val_timestamp, test_timestamp):
    """Split edge index pairs into train / validation / test lists by timestamp.

    The timestamp of pair ``i`` is ``data[time_node].time[pairs[i, 0]]``: the
    FIRST column of ``pairs`` is used to index the ``time`` tensor of
    ``time_node``, so it must hold indices of that node type. The stored values
    are interpreted as integer seconds since the Unix epoch.

    Args:
        data: Graph object indexable by node type; ``data[time_node].time`` is
            a 1-D integer tensor of timestamps.
        pairs: Integer tensor of shape ``(num_pairs, 2)``.
        time_node: Node type whose ``time`` tensor provides the timestamps.
        val_timestamp: Pairs strictly before this timestamp go to train.
        test_timestamp: Pairs in ``[val_timestamp, test_timestamp)`` go to
            validation; every remaining pair goes to test.

    Returns:
        ``(train, val, test)``: three lists of ``(pairs[i, 0], pairs[i, 1])``
        tuples of Python ints, each preserving the row order of ``pairs``.
    """
    if pairs.shape[0] == 0:
        return [], [], []

    first_column = pairs[:, 0]
    second_column = pairs[:, 1]

    # One bulk lookup and one bulk conversion replace the per-edge
    # ``.item()`` + ``pd.to_datetime`` round trip, which dominated the runtime
    # on graphs with millions of edges.
    seconds = data[time_node].time[first_column]
    edge_times = pd.to_datetime(seconds.cpu().numpy(), unit='s')

    # Same comparisons as the scalar version, evaluated element-wise.
    is_train = edge_times < val_timestamp
    is_val = ~is_train & (edge_times < test_timestamp)

    final_pairs_train = []
    final_pairs_val = []
    final_pairs_test = []

    all_pairs = zip(first_column.tolist(), second_column.tolist())
    for pair, in_train, in_val in zip(all_pairs, is_train.tolist(), is_val.tolist()):
        if in_train:
            final_pairs_train.append(pair)
        elif in_val:
            final_pairs_val.append(pair)
        else:
            final_pairs_test.append(pair)

    return final_pairs_train, final_pairs_val, final_pairs_test

In [5]:
def pick_pairs(KG_data, edge):
    """Return the (source, target) node-index pairs of one edge type.

    Args:
        KG_data: Graph object indexable by edge type; ``KG_data[edge].edge_index``
            must be a tensor whose first two rows hold the source and target
            node indices.
        edge: Key identifying the edge type inside ``KG_data``.

    Returns:
        Tensor of shape ``(num_edges, 2)`` whose row ``i`` is
        ``(source_i, target_i)``.
    """
    edge_index = KG_data[edge].edge_index
    source_nodes, target_nodes = edge_index[0], edge_index[1]

    # Stack column-wise so that each row is one (source, target) pair.
    return torch.stack((source_nodes, target_nodes), dim=1)

In [6]:
def write_KG_files(KG_data, db_name, edges_dict, val_timestamp, test_timestamp):
    """Append the edges of ``KG_data`` as triples to the transductive split files.

    For every edge type in ``edges_dict`` the edges are split by timestamp into
    train / validation / test and written to
    ``datasets/<db_name>/transductive/{train,valid,test}.txt`` as
    ``<source type>_<source index>\\t<relation>\\t<target type>_<target index>``.

    The files are opened in append mode, so calling this function again for
    the same ``db_name`` adds the triples a second time.

    Args:
        KG_data: Graph object passed to ``pick_pairs`` and
            ``train_inference_split_database``.
        db_name: Name of the dataset sub-directory under ``datasets``.
        edges_dict: Maps each ``(source type, relation, target type)`` edge
            type to the node type whose ``time`` tensor timestamps the edge.
        val_timestamp: Edges before this timestamp go to the train file.
        test_timestamp: Edges from ``val_timestamp`` up to (excluding) this
            timestamp go to the validation file, the rest to the test file.
    """
    # We create the dataset directory
    dataset_path = os.path.join('datasets', db_name, 'transductive')
    os.makedirs(dataset_path, exist_ok=True)

    # File paths
    train_file_path = os.path.join(dataset_path, 'train.txt')
    val_file_path = os.path.join(dataset_path, 'valid.txt')
    test_file_path = os.path.join(dataset_path, 'test.txt')

    # We open the files in append mode
    with open(train_file_path, 'a') as train_file, \
         open(val_file_path, 'a') as val_file, \
         open(test_file_path, 'a') as test_file:

        # We iterate over the entire dictionary
        for edge, time in edges_dict.items():
            source_edge_name = edge[0]
            relation_name = edge[1]
            target_edge_name = edge[2]

            # For each edge type we build the (source, target) index pairs
            pairs_of_nodes = pick_pairs(KG_data, edge)

            # train_inference_split_database looks the timestamp up with the
            # FIRST column of the pairs. When the timestamped node type is the
            # edge's target (reverse edges such as drivers -> standings), the
            # source index would address the wrong table, so the target index
            # is put first for the lookup and the order is restored afterwards.
            time_on_target = time == target_edge_name and time != source_edge_name
            lookup_pairs = pairs_of_nodes[:, [1, 0]] if time_on_target else pairs_of_nodes

            # We divide the edges into train, validation and test by timestamp
            train_pairs, val_pairs, test_pairs = train_inference_split_database(data = KG_data,
                                                                                pairs = lookup_pairs,
                                                                                time_node = time,
                                                                                val_timestamp = val_timestamp,
                                                                                test_timestamp = test_timestamp)

            if time_on_target:
                # Back to (source, target) order for writing.
                train_pairs, val_pairs, test_pairs = (
                    [(src, dst) for dst, src in part]
                    for part in (train_pairs, val_pairs, test_pairs)
                )

            # We write each split to its file in triple form
            for out_file, part in ((train_file, train_pairs),
                                   (val_file, val_pairs),
                                   (test_file, test_pairs)):
                out_file.writelines(
                    f"{source_edge_name}_{src}\t{relation_name}\t{target_edge_name}_{dst}\n"
                    for src, dst in part
                )

In [7]:
def write_reduced_KG_files(KG_data, db_name, edges_dict, val_timestamp, test_timestamp, train_proportion=1.0, val_proportion=1.0, test_proportion=1.0):
    """Append a random subset of the edges of ``KG_data`` to the transductive split files.

    For every edge type in ``edges_dict`` the edges are split by timestamp into
    train / validation / test, each split is shuffled with the global ``random``
    state, and the first ``int(len(split) * proportion)`` pairs are written to
    ``datasets/<db_name>/transductive/{train,valid,test}.txt`` as
    ``<source type>_<source index>\\t<relation>\\t<target type>_<target index>``.

    The files are opened in append mode, so calling this function again for
    the same ``db_name`` adds triples on top of the existing ones.

    Args:
        KG_data: Graph object passed to ``pick_pairs`` and
            ``train_inference_split_database``.
        db_name: Name of the dataset sub-directory under ``datasets``.
        edges_dict: Maps each ``(source type, relation, target type)`` edge
            type to the node type whose ``time`` tensor timestamps the edge.
        val_timestamp: Edges before this timestamp belong to the train split.
        test_timestamp: Edges from ``val_timestamp`` up to (excluding) this
            timestamp belong to validation, the rest to test.
        train_proportion: Fraction of each edge type's train pairs to keep.
        val_proportion: Fraction of each edge type's validation pairs to keep.
        test_proportion: Fraction of each edge type's test pairs to keep.
    """
    # We create the dataset directory
    dataset_path = os.path.join('datasets', db_name, 'transductive')
    os.makedirs(dataset_path, exist_ok=True)

    # File paths
    train_file_path = os.path.join(dataset_path, 'train.txt')
    val_file_path = os.path.join(dataset_path, 'valid.txt')
    test_file_path = os.path.join(dataset_path, 'test.txt')

    # We open the files in append mode
    with open(train_file_path, 'a') as train_file, \
         open(val_file_path, 'a') as val_file, \
         open(test_file_path, 'a') as test_file:

        # We iterate over the entire dictionary
        for edge, time in edges_dict.items():
            source_edge_name = edge[0]
            relation_name = edge[1]
            target_edge_name = edge[2]

            # For each edge type we build the (source, target) index pairs
            pairs_of_nodes = pick_pairs(KG_data, edge)

            # train_inference_split_database looks the timestamp up with the
            # FIRST column of the pairs. When the timestamped node type is the
            # edge's target (reverse edges such as customer -> transactions),
            # the source index would address the wrong table, so the target
            # index is put first for the lookup and the order is restored
            # afterwards.
            time_on_target = time == target_edge_name and time != source_edge_name
            lookup_pairs = pairs_of_nodes[:, [1, 0]] if time_on_target else pairs_of_nodes

            # We divide the edges into train, validation and test by timestamp
            train_pairs, val_pairs, test_pairs = train_inference_split_database(data = KG_data,
                                                                                pairs = lookup_pairs,
                                                                                time_node = time,
                                                                                val_timestamp = val_timestamp,
                                                                                test_timestamp = test_timestamp)

            if time_on_target:
                # Back to (source, target) order for writing.
                train_pairs, val_pairs, test_pairs = (
                    [(src, dst) for dst, src in part]
                    for part in (train_pairs, val_pairs, test_pairs)
                )

            # Shuffle each split, keep only the requested proportion and write
            # it to its file in triple form (train, then validation, then test).
            for out_file, part, proportion in ((train_file, train_pairs, train_proportion),
                                               (val_file, val_pairs, val_proportion),
                                               (test_file, test_pairs, test_proportion)):
                random.shuffle(part)
                num_kept = int(len(part) * proportion)
                out_file.writelines(
                    f"{source_edge_name}_{src}\t{relation_name}\t{target_edge_name}_{dst}\n"
                    for src, dst in part[:num_kept]
                )

In [8]:
def build_Inductive_KG_files(db_name):
    """Derive the inductive split files from the transductive ones.

    Reads ``train.txt``, ``valid.txt`` and ``test.txt`` from
    ``datasets/<db_name>/transductive`` and writes into
    ``datasets/<db_name>/inductive``:

    * ``train.txt``           - the train lines, shuffled;
    * ``inference_graph.txt`` - validation + test lines, shuffled together;
    * ``inference_valid.txt`` - the validation lines, shuffled;
    * ``inference_test.txt``  - the test lines, shuffled.

    All shuffling uses the global ``random`` state.
    """
    transductive_dir = os.path.join('datasets', db_name, 'transductive')
    inductive_dir = os.path.join('datasets', db_name, 'inductive')
    os.makedirs(inductive_dir, exist_ok=True)

    def _load(file_name):
        # Read every line of one transductive split file.
        with open(os.path.join(transductive_dir, file_name), 'r') as source:
            return source.readlines()

    def _dump(file_name, lines):
        # (Over)write one inductive file with the given lines.
        with open(os.path.join(inductive_dir, file_name), 'w') as sink:
            sink.writelines(lines)

    # 1. Train file: same lines, random order
    shuffled_train = _load('train.txt')
    random.shuffle(shuffled_train)
    _dump('train.txt', shuffled_train)

    # 2. Inference graph: validation and test lines mixed together
    valid_lines = _load('valid.txt')
    held_out_lines = _load('test.txt')

    graph_lines = valid_lines + held_out_lines
    random.shuffle(graph_lines)
    _dump('inference_graph.txt', graph_lines)

    # 3. Validation file: every validation line, random order
    random.shuffle(valid_lines)
    _dump('inference_valid.txt', valid_lines)

    # 4. Test file: every test line, random order (sampled, then shuffled)
    picked_test = random.sample(held_out_lines, len(held_out_lines))
    random.shuffle(picked_test)
    _dump('inference_test.txt', picked_test)

# Checking the Environment

In [9]:
# Select the GPU when one is available, otherwise fall back to the CPU.
# Check that it's cuda if you want it to run in reasonable time!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    # On a GPU run, limit PyTorch to a single CPU thread.
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
# (parent of the per-dataset "*_materialized_cache" folders created below)
root_dir = "./data"

# Configure the text encoder: GloVe-based embedder defined above, run on
# `device`, with batches of 256
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256
)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

(…)WordEmbeddings/wordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

(…)beddings/whitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Sanity check: show the installed RelBench version and the dataset names it offers.
print(f"The RelBench version is {relbench.__version__}")
print(f"The RelBench datasets are {get_dataset_names()}")

The RelBench version is 1.1.0
The RelBench datasets are ['rel-amazon', 'rel-avito', 'rel-event', 'rel-f1', 'rel-hm', 'rel-stack', 'rel-trial']


# Downloading Datasets from RelBench

## Downloading F1 Dataset from RelBench

In [None]:
# We download the rel-f1 dataset
f1_dataset = get_dataset(name="rel-f1", download=True)

# We load the entire database (upto_test_timestamp=False: also the test part)
f1_db = f1_dataset.get_db(upto_test_timestamp = False)
# Column -> semantic type mapping proposed by RelBench for this database
f1_col_to_stype_dict = get_stype_proposal(f1_db)

# Generate graph data (the graph itself and the per-column statistics)
f1_data, f1_col_stats_dict = make_pkey_fkey_graph(
    f1_db,
    col_to_stype_dict = f1_col_to_stype_dict,  # Column types
    text_embedder_cfg = text_embedder_cfg,  # Our chosen text encoder
    cache_dir=os.path.join(
        root_dir, f"rel-f1_materialized_cache"
    ),  # Store materialized graph for convenience
)

Downloading file 'rel-f1/db.zip' from 'https://relbench.stanford.edu/download/rel-f1/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 704k/704k [00:00<00:00, 398MB/s]
Unzipping contents of '/root/.cache/relbench/rel-f1/db.zip' to '/root/.cache/relbench/rel-f1/.'


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.13 seconds.


Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00,  4.45it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 209.40it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 198.51it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 215.93it/s]
Embedding raw data in mini-batch: 100%|██████████| 4/4 [00:00<00:00, 210.62it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 235.69it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 264.36it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 262.39it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 257.21it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 50.20it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 152.57it/s]
Embedding raw data in mini-batch: 100%|██████████| 1/1 [00:00<00:00, 156.01it/s]
  ser = pd.to_datetime(ser, fo

In [None]:
# Split boundaries provided by the dataset; used below to assign edges to
# train / validation / test.
f1_val_timestep = f1_dataset.val_timestamp
f1_test_timestep = f1_dataset.test_timestamp

print(f"The validation timestep is: {f1_val_timestep}")
print(f"The test timestep is: {f1_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


In [None]:
# Display the graph to inspect its node types and edge types.
f1_data

HeteroData(
  standings={
    tf=TensorFrame([34124, 4]),
    time=[34124],
  },
  drivers={ tf=TensorFrame([857, 6]) },
  constructor_standings={
    tf=TensorFrame([13051, 4]),
    time=[13051],
  },
  constructor_results={
    tf=TensorFrame([12290, 2]),
    time=[12290],
  },
  results={
    tf=TensorFrame([26080, 11]),
    time=[26080],
  },
  circuits={ tf=TensorFrame([77, 7]) },
  constructors={ tf=TensorFrame([211, 3]) },
  qualifying={
    tf=TensorFrame([9815, 3]),
    time=[9815],
  },
  races={
    tf=TensorFrame([1101, 5]),
    time=[1101],
  },
  (standings, f2p_raceId, races)={ edge_index=[2, 34124] },
  (races, rev_f2p_raceId, standings)={ edge_index=[2, 34124] },
  (standings, f2p_driverId, drivers)={ edge_index=[2, 34124] },
  (drivers, rev_f2p_driverId, standings)={ edge_index=[2, 34124] },
  (constructor_standings, f2p_raceId, races)={ edge_index=[2, 13051] },
  (races, rev_f2p_raceId, constructor_standings)={ edge_index=[2, 13051] },
  (constructor_standings, f2p_c

In [None]:
# Edge types of the F1 graph to export.
# Key:   (source node type, relation, target node type), as listed in `f1_data`.
# Value: node type whose `time` tensor is used to timestamp the edges of that
#        type when they are split into train / validation / test.
f1_edges_dict = {
        ('constructor_standings', 'f2p_raceId', 'races'): 'constructor_standings',
        ('races', 'rev_f2p_raceId', 'constructor_standings'): 'races',
        ('constructor_standings', 'f2p_constructorId', 'constructors'): 'constructor_standings',
        ('constructors', 'rev_f2p_constructorId', 'constructor_standings'): 'constructor_standings',
        ('standings', 'f2p_raceId', 'races'): 'standings',
        ('races', 'rev_f2p_raceId', 'standings'): 'races',
        ('standings', 'f2p_driverId', 'drivers'): 'standings',
        ('drivers', 'rev_f2p_driverId', 'standings'): 'standings',
        ('constructor_results', 'f2p_raceId', 'races'): 'constructor_results',
        ('races', 'rev_f2p_raceId', 'constructor_results'): 'races',
        ('constructor_results', 'f2p_constructorId', 'constructors'): 'constructor_results',
        ('constructors', 'rev_f2p_constructorId', 'constructor_results'): 'constructor_results',
        ('results', 'f2p_raceId', 'races'): 'results',
        ('races', 'rev_f2p_raceId', 'results'): 'races',
        ('results', 'f2p_driverId', 'drivers'): 'results',
        ('drivers', 'rev_f2p_driverId', 'results'): 'results',
        ('results', 'f2p_constructorId', 'constructors'): 'results',
        ('constructors', 'rev_f2p_constructorId', 'results'): 'results',
        ('qualifying', 'f2p_raceId', 'races'): 'qualifying',
        ('races', 'rev_f2p_raceId', 'qualifying'): 'races',
        ('qualifying', 'f2p_driverId', 'drivers'): 'qualifying',
        ('drivers', 'rev_f2p_driverId', 'qualifying'): 'qualifying',
        ('qualifying', 'f2p_constructorId', 'constructors'): 'qualifying',
        ('constructors', 'rev_f2p_constructorId', 'qualifying'): 'qualifying',
        ('races', 'f2p_circuitId', 'circuits'): 'races',
        ('circuits', 'rev_f2p_circuitId', 'races'): 'races'
    }

In [None]:
# Fraction of the pairs of each split (per edge type) that is written to disk.
train_proportion = 1/40
valid_proportion = 1/40
test_proportion = 1/40

# Write the transductive files under datasets/F1/transductive.
# NOTE: the files are opened in append mode, so re-running this cell adds
# triples to the existing files instead of replacing them.
write_reduced_KG_files(KG_data = f1_data,
                      db_name = "F1",
                      edges_dict = f1_edges_dict,
                      val_timestamp = f1_val_timestep,
                      test_timestamp = f1_test_timestep,
                      train_proportion = train_proportion,
                      val_proportion = valid_proportion,
                      test_proportion = test_proportion)

# Derive the inductive files (datasets/F1/inductive) from the transductive ones.
build_Inductive_KG_files("F1")

## Downloading H&M Dataset from RelBench

In [11]:
# We download the rel-hm (H&M) dataset
hm_dataset = get_dataset(name="rel-hm", download=True)

# We load the entire database (upto_test_timestamp=False: also the test part)
hm_db = hm_dataset.get_db(upto_test_timestamp = False)
# Column -> semantic type mapping proposed by RelBench for this database
hm_col_to_stype_dict = get_stype_proposal(hm_db)

# Generate graph data (the graph itself and the per-column statistics)
hm_data, hm_col_stats_dict = make_pkey_fkey_graph(
    hm_db,
    col_to_stype_dict = hm_col_to_stype_dict,  # Column types
    text_embedder_cfg = text_embedder_cfg,  # Our chosen text encoder
    cache_dir=os.path.join(
        root_dir, f"rel-hm_materialized_cache"
    ),  # Store materialized graph for convenience
)

Downloading file 'rel-hm/db.zip' from 'https://relbench.stanford.edu/download/rel-hm/db.zip' to '/root/.cache/relbench'.
100%|████████████████████████████████████████| 143M/143M [00:00<00:00, 132GB/s]
Unzipping contents of '/root/.cache/relbench/rel-hm/db.zip' to '/root/.cache/relbench/rel-hm/.'


Loading Database object from /root/.cache/relbench/rel-hm/db...
Done in 2.08 seconds.


Embedding raw data in mini-batch: 100%|██████████| 5360/5360 [00:30<00:00, 177.75it/s]
Embedding raw data in mini-batch: 100%|██████████| 5360/5360 [00:29<00:00, 184.18it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 183.20it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 137.69it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:03<00:00, 111.45it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 196.96it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 196.74it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 151.32it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 145.60it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 193.97it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 183.18it/s]


In [12]:
# Split boundaries provided by the dataset; used below to assign edges to
# train / validation / test.
hm_val_timestep = hm_dataset.val_timestamp
hm_test_timestep = hm_dataset.test_timestamp

print(f"The validation timestep is: {hm_val_timestep}")
print(f"The test timestep is: {hm_test_timestep}")

The validation timestep is: 2020-09-07 00:00:00
The test timestep is: 2020-09-14 00:00:00


In [13]:
# Display the graph to inspect its node types and edge types.
hm_data

HeteroData(
  transactions={
    tf=TensorFrame([15453651, 3]),
    time=[15453651],
  },
  customer={ tf=TensorFrame([1371980, 6]) },
  article={ tf=TensorFrame([105542, 24]) },
  (transactions, f2p_customer_id, customer)={ edge_index=[2, 15453651] },
  (customer, rev_f2p_customer_id, transactions)={ edge_index=[2, 15453651] },
  (transactions, f2p_article_id, article)={ edge_index=[2, 15453651] },
  (article, rev_f2p_article_id, transactions)={ edge_index=[2, 15453651] }
)

In [14]:
# Edge types of the H&M graph to export.
# Key:   (source node type, relation, target node type), as listed in `hm_data`.
# Value: node type whose `time` tensor is used to timestamp the edges of that
#        type when they are split into train / validation / test.
hm_edges_dict = {
        ('transactions', 'f2p_customer_id', 'customer'): 'transactions',
        ('customer', 'rev_f2p_customer_id', 'transactions'): 'transactions',
        ('transactions', 'f2p_article_id', 'article'): 'transactions',
        ('article', 'rev_f2p_article_id', 'transactions'): 'transactions'
    }

In [None]:
# The H&M graph is huge (about 15.4M edges per edge type), so we keep only a
# small fraction of the pairs of each split (per edge type).
train_proportion = 1/20000
valid_proportion = 1/800
test_proportion = 71/20000

# Write the transductive files under datasets/HM/transductive.
# NOTE: the files are opened in append mode, so re-running this cell adds
# triples to the existing files instead of replacing them.
write_reduced_KG_files(KG_data = hm_data,
                        db_name = "HM",
                        edges_dict = hm_edges_dict,
                        val_timestamp = hm_val_timestep,
                        test_timestamp = hm_test_timestep,
                        train_proportion = train_proportion,
                        val_proportion = valid_proportion,
                        test_proportion = test_proportion)

# Derive the inductive files (datasets/HM/inductive) from the transductive ones.
build_Inductive_KG_files("HM")