# IMPORTS


Looking in indexes: https://download.pytorch.org/whl/cu118
Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu118.html


# Downloading Datasets from RelBench

In [1]:
!pip install relbench[full]



In [2]:
import relbench
relbench.__version__

'1.1.0'

In [3]:
from relbench.datasets import get_dataset_names, get_dataset

get_dataset_names()

['rel-amazon',
 'rel-avito',
 'rel-event',
 'rel-f1',
 'rel-hm',
 'rel-stack',
 'rel-trial']

## Testing on F1 dataset

In [4]:
# Load the RelBench F1 dataset ("rel-f1"), downloading it if needed.
# NOTE: the `hm_*` variable names are used for the F1 data in this section,
# even though the same names are reused for the H&M dataset further down.
hm_dataset = get_dataset(name="rel-f1", download=True)

from relbench.modeling.utils import get_stype_proposal

hm_db = hm_dataset.get_db(upto_test_timestamp = False)  # We need the whole database, including the test-period rows.
# Proposed semantic type for the columns of hm_db
# (presumably a {table: {column: stype}} mapping -- confirm against the relbench docs).
col_to_stype_dict = get_stype_proposal(hm_db)

import torch

from torch_geometric.seed import seed_everything


# Prefer the GPU when one is visible; CPU-only runs work but are much slower.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if device.type == "cuda":
    # On GPU runs, restrict PyTorch to a single CPU thread.
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
root_dir = "./data"

from typing import List, Optional
from sentence_transformers import SentenceTransformer
from torch import Tensor

class GloveTextEmbedding:
    """Callable text encoder backed by a GloVe SentenceTransformer checkpoint.

    Instances are callables taking a list of strings, which is how the
    notebook passes them as ``text_embedder`` to ``TextEmbedderConfig``.
    """

    def __init__(self, device: Optional[torch.device] = None):
        """Load the GloVe checkpoint, forwarding ``device`` to SentenceTransformer."""
        glove_checkpoint = "sentence-transformers/average_word_embeddings_glove.6B.300d"
        self.model = SentenceTransformer(glove_checkpoint, device=device)

    def __call__(self, sentences: List[str]) -> Tensor:
        """Encode ``sentences`` with the wrapped model, requesting a tensor result."""
        embeddings = self.model.encode(sentences, convert_to_tensor=True)
        return embeddings


import os

# Root directory where files will be stored
root_dir = "./data"

# Run the from-scratch graph computation
from torch_frame.config.text_embedder import TextEmbedderConfig
from relbench.modeling.graph import make_pkey_fkey_graph

# Configure the text encoder
# Text columns are embedded with the GloVe encoder, using a batch size of 256.
text_embedder_cfg = TextEmbedderConfig(
    text_embedder=GloveTextEmbedding(device=device),
    batch_size=256
)

# Generate graph data.
# The database loaded in this cell is rel-f1, so the materialized tensors are
# cached under an F1-specific directory instead of the "rel-hm" cache directory
# that the H&M section of the notebook writes to.
hm_data, col_stats_dict = make_pkey_fkey_graph(
    hm_db,
    col_to_stype_dict=col_to_stype_dict,  # Column types
    text_embedder_cfg=text_embedder_cfg,  # Our chosen text encoder
    cache_dir=os.path.join(
        root_dir, "rel-f1_materialized_cache"
    ),  # Store materialized graph for convenience
)


Loading Database object from /root/.cache/relbench/rel-f1/db...
Done in 0.12 seconds.




cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
hm_val_timestep = hm_dataset.val_timestamp
hm_test_timestep = hm_dataset.test_timestamp

print(f"The validation timestep is: {hm_val_timestep}")
print(f"The test timestep is: {hm_test_timestep}")

The validation timestep is: 2005-01-01 00:00:00
The test timestep is: 2010-01-01 00:00:00


In [6]:
hm_data

HeteroData(
  constructor_standings={
    tf=TensorFrame(
  num_cols=4,
  num_rows=13051,
  numerical (3): ['points', 'position', 'wins'],
  timestamp (1): ['date'],
  has_target=False,
  device='cpu',
),
    time=[13051],
  },
  standings={
    tf=TensorFrame(
  num_cols=4,
  num_rows=34124,
  numerical (3): ['points', 'position', 'wins'],
  timestamp (1): ['date'],
  has_target=False,
  device='cpu',
),
    time=[34124],
  },
  constructors={ tf=TensorFrame(
  num_cols=3,
  num_rows=211,
  embedding (3): ['constructorRef', 'name', 'nationality'],
  has_target=False,
  device='cuda:0',
) },
  constructor_results={
    tf=TensorFrame(
  num_cols=2,
  num_rows=12290,
  numerical (1): ['points'],
  timestamp (1): ['date'],
  has_target=False,
  device='cpu',
),
    time=[12290],
  },
  results={
    tf=TensorFrame(
  num_cols=11,
  num_rows=26080,
  numerical (10): ['fastestLap', 'grid', 'laps', 'milliseconds', 'number', 'points', 'position', 'positionOrder', 'rank', 'statusId'],
  times

In [7]:
edge_index = hm_data[("constructor_standings", "f2p_raceId", "races")].edge_index

In [8]:
edge_index

tensor([[    0,     1,     2,  ..., 13048, 13049, 13050],
        [   64,    64,    64,  ...,  1090,  1090,  1090]])

In [9]:
edge_index = hm_data[("standings", "f2p_raceId", "races")].edge_index

In [10]:
edge_index

tensor([[    0,     1,     2,  ..., 34121, 34122, 34123],
        [    0,     0,     0,  ...,  1090,  1090,  1090]])

In [11]:
import pandas as pd
def train_inference_split_F1_database(pairs, time_node):
    """Split (source, target) index pairs into train / validation / test by time.

    For every row of ``pairs`` the timestamp is read from the module-level
    ``hm_data[time_node].time`` at the position given by the row's first
    column, and interpreted as seconds since the Unix epoch.

    Args:
        pairs: 2-D indexable (e.g. a tensor) whose column 0 is the source
            index and column 1 the target index.
        time_node: Key into ``hm_data`` of the node store holding ``time``.

    Returns:
        Three lists of ``(source, target)`` tuples: pairs before
        ``hm_val_timestep``, pairs in ``[hm_val_timestep, hm_test_timestep)``,
        and all remaining pairs.
    """
    buckets = {"train": [], "val": [], "test": []}

    for row in range(pairs.shape[0]):
        src = pairs[row, 0].item()
        dst = pairs[row, 1].item()

        # Integer epoch seconds -> pandas.Timestamp, so it can be compared
        # with the split boundaries.
        when = pd.to_datetime(hm_data[time_node].time[src].item(), unit='s')

        if when < hm_val_timestep:
            split = "train"
        elif when < hm_test_timestep:
            split = "val"
        else:
            split = "test"
        buckets[split].append((src, dst))

    return buckets["train"], buckets["val"], buckets["test"]

In [12]:
def pick_pairs(KG_data, edge):
    """Return the (source, target) node-index pairs of one edge type.

    Args:
        KG_data: Object indexable by ``edge`` whose value exposes an
            ``edge_index`` tensor with source indices in row 0 and target
            indices in row 1.
        edge: Key identifying the edge type inside ``KG_data``.

    Returns:
        Tensor with one row per edge; row ``i`` is ``[source_i, target_i]``.
    """
    edge_index = KG_data[edge].edge_index
    # Pair row 0 (sources) with row 1 (targets): [2, E] -> [E, 2].
    return torch.stack((edge_index[0], edge_index[1]), dim=1)

In [13]:
def write_KG_files(KG_data, db_name, edges_dict, overwrite=False):
    """Write the edges of a graph as tab-separated triples, split by time.

    For every edge type in ``edges_dict`` the (source, target) pairs are
    split into train / validation / test with
    ``train_inference_split_F1_database`` and written to
    ``datasets/<db_name>/transductive/{train,valid,test}.txt``, one
    ``<source type>_<index>\\t<relation>\\t<target type>_<index>`` line per edge.

    Args:
        KG_data: Graph object accepted by ``pick_pairs``.
        db_name: Name of the sub-directory created under ``datasets``.
        edges_dict: Maps each ``(source type, relation, target type)`` edge
            key to the node type whose timestamps drive the split.
        overwrite: When True the three files are truncated first. The
            default (False) appends, so calling the function twice with the
            same arguments duplicates every triple.
    """
    # Create the output directory if it does not exist yet.
    dataset_path = os.path.join('datasets', db_name, 'transductive')
    os.makedirs(dataset_path, exist_ok=True)

    train_file_path = os.path.join(dataset_path, 'train.txt')
    val_file_path = os.path.join(dataset_path, 'valid.txt')
    test_file_path = os.path.join(dataset_path, 'test.txt')

    mode = 'w' if overwrite else 'a'

    with open(train_file_path, mode) as train_file, \
         open(val_file_path, mode) as val_file, \
         open(test_file_path, mode) as test_file:

        for edge, time_node in edges_dict.items():
            source_edge_name, relation, target_edge_name = edge[0], edge[1], edge[2]

            # (source, target) index pairs of this edge type.
            pairs_of_nodes = pick_pairs(KG_data, edge)

            # The splitter reads timestamps with column 0 of the pairs. When
            # the time-bearing node type is the edge's *target* (reverse
            # edges), put the target index in column 0 so the timestamp is
            # taken from the right node, and swap back when writing.
            time_on_target = time_node != source_edge_name and time_node == target_edge_name
            if time_on_target:
                pairs_of_nodes = pairs_of_nodes[:, [1, 0]]

            splits = train_inference_split_F1_database(pairs = pairs_of_nodes,
                                                       time_node = time_node)

            # Write train, then validation, then test triples for this edge type.
            for out_file, split_pairs in zip((train_file, val_file, test_file), splits):
                for first, second in split_pairs:
                    src, dst = (second, first) if time_on_target else (first, second)
                    out_file.write(f"{source_edge_name}_{src}\t{relation}\t{target_edge_name}_{dst}\n")


In [14]:
# Edge types of the F1 graph to export, keyed by
# (source node type, relation name, target node type).
# Each value is the node type that write_KG_files forwards as `time_node`,
# i.e. the node store whose `time` tensor is used to assign the edge to
# train / validation / test.
# Note that for several `rev_*` edges the value is the edge's target node
# type rather than its source node type.
F1_edges_dict = {
        ('constructor_standings', 'f2p_raceId', 'races'): 'constructor_standings',
        ('races', 'rev_f2p_raceId', 'constructor_standings'): 'races',
        ('constructor_standings', 'f2p_constructorId', 'constructors'): 'constructor_standings',
        ('constructors', 'rev_f2p_constructorId', 'constructor_standings'): 'constructor_standings',
        ('standings', 'f2p_raceId', 'races'): 'standings',
        ('races', 'rev_f2p_raceId', 'standings'): 'races',
        ('standings', 'f2p_driverId', 'drivers'): 'standings',
        ('drivers', 'rev_f2p_driverId', 'standings'): 'standings',
        ('constructor_results', 'f2p_raceId', 'races'): 'constructor_results',
        ('races', 'rev_f2p_raceId', 'constructor_results'): 'races',
        ('constructor_results', 'f2p_constructorId', 'constructors'): 'constructor_results',
        ('constructors', 'rev_f2p_constructorId', 'constructor_results'): 'constructor_results',
        ('results', 'f2p_raceId', 'races'): 'results',
        ('races', 'rev_f2p_raceId', 'results'): 'races',
        ('results', 'f2p_driverId', 'drivers'): 'results',
        ('drivers', 'rev_f2p_driverId', 'results'): 'results',
        ('results', 'f2p_constructorId', 'constructors'): 'results',
        ('constructors', 'rev_f2p_constructorId', 'results'): 'results',
        ('qualifying', 'f2p_raceId', 'races'): 'qualifying',
        ('races', 'rev_f2p_raceId', 'qualifying'): 'races',
        ('qualifying', 'f2p_driverId', 'drivers'): 'qualifying',
        ('drivers', 'rev_f2p_driverId', 'qualifying'): 'qualifying',
        ('qualifying', 'f2p_constructorId', 'constructors'): 'qualifying',
        ('constructors', 'rev_f2p_constructorId', 'qualifying'): 'qualifying',
        ('races', 'f2p_circuitId', 'circuits'): 'races',
        ('circuits', 'rev_f2p_circuitId', 'races'): 'races'
    }

In [15]:
write_KG_files(KG_data = hm_data,
               db_name = "F1",
               edges_dict = F1_edges_dict)

In [16]:
import random

def _read_lines(path):
    """Return all lines of the text file at ``path`` (line endings kept)."""
    with open(path, 'r') as handle:
        return handle.readlines()


def _write_lines(path, lines):
    """Overwrite the text file at ``path`` with ``lines``."""
    with open(path, 'w') as handle:
        handle.writelines(lines)


def _drop_one_third(lines):
    """Randomly keep all but ``len(lines) // 3`` of ``lines``, in shuffled order."""
    kept = random.sample(lines, len(lines) - len(lines) // 3)
    random.shuffle(kept)
    return kept


def build_Inductive_KG_files(db_name):
    """Derive the inductive split files from the transductive ones.

    Reads ``train.txt``, ``valid.txt`` and ``test.txt`` from
    ``datasets/<db_name>/transductive`` and writes, under
    ``datasets/<db_name>/inductive``:

    * ``train.txt``           -- the training triples, shuffled;
    * ``inference_graph.txt`` -- all validation and test triples, shuffled;
    * ``inference_valid.txt`` -- the validation triples minus a random third;
    * ``inference_test.txt``  -- the test triples minus a random third.

    Randomness comes from the global ``random`` module state.

    Args:
        db_name: Name of the dataset sub-directory under ``datasets``.
    """
    transductive_dir = os.path.join('datasets', db_name, 'transductive')
    inductive_dir = os.path.join('datasets', db_name, 'inductive')
    os.makedirs(inductive_dir, exist_ok=True)

    # 1. Shuffled copy of the training triples.
    train_lines = _read_lines(os.path.join(transductive_dir, 'train.txt'))
    random.shuffle(train_lines)
    _write_lines(os.path.join(inductive_dir, 'train.txt'), train_lines)

    # 2. The inference graph holds every validation and test triple.
    val_lines = _read_lines(os.path.join(transductive_dir, 'valid.txt'))
    test_lines = _read_lines(os.path.join(transductive_dir, 'test.txt'))
    inference_lines = val_lines + test_lines
    random.shuffle(inference_lines)
    _write_lines(os.path.join(inductive_dir, 'inference_graph.txt'), inference_lines)

    # 3. Validation queries: the validation triples with one third removed.
    _write_lines(os.path.join(inductive_dir, 'inference_valid.txt'),
                 _drop_one_third(val_lines))

    # 4. Test queries: the test triples with one third removed.
    _write_lines(os.path.join(inductive_dir, 'inference_test.txt'),
                 _drop_one_third(test_lines))

In [17]:
build_Inductive_KG_files("F1")

## Testing ULTRA on F1 dataset

### Downloading ULTRA

In [1]:
!pip install torch==2.1.0 --index-url https://download.pytorch.org/whl/cu118
!pip install torch-scatter==2.1.2 torch-sparse==0.6.18 torch-geometric==2.4.0 -f https://data.pyg.org/whl/torch-2.1.0+cu118.html
!pip install ninja easydict pyyaml

Looking in indexes: https://download.pytorch.org/whl/cu118
Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu118.html


In [21]:
!git clone https://github.com/DeepGraphLearning/ULTRA.git

fatal: destination path 'ULTRA' already exists and is not an empty directory.


### Creating new F1 Dataset

In [22]:
import sys
sys.path.insert(0,'/content/ULTRA')

In [23]:
from ultra.datasets import InductiveDataset

In [29]:
class F1DatasetInductive(InductiveDataset):
    """Inductive F1 dataset pointing at the split files under ``datasets/F1/inductive``."""

    name = "f1_dataInductive"

    # Order: training graph, inference graph, inference validation, inference test.
    urls = [
        "datasets/F1/inductive/" + split_file
        for split_file in (
            "train.txt",
            "inference_graph.txt",
            "inference_valid.txt",
            "inference_test.txt",
        )
    ]

In [32]:
# Add the F1DatasetInductive class to ULTRA's own datasets module
# (presumably so that `run.py --dataset F1DatasetInductive` can find it -- confirm).
# The class is appended only once: without the check, every re-run of this
# cell would add another copy of the definition to datasets.py.
with open('/content/ULTRA/ultra/datasets.py', 'r') as f:
    already_registered = "class F1DatasetInductive(" in f.read()

if not already_registered:
    with open('/content/ULTRA/ultra/datasets.py', 'a') as f:
        f.write("""
class F1DatasetInductive(InductiveDataset):
    urls = [
        "/content/datasets/F1/inductive/train.txt",
        "/content/datasets/F1/inductive/inference_graph.txt",
        "/content/datasets/F1/inductive/inference_valid.txt",
        "/content/datasets/F1/inductive/inference_test.txt",
    ]
    name = "f1_dataInductive"
""")

In [33]:
!python ULTRA/script/run.py -c ULTRA/config/inductive/inference.yaml --dataset F1DatasetInductive --version v1 --epochs 0 --bpe null --gpus [0] --ckpt /content/ULTRA/ckpts/ultra_4g.pth

10:19:57   Random seed: 1024
10:19:57   Config file: ULTRA/config/inductive/inference.yaml
10:19:57   {'checkpoint': '/content/ULTRA/ckpts/ultra_4g.pth',
 'dataset': {'class': 'F1DatasetInductive',
             'root': '~/git/ULTRA/kg-datasets/',
             'version': 'v1'},
 'model': {'class': 'Ultra',
           'entity_model': {'aggregate_func': 'sum',
                            'class': 'EntityNBFNet',
                            'hidden_dims': [64, 64, 64, 64, 64, 64],
                            'input_dim': 64,
                            'layer_norm': True,
                            'message_func': 'distmult',
                            'short_cut': True},
           'relation_model': {'aggregate_func': 'sum',
                              'class': 'RelNBFNet',
                              'hidden_dims': [64, 64, 64, 64, 64, 64],
                              'input_dim': 64,
                              'layer_norm': True,
                              'message_func': 

## Downloading H&M dataset

In [4]:
hm_dataset = get_dataset(name="rel-hm", download=True)

Downloading file 'rel-hm/db.zip' from 'https://relbench.stanford.edu/download/rel-hm/db.zip' to '/root/.cache/relbench'.
100%|███████████████████████████████████████| 143M/143M [00:00<00:00, 27.1GB/s]
Unzipping contents of '/root/.cache/relbench/rel-hm/db.zip' to '/root/.cache/relbench/rel-hm/.'


In [5]:
# Salvo i timestep di validation e test in modo da usarli dopo per la creazione dei file di input per ULTRA
hm_val_timestep = hm_dataset.val_timestamp
hm_test_timestep = hm_dataset.test_timestamp

print(f"The validation timestep is: {hm_val_timestep}")
print(f"The test timestep is: {hm_test_timestep}")

The validation timestep is: 2020-09-07 00:00:00
The test timestep is: 2020-09-14 00:00:00


In [6]:
hm_db = hm_dataset.get_db()

Loading Database object from /root/.cache/relbench/rel-hm/db...
Done in 7.86 seconds.


In [7]:
hm_db.table_dict.keys()

dict_keys(['customer', 'article', 'transactions'])

### Checking Tables

In [None]:
table = hm_db.table_dict["transactions"]
table

Table(df=
              t_dat  customer_id  article_id     price  sales_channel_id
0        2019-09-07          155       51985  0.010153                 1
1        2019-09-07          155       51985  0.010153                 1
2        2019-09-07          155       83127  0.042356                 1
3        2019-09-07          155        6066  0.005068                 1
4        2019-09-07          155       78525  0.033881                 1
...             ...          ...         ...       ...               ...
15187282 2020-09-14      1371926       93801  0.025407                 1
15187283 2020-09-14      1371926       17155  0.033881                 1
15187284 2020-09-14      1371926       65802  0.030492                 1
15187285 2020-09-14      1371926       85883  0.016932                 1
15187286 2020-09-14      1371926      104763  0.042356                 1

[15187287 rows x 5 columns],
  fkey_col_to_pkey_table={'customer_id': 'customer', 'article_id': 'article'},
  pke

In [None]:
table = hm_db.table_dict["customer"]
table

Table(df=
         customer_id   FN  Active club_member_status fashion_news_frequency  \
0                  0  NaN     NaN             ACTIVE                   NONE   
1                  1  NaN     NaN             ACTIVE                   NONE   
2                  2  NaN     NaN             ACTIVE                   NONE   
3                  3  NaN     NaN             ACTIVE                   NONE   
4                  4  1.0     1.0             ACTIVE              Regularly   
...              ...  ...     ...                ...                    ...   
1371975      1371975  NaN     NaN             ACTIVE                   NONE   
1371976      1371976  NaN     NaN             ACTIVE                   NONE   
1371977      1371977  1.0     1.0             ACTIVE              Regularly   
1371978      1371978  1.0     1.0             ACTIVE              Regularly   
1371979      1371979  NaN     NaN         PRE-CREATE                   NONE   

          age                            

In [None]:
table = hm_db.table_dict["article"]
table

Table(df=
        article_id  product_code               prod_name  product_type_no  \
0                0        108775               Strap top              253   
1                1        108775               Strap top              253   
2                2        108775           Strap top (1)              253   
3                3        110065       OP T-shirt (Idro)              306   
4                4        110065       OP T-shirt (Idro)              306   
...            ...           ...                     ...              ...   
105537      105537        953450  5pk regular Placement1              302   
105538      105538        953763       SPORT Malaga tank              253   
105539      105539        956217         Cartwheel dress              265   
105540      105540        957375        CLAIRE HAIR CLAW               72   
105541      105541        959461            Lounge dress              265   

       product_type_name  product_group_name  graphical_appearanc

### Downloading a task

In [None]:
from relbench.tasks import get_task_names, get_task

get_task_names("rel-hm")

['user-item-purchase', 'user-churn', 'item-sales']

In [None]:
hm_task = get_task("rel-hm", "user-item-purchase", download=True)

Downloading file 'rel-hm/tasks/user-item-purchase.zip' from 'https://relbench.stanford.edu/download/rel-hm/tasks/user-item-purchase.zip' to '/root/.cache/relbench'.
100%|█████████████████████████████████████| 46.9M/46.9M [00:00<00:00, 18.2GB/s]
Unzipping contents of '/root/.cache/relbench/rel-hm/tasks/user-item-purchase.zip' to '/root/.cache/relbench/rel-hm/tasks/.'


In [None]:
from relbench.base import TaskType
assert hm_task.task_type == TaskType.LINK_PREDICTION

In [None]:
train_table = hm_task.get_table("train")
val_table = hm_task.get_table("val")
test_table = hm_task.get_table("test")

In [None]:
train_table

Table(df=
         timestamp  customer_id  \
0       2019-12-09       149853   
1       2019-12-09       435491   
2       2019-12-09       600889   
3       2019-12-09      1271535   
4       2019-12-09       124560   
...            ...          ...   
3878446 2020-04-20       408061   
3878447 2020-04-20      1138840   
3878448 2020-03-30       140490   
3878449 2020-03-30      1094930   
3878450 2020-03-30      1217756   

                                                article_id  
0                                           [11667, 83069]  
1        [8061, 56842, 70123, 83386, 14038, 70122, 3315...  
2                                           [25756, 72271]  
3         [78428, 38992, 91389, 86016, 2556, 72566, 10378]  
4                                                  [80745]  
...                                                    ...  
3878446                                            [82437]  
3878447                                           [101299]  
3878448             

## Creating the Graph

In [8]:
import torch

from torch_geometric.seed import seed_everything

# Use CUDA when it is available; otherwise fall back to the (much slower) CPU.
has_gpu_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = has_gpu_device
del has_gpu_device
if device.type == "cuda":
    # On GPU runs, restrict PyTorch to a single CPU thread.
    torch.set_num_threads(1)
print(device)

# Set the seed for generating random numbers to ensure reproducibility
seed_everything(42)

# Path to the directory for caching graph data
root_dir = "./data"

cuda


In [9]:
from relbench.modeling.utils import get_stype_proposal

hm_db = hm_dataset.get_db(upto_test_timestamp = False)  # Noi dobbiamo scaricare tutto il database, anche quello di test.
col_to_stype_dict = get_stype_proposal(hm_db)

Loading Database object from /root/.cache/relbench/rel-hm/db...
Done in 2.04 seconds.


In [10]:
from typing import List, Optional
from sentence_transformers import SentenceTransformer
from torch import Tensor

class GloveTextEmbedding:
    """Text embedder that delegates to a GloVe SentenceTransformer model.

    Calling an instance with a list of strings returns the model's encoding
    of those strings as a tensor.
    """

    def __init__(self, device: Optional[torch.device] = None):
        """Instantiate the GloVe model, passing ``device`` to SentenceTransformer."""
        model_name = "sentence-transformers/average_word_embeddings_glove.6B.300d"
        self.model = SentenceTransformer(model_name, device=device)

    def __call__(self, sentences: List[str]) -> Tensor:
        """Return ``self.model.encode`` of ``sentences`` with ``convert_to_tensor=True``."""
        encoded = self.model.encode(sentences, convert_to_tensor=True)
        return encoded

In [11]:
import os

# Root directory where files will be stored
root_dir = "./data"

# Run the from-scratch graph computation
from torch_frame.config.text_embedder import TextEmbedderConfig
from relbench.modeling.graph import make_pkey_fkey_graph

# Text columns are embedded with the GloVe encoder, using a batch size of 256.
text_embedder_cfg = TextEmbedderConfig(text_embedder=GloveTextEmbedding(device=device), batch_size=256)

# Build the graph from the H&M database; the materialized tensors are cached
# under root_dir so they can be reused.
hm_data, col_stats_dict = make_pkey_fkey_graph(
    hm_db,
    col_to_stype_dict=col_to_stype_dict,
    text_embedder_cfg=text_embedder_cfg,
    cache_dir=os.path.join(root_dir, "rel-hm_materialized_cache"),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/248 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

(…)WordEmbeddings/wordembedding_config.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/480M [00:00<?, ?B/s]

(…)beddings/whitespacetokenizer_config.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding raw data in mini-batch: 100%|██████████| 5360/5360 [00:28<00:00, 187.12it/s]
Embedding raw data in mini-batch: 100%|██████████| 5360/5360 [00:33<00:00, 159.58it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 178.20it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 170.69it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:04<00:00, 83.03it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 165.92it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 191.31it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 190.50it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 165.47it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 157.40it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00:02<00:00, 143.28it/s]
Embedding raw data in mini-batch: 100%|██████████| 413/413 [00

In [12]:
hm_data

HeteroData(
  customer={ tf=TensorFrame([1371980, 6]) },
  article={ tf=TensorFrame([105542, 24]) },
  transactions={
    tf=TensorFrame([15453651, 3]),
    time=[15453651],
  },
  (transactions, f2p_customer_id, customer)={ edge_index=[2, 15453651] },
  (customer, rev_f2p_customer_id, transactions)={ edge_index=[2, 15453651] },
  (transactions, f2p_article_id, article)={ edge_index=[2, 15453651] },
  (article, rev_f2p_article_id, transactions)={ edge_index=[2, 15453651] }
)

In [13]:
hm_data[("transactions", "f2p_customer_id", "customer")]

{'edge_index': tensor([[       0,        1,        2,  ..., 15453648, 15453649, 15453650],
        [     155,      155,      155,  ...,  1371721,  1371747,  1371960]])}

Arrivati a questo punto abbiamo il nostro dataset sotto forma di grafo.

Ora, quello che dobbiamo fare è rappresentare le relazioni presenti nel grafo in quattro file .txt:

* "transductive_train_set_link",
* "inference_graph_link",
* "inference_valid_set_link",
* "inference_test_set_link".


## KG .txt files creation

In [14]:
print(f"The validation timestep is: {hm_val_timestep}")
print(f"The test timestep is: {hm_test_timestep}")

The validation timestep is: 2020-09-07 00:00:00
The test timestep is: 2020-09-14 00:00:00


In [15]:
print(type(hm_val_timestep))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


*   Information before September 7, 2020 (the validation timestamp) can be used for training.
*   Information from September 7, 2020 up to, but not including, September 14, 2020 (the test timestamp) can be used for validation.
*   Information from September 14, 2020 onward can be used for testing.

In [16]:
# The temporal information lives on the "transactions" nodes: `time` holds one timestamp per transaction
hm_data["transactions"].time

tensor([1567814400, 1567814400, 1567814400,  ..., 1600732800, 1600732800,
        1600732800])

In [17]:
# prendo tutti gli edges di questo tipo presenti
edge_index = hm_data[("transactions", "f2p_customer_id", "customer")].edge_index

In [18]:
# Estraggo le coppie di nodi
source_nodes = edge_index[0]  # Indici dei nodi sorgente (transactions)
target_nodes = edge_index[1]  # Indici dei nodi target (customer)

In [19]:
# combino le coppie di indici [0, 155] vuol dire che la transazione 0 è stata fatta dal customer 155
pairs_of_indexes = torch.stack((source_nodes, target_nodes), dim=1)

In [20]:
pairs_of_indexes

tensor([[       0,      155],
        [       1,      155],
        [       2,      155],
        ...,
        [15453648,  1371721],
        [15453649,  1371747],
        [15453650,  1371960]])

In [21]:
len(pairs_of_indexes)

15453651

In [29]:
import numpy as np
import pandas as pd

# Downsample pairs_of_indexes: keep one pair out of every 4500
# (about 3440 pairs -> about 13760 in total across train and inference for the 4 relations).
num_pairs = len(pairs_of_indexes)
num_to_keep = num_pairs // 4500

# Draw the row indices to keep at random, without repetition.
random_indices = np.random.choice(num_pairs, size=num_to_keep, replace=False)

In [30]:
reduced_pairs = pairs_of_indexes[random_indices]
print(reduced_pairs.shape[0])

3434


In [31]:
def train_inference_splitting(pairs):
    """Split (transaction, target) index pairs into train / validation / test by time.

    The timestamp of each pair is read from the module-level
    ``hm_data["transactions"].time`` at the pair's first index and interpreted
    as seconds since the Unix epoch.

    Args:
        pairs: 2-D indexable (e.g. a tensor); column 0 holds the transaction
            index, column 1 the index of the node it points to.

    Returns:
        Three lists of ``(source, target)`` tuples: pairs before
        ``hm_val_timestep``, pairs in ``[hm_val_timestep, hm_test_timestep)``,
        and all remaining pairs.
    """
    train_split, val_split, test_split = [], [], []

    for idx in range(pairs.shape[0]):
        pair = (pairs[idx, 0].item(), pairs[idx, 1].item())

        # Epoch seconds of the transaction, converted to a pandas.Timestamp
        # so it can be compared with the split boundaries.
        epoch_seconds = hm_data["transactions"].time[pair[0]].item()
        stamp = pd.to_datetime(epoch_seconds, unit='s')

        if stamp < hm_val_timestep:
            destination = train_split
        elif stamp < hm_test_timestep:
            destination = val_split
        else:
            destination = test_split
        destination.append(pair)

    return train_split, val_split, test_split

In [32]:
train_pairs, val_pairs, test_pairs = train_inference_splitting(reduced_pairs)

In [33]:
print(len(train_pairs))
print(len(val_pairs))
print(len(test_pairs))

3292
62
80


In [43]:
# The message (inference) graph must contain every validation and test pair,
# but not the other way round, so a share of val_pairs and test_pairs is dropped here.

validation_count = len(val_pairs) * 4 // 5  # number of pairs kept in the validation set
test_count = len(test_pairs) * 4 // 5  # number of pairs kept in the test set


validation_indices = np.random.choice(len(val_pairs), size=validation_count, replace=False)
test_indices = np.random.choice(len(test_pairs), size=test_count, replace=False)

In [44]:
red_val_pairs = [val_pairs[i] for i in validation_indices]
red_test_pairs = [test_pairs[i] for i in test_indices]

In [45]:
print(len(red_val_pairs))
print(len(red_test_pairs))

49
64


In [None]:
	import os

  save_dir = f"./{args.data_tgt}/"
	os.makedirs(save_dir, exist_ok=True)
	write(save_dir + 'train.txt', train)
	write(save_dir + 'kg_inference.txt', test)

# ULTRA

In [None]:
!pip install torch==2.1.0 --index-url https://download.pytorch.org/whl/cu118
!pip install torch-scatter==2.1.2 torch-sparse==0.6.18 torch-geometric==2.4.0 -f https://data.pyg.org/whl/torch-2.1.0+cu118.html
!pip install ninja easydict pyyaml

In [None]:
!git clone https://github.com/DeepGraphLearning/ULTRA.git

In [None]:
!python ULTRA/script/run.py -c ULTRA/config/inductive/inference.yaml --dataset FB15k237Inductive --version v1 --epochs 0 --bpe null --gpus [0] --ckpt /content/ULTRA/ckpts/ultra_4g.pth