In [1]:
!pip install pykeen
# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Collecting pykeen
  Downloading pykeen-1.10.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.3/739.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl (4.1 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting more-click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl (6.7 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.5.2-py3-none-any.whl (31 kB)
Collecting docdata (from pykeen)
  Downloading docdata-0.0.3-py3-none-any.whl (5.8 kB)
Collecting class-resolver>=0.3.10 (from pykeen)
  Downloading class_resolver-0.4.2-py3-none-an

In [2]:
"""Embed patients with the biomedical entities (genes and metabolites) using Knowledge graph embedding."""
import os
from typing import Tuple, Optional, Dict

import numpy as np
import pandas as pd
from pykeen.hpo.hpo import hpo_pipeline
from pykeen.models.base import Model
from pykeen.pipeline import pipeline_from_path
from pykeen.triples import TriplesFactory

INFO:pykeen.utils:Using opt_einsum


In [3]:
def _weighted_splitter(
        edgelist: pd.DataFrame,
        train_size: Optional[float] = 0.8,
        validation_size: Optional[float] = 0.1
) -> Tuple[pd.DataFrame, ...]:
    """Split the given edgelist into training, validation and testing sets on the basis of the ratio of relations.

    :param edgelist: Edgelist in the form of (Source, Relation, Target)
    :param train_size: Size of the training data
    :param validation_size: Size of the training data
    :return: Tuple containing the train, validation & test splits
    """
    # Validation size is the size of the percentage of the remaining data (i.e. If required validation size is 10% of
    # the original data & training size is 80% then the new validation size is 50% of the data without the training
    # data. The similar calculation is done for training size, hence it is always 1
    validation_size = validation_size / (1 - train_size)
    test_size = 1

    # Get the unique relations in the network
    unique_relations = sorted(edgelist['relation'].unique())

    data = edgelist.drop_duplicates().copy()

    split = []
    # Split the data to get training, validation and test samples
    for frac_size in [train_size, validation_size, test_size]:
        frames = []
        # Random sampling of the data for every type of relation
        for relation in unique_relations:
            temp = data[data['relation'] == relation].sample(frac=frac_size)

            data = data[~data.index.isin(temp.index)]

            frames.append(temp)
        # Join all the different relations in one dataframe
        split.append(pd.concat(frames, ignore_index=True, sort=False))

    return tuple(split)

In [4]:
def _model_to_numpy(
        model: Model
) -> np.array:
    """Retrieve embedding from the models as a numpy array."""
    return model.entity_embeddings.weight.detach().cpu().numpy()

In [5]:
def run_optimization(dataset: Tuple[TriplesFactory, TriplesFactory, TriplesFactory], model_config: Dict, out_dir: str):
    """Run HPO."""
    training_factory, testing_factory, validation_factory = dataset

    # Define HPO pipeline
    hpo_results = hpo_pipeline(
        dataset=None,
        training=training_factory,
        testing=testing_factory,
        validation=validation_factory,
        model=model_config["model"],
        model_kwargs=model_config["model_kwargs"],
        model_kwargs_ranges=model_config["model_kwargs_ranges"],
        loss=model_config["loss_function"],
        loss_kwargs=model_config["loss_kwargs"],
        loss_kwargs_ranges=model_config["loss_kwargs_ranges"],
        regularizer=model_config["regularizer"],
        optimizer=model_config["optimizer"],
        optimizer_kwargs=model_config["optimizer_kwargs"],
        # optimizer_kwargs_ranges=model_config["optimizer_kwargs_ranges"],
        training_loop=model_config["training_loop"],
        training_kwargs=model_config["training_kwargs"],
        training_kwargs_ranges=model_config["training_kwargs_ranges"],
        negative_sampler=model_config["negative_sampler"],
        negative_sampler_kwargs=model_config["negative_sampler_kwargs"],
        stopper=model_config["stopper"],
        stopper_kwargs=model_config["stopper_kwargs"],
        evaluator=model_config["evaluator"],
        evaluator_kwargs=model_config["evaluator_kwargs"],
        evaluation_kwargs=model_config["evaluation_kwargs"],
        n_trials=model_config["n_trials"],
        timeout=model_config["timeout"],
        metric=model_config["metric"],
        direction=model_config["direction"],
        sampler=model_config["sampler"],
        # pruner=model_config["pruner"],
    )

    optimization_dir = os.path.join(out_dir, 'pykeen_results_optim')
    if not os.path.isdir(optimization_dir):
        os.makedirs(optimization_dir)

    hpo_results.save_to_directory(optimization_dir)

    return None



In [None]:
def run_pipeline(
        dataset: Tuple[TriplesFactory, TriplesFactory, TriplesFactory],
        out_dir: str
):
    """Run Pipeline."""
    training_factory, testing_factory, validation_factory = dataset

    config_path = os.path.join(out_dir, 'pykeen_results_optim', 'best_pipeline', 'pipeline_config.json')
    pipeline_results = pipeline_from_path(
        path=config_path,
        # training=training_factory,
        # testing=testing_factory,
        # validation=validation_factory,
    )

    best_pipeline_dir = os.path.join(out_dir, 'pykeen_results_final')
    if not os.path.isdir(best_pipeline_dir):
        os.makedirs(best_pipeline_dir)

    pipeline_results.save_to_directory(best_pipeline_dir, save_replicates=True)

    return pipeline_results

In [19]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [7]:
%cd drive/MyDrive/capstone
%pwd

/content/drive/MyDrive/capstone


'/content/drive/MyDrive/capstone'

In [8]:
kg = pd.read_csv('./output/subgraph-generation.csv',sep='\t')
edgelist = kg.drop(columns='label')
edgelist = edgelist[['source','relation','target']]
edgelist.columns = ['head', 'relation', 'tail']

In [10]:
edgelist

Unnamed: 0,head,relation,tail
0,PHYHIP,protein_protein,KIF15
1,PHYHIP,protein_protein,ZZEF1
2,PHYHIP,protein_protein,PNPLA2
3,PHYHIP,protein_protein,PAQR5
4,PHYHIP,protein_protein,CA10
...,...,...,...
1786014,GSM6249817,down_reg,ZSCAN16
1786015,GSM6249817,up_reg,ZSWIM2
1786016,GSM6249817,down_reg,ZYG11B
1786017,GSM6249817,up_reg,ZYX


In [11]:
edgelist.relation.value_counts()

protein_protein       642150
bioprocess_protein    289298
up_reg                288569
down_reg              259954
disease_protein       160822
pathway_protein        85183
drug_protein           50933
phenotype_protein       6660
exposure_protein        2424
disease_disease           26
Name: relation, dtype: int64

In [None]:
model_config = {
  "model": "Rotate", # R-GCN, GIN
  "model_kwargs": {"embedding_dim":64,
                    "entity_initializer":'xavier_uniform',
                    "relation_initializer":'xavier_uniform'},
  "model_kwargs_ranges": {},
  "training_loop": "slcwa",
  "optimizer": "adam",
  "optimizer_kwargs": {
    "weight_decay": 1e-3, # check with the weight decay, optional, check with that
    "lr": 0.001
  },
  "loss_function": "NSSALoss", # NSSALoss | cross entropy | few other ones.
  "loss_kwargs": {},
  "loss_kwargs_ranges": {
    "margin": {
      "type": "float",
      "low": 1,
      "high": 30,
      "q": 2.0
    }
    },
    "adversarial_temperature": {
      "type": "float",
      "low": 0.1,
      "high": 1.0,
      "q": 0.1
    },
  "regularizer": "NoRegularizer", # consider that later
  "regularizer_kwargs": {},
  "regularizer_kwargs_ranges": {},
  "negative_sampler": "basic",
  "negative_sampler_kwargs": {},
  "negative_sampler_kwargs_ranges": {
    "num_negs_per_pos": {
      "type": "int",
      "low": 1,
      "high": 50,
      "q": 1
    }
  },
  "create_inverse_triples": False,
  "evaluator": "RankBasedEvaluator",
  "evaluator_kwargs": {
    "filtered": True
  },
  "evaluation_kwargs": {
    "batch_size": 256
  },
  "training_kwargs": {
    "num_epochs": 35, #1000
    "label_smoothing": 0.0,
    "batch_size":512,
    "num_workers":8,
  },
    "training_kwargs_ranges": {},
  "stopper": "early",
  "stopper_kwargs": {
    "frequency": 25,
    "patience": 4,
    "relative_delta":0.002
  },
  "n_trials": 1,
  "timeout": 129600,
  "metric": "hits@10",
  "direction": "maximize",
  "sampler": "random",
  # "pruner":"Hyperband Pruner", # define the seeds
}

### run the below code, but it will cause some errors.

In [13]:
out = './output/RotaE' # convert this path
train, validation, test = _weighted_splitter(
        edgelist=edgelist,
        train_size=0.8,
        validation_size=0.1
    )
train.to_csv(f'{out}/train.edgelist', sep='\t', index=False, header=False)
validation.to_csv(f'{out}/validation.edgelist', sep='\t', index=False, header=False)
test.to_csv(f'{out}/test.edgelist', sep='\t', index=False, header=False)

In [15]:
out = './output/RotaE' # convert this path
create_inverse_triples = False
training_factory = TriplesFactory.from_path(
        path=f'{out}/train.edgelist',
        create_inverse_triples=create_inverse_triples,
    )
validation_factory = TriplesFactory.from_path(
    path=f'{out}/validation.edgelist',
    create_inverse_triples=create_inverse_triples,
)
testing_factory = TriplesFactory.from_path(
    path=f'{out}/test.edgelist',
    create_inverse_triples=create_inverse_triples,
)

In [16]:
training_factory # to verify the triples that we are running is correct!!!

TriplesFactory(num_entities=47964, num_relations=10, create_inverse_triples=False, num_triples=1428814, path="/content/drive/My Drive/capstone/output/RotaE/train.edgelist")

In [None]:
# run_optimization(
#         dataset=(training_factory, validation_factory, testing_factory),
#         model_config=model_config,
#         out_dir=out
#     )

[I 2023-11-28 02:56:22,021] A new study created in memory with name: no-name-d49a52d6-8560-4c6c-bdbc-8d0522436e7d
INFO:pykeen.hpo.hpo:Using model: <class 'pykeen.models.unimodal.rotate.RotatE'>
INFO:pykeen.hpo.hpo:Using loss: <class 'pykeen.losses.NSSALoss'>
INFO:pykeen.hpo.hpo:Using regularizer: <class 'pykeen.regularizers.NoRegularizer'>
INFO:pykeen.hpo.hpo:Using optimizer: <class 'torch.optim.adam.Adam'>
INFO:pykeen.hpo.hpo:Using training loop: <class 'pykeen.training.slcwa.SLCWATrainingLoop'>
INFO:pykeen.hpo.hpo:Using negative sampler: <class 'pykeen.sampling.basic_negative_sampler.BasicNegativeSampler'>
INFO:pykeen.hpo.hpo:Using evaluator: <class 'pykeen.evaluation.rank_based_evaluator.RankBasedEvaluator'>
INFO:pykeen.hpo.hpo:Attempting to maximize both.realistic.hits_at_10
INFO:pykeen.hpo.hpo:Filter validation triples when testing: True
INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /root/.data/pyke

Training epochs on cuda:0:   0%|          | 0/200 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]



Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2209 [00:00<?, ?batch/s]

[W 2023-11-28 03:05:39,684] Trial 0 failed with parameters: {'loss.margin': 3.0, 'loss.adversarial_temperature': 0.9961586015880591, 'negative_sampler.num_negs_per_pos': 9} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/pykeen/hpo/hpo.py", line 259, in __call__
    result = pipeline(
  File "/usr/local/lib/python3.10/dist-packages/pykeen/pipeline/api.py", line 1546, in pipeline
    stopper_instance, configuration, losses, train_seconds = _handle_training(
  File "/usr/local/lib/python3.10/dist-packages/pykeen/pipeline/api.py", line 1190, in _handle_training
    losses = training_loop_instance.train(
  File "/usr/local/lib/python3.10/dist-packages/pykeen/training/training_loop.py", line 378, in train
    result = self._train(
  File "/usr/local/lib/python3.10/dist-pac

KeyboardInterrupt: ignored

In [None]:
from pykeen.pipeline import pipeline
from pykeen.models import RotatE
from pykeen.losses import MarginRankingLoss
result = pipeline(
    model=RotatE,
    training=training_factory,
    testing=testing_factory,
    validation=validation_factory,
    device="gpu",
    model_kwargs=dict(
        embedding_dim=64,
        entity_initializer='xavier_uniform',
        relation_initializer='xavier_uniform',
    ),
    loss="NSSALoss",
    loss_kwargs=dict(
        # margin=6.0,
        # reduction='mean',
    ),
    training_kwargs=dict(
        num_epochs=50,
        sampler='schlichtkrull',
        batch_size=512, # larger, the quicker, cost increases as O(n^2), time decreases as O(n)
        num_workers=8,
        checkpoint_name='my_checkpoint.pt',
        checkpoint_directory='./output/RotaE/',
        checkpoint_frequency=5
    ),
    optimizer="Adam",
     optimizer_kwargs=dict(
         lr=0.005,
     ),
     training_loop="slcwa",
     regularizer = "no",
     evaluation_kwargs=dict(batch_size=256),  # Batch size for evaluation
     stopper='early',
     stopper_kwargs=dict(frequency=25,patience=4,relative_delta=0.002),
     use_tqdm=True,
     random_seed=42,
)
result.save_to_directory('./output/RotaE/')

INFO:pykeen.pipeline.api:=> no training loop checkpoint file found at 'output/RotaE/my_checkpoint.pt'. Creating a new file.
INFO:pykeen.pipeline.api:Using device: gpu
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /root/.data/pykeen/checkpoints/best-model-weights-20b2317d-4572-4eb7-a53b-ae021a1a3205.pt
INFO:pykeen.training.training_loop:=> no checkpoint found at 'output/RotaE/my_checkpoint.pt'. Creating a new file.


Training epochs on cuda:0:   0%|          | 0/50 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 2.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 4.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 6.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 8.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 12.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 14.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 16.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 18.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 22.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 24.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 77.24s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 25: 0.0003191400016796842. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-20b2317d-4572-4eb7-a53b-ae021a1a3205.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 25.
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 25.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 27.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 29.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 31.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 33.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 35.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 37.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 39.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 41.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 43.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 45.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 47.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 49.


Training batches on cuda:0:   0%|          | 0/2791 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=256.
INFO:pykeen.evaluation.evaluator:Evaluation took 77.09s seconds
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 50.


Evaluating on cuda:0:   0%|          | 0.00/179k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 82.73s seconds
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=47964, num_relations=10, create_inverse_triples=False, num_triples=1428814, path="/content/drive/My Drive/capstone/output/RotaE/train.edgelist") to file:///content/drive/MyDrive/capstone/output/RotaE/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///content/drive/MyDrive/capstone/output/RotaE


In [None]:
from pykeen.pipeline import pipeline
from pykeen.models import RotatE
from pykeen.losses import MarginRankingLoss
result = pipeline(
    model=RotatE,
    training=training_factory,
    testing=testing_factory,
    validation=validation_factory,
    device="gpu",
    model_kwargs=dict(
        embedding_dim=64,
        entity_initializer='xavier_uniform',
        relation_initializer='xavier_uniform',
    ),
    loss=MarginRankingLoss,
    loss_kwargs=dict(
        margin=6.0,
        reduction='mean',
    ),
    training_kwargs=dict(
        num_epochs=150,
        sampler='schlichtkrull',
        batch_size=512, # larger, the quicker, cost increases as O(n^2), time decreases as O(n)
        num_workers=8,
    ),
    optimizer="Adam",
     optimizer_kwargs=dict(
         lr=0.005,
     ),
     training_loop="slcwa",
     regularizer = "no",
     evaluation_kwargs=dict(batch_size=256),  # Batch size for evaluation
     stopper='early',
     stopper_kwargs=dict(frequency=25,patience=4,relative_delta=0.002),
     use_tqdm=True,
     random_seed=42,
)
result.save_to_directory('./output/RotaE/')

INFO:pykeen.pipeline.api:Using device: gpu
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /root/.data/pykeen/checkpoints/best-model-weights-c2d427a4-3dee-45b2-a1d0-31c79c59b48f.pt


KeyboardInterrupt: ignored

In [None]:
result.save_to_directory('./output/RotaE/')

In [None]:
import json
config_path = os.path.join(out, 'pykeen_results_optim', 'best_pipeline', 'pipeline_config.json')
with open(config_path,'r') as f:
    config = json.load(f)
# Step 2: Modify the data in memory
config['pipeline']['training'] = f'{out}/train.edgelist'
config['pipeline']['testing'] = f'{out}/test.edgelist'
config['pipeline']['validation'] = f'{out}/validation.edgelist'

# Step 3: Write the modified data back to the file
with open(config_path, 'w') as f:
    json.dump(config, f, indent=4)  # `indent=4` for pretty-printing

In [None]:
results =  run_pipeline(
        dataset=(training_factory, validation_factory, testing_factory),
        out_dir=out
    )

INFO:pykeen.pipeline.api:Using device: None


Training epochs on cuda:0:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/4418 [00:00<?, ?batch/s]



Training batches on cuda:0:   0%|          | 0/4418 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/4418 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/4418 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/4418 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/141k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 59.91s seconds
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=47716, num_relations=10, create_inverse_triples=False, num_triples=1130795, path="/content/drive/MyDrive/capstone/output/RotaE/train.edgelist") to file:///content/drive/MyDrive/capstone/output/RotaE/pykeen_results_final/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///content/drive/MyDrive/capstone/output/RotaE/pykeen_results_final


In [None]:
results.save_to_directory('./output/RotaE/')

INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=47716, num_relations=10, create_inverse_triples=False, num_triples=1130795, path="/content/drive/MyDrive/capstone/output/RotaE/train.edgelist") to file:///content/drive/MyDrive/capstone/output/RotaE/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///content/drive/MyDrive/capstone/output/RotaE


In [None]:
best_model = result.model

In [None]:
design = pd.read_csv('./output/design.csv',sep='\t')
design_norm_df = design.astype(str, copy=True)
unique_nodes = kg[~kg['label'].isna()].drop_duplicates('source')
label_mapping = {patient: label for patient, label in zip(unique_nodes['source'], unique_nodes['label'])}

In [None]:
import torch
torch.save(best_model.state_dict(),"./output/RotaE/model.state_dict.pt")

In [None]:
best_model.entity_representations

ModuleList(
  (0): Embedding(
    (regularizer): NoRegularizer()
    (_embeddings): Embedding(47716, 128)
  )
)

In [None]:
# Get the embedding as a numpy array
embedding_values = best_model.entity_representations[0]._embeddings.weight.detach().cpu().numpy()

# Create columns as component names
embedding_columns = [f'Component_{i}' for i in range(1, embedding_values.shape[1] + 1)]

# Get the nodes of the training triples as index
# node_list = list(best_model.triples_factory.entity_to_id.keys())
node_list = list(training_factory.entity_to_id.keys())
# embedding_index = sorted(node_list, key=lambda x: best_model.triples_factory.entity_to_id[x])
embedding_index = sorted(node_list, key=lambda x: training_factory.entity_to_id[x])
embedding = pd.DataFrame(data=embedding_values, columns=embedding_columns, index=embedding_index)

return_patients = True
if return_patients:
    # TODO: Use clustering before classification to see if embeddings are already good enough
    embedding = embedding[embedding.index.isin(design_norm_df['geo_accession'])]

    for index in embedding.index:
        embedding.at[index, 'label'] = label_mapping[index]

In [None]:
embedding.to_csv('./output/RotaE/embeddings_result.csv',sep='\t')