In [1]:
!nvidia-smi

Thu Nov 30 01:57:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install pykeen -U
# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Collecting pykeen
  Downloading pykeen-1.10.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.3/739.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl (4.1 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
Collecting more-click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl (6.7 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.5.2-py3-none-any.whl (31 kB)
Collecting docdata (from pykeen)
  Downloading docdata-0.0.3-py3-none-any.whl (5.8 kB)
Collecting class-resolver>=0.3.10 (from pykeen)
  Downloading class_resolver-0.4.2-py3-none-an

In [3]:
"""Embed patients with the biomedical entities (genes and metabolites) using Knowledge graph embedding."""
import os
from typing import Tuple, Optional, Dict

import numpy as np
import pandas as pd
from pykeen.hpo.hpo import hpo_pipeline
from pykeen.models.base import Model
from pykeen.pipeline import pipeline_from_path
from pykeen.triples import TriplesFactory

INFO:pykeen.utils:Using opt_einsum


In [4]:
def _weighted_splitter(
        edgelist: pd.DataFrame,
        train_size: Optional[float] = 0.8,
        validation_size: Optional[float] = 0.1
) -> Tuple[pd.DataFrame, ...]:
    """Split the given edgelist into training, validation and testing sets on the basis of the ratio of relations.

    :param edgelist: Edgelist in the form of (Source, Relation, Target)
    :param train_size: Size of the training data
    :param validation_size: Size of the training data
    :return: Tuple containing the train, validation & test splits
    """
    # Validation size is the size of the percentage of the remaining data (i.e. If required validation size is 10% of
    # the original data & training size is 80% then the new validation size is 50% of the data without the training
    # data. The similar calculation is done for training size, hence it is always 1
    validation_size = validation_size / (1 - train_size)
    test_size = 1

    # Get the unique relations in the network
    unique_relations = sorted(edgelist['relation'].unique())

    data = edgelist.drop_duplicates().copy()

    split = []
    # Split the data to get training, validation and test samples
    for frac_size in [train_size, validation_size, test_size]:
        frames = []
        # Random sampling of the data for every type of relation
        for relation in unique_relations:
            temp = data[data['relation'] == relation].sample(frac=frac_size) # random sampling may be an issue

            data = data[~data.index.isin(temp.index)]

            frames.append(temp)
        # Join all the different relations in one dataframe
        split.append(pd.concat(frames, ignore_index=True, sort=False))

    return tuple(split)

In [35]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [6]:
%cd drive/MyDrive/capstone
%pwd

/content/drive/MyDrive/capstone


'/content/drive/MyDrive/capstone'

In [7]:
kg = pd.read_csv('output/subgraph-generation.csv',sep='\t')

In [8]:
edgelist = kg.drop(columns='label')
edgelist = edgelist[['source','relation','target']]
edgelist.columns = ['head', 'relation', 'tail']
edgelist.head()

Unnamed: 0,head,relation,tail
0,PHYHIP,protein_protein,KIF15
1,PHYHIP,protein_protein,ZZEF1
2,PHYHIP,protein_protein,PNPLA2
3,PHYHIP,protein_protein,PAQR5
4,PHYHIP,protein_protein,CA10


In [9]:
def run_optimization(dataset: Tuple[TriplesFactory, TriplesFactory, TriplesFactory], model_config: Dict, out_dir: str):
    """Run HPO."""
    training_factory, testing_factory, validation_factory = dataset

    # Define HPO pipeline
    hpo_results = hpo_pipeline(
        dataset=None,
        training=training_factory,
        testing=testing_factory,
        validation=validation_factory,
        model=model_config["model"],
        model_kwargs=model_config["model_kwargs"],
        # model_kwargs_ranges=model_config["model_kwargs_ranges"],
        loss=model_config["loss_function"],
        # loss_kwargs=model_config["loss_kwargs"],
        # loss_kwargs_ranges=model_config["loss_kwargs_ranges"],
        regularizer=model_config["regularizer"],
        optimizer=model_config["optimizer"],
        optimizer_kwargs=model_config["optimizer_kwargs"],
        # optimizer_kwargs_ranges=model_config["optimizer_kwargs_ranges"],
        training_loop=model_config["training_loop"],
        training_kwargs=model_config["training_kwargs"],
        # training_kwargs_ranges=model_config["training_kwargs_ranges"],
        negative_sampler=model_config["negative_sampler"],
        negative_sampler_kwargs=model_config["negative_sampler_kwargs"],
        stopper=model_config["stopper"],
        stopper_kwargs=model_config["stopper_kwargs"],
        evaluator=model_config["evaluator"],
        evaluator_kwargs=model_config["evaluator_kwargs"],
        evaluation_kwargs=model_config["evaluation_kwargs"],
        n_trials=model_config["n_trials"],
        timeout=model_config["timeout"],
        metric=model_config["metric"],
        direction=model_config["direction"],
        sampler=model_config["sampler"],
        # pruner=model_config["pruner"],
    )

    optimization_dir = os.path.join(out_dir, 'pykeen_results_optim')
    if not os.path.isdir(optimization_dir):
        os.makedirs(optimization_dir)

    hpo_results.save_to_directory(optimization_dir)

    return None



In [None]:
model_config = {
  "model": "R-GCN", # RotatE, R-GCN, GIN
  "model_kwargs": {
    'embedding_dim' : 100, # shallow embeddings(256); if GNN - 128
    'interaction':"DistMult" # decoder
  },
  "training_loop": "slcwa",
  "optimizer": "adam",
  "optimizer_kwargs": {
    "weight_decay": 1e-3, # check with the weight decay, optional, check with that
    "lr": 0.005
  },
  "loss_function": "CrossEntropyLoss", # NSSALoss | cross entropy | few other ones.
  "regularizer": "NoRegularizer", # consider that later
  "regularizer_kwargs": {},
  "regularizer_kwargs_ranges": {},
  "negative_sampler": "basic",
  "negative_sampler_kwargs": {"num_negs_per_pos":25}, #
  "create_inverse_triples": False,
  "evaluator": "RankBasedEvaluator",
  "evaluator_kwargs": {
    "filtered": True
  },
  "evaluation_kwargs": {
    "batch_size": 256
  },
  "training_kwargs": {
    "num_workers":12,
    "num_epochs": 100, #1000
    "label_smoothing": 0.0,
    "batch_size":1024
  },
  "stopper": "early",
  "stopper_kwargs": {
    "frequency": 25,
    "patience": 4,
    "relative_delta":0.002
  },
  "n_trials": 3,
  "timeout": 129600,
  "metric": "hits@10",
  "direction": "maximize",
  "sampler": "random",
  # "pruner":"Hyperband Pruner", # define the seeds
}

In [28]:
# ------warning---------
# out = './output/RGCN'
# train, validation, test = _weighted_splitter(
#         edgelist=edgelist,
#         train_size=0.8,
#         validation_size=0.1
#     )
# train.to_csv(f'{out}/train.edgelist', sep='\t', index=False, header=False)
# validation.to_csv(f'{out}/validation.edgelist', sep='\t', index=False, header=False)
# test.to_csv(f'{out}/test.edgelist', sep='\t', index=False, header=False)

In [37]:
out = './output/RGCN'
create_inverse_triples = False
training_factory = TriplesFactory.from_path(
        path=f'{out}/train.edgelist',
        create_inverse_triples=create_inverse_triples,
    )
validation_factory = TriplesFactory.from_path(
    path=f'{out}/validation.edgelist',
    create_inverse_triples=create_inverse_triples,
)
testing_factory = TriplesFactory.from_path(
    path=f'{out}/test.edgelist',
    create_inverse_triples=create_inverse_triples,
)

In [None]:
# from pykeen.hpo import hpo_pipeline
# hpo_pipeline_result = hpo_pipeline(
#     dataset=None,
#     training=training_factory,
#     testing=testing_factory,
#     validation=validation_factory,
#     model='RGCN',
#     model_kwargs=dict(embedding_dim=50,use_batch_norm=None),
# )

In [30]:
import multiprocessing

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(f'the number of cpu cores is {cores} in this colab')

the number of cpu cores is 12 in this colab


In [38]:
training_factory

TriplesFactory(num_entities=47972, num_relations=10, create_inverse_triples=False, num_triples=1428814, path="/content/drive/My Drive/capstone/output/RGCN/train.edgelist")

In [41]:
from pykeen.pipeline import pipeline
result = pipeline(
    model='RGCN',
    training=training_factory,
    testing=testing_factory,
    validation=validation_factory,
    device="gpu",
    loss='CrossEntropyLoss', # node classification
    model_kwargs= dict(
        decomposition='bases',
        decomposition_kwargs=dict(
        num_bases=3,
        ),
        embedding_dim=100,
        interaction='DistMult',
        num_layers=2,
     ),
     training_kwargs=dict(
         num_epochs=75,
         sampler='schlichtkrull',
         batch_size=1024, # larger, the quicker, cost increases as O(n^2), time decreases as O(n)
         num_workers=12,
         checkpoint_name='my_checkpoint.pt',
         checkpoint_directory='./output/RGCN/PrimekG3',
         checkpoint_frequency=5
     ),
     training_loop="slcwa",
     regularizer = "no",
     optimizer="Adam",
     optimizer_kwargs=dict(
         lr=0.005
     ),
     negative_sampler="basic",
     evaluator='RankBasedEvaluator',
     evaluator_kwargs=dict(
         filtered=True,
     ),
     evaluation_kwargs=dict(batch_size=512),  # Batch size for evaluation
     stopper='early',
     stopper_kwargs=dict(frequency=25,patience=4,relative_delta=0.002),
     use_tqdm=True,
     random_seed=42
)
result.save_to_directory('./output/RGCN/PrimekG3')

INFO:pykeen.pipeline.api:=> no training loop checkpoint file found at 'output/RGCN/PrimekG3/my_checkpoint.pt'. Creating a new file.
INFO:pykeen.pipeline.api:Using device: gpu
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(3, 10000)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(3, 10000)
      )
    )
  )
  (self_loop): Linear(in_features=100, out_features=100, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(3, 10000)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(3, 10000)
     

Training epochs on cuda:0:   0%|          | 0/75 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 2.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 4.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 6.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 8.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 12.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 14.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 16.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 18.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 22.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 24.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 45.88s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 25: 0.0003443352649701856. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-84387cd8-edf6-454c-bc27-a50e0e3d390b.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 25.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 26.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 28.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 30.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 32.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 34.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 36.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 38.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 40.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 42.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 44.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 46.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 48.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=1024.
INFO:pykeen.evaluation.evaluator:Evaluation took 45.23s seconds
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 50.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 52.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 54.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 56.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 58.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 60.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 62.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 64.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 66.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 68.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 70.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 72.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 74.


Training batches on cuda:0:   0%|          | 0/1396 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 45.37s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 75: 0.0003919263178522438. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-84387cd8-edf6-454c-bc27-a50e0e3d390b.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 75.
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 75.


Evaluating on cuda:0:   0%|          | 0.00/179k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 50.65s seconds
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=47972, num_relations=10, create_inverse_triples=False, num_triples=1428814, path="/content/drive/My Drive/capstone/output/RGCN/train.edgelist") to file:///content/drive/MyDrive/capstone/output/RGCN/PrimekG3/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///content/drive/MyDrive/capstone/output/RGCN/PrimekG3


In [43]:
# result.plot() # plot the visualization graph.

In [None]:
result.plot_er()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-50-b1f743fc5bdb>", line 1, in <cell line: 1>
    result.plot_er()
  File "/usr/local/lib/python3.10/dist-packages/pykeen/pipeline/api.py", line 356, in plot_er
    return plot_er(self, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pykeen/pipeline/plot_utils.py", line 216, in plot_er
    ax.scatter(x, y, color="black")
  File "/usr/local/lib/python3.10/dist-packages/matplotlib/__init__.py", line 1442, in inner
    return func(ax, *map(sanitize_sequence, args), **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_axes.py", line 4711, in scatter
    self.add_collection(collection)
  File "/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.py", line 2263, in add_collection
    self._unstale_viewLim()
  File "/usr/local/lib/python3.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Error in callback <function _draw_all_if_interactive at 0x7d78fd505ea0> (for post_execute):
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-50-b1f743fc5bdb>", line 1, in <cell line: 1>
    result.plot_er()
  File "/usr/local/lib/python3.10/dist-packages/pykeen/pipeline/api.py", line 356, in plot_er
    return plot_er(self, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pykeen/pipeline/plot_utils.py", line 216, in plot_er
    ax.scatter(x, y, color="black")
  File "/usr/local/lib/python3.10/dist-packages/matplotlib/__init__.py", line 1442, in inner
    return func(ax, *map(sanitize_sequence, args), **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_axes.py", line 4711, in scatter
    self.add_collection(collection)
  File "/usr/local/lib/python3.10/dist-packages/matplotlib/axes/_base.p

In [45]:
#import a trained pkl
import torch
my_pykeen_model = torch.load('./output/RGCN/PrimekG3/trained_model.pkl')

In [46]:
design = pd.read_csv('./output/scoring-5.csv',sep='\t',index_col=0)
design = design[['geo_accession','label']]
design_norm_df = design.astype(str, copy=True)
unique_nodes = kg[~kg['label'].isna()].drop_duplicates('source')
label_mapping = {patient: label for patient, label in zip(unique_nodes['source'], unique_nodes['label'])}

In [47]:
embedding_values = my_pykeen_model.entity_representations[0].entity_embeddings._embeddings.weight.detach().cpu().numpy()
# Create columns as component names
embedding_columns = [f'Component_{i}' for i in range(1, embedding_values.shape[1] + 1)]

# Get the nodes of the training triples as index
# node_list = list(best_model.triples_factory.entity_to_id.keys())
node_list = list(training_factory.entity_to_id.keys())
# embedding_index = sorted(node_list, key=lambda x: best_model.triples_factory.entity_to_id[x])
embedding_index = sorted(node_list, key=lambda x: training_factory.entity_to_id[x])
embedding = pd.DataFrame(data=embedding_values, columns=embedding_columns, index=embedding_index)

return_patients = True
if return_patients:
    # TODO: Use clustering before classification to see if embeddings are already good enough
    embedding = embedding[embedding.index.isin(design_norm_df['geo_accession'])]

    for index in embedding.index:
        embedding.at[index, 'label'] = label_mapping[index]

In [48]:
embedding.to_csv('./output/RGCN/PrimekG3/embeddings_result_100.csv',sep='\t')