In [None]:
!pip install pykeen -U
# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

In [None]:
"""Embed patients with the biomedical entities (genes and metabolites) using Knowledge graph embedding."""
import os
from typing import Tuple, Optional, Dict

import numpy as np
import pandas as pd
from pykeen.hpo.hpo import hpo_pipeline
from pykeen.models.base import Model
from pykeen.pipeline import pipeline_from_path
from pykeen.triples import TriplesFactory

In [None]:
def _weighted_splitter(
        edgelist: pd.DataFrame,
        train_size: Optional[float] = 0.8,
        validation_size: Optional[float] = 0.1
) -> Tuple[pd.DataFrame, ...]:
    """Split the given edgelist into training, validation and testing sets on the basis of the ratio of relations.

    :param edgelist: Edgelist in the form of (Source, Relation, Target)
    :param train_size: Size of the training data
    :param validation_size: Size of the training data
    :return: Tuple containing the train, validation & test splits
    """
    # Validation size is the size of the percentage of the remaining data (i.e. If required validation size is 10% of
    # the original data & training size is 80% then the new validation size is 50% of the data without the training
    # data. The similar calculation is done for training size, hence it is always 1
    validation_size = validation_size / (1 - train_size)
    test_size = 1

    # Get the unique relations in the network
    unique_relations = sorted(edgelist['relation'].unique())

    data = edgelist.drop_duplicates().copy()

    split = []
    # Split the data to get training, validation and test samples
    for frac_size in [train_size, validation_size, test_size]:
        frames = []
        # Random sampling of the data for every type of relation
        for relation in unique_relations:
            temp = data[data['relation'] == relation].sample(frac=frac_size)

            data = data[~data.index.isin(temp.index)]

            frames.append(temp)
        # Join all the different relations in one dataframe
        split.append(pd.concat(frames, ignore_index=True, sort=False))

    return tuple(split)

In [None]:
# Load the tab-separated knowledge-graph table.
kg = pd.read_csv(
    '../output/subgraph-generation.csv',
    sep='\t',
)

In [None]:
# Reduce the knowledge graph to its triple columns and rename them to head / relation / tail.
edgelist = (
    kg.drop(columns='label')
    .loc[:, ['source', 'relation', 'target']]
    .rename(columns={'source': 'head', 'target': 'tail'})
)
edgelist.head()

In [None]:
def run_optimization(dataset: Tuple[TriplesFactory, TriplesFactory, TriplesFactory], model_config: Dict, out_dir: str) -> None:
    """Run a PyKEEN hyper-parameter optimization (HPO) and save its results to disk.

    :param dataset: Triples factories in the order (training, testing, validation). Note that this differs
        from the (train, validation, test) order returned by ``_weighted_splitter``.
    :param model_config: Configuration dictionary. Every key read below must be present: ``model``,
        ``model_kwargs``, ``loss_function``, ``regularizer``, ``optimizer``, ``optimizer_kwargs``,
        ``training_loop``, ``training_kwargs``, ``negative_sampler``, ``negative_sampler_kwargs``, ``stopper``,
        ``stopper_kwargs``, ``evaluator``, ``evaluator_kwargs``, ``evaluation_kwargs``, ``n_trials``,
        ``timeout``, ``metric``, ``direction`` and ``sampler``.
    :param out_dir: Directory in which the ``pykeen_results_optim`` result folder is created
    :return: None, the results are only written to ``<out_dir>/pykeen_results_optim``
    :raises KeyError: If a required key is missing from ``model_config``
    """
    training_factory, testing_factory, validation_factory = dataset

    # Create the result directory before starting the (potentially very long) optimization so that an
    # unusable ``out_dir`` fails immediately instead of after all trials have finished.
    optimization_dir = os.path.join(out_dir, 'pykeen_results_optim')
    os.makedirs(optimization_dir, exist_ok=True)

    # Define HPO pipeline
    hpo_results = hpo_pipeline(
        dataset=None,
        training=training_factory,
        testing=testing_factory,
        validation=validation_factory,
        model=model_config["model"],
        model_kwargs=model_config["model_kwargs"],
        # model_kwargs_ranges=model_config["model_kwargs_ranges"],
        loss=model_config["loss_function"],
        # loss_kwargs=model_config["loss_kwargs"],
        # loss_kwargs_ranges=model_config["loss_kwargs_ranges"],
        regularizer=model_config["regularizer"],
        optimizer=model_config["optimizer"],
        optimizer_kwargs=model_config["optimizer_kwargs"],
        # optimizer_kwargs_ranges=model_config["optimizer_kwargs_ranges"],
        training_loop=model_config["training_loop"],
        training_kwargs=model_config["training_kwargs"],
        # training_kwargs_ranges=model_config["training_kwargs_ranges"],
        negative_sampler=model_config["negative_sampler"],
        negative_sampler_kwargs=model_config["negative_sampler_kwargs"],
        stopper=model_config["stopper"],
        stopper_kwargs=model_config["stopper_kwargs"],
        evaluator=model_config["evaluator"],
        evaluator_kwargs=model_config["evaluator_kwargs"],
        evaluation_kwargs=model_config["evaluation_kwargs"],
        n_trials=model_config["n_trials"],
        timeout=model_config["timeout"],
        metric=model_config["metric"],
        direction=model_config["direction"],
        sampler=model_config["sampler"],
        # pruner=model_config["pruner"],
    )

    hpo_results.save_to_directory(optimization_dir)

    return None



In [None]:
out = '../output/RGCN'
# ``to_csv`` does not create missing parent directories
os.makedirs(out, exist_ok=True)

# Seed NumPy's global random state, which pandas' ``sample`` draws from when no ``random_state`` is given,
# so that the saved splits are identical between runs.
np.random.seed(42)
train, validation, test = _weighted_splitter(
    edgelist=edgelist,
    train_size=0.8,
    validation_size=0.1,
)

# Header-less, tab-separated (head, relation, tail) files that are read back by the triples factories
train.to_csv(f'{out}/train.edgelist', sep='\t', index=False, header=False)
validation.to_csv(f'{out}/validation.edgelist', sep='\t', index=False, header=False)
test.to_csv(f'{out}/test.edgelist', sep='\t', index=False, header=False)

In [None]:
out = '../output/RGCN'
create_inverse_triples = False

# The training factory defines the entity / relation label-to-id mappings
training_factory = TriplesFactory.from_path(
    path=f'{out}/train.edgelist',
    create_inverse_triples=create_inverse_triples,
)
# Validation and testing must re-use the training mappings. A factory built on its own derives its ids
# from the labels in its own file, so the same entity could get a different id than the one the model was
# trained with, which would make evaluation meaningless.
# NOTE(review): triples whose entities never occur in the training split presumably cannot be mapped and
# are dropped by PyKEEN - confirm how many triples are affected.
validation_factory = TriplesFactory.from_path(
    path=f'{out}/validation.edgelist',
    entity_to_id=training_factory.entity_to_id,
    relation_to_id=training_factory.relation_to_id,
    create_inverse_triples=create_inverse_triples,
)
testing_factory = TriplesFactory.from_path(
    path=f'{out}/test.edgelist',
    entity_to_id=training_factory.entity_to_id,
    relation_to_id=training_factory.relation_to_id,
    create_inverse_triples=create_inverse_triples,
)

In [None]:
import multiprocessing

# Number of CPUs reported by the operating system
cores = multiprocessing.cpu_count()
# Include the value itself: the label alone carries no information
print(f'the number of cpu cores in this colab: {cores}')

In [None]:
# Display the training triples factory as a quick sanity check of what was loaded.
training_factory

In [None]:
from pykeen.pipeline import pipeline

# Train and evaluate an R-GCN model on the three triples factories.
result = pipeline(
    # --- data ---
    training=training_factory,
    testing=testing_factory,
    validation=validation_factory,
    # device="gpu",
    # --- model ---
    model='RGCN',
    model_kwargs={
        'decomposition': 'bases',
        'decomposition_kwargs': {
            'num_bases': 3,
        },
        'embedding_dim': 128,
        'interaction': 'DistMult',
        'num_layers': 2,
    },
    # NOTE(review): the previous comment here said "node classification"; the pipeline is fed triples,
    # so confirm that this loss is the intended choice.
    loss='CrossEntropyLoss',
    # regularizer = "None",
    # --- optimisation ---
    optimizer="Adam",
    optimizer_kwargs={
        'lr': 0.005,
    },
    # --- training ---
    training_loop="slcwa",
    training_kwargs={
        'num_epochs': 75,
        'sampler': 'schlichtkrull',
        # Author's note: larger batches run faster but cost more memory
        'batch_size': 4096,
        # NOTE(review): hard-coded; presumably should not exceed the number of available CPU cores
        'num_workers': 8,
    },
    negative_sampler="basic",
    # --- evaluation ---
    evaluator='RankBasedEvaluator',
    evaluator_kwargs={
        'filtered': True,
    },
    evaluation_kwargs={'batch_size': 1024},  # Batch size for evaluation
    # --- early stopping ---
    # NOTE(review): with 75 epochs and a check every 25 epochs there are at most 3 checks, so a patience
    # of 4 presumably can never trigger - confirm whether early stopping is meant to be active.
    stopper='early',
    stopper_kwargs={
        'frequency': 25,
        'patience': 4,
        'relative_delta': 0.002,
    },
    # --- misc ---
    use_tqdm=True,
    random_seed=42,
)

In [None]:
# Persist the pipeline result in the RGCN output directory.
result.save_to_directory('../output/RGCN/')

In [None]:
# Plot the losses stored in the pipeline result.
result.plot_losses()

In [None]:
# Plot entities and relations - presumably a low-dimensional projection of the learned representations;
# see PyKEEN's PipelineResult.plot_er to confirm.
result.plot_er()

In [None]:
result.plot()  # Overall summary figure of the run (presumably combines the plots above - confirm in the PyKEEN docs).

In [None]:
# Import a trained model from its pickle file
import torch

# SECURITY: unpickling can execute arbitrary code - only load files that were produced by this notebook
# or come from a trusted source.
# ``weights_only=False`` is required because the file holds a whole pickled model object (its attributes
# are accessed below) rather than a plain state dict; recent PyTorch releases default to
# ``weights_only=True`` and refuse to load such files.
my_pykeen_model = torch.load('../output/RGCN/trained_model.pkl', weights_only=False)

In [None]:
# Design table, with every column converted to strings
design = pd.read_csv('../output/design.csv', sep='\t')
design_norm_df = design.astype(str, copy=True)

# One row per source node that carries a label (first occurrence wins) ...
unique_nodes = kg.loc[kg['label'].notna()].drop_duplicates('source')
# ... turned into a lookup table: source node (presumably the patient id) -> label
label_mapping = dict(zip(unique_nodes['source'], unique_nodes['label']))

In [None]:
# Extract the entity embedding matrix of the loaded model as a NumPy array.
# NOTE(review): this goes through a private attribute path that depends on the installed PyKEEN version,
# and presumably yields the base embeddings before the R-GCN layers are applied - confirm this is intended.
embedding_values = my_pykeen_model.entity_representations[0].entity_embeddings._embeddings.weight.detach().cpu().numpy()
# Create columns as component names
embedding_columns = [f'Component_{i}' for i in range(1, embedding_values.shape[1] + 1)]

# Get the nodes of the training triples as index, ordered by their numeric id.
# NOTE(review): assumes row i of the matrix belongs to the entity with id i in ``training_factory`` and that
# the loaded model was trained with exactly this factory - confirm.
node_list = list(training_factory.entity_to_id.keys())
embedding_index = sorted(node_list, key=lambda x: training_factory.entity_to_id[x])
embedding = pd.DataFrame(data=embedding_values, columns=embedding_columns, index=embedding_index)

return_patients = True
if return_patients:
    # TODO: Use clustering before classification to see if embeddings are already good enough
    # Keep only the nodes listed in the design table. The explicit copy makes the result an independent
    # frame, so adding the label column below does not write into a slice of the full embedding table.
    embedding = embedding[embedding.index.isin(design_norm_df['geo_accession'])].copy()

    # Attach all labels in one assignment instead of cell by cell; a node without a label still raises
    # a KeyError
    embedding['label'] = [label_mapping[node] for node in embedding.index]

In [None]:
# Write the embeddings to the same '../output/RGCN' directory that the rest of the notebook reads from
# and writes to ('./output/RGCN' resolves to a different location relative to the notebook).
embedding.to_csv('../output/RGCN/embeddings_result.csv', sep='\t')