In [1]:
import h5py
import numpy as np

from biotrainer.inference import Inferencer
from hvi_toolkit.interaction_datasets import DatasetRabiesLyssavirusExperimental

In [2]:
# Load all human per-protein embeddings into memory, keyed by the entry names stored in the file
# (downloaded from https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5)
human_embeddings_path = "data/human_per_protein_uniprot.h5"
# Open the HDF5 file in a context manager so the handle is released once loading is done.
# np.array(...) materializes every entry as an in-memory array, so the dict stays usable after closing.
with h5py.File(human_embeddings_path, 'r') as human_embeddings_file:
    id2emb_sequence_human = {idx: np.array(embedding) for (idx, embedding) in human_embeddings_file.items()}

In [3]:
# Load the rabies lyssavirus per-protein embeddings into memory, keyed by the entry names stored in the file
# (extracted from https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/uniprot_sprot/per-protein.h5)
lyssa_embeddings_path = "data/rabies_lyssa_per_protein_uniprot.h5"
# Open the HDF5 file in a context manager so the handle is released once loading is done.
# np.array(...) materializes every entry as an in-memory array, so the dict stays usable after closing.
with h5py.File(lyssa_embeddings_path, 'r') as lyssa_embeddings_file:
    id2emb_sequence_lyssa = {idx: np.array(embedding) for (idx, embedding) in lyssa_embeddings_file.items()}

In [4]:
# Build one candidate interaction per (human protein, lyssa protein) pair.
# Key: "<human id>&<lyssa id>"; value: human embedding followed by the viral embedding
# (human_vector ++ viral_vector), len = 2048
interaction_dict = {
    f"{human_id}&{lyssa_id}": np.concatenate([human_vector, lyssa_vector])
    for human_id, human_vector in id2emb_sequence_human.items()
    for lyssa_id, lyssa_vector in id2emb_sequence_lyssa.items()
}

In [5]:
# Sanity check: the number of interactions must equal (#human proteins) x (#lyssa proteins),
# i.e. no pair was dropped or overwritten by a colliding interaction name
number_human_proteins = len(id2emb_sequence_human)
number_lyssa_proteins = len(id2emb_sequence_lyssa)
number_interactions = len(interaction_dict)
print(f"Human proteins: {number_human_proteins}, Lyssa proteins: {number_lyssa_proteins}")
print(f"Interactions: {number_interactions}")
assert number_interactions == number_human_proteins * number_lyssa_proteins

Human proteins: 20592, Lyssa proteins: 60
Interactions: 1235520


In [6]:
# Restore the inferencer (and the variables recorded alongside it) from the biotrainer output file.
# NOTE(review): the saved cell output reports reading "../model/biotrainer_output/out.yml", which
# differs from the path used here - the stored output may be stale; confirm which path is current.
model_out_path = "../../model/biotrainer_output/out.yml"
inferencer_and_output_vars = Inferencer.create_from_out_file(model_out_path)
hvi_inferencer, output_vars = inferencer_and_output_vars

Reading ../model/biotrainer_output/out.yml..
Reading checkpoint(s) from directory: ../model/biotrainer_output/FNN/custom_embeddings..




Got 5 split(s): k_fold-strat-1, k_fold-strat-2, k_fold-strat-3, k_fold-strat-4, k_fold-strat-5


In [7]:
# Predict every human-lyssa pair with the split stored under output_vars["split_results"]["best_split"]
best_split_name = output_vars["split_results"]["best_split"]
predictions_rabies_lyssa_interactome = hvi_inferencer.from_embeddings(
    embeddings=interaction_dict,
    split_name=best_split_name,
)

In [8]:
# Count the pairs predicted as interacting.
# Each mapped prediction is converted with int() and summed - presumably labels are "0"/"1"; verify.
mapped_predictions = predictions_rabies_lyssa_interactome['mapped_predictions']
number_positive_predictions = sum(int(label) for label in mapped_predictions.values())
total_predictions = len(mapped_predictions)
positive_percentage = 100 * number_positive_predictions / total_predictions
print(f"Positive interactions: {number_positive_predictions}/{total_predictions} ({positive_percentage}%)")

Positive interactions: 715245/1235520 (57.89020007770008%)


In [9]:
# Evaluate the interactome predictions on the experimentally verified interactions
rabies_lyssa_experimental_interactions = DatasetRabiesLyssavirusExperimental(
    file_path="../../hvi_datasets/raw_data/rabies_lyssavirus_zandi/lyssa_experimental_interactions.csv"
).to_standardized_dataset(taxonomy=None)

# Rebuild the "<human id>&<virus id>" keys under which the interactome predictions are stored
experimental_frame = rabies_lyssa_experimental_interactions.data_frame
experimental_interaction_ids = [
    f"{human_id}&{virus_id}"
    for human_id, virus_id in zip(experimental_frame['Uniprot_human'], experimental_frame['Uniprot_virus'])
]

# Every experimental interaction must be present in the predictions (a missing key raises KeyError);
# summing int(prediction) counts the ones predicted as interacting
interactome_predictions = predictions_rabies_lyssa_interactome["mapped_predictions"]
correct_predictions = sum(int(interactome_predictions[interaction]) for interaction in experimental_interaction_ids)

number_experimental = len(rabies_lyssa_experimental_interactions)
print(f"Correct predictions: {correct_predictions}/{number_experimental} ({100 * correct_predictions / number_experimental})")

Correct predictions: 52/53 (98.11320754716981)


In [14]:
# Inspect the model's error margins on the experimentally verified interactions via monte-carlo dropout.
# Only the embeddings of those interactions are passed on, picked out of the full interaction dict.
experimental_interaction_embeddings = dict(
    (experimental_id, interaction_dict[experimental_id]) for experimental_id in experimental_interaction_ids
)
mcd_split_name = output_vars["split_results"]["best_split"]
# NOTE(review): confidence_level=0.05 presumably corresponds to 95% intervals - confirm in the biotrainer docs
predictions_rabies_lyssa_exp_mcd = hvi_inferencer.from_embeddings_with_monte_carlo_dropout(
    embeddings=experimental_interaction_embeddings,
    split_name=mcd_split_name,
    n_forward_passes=30,
    confidence_level=0.05,
    seed=42,
)
print(predictions_rabies_lyssa_exp_mcd)

{'Q9GZZ6&O92284': {'prediction': '1', 'mcd_mean': tensor([0.3888, 0.6112], device='cuda:0'), 'mcd_lower_bound': tensor([0.3607, 0.5832], device='cuda:0'), 'mcd_upper_bound': tensor([0.4168, 0.6393], device='cuda:0')}, 'P13591&O92284': {'prediction': '1', 'mcd_mean': tensor([0.2043, 0.7957], device='cuda:0'), 'mcd_lower_bound': tensor([0.1714, 0.7628], device='cuda:0'), 'mcd_upper_bound': tensor([0.2372, 0.8286], device='cuda:0')}, 'P08138&P32550': {'prediction': '1', 'mcd_mean': tensor([0.2741, 0.7259], device='cuda:0'), 'mcd_lower_bound': tensor([0.2408, 0.6927], device='cuda:0'), 'mcd_upper_bound': tensor([0.3073, 0.7592], device='cuda:0')}, 'P08138&O92284': {'prediction': '1', 'mcd_mean': tensor([0.2325, 0.7675], device='cuda:0'), 'mcd_lower_bound': tensor([0.2074, 0.7423], device='cuda:0'), 'mcd_upper_bound': tensor([0.2577, 0.7926], device='cuda:0')}, 'P08138&P08667': {'prediction': '1', 'mcd_mean': tensor([0.2543, 0.7457], device='cuda:0'), 'mcd_lower_bound': tensor([0.2165, 0.70

In [11]:
# Write the interactome predictions to a CSV file: one row per interaction, with the
# "<human id>&<virus id>" key split back into its two protein ids
interactome_save_path = "rabies_lyssa_human_interactome_predictions.csv"
with open(interactome_save_path, "w") as interactome_save_file:
    interactome_save_file.write("Uniprot_human,Uniprot_virus,Prediction\n")
    for interaction_id, prediction in predictions_rabies_lyssa_interactome["mapped_predictions"].items():
        id_parts = interaction_id.split("&")
        uniprot_human, uniprot_virus = id_parts[0], id_parts[1]
        interactome_save_file.write(f"{uniprot_human},{uniprot_virus},{prediction}\n")