In [3]:
import h5py
import numpy as np

from biotrainer.inference import Inferencer
from biotrainer.utilities import INTERACTION_INDICATOR
from hvi_toolkit.evaluators import ModelEvaluator, ModelWrapperBiotrainer

from hvi_toolkit.interaction_datasets import DatasetRabiesLyssavirusExperimental

In [4]:
# Load model
# Path to the biotrainer output file of the trained model (relative to this notebook).
model_out_path = "../model/biotrainer_output/out.yml"
# Builds the inferencer from the output file; `output_vars` is indexed further below
# via output_vars["split_results"]["best_split"] to select the split to evaluate.
inferencer, output_vars = Inferencer.create_from_out_file(model_out_path)

Reading ../model/biotrainer_output/out.yml..
Reading checkpoint(s) from directory: ../model/biotrainer_output/FNN/custom_embeddings..




Got 5 split(s): k_fold-strat-1, k_fold-strat-2, k_fold-strat-3, k_fold-strat-4, k_fold-strat-5


In [5]:
# Create wrapper object
# Use the split stored under split_results -> best_split in the loaded output variables.
best_split_name = output_vars["split_results"]["best_split"]
model = ModelWrapperBiotrainer(inferencer=inferencer, split_name=best_split_name)

In [6]:
# Get benchmarks (Work in progress)
# Maps benchmark name -> (dict of interaction id -> concatenated embedding, list of integer targets).
# The cells below fill it in; it is then passed to ModelEvaluator.evaluate_model.
benchmarks_dict = {}

In [10]:
# 1. Rabies Lyssavirus experimental interactions
# Load all human protein embeddings
# (downloaded from https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5)
human_embeddings_path = "../interactome_predictions/rabies_lyssavirus/data/human_per_protein_uniprot.h5"
# np.array(...) copies every dataset into memory, so the HDF5 handle can be
# closed as soon as the dict is built instead of staying open in the kernel.
with h5py.File(human_embeddings_path, 'r') as human_embeddings_file:
    id2emb_human = {idx: np.array(embedding) for (idx, embedding) in human_embeddings_file.items()}
# Load rabies lyssa embeddings
# (extracted from https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/uniprot_sprot/per-protein.h5)
lyssa_embeddings_path = "../interactome_predictions/rabies_lyssavirus/data/rabies_lyssa_per_protein_uniprot.h5"
with h5py.File(lyssa_embeddings_path, 'r') as lyssa_embeddings_file:
    id2emb_lyssa = {idx: np.array(embedding) for (idx, embedding) in lyssa_embeddings_file.items()}
# Load experimental interactions and convert them to the standardized dataset format
# (chained so the name is only ever bound to the standardized dataset).
rabies_lyssa_experimental_interactions = DatasetRabiesLyssavirusExperimental(
    file_path="../hvi_datasets/raw_data/rabies_lyssavirus_zandi/lyssa_experimental_interactions.csv"
).to_standardized_dataset(taxonomy=None)
rabies_lyssa_experimental_interactions_list = rabies_lyssa_experimental_interactions.to_interaction_list()
# Create interactions: Concatenate human and viral embeddings (human_vector ++ viral_vector), len = 2048
# A missing embedding raises KeyError on purpose: a silently skipped interaction
# would change the benchmark without anyone noticing.
rabies_lyssa_experimental_interaction_dict = {}
for interaction in rabies_lyssa_experimental_interactions_list:
    interaction_name = f"{interaction.uniprot_human}{INTERACTION_INDICATOR}{interaction.uniprot_virus}"
    interaction_embedding = np.concatenate([id2emb_human[interaction.uniprot_human], id2emb_lyssa[interaction.uniprot_virus]])
    rabies_lyssa_experimental_interaction_dict[interaction_name] = interaction_embedding

# All experimental interactions are positives (target 1). The targets are derived
# from the dict, not from the interaction list, so a duplicated interaction
# (which collapses into one dict key) cannot make targets and samples differ in length.
rabies_lyssa_experimental_targets = [1 for _ in rabies_lyssa_experimental_interaction_dict]
benchmarks_dict["RabiesLyssavirus"] = (rabies_lyssa_experimental_interaction_dict, rabies_lyssa_experimental_targets)

In [11]:
# 2. Negatome benchmark
negatome_2_interactions_path = "../hvi_datasets/raw_data/negatome2/negatome_2_combined_stringent.txt"
negatome_2_embeddings_path = "../hvi_datasets/raw_data/negatome2/negatome_reduced_prottrans.h5"
# Get embeddings and interactions
# Embeddings are keyed by the "original_id" attribute stored on each HDF5 entry.
# np.array(...) copies the data, so the file is closed once the dict is built.
with h5py.File(negatome_2_embeddings_path, 'r') as negatome_2_embeddings_file:
    id2emb_negatome = {embedding.attrs["original_id"]: np.array(embedding)
                       for (idx, embedding) in negatome_2_embeddings_file.items()}

# Keep only the interactions for which both interactors have an embedding.
negatome_interactions = []
with open(negatome_2_interactions_path, "r") as negatome_2_interactions_file:
    for line in negatome_2_interactions_file:
        interaction_id = "&".join([seq_id.strip() for seq_id in line.split("\t")])
        interactors = interaction_id.split("&")
        if len(interactors) < 2:
            # Blank or tab-less line (e.g. a trailing newline): no interactor pair to read.
            continue
        interactor_left, interactor_right = interactors[0], interactors[1]
        if interactor_left in id2emb_negatome and interactor_right in id2emb_negatome:
            negatome_interactions.append(interaction_id)

# Interaction embedding = left embedding ++ right embedding.
negatome_interaction_embeddings = {}
for interaction_id in negatome_interactions:
    interactor_left, interactor_right = interaction_id.split("&")[:2]
    negatome_interaction_embeddings[interaction_id] = np.concatenate([id2emb_negatome[interactor_left], id2emb_negatome[interactor_right]])
# Every pair in this benchmark is labelled as a negative (target 0).
negatome_targets = [0 for _ in negatome_interaction_embeddings]
benchmarks_dict["Negatome2.0"] = (negatome_interaction_embeddings, negatome_targets)

In [12]:
# Evaluator that runs the wrapped model on every entry of benchmarks_dict (see next cell).
model_evaluator = ModelEvaluator()

In [13]:
# Evaluate the wrapped model on all collected benchmarks (length-bias check disabled).
# NOTE(review): both benchmarks are single-class (RabiesLyssavirus targets are all 1,
# Negatome2.0 targets are all 0), so matthews-corr-coeff is reported as 0 and
# accuracy is the figure to read for each of them.
model_evaluator.evaluate_model(
    model=model,
    benchmarks=benchmarks_dict,
    check_length_bias=False,
)

**Evaluating model on 2 benchmarks:**
**Metrics for benchmark RabiesLyssavirus:**
                     RabiesLyssavirus
accuracy                     0.981132
f1_score                     0.990476
matthews-corr-coeff          0.000000
precision                    1.000000
recall                       0.981132
**Metrics for benchmark Negatome2.0:**
                     Negatome2.0
accuracy                 0.28773
f1_score                 0.00000
matthews-corr-coeff      0.00000
precision                0.00000
recall                   0.00000
