In [1]:
import evoVAE.utils.seq_tools as st
import evoVAE.utils.metrics as mt
from evoVAE.models.seqVAE import SeqVAE
from typing import List, Tuple
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import numpy as np
import yaml
from evoVAE.loss.standard_loss import KL_divergence, sequence_likelihood


This notebook can be used to test new features for a model without having to use the WandB service

In [2]:
with open("extant_config.yaml", "r") as stream:
    settings = yaml.safe_load(stream)

seq_len = 770 # A4 Human length 
input_dims = seq_len * settings["AA_count"]

model = SeqVAE(
    input_dims=input_dims,
    latent_dims=settings["latent_dims"],
    hidden_dims=settings["hidden_dims"],
    config=settings,
)

device = "cpu"
model.load_state_dict(torch.load("a4_extants_r1_model_state.pt", map_location=device))
model

SeqVAE(
  (encoder): Sequential(
    (0): Sequential(
      (0): Linear(in_features=16170, out_features=256, bias=True)
      (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): Dropout(p=0.025, inplace=False)
      (3): LeakyReLU(negative_slope=0.01)
    )
    (1): Sequential(
      (0): Linear(in_features=256, out_features=128, bias=True)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): Dropout(p=0.025, inplace=False)
      (3): LeakyReLU(negative_slope=0.01)
    )
    (2): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): Dropout(p=0.025, inplace=False)
      (3): LeakyReLU(negative_slope=0.01)
    )
  )
  (z_mu_sampler): Linear(in_features=64, out_features=3, bias=True)
  (z_logvar_sampler): Linear(in_features=64, out_features=3, bias=True)
  (upscale_z): Sequ

In [3]:
metadata = pd.read_csv("../data/DMS_substitutions.csv")
dms_data = pd.read_csv("A4_HUMAN_Seuma_2022.csv")
one_hot = dms_data["mutated_sequence"].apply(st.seq_to_one_hot)
dms_data["encoding"] = one_hot

wild_type = metadata[metadata["DMS_id"] == "A4_HUMAN_Seuma_2022"]["target_seq"].to_numpy()[0]



In [4]:
from evoVAE.utils.datasets import DMS_Dataset

dms_dataset = DMS_Dataset(dms_data["encoding"], dms_data["mutant"], dms_data["DMS_score"])
dms_loader = torch.utils.data.DataLoader(
    dms_dataset, batch_size=1, shuffle=True
)

In [37]:
from torch import Tensor

def mean_elbo(one_hot_encoding: torch.Tensor, n_samples):

    one_hot_encoding = one_hot_encoding.expand(n_samples, -1, -1)

    wt_log_p, wt_z_sample, wt_z_mu, wt_z_logvar = model(one_hot_encoding)

    kld = KL_divergence(wt_z_mu, wt_z_logvar, None, None)

    log_PxGz = sequence_likelihood(wild_one_hot, wt_log_p)

    wt_elbo = (-1) * (log_PxGz - kld)
    wt_elbo_mean = wt_elbo.mean()
    
    return wt_elbo_mean

n_samples = 3
wild_one_hot = torch.Tensor(st.seq_to_one_hot(wild_type)).unsqueeze(0).float()

model.eval()
test = []
with torch.no_grad():

    wt_elbo_mean = mean_elbo(wild_one_hot, n_samples)

    for variant_encoding, variant_id, score in dms_loader:
        print(variant_id)

        variant_encoding = variant_encoding.float().to(device)        
        variant_elbo_mean = mean_elbo(variant_encoding, n_samples)

        pred_fitness = variant_elbo_mean - wt_elbo_mean
        print(pred_fitness.item(), score.item())
        test.append(score.item())


        break








('E674G:V689A',)
-10.30810546875 -0.325331966967053
