In [1]:
from evoVAE.utils.datasets import MSA_Dataset
import evoVAE.utils.seq_tools as st
from evoVAE.models.seqVAE import SeqVAE
from evoVAE.trainer.seqVAE_train import seq_train
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import wandb
from pathlib import Path
import os

#### Config

In [2]:
# All tunables for this run, collected in one dict so W&B stores them with the run.
run_config = {
    # Dataset info
    "dataset": "PhoQ",
    "seq_theta": 0.2,  # reweighting
    "AA_count": 21,  # standard AA + gap

    # ADAM optimiser settings
    "learning_rate": 1e-5,
    "weight_decay": 0.01,

    # Hidden units
    # NOTE(review): presumably the BatchNorm momentum and Dropout probability
    # used inside SeqVAE -- confirm against the model code.
    "momentum": 0.1,
    "dropout": 0.5,

    # Training loop
    "epochs": 100,
    "batch_size": 128,
    "max_norm": 1.0,  # gradient clipping

    # Model info
    "architecture": "SeqVAE",
    "latent_dims": 2,
    "hidden_dims": [32, 16],
}

wandb.init(project="SeqVAE_training", config=run_config)

# Later cells read hyperparameters through the W&B config object.
config = wandb.config

# Use a CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msebastian-porras01[0m. Use [1m`wandb login --relogin`[0m to force relogin


#### Data loading and preprocessing

In [None]:

# Directory holding the ancestor alignment files to be combined.
# NOTE(review): the default is a machine-specific absolute path; set the
# PHOQ_ANCESTORS_DIR environment variable to point at the data elsewhere.
DATA_PATH = Path(
    os.environ.get(
        "PHOQ_ANCESTORS_DIR",
        "/Users/sebs_mac/OneDrive - The University of Queensland/honours/data/phoQ/uniref90_search/nr65_filtering/odseq_tree/independent_runs/ancestors",
    )
)

# Gather all the ancestor sequences into a single dataframe
trees = []
# Sorted so the row order of `ancestors` does not depend on filesystem ordering
# (this matters later when duplicates are dropped keeping the first occurrence).
for file in sorted(os.listdir(DATA_PATH)):
    # "ancestor_trees" is not an alignment file; hidden files such as macOS
    # .DS_Store are not alignments either and must not reach the parser.
    if file == "ancestor_trees" or file.startswith("."):
        continue
    run = st.read_aln_file(str(DATA_PATH / file))
    # Assumes file names look like "<prefix>_<tree>_..." -- TODO confirm naming scheme.
    run["tree"] = file.split("_")[1]
    trees.append(run)

# ignore_index gives every row a unique label, so column assignment below
# cannot trip over index labels repeated across the per-file frames.
ancestors = pd.concat(trees, ignore_index=True)

# Use the reweighting threshold from the run config rather than a second
# hard-coded copy of the same value.
anc_encodings, anc_weights = st.encode_and_weight_seqs(
    ancestors["sequence"], theta=config.seq_theta
)
ancestors["weights"] = anc_weights
#ancestors.to_pickle("phoQ_ancestors_weights.pkl")


In [None]:
# N0 and N238 come from outgroups, so they are excluded from the ancestor set.
from_outgroup = ancestors["id"].isin(["N0", "N238"])
print(ancestors.shape)
flt_ancestors = ancestors.loc[~from_outgroup]
print(flt_ancestors.shape)

# Keep only the first occurrence of each distinct sequence.
repeated_sequence = flt_ancestors.duplicated(subset="sequence")
flt_unique_ancestors = flt_ancestors.loc[~repeated_sequence]
flt_unique_ancestors


In [3]:
# Fixed seed so the train/validation partition is the same on every run.
SPLIT_SEED = 42
# Batch size used by both loaders in this cell.
# NOTE(review): this is deliberately not config.batch_size; the alignment
# loaded here is a small test file -- revisit when training on the full data.
LOADER_BATCH_SIZE = 2

# Load the small test alignment, then encode each sequence and compute its
# reweighting using the threshold from the run config.
flt_unique_ancestors = st.read_aln_file("../data/alignments/tiny.aln")
anc_encodings, anc_weights = st.encode_and_weight_seqs(
    flt_unique_ancestors["sequence"], theta=config.seq_theta
)
flt_unique_ancestors["weights"] = anc_weights
flt_unique_ancestors["encodings"] = anc_encodings

# Hold out 10% of the sequences for validation (seeded for reproducibility).
train, val = train_test_split(
    flt_unique_ancestors, test_size=0.1, random_state=SPLIT_SEED
)

# TRAINING
train_dataset = MSA_Dataset(
    train["encodings"], train["weights"], train["id"]
)

# VALIDATION
val_dataset = MSA_Dataset(
    val["encodings"], val["weights"], val["id"]
)

# DATA LOADERS #
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=LOADER_BATCH_SIZE, shuffle=False
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=LOADER_BATCH_SIZE, shuffle=False
)

print(len(train_loader), len(val_loader))

# Pull the first batch once and inspect it, instead of building a fresh
# iterator for every field.
first_encodings, first_weights, first_ids = next(iter(train_loader))
first_encodings.shape, first_weights.shape, first_ids

Encoding the sequences and calculating weights
The sequence encoding has size: (12,)

The sequence weight array has size: (12,)

5 1


(torch.Size([2, 155, 21]),
 torch.Size([2]),
 ('A0A0J8VL97_PhoQ_UniRef90', 'A0A845SAR1_PhoQ_UniRef90'))

#### Create the model

In [4]:
# The first element of a dataset item is its encoding; its leading axis
# gives the sequence length.
first_encoding = train_dataset[0][0]
seq_len = first_encoding.shape[0]
# Model input size: one slot per (position, residue symbol) pair.
input_dims = config.AA_count * seq_len

# Build the VAE using the latent/hidden sizes preset in the run config.
model = SeqVAE(
    input_dims=input_dims,
    latent_dims=config.latent_dims,
    hidden_dims=config.hidden_dims,
    config=config,
)
model

SeqVAE(
  (encoder): Sequential(
    (0): Sequential(
      (0): Linear(in_features=3255, out_features=32, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
      (2): Dropout(p=0.5, inplace=False)
      (3): Linear(in_features=32, out_features=32, bias=True)
      (4): LeakyReLU(negative_slope=0.01)
      (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): Sequential(
      (0): Linear(in_features=32, out_features=16, bias=True)
      (1): LeakyReLU(negative_slope=0.01)
      (2): Dropout(p=0.5, inplace=False)
      (3): Linear(in_features=16, out_features=16, bias=True)
      (4): LeakyReLU(negative_slope=0.01)
      (5): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (z_mu_sampler): Linear(in_features=16, out_features=2, bias=True)
  (z_logvar_sampler): Linear(in_features=16, out_features=2, bias=True)
  (upscale_z): Linear(in_features=2, out_features=16, bias=True)
  (decoder): Sequentia

In [5]:

# Walk the training loader once and report the shape of every encoding batch.
for encoding, weight, name in train_loader:
    print(encoding.shape)

    # Disabled forward/loss smoke test, kept for reference:
    #encoding = encoding.float()
    #output = model.forward(encoding)
    #print(encoding.shape, output[0].shape)
    #loss, kl, likelihood = model.loss_function(output, encoding)
    #print(loss, kl, likelihood)
    
 

torch.Size([2, 155, 21])
torch.Size([2, 155, 21])
torch.Size([2, 155, 21])
torch.Size([2, 155, 21])
torch.Size([2, 155, 21])


#### Training Loop

In [6]:
# Run training with the loaders, device and hyperparameters set up above;
# whatever seq_train returns is kept as `trained_model`.
trained_model = seq_train(
    model,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    config=config,
)

  from .autonotebook import tqdm as notebook_tqdm


tensor([[-1.1558, -1.9932],
        [-0.8483, -1.0853]], grad_fn=<SubBackward0>)
tensor([[-0.9330, -1.2961],
        [-1.0679, -0.9268]], grad_fn=<SubBackward0>)
{'ELBO': 19772.568359375, 'KLD': -0.429389625787735, 'Gauss_likelihood': -19772.998046875}
tensor([[-1.3232, -2.2951],
        [-1.2639, -1.5855]], grad_fn=<SubBackward0>)
tensor([[-2.5393, -4.2231],
        [-0.9191, -0.9307]], grad_fn=<SubBackward0>)
{'ELBO': 20485.369140625, 'KLD': 1.0722577571868896, 'Gauss_likelihood': -20484.296875}
tensor([[-1.0576, -3.3322],
        [-0.9169, -0.9521]], grad_fn=<SubBackward0>)
tensor([[-0.9234, -3.3638],
        [-0.9915, -0.9387]], grad_fn=<SubBackward0>)
{'ELBO': 20120.4375, 'KLD': -0.02064928412437439, 'Gauss_likelihood': -20120.458984375}
tensor([[-2.1483, -2.1173],
        [-1.2557, -0.8246]], grad_fn=<SubBackward0>)
tensor([[-2.7909, -2.3882],
        [-1.6121, -0.9754]], grad_fn=<SubBackward0>)
{'ELBO': 20554.19140625, 'KLD': 0.7103743553161621, 'Gauss_likelihood': -20553.480468

In [9]:
# Scratch check: for a 2-D tensor, summing over the last axis gives the same
# result as summing over every axis after the first.
a = torch.rand(size=(2, 2))
b = torch.rand(size=(2, 2))

c = a - b

print(c.sum(-1))
# The axis range must come from `c` itself; the previous `kl.ndim` referred to
# a name that is not defined at this point and raised NameError.
print(c.sum(dim=tuple(range(1, c.ndim))))

tensor([0.2197, 1.0837])

In [9]:
# End the W&B run that was opened by wandb.init() in the config cell.
wandb.finish()

wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Error while calling W&B API: run 8riocgfl was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run 8riocgfl was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run 8riocgfl was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run 8riocgfl was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run 8riocgfl was previously created and deleted; try a new run name (<Response [409]>)
[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced
