In [1]:
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

from VAEmodel import *

In [2]:
# Load the processed MSA data: the integer-encoded alignment matrix and the
# per-sequence weights, then wrap both in an MSA_Dataset.
# (`pickle` is already imported in the first cell, so it is not re-imported here.)
from pathlib import Path

DATA_DIR = Path("../../data")
REF_SEQ_ID = "PF00041"
# One-letter codes of the 20 amino acids used as the alphabet.
# NOTE(review): the "+1" applied to len(AA_TYPES) below presumably accounts for
# one extra symbol (e.g. a gap) in the encoding -- confirm against VAEmodel.
AA_TYPES = ['R', 'H', 'K',
      'D', 'E',
      'S', 'T', 'N', 'Q',
      'C', 'G', 'P',
      'A', 'V', 'I', 'L', 'M', 'F', 'Y', 'W']

enumd_msa = np.load(DATA_DIR / "processed" / "enumd_mtx_{}.npy".format(REF_SEQ_ID))
print("enumd_msa.shape: ", enumd_msa.shape)
seq_weights = np.load(DATA_DIR / "processed" / "seq_weights_{}.npy".format(REF_SEQ_ID))
print("seq_weights.shape: ", seq_weights.shape)

# Fail fast if the two files do not belong together: the code below indexes
# enumd_msa.shape[1], and every alignment row needs exactly one weight.
assert enumd_msa.ndim == 2, \
    "expected a 2-D alignment matrix, got shape {}".format(enumd_msa.shape)
assert len(seq_weights) == enumd_msa.shape[0], \
    "got {} sequence weights for {} aligned sequences".format(len(seq_weights), enumd_msa.shape[0])

msa = MSA_Dataset(enumd_msa, seq_weights, np.array(AA_TYPES), MSA_to_OneHot)

# check length = num of seqs
print("Number of sequences in loaded alignment: ", len(msa))
print("Length of loaded alignment (final number of columns/a.a. positions in alignment for each sequence): ", enumd_msa.shape[1])
print("Number of amino acid types:", len(AA_TYPES)+1)

enumd_msa.shape:  (320066, 79)
seq_weights.shape:  (320066,)
Number of sequences in loaded alignment:  320066
Length of loaded alignment (final number of columns/a.a. positions in alignment for each sequence):  79
Number of amino acid types: 21


In [3]:
# Select the compute device: first CUDA GPU if available, otherwise CPU.
# To target a TPU through torch_xla instead, `dev` could be obtained with:
#   import torch_xla.core.xla_model as xm
#   dev = xm.xla_device()

print("GPU available: ", torch.cuda.is_available())
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed PyTorch before the model is built so that the randomly initialised
# weights (and everything computed from them) are reproducible on
# Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Model hyper-parameters, named so they can be tuned in one place.
LATENT_DIM = 2                   # 2-dimensional latent space
HIDDEN_LAYER_SIZES = [200, 200]  # widths of the encoder's hidden layers
# Flattened one-hot width: alignment columns x symbols per column.
INPUT_DIM = enumd_msa.shape[1] * (len(AA_TYPES)+1)

# NOTE(review): the first argument is len(AA_TYPES) (20) while INPUT_DIM is
# built from len(AA_TYPES)+1 (21) -- confirm against VAE.__init__ that this
# difference is intended.
vae = VAE(len(AA_TYPES), LATENT_DIM, INPUT_DIM, HIDDEN_LAYER_SIZES)
# Kept as the last expression so the cell displays the module structure.
vae.to(dev)

GPU available:  True


VAE(
  (encoder_comm_layers): Sequential(
    (0): Sequential(
      (0): Linear(in_features=1659, out_features=200, bias=True)
      (1): Tanh()
    )
    (1): Sequential(
      (0): Linear(in_features=200, out_features=200, bias=True)
      (1): Tanh()
    )
  )
  (enc_mu): Linear(in_features=200, out_features=2, bias=True)
  (enc_logvars): Linear(in_features=200, out_features=2, bias=True)
)

In [4]:
# Take the first element of the first dataset item (indexing the dataset is
# what triggers the transform, as the printed trace shows), then move it to
# the selected device so it can be fed to the model.
first_seq = msa[0][0]
first_seq = first_seq.to(dev)
print(first_seq)

applying transform:  <VAEmodel.MSA_to_OneHot object at 0x7fe75ff0ffd0>
before transform: [13 13  8 12  9 13 12  8  6  7 11  6  6 14  1 14 10 20  6 16 19  6 13 13
  4  4  7 14  5  6 19  9 16  6 19  1 12 14  9  4 11  6  9  7  5 14  7 17
  7 15  3  5 19 10  6 14  7  8 16 16 12  8  7  9 19  5 18 20 14  7 13  9
  8  1 13 11 16  6 12]
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0')


In [5]:
# Sanity check: the encoder's first Linear layer should accept the flattened
# one-hot width, i.e. (alignment columns) x (symbols per column).
n_msa_columns = enumd_msa.shape[1]
n_symbols_per_column = len(AA_TYPES)+1
print(
    "first layer should be ", n_msa_columns,
    " * ", n_symbols_per_column,
    " = ", n_msa_columns * n_symbols_per_column,
)
print(vae.encoder_comm_layers)

# Run the first encoded sequence through the encoder.
first_latent = vae.encode(first_seq)

first layer should be  79  *  21  =  1659
Sequential(
  (0): Sequential(
    (0): Linear(in_features=1659, out_features=200, bias=True)
    (1): Tanh()
  )
  (1): Sequential(
    (0): Linear(in_features=200, out_features=200, bias=True)
    (1): Tanh()
  )
)


In [6]:
# Show what vae.encode returned for the first sequence. In the recorded run
# this is a pair of 2-element tensors.
# NOTE(review): presumably the latent mean and a variance-like term (its
# grad_fn is an Exp) -- confirm against VAE.encode in VAEmodel.
print(first_latent)

(tensor([-0.0034, -0.0266], device='cuda:0', grad_fn=<AddBackward0>), tensor([1.0857, 1.0416], device='cuda:0', grad_fn=<ExpBackward0>))
