# Proof of Concept - VAEP 
Variational Autoencoder of the Proteome (VAEP), reconstructing samples on the peptide level using `log`-transformed peptide intensities. This is the proof of concept (POC) for later use. 

- Fit VAE to Hela-Sample data (3 samples) and overfit. (Functional test of code)
- Fit 

### Handling Missing values
In this semi-supervised setting, where the samples are both input and target, missing values have to be imputed in the sample for the input space, but these values should not be considered for the loss function as their truth is unknown. 

### Alternatives

- [`sklearn.imputer.IterativeImputer`](https://scikit-learn.org/stable/modules/impute.html#iterative-imputer)

## Setup
> You won't have to re-run this (saves time for big in-memory datasets)

In [1]:
import pandas as pd

import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F

import vaep
from vaep.transform import log

### Load Data

In [2]:
import src.file_utils as io_
FOLDER_DATA = 'data'
# Search FOLDER_DATA for candidate files using the query '.pkl'
# (presumably a file-name match -- see src.file_utils for the exact semantics).
files = io_.search_files(path=FOLDER_DATA, query='.pkl')
# Select the entry for the key 'peptides_n4' from the search result.
file = io_.check_for_key(files, 'peptides_n4') # ToDo: check for more than one key behaviour
file # path of the selected pickle, e.g. sample_peptides_n4.pkl

'data\\sample_peptides_n4.pkl'

In [3]:
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
peptides_all = pd.read_pickle(file)

## Experiment
> Re-run everything below

In [4]:
# Switches controlling how missing values are treated in this experiment.
REMOVE_MISSING = False
IMPUTE = False
FILL_NA = False


def print_hyperparameter(name, x):
    """Print a hyperparameter as ``<name> = <value>``."""
    print("{} = {}".format(name, x))

Run all cells below from here for testing

In [5]:
N_FEAT = 10
RANDOM_STATE = 42  # fixed seed so that re-running the cell draws the same peptides
# Draw N_FEAT random columns (axis=1) reproducibly, then apply `log` to each
# selected column.
peptides = peptides_all.sample(n=N_FEAT, axis=1, random_state=RANDOM_STATE)
peptides = peptides.apply(log)

In [6]:
print_hyperparameter("REMOVE_MISSING", REMOVE_MISSING)
if REMOVE_MISSING:
    # Keep only the columns without any missing value.
    mask = peptides.notna().all()
    peptides = peptides.loc[:, mask]
peptides

REMOVE_MISSING = False


Sequence,TYLVSGQPLEEIITYYPAMK,IMLPWDPTGK,ELLTLDEK,AGDAFGDTSFLNSK,RTAATLATHELR,DAAFEALGTALK,ILPEIIPILEEGLRSQK,EAMVMANNVYK,RHPYFYAPELLFFAK,FIPFIGVVK
MQ1.6.0.1_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200327,,20.580653,19.682602,,17.476428,17.378798,17.395746,,19.115976,
MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200330,,20.580653,19.682602,,17.476428,,17.395746,,19.115976,
MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_20190104110509_200331,18.718189,,,17.73782,,17.980376,18.190676,,,
MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_200331,,20.693466,,,17.582508,,17.331613,16.782329,,16.760634


In [7]:
print_hyperparameter("IMPUTE", IMPUTE)
if IMPUTE:
    from vaep.imputation import imputation_normal_distribution
    # `.iloc` is an indexer and has no `.apply`; apply the imputation to the
    # DataFrame itself, which calls the function once per column.
    # NOTE(review): assumes imputation_normal_distribution accepts a Series -- confirm.
    imputed = peptides.apply(imputation_normal_distribution)
    # A bare expression inside an `if` block is not rendered, so display explicitly.
    display(imputed)

IMPUTE = False


In [8]:
# Rows are samples, columns are features; `n_features` is read as a global by
# the VAE class defined further below.
n_samples, n_features = peptides.shape

Impute missing values as 0?

In [9]:
# Minimum over the per-column minima of the log-transformed intensities;
# int() drops the fractional part before converting back to float.
detection_limit = float(int(peptides.min().min()))
detection_limit 

16.0

In [10]:
print_hyperparameter('FILL_NA', FILL_NA)
# Record which values were measured *before* any filling, so the observation
# mask is available (and correct) regardless of the FILL_NA setting.
mask_observed = peptides.notna()
if FILL_NA:
    # Re-bind instead of mutating in place: the cell stays re-runnable and
    # frames displayed earlier are not changed behind the reader's back.
    peptides = peptides.fillna(detection_limit)
else:
    display(mask_observed.head())

FILL_NA = False


Sequence,TYLVSGQPLEEIITYYPAMK,IMLPWDPTGK,ELLTLDEK,AGDAFGDTSFLNSK,RTAATLATHELR,DAAFEALGTALK,ILPEIIPILEEGLRSQK,EAMVMANNVYK,RHPYFYAPELLFFAK,FIPFIGVVK
MQ1.6.0.1_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200327,False,True,True,False,True,True,True,False,True,False
MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200330,False,True,True,False,True,False,True,False,True,False
MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_20190104110509_200331,True,False,False,True,False,True,True,False,False,False
MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_02_200331,False,True,False,False,True,False,True,True,False,True


### Data Loading
Custom Dataset based on [PyTorch Data loading tutorial](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html). See also [`torch.utils.data`](https://pytorch.org/docs/master/data.html#module-torch.utils.data) documentation.

In [11]:
from torch.utils.data import Dataset

class PeptideDatasetInMemory(Dataset):
    """In-memory dataset yielding peptide values together with an observation mask.

    Parameters
    ----------
    data : pd.DataFrame
        One row per sample; missing entries are NaN.
    fill_na : scalar, default 0
        Value written into the missing entries of the stored tensor.

    Attributes
    ----------
    peptides : torch.Tensor
        Values of `data` after filling missing entries with `fill_na`.
    mask_obs : torch.Tensor
        Boolean tensor, True where `data` held a non-missing value.
    length_ : int
        Number of rows of `data`.
    """

    def __init__(self, data: pd.DataFrame, fill_na=0):
        observed = data.notna()
        filled = data.fillna(fill_na)
        self.peptides = torch.from_numpy(filled.values)
        self.mask_obs = torch.from_numpy(observed.values)
        self.length_ = len(filled)

    def __len__(self):
        return self.length_

    def __getitem__(self, idx):
        # Index values and mask identically so that they stay aligned.
        values = self.peptides[idx]
        observed = self.mask_obs[idx]
        return values, observed

# Missing values are filled with the detection limit; the dataset keeps the
# observation mask alongside the filled values.
dataset_in_memory = PeptideDatasetInMemory(peptides, detection_limit)

In [12]:
# Number of samples (rows) held by the dataset.
len(dataset_in_memory)

4

In [13]:
# Slicing the dataset returns a (values, mask) pair for the selected samples.
peptide_intensities, masked_obs = dataset_in_memory[1:3]
print("Peptide Intensities: \n",peptide_intensities,"\n######")
# Multiplying by the boolean mask zeroes the filled-in (non-observed) entries.
print("Masking non-observed: \n", peptide_intensities * masked_obs)

Peptide Intensities: 
 tensor([[16.0000, 20.5807, 19.6826, 16.0000, 17.4764, 16.0000, 17.3957, 16.0000,
         19.1160, 16.0000],
        [18.7182, 16.0000, 16.0000, 17.7378, 16.0000, 17.9804, 18.1907, 16.0000,
         16.0000, 16.0000]], dtype=torch.float64) 
######
Masking non-observed: 
 tensor([[ 0.0000, 20.5807, 19.6826,  0.0000, 17.4764,  0.0000, 17.3957,  0.0000,
         19.1160,  0.0000],
        [18.7182,  0.0000,  0.0000, 17.7378,  0.0000, 17.9804, 18.1907,  0.0000,
          0.0000,  0.0000]], dtype=torch.float64)


A Dataset needs the methods `__len__` and `__getitem__`, so it can be fed to a `DataLoader`; this means the following has to work:

In [14]:
# Slice access must work: returns the (values, mask) tensors of the first four samples.
dataset_in_memory[:4]

(tensor([[16.0000, 20.5807, 19.6826, 16.0000, 17.4764, 17.3788, 17.3957, 16.0000,
          19.1160, 16.0000],
         [16.0000, 20.5807, 19.6826, 16.0000, 17.4764, 16.0000, 17.3957, 16.0000,
          19.1160, 16.0000],
         [18.7182, 16.0000, 16.0000, 17.7378, 16.0000, 17.9804, 18.1907, 16.0000,
          16.0000, 16.0000],
         [16.0000, 20.6935, 16.0000, 16.0000, 17.5825, 16.0000, 17.3316, 16.7823,
          16.0000, 16.7606]], dtype=torch.float64),
 tensor([[False,  True,  True, False,  True,  True,  True, False,  True, False],
         [False,  True,  True, False,  True, False,  True, False,  True, False],
         [ True, False, False,  True, False,  True,  True, False, False, False],
         [False,  True, False, False,  True, False,  True,  True, False,  True]]))

## PyTorch Implementation of VAE

### Default Command Line Arguments
- later, the parameters will be passed to a final program

In [15]:
from vaep.cmd import parse_args

BATCH_SIZE = 2
EPOCHS = 600
# Emulate a command-line invocation; the log interval is set to the batch size.
args = parse_args([
    '--batch-size', str(BATCH_SIZE),
    '--no-cuda',
    '--seed', '43',
    '--epochs', str(EPOCHS),
    '--log-interval', str(BATCH_SIZE),
])
args

Namespace(batch_size=2, cuda=False, epochs=600, log_interval=2, no_cuda=True, seed=43)

In [16]:
# NOTE(review): attribute name is spelled `inital_lr` (not `initial_lr`); the
# optimizer created further below hard-codes lr=1e-5 instead of reading it.
args.inital_lr = 1e-05
# NOTE(review): `layers` gets the same value as the learning rate, which looks
# like a copy-paste slip -- confirm the intended value.
args.layers    = 1e-05

### Create a DataLoader instance
Passing the DataSet instance in memory to the DataLoader creates a generator for training which shuffles the data on training.

In [17]:
# Seed PyTorch's random number generator with the configured seed.
torch.manual_seed(args.seed)

# Use the GPU only if the parsed arguments enable it.
device = torch.device("cuda") if args.cuda else torch.device("cpu")

In [18]:
# NOTE(review): interactive IPython help lookup (`?`) left over from
# development; it only shows the DataLoader signature and is not needed to run the analysis.
torch.utils.data.DataLoader?

[1;31mInit signature:[0m
[0mtorch[0m[1;33m.[0m[0mutils[0m[1;33m.[0m[0mdata[0m[1;33m.[0m[0mDataLoader[0m[1;33m([0m[1;33m
[0m    [0mdataset[0m[1;33m,[0m[1;33m
[0m    [0mbatch_size[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mshuffle[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0msampler[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mbatch_sampler[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnum_workers[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mcollate_fn[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mpin_memory[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mdrop_last[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mtimeout[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mworker_init_fn[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmultiprocessing_context[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33

In [19]:
# Extra loader options are only passed when running on the GPU.
if args.cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}

train_loader = torch.utils.data.DataLoader(
    dataset_in_memory,
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs,
)

Iterate over the data:

In [20]:
# Each item yielded by the loader is a list holding the values tensor and the
# mask tensor, so len() of it counts these two tensors -- not the samples in
# the mini-batch (the next cell unpacks the pair to inspect the tensors).
for batch in train_loader:
    print("Number of tensors per mini-batch object: {}".format(len(batch)),
          "\tObject-Type: {}".format(type(batch)))

Nummber of samples in mini-batch: 2 	Object-Type: <class 'list'>
Nummber of samples in mini-batch: 2 	Object-Type: <class 'list'>


In [21]:
# Unpack each batch into the values tensor and the observation mask; here
# len(data) is the number of samples in the mini-batch.
for i, (data, mask) in enumerate(train_loader):
    print("Nummber of samples in mini-batch: {}".format(len(data)),
          "\tObject-Type: {}".format(type(mask)))
    print(data)
    print(mask)

Nummber of samples in mini-batch: 2 	Object-Type: <class 'torch.Tensor'>
tensor([[16.0000, 20.6935, 16.0000, 16.0000, 17.5825, 16.0000, 17.3316, 16.7823,
         16.0000, 16.7606],
        [18.7182, 16.0000, 16.0000, 17.7378, 16.0000, 17.9804, 18.1907, 16.0000,
         16.0000, 16.0000]], dtype=torch.float64)
tensor([[False,  True, False, False,  True, False,  True,  True, False,  True],
        [ True, False, False,  True, False,  True,  True, False, False, False]])
Nummber of samples in mini-batch: 2 	Object-Type: <class 'torch.Tensor'>
tensor([[16.0000, 20.5807, 19.6826, 16.0000, 17.4764, 17.3788, 17.3957, 16.0000,
         19.1160, 16.0000],
        [16.0000, 20.5807, 19.6826, 16.0000, 17.4764, 16.0000, 17.3957, 16.0000,
         19.1160, 16.0000]], dtype=torch.float64)
tensor([[False,  True,  True, False,  True,  True,  True, False,  True, False],
        [False,  True,  True, False,  True, False,  True, False,  True, False]])


### VAE Model

- adapted from basic [PyTorch VAE tutorial](https://github.com/pytorch/examples/tree/master/vae)
- single hidden encoding and decoding layer, one middle hidden layer being the latent space

In [22]:
# from IPython.core.debugger import set_trace # invoke debugging 
class VAE(nn.Module):
    """Variational autoencoder with one hidden layer in encoder and decoder.

    Parameters
    ----------
    n_features : int, optional
        Number of input (and reconstructed) features. When omitted, the
        module-level global ``n_features`` is used, so ``VAE()`` keeps working
        as before.
    n_neurons : int, default 1000
        Width of the hidden encoder and decoder layers.
    n_latent : int, default 50
        Dimensionality of the latent space.
    """

    def __init__(self, n_features=None, n_neurons=1000, n_latent=50):
        super().__init__()

        if n_features is None:
            # Backward-compatible fallback to the notebook-level variable.
            n_features = globals()["n_features"]
        self.n_features = n_features

        self.fc1 = nn.Linear(n_features, n_neurons)
        self.fc21 = nn.Linear(n_neurons, n_latent)  # mean of q(z|x)
        self.fc22 = nn.Linear(n_neurons, n_latent)  # log-variance of q(z|x)
        self.fc3 = nn.Linear(n_latent, n_neurons)
        self.fc4 = nn.Linear(n_neurons, n_features)

    def encode(self, x):
        """Return mean and log-variance of the latent distribution for `x`."""
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, 1)``."""
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        """Map a latent sample back to feature space (linear output layer)."""
        h3 = F.relu(self.fc3(z))
        return self.fc4(h3)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the input `x`.

        The input is reshaped to ``(-1, n_features)`` before encoding.
        """
        mu, logvar = self.encode(x.view(-1, self.n_features))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar


# Build the model on the target device and cast its parameters to double
# precision (the dataset tensors shown above are float64).
model = VAE().to(device).double()
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)


# Reconstruction + KL divergence losses summed over all *non-masked* elements and batch
def loss_function(recon_x, x, mask, mu, logvar, t=0.9):
    """Weighted sum of the masked reconstruction error and the KL divergence.

    Only observed values (``mask`` is True) contribute to the reconstruction
    error.

    Parameters
    ----------
    recon_x : torch.Tensor
        Reconstruction returned by the model.
    x : torch.Tensor
        Model input (missing values already filled), same shape as `recon_x`.
    mask : torch.Tensor
        Boolean tensor, True where `x` holds an observed value.
    mu, logvar : torch.Tensor
        Mean and log-variance of the latent distribution.
    t : float, default 0.9
        Weight of the reconstruction term; the KL term is weighted by ``1 - t``.

    Returns
    -------
    torch.Tensor
        Scalar loss ``t * MSE + (1 - t) * KLD``.
    """
    # The mask comes straight from the data loader; make sure it lives on the
    # same device as the tensors it is multiplied with.
    mask = mask.to(recon_x.device)

    # The default 'mean' reduction would divide by the number of *all* values
    # (too big a denominator), so sum the squared errors of the observed values
    # and divide by the number of observed values instead.
    # Clamping avoids a 0/0 = NaN loss for a batch without any observed value.
    n_observed = mask.sum().clamp(min=1)
    MSE = F.mse_loss(input=recon_x*mask,
                     target=x*mask,
                     reduction='sum') / n_observed

    # KL-divergence, summed over latent dimensions and batch
    ## see Appendix B from VAE paper:
    ## Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
    ## https://arxiv.org/abs/1312.6114
    ## KLD = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    return t*MSE + (1-t)*KLD


def train(epoch):
    """Train the global `model` for one pass over `train_loader`.

    Uses the notebook-level `model`, `optimizer`, `train_loader`, `device` and
    `loss_function`. Prints the sum of the per-batch losses divided by the
    number of samples in the dataset.

    Parameters
    ----------
    epoch : int
        Epoch number, only used in the printed message.
    """
    model.train()
    train_loss = 0
    for batch_idx, (data, mask) in enumerate(train_loader):
        data = data.to(device)
        # The mask is multiplied with tensors living on `device` inside the
        # loss, so it has to be moved as well (otherwise GPU runs fail).
        mask = mask.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_function(recon_batch, data, mask, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
#         if batch_idx % args.log_interval == 0:
#             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#                 epoch, batch_idx * len(data), len(train_loader.dataset),
#                 100. * batch_idx / len(train_loader),
#                 loss.item() / len(data)))
    print('====> Epoch: {} Average loss: {:.4f}'.format(
          epoch, train_loss / len(train_loader.dataset)))

# # No test set here
# def test(epoch):
#     model.eval()
#     test_loss = 0
#     with torch.no_grad():
#         for i, (data, mask) in enumerate(test_loader):
#             data = data.to(device)
#             recon_batch, mu, logvar = model(data)
#             test_loss += loss_function(recon_batch, data, mask, mu, logvar).item()
# 
#     test_loss /= len(test_loader.dataset)
#     print('====> Test set loss: {:.4f}'.format(test_loss))


In [23]:
# Run the training loop for the configured number of epochs (1-based epoch
# counter); each call prints the average loss of that epoch.
if __name__ == "__main__":
    for epoch in range(1, args.epochs + 1):
        train(epoch)

====> Epoch: 1 Average loss: 1433.9426
====> Epoch: 2 Average loss: 1302.4768
====> Epoch: 3 Average loss: 1209.5769
====> Epoch: 4 Average loss: 1096.0888
====> Epoch: 5 Average loss: 992.1412
====> Epoch: 6 Average loss: 935.7090
====> Epoch: 7 Average loss: 862.8980
====> Epoch: 8 Average loss: 823.6378
====> Epoch: 9 Average loss: 763.3002
====> Epoch: 10 Average loss: 732.8845
====> Epoch: 11 Average loss: 678.0465
====> Epoch: 12 Average loss: 623.8913
====> Epoch: 13 Average loss: 609.9456
====> Epoch: 14 Average loss: 567.1102
====> Epoch: 15 Average loss: 538.2294
====> Epoch: 16 Average loss: 512.8249
====> Epoch: 17 Average loss: 497.7225
====> Epoch: 18 Average loss: 469.0381
====> Epoch: 19 Average loss: 462.8169
====> Epoch: 20 Average loss: 432.9566
====> Epoch: 21 Average loss: 418.3955
====> Epoch: 22 Average loss: 418.2901
====> Epoch: 23 Average loss: 403.8908
====> Epoch: 24 Average loss: 376.8999
====> Epoch: 25 Average loss: 375.1311
====> Epoch: 26 Average loss: 

In [24]:
# Fetch a single mini-batch and run it through the trained model for
# inspection; the loop is left after the first batch.
for batch_idx, (data, mask) in enumerate(train_loader):
    data = data.to(device)
    optimizer.zero_grad()
    recon_batch, mu, logvar = model(data)
    break


In [25]:
# Print each reconstruction next to its (filled) input sample.
for x_recon, x in zip(recon_batch, data):
    print("\n{},\n{}".format(x_recon,x))


tensor([13.8214, 17.6445, 14.2478, 12.8008, 15.9272, 13.3975, 16.4665, 14.4127,
        13.5715, 11.1374], dtype=torch.float64, grad_fn=<SelectBackward>),
tensor([16.0000, 20.5807, 19.6826, 16.0000, 17.4764, 16.0000, 17.3957, 16.0000,
        19.1160, 16.0000], dtype=torch.float64)

tensor([12.6226, 17.4818, 12.9331, 11.9694, 15.1505, 13.5199, 15.8620, 14.1113,
        12.7104, 10.6033], dtype=torch.float64, grad_fn=<SelectBackward>),
tensor([16.0000, 20.5807, 19.6826, 16.0000, 17.4764, 17.3788, 17.3957, 16.0000,
        19.1160, 16.0000], dtype=torch.float64)


Latent space for two samples (mean and variance, i.e. `exp(logvar)`)

In [26]:
# Per sample: latent mean, followed by the latent variance (exp of the log-variance).
for mu_, logvar_ in zip(mu, logvar):
    print("\n{},\n{}".format(mu_, torch.exp(logvar_)))


tensor([-2.3338,  4.4171,  2.4118,  2.3225,  0.8095,  1.9072, -2.7328,  3.0975,
         2.5912,  3.0253,  4.9794,  7.5737, -0.7739,  1.9621, -3.9962,  6.3436,
        -4.5875,  3.4362,  2.5427,  1.6525,  3.4686,  2.0423, -2.7293, -2.7275,
         4.4718, -0.7327, -3.6151, -0.3192,  8.5042,  1.7399, -1.6256,  3.0905,
         1.8523, -0.9228,  3.5593, -4.9261, -0.9060,  1.3435, -1.6048,  4.0263,
         2.0511, -3.1987, -3.2239,  4.1090,  5.5218,  4.2370,  3.8534,  3.8592,
         2.0801,  1.8580], dtype=torch.float64, grad_fn=<SelectBackward>),
tensor([ 0.7091,  1.0253,  0.5991,  0.6709,  0.6955,  1.0126,  0.7310,  1.0793,
         0.5643,  0.5852,  1.2417,  0.4722,  1.2415,  0.7318,  1.0387,  0.4421,
         0.9005,  1.1317,  1.0081,  3.9041,  0.7244,  0.7216,  0.7797,  1.0136,
         0.8133,  0.9258,  1.2682,  0.8514,  1.1720,  0.5595,  1.5556,  1.0062,
         1.1772,  0.6372,  1.5532,  0.9005,  1.1960, 45.9151,  1.2396,  2.5072,
         8.6153,  0.6591,  0.7486,  0.8445, 