# Proof of Concept - VAEP 
Variational Autoencoder of the Proteome (VAEP), reconstructing samples on the peptide level using `log`-transformed peptide intensities. This is the proof of concept (POC) for later use.

- Fit VAE to Hela-Sample data (3 samples) and overfit. (Functional test of code)
- Fit 

### Handling Missing values
In this semi-supervised setting, where the samples are both input and target, missing values have to be imputed in the sample for the input space, but these values should not be considered for the loss function as their truth is unknown.

### Alternatives

- [`sklearn.impute.IterativeImputer`](https://scikit-learn.org/stable/modules/impute.html#iterative-imputer)

## Setup
> You won't have to re-run this (saves time for big in-memory datasets)

In [None]:
import pandas as pd

import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F

import vaep
from vaep.transform import log

### Load Data

In [None]:
from src.config import FN_PEPTIDE_INTENSITIES
FN_PEPTIDE_INTENSITIES

In [None]:
# Load the full peptide intensity table from the pickle file configured in src.config.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
peptides_all = pd.read_pickle(FN_PEPTIDE_INTENSITIES)

## Experiment
> Re-run everything below

In [None]:
# Experiment switches; each one enables an optional preprocessing step further down.
REMOVE_MISSING = False  # drop peptide columns that contain missing values
IMPUTE = False          # run the normal-distribution imputation
FILL_NA = False         # fill missing values with the detection limit


def print_hyperparameter(name, x):
    """Print a hyperparameter in the form ``name = value``."""
    print("{} = {}".format(name, x))

Run all cells below from here for testing

In [None]:
# Draw N_FEAT random peptide columns and log-transform them in one go.
N_FEAT = 10
peptides = peptides_all.sample(n=N_FEAT, axis=1).apply(log)

In [None]:
print_hyperparameter("REMOVE_MISSING", REMOVE_MISSING)
if REMOVE_MISSING:
    # Keep only the peptide columns in which every sample has a value.
    mask = peptides.notna().all()
    peptides = peptides.loc[:, mask]
peptides

In [None]:
peptides.describe()

In [None]:
print_hyperparameter("IMPUTE", IMPUTE)
if IMPUTE:
    from vaep.imputation import imputation_normal_distribution
    # `.iloc` is an indexer without an `apply` method, so the previous
    # `peptides.iloc.apply(...)` raised AttributeError. Apply on the DataFrame
    # itself, i.e. column by column (one peptide at a time).
    # NOTE(review): assumes the imputation function accepts a Series — confirm in vaep.imputation.
    imputed = peptides.apply(imputation_normal_distribution)
    # A bare expression nested in an `if` is not rendered as cell output; show it explicitly.
    display(imputed)

In [None]:
n_samples, n_features = peptides.shape

Impute missing values as 0?

In [None]:
# Detection limit = overall smallest value of the log-transformed intensities
# (minimum per column, then minimum of those), truncated to a whole number by int().
detection_limit = float(int(peptides.min().min()))
detection_limit 

In [None]:
print_hyperparameter('FILL_NA', FILL_NA)
if not FILL_NA:
    # Leave the NaNs in place and remember which entries were actually measured.
    mask_observed = peptides.notna()
    display(mask_observed.head())
else:
    # Replace every missing value by the detection limit (modifies `peptides` in place).
    peptides.fillna(detection_limit, inplace=True)

### Data Loading
Custom Dataset based on [PyTorch Data loading tutorial](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html). See also [`torch.utils.data`](https://pytorch.org/docs/master/data.html#module-torch.utils.data) documentation.

In [None]:
from vaep.io.datasets import PeptideDatasetInMemoryMasked

# Wrap the raw intensity matrix (as a NumPy array) in the masked in-memory dataset.
# NOTE(review): presumably `detection_limit` is used as fill value for missing entries —
# confirm against vaep.io.datasets.
dataset_in_memory = PeptideDatasetInMemoryMasked(peptides.to_numpy(), detection_limit)

In [None]:
len(dataset_in_memory)

In [None]:
# Slice the samples at index 1 and 2; the dataset returns the intensities together with a mask.
peptide_intensities, masked_obs = dataset_in_memory[1:3]
print("Peptide Intensities: \n",peptide_intensities,"\n######")
# Element-wise product with the mask, printed to show the effect of masking non-observed entries.
print("Masking non-observed: \n", peptide_intensities * masked_obs)

A Dataset needs the methods `__len__` and `__getitem__`, so it can be fed to a `DataLoader`. This means the following has to work:

In [None]:
dataset_in_memory[:4]

## PyTorch Implementation of VAE

### Default Command Line Arguments
- later, parameters will be passed to a final program

In [None]:
# Earlier command-line style setup, kept commented out for reference:
# from vaep.cmd import parse_args
# BATCH_SIZE = 2
EPOCHS = 150  # number of training epochs handed to get_args below
# args = parse_args(['--batch-size', str(BATCH_SIZE), '--no-cuda', '--seed', '43', '--epochs', str(EPOCHS), '--log-interval', str(BATCH_SIZE)])
from vaep.cmd import get_args
# Build the argument object; only the number of epochs is set explicitly here.
args= get_args(epochs=EPOCHS)

In [None]:
# Store the initial learning rate on the argument object.
# NOTE(review): the attribute is spelled `inital_lr` — confirm this is the name expected by
# its consumers. The optimizer in this notebook is built with a literal `lr=1e-5` and does
# not read this attribute.
args.inital_lr = 1e-05
args

### Create a DataLoader instance
Passing the in-memory Dataset instance to the DataLoader creates an iterable over mini-batches for training, which reshuffles the data every epoch.

In [None]:
# Seed PyTorch's random number generator with the configured seed.
torch.manual_seed(args.seed)

# Select CUDA when args.cuda is set ...
device = torch.device("cuda" if args.cuda else "cpu")
# ... but this assignment unconditionally overrides the choice, so everything runs on the CPU.
device = "cpu"

In [None]:
# torch.utils.data.DataLoader?

In [None]:
# Worker and pinned-memory options are only passed along when CUDA is requested.
if args.cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}
train_loader = torch.utils.data.DataLoader(
    dataset_in_memory,
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs,
)

Iterate over the data:

In [None]:
# Look at the first item the loader yields, without unpacking it, then stop.
# NOTE(review): `data` is the whole batch object here, so `len(data)` is the length of that
# container — presumably the (intensities, mask) pair rather than the number of samples; verify.
for data in train_loader:
    print("Nummber of samples in mini-batch: {}".format(len(data)),
          "\tObject-Type: {}".format(type(data)))
    break

In [None]:
# Same inspection, but with the batch unpacked into intensities (`data`) and `mask`.
# Only the first mini-batch is printed.
for i, (data, mask) in enumerate(train_loader):
    print("Nummber of samples in mini-batch: {}".format(len(data)),
          "\tObject-Type: {}".format(type(mask)))
    print(data)
    print(mask)
    break

### VAE Model

- adapted from basic [PyTorch VAE tutorial](https://github.com/pytorch/examples/tree/master/vae)
- single hidden encoding and decoding layer, one middle hidden layer being the latent space

In [None]:

# # No test set here
# def test(epoch):
#     model.eval()
#     test_loss = 0
#     with torch.no_grad():
#         for i, (data, mask) in enumerate(test_loader):
#             data = data.to(device)
#             recon_batch, mu, logvar = model(data)
#             test_loss += loss_function(recon_batch, data, mask, mu, logvar).item()
# 
#     test_loss /= len(test_loader.dataset)
#     print('====> Test set loss: {:.4f}'.format(test_loss))


In [None]:
from vaep.model import VAE, loss_function

# Size the model from the data (`n_features` comes from `peptides.shape`) instead of a
# literal 10, so it still matches the dataset when REMOVE_MISSING drops columns or
# N_FEAT is changed.
model = VAE(n_features=n_features, n_neurons=20)
# Switch the parameters to double precision.
# NOTE(review): presumably to match float64 input coming from NumPy — confirm.
model.double()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

def train(epoch):
    """Run one training epoch over ``train_loader`` and print the average loss.

    Relies on the notebook-level ``model``, ``optimizer``, ``train_loader``,
    ``loss_function`` and ``device``.

    Parameters
    ----------
    epoch : int
        Epoch number; only used in the progress message.
    """
    model.train()
    train_loss = 0
    for batch_idx, (data, mask) in enumerate(train_loader):
        # Move inputs AND mask to the same device; previously only `data` was moved,
        # which would break the masked loss as soon as `device` is not the CPU.
        data = data.to(device)
        mask = mask.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        # loss_function returns a dict-like object; its three values are unpacked in order.
        loss, mse, kld = loss_function(recon_batch=recon_batch, batch=(data, mask), mu=mu, logvar=logvar).values()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
#         if batch_idx % args.log_interval == 0:
#             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#                 epoch, batch_idx * len(data), len(train_loader.dataset),
#                 100. * batch_idx / len(train_loader),
#                 loss.item() / len(data)))
    # Summed batch losses divided by the number of samples in the dataset.
    print('====> Epoch: {} Average loss: {:.4f}'.format(
          epoch, train_loss / len(train_loader.dataset)))

# Train for the configured number of epochs (epochs are numbered starting at 1).
if __name__ == "__main__":
    for epoch in range(1, args.epochs + 1):
        train(epoch)

In [None]:
# Run a single forward pass on the first mini-batch to inspect the loss components by hand.
for batch_idx, (data, mask) in enumerate(train_loader):
    data = data.to(device)
    # Gradients are cleared, but no backward pass or optimizer step follows in this cell.
    optimizer.zero_grad()
    recon_batch, mu, logvar = model(data)
    loss, mse, kld = loss_function(recon_batch, (data, mask), mu, logvar).values()
    break
print(batch_idx)
print(float(mse))
# Recompute the squared error over masked entries manually, normalised by the mask sum,
# for comparison with the `mse` value printed above.
diff = (recon_batch * mask) - (data * mask)
float(diff.pow(2).sum() / mask.sum())

In [None]:
data * mask

In [None]:
import numpy as np
# Blank out unobserved entries with the mask itself. The previous
# `(data * mask).replace(0.0, np.nan)` inferred missingness from the value 0.0 and would
# therefore also turn an observed value of exactly 0.0 into NaN.
df_diff = pd.DataFrame(recon_batch.detach().numpy()) - pd.DataFrame(data.numpy()).where(mask.numpy().astype(bool))

In [None]:
# Compare the summed differences from the tensor computation and from the DataFrame computation.
diff.sum(), df_diff.sum().sum()

In [None]:
# Mean of the squared differences, ignoring NaN entries.
np.nanmean((df_diff ** 2))

In [None]:
# Same quantity by hand: sum of squared differences divided by the number of non-NaN entries.
(df_diff**2).sum().sum() / df_diff.notna().sum().sum()

In [None]:
# Evaluate the loss for the first sample of the batch only (the loop breaks after one pass).
for x_recon, x, x_mask, x_mu, x_logvar in zip(recon_batch, data, mask, mu, logvar):
    print("\n{},\n{}".format(x_recon,x))
    loss, mse, kld = loss_function(x_recon, (x, x_mask), x_mu, x_logvar).values()
    print(float(loss), float(mse), float(kld))
    # Squared error averaged over ALL entries of the sample (masked ones contribute zero) ...
    print(((x_recon * x_mask) - (x * x_mask)).pow(2).mean())
    # ... versus normalised by the mask sum only.
    print(((x_recon * x_mask) - (x * x_mask)).pow(2).sum() / x_mask.sum())
    break

In [None]:
x_mask

Latent space for two samples (mean and variance, i.e. `exp(logvar)`)

In [None]:
# Print, per sample, the latent mean and the variance (exp of the log-variance).
for mu_, logvar_ in zip(mu, logvar):
    print("\n{},\n{}".format(mu_, torch.exp(logvar_)))