# Experiment 1

In [None]:
from src import config
from src.analyzers import *
from vaep.transform import StandardScaler, get_df_fitted_mean_std

In [None]:
import logging
from src.logging import setup_logger

# Configure the root logger first: raise its level to CRITICAL and drop any
# handlers already attached to it.
logger = logging.getLogger()  # returns root-logger
logger.setLevel(logging.CRITICAL)  # silence for everything else
logger.handlers = []


# Rebind `logger` to whatever the project helper `setup_logger` returns for
# the 'vaep' logger; the `logger.info` calls below go through this object.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 01")

## Load data

- 1000 features (most abundant peptides)
- later a subset of samples is selected

In [None]:
# Passed through as `nrows`; presumably None means "load all samples" -- TODO confirm
N_SAMPLES_TO_LOAD = None
FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N_07813_M01000'
analysis = AnalyzePeptides(
    fname=FN_PEPTIDE_INTENSITIES, nrows=N_SAMPLES_TO_LOAD)
# Sort rows by index (assumed to order the samples by date -- TODO confirm).
analysis.df = analysis.df.sort_index()  # sort by date
# Later steps select samples by label via `.loc`, so labels must be unique.
assert analysis.df.index.is_unique, "Non-unique training samples"
analysis

### Select consecutive samples for training

In [None]:
import random

# Number of consecutive samples selected for this experiment.
N_SAMPLES = 1000
logger.info(f"Selected {N_SAMPLES}")
# Keep the value on the analysis object as well.
analysis.N_SAMPLES = N_SAMPLES


def get_consecutive_data_indices(index, n_samples=N_SAMPLES):
    """Return a randomly positioned window of consecutive entries of `index`.

    Parameters
    ----------
    index
        Object supporting ``len`` and slicing (called with a DataFrame index
        in this notebook).
    n_samples : int
        Length of the window. The default is the value ``N_SAMPLES`` had when
        this function was defined.

    Returns
    -------
    ``index[start:start + n_samples]`` where ``start`` is drawn with
    ``random.randint`` from all positions that leave room for a full window.

    Raises
    ------
    ValueError
        If `n_samples` is negative or larger than ``len(index)``.
    """
    n_available = len(index)
    # Fail early with a readable message instead of the opaque
    # "empty range" error random.randint raises for a negative upper bound.
    if not 0 <= n_samples <= n_available:
        raise ValueError(
            f"Cannot select {n_samples} consecutive samples "
            f"from an index of length {n_available}.")
    # randint is inclusive on both ends: the upper bound is the last start
    # position that still yields `n_samples` entries.
    start_sample = random.randint(0, n_available - n_samples)
    return index[start_sample:start_sample + n_samples]


# Draw a random window of consecutive samples and restrict the analysis
# data to exactly these samples.
indices_selected = get_consecutive_data_indices(analysis.df.index)
analysis.samples = indices_selected
analysis.df = analysis.df.loc[indices_selected]

# Share of the selected samples that goes into the training split.
FRACTION = 0.9

class Indices(SimpleNamespace):
    """Attribute container holding one index collection per data split."""

# Ordered split without shuffling: the leading FRACTION of the selected
# samples forms the training split, the rest the validation split.
indices = Indices(
    train=indices_selected[:int(FRACTION*N_SAMPLES)],
    valid=indices_selected[int(FRACTION*N_SAMPLES):],
)
analysis.indices = indices

# Materialise both splits as separate DataFrames on the analysis object.
analysis.df_train = analysis.df.loc[indices.train]
analysis.df_valid = analysis.df.loc[indices.valid]

analysis.df

### Create meta data from filename

In [None]:
from src import metadata

# Derive meta data from the selected sample identifiers using the project
# helper; the result is used as a mapping with one entry per sample.
data_meta = metadata.get_metadata_from_filenames(indices_selected)
# orient='index': the keys of `data_meta` become the row labels.
analysis.df_meta = pd.DataFrame.from_dict(
    data_meta, orient='index')
# analysis.df_meta['date'] = pd.to_datetime(analysis.df_meta['date'])
analysis.df_meta

- possibility to group data in time along `(machine, lc)` pairs

In [None]:
# Summary statistics of the meta data for the training samples only.
# NOTE(review): `datetime_is_numeric=False` here, while
# `compare_meta_data_for_splits` passes `True` -- confirm which is intended.
# The keyword was removed in pandas 2.0; check the pandas version in use.
analysis.df_meta.loc[indices.train].describe(datetime_is_numeric=False)

In [None]:
# This becomes part of analysis
def compare_meta_data_for_splits(meta, indices):
    """Describe and plot the meta data separately for every split.

    Parameters
    ----------
    meta : pandas.DataFrame
        Meta data; its row labels must contain the labels of every split.
    indices
        Object whose instance attributes (read via ``vars``) map a split
        name to the row labels belonging to that split.

    Returns
    -------
    None
        Output is produced through logging, ``print``, ``display`` and
        one line plot per column of `meta`.
    """
    _indices = vars(indices)
    logger.info('Found vars: {}'.format(', '.join(str(x)
                                                  for x in _indices)))

    for key_split, split in _indices.items():
        print(f"{key_split:8} - split description:")
        # NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 --
        # confirm the pandas version this notebook targets.
        display(
            meta.loc[split].describe(datetime_is_numeric=True)
        )

    for _column in meta.columns:
        # One column per split holding the relative frequency of each value.
        value_counts = pd.DataFrame({
            key_split: meta.loc[split, _column].value_counts(normalize=True)
            for key_split, split in _indices.items()
        })
        # The figure is rendered by the plotting backend; the returned axes
        # object is deliberately discarded. (It used to be handed to
        # `display` as a keyword argument, which displays nothing.)
        _ = value_counts.sort_index().plot(
            kind='line', rot=90, figsize=(10, 5),
            title=f"{_column} value Counts for different splits")


# Only the first two meta data columns are compared across the splits.
compare_meta_data_for_splits(analysis.df_meta.iloc[:, :2], indices)

### Analysis state so far

In [None]:
analysis

## PyTorch

### Setup

In [None]:
import torch
from torch import optim

# import importlib; importlib.reload(vaep.model)
from vaep.model import train
from vaep.model import VAE
from vaep.model import loss_function
from vaep.cmd import get_args
from vaep.tf_board import TensorboardModelNamer

from vaep.io.datasets import PeptideDatasetInMemory

In [None]:
# Command-line style arguments from the project helper, with CUDA disabled.
args = get_args(no_cuda=True) # data transfer to GPU seems slow
# Extra DataLoader options, only set when CUDA is enabled.
kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}

# torch.manual_seed(args.seed)
args.epochs = 30  # overrides whatever `get_args` returned
device = torch.device("cuda" if args.cuda else "cpu")
device

In [None]:
# Smallest log10 intensity over all samples and peptides.
# NOTE(review): np.log10 maps zeros to -inf (not NaN) -- confirm that missing
# intensities are stored as NaN rather than 0, otherwise the limit is -inf.
detection_limit = np.log10(analysis.df).min().min()
# Round before casting: 10 ** log10(x) can land marginally below the integer
# x, and a bare int() would then truncate it to x - 1.
"Detection limit: {:6.3f}, corresponding to intensity value of {:,d}".format(
    detection_limit,
    int(round(10 ** detection_limit))
)

### Training and Validation datasets

In [None]:
# Rows of `analysis.df` are counted as samples, columns as peptides (features).
n_samples, n_features = analysis.df.shape
"N samples: {:10,d} - N Peptides: {:10,d}".format(n_samples, n_features)

In [None]:
len(analysis.indices.valid), analysis.indices.valid[:5]

### Non-log transformed data (Single run)

Scale samples according to training data

In [None]:
# select initial data: transformed vs not log transformed
# The scaler is fitted on the training split only and then applied to
# validation samples.
scaler = StandardScaler().fit(analysis.df_train)
# five examples from validation dataset
scaler.transform(analysis.df_valid.iloc[:5])

#### Dataloaders

In [None]:
def get_dataloaders(df_train, df_valid, scaler):
    """Create the training and validation ``DataLoader`` for two DataFrames.

    Each frame is passed through ``scaler.transform`` and wrapped in a
    ``PeptideDatasetInMemory``. The batch size is read from the module-level
    ``args`` and additional loader options from the module-level ``kwargs``.
    Only the training loader shuffles its data.

    Returns
    -------
    tuple
        ``(dl_train, dl_valid)``
    """
    train_set = PeptideDatasetInMemory(data=scaler.transform(df_train))
    valid_set = PeptideDatasetInMemory(data=scaler.transform(df_valid))

    # Options shared by both loaders.
    loader_options = dict(batch_size=args.batch_size, **kwargs)
    make_loader = torch.utils.data.DataLoader

    loader_train = make_loader(dataset=train_set, shuffle=True,
                               **loader_options)
    loader_valid = make_loader(dataset=valid_set, shuffle=False,
                               **loader_options)
    return loader_train, loader_valid

In [None]:
# Hidden layer size: one sixth of the number of features, but at least 30.
n_neurons = max(30, int(n_features/6))
logger.info(f'Latent layer neurons: {n_neurons}')

# One TensorBoard writer per setup, obtained from the project naming helper.
tensorboard_model_namer = TensorboardModelNamer(prefix_folder='experiment_01')
writer = tensorboard_model_namer.get_writer(1, [n_neurons], 'scaler')
logger.info(f"Logging to: {writer.get_logdir()}")


# Data loaders on the scaled, non-log-transformed intensities.
dl_train, dl_valid = get_dataloaders(
    df_train=analysis.df_train,
    df_valid=analysis.df_valid,
    scaler=scaler)

logger.info(
    "N train: {:5,d} \nN valid: {:5,d}".format(
        len(dl_train.dataset), len(dl_valid.dataset))
)

# Take one training batch (a data/mask pair) and log both to TensorBoard
# as 2D images.
data, mask = next(iter(dl_train))

writer.add_image(
    f'{len(data)} batch of sampled data (as heatmap)', data, dataformats='HW')
writer.add_image(
    f'{len(mask)} mask for this batch of samples', mask, dataformats='HW')


model = VAE(n_features=n_features, n_neurons=n_neurons)

logger.info(model)
model = model.to(device, non_blocking=True)

# ToDo: compiler warning: error or tracer error?
writer.add_graph(model, input_to_model=data.to(device))  # try to add after training?
# writer.flush()

# `run_experiment` reads `optimizer` from the global scope, so it has to be
# created for the current `model` before the training loop is started.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

#### Training Loop

In [None]:
from collections import defaultdict
from vaep.model import eval

def run_experiment(model, dls, writer, args):
    """Train `model` for ``args.epochs`` epochs and collect per-epoch metrics.

    Parameters
    ----------
    model
        Model handed to the project functions `train` and `eval`.
    dls : tuple
        ``(dl_train, dl_valid)`` data loaders.
    writer
        TensorBoard writer; it is flushed and closed before this function
        exits, also when training raises.
    args
        Object providing the attribute ``epochs``.

    Returns
    -------
    collections.defaultdict
        Maps ``(split, metric)`` to a dict ``{epoch: value}`` for
        ``('train', 'loss')`` and ``('valid', 'loss' | 'mse' | 'kld')``.

    Notes
    -----
    `optimizer` and `device` are not parameters; they are read from the
    module-level names of the same name at call time.
    """
    metrics = defaultdict(dict)
    dl_train, dl_valid = dls
    msg_eval_epoch = "Validation Set - Epoch: {:3d} - loss: {:7.3f} - mse: {:5.3f} - KLD: {:5.3f}"
    n_batches = len(dl_valid)  # does not change between epochs

    try:
        for epoch in range(1, args.epochs+1):
            metrics[('train', 'loss')][epoch] = float(
                train(epoch, model=model, train_loader=dl_train,
                      optimizer=optimizer, device=device, writer=writer))
            # ToDo: Pull out writer from eval function
            _epoch_metric_valid = eval(
                model=model, data_loader=dl_valid, device=device)
            writer.add_scalar('avg validation loss',
                              _epoch_metric_valid['loss'] / n_batches,
                              epoch)
            metrics[('valid', 'loss')][epoch] = _epoch_metric_valid['loss']
            metrics[('valid', 'mse')][epoch] = _epoch_metric_valid['mse']
            metrics[('valid', 'kld')][epoch] = _epoch_metric_valid['kld']
            if not epoch % 10:
                # Look the values up by key so the message does not depend on
                # the ordering (or extra entries) of the returned dict.
                logger.info(msg_eval_epoch.format(
                    epoch,
                    _epoch_metric_valid['loss'],
                    _epoch_metric_valid['mse'],
                    _epoch_metric_valid['kld']))
    finally:
        # Make sure buffered events reach disk even if training fails.
        writer.flush()
        writer.close()  # closes all internal writers of SummaryWriter
    return metrics



# Train on the scaled, non-log-transformed data; uses the module-level
# `optimizer` and `device` created in the setup cell above.
metrics = run_experiment(model=model, dls=(
    dl_train, dl_valid), writer=writer, args=args)  # decide about format

#### Performance plots

In [None]:
# Rebind `metrics` to a DataFrame: one column per (split, metric) key, one
# row per epoch.
metrics = pd.DataFrame(metrics)
_ = metrics.plot(
    figsize=(18, 6), xlim=(1, args.epochs))

In [None]:
metrics

In [None]:
# Restrict the plot to the loss columns of the training and validation split.
selected = [(_split, 'loss') for _split in ('train', 'valid')]
_ = metrics[selected].plot(figsize=(18, 6))

### Log transformed data (Single run)

In [None]:
# log10-transform both splits, then fit a second scaler on the transformed
# training split only.
# NOTE(review): np.log10 yields -inf for zeros -- confirm the intensities
# contain no zeros (e.g. missing values stored as NaN).
analysis.df_train_log10 = np.log10(analysis.df_train)
analysis.df_valid_log10 = np.log10(analysis.df_valid)
scaler_log = StandardScaler().fit(X=analysis.df_train_log10)
# five examples from validation dataset
scaler_log.transform(analysis.df_valid_log10.iloc[:5])

#### Dataloaders

In [None]:
# `n_neurons` is reused from the non-log run.
# n_neurons = max(30, int(n_features/6))
logger.info(f'Latent layer neurons: {n_neurons}')

# New writer for this setup, from the same TensorBoard naming helper.
writer = tensorboard_model_namer.get_writer(1, [n_neurons], 'scaler_log')
logger.info(f"Logging to: {writer.get_logdir()}")


# Rebinds `dl_train` / `dl_valid` to loaders on the log10-transformed data.
dl_train, dl_valid = get_dataloaders(df_train=analysis.df_train_log10, df_valid=analysis.df_valid_log10, scaler=scaler_log)

logger.info(
    "N train: {:5,d} \nN valid: {:5,d}".format(
        len(dl_train.dataset), len(dl_valid.dataset))
)

# Take one training batch (a data/mask pair) and log both to TensorBoard
# as 2D images.
data, mask = next(iter(dl_train))

writer.add_image(
    f'{len(data)} batch of sampled data (as heatmap)', data, dataformats='HW')
writer.add_image(
    f'{len(mask)} mask for this batch of samples', mask, dataformats='HW')


# Fresh, untrained model for the log-transformed run.
model = VAE(n_features=n_features, n_neurons=n_neurons)

logger.info(model)
# NOTE(review): unlike the non-log run, the model and the example batch are
# not moved to `device` here -- fine while `device` is the CPU; confirm.
# model = model.to(device, non_blocking=True)

# ToDo: compiler warning: error or tracer error?
writer.add_graph(model, input_to_model=data)  # try to add after training?
writer.flush()

# `run_experiment` reads `optimizer` from the global scope, so it has to be
# re-created for the new `model` before the training loop is started.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

#### Training Loop

In [None]:
# Train on the log10-transformed, scaled data; uses the module-level
# `optimizer` created for this model in the setup cell above.
metrics_log = run_experiment(model=model, dls=(
    dl_train, dl_valid), writer=writer, args=args)  # decide about format

#### Performance plots

In [None]:
# NOTE: this rebinds `metrics`, replacing the DataFrame of the non-log run.
metrics = pd.DataFrame(metrics_log)
metrics.plot(
    figsize=(18, 6))

In [None]:
# Restrict the plot to the loss columns of the training and validation split.
selected = [(_split, 'loss') for _split in ('train', 'valid')]
_ = metrics[selected].plot(figsize=(18, 6))

### Tensorboard

- can be run from notebook
- or in a separate process to inspect currently running training loops

In [None]:
# Load the TensorBoard notebook extension
# %load_ext tensorboard

In [None]:
# # first time, it timesout, second time it starts, see https://github.com/tensorflow/tensorboard/issues/2481#issuecomment-516819768
# %tensorboard --logdir {tensorboard_model_namer.folder} --host localhost

In [None]:
# Print the shell command that starts TensorBoard on the absolute path of
# the namer's log folder.
print(f"Run to see updates: \n\n\ttensorboard --logdir {tensorboard_model_namer.folder.absolute()}")

## Hyperparameter comparison

- [x] order data by date: consecutive samples from training to validation
- [ ] check stratification based on machine and column length between splits
    - validation and training data have same proportion of machine types
- [ ] complete meta data reading based on filenames
- [ ] compare performance regarding data normalization
    - in original intensity space (non-log-transformed)
- [ ] compare performance regarding several hyperparameters of VAE (layers, activation, etc)
    - plot different losses in one plot as validation data set is the same
- [ ] increase number of samples in training set and create result plot
- [ ] increase the number of peptides (features)
- [ ] mask some values in the validation set as missing (Quality Assessment)
- [ ] write main function which trains an entire model (including data transformations)

Debug
- [ ] Check reporting of loss again: average sample loss or average peptide loss?
- [ ] take a close look at VAE tutorial of PyTorch (data normalization, etc)
- [ ] reduce the features size to fewer samples

VAE
- original inputs between 0 and 1 as decoder outputs are transformed originally using the sigmoid fct
- original model uses `tanh` activations
- think about the definition of `MSE` in a mini-batch. Should be peptide wise?
    - VAMB does sum over a sample and then takes the mean of the sum (alternative?)
    - multi-output regression?
- learning requires active masking: Mask inputs which should be learned to be recovered. Feed original, 
  not masked image as target to loss.

- [ ] Run MNIST example with MSE loss. Does it still work?
- [ ] Normalize inputs to zero and one, use MNIST VAE. Does it work?
- [ ] Regress M peptide intensities on 1 other peptide intensity. Does it work? (Reference performance)
- [ ] Build a normal AE without probabilistic bottleneck. Does this work?

Refactoring

- [ ] get epoch out of train, eval etc


Ideas
  - combine 1000 most abundant peptides as guidance for different sets of low abundant peptides
  - show the difference between original and reconstruction using a cm in an Image? batch-wise?

- Current optimum for comparison is zero

> The comparison where relatively low abundant, but not super low-abundant peptides will be masked, could skew the comparison.

In [None]:
# writer # new writer
# dls = get_dls(data_in_memory, scaler)
# model = VAE()
# writer =  # new writer for each setup
# metrics = run_experiment(model, dls, writer)
# overview['experiment_name'] = metrics