# Experiment 1

In [None]:
from pathlib import Path

from src import config
from src.analyzers import *
from vaep.transform import StandardScaler, get_df_fitted_mean_std

In [None]:
import logging
from src.logging import setup_logger

# Configure logging for this notebook: quiet the root logger first, then set
# up the dedicated 'vaep' logger via the project's `setup_logger` helper.
logger = logging.getLogger()  # returns root-logger
logger.setLevel(logging.CRITICAL)  # silence for everything else
logger.handlers = []  # drop all handlers currently attached to the root logger


# From here on `logger` refers to the 'vaep' logger, not to the root logger.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 01")

## Load data

- 1000 features (most abundant peptides)
- later a subset of samples is selected

In [None]:
# Load the peptide intensities into the analysis object.
# N_SAMPLES_TO_LOAD is forwarded as `nrows`; None presumably means "load all
# rows" -- confirm against AnalyzePeptides.
N_SAMPLES_TO_LOAD = None
FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N_07813_M01000'
analysis = AnalyzePeptides(
    fname=FN_PEPTIDE_INTENSITIES, nrows=N_SAMPLES_TO_LOAD)
# The index entries are later split on '_' with the date as first field, so
# sorting the index orders the samples by that leading date field.
analysis.df = analysis.df.sort_index()  # sort by date
# Sample labels are used for `.loc` selections below and must be unique.
assert analysis.df.index.is_unique, "Non-unique training samples"
analysis

### Select consecutive samples for training

In [None]:
import random

N_SAMPLES = 1000
logger.info(f"Selected {N_SAMPLES}")
analysis.N_SAMPLES = N_SAMPLES


def get_consecutive_data_indices(index, n_samples=N_SAMPLES):
    """Return a randomly placed window of `n_samples` consecutive entries.

    Parameters
    ----------
    index : sliceable sequence (e.g. a pandas Index)
        Ordered sample labels to draw the window from.
    n_samples : int
        Length of the window. The default is bound to the value of
        ``N_SAMPLES`` at the time this function is defined.

    Returns
    -------
    The slice ``index[start:start + n_samples]`` for a start position drawn
    uniformly at random from all positions that leave room for a full window.

    Raises
    ------
    ValueError
        If `index` holds fewer than `n_samples` entries.
    """
    last_start = len(index) - n_samples
    if last_start < 0:
        # random.randint would fail here as well, but with an opaque
        # "empty range" message; state the actual problem instead.
        raise ValueError(
            f"Cannot select {n_samples} consecutive samples from an index "
            f"of length {len(index)}.")
    # randint is inclusive on both ends, so `last_start` itself is valid.
    start_sample = random.randint(0, last_start)
    return index[start_sample:start_sample + n_samples]


# Draw the window of consecutive samples and restrict the analysis to it.
indices_selected = get_consecutive_data_indices(analysis.df.index)
analysis.samples = indices_selected
# From here on `analysis.df` only holds the selected samples.
analysis.df = analysis.df.loc[indices_selected]
analysis.df

### Create meta data from filename

In [None]:
from collections import namedtuple
import re

# Field names of the meta data extracted from a filename.
columns = 'date ms_instrument lc_instrument researcher rest'.split()

# Researcher ID: four characters -- upper, lower, upper, any letter.
regex_researcher = '[A-Z][a-z][A-Z][a-zA-Z]'



# Self-checks: the ID is found with or without a leading underscore.
assert re.search(regex_researcher, 'HeWe_').group()   == 'HeWe'
assert re.search(regex_researcher, '_HeWe_').group()   == 'HeWe'
assert re.search(regex_researcher, 'HeWE_').group()   == 'HeWE'
assert re.search(regex_researcher, '_HeWE_').group()   == 'HeWE'

# LC instrument: optional run of 'n'/'N', then 'lc', 'LC', 'evo' or 'Evo',
# followed by any number of alphanumeric characters.
regex_lc_instrument = '[nN]*((lc)|(LC)|([eE]vo))[a-zA-Z0-9]*'
assert re.search(regex_lc_instrument, 'nlc1_').group() == 'nlc1'
assert re.search(regex_lc_instrument, 'Evo_').group() == 'Evo'


# 'hela' with either case for the 'h' and the 'l'.
regex_hela = '[Hh]e[Ll]a'
assert re.search(regex_hela, 'HeLa').group() == 'HeLa'
assert re.search(regex_hela, 'Hela').group() == 'Hela'
assert re.search(regex_hela, 'hela').group() == 'hela'

In [None]:
# Parse every selected filename into the meta data fields named in `columns`
# and collect the result in `analysis.df_meta` (one row per filename).
#
# `re.search` returns None if nothing matches, so calling `.group()` on the
# result raises AttributeError -- that is what the except branches react to.

# NOTE(review): RunMetaData is only referenced by commented-out code below.
RunMetaData = namedtuple('RunMetaData', columns)
data_meta = {}
for filename in indices_selected:
    # ToDo: this approach is too easy for the moment. The first two fields are in order, the rest needs matching.
    _entry = {}
    # Raises ValueError if the filename has fewer than three '_'-separated parts.
    _entry['date'], _entry['ms_instrument'], _rest_filename = filename.split('_', maxsplit=2)
    try:
        _entry['researcher'] = re.search(regex_researcher, _rest_filename).group()
        # The researcher pattern also matches e.g. 'HeLa'. In that case remove
        # the match and search the remainder for the actual researcher ID.
        if re.search(regex_hela, _entry['researcher']):
            _cleaned_filename = _rest_filename.replace(_entry['researcher'], '').replace('__', '_')
            _entry['researcher'] = re.search(regex_researcher, _cleaned_filename).group()
        # Strip the researcher ID and collapse the doubled underscore it leaves.
        _rest_filename = _rest_filename.replace(_entry['researcher'], '').replace('__', '_')
    except AttributeError:
        # Fallback: two capitalised words, optionally joined by hyphens,
        # directly followed by an underscore (which is cut off again).
        try:
            _entry['researcher'] = re.search('[A-Z][a-zA-Z]*[-]*[A-Z][a-zA-Z]*_', _rest_filename).group()[:-1]
            logger.debug(f"Found irregular researcher ID: {_entry['researcher']} (from: {filename})")
            _rest_filename = _rest_filename.replace(_entry['researcher']+'_', '').replace('__', '_')
        except AttributeError:
            # No researcher ID at all: propagate the error and stop the loop.
            raise
    try:
        _entry['lc_instrument'] = re.search(regex_lc_instrument, _rest_filename).group()
        _rest_filename = _rest_filename.replace(_entry['lc_instrument']+'_', '').replace('__', '_')
    except AttributeError:
        # Fallback: instrument IDs of the form 'BR<digits>' (any case).
        try:
            _entry['lc_instrument'] = re.search('[Bb][Rr][0-9]+', _rest_filename).group()
            _rest_filename = _rest_filename.replace(_entry['lc_instrument']+'_', '').replace('__', '_')
        except AttributeError:
            # Unlike a missing researcher, a missing LC instrument is tolerated.
            _entry['lc_instrument'] = None
            logger.error(f'Could not find LC instrument in {filename}')


    # Whatever has not been matched is kept verbatim.
    _entry['rest'] = _rest_filename
    data_meta[filename] = _entry

    # print(RunMetaData(*_meta_filename[:6]))

# from pprint import pprint
# pprint(data_meta)
analysis.df_meta = pd.DataFrame.from_dict(
    data_meta, orient='index')
# analysis.df_meta['date'] = pd.to_datetime(analysis.df_meta['date'])
analysis.df_meta

In [None]:
FRACTION = 0.9


class Indices(SimpleNamespace):
    """Attribute container for the sample labels of each data split."""


# Chronological split: the first FRACTION of the selected samples is used
# for training, the remaining samples for validation.
_n_train = int(FRACTION * N_SAMPLES)
indices = Indices(train=indices_selected[:_n_train],
                  valid=indices_selected[_n_train:])
analysis.indices = indices

analysis.df_train, analysis.df_valid = (
    analysis.df.loc[indices.train],
    analysis.df.loc[indices.valid],
)

# Summary of the meta data of the training samples (shown as cell output).
analysis.df_meta.loc[indices.train].describe(datetime_is_numeric=False)

In [None]:
# This becomes part of analysis
def compare_meta_data_for_splits(meta, indices):
    """Describe and plot the meta data separately for every data split.

    Parameters
    ----------
    meta : pandas.DataFrame
        Meta data with one row per sample.
    indices : namespace-like object
        Every attribute (as returned by ``vars``) is treated as one split:
        the attribute name is the split name, its value the sample labels
        used to select rows of `meta` via ``.loc``.

    Returns
    -------
    None. For each split the ``describe`` summary is displayed; for each
    column of `meta` a line plot of the normalised value counts is drawn
    with one line per split.
    """
    _indices = vars(indices)
    logger.info('Found vars: {}'.format(', '.join(str(x) for x in _indices)))

    for key_split, split in _indices.items():
        print(f"{key_split:8} - split description:")
        # NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 --
        # confirm the pandas version this notebook is pinned to.
        display(
            meta.loc[split].describe(datetime_is_numeric=True)
        )

    for _column in meta.columns:
        # Relative frequency of every value of this column, one column per
        # split, so that the splits can be compared in a single plot.
        _frequencies = pd.DataFrame({
            key_split: meta.loc[split, _column].value_counts(normalize=True)
            for key_split, split in _indices.items()
        }).sort_index()
        # Bind the returned Axes to a throw-away name instead of handing it
        # to `display`; drawing the figure is left to the plotting backend.
        _ = _frequencies.plot(
            kind='line', rot=90, figsize=(10, 5),
            title=f"{_column} value Counts for different splits")


compare_meta_data_for_splits(analysis.df_meta.iloc[:, :2], indices)

## Transforms

### Custom Transforms

- illustrate using adapted scikit-learn [`StandardScaler`](https://scikit-learn.org/stable/modules/preprocessing.html)

In [None]:
N, M = 10, 10  # Samples, Features
analysis.df_train.iloc[:N, :M]

In [None]:
analysis.df_train.iloc[:, :M].describe()

### StandardScaler on raw data

In [None]:
scaler = StandardScaler().fit(analysis.df_train)
scaler_df = get_df_fitted_mean_std(scaler, index=analysis.df_train.columns)
scaler_df.head(N)

In [None]:
sample = scaler.transform(analysis.df_train.iloc[:N])
sample.iloc[:, :M]

In [None]:
sample = scaler.inverse_transform(sample)
sample.iloc[:, :M]

### StandardScaler on log10 transformed data

In [None]:
X_log10 = np.log10(analysis.df_train)
X_log10.iloc[:N, :M]

In [None]:
# Fit a second scaler on the log10-transformed training data.
scaler_log = StandardScaler().fit(X=X_log10)
# The fitted statistics are per feature, so they are labelled with the
# feature (column) names -- as for the raw-data scaler -- and not with the
# sample index, which only fits by accident when both have equal length.
scaler_log_df = get_df_fitted_mean_std(scaler_log, index=X_log10.columns)
scaler_log_df.head(N)

In [None]:
sample_log10 = scaler_log.transform(X_log10.iloc[:N])
sample_log10.iloc[:, :M]

In [None]:
scaler_log.inverse_transform(sample_log10).iloc[:, :M]

### Sanity checks

#### Correlation

- Correlation between the computed `mean_` values of both scalers should be nearly perfect
- Correlation between peptide intensities should be high
- As taking the logarithm is a monotonic, but non-linear transformation, the linear Pearson correlation can change substantially. [[link]](https://stats.stackexchange.com/questions/127121/do-logs-modify-the-correlation-between-two-variables)

In [None]:
print("Correlation between mean values of linear vs. log-transformed values:",
      f"{np.corrcoef(scaler.mean_, scaler_log.mean_)[1,0]:.4f}", sep='\n')

In [None]:
# Global pandas display option: affects all float output from here on.
pd.options.display.float_format = '{:,.3f}'.format

# Column-wise (axis=0) correlation between standardized linear and
# standardized log10 intensities of the same peptide.
# NOTE(review): the left-hand side transforms all of `analysis.df`, whereas
# `X_log10` was derived from `analysis.df_train` only; `corrwith` aligns both
# frames on their labels, so presumably only the shared samples enter -- confirm.
analysis.corr_linear_vs_log = scaler.transform(X=analysis.df).corrwith(
    other=scaler_log.transform(X_log10),
    axis=0)
analysis.corr_linear_vs_log.describe()

In [None]:
# own implementation could be slightly faster as data is already demeaned and standardized.
# pd.DataFrame.corrwith?

#### Distribution

In [None]:
import seaborn as sns
from vaep.utils import sample_iterable

columns_sampled = sample_iterable(list(analysis.df.columns), n=12)
print(columns_sampled)

In [None]:
def plot_scaled_sample(columns_sampled: list, scaler, df: pd.DataFrame = analysis.df):
    """Display summary statistics of the scaled data and return its range.

    Parameters
    ----------
    columns_sampled : list
        Currently not used by the function body; kept so that existing calls
        keep working.
    scaler
        Fitted scaler; ``scaler.transform(df)`` has to return a DataFrame.
    df : pandas.DataFrame
        Data to scale. The default is bound to the object `analysis.df`
        refers to at the time this function is defined.

    Returns
    -------
    tuple
        ``(minimum, maximum)`` over all values of the scaled data.
    """
    _scaled = scaler.transform(df)
    display(_scaled.describe())
    # First min()/max() reduces per column, the second over all columns.
    return _scaled.min().min(), _scaled.max().max()


_min, _max = plot_scaled_sample(columns_sampled=columns_sampled, scaler=scaler)

In [None]:
# if bins should be equal between plots
# addon
import math
xlim = [-5, 5]
FACTOR = 1
[x/FACTOR for x in range(math.floor(xlim[0])*FACTOR,
                         math.ceil(xlim[1])*FACTOR+1)]

In [None]:
import matplotlib.pyplot as plt

# Overlay the histograms of log- and linear-scaled intensities for nine
# sampled peptides on a shared 3x3 grid of axes.
columns_sampled = sample_iterable(list(analysis.df.columns), n=9)
subplot_kw = {'xlim': [-5, 5], 'ylim': [0, 600]}
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(
    15, 15), sharey=True, sharex=True, subplot_kw=subplot_kw)
# Drawn first: standardized log10 intensities.
_ = scaler_log.transform(X_log10)[columns_sampled].hist(
    figsize=(15, 15), ax=axes)
# Drawn second, on top of the same axes: standardized linear intensities.
axes = scaler.transform(analysis.df)[columns_sampled].hist(
    figsize=(15, 15), ax=axes)
# Labels passed without handles are assigned in drawing order, so they have
# to follow the order of the two hist calls above: log first, then linear.
_ = fig.legend(('log', 'linear'), loc=5)

In [None]:
caption = (f"Frequencies are capped at {subplot_kw['ylim'][1]} and "
           "their standardized intensity values plotted between {} and {}.".format(
               *subplot_kw['xlim'])
           )
print(caption)

### Analysis state so far

In [None]:
analysis

## Fastai Dataloader

> fastai includes a replacement for Pytorch's DataLoader which is largely API-compatible, and adds a lot of useful functionality and flexibility. Before we look at the class, there are a couple of helpers we'll need to define. [[link](https://docs.fast.ai/data.load.html)]

In [None]:
# import fastai.tabular.all as tab
from fastcore.transform import Transform

from fastai.tabular.data import TabularDataLoaders

Create dataloaders using an appropriate factory method from `TabularDataLoaders` class, here [`from_df`](https://docs.fast.ai/tabular.data.html#TabularDataLoaders.from_df)

In [None]:
# DataFrame is shuffled
N_VAL = 100
valid_idx = list(range(N_VAL))
dls = TabularDataLoaders.from_df(df=analysis.df, valid_idx=valid_idx, bs=64)
analysis.dls = dls

In [None]:
dls.show_batch()  # loses object index attribute

In [None]:
dls.valid.show_batch()

In [None]:
for batch in dls.train:
    print(batch)
    break

In [None]:
class Normalize(Transform):
    """Standardize data with the mean and std captured by `setup`."""

    def setup(self, array):
        # Works for anything exposing `.mean()` and `.std()` (tensors, numpy
        # arrays and alike); the statistics should be taken along axis 0,
        # i.e. over the samples. Note: scikit-learn uses ddof=0 for the std.
        self.mean, self.std = array.mean(), array.std()

    def encodes(self, x):
        return (x - self.mean) / self.std

    def decodes(self, x_enc):
        return (self.std * x_enc) + self.mean


tf_norm = Normalize()
tf_norm.setup(analysis.df_train)

Compare results to scikit learn implementation of [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html).

Differences seem to arise from the iterative computation of mean and standard deviation in scikit-learn, see [`_incremental_mean_and_var`](https://github.com/scikit-learn/scikit-learn/blob/15a949460dbf19e5e196b8ef48f9712b72a3b3c3/sklearn/utils/extmath.py#L792). Another likely source for the standard deviation: the pandas `std()` default is `ddof=1`, whereas scikit-learn uses `ddof=0`.

In [None]:
M = 5

pd.DataFrame(
    {
        ('Transform', 'mean'): tf_norm.mean[:M],
        ('Transform', 'std'): tf_norm.std[:M],
        ('StandardScaler', 'mean'): scaler.mean_[:M],
        ('StandardScaler', 'std'): scaler.scale_[:M]
    }
)

In [None]:
N = 10
tf_norm(analysis.df_train.iloc[:N]).iloc[:, :M]

In [None]:
scaler.transform(analysis.df_train.iloc[:N]).iloc[:, :M]

### DataLoader

### loss function

So now the `loss_func` signature and the `NN_Module` forward path have to be adapted. Unsure how to do this in plain PyTorch yet. So we only use the dataloader for now.

- Callback needed to set `xb` to `yb`, see [callback-attributes](https://docs.fast.ai/callback.core.html#Attributes-available-to-callbacks) and [example](https://github.com/dhuynh95/fastai_autoencoder/blob/bc357927f26273d676dca9a41018411408b97430/fastai_autoencoder/callback.py#L16)

In [None]:
# loss_function(recon_x=recon_batch, x=batch, mask=mask, mu=mu, logvar=logvar)
# learn = Learner(dls, NN_Module, opt_func=SGD, loss_func=mnist_loss, metrics=batch_accuracy)

## PyTorch

### Setup

In [None]:
from datetime import datetime

import torch
from torch import optim
from torch.utils.tensorboard import SummaryWriter

# import importlib; importlib.reload(vaep.model)
from vaep.model import train
from vaep.model import VAE
from vaep.model import loss_function
from vaep.cmd import get_args

from vaep.io.datasets import PeptideDatasetInMemory

In [None]:
args = get_args(no_cuda=True)
kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}

torch.manual_seed(args.seed)
device = torch.device("cuda" if args.cuda else "cpu")
device

In [None]:
from collections import namedtuple

# EpochAverages = namedtuple("EpochAverages", 'loss mse kld')


def eval(epoch, model, data_loader, device, writer=None):
    """Evaluate `model` on every batch of `data_loader`.

    Note that this name shadows the builtin ``eval`` within this notebook.

    Parameters
    ----------
    epoch : int
        Step under which the average loss is logged to `writer`.
    model
        Model whose call returns ``(reconstruction, mu, logvar)``; it is
        switched to evaluation mode.
    data_loader
        Iterable yielding ``(batch, mask)`` pairs, with a ``dataset``
        attribute supporting ``len``.
    device
        Currently not used by the function body; kept for call
        compatibility.
    writer : optional
        If given, the summed loss divided by the number of samples in the
        dataset is logged as 'avg validation loss'.

    Returns
    -------
    dict
        Sums over all batches for the keys 'loss', 'mse' and 'kld'
        (not averaged).
    """
    model.eval()
    metrics = {'loss': 0, 'mse': 0,  'kld': 0}

    # No parameters are updated during evaluation, so autograd bookkeeping
    # is switched off; the computed values are unaffected.
    with torch.no_grad():
        for batch, mask in data_loader:
            recon_batch, mu, logvar = model(batch)
            loss, mse, kld = loss_function(
                recon_x=recon_batch, x=batch, mask=mask, mu=mu, logvar=logvar)
            metrics['loss'] += loss.item()
            metrics['mse'] += mse.item()
            metrics['kld'] += kld.item()
    if writer is not None:
        n_samples = len(data_loader.dataset)
        writer.add_scalar('avg validation loss',
                          metrics['loss'] / n_samples,
                          epoch)
    return metrics

In [None]:
# Smallest log10 intensity over all samples and peptides.
# NOTE(review): np.log10(0) yields -inf rather than NaN, so this presumably
# relies on missing intensities being stored as NaN, not as 0 -- confirm.
detection_limit = np.log10(analysis.df).min().min()  # all zeros become nan.
# Report the limit on the log10 scale and on the original intensity scale.
"Detection limit: {:6.3f}, corresponding to intensity value of {:,d}".format(
    detection_limit,
    int(10 ** detection_limit)
)

### Training and Validation datasets

In [None]:
n_samples, n_features = analysis.df.shape
"N samples: {:10,d} - N Peptides: {:10,d}".format(n_samples, n_features)

In [None]:
# random sample is drawn
# NOTE(review): no `random_state` is passed to `sample`, so the split
# presumably differs between runs -- confirm whether that is intended.
PROPORTION_TRAIN = 0.9

# could be a method on analysis
analysis.indices_train = analysis.df.sample(
    int(n_samples*PROPORTION_TRAIN)
).index
# All samples not drawn for training form the validation set.
analysis.indices_val = analysis.df.index.difference(analysis.indices_train)

# This replaces the `df_train` / `df_valid` assigned earlier in the notebook.
analysis.df_train = analysis.df.loc[analysis.indices_train]
analysis.df_valid = analysis.df.loc[analysis.indices_val]

len(analysis.indices_val), analysis.indices_val[:5]

### Non-log transformed data (Single run)

Scale samples according to training data

In [None]:
# select initial data: transformed vs not log transformed
scaler = scaler.fit(analysis.df_train)
# five examples from validation dataset
scaler.transform(analysis.df_valid.iloc[:5])

#### Dataloaders

In [None]:
def get_dataloaders(df_train, df_valid, scaler):
    """Build the training and validation DataLoader from two data frames.

    Both frames are transformed by the (already fitted) `scaler` and wrapped
    in a `PeptideDatasetInMemory`. Batch size and extra DataLoader options
    are read from the module-level `args` and `kwargs`.

    Returns
    -------
    tuple
        ``(dl_train, dl_valid)``; only the training loader shuffles.
    """
    # Create both datasets first, then both loaders.
    datasets = [PeptideDatasetInMemory(data=scaler.transform(_df))
                for _df in (df_train, df_valid)]

    dl_train, dl_valid = (
        torch.utils.data.DataLoader(
            dataset=_dataset,
            batch_size=args.batch_size, shuffle=_shuffle, **kwargs)
        for _dataset, _shuffle in zip(datasets, (True, False))
    )
    return dl_train, dl_valid

In [None]:
class TensorboardModelNamer():
    """Name models and create TensorBoard writers below one run folder.

    The run folder is ``<root_dir>/<prefix_folder>_<yymmdd_HHMM>``. The
    timestamp is taken once, when the instance is created, so all writers
    obtained from one instance log below the same folder.
    """

    def __init__(self, prefix_folder, root_dir=Path('runs')):
        self.prefix_folder = prefix_folder
        self.root_logdir = Path(root_dir)
        self.folder = (self.root_logdir /
                       f'{self.prefix_folder}_{format(datetime.now(), "%y%m%d_%H%M")}')

    def get_model_name(self, hidden_layers: int,
                       neurons: list,
                       scaler: str,
                       ):
        """Compose ``model_hl<NN>_<neurons...>_<scaler>``.

        Parameters
        ----------
        hidden_layers : int
            Number of hidden layers, formatted with two digits.
        neurons : list, tuple or str
            Neurons per layer. A string is split on whitespace; a string
            without whitespace (e.g. '12_13_14') is used as is.
        scaler : str or object
            A string is appended verbatim, any other object by its repr.

        Raises
        ------
        TypeError
            If `neurons` is neither a string nor a list or tuple.
        """
        name = 'model_'
        name += f'hl{hidden_layers:02d}'

        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of str, list and tuple.
        if isinstance(neurons, str):
            neurons = neurons.split()
        elif not isinstance(neurons, (list, tuple)):
            raise TypeError(
                "Provide expected format for neurons: [12, 13, 14], '12 13 14' or '12_13_14'")

        name += ''.join(f'_{x}' for x in neurons)

        if isinstance(scaler, str):
            name += f'_{scaler}'
        else:
            name += f'_{scaler!r}'
        return name

    def get_writer(self, hidden_layers: int,
                   neurons: list,
                   scaler: str,
                   ):
        """Return a SummaryWriter logging to ``<run folder>/<model name>``.

        The arguments are passed on to `get_model_name`.
        """
        model_name = self.get_model_name(hidden_layers=hidden_layers,
                                         neurons=neurons,
                                         scaler=scaler)
        return SummaryWriter(log_dir=self.folder / model_name)


# Self-checks for the naming scheme; the namer created here is also used
# to create the writers of the experiments below.
expected = 'model_hl01_12_13_14_scaler'

tensorboard_model_namer = TensorboardModelNamer(prefix_folder='experiment_01')

# A list, a whitespace-separated string and an underscore-separated string
# all yield the same name.
assert tensorboard_model_namer.get_model_name(
    hidden_layers=1, neurons=[12, 13, 14], scaler='scaler') == expected
assert tensorboard_model_namer.get_model_name(
    hidden_layers=1, neurons='12 13 14', scaler='scaler') == expected
assert tensorboard_model_namer.get_model_name(
    hidden_layers=1, neurons='12_13_14', scaler='scaler') == expected
# A non-string scaler is appended by its repr, so this check depends on the
# repr of the `scaler` object defined earlier in the notebook.
assert tensorboard_model_namer.get_model_name(
    hidden_layers=1, neurons='12_13_14', scaler=scaler) == 'model_hl01_12_13_14_StandardScaler()'
# assert get_writer(hidden_layers=1, neurons=1, scaler=scaler) == TypeError

In [None]:
# Set up one training run on the non-log-transformed data: writer,
# data loaders, model and optimizer.

# A sixth of the number of features, but at least 30.
n_neurons = max(30, int(n_features/6))
logger.info(f'Latent layer neurons: {n_neurons}')

writer = tensorboard_model_namer.get_writer(1, [n_neurons], 'scaler')
logger.info(f"Logging to: {writer.get_logdir()}")


dl_train, dl_valid = get_dataloaders(
    df_train=analysis.df_train,
    df_valid=analysis.df_valid,
    scaler=scaler)

logger.info(
    "N train: {:5,d} \nN valid: {:5,d}".format(
        len(dl_train.dataset), len(dl_valid.dataset))
)

# Take a single training batch to log it (and its mask) as images and to
# serve as example input for the model graph below.
data, mask = next(iter(dl_train))

writer.add_image(
    f'{len(data)} batch of sampled data (as heatmap)', data, dataformats='HW')
writer.add_image(
    f'{len(mask)} mask for this batch of samples', mask, dataformats='HW')


model = VAE(n_features=n_features, n_neurons=n_neurons)

logger.info(model)
# model = model.to(device, non_blocking=True)

# ToDo: compiler warning: error or tracer error?
writer.add_graph(model, input_to_model=data)  # try to add after training?
writer.flush()

# `run_experiment` reads this module-level `optimizer`; it therefore has to
# be created for the current `model` before the training loop is started.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

#### Training Loop

In [None]:
from collections import defaultdict


def run_experiment(model, dls, writer, args):
    """Train `model` for `args.epochs` epochs and collect per-epoch metrics.

    The module-level `optimizer` and `device` are used for training; the
    optimizer therefore has to belong to `model` when this is called.

    Parameters
    ----------
    model
        Model passed to `train` and `eval`.
    dls : tuple
        ``(dl_train, dl_valid)`` data loaders.
    writer
        TensorBoard writer handed to `train` and `eval`; it is flushed and
        closed before returning.
    args
        Object providing the number of epochs as ``args.epochs``.

    Returns
    -------
    defaultdict
        Maps ``(split, metric)`` to a dict ``{epoch: value}`` for
        ('train', 'loss'), ('valid', 'loss'), ('valid', 'mse') and
        ('valid', 'kld').
    """
    metrics = defaultdict(dict)
    dl_train, dl_valid = dls
    msg_eval_epoch = "Validation Set - Epoch: {:3d} - loss: {:7.3f} - mse: {:5.3f} - KLD: {:5.3f}"

    # Epochs are counted from 1, so the upper bound has to be inclusive in
    # order to really run `args.epochs` epochs.
    for epoch in range(1, args.epochs + 1):
        metrics[('train', 'loss')][epoch] = float(train(epoch, model=model, train_loader=dl_train,
                                                        optimizer=optimizer, device=device, writer=writer))
        # ToDo: Pull out writer from eval function
        _epoch_metric_valid = eval(
            epoch, model=model, data_loader=dl_valid, device=device, writer=writer)
        for _key in ('loss', 'mse', 'kld'):
            metrics[('valid', _key)][epoch] = _epoch_metric_valid[_key]
        # Report the validation metrics every tenth epoch.
        if not epoch % 10:
            logger.info(msg_eval_epoch.format(
                epoch,
                _epoch_metric_valid['loss'],
                _epoch_metric_valid['mse'],
                _epoch_metric_valid['kld']))
    writer.flush()
    writer.close()  # closes all internal writers of SummaryWriter
    return metrics


args.epochs = 200
metrics = run_experiment(model=model, dls=(
    dl_train, dl_valid), writer=writer, args=args)  # decide about format

#### Performance plots

In [None]:
metrics = pd.DataFrame(metrics)
_ = metrics.plot(
    figsize=(18, 6))

In [None]:
metrics[('valid', 'norm_loss')] = metrics[(
    'valid', 'loss')] / len(dl_valid.dataset)
metrics[('train', 'norm_loss')] = metrics[(
    'train', 'loss')] / len(dl_train.dataset)

selected = [(_split, _metric)
            for _split in ['train', 'valid']
            for _metric in ['norm_loss']
            ]
_ = metrics[selected].plot(
    figsize=(18, 6))

### Log transformed data (Single run)

In [None]:
analysis.df_train_log10 = np.log10(analysis.df_train)
analysis.df_valid_log10 = np.log10(analysis.df_valid)
scaler_log = StandardScaler().fit(X=analysis.df_train_log10)
# five examples from validation dataset
scaler_log.transform(analysis.df_valid_log10.iloc[:5])

#### Dataloaders

In [None]:
# Set up one training run on the log10-transformed data: writer, data
# loaders, a fresh model and a fresh optimizer. `n_neurons` is reused from
# the previous run.
# n_neurons = max(30, int(n_features/6))
logger.info(f'Latent layer neurons: {n_neurons}')

writer = tensorboard_model_namer.get_writer(1, [n_neurons], 'scaler_log')
logger.info(f"Logging to: {writer.get_logdir()}")


dl_train, dl_valid = get_dataloaders(df_train=analysis.df_train_log10, df_valid=analysis.df_valid_log10, scaler=scaler_log)

logger.info(
    "N train: {:5,d} \nN valid: {:5,d}".format(
        len(dl_train.dataset), len(dl_valid.dataset))
)

# Take a single training batch to log it (and its mask) as images and to
# serve as example input for the model graph below.
data, mask = next(iter(dl_train))

writer.add_image(
    f'{len(data)} batch of sampled data (as heatmap)', data, dataformats='HW')
writer.add_image(
    f'{len(mask)} mask for this batch of samples', mask, dataformats='HW')


model = VAE(n_features=n_features, n_neurons=n_neurons)

logger.info(model)
# model = model.to(device, non_blocking=True)

# ToDo: compiler warning: error or tracer error?
writer.add_graph(model, input_to_model=data)  # try to add after training?
writer.flush()

# `run_experiment` reads this module-level `optimizer`; it has to be
# re-created here so that it refers to the parameters of the new `model`.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

#### Training Loop

In [None]:
args.epochs = 200
metrics_log = run_experiment(model=model, dls=(
    dl_train, dl_valid), writer=writer, args=args)  # decide about format

#### Performance plots

In [None]:
metrics = pd.DataFrame(metrics_log)
metrics.plot(
    figsize=(18, 6))

In [None]:
metrics[('valid', 'norm_loss')] = metrics[(
    'valid', 'loss')] / len(dl_valid.dataset)
metrics[('train', 'norm_loss')] = metrics[(
    'train', 'loss')] / len(dl_train.dataset)

selected = [(_split, _metric)
            for _split in ['train', 'valid']
            for _metric in ['norm_loss']
            ]
_ = metrics[selected].plot(
    figsize=(18, 6))

### Tensorboard

- can be run from notebook
- or in a separate process to inspect currently running training loops

In [None]:
# Load the TensorBoard notebook extension
# %load_ext tensorboard

In [None]:
# # first time, it times out, second time it starts, see https://github.com/tensorflow/tensorboard/issues/2481#issuecomment-516819768
# %tensorboard --logdir {tensorboard_model_namer.folder} --host localhost

In [None]:
print(
    f"Run to see updates: \n\n\ttensorboard --logdir {tensorboard_model_namer.folder.absolute()}")

## Hyperparameter comparison

- [x] order data by date: consecutive samples from training to validation
- [ ] check stratification based on machine and column length between splits
    - validation and training data have same proportion of machine types
- [ ] complete meta data reading based on filenames
- [ ] compare performance regarding data normalization
    - in original intensity space (non-log-transformed)
- [ ] compare performance regarding several hyperparameters of VAE (layers, activation, etc)
    - plot different losses in one plot as validation data set is the same
- [ ] increase number of samples in training set and create result plot


- Current optimum for comparison is zero

> The comparison where relatively low abundant, but not super low-abundant peptides will be masked, could skew the comparison.

In [None]:
# writer # new writer
# dls = get_dls(data_in_memory, scaler)
# model = VAE()
# writer =  # new writer for each setup
# metrics = run_experiment(model, dls, writer)
# overview['experiment_name'] = metrics