# Latent space visualization

In [None]:
import logging
from pathlib import Path
from pprint import pprint
from src.nb_imports import *


from fastai.losses import MSELossFlat
from fastai.learner import Learner


import fastai
# from fastai.tabular.all import *

from fastai.basics import *
from fastai.callback.all import *
from fastai.torch_basics import *
from fastai.data.all import *

from fastai.tabular.all import *
from fastai.collab import *

# import fastai.callback.hook # Learner.summary

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import vaep
from vaep.models import ae
from vaep.io.datasets import DatasetWithTarget
from vaep.transform import VaepPipeline
from vaep.io import datasplits
from vaep.io.dataloaders import get_dls, get_test_dl

import src
import src.analyzers as analyzers
from src import config
from src.logging import setup_logger
# Configure the 'vaep' package logger through the project's setup helper and
# keep the returned logger for messages emitted by this notebook.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 03 - Analysis of latent spaces and performance comparisions")

figures = {}  # collection of ax or figures

Papermill script parameters:

In [None]:
# folders
data:str = 'data/msinstrument_in_QE4'  # input folder, later read with DataSplits.from_folder
out_folder:str = 'experiment_14'  # folder that saved figures are written to
# training
n_training_samples_max:int = 1000  # stored on args; NOTE(review): no sub-sampling is visible in this notebook
epochs_max:int = 10  # number of epochs passed to fit_one_cycle
batch_size:int = 32  # stored on args; NOTE(review): not visibly forwarded to get_dls
cuda:bool=True  # if True, the collaborative-filtering learner is moved with learn.cuda()
# model
latent_dim:int = 10  # latent dimension used for the collab, DAE and VAE models
hidden_layers:int = 1  # stored on args; NOTE(review): not visibly used by the models in this notebook

Some argument transformations

In [None]:
# Collect the notebook parameters on a single config object. Each module-level
# parameter name is deleted right after it has been copied, so later cells
# have to go through `args` (this also frees the name `data`, which is reused
# for the loaded data splits).
args = config.Config()
args.data = Path(data); del data  # folder parameters become Path objects
args.out_folder = Path(out_folder); del out_folder
args.n_training_samples_max = n_training_samples_max; del n_training_samples_max
args.epochs_max = epochs_max; del epochs_max
args.batch_size = batch_size; del batch_size
args.cuda = cuda; del cuda
args.latent_dim = latent_dim; del latent_dim
args.hidden_layers = hidden_layers; del hidden_layers

# Last expression of the cell: display the collected configuration.
args

## Load data in long format

In [None]:
# Load the data splits from the folder given by `args.data`.
data = datasplits.DataSplits.from_folder(args.data) 
# TODO: select at most args.n_training_samples_max training samples (not done in this cell)

- The data representation is not too easy to work with yet.
- Should the validation and test y (the cases imputed using replicates) only be generated in an application,
  in order to keep unmanipulated data separate from imputed values?

In [None]:
# data  # uncomment to see the current representation

data is loaded in long format

In [None]:
data.train_X.sample(5)

## Initialize Comparison

- Replicates idea for truly missing values: define the truth by using n=3 replicates to impute
  each sample.
- Real test data: not used for predictions or early stopping.

In [None]:
# Build the reference values for truly missing test entries by interpolating
# 'test_X', and check that they agree with the stored `data.test_y`.
# data.test_y
test_pred_real_na = data.interpolate('test_X') # "gold standard"
# data.test_y.index.difference(test_predictions_real_na.index) # empty
# test_predictions_real_na.compare(data.test_y) # some weird bug, maybe due to multi-index?
# are indices exactly the same? seems so
# sorting values and visual inspection: the values seem to be the same
# conclusions: floating point differences, which can be safely ignored
# NOTE(review): this compares the *signed* sum of the differences, so positive
# and negative deviations can cancel each other out -- consider summing
# absolute differences instead.
assert abs((test_pred_real_na - data.test_y).sum()) < 0.000001


# Keep the reference as a one-column frame named 'replicated'; model
# predictions are added as further columns later in the notebook.
test_pred_real_na = test_pred_real_na.to_frame(name='replicated')
# test_pred_real_na = data.test_y
test_pred_real_na.sort_index(inplace=True)
test_pred_real_na

In [None]:
# Observed test values become the 'measured' reference column; model
# predictions for the same entries are added as further columns later on.
test_pred_observed = data.test_X.to_frame('measured')
test_pred_observed.sort_index(inplace=True)

In [None]:
# PCA plot of the raw training data. The data is handed over in long format
# (is_wide_format=False); presumably it is unstacked on 'peptide' inside the
# analyzer -- TODO confirm against AnalyzePeptides.
ana_train_X = analyzers.AnalyzePeptides(data=data.train_X, is_wide_format=False, ind_unstack='peptide')
# ana_train_X.df.set_index('peptide', append=True, inplace=True)
# ana_train_X.df.reset_index(inplace=True)
figures['pca_train'] = ana_train_X.plot_pca()
# Save the figure into the output folder; the file name ends with the analyzer's fname_stub.
vaep.savefig(figures['pca_train'], args.out_folder / f'pca_plot_raw_data_{ana_train_X.fname_stub}')

## Collaborative Filtering

In [None]:
# class CollabAnalysis(Analysis):   
#     def __init__(datasplits:DataSplits,
#                  sample_column='Sample ID',
#                  item_column='peptide',
#                  target_column='intensity')
    
ana_collab = Analysis()
# Stack training and validation rows into one table (training rows first, so
# positional splits can address them) and turn the index levels into columns.
# `pd.concat` replaces `.append`, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
ana_collab.X = pd.concat([data.train_X, data.val_X]).reset_index()

# idx_splitter = IndexSplitter(list(range(len(data.train_X), len(data.train_X)+ len(data.val_X) )))
# splits = idx_splitter(ana_collab.X)
def get_splits(train_df, val_df):
    """Return positional index lists for a train/validation split.

    The indices assume that the validation rows are stacked directly below
    the training rows in one combined table.

    Parameters
    ----------
    train_df, val_df
        Sized objects (anything supporting ``len``); only their lengths are used.

    Returns
    -------
    list of two lists
        ``[train_indices, valid_indices]`` with
        ``train_indices == [0, ..., N_train - 1]`` and
        ``valid_indices == [N_train, ..., N_train + N_valid - 1]``.
    """
    n_train, n_valid = len(train_df), len(val_df)
    # The validation range has to stop at n_train + n_valid. The previous
    # `range(N_train, N_train, N_valid)` used N_valid as a step and was
    # always empty, i.e. the validation split contained no rows.
    return [list(range(n_train)), list(range(n_train, n_train + n_valid))]

# Positional train/validation indices into the combined table `ana_collab.X`.
splits = get_splits(data.train_X, data.val_X)
# Tabular object for collaborative filtering: 'Sample ID' and 'peptide' are
# the categorical inputs (processed by Categorify), 'intensity' is the target.
to = TabularCollab(df=ana_collab.X,
                   procs=[Categorify],
                   cat_names=['Sample ID','peptide'],
                   y_names=['intensity'],
                   y_block=TransformBlock(),
                   splits=splits)
ana_collab.dls = to.dataloaders(path='.')

In [None]:
# Column roles of the collaborative-filtering table (both stored as lists).
ana_collab.cat_columns = 'peptide,Sample ID'.split(',')
ana_collab.target_column = 'intensity'.split(',')

# Model settings assembled in one literal (same keys, same insertion order).
ana_collab.model_kwargs = {
    'n_samples': len(ana_collab.dls.classes['Sample ID']),
    'n_peptides': len(ana_collab.dls.classes['peptide']),
    'dim_latent_factors': args.latent_dim,
    # int-truncated minimum and int-truncated maximum + 1 of the training values
    'y_range': (int(data.train_X.min()), int(data.train_X.max()) + 1),
}
print("Args:")
pprint(ana_collab.model_kwargs)

In [None]:
# Collaborative-filtering model: dot product of embeddings with bias terms,
# built from the category classes of the dataloaders.
# model = EmbeddingDotBias.from_classes(**ana_collab.model_kwargs)
model = EmbeddingDotBias.from_classes(
    n_factors=ana_collab.model_kwargs['dim_latent_factors'],
    classes=ana_collab.dls.classes,
    y_range=ana_collab.model_kwargs['y_range'])
learn = Learner(dls=ana_collab.dls, model=model, loss_func=MSELossFlat())
# Only move the learner with .cuda() when requested via the `cuda` parameter.
if args.cuda:
    learn.cuda()
learn.summary()

## Training

## Data in wide format

- Autoencoders need the data in wide format

In [None]:
# Switch the data splits to wide format. The return value is not used, so the
# following cells rely on `data` being changed in place.
data.to_wide_format()
data.val_X.head()

## Denoising Autoencoder

### DataLoaders

In [None]:
# Preprocessing for the denoising autoencoder: standardize first, then fill
# missing values with SimpleImputer (default strategy, no indicator columns).
dae_default_pipeline = sklearn.pipeline.Pipeline(
    [
        ('normalize', StandardScaler()),
        ('impute', SimpleImputer(add_indicator=False))
    ])

# Wrap the pipeline for DataFrame use, set up on the training data.
# Presumably only the 'normalize' step is inverted when decoding predictions
# -- TODO confirm against VaepPipeline.
dae_transforms = VaepPipeline(
    df_train=data.train_X, encode=dae_default_pipeline, decode=['normalize'])

# Training and validation dataloaders sharing the same transformer.
dls = get_dls(data.train_X, data.val_X, transformer=dae_transforms)

### Model

In [None]:
# Size of the last axis of the (wide) training data = number of input features.
M = data.train_X.shape[-1]

# Autoencoder with a hidden width of half the feature count and no activation
# on the last decoder layer.
model = ae.Autoencoder(
    n_features=M,
    n_neurons=int(M / 2),
    dim_latent=args.latent_dim,
    last_decoder_activation=None,
)

### Learner

In [None]:
# fastai Learner for the autoencoder with a flattened MSE loss; the project's
# ModelAdapter callback is attached (presumably it adapts batches and targets
# for the autoencoder -- TODO confirm in vaep.models.ae).
learn = Learner(dls=dls, model=model,
                loss_func=MSELossFlat(), cbs=ae.ModelAdapter())

In [None]:
learn.show_training_loop()

In [None]:
learn.summary()

In [None]:
# Run the learning-rate finder; the `.valley` attribute of the result is used
# as `lr_max` for training below.
suggested_lr = learn.lr_find()
suggested_lr

### Training


In [None]:
# Train with the one-cycle policy for `args.epochs_max` epochs, using the
# learning rate suggested by the finder as the maximum.
learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley)

### Predictions
- test dataset

In [None]:
# Test DataLoader using the same transformer as the training data. It is kept
# as the module-level `dl_test`, whose dataset mask is read in later cells.
dl_test = get_test_dl(df=data.test_X, transformer=dae_transforms, dataset=DatasetWithTarget)

Validation data

In [None]:
# No `dl` is passed, so the learner's default dataloader for predictions is
# used (the validation data, as the heading above says). `act=noop` keeps
# the raw model outputs.
pred, target = learn.get_preds(act=noop, concat_dim=0, reorder=False)
len(pred), len(target)

Test data

In [None]:
# Same call as for the validation data, but on the explicit test DataLoader.
pred, target = learn.get_preds(dl=dl_test, act=noop, concat_dim=0, reorder=False)
len(pred), len(target)

In [None]:
"""
Returns
-------
    (pred, target)
"""
def get_preds_from_df(df, learn, transformer, dataset=DatasetWithTarget):
    """Predict for the rows of `df` and return the results as DataFrames.

    Parameters
    ----------
    df
        Wide-format DataFrame; its index and columns label the results.
    learn
        Learner providing `get_preds`.
    transformer
        Transformer used to build the DataLoader and to map the results back
        through `inverse_transform`.
    dataset
        Dataset class handed to `get_test_dl`.

    Returns
    -------
        (pred, target)
    """
    dl = get_test_dl(df=df, transformer=transformer, dataset=dataset)
    # Predict on the DataLoader built from `df`. The previous version passed
    # the notebook-global `dl_test`, which silently ignored `df` (and
    # `transformer`) for the model input.
    res = learn.get_preds(dl=dl, concat_dim=0, reorder=False)
    # Label the raw outputs like `df`, then undo the transformation.
    res = L(res).map(lambda x: pd.DataFrame(x, index=df.index, columns=df.columns))
    res = L(res).map(lambda x: transformer.inverse_transform(x))
    return res

# Predictions and targets for the test data, mapped back by the transformer.
res = get_preds_from_df(df=data.test_X, learn=learn, transformer=dae_transforms)

# list(map(lambda x: x.shape, res))
# Display the shape of every returned frame.
L(res).map(lambda x: x.shape)

Some sanity checks
- these need to be moved into tests

In [None]:
# Unpack the two result frames; both must have one row per test row.
pred, target = res
assert len(data.test_X) == len(pred) == len(target)

In [None]:
# Check that the dataset mask marks exactly the missing entries of the test
# data. Builtin `all()` over a DataFrame iterates its column labels (truthy
# for non-empty strings), so the previous `all(...)` call could not fail;
# reduce over every cell of the comparison instead.
# (assumes the comparison yields a DataFrame or an array -- TODO confirm the
# type of `mask_obs`)
(dl_test.dataset.mask_obs == data.test_X.isna()).all(axis=None)

In [None]:
# should this be a function?
test_pred_observed_dae = pred[~dl_test.dataset.mask_obs].stack()
test_pred_real_na_dae = pred[dl_test.dataset.mask_obs].stack()
assert len(test_pred_real_na_dae) + len(test_pred_observed_dae) == reduce(mul, data.test_X.shape)

_diff_index = test_pred_real_na_dae.index.difference(test_pred_real_na.index)
if len(_diff_index):   
    print("Some predictions couldn't be generated using the approach using artifical replicates.\n"
         "These will be omitted for evaluation.")
    for _index in _diff_index:
        print(f"{_index[0]:<40}\t {_index[1]:<40}")
        
test_pred_observed['DAE'] = test_pred_observed_dae
test_pred_real_na['DAE'] = test_pred_real_na_dae

### Plots

## Variational Autoencoder

### DataLoaders

In [None]:
# Preprocessing for the variational autoencoder: min-max scaling (instead of
# the standardization used for the DAE), followed by SimpleImputer.
vae_default_pipeline = sklearn.pipeline.Pipeline(
    [
        ('normalize', MinMaxScaler()),
        ('impute', SimpleImputer(add_indicator=False))
    ])

# Presumably only the 'normalize' step is inverted when decoding predictions
# -- TODO confirm against VaepPipeline.
vae_transforms = VaepPipeline(
    df_train=data.train_X, encode=vae_default_pipeline, decode=['normalize'])

# NOTE: rebinds the module-level `dls` that was used for the DAE above.
dls = get_dls(data.train_X, data.val_X, transformer=vae_transforms)

### Model

In [None]:
from torch.nn import Sigmoid

# Size of the last axis of the (wide) training data = number of input features.
M = data.train_X.shape[-1]
# VAE with a Sigmoid as last decoder activation (outputs in (0, 1), in line
# with the min-max scaled inputs) and no activation on the last encoder layer.
model = ae.VAE(n_features=M,
               n_neurons=int(M/2),
               last_encoder_activation=None,
               last_decoder_activation=Sigmoid,
               dim_latent=args.latent_dim)

### Training

In [None]:
# Learner for the VAE with the project's VAE loss function and the
# VAE-specific adapter callback.
# NOTE: rebinds the module-level `learn` that was used for the DAE above.
learn = Learner(dls=dls,
                model=model,
                loss_func=ae.loss_fct_vae,
                cbs=ae.ModelAdapterVAE())

learn.show_training_loop()
learn.summary()

In [None]:
# Run the learning-rate finder for the VAE; the `.valley` attribute of the
# result is used as `lr_max` for training below.
suggested_lr = learn.lr_find()
suggested_lr

In [None]:
# Train the VAE with the one-cycle policy for `args.epochs_max` epochs.
learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley)

### Predictions

In [None]:
# Test DataLoader built with the VAE transformer (rebinds the module-level
# `dl_test`).
dl_test = get_test_dl(df=data.test_X, transformer=vae_transforms, dataset=DatasetWithTarget)

pred, target = learn.get_preds(dl=dl_test, act=noop, concat_dim=0, reorder=False)
# `pred` is indexed with [0] here: the VAE predictions apparently come as a
# sequence of outputs with the reconstruction first -- TODO confirm.
len(pred[0]), len(target)

In [None]:
def get_preds_from_df(df, learn, transformer, 
                      position_pred_tuple=None,
                      dataset=DatasetWithTarget):
    """Predict for the rows of `df` and return the results as DataFrames.

    Parameters
    ----------
    df
        Wide-format DataFrame; its index and columns label the results.
    learn
        Learner providing `get_preds`.
    transformer
        Transformer used to build the DataLoader and to map the results back
        through `inverse_transform`.
    position_pred_tuple
        If given and the model's predictions are a tuple, the position of
        the element of that tuple to keep as the prediction.
    dataset
        Dataset class handed to `get_test_dl`.

    Returns
    -------
        (pred, target)
    """
    dl = get_test_dl(df=df, transformer=transformer, dataset=dataset)
    # Predict on the DataLoader built from `df`. The previous version passed
    # the notebook-global `dl_test`, which silently ignored `df` (and
    # `transformer`) for the model input.
    res = learn.get_preds(dl=dl, concat_dim=0, reorder=False)
    if position_pred_tuple is not None and isinstance(res[0], tuple):
        # Keep only the selected element of the prediction tuple.
        res = (res[0][position_pred_tuple], *res[1:])
        print("done")
    # Label the raw outputs like `df`, then undo the transformation.
    res = L(res).map(lambda x: pd.DataFrame(x, index=df.index, columns=df.columns))
    res = L(res).map(lambda x: transformer.inverse_transform(x))
    return res

# VAE predictions for the test data; position 0 of the prediction tuple is
# kept as the prediction.
res = get_preds_from_df(df=data.test_X, learn=learn, 
                        position_pred_tuple=0,
                        transformer=vae_transforms)
# res[1]

In [None]:
def split_prediction_by_mask(pred, mask, check_keeps_all:bool=False):
    """Split predictions into the parts outside and inside of `mask`.

    Assumes `pred` is a DataFrame and `mask` a boolean frame/array of the
    same shape -- TODO confirm against callers.

    Parameters
    ----------
    pred
        Predictions in wide format.
    mask
        Boolean mask; `pred[~mask]` forms the first result, `pred[mask]` the
        second.
    check_keeps_all
        If True, assert that the lengths of both stacked parts add up to the
        number of cells of `pred`.

    Returns
    -------
    tuple
        ``(part outside the mask, part inside the mask)``, both stacked.
    """
    outside_mask = pred[~mask].stack()
    inside_mask = pred[mask].stack()
    if not check_keeps_all:
        return outside_mask, inside_mask
    n_cells = reduce(mul, pred.shape)
    assert len(inside_mask) + len(outside_mask) == n_cells
    return outside_mask, inside_mask

# Split the VAE predictions with the mask of the test dataset and verify
# that no cell gets lost in the split.
test_pred_observed_vae, test_pred_real_na_vae = split_prediction_by_mask(pred=res[0],
                         mask=dl_test.dataset.mask_obs,
                         check_keeps_all=True)

def compare_indices(first_index, second_index):
    """Print the entries of `first_index` that are missing in `second_index`.

    Nothing is printed when there are no such entries. Each reported entry
    is printed via its first two elements, so entries are expected to be
    (at least) two-level tuples. Returns None.
    """
    only_in_first = first_index.difference(second_index)
    if not len(only_in_first):
        return
    print("Some predictions couldn't be generated using the approach using artifical replicates.\n"
          "These will be omitted for evaluation.")
    for entry in only_in_first:
        print(f"{entry[0]:<40}\t {entry[1]:<40}")

# Report VAE predictions for which the replicate-based reference has no value.
compare_indices(test_pred_real_na_vae.index, test_pred_real_na.index)

# Add the VAE predictions as new columns; assignment aligns on the index of
# the existing frames.
test_pred_observed['VAE'] = test_pred_observed_vae
test_pred_real_na['VAE'] = test_pred_real_na_vae

### Plots

## Comparison

In [None]:
test_pred_observed.head()

In [None]:
# df_pred = test_pred_observed
# y_true = df_pred.columns[0] # implicit assumption, default None
# y_true = df_pred.iloc[:,0]
# pred_columns = df_pred.columns[1:]

import sklearn.metrics as sklm

# (name, function) pairs; each function is called as f(y_true=..., y_pred=...)
# and its name becomes a row label of the metrics table.
scoring = [('MSE', sklm.mean_squared_error),
           ('MAE', sklm.mean_absolute_error)]

def get_metrics_df(pred_df, true_col=None, scoring=scoring):
    """Score every prediction column of `pred_df` against the truth column.

    Parameters
    ----------
    pred_df
        DataFrame holding one column of true values and one column per model.
    true_col
        Column with the true values: None selects the first column, an int
        is taken as a column position and a str as a column label.
    scoring
        Iterable of ``(name, function)`` pairs; each function is called as
        ``function(y_true=..., y_pred=...)``.

    Returns
    -------
    pandas.DataFrame
        One column per model, one row per metric name.

    Raises
    ------
    ValueError
        If `true_col` is neither None, an int nor a str.
    """
    if true_col is None:
        # assume first column is truth if None is given
        y_true = pred_df.iloc[:, 0]
        print(f'Selected as truth to compare to: {y_true.name}')
        y_pred = pred_df.iloc[:, 1:]
    else:
        if isinstance(true_col, int):
            y_true = pred_df.iloc[:, true_col]
        elif isinstance(true_col, str):
            y_true = pred_df[true_col]
        else:
            raise ValueError(f'true_col has to be of type str or int, not {type(true_col)}')
        # All remaining columns are model predictions. Previously `y_pred`
        # was never assigned on this path, so the loop below raised a
        # NameError whenever `true_col` was given.
        y_pred = pred_df.drop(y_true.name, axis=1)

    metrics = {}
    for model_key in y_pred:
        model_pred = y_pred[model_key]
        # Score only the entries the model actually has a prediction for.
        model_pred_no_na = model_pred.dropna()
        if len(model_pred) > len(model_pred_no_na):
            dropped = model_pred.index.difference(model_pred_no_na.index)
            # The message used to reference an undefined name (`col`).
            logger.info(
                f"Drop indices for {model_key}: {[(idx[0], idx[1]) for idx in dropped]}")

        metrics[model_key] = {
            k: f(y_true=y_true.loc[model_pred_no_na.index], y_pred=model_pred_no_na)
            for k, f in scoring
        }
    return pd.DataFrame(metrics)
# metrics.to_csv(folder / f'exp_02_metrics.csv',
#                float_format='{:.3f}'.format)
# Metrics for the observed test values; the first column ('measured') is
# taken as the truth.
metrics = get_metrics_df(pred_df = test_pred_observed)
# Order the model columns by their metric values (rows named in `scoring`).
metrics.sort_values(by=[k for k, f in scoring], axis=1)

In [None]:
# Metrics for the truly missing test values; the first column ('replicated')
# is taken as the truth.
metrics = get_metrics_df(pred_df = test_pred_real_na)
metrics.sort_values(by=[k for k, f in scoring], axis=1)

In [None]:
# analysis per sample?
# analysis per peptide?
# (test_pred_real_na['replicated'] - test_pred_real_na['DAE']).sort_values().plot(rot=45)

## Config

In [None]:
args