# Latent space visualization

In [None]:
import logging
from pathlib import Path
from pprint import pprint
from src.nb_imports import *

from typing import Union

from fastai.losses import MSELossFlat
from fastai.learner import Learner


import fastai
# from fastai.tabular.all import *

from fastai.basics import *
from fastai.callback.all import *
from fastai.torch_basics import *
from fastai.data.all import *

from fastai.tabular.all import *
from fastai.collab import *

# overwriting Recorder callback with custom plot_loss
from vaep.models import plot_loss
from fastai import learner
learner.Recorder.plot_loss = plot_loss
# import fastai.callback.hook # Learner.summary

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import vaep
import vaep.model
import vaep.models as models
from vaep.models import ae
from vaep.models import collab as vaep_collab
from vaep.io.datasets import DatasetWithTarget
from vaep.transform import VaepPipeline
from vaep.io import datasplits
from vaep.io.dataloaders import get_dls, get_test_dl
from vaep import sampling

import src
import src.analyzers as analyzers
from src import config
from src.logging import setup_logger
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 03 - Analysis of latent spaces and performance comparisions")

figures = {}  # collection of ax or figures

Papermill script parameters:

In [None]:
# folders
data:str = 'data/msinstrument_in_QE4' # Datasplit folder with data for experiment
out_folder:str = 'runs/experiment_14' # Output folder to store all figures and metrics
# training
n_training_samples_max:int = 1000 # Maximum number of training samples to use for training. Take most recent
epochs_max:int = 10  # Maximum number of epochs
early_stopping:bool = True # Whether to use early stopping or not
batch_size:int = 32 # Batch size for training (and evaluation)
cuda:bool=True # Use the GPU for training?
# model
latent_dim:int = 10 # Dimensionality of encoding dimension (latent space of model)
hidden_layers:Union[int,str] = 1 # A space separated string of layers, '50 20' for the encoder, reverse will be used for decoder
force_train:bool = True # Force training when saved model could be used

Some argument transformations

In [None]:
# Move the notebook-level (papermill) parameters onto a single `args` object and
# delete the loose globals so later cells cannot accidentally use a stale copy.
# NOTE(review): the `early_stopping` parameter is neither stored on `args` nor
# consulted when the learners are built below - confirm whether this is intended.
args = config.Config()
args.data = Path(data); del data
args.out_folder = Path(out_folder); del out_folder
args.n_training_samples_max = n_training_samples_max; del n_training_samples_max
args.epochs_max = epochs_max; del epochs_max
args.batch_size = batch_size; del batch_size
args.cuda = cuda; del cuda
args.latent_dim = latent_dim; del latent_dim
args.force_train = force_train; del force_train

print(hidden_layers)
if isinstance(hidden_layers, int):
    # an integer is the *number* of hidden layers; the layer sizes are derived
    # once the number of features is known
    args.hidden_layers = hidden_layers
elif isinstance(hidden_layers, str):
    # a space separated string such as '50 20' lists the layer sizes explicitly;
    # convert to integers so they match the integer sizes produced for the int case
    args.hidden_layers = [int(n_neurons) for n_neurons in hidden_layers.split()]
else:
    raise ValueError(f"hidden_layers is of unknown type {type(hidden_layers)}")
del hidden_layers
args

## Load data in long format

In [None]:
data = datasplits.DataSplits.from_folder(args.data) 
# select max_train_samples

- data representation is not too easy yet
- should validation and test y (the imputed cases using replicates) only be generated in an application to 
  keep unmanipulated data separate from imputed values?

In [None]:
# data # uncommet to see current representation

data is loaded in long format

In [None]:
data.train_X.sample(5)

meta data for splits

In [None]:
# data.train_meta # needs to be created

## Initialize Comparison

- replicates idea for truly missing values: define the truth by using n=3 replicates to impute
  each sample
- real test data:
    - Not used for predictions or early stopping.
    - [x] add some additional NAs based on distribution of data

In [None]:
freq_peptides = sampling.frequency_by_index(data.train_X, 0)
freq_peptides.head() # training data

Produce some additional fake samples

In [None]:
# execute only once, add to DataSplits?
data.test_X, data.test_y_fake = sampling.sample_data(data.test_X, sample_index_to_drop=0, weights=freq_peptides)
data.test_y_fake

In [None]:
# Interpolated values for the test split serve as the "gold standard".
test_pred_ = data.interpolate('test_X')
# Notes on comparing this interpolation with the stored `data.test_y`:
#   - `data.test_y.index.difference(<interpolated>.index)` was empty and the indices
#     seemed to be exactly the same; `compare` hit a weird bug (maybe due to the multi-index?)
#   - after sorting, the values looked the same on visual inspection, i.e. only
#     floating point differences, which can be safely ignored
#   - the values do not stay the same, as more missing values lead to a different
#     interpolation; therefore the following check is disabled:
#       in_both = test_pred_.index.intersection(data.test_y.index)
#       assert abs((test_pred_.loc[in_both] - data.test_y.loc[in_both]).dropna().sum()) < 0.000001


def in_both(a, b):
    """Return the index labels that are present in both `a` and `b`."""
    return a.index.intersection(b.index)


test_pred_fake_na = data.test_y_fake.to_frame(name='observed')
test_pred_fake_na['replicated'] = test_pred_.loc[
    in_both(test_pred_, test_pred_fake_na)
]  # interpolated?
test_pred_fake_na

In [None]:
data.test_y = test_pred_.loc[test_pred_.index.difference(data.test_y_fake.index)]
test_pred_real_na = data.test_y.to_frame(name='replicated')
# test_pred_real_na = data.test_y
test_pred_real_na.sort_index(inplace=True)
test_pred_real_na

In [None]:
test_pred_observed = data.test_X.to_frame('measured')
test_pred_observed.sort_index(inplace=True)

And predictions on validation (to see if the test data performs worse than the validation data, which was only used for early stopping)
- possibility to also mask some predictions for model

In [None]:
valid_pred = data.val_X.to_frame('measured')
valid_pred

### PCA plot of training data

- [x] add validation and test data in view

In [None]:
ana_train_X = analyzers.AnalyzePeptides(data=data.train_X, is_wide_format=False, ind_unstack='peptide')
# ana_train_X.df.set_index('peptide', append=True, inplace=True)
# ana_train_X.df.reset_index(inplace=True)
figures['pca_train'] = ana_train_X.plot_pca()
vaep.savefig(figures['pca_train'], args.out_folder / f'pca_plot_raw_data_{ana_train_X.fname_stub}')

In [None]:
# Project all three splits onto the principal components fitted on the training
# data and remember for every sample which split it came from.
# add to DataSplits a inputs attribute
data_dict = {'train': data.train_X, 'valid': data.val_X, 'test': data.test_X}
# `DataFrame.append` / `Series.append` were removed in pandas 2.0: collect the
# pieces first and concatenate once.
pcs_per_split = []
labels_per_split = []
for key, df in data_dict.items():
    df = df.unstack()
    pcs_per_split.append(ana_train_X.calculate_PCs(df))
    labels_per_split.append(pd.Series(key, index=df.index))
PCs = pd.concat(pcs_per_split)
split_map = pd.concat(labels_per_split)

fig, ax = plt.subplots(figsize=(15,8))
ax.legend(title='splits')
analyzers.seaborn_scatter(PCs, fig, ax, meta=split_map,
                          title='First two principal compements (based on training data PCA)')
ax.get_legend().set_title("split")

For *Collaborative Filtering*, new samples could be initialized based on a KNN approach in the original sample space or the reduced PCA dimension.
  - The sample embeddings of the K nearest neighbours could be averaged for a new sample

In [None]:
from sklearn.neighbors import NearestNeighbors

train_PCs = ana_train_X.calculate_PCs(data.train_X.unstack())
test_PCs = ana_train_X.calculate_PCs(data.test_X.unstack())
nn = NearestNeighbors().fit(train_PCs)

In [None]:
d, idx = nn.kneighbors(test_PCs.iloc[1:2])
# test_PCs.iloc[1]
idx

In [None]:
train_PCs.iloc[idx[0]]

## Collaborative Filtering

In [None]:
# class CollabAnalysis(Analysis):
#     def __init__(datasplits:DataSplits,
#                  sample_column='Sample ID',
#                  item_column='peptide',
#                  target_column='intensity')

ana_collab = Analysis()
ana_collab.X, ana_collab.frac = vaep_collab.combine_data(
    data.train_X, data.val_X)

# to = TabularCollab(df=ana_collab.X,
#                    procs=[Categorify],
#                    cat_names=['Sample ID','peptide'],
#                    y_names=['intensity'],
#                    y_block=TransformBlock(),
#                    splits=ana_collab.splits)
# ana_collab.dls = to.dataloaders(path='.')

ana_collab.dls = CollabDataLoaders.from_df(ana_collab.X, valid_pct=ana_collab.frac, seed=42,
                                   user_name='Sample ID', item_name='peptide', rating_name='intensity',
                                   bs=args.batch_size*8)

In [None]:
# Column roles of the long-format collab data.
ana_collab.cat_columns = ['peptide', 'Sample ID']
ana_collab.target_column = ['intensity']

# Model hyperparameters: vocabulary sizes come from the fitted dataloaders, the
# prediction range spans the integer bounds of the training intensities.
ana_collab.model_kwargs = {
    'n_samples': len(ana_collab.dls.classes['Sample ID']),
    'n_peptides': len(ana_collab.dls.classes['peptide']),
    'dim_latent_factors': args.latent_dim,
    'y_range': (int(data.train_X.min()), int(data.train_X.max()) + 1),
}
print("Args:")
pprint(ana_collab.model_kwargs)

In [None]:
# model = EmbeddingDotBias.from_classes(**ana_collab.model_kwargs)
model = EmbeddingDotBias.from_classes(
    n_factors=ana_collab.model_kwargs['dim_latent_factors'],
    classes=ana_collab.dls.classes,
    y_range=ana_collab.model_kwargs['y_range'])
learn = Learner(dls=ana_collab.dls, model=model, loss_func=MSELossFlat(), 
                cbs=EarlyStoppingCallback(patience=1),
                   model_dir=args.out_folder)
if args.cuda:
    learn.cuda()
# learn.summary() # see comment at DAE

### Training

In [None]:
# Load a previously saved collab model, or train (and save) a new one.
try:
    if args.force_train:
        # jump straight to the training branch below, ignoring any saved model
        raise FileNotFoundError
    learn = learn.load('collab_model')
    logger.info("Loaded saved model")
except FileNotFoundError:
    # no saved model available (or training was forced): find a learning rate,
    # train with the one-cycle policy and store the weights for later runs
    suggested_lr = learn.lr_find()
    print(f"{suggested_lr.valley = :.5f}")
    learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley)
    # learn.fit_one_cycle(args.epochs_max, lr_max=1e-3)
    learn.save('collab_model')

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title('Collab loss: Reconstruction loss')
learn.recorder.plot_loss(skip_start=5, ax=ax)
vaep.savefig(fig, name='collab_training',
                        folder=args.out_folder)

### Predictions
- validation data for collab is a mix of peptides both from the original training and validation data set
- comparison for collab model will therefore not be 1 to 1 comparable with the Autoencoder models on the **validation**  data split

In [None]:
valid_pred_collab = ana_collab.dls.valid_ds.new(ana_collab.dls.valid_ds.all_cols).decode().items
pred, target = learn.get_preds()
valid_pred_collab['intensity_predicted'] = pred.flatten().numpy()
valid_pred_collab = valid_pred_collab.set_index(['Sample ID', 'peptide'])
valid_pred_collab

For predictions on test data, the sample embedding vector has to be initialized manually

In [None]:
learn.cpu()

In [None]:
# learn.classes['Sample ID'].o2i
# learn.classes['Sample ID'].map_ids([1,3])
idx = learn.classes['Sample ID'].map_objs(
    ['20130403_QE4_LC6_TW_QC_MNT_HeLa_2', '20130409_QE4_LC6_TW_QC_MNT_HeLa_2'])
idx = torch.tensor(idx)
idx

In [None]:
learn.u_bias(idx) #.mean(dim=0)

In [None]:
learn.u_weight(idx) #.mean(dim=0)

In [None]:
def collab_dot_product(sample_embeddings: torch.Tensor, sample_bias: torch.Tensor,
                       feat_embeddings: torch.nn.Embedding, feat_bias: torch.nn.Embedding,
                       items: torch.Tensor, y_range=None) -> torch.Tensor:
    """Predict values as biased dot product of sample and feature embeddings.

    Parameters
    ----------
    sample_embeddings : torch.Tensor
        Latent factors of the sample(s); the last dimension has to match the
        embedding dimension of `feat_embeddings`.
    sample_bias : torch.Tensor
        Bias of the sample(s); singleton dimensions are squeezed away.
    feat_embeddings : torch.nn.Embedding
        Lookup of the latent factors of the features, called with `items`.
    feat_bias : torch.nn.Embedding
        Lookup of the feature biases, called with `items`.
    items : torch.Tensor
        Integer indices of the features to predict.
    y_range : sequence of two numbers, optional
        If given, the result is passed through a sigmoid and rescaled to
        the interval ``(y_range[0], y_range[1])``.

    Returns
    -------
    torch.Tensor
        Predictions, detached from the autograd graph.
    """
    dot = sample_embeddings * feat_embeddings(items)
    # reduce over the embedding dimension (always the last one); identical to
    # `sum(1)` for 2-D input, but also valid for a single 1-D embedding
    res = dot.sum(-1) + sample_bias.squeeze() + feat_bias(items).squeeze()
    res = res.detach()
    if y_range is None:
        return res
    return torch.sigmoid(res) * (y_range[1] - y_range[0]) + y_range[0]

In [None]:
# Demo call: `idx` is passed both for the sample lookups and as the item
# indices, i.e. the selected items are arbitrary.
collab_dot_product(learn.u_weight(idx), learn.u_bias(idx),
                   learn.i_weight, learn.i_bias, idx, # this is arbitrary
                   y_range=learn.y_range)

build new embeddings for test data

In [None]:
# Initialise embeddings for the unseen test samples from their nearest training
# samples in PCA space.
d, idx = nn.kneighbors(test_PCs)
# `kneighbors` returns row positions within `train_PCs`, not embedding indices:
# the embedding tables are indexed by the dataloaders' class index (index 0 is
# the '#na#' placeholder). Translate positions -> sample IDs -> class indices.
# Assumes `train_PCs.index` holds the sample IDs - TODO confirm.
neighbour_ids = train_PCs.index.to_numpy()[idx]  # (n_test_samples, n_neighbors)
sample_classes = learn.classes['Sample ID']
idx = torch.tensor([list(sample_classes.map_objs(ids)) for ids in neighbour_ids])

# average (not sum) over the neighbours, so that the new embeddings and biases
# stay on the same scale as the trained ones
test_sample_embeddings = learn.u_weight(idx).mean(1).detach()
test_sample_biases = learn.u_bias(idx).mean(1).detach()

test_sample_embeddings = torch.nn.Embedding.from_pretrained(test_sample_embeddings)
test_sample_biases = torch.nn.Embedding.from_pretrained(test_sample_biases)

In [None]:
# Embeddings and biases of all peptides. Index 0 is skipped (the peptide names
# are likewise taken from `items[1:]` when the predictions are labelled); the
# upper bound follows the number of peptide classes instead of a fixed 1001.
peptide_idx = torch.arange(1, len(learn.classes['peptide']))
feat_embeddings = learn.i_weight(peptide_idx)
feat_biases = learn.i_bias(peptide_idx)

In [None]:
# Predict every peptide for every test sample from the KNN-initialised sample
# embeddings and the trained peptide embeddings.
peptide_classes = learn.classes['peptide']
# index 0 is skipped, matching `items[1:]` below; derive the upper bound from
# the data instead of assuming exactly 1000 peptides
peptide_idx = torch.arange(1, len(peptide_classes))

test_pred_collab = []
for i in range(test_sample_biases.num_embeddings):
    sample_idx = torch.tensor(i)
    x = collab_dot_product(test_sample_embeddings(sample_idx), test_sample_biases(sample_idx),
                           learn.i_weight, learn.i_bias, peptide_idx,
                           y_range=learn.y_range)
    test_pred_collab.append(x)

test_pred_collab = torch.vstack(test_pred_collab)
test_pred_collab = pd.DataFrame(test_pred_collab.numpy(),
                                columns=pd.Index(list(peptide_classes.items[1:]), name='peptide'),
                                index=test_PCs.index)
# long format, so the predictions align with the long-format test frames
test_pred_collab = test_pred_collab.stack()
test_pred_collab.unstack()

In [None]:
test_pred_fake_na['collab'] = test_pred_collab
test_pred_real_na['collab'] = test_pred_collab
test_pred_observed['collab'] = test_pred_collab

## Data in wide format

- Autoencoders need data in wide format

In [None]:
data.to_wide_format()
args.M = data.train_X.shape[-1]
data.val_X.head()

Calculate hidden layer dimensionality based on latent space dimension and number of hidden layers requested:

In [None]:
# If only the *number* of hidden layers was requested, derive their sizes:
# take evenly spaced values between the latent dimension and the number of
# features and keep only the interior points (both end points are dropped).
# NOTE(review): the values are generated starting at `latent_dim` and moving
# towards `M` - confirm that this is the order the model expects for the encoder.
if isinstance(args.hidden_layers , int):
    args.overwrite_entry(entry='hidden_layers', 
                         value=list(
                             np.linspace(args.latent_dim, 
                                         args.M,
                                         2+args.hidden_layers, 
                                         endpoint=True
                                        ).astype(int)[1:-1])
    )
args

## Denoising Autoencoder

### DataLoaders

In [None]:
dae_default_pipeline = sklearn.pipeline.Pipeline(
    [
        ('normalize', StandardScaler()),
        ('impute', SimpleImputer(add_indicator=False))
    ])

dae_transforms = VaepPipeline(
    df_train=data.train_X, encode=dae_default_pipeline, decode=['normalize'])

dls = get_dls(data.train_X, data.val_X, transformer=dae_transforms)

### Model

In [None]:
M = data.train_X.shape[-1]

model = ae.Autoencoder(n_features=M, n_neurons=args.hidden_layers,
                       last_decoder_activation=None, dim_latent=args.latent_dim)
model

### Learner

In [None]:
learn = Learner(dls=dls, model=model,
                loss_func=MSELossFlat(), cbs=[EarlyStoppingCallback(), ae.ModelAdapter()])

In [None]:
learn.show_training_loop()

Adding an `EarlyStoppingCallback` results in an error. A potential fix in [PR3509](https://github.com/fastai/fastai/pull/3509) is not yet in the current version. Try again later.

In [None]:
# learn.summary()

In [None]:
suggested_lr = learn.lr_find()
suggested_lr

### Training


In [None]:
learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley)

In [None]:
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title('DAE loss: Reconstruction loss')
learn.recorder.plot_loss(skip_start=5, ax=ax)
vaep.savefig(fig, name='dae_training',
                        folder=args.out_folder)

### Predictions

- on validation dataset

In [None]:
# DAE predictions on the validation split, stored in long format
res = ae.get_preds_from_df(
    df=data.val_X,
    learn=learn,
    transformer=dae_transforms,
)
pred, target = res
valid_pred['DAE'] = pred.stack()

- on test dataset

In [None]:
res = ae.get_preds_from_df(df=data.test_X, learn=learn, transformer=dae_transforms)

pred, target = res
assert len(data.test_X) == len(pred) == len(target)
# list(map(lambda x: x.shape, res))
L(res).map(lambda x: x.shape)

Some sanity checks 
- needs to go into test

In [None]:
# dl_test = get_test_dl(df=data.test_X, transformer=dae_transforms, dataset=DatasetWithTarget)
# all(dl_test.dataset.mask_obs == data.test_X.isna())

In [None]:
test_pred_observed_dae, test_pred_dae = models.split_prediction_by_mask(pred=pred,
                         mask=data.test_X.isna(),
                         check_keeps_all=True)
test_pred_fake_na_dae = test_pred_dae.loc[test_pred_fake_na.index]
test_pred_real_na_dae = test_pred_dae.loc[test_pred_dae.index.difference(test_pred_fake_na_dae.index)]
_ = models.compare_indices(test_pred_real_na_dae.index, data.test_y.stack().index)

test_pred_observed['DAE'] = test_pred_observed_dae
test_pred_fake_na['DAE'] = test_pred_fake_na_dae
test_pred_real_na['DAE'] = test_pred_real_na_dae

### Plots

- validation data
- [ ] add test data

In [None]:
df_dae_latent = vaep.model.get_latent_space(model.encoder, dl=dls.valid, dl_index=dls.valid.data.index)
df_dae_latent

In [None]:
val_meta = analyzers.build_metadata_df(data.val_X.index)
val_meta

In [None]:
ana_latent_dae = analyzers.LatentAnalysis(df_dae_latent, val_meta, 'DAE', folder=args.out_folder)
figures['latent_DAE_by_date'], ax = ana_latent_dae.plot_by_date()

In [None]:
figures['latent_DAE_by_ms_instrument'], ax = ana_latent_dae.plot_by_category('ms_instrument')

## Variational Autoencoder

### DataLoaders

In [None]:
vae_default_pipeline = sklearn.pipeline.Pipeline(
    [
        ('normalize', MinMaxScaler()),
        ('impute', SimpleImputer(add_indicator=False))
    ])

vae_transforms = VaepPipeline(
    df_train=data.train_X, encode=vae_default_pipeline, decode=['normalize'])

dls = get_dls(data.train_X, data.val_X, transformer=vae_transforms)

### Model

In [None]:
from torch.nn import Sigmoid

M = data.train_X.shape[-1]
model = ae.VAE(n_features=M,
               n_neurons=args.hidden_layers,
               last_encoder_activation=None,
               last_decoder_activation=Sigmoid,
               dim_latent=args.latent_dim)
model

### Training

In [None]:
learn = Learner(dls=dls,
                model=model,
                loss_func=ae.loss_fct_vae,
                cbs=[ae.ModelAdapterVAE(), EarlyStoppingCallback()])

learn.show_training_loop()
# learn.summary() # see comment above under DAE

In [None]:
suggested_lr = learn.lr_find()
suggested_lr

In [None]:
learn.fit_one_cycle(args.epochs_max, lr_max=suggested_lr.valley)

### Predictions

In [None]:
# VAE predictions on the validation split, stored in long format
res = ae.get_preds_from_df(
    df=data.val_X,
    learn=learn,
    position_pred_tuple=0,
    transformer=vae_transforms,
)
pred, target = res
valid_pred['VAE'] = pred.stack()

In [None]:
dl_test = get_test_dl(df=data.test_X, transformer=vae_transforms, dataset=DatasetWithTarget)

pred, target = learn.get_preds(dl=dl_test, act=noop, concat_dim=0, reorder=False)
len(pred[0]), len(target)

In [None]:
res = ae.get_preds_from_df(df=data.test_X, learn=learn, 
                        position_pred_tuple=0,
                        transformer=vae_transforms)
# res[1]

In [None]:
test_pred_observed_vae, test_pred_vae = models.split_prediction_by_mask(pred=res[0],
                         mask=dl_test.dataset.mask_obs,
                         check_keeps_all=True)
test_pred_fake_na_vae = test_pred_vae.loc[test_pred_fake_na.index]
test_pred_real_na_vae = test_pred_vae.loc[test_pred_vae.index.difference(test_pred_fake_na_vae.index)]
_ = models.compare_indices(test_pred_real_na_vae.index, test_pred_real_na.index)

test_pred_observed['VAE'] = test_pred_observed_vae
test_pred_fake_na['VAE'] = test_pred_fake_na_vae
test_pred_real_na['VAE'] = test_pred_real_na_vae

### Plots

- validation data

In [None]:
df_vae_latent = vaep.model.get_latent_space(model.get_mu_and_logvar, dl=dls.valid, dl_index=dls.valid.data.index)
df_vae_latent

In [None]:
_model_key = 'VAE'
ana_latent_vae = analyzers.LatentAnalysis(df_vae_latent, val_meta, _model_key, folder=args.out_folder)
figures[f'latent_{_model_key}_by_date'], ax = ana_latent_vae.plot_by_date()

In [None]:
_cat = 'ms_instrument'
figures[f'latent_{_model_key}_by_{_cat}'], ax = ana_latent_vae.plot_by_category(_cat)

## Comparisons

### Validation data

- all measured (identified, observed) peptides in validation data

> It does not make too much sense to compare collab and AEs,  
> as the setup of training and validation data differs

In [None]:
# Metrics of the collab model on its own validation split. Written to a
# dedicated file: the autoencoder metrics are saved as
# 'metrics_valid_split_observed.csv' right afterwards and would otherwise
# overwrite these results.
metrics = models.get_metrics_df(pred_df=valid_pred_collab)
# metrics.sort_values(by=[k for k, f in scoring], axis=1)
metrics.to_csv(args.out_folder / 'metrics_valid_split_observed_collab.csv',
               float_format='{:.3f}'.format)
metrics

In [None]:
metrics = models.get_metrics_df(pred_df = valid_pred)
# metrics.sort_values(by=[k for k, f in scoring], axis=1)
metrics.to_csv(args.out_folder / f'metrics_valid_split_observed.csv',
               float_format='{:.3f}'.format)
metrics

### Test Datasplit

Fake NAs: Artificially created NAs. Some data was sampled and explicitly set to missing before it was fed to the model for reconstruction.

In [None]:
metrics = models.get_metrics_df(pred_df = test_pred_fake_na)
# metrics.sort_values(by=[k for k, f in scoring], axis=1)
metrics.to_csv(args.out_folder / f'metrics_fake_na.csv',
               float_format='{:.3f}'.format)
metrics

Non missing data, which was fed to the model

In [None]:
metrics = models.get_metrics_df(pred_df = test_pred_observed)
# metrics.sort_values(by=[k for k, f in scoring], axis=1)
metrics.to_csv(args.out_folder / f'metrics_observed.csv',
               float_format='{:.3f}'.format)
metrics

True NA data which was interpolated by other samples
> Comparing to imputation with each other might not be sensible

In [None]:
metrics = models.get_metrics_df(pred_df = test_pred_real_na)
# metrics.sort_values(by=[k for k, f in scoring], axis=1)
metrics.to_csv(args.out_folder / f'metrics_real_na.csv',
               float_format='{:.3f}'.format)
metrics

In [None]:
# analysis per sample?
# analysis per peptide?
# (test_pred_real_na['replicated'] - test_pred_real_na['DAE']).sort_values().plot(rot=45)

In [None]:
# mask = test_pred_observed > 29
# test_pred_df = test_pred_observed.where(~mask, np.nan)
# models.get_metrics_df(test_pred_df.dropna(subset=['measured']))

## Config

In [None]:
args.dump()
args