# Experiment 02

- [ ] mask entries in larger dataset in long-format
- [ ] mask peptides based on their frequency in samples (probability of being observed)
- [ ] create training data set without masked values for each model
    - Denoising AE
    - FNN based on embeddings (Collaborative Filtering)
    - VAE
- [ ] restrict to only a training data split of consecutive data: Increase number of samples.
    - focus on best reconstruction performance
    - mean comparison
    - 

In [None]:
from src.nb_imports import *

from pathlib import Path
from src import metadata

import logging
from src.logging import setup_logger

logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 02")

figures = {}  # collection of ax or figures

ADD_TENSORBOARD = False

## Raw data

In [None]:
# Only some samples have many missing values.
# NOTE: the three assignments below are alternative datasets. Because each one
# overwrites the previous, only the LAST assignment is used by the rest of the
# notebook; reorder or comment out lines to switch dataset.
FN_PEPTIDE_INTENSITIES = Path('data') / 'df_intensities_N_07813_M01000'  # all
FN_PEPTIDE_INTENSITIES = Path('data') / 'df_intensities_N_07637_M01000'  # 60%
FN_PEPTIDE_INTENSITIES = Path('data') / 'df_intensities_N_07285_M01000'  # 90%

In [None]:
# Load the peptide intensities selected above.
# NOTE(review): nrows=None presumably means "read all rows" -- confirm in AnalyzePeptides.
analysis = AnalyzePeptides(fname=FN_PEPTIDE_INTENSITIES, nrows=None)
# Name the column axis; this name becomes the 'peptide' index level of the
# long-format table built further below.
analysis.df.columns.name = 'peptide'
# log2-transform the intensities (presumably applied to analysis.df -- verify in AnalyzePeptides).
analysis.log_transform(np.log2)
analysis  # display the object as cell output

In [None]:
# Some sample names start with a calendar date that does not exist
# (e.g. 2016-11-31 or 2018-02-30). Map these names to corrected ones so the
# date can be parsed from the file name later on.
rename_indices_w_wrong_dates = {'20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_03': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_03',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_12': '20180330_QE10_nLC0_MR_QC_MNT_Hela_12',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_01': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_01',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_11': '20180330_QE10_nLC0_MR_QC_MNT_Hela_11',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_02': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_02'}
# Rename the row labels (sample names) of the intensity table in place.
analysis.df.rename(index=rename_indices_w_wrong_dates, inplace=True)

- biological stock differences in PCA plot. Show differences in models. Only see biological variance

In [None]:
# Derive metadata from the sample names (row index of the intensity table).
d_meta = metadata.get_metadata_from_filenames(analysis.df.index)
# orient='index': the keys of d_meta become the rows of the metadata table.
# Columns used later in this notebook: 'ms_instrument' and 'date'.
analysis.df_meta = pd.DataFrame.from_dict(
    d_meta, orient='index')
analysis.df_meta

Used to find date parsing errors; the sample names found this way were added to the renaming dictionary above.

In [None]:
# invalid_dates = pd.to_datetime(analysis.df_meta.date, errors='coerce').isna()
# display(analysis.df_meta.loc[invalid_dates])
# {i : i for i in analysis.df_meta.loc[invalid_dates].index} # to rename

In [None]:
analysis.df_meta.describe()

See rare instrument types (potential labeling errors)

In [None]:
# Show the samples measured on rarely occurring instruments.
N_MIN_INSTRUMENT = 10
# Number of samples per instrument label.
ms_instruments = analysis.df_meta.ms_instrument.value_counts()
# Keep the labels that occur more than N_MIN_INSTRUMENT times ...
ms_instruments = ms_instruments[ms_instruments > N_MIN_INSTRUMENT].index
# ... and select all samples whose label is NOT among these frequent ones.
mask = ~analysis.df_meta.ms_instrument.isin(ms_instruments)
analysis.df_meta.loc[mask]

### PCA plot of raw data

In [None]:
# PCA of the complete (imputed) dataset, plotted twice: coloured by
# instrument and coloured by measurement date.
import itertools
from sklearn.impute import SimpleImputer
# PCA needs complete data: fill missing values with SimpleImputer
# (scikit-learn default strategy: column mean).
X = SimpleImputer().fit_transform(analysis.df)
# fit_transform returns a bare array; presumably this helper re-attaches the
# index/columns of analysis.df -- TODO confirm in vaep.pandas.
X = vaep.pandas._add_indices(X, analysis.df)
assert X.isna().sum().sum() == 0

pca = analyzers.run_pca(X)
# Remember the component columns BEFORE the metadata column is added below.
cols = list(pca.columns)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(
    15, 20), constrained_layout=True)
fig.suptitle('First two Principal Components of entire dataset', fontsize=30)


# by instrument
ax = axes[0]
pca['ms_instrument'] = analysis.df_meta['ms_instrument'].astype('category')
# One scatter call per instrument so each gets its own colour and legend entry.
for name, group in pca.groupby('ms_instrument'):
    ax.scatter(x=group[cols[0]], y=group[cols[1]], label=name)
# NOTE(review): the title says 'by category' although the grouping is by instrument.
ax.set_title('by category', fontsize=18)
# Place the legend at the right edge, slightly outside the axes.
ax.legend(loc='center right', bbox_to_anchor=(1.11, 0.5))

# by dates
ax = axes[1]
ax.set_title('by date', fontsize=18)
# errors='raise': presumably forwarded to the date parsing so that invalid
# dates fail loudly -- verify in analyzers.scatter_plot_w_dates.
path_collection = analyzers.scatter_plot_w_dates(
    ax, pca, dates=analysis.df_meta.date, errors='raise')
path_collection = analyzers.add_date_colorbar(path_collection, fig)

## Long format

- Data in long format: (peptide, sample_id, intensity)
- no missing values kept
- 

In [None]:
def get_long_format(self, colname_values='intensity', inplace=False):
    """Build the long-format table from the wide table ``self.df``.

    The wide table is unstacked, missing values are dropped and the remaining
    values are stored in a single column named ``colname_values``. The index
    level ``'Sample ID'`` is moved into a column of the same name; the other
    level (the column labels of ``self.df``) stays as index.

    Requires the row index of ``self.df`` to be named ``'Sample ID'``.

    Parameters
    ----------
    self : object
        Any object with a pandas DataFrame attribute ``df``.
    colname_values : str
        Name of the column holding the values.
    inplace : bool
        If True, store the result as ``self.df_long`` and return None.

    Returns
    -------
    pandas.DataFrame or None
        The long-format table if ``inplace`` is False, otherwise None.
    """
    observed = self.df.unstack().dropna()
    long_table = observed.to_frame(colname_values).reset_index('Sample ID')
    if not inplace:
        return long_table
    self.df_long = long_table


get_long_format(analysis, inplace=True)
analysis.df_long.head()

In [None]:
assert analysis.df_long.isna().sum().sum() == 0, "There are still missing values in the long format."

In [None]:
def get_wide_format(self, columns='Sample ID', name_values='intensity', inplace=False):
    """Pivot the long-format table ``self.df_long`` back into wide format.

    The existing index of ``self.df_long`` is used as pivot index, the
    entries of ``columns`` become the pivot columns and ``name_values`` fills
    the cells. The pivoted table is then transposed, so the entries of
    ``columns`` end up as rows of the result.

    Parameters
    ----------
    self : object
        Any object with a pandas DataFrame attribute ``df_long``.
    columns : str
        Column of ``self.df_long`` whose entries label the rows of the result.
    name_values : str
        Column of ``self.df_long`` holding the values.
    inplace : bool
        If True, store the result as ``self.df_wide`` and return None.

    Returns
    -------
    pandas.DataFrame or None
        The wide-format table if ``inplace`` is False, otherwise None.
    """
    wide_table = self.df_long.pivot(columns=columns, values=name_values).transpose()
    if not inplace:
        return wide_table
    self.df_wide = wide_table


get_wide_format(analysis, inplace=True)
analysis.df_wide.head()

In [None]:
assert analysis.df_wide.isna().sum().sum() > 0, "There are no missing values left in the wide format"

### Sampling peptides by their frequency (important for later)

- higher count, higher probability to be sampled into training data
- missing peptides are sampled both into training as well as into validation dataset
- everything not in training data is validation data

In [None]:
# Number of observed (non-missing) intensities per peptide; used below as
# sampling weights.
# freq_per_peptide = analysis.df.unstack().to_frame('intensity').reset_index(1, drop=True)
freq_per_peptide = analysis.df_long['intensity']
# level=0 is the index of df_long; at this point that is the peptide.
freq_per_peptide = freq_per_peptide.notna().groupby(level=0).sum()

In [None]:
# Training split: draw 95% of the observed entries of every sample, weighted
# by how often the peptide was observed overall.
# NOTE(review): freq_per_peptide is indexed by peptide; presumably pandas
# aligns it with the peptide index of df_long -- confirm for the installed
# pandas version.
# NOTE(review): no random_state is passed, so the split differs between runs.
# df_long = analysis.df.unstack().to_frame('intensity').reset_index(1)
analysis.df_train = analysis.df_long.groupby(
    by='Sample ID').sample(frac=0.95, weights=freq_per_peptide)
# Use the (Sample ID, peptide) pair as index so it can be compared with the
# index of the full long table when building the validation split.
analysis.df_train = analysis.df_train.reset_index().set_index([
    'Sample ID', 'peptide'])
analysis.df_train

## Multiindex 

- use a MultiIndex (`Sample ID`, `peptide`) to obtain the validation split

In [None]:
# Re-index the full long table by (Sample ID, peptide), the same index
# layout as analysis.df_train.
analysis.df_long = analysis.df_long.reset_index(
).set_index(['Sample ID', 'peptide'])
analysis.df_long.head()

In [None]:
# Validation split: every (Sample ID, peptide) entry of the long table that
# was not sampled into the training data.
analysis.indices_valid = analysis.df_long.index.difference(
    analysis.df_train.index)
analysis.df_valid = analysis.df_long.loc[analysis.indices_valid]

In [None]:
assert len(analysis.df_long) == len(analysis.df_train) + len(analysis.df_valid)

## Setup DL

In [None]:
# General deep-learning setup: batch size, number of epochs and device.
import vaep.model as vaep_model
from vaep.cmd import get_args

BATCH_SIZE, EPOCHS = 8, 30
args = get_args(batch_size=BATCH_SIZE, epochs=EPOCHS,
                no_cuda=True)  # data transfer to GPU seems slow
# Extra DataLoader options, only set when CUDA is used.
kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}

# torch.manual_seed(args.seed)
# NOTE: this `device` is overwritten with fastai's default device in the
# collaborative filtering section further below.
device = torch.device("cuda" if args.cuda else "cpu")
device

print(args, device)

## Collaborative filtering model

In [None]:
from fastai.collab import CollabDataLoaders, MSELossFlat, Learner
# from types import SimpleNamespace

# Container object collecting everything that belongs to the collaborative
# filtering model (data, loaders, model arguments).
analysis.collab = Analysis()
collab = analysis.collab  # short alias
# Column names of the long-format data: item, user and rating.
collab.columns = 'peptide,Sample ID,intensity'.split(',')

In [None]:
collab.df_train = analysis.df_train.reset_index()
collab.df_valid = analysis.df_valid.reset_index()
collab.df_train.head()

In [None]:
collab.df_valid.head()

In [None]:
assert (collab.df_train.intensity.isna().sum(),
        collab.df_valid.intensity.isna().sum()) == (0, 0), "Remove missing values."

In [None]:
# Build one CollabDataLoaders object per split. valid_pct=0.0 disables
# fastai's own random train/validation split, so each object carries all of
# its data in its `.train` loader.
# NOTE(review): the two objects are built independently, so the category
# encodings of 'Sample ID' and 'peptide' are presumably derived per data
# frame; if a sample or peptide is missing from one split the integer codes
# may not match between training and validation -- TODO confirm.
collab.dl_train = CollabDataLoaders.from_df(
    collab.df_train, valid_pct=0.0, user_name='Sample ID', item_name='peptide', rating_name='intensity', bs=64)
collab.dl_valid = CollabDataLoaders.from_df(
    collab.df_valid, valid_pct=0.0, user_name='Sample ID', item_name='peptide', rating_name='intensity', bs=64, 
    shuffle=False)
collab.dl_train.show_batch()

In [None]:
type(collab.dl_train.valid)

In [None]:
collab.dl_valid.show_batch()

In [None]:
from fastai.data.core import DataLoaders
# "Hacky" combination: use the `.train` loader of each of the two objects
# built above as the training and the validation loader, respectively.
collab.dls = DataLoaders(collab.dl_train.train, collab.dl_valid.train)

In [None]:
len(collab.dls.classes['Sample ID']), len(collab.dls.classes['peptide'])

As an alternative to the hacky version above, one could use a factory method, but then the sampling/splitting methods would need to be implemented (somehow without using [`RandomSplitter`](https://docs.fast.ai/data.transforms.html#RandomSplitter))

 - [`TabDataLoader`](https://docs.fast.ai/tabular.core.html#TabDataLoader)
 - uses [`TabularPandas`](https://docs.fast.ai/tabular.core.html#TabularPandas)
 
 > Current problem: No custom splitter can be provided

In [None]:
# # drop NAs before?
# valid_idx = [analysis.df_long.index.get_loc(key=key) for key in analysis.indices_valid]
# from fastai.tabular.all import *
# from fastai.tabular.data import TabularDataLoaders
# collab.dls = TabularDataLoaders.from_df(
#     df=analysis.df_long.reset_index(), 
#     procs=[Categorify],
#     valid_idx=valid_idx,
#     cat_names=['Sample ID', 'peptide'],
#     y_names=['intensity'],
#     with_cont=False,
#     y_block=TransformBlock(),
#     bs=64)
# collab.dls.show_batch()
# # Problem: this return a second empty df - > would need to adapt model.

A brief check that the values match roughly

In [None]:
# from numpy.testing import assert_almost_equal
# UPTODECIMAL = 5
# assert_almost_equal(
#     collab.dls.valid_ds['intensity'].values, 
#     analysis.df_long.iloc[valid_idx]['intensity'],
#     decimal=UPTODECIMAL
# )
# print(f"Values match up to the {UPTODECIMAL} decimal.")

In [None]:
# Collect the model arguments, then build the model and the fastai Learner.
from pprint import pprint
import fastai.torch_core
# device = torch.device('cpu')
# fastai.torch_core.defaults.device = torch.device('cpu')
# Use fastai's default device; this replaces the `device` chosen in the
# "Setup DL" section.
device = fastai.torch_core.defaults.device

collab.model_args = {}
# Sizes of the two vocabularies as reported by the combined DataLoaders.
collab.model_args['n_samples'] = len(collab.dls.classes['Sample ID'])
collab.model_args['n_peptides'] = len(collab.dls.classes['peptide'])
collab.model_args['dim_latent_factors'] = 20
# Integer bounds around the training intensities: int() truncates the
# minimum, and +1 puts the upper bound above the truncated maximum.
collab.model_args['y_range'] = (
    int(analysis.df_train['intensity'].min()), int(analysis.df_train['intensity'].max())+1)

print("Args:")
pprint(collab.model_args)

model = vaep_model.DotProductBias(**collab.model_args).to(device)
learn = Learner(dls=collab.dls, model=model, loss_func=MSELossFlat())
learn.summary()

In [None]:
learn.fit_one_cycle(5, 5e-3)

In [None]:
(abs(target - preds)).sum() / len(target)

In [None]:
pred, target = learn.get_preds()

In [None]:
collab.dls.valid_ds

In [None]:
# show False does not return results..
res = learn.show_results(show=True)  # something similar with return

In [None]:
# # Adapt to get prediction DataFrame
# # NOTE(review): `dls` is not defined in the visible notebook; presumably
# # `collab.dls` is meant -- confirm before re-enabling this cell.
# encodings, pred, target = learn.get_preds(
#     with_input=True)  # per default validation data
# pred_df = pd.DataFrame([{'Sample ID': dls.classes['Sample ID'][obs[0]], 'peptide': dls.classes['peptide']
#                          [obs[1]], 'intensity': pred_intensity.item()} for obs, pred_intensity in zip(encodings, pred)])
# pred_df = pred_df.pivot(index='Sample ID', columns='peptide')
# pred_df

## Denoising Autoencoder (DAE)

## Variational Autoencoder (VAE)