# Experiment 02

- [ ] mask entries in larger dataset in long-format
- [ ] mask peptides based on their frequency in samples (probability of being observed)
- [ ] create training data set without masked values for each model
    - Denoising AE
    - FNN based on embeddings (Collaborative Filtering)
    - VAE
- [ ] restrict to only a training data split of consecutive data: increase the number of samples.
    - focus on best reconstruction performance
    - mean comparison
    - 

In [None]:
from src.nb_imports import *

from pathlib import Path
from src import metadata

import logging
from src.logging import setup_logger

# Configure the 'vaep' logger through the project's setup_logger helper
# and log the name of this experiment.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 02")

figures = {}  # collection of ax or figures

# Only defined here; presumably read further down to switch TensorBoard
# logging on or off -- TODO confirm against the rest of the notebook.
ADD_TENSORBOARD = False

## Raw data

In [None]:
# Only some samples have many missing values.
# NOTE: the three assignments below overwrite each other, so only the last
# file name is effective; the earlier ones are kept as alternatives to switch
# to. The trailing all / 60% / 90% labels presumably denote a completeness
# cut-off used to build each file -- TODO confirm.
FN_PEPTIDE_INTENSITIES = Path('data') / 'df_intensities_N_07813_M01000'  # all
FN_PEPTIDE_INTENSITIES = Path('data') / 'df_intensities_N_07637_M01000'  # 60%
FN_PEPTIDE_INTENSITIES = Path('data') / 'df_intensities_N_07285_M01000'  # 90%

In [None]:
# Build the analysis object from the selected intensity file
# (nrows=None presumably means "load all rows" -- TODO confirm) and
# apply a log2 transformation via its log_transform method.
analysis = AnalyzePeptides(fname=FN_PEPTIDE_INTENSITIES, nrows=None)
analysis.log_transform(np.log2)
analysis  # last expression of the cell: displays the object

In [None]:
# Some sample names in the index encode dates that do not exist in the
# calendar (November 31st, February 30th). Map each affected sample name to a
# corrected name and rename the index of analysis.df in place.
rename_indices_w_wrong_dates = {'20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_03': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_03',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_12': '20180330_QE10_nLC0_MR_QC_MNT_Hela_12',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_01': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_01',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_11': '20180330_QE10_nLC0_MR_QC_MNT_Hela_11',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_02': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_02'}
analysis.df.rename(index=rename_indices_w_wrong_dates, inplace=True)

- biological stock differences in PCA plot. Show differences in models. Only see biological variance

In [None]:
# Derive metadata from the sample names (the index of analysis.df) with the
# project helper, then turn the resulting dict into a DataFrame whose rows
# are the dict's keys (orient='index').
d_meta = metadata.get_metadata_from_filenames(analysis.df.index)
analysis.df_meta = pd.DataFrame.from_dict(
    d_meta, orient='index')
analysis.df_meta  # last expression of the cell: displays the metadata

Use the following (commented-out) cell to find date parsing errors; its output was used to build the renaming dictionary above.

In [None]:
# invalid_dates = pd.to_datetime(analysis.df_meta.date, errors='coerce').isna()
# display(analysis.df_meta.loc[invalid_dates])
# {i : i for i in analysis.df_meta.loc[invalid_dates].index} # to rename

In [None]:
# Summary statistics of the metadata columns.
analysis.df_meta.describe()

See rare instrument types (potential labeling errors)

In [None]:
# Instruments observed at most N_MIN_INSTRUMENT times count as rare;
# the cell displays the metadata rows belonging to those rare instruments.
N_MIN_INSTRUMENT = 10
instrument_counts = analysis.df_meta['ms_instrument'].value_counts()
is_frequent = instrument_counts > N_MIN_INSTRUMENT
ms_instruments = instrument_counts[is_frequent].index
# True for every sample whose instrument is not among the frequent ones
mask = ~analysis.df_meta['ms_instrument'].isin(ms_instruments)
analysis.df_meta.loc[mask]

### PCA plot of raw data

In [None]:
import itertools
from sklearn.impute import SimpleImputer
# Fill missing values with SimpleImputer's default strategy, restore the
# index/columns of analysis.df on the imputed array via the project helper,
# and make sure that no missing value is left before running the PCA.
X = SimpleImputer().fit_transform(analysis.df)
X = vaep.pandas._add_indices(X, analysis.df)
assert X.isna().sum().sum() == 0

pca = analyzers.run_pca(X)
# Remember the PCA result columns before the metadata column is added below,
# so cols[0] and cols[1] always refer to the first two columns returned by run_pca.
cols = list(pca.columns)

# Two stacked panels: one coloured by instrument, one coloured by date.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(
    15, 20), constrained_layout=True)
fig.suptitle('First two Principal Components of entire dataset', fontsize=30)


# by instrument
ax = axes[0]
pca['ms_instrument'] = analysis.df_meta['ms_instrument'].astype('category')
# one scatter call per instrument so that each gets its own legend entry
for name, group in pca.groupby('ms_instrument'):
    ax.scatter(x=group[cols[0]], y=group[cols[1]], label=name)
# NOTE(review): the panel is grouped by instrument but titled 'by category' -- confirm the wording.
ax.set_title('by category', fontsize=18)
# anchor the legend slightly outside the right edge of the axes
ax.legend(loc='center right', bbox_to_anchor=(1.11, 0.5))

# by dates
ax = axes[1]
ax.set_title('by date', fontsize=18)
# errors='raise' is passed on to the project helper; presumably it makes
# unparsable dates fail loudly -- TODO confirm in analyzers.scatter_plot_w_dates.
path_collection = analyzers.scatter_plot_w_dates(
    ax, pca, dates=analysis.df_meta.date, errors='raise')
path_collection = analyzers.add_date_colorbar(path_collection, fig)

## Long format


In [None]:
def get_long_format(self, colname_values='intensity', inplace=False,
                    index_name='Sample ID'):
    """Build a long-format table from the wide table ``self.df``.

    The wide table is unstacked into a single value column. The index level
    called ``index_name`` is then moved into a regular column, so the
    remaining index consists of the former column labels of ``self.df``.

    Parameters
    ----------
    self : object
        Any object providing the wide DataFrame as attribute ``df``.
    colname_values : str, default 'intensity'
        Name of the column holding the unstacked values.
    inplace : bool, default False
        If True, store the result as ``self.df_long`` and return None.
    index_name : str, default 'Sample ID'
        Name of the index level of ``self.df`` to turn into a column.
        Previously hard-coded; the default keeps the former behaviour.

    Returns
    -------
    pandas.DataFrame or None
        The long-format table, or None if ``inplace`` is True.

    Raises
    ------
    KeyError
        If ``self.df`` has no index level named ``index_name``.
    """
    # unstack() yields a Series indexed by (column label, index label) pairs
    df_long = self.df.unstack().to_frame(colname_values)
    df_long = df_long.reset_index(index_name)

    if inplace:
        self.df_long = df_long
        return None
    return df_long

# Store the long-format table on the analysis object (as analysis.df_long)
# and preview its first rows.
get_long_format(analysis, inplace=True)
analysis.df_long.head()

In [None]:
def get_wide_format(self, columns='Sample ID', name_values='intensity', inplace=False):
    """Pivot the long-format table ``self.df_long`` back into a wide table.

    The values of column ``name_values`` are spread over one column per
    distinct entry of ``columns`` (keeping the current index of
    ``self.df_long``), and the result is transposed so that the entries of
    ``columns`` form the rows.

    Parameters
    ----------
    self : object
        Any object providing the long DataFrame as attribute ``df_long``.
    columns : str, default 'Sample ID'
        Column of ``self.df_long`` whose entries become the rows of the result.
    name_values : str, default 'intensity'
        Column of ``self.df_long`` holding the values.
    inplace : bool, default False
        If True, store the result as ``self.df_wide`` and return None.

    Returns
    -------
    pandas.DataFrame or None
        The wide table, or None if ``inplace`` is True.
    """
    wide = self.df_long.pivot(columns=columns, values=name_values).T
    if not inplace:
        return wide
    self.df_wide = wide

# Rebuild the wide table from analysis.df_long, store it as analysis.df_wide
# and preview its first rows.
get_wide_format(analysis, inplace=True)
analysis.df_wide.head()

### Sampling by index frequency

In [None]:
# Count the non-missing intensities per label of the first index level of
# the long table. Alternative starting point from the wide table:
# analysis.df.unstack().to_frame('intensity').reset_index(1, drop=True)
is_observed = analysis.df_long['intensity'].notna()
freq_per_peptide = is_observed.groupby(level=0).sum()

In [None]:
# df_long = analysis.df.unstack().to_frame('intensity').reset_index(1)
# Draw 95% of the rows of every 'Sample ID' group, using the per-index counts
# in freq_per_peptide as sampling weights.
# NOTE(review): no random_state is passed, so the sampled rows differ between
# runs -- confirm whether a fixed seed is wanted for reproducibility.
# NOTE(review): rows with a missing intensity presumably remain in df_long and
# can therefore be drawn into the training data -- verify.
analysis.df_train = analysis.df_long.groupby(by='Sample ID').sample(frac=0.95, weights=freq_per_peptide)
analysis.df_train