# Experiment 02 - Data

Find all samples in single plots.
    - [ ] 1000 most abundant peptides

Create batches of samples: partition the data into clusters of samples.

In [None]:
from pprint import pprint
from src.nb_imports import *


import vaep.io_images
import seaborn

from pathlib import Path
from src import metadata


import logging
from src.logging import setup_logger

# Configure the 'vaep' package logger through the project's setup helper.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 02")

figures = {}  # collection of ax or figures

# Switch for TensorBoard output; not referenced in the cells visible here.
ADD_TENSORBOARD = False

In [None]:
# Number of consecutive samples to keep for the analysis.
# None (or any other falsy value) takes all samples; the value is resolved to
# an integer once the data has been loaded.
N_SAMPLES = None

Change some Matplotlib configuration defaults

In [None]:
# Render tick labels, axes titles and axes labels in the largest named font size.
plt.rcParams.update(
    dict.fromkeys(
        ('xtick.labelsize', 'ytick.labelsize', 'axes.titlesize', 'axes.labelsize'),
        'xx-large',
    )
)

## Raw data

In [None]:
# Only some samples have many missing values.
# Each assignment below overwrites the previous one, so only the LAST line
# takes effect; reorder or comment out lines to pick another input file.
# NOTE(review): the trailing all / 60% / 90% tags presumably denote a
# completeness filter applied when the files were created - confirm.

FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07813_M01000'  # all
FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07637_M01000'  # 60%
FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07285_M01000'  # 90%

In [None]:
# Load the selected file into the analysis object (nrows=None: no row limit
# is requested), name the column axis 'peptide' and hand np.log2 to the
# object's log transform (presumably applied to `analysis.df` - confirm).
analysis = AnalyzePeptides(fname=FN_PEPTIDE_INTENSITIES, nrows=None)
analysis.df.columns.name = 'peptide'
analysis.log_transform(np.log2)
analysis

In [None]:
# Some sample IDs start with impossible calendar dates (e.g. 31 November,
# 30 February) or lack a field; map them to corrected IDs so that the index
# can be parsed later on.
rename_indices_w_wrong_dates = {'20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_03': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_03',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_12': '20180330_QE10_nLC0_MR_QC_MNT_Hela_12',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_01': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_01',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_11': '20180330_QE10_nLC0_MR_QC_MNT_Hela_11',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_02': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_02',
                                '20171208_MR_QC_HeLa2': '20171208_?_MR_QC_HeLa2'}
# Index labels that are not keys of the mapping are left unchanged.
analysis.df.rename(index=rename_indices_w_wrong_dates, inplace=True)

### Select N consecutive samples

In [None]:
# Sort the samples by their index label (in place).
analysis.df.sort_index(inplace=True)
# Keep a handle on the full sorted table; `analysis.df` is rebound to a
# selection of samples further down.
analysis.df_all = analysis.df

In [None]:
# Resolve N_SAMPLES to an integer: cap it at the number of available samples,
# or use all samples when it is None (or otherwise falsy).
N_SAMPLES = min(len(analysis.df), N_SAMPLES) if N_SAMPLES else len(analysis.df)

import random
# Fixed seed so that the randomly drawn start position is reproducible.
random.seed(42)
def get_consecutive_data_indices(df, n_samples=N_SAMPLES):
    """Select a random window of consecutive samples from the sorted index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index labels identify the samples.
    n_samples : int
        Length of the window. The default is the value ``N_SAMPLES`` had when
        this function was defined.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` selected by ``n_samples`` consecutive labels of the
        sorted index.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative or exceeds the number of rows in ``df``.
    """
    index = df.sort_index().index
    n_available = len(index)
    # Fail early with a clear message instead of the opaque "empty range"
    # error of random.randint (too large) or a wrong window (negative).
    if not 0 <= n_samples <= n_available:
        raise ValueError(
            f"n_samples must be between 0 and {n_available}, got {n_samples}.")
    # Draws from the module-level `random` state; seed it beforehand for
    # reproducible selections. randint includes both end points.
    start_sample = random.randint(0, n_available - n_samples)
    return df.loc[index[start_sample:start_sample + n_samples]]

# Store the selected samples under a size-specific attribute
# (`df_<N_SAMPLES>`) and make that selection the working table.
_attr_name = f'df_{N_SAMPLES}'
setattr(analysis,_attr_name,get_consecutive_data_indices(analysis.df_all) )
print("Training data stored under:", _attr_name)
analysis.df = getattr(analysis, _attr_name)
analysis.df

In [None]:
# Guard: the working table must not be flagged as a view by pandas
# (relies on the private attribute `_is_view`).
assert not analysis.df._is_view

### Remove samples based on completeness

- some samples have a low peptide count as they originate from fractionated samples
- remove samples based on a certain threshold

In [None]:
# Report the total number of columns (features) and the smallest number of
# non-missing values found in any single row (sample).
print("current minimal number of features out of {} in a single sample: {}".format(
    analysis.df.shape[-1],
    analysis.df.notna().sum(axis=1).min()
))

In [None]:
# Histogram of the number of non-missing values per row (sample).
_ax = analysis.df.notna().sum(axis=1).hist(figsize=(10,5))
_ = _ax.set_ylabel('counts')
_ = _ax.set_xlabel('number of non-missing peptides')

- biological stock differences in PCA plot. Show differences in models. Only see biological variance

### Add meta data

In [None]:
# Build the metadata table on the analysis object and display it.
# NOTE(review): presumably derived from the sample IDs in the index - confirm.
analysis.add_metadata()
analysis.df_meta

In [None]:
# Print the number of metadata rows per researcher tag, sorted by tag.
for _idx, v in analysis.df_meta.researcher.value_counts().sort_index().items():
    print(f'{_idx:7} - {v:3}')

Use this to find date-parsing errors; the results were used for the renaming above.

In [None]:
# invalid_dates = pd.to_datetime(analysis.df_meta.date, errors='coerce').isna()
# display(analysis.df_meta.loc[invalid_dates])
# {i : i for i in analysis.df_meta.loc[invalid_dates].index} # to rename

In [None]:
# Summary statistics for every metadata column, regardless of dtype.
analysis.df_meta.describe(include='all')

In [None]:
# Number of metadata rows per LC instrument tag, sorted by tag.
_ = analysis.df_meta.lc_instrument.value_counts().sort_index()
# Name the file after the metadata column explicitly: since pandas 2.0 the
# result of value_counts() is named 'count', so building the file name from
# `_.name` would give every column the same file and overwrite earlier output.
_.to_csv(config.PROCESSED_DATA / 'counts_lc_instrument.csv')
for _idx, v in _.items():
    print(f'{_idx:7} - {v:3}')

In [None]:
# Number of metadata rows per MS instrument tag, sorted by tag.
_ = analysis.df_meta.ms_instrument.value_counts().sort_index()
# Name the file after the metadata column explicitly: since pandas 2.0 the
# result of value_counts() is named 'count', so building the file name from
# `_.name` would give every column the same file and overwrite earlier output.
_.to_csv(config.PROCESSED_DATA / 'counts_ms_instrument.csv')
_

See rare instrument types (potential labeling errors)

In [None]:
# Instrument tags occurring at most N_MIN_INSTRUMENT times are listed as
# candidates for labeling errors.
N_MIN_INSTRUMENT = 10
column = 'ms_instrument'
# Access the column through `column` everywhere so that changing the variable
# switches the whole cell to another metadata column.
ms_instruments = analysis.df_meta[column].value_counts()
ms_instruments = ms_instruments[ms_instruments <= N_MIN_INSTRUMENT].index
# The message states "at most" to match the inclusive `<=` filter above.
print(f'Entries with at most {N_MIN_INSTRUMENT} samples in {column}: {", ".join(str(x) for x in ms_instruments)}')
mask = analysis.df_meta[column].isin(ms_instruments)
analysis.df_meta.loc[mask]

Some further information in the rest

abbreviation | what it stands for
--- | ---
MNT | maintenance (weekly runs to assess quality)
QC  | quality control (assessing instrument quality during an experiment) <br> - every x runs a QC is taken


On liquid chromatography instruments:

- sometimes instrument names are provided (LC1200) instead of their tags
- `LC` and `nLC` are most likely the same

### Number of non-missing values

- used for plotting the data in PCA plot

In [None]:
# Metadata column that is later used to colour the PCA scatter plot
# (presumably the proportion of non-missing values per sample - confirm).
analysis.df_meta.prop_not_na 

## PCA plot of raw data

In [None]:
# Let the analysis object draw its PCA figure; the figure is saved in the next cell.
fig = analysis.plot_pca()

In [None]:
# Save the figure in the figure folder, tagged with the analysis' file name stub.
vaep.io_images._savefig(fig, config.FIGUREFOLDER/ f'pca_plot_raw_data_{analysis.fname_stub}')

### Single plots without titles

In [None]:
# Scatter plot of the first two columns of the PCA table, coloured by MS instrument.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(
    15, 10), constrained_layout=True)
# `pca` and `cols` are reused by the following cells.
pca = analysis.get_PCA()
cols = pca.columns
seaborn.scatterplot(x=pca[cols[0]], y=pca[cols[1]], hue=pca['ms_instrument'], ax=ax, palette='deep')
# Anchor the legend's right-centre slightly beyond the right edge of the axes.
ax.legend(loc='center right', bbox_to_anchor=(1.11, 0.5))

vaep.io_images._savefig(fig, config.FIGUREFOLDER/ f'pca_plot_raw_data_{analysis.fname_stub}_by_category')

In [None]:
# Same scatter plot, coloured by the date stored in the metadata.
# Relies on `pca` and `cols` defined in the previous cell.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(
    15, 10), constrained_layout=True)


# errors='raise' is passed on to the helper (presumably to fail on dates that
# cannot be parsed - confirm against `analyzers.scatter_plot_w_dates`).
path_collection = analyzers.scatter_plot_w_dates(
    ax, pca, dates=analysis.df_meta.date, errors='raise')
path_collection = analyzers.add_date_colorbar(path_collection, ax=ax, fig=fig)
ax.set_xlabel(cols[0])
ax.set_ylabel(cols[1])

vaep.io_images._savefig(fig, config.FIGUREFOLDER/ f'pca_plot_raw_data_{analysis.fname_stub}_by_date')

In [None]:
# Same scatter plot, coloured by the `prop_not_na` metadata column.
# Relies on `pca` and `cols` defined in an earlier cell.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(
    15, 10), constrained_layout=True)

# x and y are looked up by column name in `data=pca`, whereas the colours come
# from `analysis.df_meta`.
# NOTE(review): this assumes both tables list the samples in the same order - confirm.
path_collection = ax.scatter(
x=cols[0], y=cols[1], c=analysis.df_meta['prop_not_na'], data=pca, alpha=0.3)
_ = fig.colorbar(path_collection, ax=ax)

ax.set_xlabel(cols[0])
ax.set_ylabel(cols[1])

vaep.io_images._savefig(fig, config.FIGUREFOLDER/ f'pca_plot_raw_data_{analysis.fname_stub}_by_prop_not_NA')

## Batching data using Gaussian Mixtures

In [None]:
# TODO: rename get_PCA to get_PCs
from sklearn.mixture import GaussianMixture
N_COMPONENTS = 6
# Fix the seed of the mixture model: without it the initialisation, and with
# it the numbering and membership of the predicted batches, can change between
# runs, although the labels are inspected and written to disk further down.
gm = GaussianMixture(n_components=N_COMPONENTS, covariance_type='full', random_state=42)

# Cluster on a copy of the first two columns of the PCA table only.
X = pca.iloc[:, :2].copy()
gm.fit(X)

In [None]:
# Inspect the fitted component means and covariance matrices.
gm.means_, gm.covariances_

In [None]:
# Assign every sample to a mixture component.
# NOTE: this adds a third column to X, so re-running this cell without
# re-creating X passes three columns to a model fitted on two.
X['batch_predicted'] = gm.predict(X)
X

In [None]:
# NOTE: rebinds `cols` (previously the columns of `pca`) to the columns of X:
# the two plotted columns followed by 'batch_predicted'.
cols = X.columns
seaborn.scatterplot(x=X[cols[0]], y=X[cols[1]], hue=X[cols[2]], palette='deep')

In [None]:
# Show the samples assigned to the component labelled 1.
X.loc[X['batch_predicted']==1]

In [None]:
# Persist the two plotted columns together with the predicted batch per sample.
X.to_csv(config.PROCESSED_DATA / 'gaussian_clusters.csv')

## Interactive PCA plots of raw data

In [None]:
import plotly.express as px
# Interactive counterpart of the static seaborn plot kept below for reference.
# `cols` refers to the columns of X at this point; only the first two names
# are used to look up columns in `pca`.
# seaborn.scatterplot(x=pca[cols[0]], y=pca[cols[1]], hue=pca['ms_instrument'], ax=ax, palette='deep')
fig = px.scatter(pca, x=cols[0], y=cols[1], color="ms_instrument")
fig.show()

## Long format

- Data in long format: (peptide, sample_id, intensity)
- no missing values are kept

In [None]:
def get_long_format(self, colname_values='intensity', inplace=False):
    """Reshape ``self.df`` into a long table without missing values.

    The wide table is unstacked into one row per (column label, index label)
    pair, rows holding a missing value are dropped and the 'Sample ID' index
    level is turned into a regular column.

    Parameters
    ----------
    self : object
        Object providing the wide table as ``self.df``; its index has to be
        named 'Sample ID'.
    colname_values : str
        Name given to the column holding the values.
    inplace : bool
        If True, store the result as ``self.df_long`` and return None.

    Returns
    -------
    pandas.DataFrame or None
        The long table, or None when ``inplace`` is True.
    """
    stacked = self.df.unstack()
    observed = stacked.dropna()
    long_table = observed.to_frame(colname_values).reset_index('Sample ID')
    if not inplace:
        return long_table
    self.df_long = long_table


# Attach the long table to the analysis object as `df_long` and preview it.
get_long_format(analysis, inplace=True)
analysis.df_long.head()

In [None]:
# Sanity check: the long table must not contain any missing value.
assert analysis.df_long.isna().sum().sum() == 0, "There are still missing values in the long format."

In [None]:
def get_wide_format(self, columns='Sample ID', name_values='intensity', inplace=False):
    """Pivot ``self.df_long`` back into a wide table.

    The long table is pivoted on its existing index, spreading the values of
    ``name_values`` over one column per entry of ``columns``; the result is
    then transposed so that the entries of ``columns`` label the rows.

    Parameters
    ----------
    self : object
        Object providing the long table as ``self.df_long``.
    columns : str
        Column of the long table whose entries label the rows of the result.
    name_values : str
        Column of the long table holding the values.
    inplace : bool
        If True, store the result as ``self.df_wide`` and return None.

    Returns
    -------
    pandas.DataFrame or None
        The wide table, or None when ``inplace`` is True.
    """
    pivoted = self.df_long.pivot(columns=columns, values=name_values)
    wide_table = pivoted.transpose()
    if not inplace:
        return wide_table
    self.df_wide = wide_table


# Attach the wide table to the analysis object as `df_wide` and preview it.
get_wide_format(analysis, inplace=True)
analysis.df_wide.head()

In [None]:
# Sanity check: pivoting back to the wide layout must re-introduce missing values.
assert analysis.df_wide.isna().sum().sum() > 0, "There are no missing values left in the wide format"

### Sampling peptides by their frequency (important for later)

- higher count, higher probability to be sampled into training data
- missing peptides are sampled both into training as well as into validation dataset
- everything not in training data is validation data

In [None]:
# freq_per_peptide = analysis.df.unstack().to_frame('intensity').reset_index(1, drop=True)
# Count the non-missing intensities per label of the first index level of
# the long table.
freq_per_peptide = (
    analysis.df_long['intensity']
    .notna()
    .groupby(level=0)
    .sum()
)

In [None]:
# df_long = analysis.df.unstack().to_frame('intensity').reset_index(1)
# Draw 95% of the rows of every sample, weighted by `freq_per_peptide`, and
# index the drawn rows by (sample, peptide).
analysis.df_train = (
    analysis.df_long
    .groupby(by='Sample ID')
    .sample(frac=0.95, weights=freq_per_peptide, random_state=42)
    .reset_index()
    .set_index(['Sample ID', 'peptide'])
)
analysis.df_train

## MultiIndex 

- use a MultiIndex for obtaining the validation split

In [None]:
# Give the long table the same two-level (sample, peptide) index as the
# training data.
analysis.df_long = (
    analysis.df_long
    .reset_index()
    .set_index(['Sample ID', 'peptide'])
)
analysis.df_long.head()

In [None]:
# Validation data: every (sample, peptide) entry of the long table that was
# not drawn into the training data.
analysis.indices_valid = analysis.df_long.index.difference(
    analysis.df_train.index)
analysis.df_valid = analysis.df_long.loc[analysis.indices_valid]

In [None]:
# Sanity check: training and validation rows together add up to the long table.
assert len(analysis.df_long) == len(analysis.df_train) + len(analysis.df_valid)