# Experiment 03 - Data

Create data split

In [None]:
import logging
from pathlib import Path
from pprint import pprint
from src.nb_imports import *

import vaep.io_images
from vaep.pandas import interpolate
from vaep.io.datasplits import DataSplits

import src
from src.logging import setup_logger
# Pass the 'vaep' logger through the project's setup helper and log a start
# message for this notebook.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 03 - data")

figures = {}  # collection of ax or figures

## Arguments

In [None]:
# Notebook parameters.
# None takes all
# NOTE(review): no parameter in this cell accepts None; the remark above
# presumably refers to a row limit (data is loaded with nrows=None) -- confirm.
# NOTE(review): ADD_TENSORBOARD is not read anywhere in this notebook.
ADD_TENSORBOARD: bool = False
# File with the peptide intensities, located in the configured data folder.
FN_PEPTIDE_INTENSITIES: Path = (
    config.FOLDER_DATA / 'df_intensities_N07285_M01000')  # 90%
# query expression for subsetting (passed to `analysis.df_meta.query`)
query_subset_meta: str = 'ms_instrument in ["QE4", ]'
# Name of the output sub-folder; if empty it is derived from `query_subset_meta`.
experiment_folder_name: str = ''

process arguments

In [None]:
# Normalise the parameter to a Path (it may be given as a plain string) and
# rebind the parameter itself, so all later cells see the Path object.
FN_PEPTIDE_INTENSITIES = Path(FN_PEPTIDE_INTENSITIES)
logger.info(f"{FN_PEPTIDE_INTENSITIES = }")

### Setup

In [None]:
# Characters that survive the clean-up below: ASCII digits, ASCII letters and
# the space character.
printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '


def parse_query_expression(s, printable=printable):
    """Return `s` reduced to the characters contained in `printable`.

    Parameters
    ----------
    s : str
        Text to clean, e.g. a pandas query expression.
    printable : str, optional
        Collection of characters to keep; defaults to the module-level
        `printable` (digits, ASCII letters, space).

    Returns
    -------
    str
        The kept characters of `s`, in their original order.
    """
    return ''.join(char for char in s if char in printable)


# Without an explicit folder name, derive one from the metadata query:
# drop special characters, trim surrounding blanks, join words by underscores.
if not experiment_folder_name:
    experiment_folder_name = (
        parse_query_expression(query_subset_meta).strip().replace(' ', '_'))
logger.info(f'Folder for output = {experiment_folder_name}')

## Raw data

In [None]:
# Build the analysis object from the intensity file.
# nrows=None presumably means "read all rows" -- confirm in AnalyzePeptides.
analysis = AnalyzePeptides(fname=FN_PEPTIDE_INTENSITIES, nrows=None)
# Label the column axis of the data.
analysis.df.columns.name = 'peptide'
# presumably applies log2 to the intensities held by `analysis` -- confirm.
analysis.log_transform(np.log2)
logger.info(f"{analysis = }")

Rename some samples
- [ ] needs to be moved into the data extraction pipeline from the server

In [None]:
# Some sample IDs start with a calendar date that does not exist
# (31 November, 30 February); map each of them to a corrected ID.
rename_indices_w_wrong_dates = {
    '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_03':
        '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_03',
    '20180230_QE10_nLC0_MR_QC_MNT_Hela_12':
        '20180330_QE10_nLC0_MR_QC_MNT_Hela_12',
    '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_01':
        '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_01',
    '20180230_QE10_nLC0_MR_QC_MNT_Hela_11':
        '20180330_QE10_nLC0_MR_QC_MNT_Hela_11',
    '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_02':
        '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_02',
}
# Apply the corrections to the row labels of the intensity table in place.
analysis.df.rename(index=rename_indices_w_wrong_dates, inplace=True)

### Select N consecutive samples

In [None]:
# Every row label has to be unique before the table is sorted by its index.
# NOTE(review): no subset of N samples is selected here, despite the heading.
assert analysis.df.index.is_unique, "Duplicates in index"
analysis.df.sort_index(inplace=True)

## Metadata

In [None]:
# presumably populates `analysis.df_meta`, which the following cells read -- confirm.
analysis.add_metadata()

In [None]:
# Parse the `date` column (expected as YYYY/MM/DD) into a new `datetime`
# column of the metadata.
analysis.df_meta['datetime'] = pd.to_datetime(
    analysis.df_meta.date, format="%Y/%m/%d")  # persistent

In [None]:
# Summary of all metadata columns (displayed as cell output).
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 -- confirm the
# pinned pandas version before upgrading.
analysis.df_meta.describe(include='all', datetime_is_numeric=True)

In [None]:
# Keep only the metadata rows that match the query expression, then show the
# summary of the remaining rows.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 -- confirm the
# pinned pandas version before upgrading.
analysis.df_meta = analysis.df_meta.query(query_subset_meta)
analysis.df_meta.describe(include='all', datetime_is_numeric=True)

select proteomics data based on meta data

In [None]:
# Restrict the intensity table to the rows whose labels remain in the
# filtered metadata (raises if a metadata label is absent from the table).
analysis.df = analysis.df.loc[analysis.df_meta.index]
analysis.df

### PCA plot of raw data
- biological stock differences in PCA plot. Show differences in models. Only see biological variance

In [None]:
# Keep the object returned by the PCA plot so the next cell can save it.
fig = analysis.plot_pca()

In [None]:
# Save the PCA figure into the experiment's sub-folder of the figure folder;
# the file name carries the stub of the input file name.
vaep.io_images._savefig(
    fig, f'pca_plot_raw_data_{analysis.fname_stub}', folder=config.FIGUREFOLDER / experiment_folder_name)

Scatter plots need to become interactive.

## Split: Train, validation and test data

- test data: in clinical language often referred to as an independent validation cohort
- validation data (for model)

In [None]:
# Attach a freshly constructed DataSplits container to the analysis object;
# `splits` is a second name for that very same object.
analysis.splits = DataSplits()
splits = analysis.splits
print(f"{analysis.splits = }")
# Show the annotated attributes of the container.
analysis.splits.__annotations__

In [None]:
# Display the intensity table that is about to be split.
analysis.df

In [None]:
percentiles = (0.8, 0.9)  # change here

# Human-readable labels of the cut-offs (e.g. '80%'); informational only,
# the split dates below no longer depend on matching these strings.
percent_str = [f'{round(x * 100)}%' for x in percentiles]
# Take the quantiles of the acquisition dates directly from `percentiles`, so
# that editing the tuple above really moves the split points (previously the
# values were hard-coded a second time and looked up by their string label).
split_at_date = analysis.df_meta['datetime'].quantile(list(percentiles))
# Drop the time-of-day part which interpolating between dates can introduce.
split_at_date = tuple(pd.Timestamp(t.date()) for t in split_at_date)

print(f"{split_at_date[0] = }", f"{split_at_date[1] = }", sep="\n")

In [None]:
# Training data: all samples dated strictly before the first split date.
idx_train = analysis.df_meta['datetime'].lt(split_at_date[0])
analysis.splits.train_X = analysis.df.loc[idx_train]
analysis.splits.train_X

In [None]:
# Validation data: samples dated on or after the first split date and
# strictly before the second one.
on_or_after_first = analysis.df_meta['datetime'].ge(split_at_date[0])
before_second = analysis.df_meta['datetime'].lt(split_at_date[1])
idx_validation = on_or_after_first & before_second
analysis.splits.val_X = analysis.df.loc[idx_validation]
analysis.splits.val_X

In [None]:
# Test data: all samples dated on or after the second split date.
idx_test = analysis.df_meta['datetime'].ge(split_at_date[1])
analysis.splits.test_X = analysis.df.loc[idx_test]
analysis.splits.test_X

In [None]:
# Stack the test table (keeping missing entries) and collect the index
# entries whose value is missing; `splits` and `analysis.splits` are the
# same object.
idx_test_na = analysis.splits.test_X.stack(
    dropna=False).loc[splits.test_X.isna().stack()].index
print(f"number of missing values in test data: {len(idx_test_na)}")

Add gold-standard targets for validation and test data
- based on same day
- same instrument

In [None]:
# Targets for the validation and test data as returned by
# `vaep.pandas.interpolate`.
# NOTE(review): how values are filled is defined in `interpolate`, not here.
splits.val_y = interpolate(splits.val_X)
splits.test_y = interpolate(splits.test_X)

NA values not imputed using other data:

In [None]:
# Missing test entries whose index does not show up in the test targets.
idx_still_na = idx_test_na.difference(splits.test_y.index)
if idx_still_na.empty:
    logger.info("all missing values imputed.")
else:
    logger.info(idx_still_na.to_list())

## Save in long format

- Data in long format: (peptide, sample_id, intensity)
- no missing values kept

In [None]:
# Output location: the experiment's sub-folder of the data folder.
folder = config.FOLDER_DATA / experiment_folder_name

# Write the splits to that folder.
splits.dump(folder=folder)
# Ways to read the splits back (kept for reference, not executed):
# splits = DataSplits.from_folder(folder=folder)
# splits.load(folder=folder)

### Sampling peptides by their frequency (important for later)

- higher count, higher probability to be sampled into training data
- missing peptides are sampled both into training as well as into validation dataset
- everything not in training data is validation data

Based on unmodified training data

In [None]:
# Count per peptide (column label) in how many training samples it has a
# value.
X_train = analysis.splits.train_X  # won't work with loaded splits object.
observed = X_train.unstack().to_frame('intensity')
observed = observed.reset_index(1, drop=True).notna()
freq_per_peptide = observed.groupby(level=0).sum()
freq_per_peptide

In [None]:
# Store the peptide frequencies as CSV next to the dumped splits
# (the file name has no extension).
freq_per_peptide.to_csv(folder / 'freq_train')

In [None]:
# # df_long = analysis.df.unstack().to_frame('intensity').reset_index(1)
# analysis.df_train = analysis.df_long.groupby(
#     by='Sample ID').sample(frac=0.95, weights=freq_per_peptide, random_state=42)
# analysis.df_train = analysis.df_train.reset_index().set_index([
#     'Sample ID', 'peptide'])
# analysis.df_train

## MultiIndex 

- use a MultiIndex for obtaining the validation split

In [None]:
# Peek at the long-format data.
# NOTE(review): `df_long` is not assigned in this notebook; presumably it is
# provided by AnalyzePeptides -- confirm.
analysis.df_long.head()

In [None]:
# All rows of one peptide: select on the second index level with pd.IndexSlice.
analysis.df_long.loc[pd.IndexSlice[:, 'YRVPDVLVADPPIAR'], :]

In [None]:
# Same kind of selection written with an explicit slice(None) for the first
# index level.
analysis.df_long.loc[(slice(None), 'AAAAAAALQAK'), :]

In [None]:
# analysis.indices_valid = analysis.df_long.index.difference(
#     analysis.df_train.index)
# analysis.df_valid = analysis.df_long.loc[analysis.indices_valid]

In [None]:
# assert len(analysis.df_long) == len(analysis.df_train) + len(analysis.df_valid)