# Experiment 03 - Data

Create data split

In [None]:
import logging
from pathlib import Path
from pprint import pprint
from src.nb_imports import *

import vaep.io_images
from vaep.pandas import interpolate
from vaep.io.datasplits import DataSplits
from vaep.sampling import feature_frequency, frequency_by_index, sample_data

import src
from src.logging import setup_logger
# Configure the 'vaep' logger through the project's setup_logger helper
# and log a start message for this notebook.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 03 - data")

figures = {}  # collection of ax or figures

## Arguments

In [None]:
# Notebook parameters (defaults).
# None takes all
# NOTE(review): no parameter in this cell defaults to None; the remark above
# may refer to `nrows=None` used when loading the data - confirm.
# NOTE(review): ADD_TENSORBOARD is not read in the cells shown here - confirm it is still needed.
ADD_TENSORBOARD: bool = False
# File with the peptide intensities to load
FN_PEPTIDE_INTENSITIES: Path = (
    config.FOLDER_DATA / 'df_intensities_N07285_M01000')  # 90%
# query expression for subsetting (applied to the metadata with DataFrame.query)
query_subset_meta: str = 'ms_instrument in ["QE4", ]'
# Name of the output sub-folder; if empty, it is derived from `query_subset_meta`
experiment_folder_name: str = ''

process arguments

In [None]:
# Normalise the parameter to a Path object (it may be supplied as a plain
# string when the notebook is parametrised from outside).
# The result has to be bound to the parameter name itself: the previous
# assignment target was misspelled, so the conversion was silently lost.
FN_PEPTIDE_INTENSITIES = Path(FN_PEPTIDE_INTENSITIES)
logger.info(f"{FN_PEPTIDE_INTENSITIES = }")

### Setup

In [None]:
# Characters that survive the clean-up of a query expression:
# digits, ASCII letters and the space character.
printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '


def parse_query_expression(s, printable=printable):
    """Return `s` reduced to the characters contained in `printable`.

    The remaining characters keep their original order; every other
    character is dropped.
    """
    return ''.join(char for char in s if char in printable)


if not experiment_folder_name:
    # No folder name given: build one from the query expression by keeping
    # only the allowed characters, trimming the ends and replacing the
    # remaining spaces with underscores.
    experiment_folder_name = (
        parse_query_expression(query_subset_meta)
        .strip()
        .replace(' ', '_')
    )
logger.info(f'Folder for output = {experiment_folder_name}')

## Raw data

In [None]:
# Load the peptide intensities into the analysis object
# (nrows=None: presumably no row limit - confirm against AnalyzePeptides.from_file).
analysis = AnalyzePeptides.from_file(fname=FN_PEPTIDE_INTENSITIES, nrows=None)
# Name the column axis so that it can be referred to as 'peptide' later on
analysis.df.columns.name = 'peptide'
# Apply a log2 transformation through the analysis object
analysis.log_transform(np.log2)
logger.info(f"{analysis = }")

Rename some samples
- [ ] needs to be moved into the data extraction pipeline from the server

In [None]:
# Some sample names start with a date that does not exist in the calendar
# (31 November, 30 February). Map them to names with a valid date.
# NOTE(review): '20161131' is mapped to '20161130' (same month), whereas
# '20180230' is mapped to '20180330' (next month) - confirm this is intended.
rename_indices_w_wrong_dates = {'20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_03': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_03',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_12': '20180330_QE10_nLC0_MR_QC_MNT_Hela_12',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_01': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_01',
                                '20180230_QE10_nLC0_MR_QC_MNT_Hela_11': '20180330_QE10_nLC0_MR_QC_MNT_Hela_11',
                                '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_02': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_02'}
# Index labels that are not in the mapping are left unchanged by rename
analysis.df.rename(index=rename_indices_w_wrong_dates, inplace=True)

### Select N consecutive samples

In [None]:
# Renaming above must not have produced duplicated sample names
assert analysis.df.index.is_unique, "Duplicates in index"
# Sort the samples by their index label (in place)
analysis.df.sort_index(inplace=True)

## Metadata

In [None]:
# Add the sample metadata; presumably this populates `analysis.df_meta`,
# which the following cells read - confirm against AnalyzePeptides.
analysis.add_metadata()

In [None]:
# Parse the `date` column (format year/month/day) into a new 'datetime'
# column, which is used further below to split the samples by date.
analysis.df_meta['datetime'] = pd.to_datetime(
    analysis.df_meta.date, format="%Y/%m/%d")  # persistent

In [None]:
# Summary of all metadata columns before subsetting
# (datetime columns are summarised like numeric ones)
analysis.df_meta.describe(include='all', datetime_is_numeric=True)

In [None]:
# Keep only the metadata rows that match the query expression
analysis.df_meta = analysis.df_meta.query(query_subset_meta)
# Summary of the metadata after subsetting
analysis.df_meta.describe(include='all', datetime_is_numeric=True)

select proteomics data based on meta data

In [None]:
# Restrict the intensities to the samples left in the metadata
# (raises a KeyError if a metadata index label is missing in analysis.df)
analysis.df = analysis.df.loc[analysis.df_meta.index]
analysis.df

### PCA plot of raw data
- biological stock differences in PCA plot. Show differences in models. Only see biological variance

In [None]:
# PCA plot created by the analysis object; the returned figure is saved in the next cell
fig = analysis.plot_pca()

In [None]:
# Save the PCA figure into the experiment's sub-folder of the figure folder.
# NOTE(review): `_savefig` has a leading underscore, i.e. it looks like a
# non-public helper of vaep.io_images - confirm it is meant to be used here.
vaep.io_images._savefig(
    fig, f'pca_plot_raw_data_{analysis.fname_stub}', folder=config.FIGUREFOLDER / experiment_folder_name)

Scatter plots need to become interactive.

## Split: Train, validation and test data

- test data is in clinical language often denoted as independent validation cohort
- validation data (for model)

In [None]:
# Container for the train / validation / test data, created in wide format
analysis.splits = DataSplits(is_wide_format=True)
# Short alias: `splits` and `analysis.splits` refer to the same object
splits = analysis.splits
print(f"{analysis.splits = }")
# Show the annotated attributes of the container
analysis.splits.__annotations__

In [None]:
# Display the intensities that are split in the following cells
analysis.df

In [None]:
# Quantiles of the sample dates at which the data is split:
# dates below the first quantile -> training data,
# dates between the two quantiles -> validation data,
# dates from the second quantile onwards -> test data.
percentiles = (0.8, 0.9)  # change here

# Row labels used by `describe` for the requested percentiles, e.g. '80%'.
# round() instead of int(): int() truncates floating point artefacts
# (0.29 * 100 == 28.999999999999996 would otherwise yield '28%').
percent_str = [f'{round(x * 100)}%' for x in percentiles]
# Pass the `percentiles` variable itself, so that changing it above takes effect
split_at_date = analysis.df_meta['datetime'].describe(
    datetime_is_numeric=True, percentiles=percentiles).loc[percent_str]
# Drop the time of day: split at midnight of the respective dates
split_at_date = tuple(pd.Timestamp(t.date()) for t in split_at_date)

print(f"{split_at_date[0] = }", f"{split_at_date[1] = }", sep="\n")

In [None]:
# Training data: samples dated before the first split date
idx_train = analysis.df_meta['datetime'] < split_at_date[0]
# Boolean mask from the metadata selects the rows of the intensities
analysis.splits.train_X = analysis.df.loc[idx_train]
analysis.splits.train_X

In [None]:
# Validation data: samples dated from the first split date (inclusive)
# up to the second split date (exclusive)
idx_validation = ((analysis.df_meta['datetime'] >= split_at_date[0]) & (
    analysis.df_meta['datetime'] < split_at_date[1]))
analysis.splits.val_X = analysis.df.loc[idx_validation]
analysis.splits.val_X

In [None]:
# Test data: samples dated on or after the second split date
idx_test = (analysis.df_meta['datetime'] >= split_at_date[1])
analysis.splits.test_X = analysis.df.loc[idx_test]
analysis.splits.test_X

In [None]:
# Index of all missing entries of the wide test data:
# stack the table while keeping NaNs, then select the positions where
# the equally stacked isna-mask is True.
idx_test_na = analysis.splits.test_X.stack(
    dropna=False).loc[splits.test_X.isna().stack()].index
print(f"number of missing values in test data: {len(idx_test_na)}")

Add gold-standard targets for validation and test data
- based on same day
- same instrument

In [None]:
# Targets for validation and test data computed by vaep.pandas.interpolate.
# NOTE(review): the results are treated as long-format data further below
# (see the comment before `splits.dump`) - confirm against `interpolate`.
splits.val_y = interpolate(splits.val_X)
splits.test_y = interpolate(splits.test_X)

NA values not imputed using other data:

In [None]:
# Missing test entries for which no target value is available in test_y
idx_still_na = idx_test_na.difference(splits.test_y.index)
if idx_still_na.empty:
    logger.info("all missing values imputed.")
else:
    # report the entries that are still missing
    logger.info(idx_still_na.to_list())

## Save in long format

- Data in long format: (peptide, sample_id, intensity)
- no missing values kept

In [None]:
# Output folder of this experiment inside the data folder
folder = config.FOLDER_DATA / experiment_folder_name

# currently val_y and test_y are in long format, while all *_X are in wide format
# dump transforms all into long-format

splits.dump(folder=folder)  # dumps data in long-format

### Sampling peptides by their frequency (important for later)

- higher count, higher probability to be sampled into training data
- missing peptides are sampled both into training as well as into validation dataset
- everything not in training data is validation data

Based on unmodified training data

In [None]:
# This section works with the data loaded from the dumps in long format:
# start with a single view of all data (i.e. in long format) ...
analysis.splits = DataSplits.from_folder(folder=folder)
# ... and convert it to wide format for the frequency computation below.
# NOTE(review): the alias `splits` still refers to the previous DataSplits object.
analysis.splits.to_wide_format()

In [None]:
# Frequency per peptide, computed on the wide training data
freq_per_peptide = feature_frequency(analysis.splits.train_X)  # freq_per_peptide.notna().groupby(level=0).sum()

# Switch to long format and check that the frequency computed from the
# long-format data agrees with the one from the wide-format data
analysis.splits.to_long_format()
assert all(frequency_by_index(df_long=analysis.splits.train_X, sample_index_to_drop=0) == freq_per_peptide)

freq_per_peptide

In [None]:
# NOTE(review): train_X was converted to long format in the previous cell, so
# len() here presumably counts single measurements rather than samples -
# confirm and adapt the message if necessary.
msg = "Total number of samples in training data split: {}"
print(msg.format(len(analysis.splits.train_X)))

In [None]:
# Store the peptide frequencies of the training data next to the dumped splits
# (the file name 'freq_train' has no extension)
freq_per_peptide.to_csv(folder / 'freq_train')

In [None]:
# Split the long-format test data into a sampled and a not-sampled part.
# NOTE(review): the second argument (0) is presumably the index level that
# identifies the sample - confirm against vaep.sampling.sample_data.
series_sampled, series_not_sampled = sample_data(analysis.splits.test_X, 0)
# in test_sampling:
# assert len(analysis.splits.test_X) == len(
#     series_sampled) + len(series_not_sampled)
# assert analysis.splits.test_X.index.difference(
#     series_sampled.index.append(series_not_sampled.index)).empty

Concerning sampling with frequency weights:
  - larger weight -> higher probability of being sampled
  - weights need to be alignable to the index of the original DataFrame before grouping (same index)

In [None]:
# Peptide used for the sanity check in the next cell
peptide_selected = 'YYVTIIDAPGHR'
# Weight of zero: none of its measurements should be sampled.
# Note: this modifies `freq_per_peptide` in place (after it was written to disk).
freq_per_peptide.loc[peptide_selected] = 0
freq_per_peptide.tail()

In [None]:
# Sanity check:
# the weight of one peptide was set to 0 above; affirm that its measurements
# end up only in the not-sampled series
series_sampled, series_not_sampled = sample_data(analysis.splits.test_X, 0, weights=freq_per_peptide)
# series_sampled

# in test_sampling
# import pytest
# with pytest.raises(KeyError):
#     series_sampled.loc[pd.IndexSlice[:, peptide_selected]]

# assert (len(series_not_sampled.loc[pd.IndexSlice[:, peptide_selected]]) 
# == 
# series_not_sampled.reset_index().peptide.value_counts().loc[peptide_selected])

# Show the not-sampled measurements of the selected peptide
# (the label is passed as a list here)
series_not_sampled.loc[pd.IndexSlice[:, [peptide_selected]]]

## MultiIndex 

- use a MultiIndex for obtaining the validation split

[[stackoverflow](https://stackoverflow.com/questions/53927460/select-rows-in-pandas-multiindex-dataframe), [guide](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html)]

- [`xs` method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.xs.html) or [`pd.IndexSlice`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.IndexSlice.html?highlight=indexslice)

In [None]:
# First rows of the long-format data used in the indexing examples below
analysis.df_long.head()

In [None]:
# Select one peptide over all entries of the first index level using
# pd.IndexSlice; the trailing `:` selects all columns
analysis.df_long.loc[pd.IndexSlice[:, 'YRVPDVLVADPPIAR'], :]

In [None]:
# Same kind of selection written with an explicit tuple:
# slice(None) takes the place of `:` for the first index level
analysis.df_long.loc[(slice(None), 'AAAAAAALQAK'), :]

With a Series the syntax changes slightly (no column indexer), and the indexing behaviour differs depending on whether a string or a list is passed:

In [None]:
# Squeeze the long-format DataFrame
# (presumably a single column, which yields a Series - confirm)
s = analysis.df_long.squeeze()
s

In [None]:
# Series: no column indexer; the peptide is passed as a plain string
s.loc[pd.IndexSlice[:, 'AAAAAAALQAK']]

In [None]:
# Series: the same peptide passed inside a list; compare the result with
# the plain-string variant in the previous cell
s.loc[pd.IndexSlice[:, ['AAAAAAALQAK']]]