# Experiment 03 - Data

Create data split

In [None]:
from dataclasses import dataclass
import logging
import typing
from pathlib import Path
from pprint import pprint
from src.nb_imports import *

from sklearn.neighbors import NearestNeighbors

import vaep.io_images
from vaep.pandas import interpolate
from vaep.io.datasplits import DataSplits
from vaep.sampling import feature_frequency, frequency_by_index, sample_data

import src
from src.logging import setup_logger
# Configure the 'vaep' logger through the project's setup helper and
# announce which experiment is running.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 03 - data")

# Collection of axes or figures; plotting cells store their result here by name.
figures = {}

## Arguments

In [None]:
# Notebook arguments: module-level variables read by the cells below.
FN_PEPTIDE_INTENSITIES: str = 'data/df_intensities_N07813_M10000.csv'  # Samples
# JSON file with peptide counts (read below under its 'counter' key)
FN_PEPTIDE_FREQ: str = 'data/processed/count_all_peptides.json'
M: int = 1000 # M most common features
index_col: typing.Union[str,int] = 'Sample ID' # Can be either a string or position (typical 0 for first column)
# query expression for subsetting
query_subset_meta: str = 'ms_instrument in ["QE4", ]'
# Output folder; if empty, a folder name is derived from `query_subset_meta`
experiment_folder: str = 'data'
# Name assigned to the columns axis of the loaded intensity table
columns_name: str = 'peptide'

In [None]:
@dataclass
class DataConfig:
    """Typed container mirroring the notebook arguments.

    Every field repeats one notebook argument (name, annotation and a
    default value), so the arguments can be passed around as one object.
    """

    # File with the peptide intensities ("Samples").
    # NOTE(review): this default names another file (N07285) than the
    # notebook argument of the same name (N07813) - confirm which is current.
    FN_PEPTIDE_INTENSITIES: str = "data/df_intensities_N07285_M10000.csv"
    # File with the peptide counts
    FN_PEPTIDE_FREQ: str = "data/processed/count_all_peptides.json"
    # M most common features
    M: int = 1000
    # Can be either a string or position (typical 0 for first column)
    index_col: typing.Union[str, int] = "Sample ID"
    # query expression for subsetting
    query_subset_meta: str = 'ms_instrument in ["QE4", ]'
    experiment_folder: str = "data"
    columns_name: str = "peptide"


# Collect the notebook arguments in one typed object. Every argument is
# passed explicitly so that the values actually used by this run - not the
# dataclass defaults - end up in the configuration.
params = DataConfig(
    FN_PEPTIDE_INTENSITIES=FN_PEPTIDE_INTENSITIES,
    FN_PEPTIDE_FREQ=FN_PEPTIDE_FREQ,
    M=M,  # previously omitted: params.M stayed at the default of 1000
    index_col=index_col,
    query_subset_meta=query_subset_meta,
    experiment_folder=experiment_folder,
    columns_name=columns_name,
)

from omegaconf import OmegaConf

# Replace the dataclass instance by an OmegaConf container built from its
# attribute dictionary; later cells add keys to it and dump it as YAML.
params = OmegaConf.create(vars(params))
params

process arguments

In [None]:
logger.info(f"{FN_PEPTIDE_INTENSITIES = }")

### Setup

In [None]:
# Characters kept when a query expression is turned into a folder name:
# digits, ASCII letters and the space character.
printable = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '


def parse_query_expression(s, printable=printable):
    """Return `s` reduced to the characters contained in `printable`.

    Parameters
    ----------
    s : str
        Text to clean, e.g. a query expression.
    printable : str, optional
        Characters that are kept; by default digits, ASCII letters and space.

    Returns
    -------
    str
        The kept characters of `s` in their original order.
    """
    return ''.join(char for char in s if char in printable)


# Without an explicit output folder, derive a file-system friendly name from
# the metadata query: underscores become spaces, every character outside
# `printable` is dropped, and the remaining words are joined by underscores.
if not experiment_folder:
    experiment_folder = query_subset_meta.replace('_', ' ')
    experiment_folder = parse_query_expression(experiment_folder)
    experiment_folder = experiment_folder.strip()
    experiment_folder = experiment_folder.replace(' ', '_')
    # Record the derived name in the configuration as well. The original
    # line was a bare attribute access without effect, so the saved
    # configuration kept the empty value.
    params.experiment_folder = experiment_folder
experiment_folder = Path(experiment_folder)
logger.info(f'Folder for output = {experiment_folder}')

### Select M most common features


In [None]:
import json
from collections import Counter

# The peptide counts are stored under the 'counter' key of a JSON file;
# a Counter gives direct access to the most frequent entries.
# Use PeptideCounter instead?
with Path(params.FN_PEPTIDE_FREQ).open() as fh:
    freq_pep_all = Counter(json.load(fh)['counter'])

# Mapping peptide -> count, restricted to the M most frequent peptides
selected_peptides = dict(freq_pep_all.most_common(M))
print(f"No. of selected features: {len(selected_peptides):,d}")

## Raw data

In [None]:
%%time
# Columns to load: the selected peptides in sorted order, preceded by the
# index column if it is given by name (a positional index is not added).
usecols = sorted(selected_peptides)
if isinstance(params.index_col, str):
    usecols.insert(0, params.index_col)
params.usecols = usecols

analysis = AnalyzePeptides.from_file(
    fname=params.FN_PEPTIDE_INTENSITIES,
    nrows=None,
    index_col=params.index_col,
    usecols=params.usecols,
)
analysis.df.columns.name = columns_name

# Intensities are analysed on the log2 scale
analysis.log_transform(np.log2)
logger.info(f"{analysis = }")

Rename some samples
- [ ] needs to be moved into the data extraction pipeline from the server

In [None]:
# Some sample IDs start with a calendar date that does not exist
# (November 31st, February 30th). Map these IDs to corrected ones.
rename_indices_w_wrong_dates = {
    # 20161131 -> 20161130
    '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_01': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_01',
    '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_02': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_02',
    '20161131_LUMOS1_nLC13_AH_MNT_HeLa_long_03': '20161130_LUMOS1_nLC13_AH_MNT_HeLa_long_03',
    # 20180230 -> 20180330
    '20180230_QE10_nLC0_MR_QC_MNT_Hela_11': '20180330_QE10_nLC0_MR_QC_MNT_Hela_11',
    '20180230_QE10_nLC0_MR_QC_MNT_Hela_12': '20180330_QE10_nLC0_MR_QC_MNT_Hela_12',
}
analysis.df.rename(index=rename_indices_w_wrong_dates, inplace=True)

In [None]:
# Each sample ID may occur only once; afterwards order the samples by ID.
assert analysis.df.index.is_unique, "Duplicates in index"
analysis.df.sort_index(inplace=True)

## Metadata

In [None]:
analysis.add_metadata()

In [None]:
analysis.df_meta['datetime'] = pd.to_datetime(
    analysis.df_meta.date, format="%Y/%m/%d")  # persistent

In [None]:
analysis.df_meta.describe(include='all', datetime_is_numeric=True)

In [None]:
analysis.df_meta = analysis.df_meta.query(query_subset_meta)
analysis.df_meta.describe(include='all', datetime_is_numeric=True)

select proteomics data based on meta data query

In [None]:
analysis.df = analysis.df.loc[analysis.df_meta.index]
analysis.df

### PCA plot of raw data
- biological stock differences in PCA plot. Show differences in models. Only see biological variance

In [None]:
fig = analysis.plot_pca()

In [None]:
vaep.io_images._savefig(
    fig, f'pca_plot_raw_data_{analysis.fname_stub}', folder=experiment_folder)

Scatter plots need to become interactive.

## Split: Train, validation and test data

- test data is in clinical language often denoted as independent validation cohort
- validation data (for model)

In [None]:
analysis.splits = DataSplits(is_wide_format=True)
splits = analysis.splits
print(f"{analysis.splits = }")
analysis.splits.__annotations__

In [None]:
analysis.df

In [None]:
percentiles = (0.8, 0.9)  # change here

# Human-readable labels of the percentiles, e.g. '80%'. `round` avoids
# truncated labels caused by floating point products such as 0.29 * 100.
percent_str = [f'{round(x * 100)}%' for x in percentiles]

# Dates at which the time-ordered samples are split. The quantiles are taken
# directly from `percentiles`, so changing the tuple above takes effect
# (the previous `describe` call used a hard-coded (0.8, 0.9)).
split_at_date = analysis.df_meta['datetime'].quantile(list(percentiles))
# Truncate to full days, so a day is never divided between two splits
split_at_date = tuple(pd.Timestamp(t.date()) for t in split_at_date)

print(f"{split_at_date[0] = }", f"{split_at_date[1] = }", sep="\n")

In [None]:
# Training split: samples with a datetime before the first split date.
idx_train = analysis.df_meta['datetime'] < split_at_date[0]
analysis.splits.train_X = analysis.df.loc[idx_train]
analysis.splits.train_X

In [None]:
# Validation split: samples from the first split date (inclusive) up to the
# second split date (exclusive).
idx_validation = ((analysis.df_meta['datetime'] >= split_at_date[0]) & (
    analysis.df_meta['datetime'] < split_at_date[1]))
analysis.splits.val_X = analysis.df.loc[idx_validation]
analysis.splits.val_X

In [None]:
# Test split: samples from the second split date (inclusive) onwards.
idx_test = (analysis.df_meta['datetime'] >= split_at_date[1])
analysis.splits.test_X = analysis.df.loc[idx_test]
analysis.splits.test_X

In [None]:
# Long view of the test data that keeps the missing entries, reduced to the
# index labels of exactly those missing entries.
test_X_long = analysis.splits.test_X.stack(dropna=False)
idx_test_na = test_X_long[test_X_long.isna()].index
print(f"number of missing values in test data: {len(idx_test_na)}")

## Peptide frequency  in training data

- higher count, higher probability to be sampled into training data
- missing peptides are sampled both into training as well as into validation dataset
- everything not in training data is validation data

Based on unmodified training data

In [None]:
# analysis.splits.to_wide_format()
assert analysis.splits is splits, "Sanity check failed."

In [None]:
freq_per_peptide = feature_frequency(analysis.splits.train_X)
freq_per_peptide

In [None]:
msg = "Total number of samples in training data split: {}"
print(msg.format(len(analysis.splits.train_X)))

In [None]:
# `to_csv` does not create missing directories, and nothing before this cell
# creates the 'data' sub-folder - make sure it exists before writing.
fname_freq_train = experiment_folder / 'data' / 'freq_train.csv'
fname_freq_train.parent.mkdir(parents=True, exist_ok=True)
freq_per_peptide.to_csv(fname_freq_train)

Concerning sampling with frequency weights:
  - larger weight -> higher probability of being sampled
  - weights need to be alignable to the index of the original DataFrame before grouping (same index)

## Sample targets (Fake NAs)

Add gold-standard targets for validation and test data
- based on same day
- same instrument

Create some target values by sampling 5% of the validation and test data.

In [None]:
analysis.splits.to_long_format(name_values='intensity') # long format as sample_data uses long-format

In [None]:
# `sample_data` returns two objects per split: the first replaces the
# split's X, the second is stored as its y (the sampled targets).
# The training-data feature frequencies serve as sampling weights.
splits.val_X, splits.val_y = sample_data(
    splits.val_X, sample_index_to_drop=0, weights=freq_per_peptide)
splits.test_X, splits.test_y = sample_data(
    splits.test_X, sample_index_to_drop=0, weights=freq_per_peptide)

# Bring every split into sorted index order
for _name, split in splits:
    split.sort_index(inplace=True)

## Save in long format

- Data in long format: (peptide, sample_id, intensity)
- no missing values kept

In [None]:
# Sub-folder of the experiment folder receiving the data splits
folder = experiment_folder / 'data'  # possibly avoid duplication?

splits.dump(folder=folder)  # dumps data in long-format

In [None]:
# # Reload from disk
# splits = DataSplits.from_folder(folder)

## PCA plot of training data

In [None]:
# Analysis object built from the long-format training split; its PCA plot is
# kept in `figures` and saved into the experiment folder.
ana_train_X = analyzers.AnalyzePeptides(data=splits.train_X, is_wide_format=False, ind_unstack='peptide')
figures['pca_train'] = ana_train_X.plot_pca()
vaep.savefig(figures['pca_train'], experiment_folder / f'pca_plot_raw_data_{ana_train_X.fname_stub}')

In [None]:
# add to DataSplits a inputs attribute

data_dict = {'train': splits.train_X, 'valid': splits.val_X, 'test': splits.test_X}

# Compute the principal components of every split with `ana_train_X` and
# remember for each sample which split it belongs to. The parts are collected
# in lists and concatenated once: `DataFrame.append` / `Series.append` are
# deprecated (removed in pandas 2.0) and copy all previous rows on each call.
pcs_per_split = []
split_per_sample = []
for key, df in data_dict.items():
    df = df.unstack()  # long -> wide format
    pcs_per_split.append(ana_train_X.calculate_PCs(df))
    split_per_sample.append(pd.Series(key, index=df.index))
PCs = pd.concat(pcs_per_split)
split_map = pd.concat(split_per_sample)

fig, ax = plt.subplots(figsize=(15,8))
ax.legend(title='splits')
# Scatter plot of the first two components, annotated by split
analyzers.seaborn_scatter(PCs.iloc[:, :2], fig, ax, meta=split_map,
                          title='First two principal compements (based on training data PCA)')
ax.get_legend().set_title("split")

For *Collaborative Filtering*, new samples could be initialized based on a KNN approach in the original sample space or in the reduced PCA space.
  - The sample embeddings of the K nearest neighbours could be averaged for a new sample

In [None]:
# Optional: Change number of principal components
# K = 2
# _ = ana_train_X.get_PCA(n_components=K)

# Principal components of the training and test samples, both computed
# with `ana_train_X`
train_PCs = ana_train_X.calculate_PCs(splits.train_X.unstack())
test_PCs = ana_train_X.calculate_PCs(splits.test_X.unstack())
# Neighbour search (5 neighbours per query) fitted on the training samples
nn = NearestNeighbors(n_neighbors=5).fit(train_PCs)

Select the K nearest neighbors from the training data for one test data sample. Compare the unweighted mean to the mean weighted by distances.

In [None]:
d, idx = nn.kneighbors(test_PCs.iloc[1:2])
# test_PCs.iloc[1]
idx

In [None]:
train_PCs.iloc[idx[0]]

In [None]:
# Scale the neighbour distances so that the weights sum to one.
# NOTE(review): a larger distance gives a larger weight - confirm that this
# is intended and not the inverse.
w = d / d.sum()
display(f"Sample weights based on distances: {w = }")
# One weight per neighbour as a column vector. `reshape(-1, 1)` follows the
# number of neighbours instead of hard-coding the 5 used for `n_neighbors`.
w.reshape(-1, 1) * train_PCs.iloc[idx[0]]  # apply weights to values

In [None]:
# Compare the plain mean of the neighbours with their distance-weighted sum.
# `reshape(-1, 1)` turns the weights into a column vector for any number of
# neighbours (previously hard-coded to 5).
pd.DataFrame((train_PCs.iloc[idx[0]].mean(),  # mean
              (w.reshape(-1, 1) * train_PCs.iloc[idx[0]]).sum()  # sum of weighted samples
              ), index=['mean', 'weighted by distance '])

Add visual representation of picked points in the first two principal components

In [None]:
ax.scatter(x=test_PCs.iloc[1]['PC 1'], y=test_PCs.iloc[1]['PC 2'], s=100, marker="v", c='r')
ax.scatter(x=train_PCs.iloc[idx[0]]['PC 1'], y=train_PCs.iloc[idx[0]]['PC 2'], s=100, marker="s", c='y')
fig

## Digression on MultiIndex: Data Selection

- use a MultiIndex for obtaining the validation split

[[stackoverflow](https://stackoverflow.com/questions/53927460/select-rows-in-pandas-multiindex-dataframe), [guide](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html)]

- [`xs` method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.xs.html) or [`pd.IndexSlice`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.IndexSlice.html?highlight=indexslice)

In [None]:
sample_name_1, sample_name_2 = analysis.df_long.sample(2).index.get_level_values(-1).to_list()
sample_name_1, sample_name_2

In [None]:
analysis.df_long.head()

In [None]:
analysis.df_long.loc[pd.IndexSlice[:, sample_name_1], :]

In [None]:
analysis.df_long.loc[(slice(None), sample_name_2), :]

With a Series the syntax changes slightly (no column indexer), and the indexing behaviour differs depending on whether a string or a list is passed:

In [None]:
s = analysis.df_long.squeeze()
s

In [None]:
s.loc[pd.IndexSlice[:, sample_name_2]]

In [None]:
s.loc[pd.IndexSlice[:, [sample_name_2]]]

## Save parameters

In [None]:
print(OmegaConf.to_yaml(params))

In [None]:
# Write the configuration next to the other outputs of this experiment
with open(experiment_folder/'data_config.yaml', 'w') as f:
    OmegaConf.save(params, f)