# Split up data into single datasets

- create datasets per (set of) instruments for specific experiments
- drop some samples based on quality criteria

In [None]:
import logging

import pandas as pd

from vaep.io import thermo_raw_files
from vaep.analyzers import analyzers

from config import erda_dumps
from config import defaults

import vaep
from vaep.logging import setup_nb_logger

# Data folder as configured in the project defaults.
FOLDER_DATA = defaults.FOLDER_DATA

# Logger used for the progress messages emitted by the cells below.
logger = setup_nb_logger()

## Parameters

In [None]:
# Alternative dumps; keep exactly one DUMP assignment active.
# DUMP: str = erda_dumps.FN_PROTEIN_GROUPS
# DUMP: str = erda_dumps.FN_PEPTIDES
# NOTE(review): annotated as `str`, but `.stem` is accessed on the next line,
# so the value is presumably a `pathlib.Path` -- confirm and fix the annotation.
DUMP: str = erda_dumps.FN_EVIDENCE
# Sub-folder for the datasets; joined onto `defaults.FOLDER_DATA` further down.
FOLDER_DATASETS: str = f'single_datasets/{DUMP.stem}'
# Output format; must be a key of `file_formats` ('pkl', 'pickle' or 'csv').
FILE_EXT = 'pkl'
# Name of the index level of the dump that holds the sample identifier.
SAMPLE_ID = 'Sample ID'

Make sure output folder exists

In [None]:
# Place the dataset folder below the configured data folder and create it
# together with any missing parents; an already existing folder is accepted.
# NOTE(review): FOLDER_DATASETS is rebound in place, so re-running this cell
# joins the prefix a second time (nested path if FOLDER_DATA is relative).
FOLDER_DATASETS = defaults.FOLDER_DATA / FOLDER_DATASETS
FOLDER_DATASETS.mkdir(exist_ok=True, parents=True)
logger.info(f"Folder for datasets to be created: {FOLDER_DATASETS.absolute()}")

## Dumps

- load dumps
- load file to machine mappings

In [None]:
# Load the selected dump into memory.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted dumps.
data = pd.read_pickle(DUMP)
# data = data.squeeze() # In case it is a DataFrame, not a series (-> leads to MultiIndex)
data

## Filter for odd samples

- fractionated samples
- GPF - Gas phase fractionation # FAIMS? DIA?
- DIA
- CV

In [None]:
# see 02_data_exploration_peptides

## Meta Data

- based on ThermoRawFileParser

In [None]:
# Alternative: data.index.levels[0], which assumes the sample ID is the first
# index level. Looking the level up by name is more explicit.
_sample_level = data.index.get_level_values(SAMPLE_ID)
sample_ids = _sample_level.unique()
sample_ids

In [None]:
# Column holding the creation timestamp of each raw file.
date_col = 'Content Creation Date'

# Read the metadata table, parse the creation date and keep only the rows of
# the samples present in the dump (a missing sample raises a KeyError).
df_meta = (
    pd.read_csv('data/files_selected_metadata.csv', index_col=0)
    .assign(**{date_col: lambda meta: pd.to_datetime(meta[date_col])})
    .loc[sample_ids]
)
df_meta

### Available instruments

In [None]:
# Per instrument: number of dated samples plus first and last creation date.
_dates_by_instrument = df_meta.groupby(thermo_raw_files.cols_instrument)[date_col]
counts_instrument = _dates_by_instrument.agg(['count', 'min', 'max'])

# Order descending by the first two instrument columns, then by sample count.
_sort_keys = thermo_raw_files.cols_instrument[:2] + ['count']
counts_instrument = counts_instrument.sort_values(by=_sort_keys, ascending=False)
counts_instrument

In [None]:
# Minimum value of the `count` column for an instrument to get its own dataset.
N_MIN_INSTRUMENT = 300
selected_instruments = counts_instrument.query(f"count >= {N_MIN_INSTRUMENT}")

# Store the overview next to the datasets in three formats sharing one stem.
fname = FOLDER_DATASETS / 'dataset_info'
selected_instruments.to_json(f"{fname}.json", indent=4)
selected_instruments.to_latex(f"{fname}.tex")
selected_instruments.to_excel(f"{fname}.xlsx")
# The message lists every format written above (xlsx was missing before).
logger.info(f"Save Information to: {fname} (as json, tex, xlsx)")
selected_instruments

## Dump single experiments

In [None]:
# Names of the index levels (instrument attributes) that identify one dataset.
cols = selected_instruments.index.names

# Maps the configured file extension to the name of the pandas writer method.
file_formats = {'pkl': 'to_pickle',
                'pickle': 'to_pickle',
                'csv': 'to_csv'}

# Resolve the writer once, before any data is sliced, so that an unsupported
# FILE_EXT fails immediately and with a readable message.
if FILE_EXT not in file_formats:
    raise KeyError(
        f"Unsupported FILE_EXT {FILE_EXT!r}; choose one of {list(file_formats)}")
_writer_name = file_formats[FILE_EXT]

for values in selected_instruments.index:
    # A sample belongs to the dataset if all instrument attributes match.
    mask = df_meta[cols] == values
    _in_dataset = mask.all(axis=1)
    logger.info(f"Samples: {_in_dataset.sum()}")
    # Use a separate name: `sample_ids` (all samples of the dump) is read by
    # the metadata cell and must not be overwritten by a per-dataset subset.
    _dataset_sample_ids = df_meta.loc[_in_dataset].index
    dataset = data.loc[_dataset_sample_ids]
    fname_dataset = vaep.io.get_fname_from_keys(values,
                                                folder=FOLDER_DATASETS,
                                                file_ext=f".{FILE_EXT}")

    logger.info(f'Dump dataset with N = {len(dataset)} to {fname_dataset}')
    _to_file_format = getattr(dataset, _writer_name)
    _to_file_format(fname_dataset)

## Last example dumped

In [None]:
# Show the dataset written in the last loop iteration; the name is only bound
# inside the dump loop, so it is undefined if no instrument was selected.
dataset