# Split up data into single datasets

- create datasets per (set of) instruments for specific experiments
- drop some samples based on quality criteria

In [None]:
import logging
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates
import seaborn as sns

import umap

from vaep.io import thermo_raw_files
from vaep.analyzers import analyzers

from config import erda_dumps
from config import defaults

import vaep
import vaep.io.filenames
from vaep.logging import setup_nb_logger

# logger for this notebook, created by vaep.logging.setup_nb_logger
logger = setup_nb_logger()

# base data folder taken from the project defaults (config.defaults)
FOLDER_DATA = defaults.FOLDER_DATA

In [None]:
# adjust plot descriptors via the vaep helper (presumably larger font sizes -- see vaep.plotting)
vaep.plotting.make_large_descriptors()
FIGSIZE = (15, 10)  # passed as `figsize` to the support plots further down

## Parameters

In [None]:
# Minimum number of samples an instrument needs to be selected for its own dataset dump
N_MIN_INSTRUMENT = 300
# csv file with sample meta data, read with SAMPLE_ID as index column
META_DATA: str = 'data/files_selected_metadata.csv'
# output format of the single datasets: one of 'pkl', 'pickle' or 'csv'
FILE_EXT = 'pkl'
# name of the index level (and meta data column) identifying a sample
SAMPLE_ID = 'Sample ID'

DUMP: str = erda_dumps.FN_PROTEIN_GROUPS
OUT_NAME = 'protein group'  # for legends labels
# DUMP: str = erda_dumps.FN_PEPTIDES
# OUT_NAME = 'aggregated peptide' # for legends labels
# DUMP: str = erda_dumps.FN_EVIDENCE
# OUT_NAME = 'charged peptide' # for legends labels

# Path(...) so that the stem can be derived whether DUMP is a plain string or a Path.
# NOTE(review): if DUMP is overridden after this cell (e.g. by injected parameters),
# FOLDER_DATASETS is still derived from the default above unless it is overridden too.
FOLDER_DATASETS: str = f'single_datasets/{Path(DUMP).stem}'

# title of the instrument legend in the date-coloured UMAP plot
INSTRUMENT_LEGEND_TITLE = 'Q Exactive HF-X Orbitrap'

Make sure output folder exists

In [None]:
# Parameters may arrive as plain strings (cli or yaml): normalise DUMP to a Path.
DUMP = Path(DUMP)
# Place the dataset folder below the default data folder and create it if missing.
FOLDER_DATASETS = defaults.FOLDER_DATA / FOLDER_DATASETS
FOLDER_DATASETS.mkdir(parents=True, exist_ok=True)
_msg = f"Folder for datasets to be created: {FOLDER_DATASETS.absolute()}"
logger.info(_msg)

## Dumps

- load dumps
- load file to machine mappings

In [None]:
# Load the dump; squeeze in case it is a DataFrame, not a Series (-> leads to MultiIndex).
data = pd.read_pickle(DUMP).squeeze()
name_data = data.name
_msg = f"Number of rows (row = sample, feature, intensity): {len(data):,d}"
logger.info(_msg)
data

Make the categorical index a normal string index. Categorical levels led to problems when selecting data using `loc` and when grouping data, as a level could not easily be removed from the MultiIndex.

- see [blog](https://towardsdatascience.com/staying-sane-while-adopting-pandas-categorical-datatypes-78dbd19dcd8a)

In [None]:
# Remember the index levels, then work on a flat frame to inspect the dtypes.
index_columns = data.index.names
data = data.reset_index()
print(data.memory_usage(deep=True))

# Cast every categorical column to plain objects and report the memory impact.
_is_categorical = data.dtypes == 'category'
cat_columns = data.columns[_is_categorical]
if len(cat_columns) > 0:
    data[cat_columns] = data[cat_columns].astype('object')
    print("non categorical: \n", data.memory_usage(deep=True))
    logger.warning(
        "if time allows, this should be investigate -> use of loc with data which is not categorical")

# Restore the original index levels.
data = data.set_index(index_columns)

## Support per sample

In [None]:
# all index level names except the sample identifier, i.e. the levels describing a feature
idx_non_sample = list(data.index.names)
idx_non_sample.remove(SAMPLE_ID)  # raises ValueError if SAMPLE_ID is not an index level
idx_non_sample

In [None]:
# M = data.index.droplevel(SAMPLE_ID).nunique() # very slow alternative, but 100% correct
# M: number of features, obtained from the dump's file name stem
# (presumably encoded in the name -- see vaep.io.filenames.read_M_features)
M = vaep.io.filenames.read_M_features(DUMP.stem)
logger.info(f"Number of unqiue features: {M}")

In [None]:
# Number of observed features per sample, saved as json and plotted in rank order.
counts = data.groupby(SAMPLE_ID).count().squeeze()
N = len(counts)
counts.to_json(FOLDER_DATASETS / 'support_all.json', indent=4)

# sort_values() without `by` will raise an error with a DataFrame
_ranked = counts.sort_values().reset_index(drop=True)
_title = f'Support of {N:,d} samples features over {M} features ({", ".join(idx_non_sample)})'
ax = _ranked.plot(
    rot=45,
    figsize=FIGSIZE,
    grid=True,
    ylabel='number of features in sample',
    xlabel='Sample rank ordered by number of features',
    title=_title,
)
# second y-axis relates the counts to the total number of features M
vaep.plotting.add_prop_as_second_yaxis(ax, M)
fig = ax.get_figure()
fig.tight_layout()
vaep.plotting.savefig(fig, name='support_all', folder=FOLDER_DATASETS)

In [None]:
# Number of samples in which each feature is observed, saved as json and plotted in rank order.
counts = data.groupby(idx_non_sample).count().squeeze()
counts.to_json(FOLDER_DATASETS / 'feat_completeness_all.json', indent=4)

# sort_values() without `by` will raise an error with a DataFrame
_ranked = counts.sort_values().reset_index(drop=True)
_title = f'Support of {len(counts):,d} features over {N} samples ({", ".join(idx_non_sample)})'
ax = _ranked.plot(
    rot=45,
    figsize=FIGSIZE,
    grid=True,
    ylabel='number of samples per feature',
    xlabel='Feature rank ordered by number of samples',
    title=_title,
)
# second y-axis relates the counts to the total number of samples N
vaep.plotting.add_prop_as_second_yaxis(ax, N)
fig = ax.get_figure()
vaep.plotting.savefig(fig, name='feat_per_sample_all', folder=FOLDER_DATASETS)

## Filter for odd samples

- fractionated samples
- GPF - Gas phase fractionation # Faims? DIA? 
- DIA
- CV

In [None]:
# see misc_data_exploration_peptides

## Meta Data

- based on ThermoRawFileParser

In [None]:
# sample_ids = data.index.levels[0] # assume first index position is Sample ID?
# unique sample identifiers, selected by level name rather than by position
sample_ids = data.index.get_level_values(SAMPLE_ID).unique()  # more explicit
sample_ids

In [None]:
# Meta data per sample: parse the creation date and keep only samples present in the dump.
date_col = 'Content Creation Date'
df_meta = pd.read_csv(META_DATA, index_col=SAMPLE_ID)
df_meta[date_col] = pd.to_datetime(df_meta[date_col])
# label based selection: raises a KeyError if a sample has no meta data entry
df_meta = df_meta.loc[sample_ids]
df_meta

### Available instruments

In [None]:
# Per instrument: number of samples and first / last creation date, sorted in descending order.
_sort_by = thermo_raw_files.cols_instrument[:2] + ['count']
counts_instrument = (
    df_meta
    .groupby(thermo_raw_files.cols_instrument)[date_col]
    .agg(['count', 'min', 'max'])
    .sort_values(by=_sort_by, ascending=False)
)
counts_instrument

In [None]:
# number of distinct instrument groups (rows of the grouped table)
len(counts_instrument)

In [None]:
# Keep instruments with at least N_MIN_INSTRUMENT samples and save the overview table.
selected_instruments = counts_instrument.query(f"count >= {N_MIN_INSTRUMENT}")
fname = FOLDER_DATASETS / 'dataset_info'  # file name without extension, reused for further formats
selected_instruments.to_latex(f"{fname}.tex")
selected_instruments.to_excel(f"{fname}.xlsx")
# the message names the formats actually written in this cell
logger.info(f"Save Information to: {fname} (as tex, xlsx)")
selected_instruments

In [None]:
# mask = pd.Series(False, index=df_meta.index)
# for v in selected_instruments.index:
#     mask = mask | (df_meta[selected_instruments.index.names] == v).all(axis=1)
# mask.sum()

In [None]:
# df_meta = df_meta.loc[mask]
# data = data.loc[df_meta.index]

## Summary plot - UMAP

- embedding based on all samples
- visualization of top 5 instruments

In [None]:
# fixed random_state for the UMAP embedding
reducer = umap.UMAP(random_state=42)
# wide format: the sample level stays in the rows, all feature levels move to the columns
data = data.unstack(idx_non_sample)
data

In [None]:
# UMAP cannot handle missing values: impute each feature with its median before fitting.
_coords = reducer.fit_transform(data.fillna(data.median()))
embedding = pd.DataFrame(
    _coords,
    index=data.index,
    columns=['UMAP 1', 'UMAP 2'],
).join(df_meta[["Content Creation Date", "instrument serial number"]])

# Map the serial number of each sample to the number of samples of that instrument.
_counts_by_serial = counts_instrument['count'].reset_index(level=[0, 1], drop=True)
d_instrument_counts = _counts_by_serial.to_dict()
embedding["count"] = embedding["instrument serial number"].replace(
    d_instrument_counts)
embedding

In [None]:
# Width (number of decimal digits) of the largest instrument count, used to align the labels.
# Counting characters is exact for powers of ten as well (1000 -> 4), where
# ceil(log10(x)) would be one short, and yields 1 instead of 0 for a count of 1.
digits = len(str(int(embedding["count"].max())))
digits

In [None]:
# Label per sample: "<serial number> (N=<samples of that instrument>)", count padded to `digits`.
# Row values are accessed by label: integer keys on a string-labelled Series are
# treated as positions only through a deprecated fallback.
embedding["instrument with N"] = embedding[["instrument serial number", "count"]].apply(
    lambda s: f"{s['instrument serial number']} (N={s['count']:{digits}d})", axis=1)
# shorten the label; literal (non-regex) replacement
embedding["instrument with N"] = embedding["instrument with N"].str.replace(
    'Exactive Series slot', 'Instrument', regex=False)
embedding

define top five instruments

In [None]:
# Serial numbers of the five instruments with the most samples.
top_5 = counts_instrument["count"].nlargest(5)
# A filtered MultiIndex keeps all original level values; drop the unused ones first,
# otherwise `levels[-1]` would list every instrument and nothing would be labelled 'other'.
top_5 = top_5.index.remove_unused_levels().levels[-1]
# samples of any other instrument are grouped as 'other'
embedding["instrument"] = embedding["instrument serial number"].apply(
    lambda x: x if x in top_5 else 'other')
# boolean mask (indexed by sample) selecting the samples of the top five instruments
mask_top_5 = embedding["instrument"] != 'other'

In [None]:
# Bin the creation dates into 90 day intervals (as strings) to use them as discrete hue.
_rounded = embedding["Content Creation Date"].dt.round("90D")
embedding["Date (90 days intervals)"] = _rounded.astype(str)

to_plot = embedding.loc[mask_top_5]
print(f"N samples in plot: {len(to_plot):,d}")

# marker style encodes the instrument, colour the date interval
fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.scatterplot(
    x='UMAP 1',
    y='UMAP 2',
    hue="Date (90 days intervals)",  # ="Content Creation Date")
    style="instrument with N",
    data=to_plot,
    ax=ax,
)
vaep.savefig(fig,
             name='umap_interval90days_top5_instruments',
             folder=FOLDER_DATASETS)

In [None]:
# One marker per instrument; if there are more groups than markers the last one is reused.
markers = ['o', 'x', 's', 'P', 'D', '.']
alpha = 0.6
fig, ax = plt.subplots(figsize=(12, 8))
groups = []

vaep.plotting.make_large_descriptors()
# Round dates to full days and convert them to matplotlib date numbers for the colour scale.
_day = embedding["Content Creation Date"].dt.round("D")
embedding["Content Creation Date"] = _day
embedding["mdate"] = embedding["Content Creation Date"].apply(
    matplotlib.dates.date2num)

to_plot = embedding.loc[mask_top_5]

# Colour range spans the 5% to 95% quantile of all dates (not only the plotted ones).
_lower = embedding["mdate"].quantile(0.05)
_upper = embedding["mdate"].quantile(0.95)
norm = matplotlib.colors.Normalize(_lower, _upper)
cmap = sns.color_palette("cubehelix", as_cmap=True)


for k, _to_plot in to_plot.groupby('instrument with N'):
    if len(markers) > 0:
        marker = markers.pop(0)
    _ = ax.scatter(
        x=_to_plot["UMAP 1"],
        y=_to_plot["UMAP 2"],
        c=_to_plot["mdate"],
        marker=marker,
        alpha=alpha,
        norm=norm,
        cmap=cmap,
    )
    groups.append(k)

# All collections share norm and cmap, so the first one can define the colour bar.
cbar = vaep.analyzers.analyzers.add_date_colorbar(
    ax.collections[0], ax=ax, fig=fig)
cbar.ax.set_ylabel("date of measurement", labelpad=-115, loc='center')
ax.legend(ax.collections,
          groups,
          title=INSTRUMENT_LEGEND_TITLE,
          fontsize='xx-large')
ax.set_xlabel('UMAP 1')  # , fontdict={'size': 16})
ax.set_ylabel('UMAP 2')
vaep.savefig(fig, name='umap_date_top5_instruments', folder=FOLDER_DATASETS)

## Summary statistics for top 5 instruments 

In [None]:
# Box plots summarising the samples of the top five instruments. The three columns
# have different lengths and are only aligned on a positional index (0, 1, 2, ...).
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
# observed (non-missing) entries of the selected samples, computed once for both statistics
_observed = data.loc[mask_top_5].notna()
# boxplot: number of available sample for included features
to_plot = _observed.sum(axis=0).reset_index(
    drop=True).to_frame(f'{OUT_NAME.capitalize()} prevalence')
# boxplot: number of features per sample
# outer join: the default left join would silently drop the per-sample counts
# beyond the number of features whenever there are more samples than features
to_plot = to_plot.join(
    _observed.sum(axis=1).reset_index(
        drop=True).to_frame(f'{OUT_NAME.capitalize()}s per sample'),
    how='outer')
del _observed  # free the boolean helper frame
# boxplot: number of samples per selected instrument
to_plot = to_plot.join(
    counts_instrument.reset_index([0, 1], drop=True).loc[top_5, 'count'].reset_index(
        drop=True).rename('Samples per instrument', axis='index'),
    how='outer')
ax = to_plot.plot(kind='box', ax=ax, fontsize=16, )
ax.set_ylabel('number of observations',
              fontdict={'fontsize': 14})
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
to_plot.to_csv(FOLDER_DATASETS / 'summary_statistics_dump_data.csv')
vaep.savefig(fig, name='summary_statistics_dump',
             folder=FOLDER_DATASETS)

In [None]:
# meta data restricted to the samples selected by mask_top_5
top_5_meta = df_meta.loc[mask_top_5]
# summary statistics of two acquisition settings for these samples
top_5_meta[['injection volume setting', 'dilution factor']].describe()

### Meta data stats for top 5

In [None]:
# Show summary statistics of the meta data per instrument group.
# `display` is provided by the IPython / notebook environment.
for _instrument, _df_meta_instrument in top_5_meta.groupby(by=thermo_raw_files.cols_instrument):
    # header: separator line followed by the joined group key (assumes string keys)
    print('#'* 80, ' - '.join(_instrument), sep='\n')
    display(_df_meta_instrument.describe())
    display(_df_meta_instrument['injection volume setting'].value_counts())
    # NOTE(review): unconditional break -> only the first instrument group is shown;
    # confirm whether this is intended or a debugging leftover
    break

## Dump single experiments

from long-format

In [None]:
# back to long format: move the feature levels from the columns into the row index again
data = data.stack(idx_non_sample)
data

In [None]:
# Dump one dataset (long format) plus its support statistics and plot for every
# selected instrument.
cols = selected_instruments.index.names

# supported output formats: file extension -> name of the pandas writer method
file_formats = {'pkl': 'to_pickle',
                'pickle': 'to_pickle',
                'csv': 'to_csv'}
# Loop-invariant lookup, done once up front: an unsupported FILE_EXT raises the
# KeyError before any dataset is processed instead of in the middle of the first one.
_writer_name = file_formats[FILE_EXT]


for values in selected_instruments.index:
    # samples whose instrument columns all match the current index entry
    mask = df_meta[cols] == values
    _row_mask = mask.all(axis=1)
    logger.info(f"Samples: {_row_mask.sum()}")
    sample_ids = df_meta.loc[_row_mask]
    display(sample_ids.sort_index())
    sample_ids = sample_ids.index
    # NOTE(review): with a categorical sample level this selection might need a
    # categorical Index as well
    dataset = data.loc[sample_ids]
    # drop level values of samples that are not part of this dataset
    dataset.index = dataset.index.remove_unused_levels()

    # wide view for inspection only; the long format is what gets dumped
    display(dataset
            .unstack(dataset.index.names[1:])
            .sort_index()
            )

    fname_dataset = vaep.io.get_fname_from_keys(values,
                                                folder=FOLDER_DATASETS,
                                                file_ext=f".{FILE_EXT}")

    logger.info(f'Dump dataset with N = {len(dataset)} to {fname_dataset}')
    _to_file_format = getattr(dataset, _writer_name)
    _to_file_format(fname_dataset)

    # base name for the support files: same key based stem with a '_support' suffix
    fname_support = vaep.io.get_fname_from_keys(values,
                                                folder='.',
                                                file_ext="")
    fname_support = fname_support.stem + '_support'
    logger.info(f"Dump support to: {fname_support}")
    # number of observed features per sample in this dataset
    counts = dataset.groupby(SAMPLE_ID).count().squeeze()
    counts.to_json(FOLDER_DATASETS / f"{fname_support}.json", indent=4)

    # very slow alternative, but 100% correct
    M = dataset.index.droplevel(SAMPLE_ID).nunique()

    # plot:
    fig, ax = plt.subplots()
    ax = (counts
          .sort_values()  # will raise an error with a DataFrame
          .reset_index(drop=True)
          .plot(rot=45,
                ax=ax,
                figsize=FIGSIZE,
                grid=True,
                xlabel='Count of samples ordered by number of features',
                title=f'Support of {len(counts):,d} samples features over {M} features ({", ".join(idx_non_sample)})',
                ))
    # second y-axis relates the counts to the number of features M of this dataset
    vaep.plotting.add_prop_as_second_yaxis(ax, M)
    fig.tight_layout()
    vaep.plotting.savefig(fig, name=fname_support,
                          folder=FOLDER_DATASETS)

## Last example dumped

In [None]:
# dataset of the last loop iteration above (undefined if no instrument was selected)
dataset

In [None]:
# add json dump as target file for script for workflows
selected_instruments.to_json(f"{fname}.json", indent=4)
logger.info(f"Saved: {fname}.json")