# Build a set of training data

Use a set of (most) common peptides to create initial data sets

In [None]:
import yaml
import json
import random  # shuffle, seed
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

import src.config as config
from src.config import FOLDER_PROCESSED
import vaep.io
from vaep.io import data_objects

from src.config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES

FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES

In [None]:
N: int = 10_000  # Maximum number of samples (dump files) to process
RANDOM_SEED: int = 42  # Random seed for reproducibility

FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimum value of the 'proportion' column a feature needs in order to be selected

In [None]:
## Charged Peptides

counter = data_objects.EvidenceCounter(FNAME_C_EVIDENCE)
# counts per feature; 'Charge' is cast to a plain integer column
counts = (counter.get_df_counts()
          .convert_dtypes()
          .astype({'Charge': int}))
# features whose proportion reaches the completeness cutoff
mask = counts['proportion'] >= FEAT_COMPLETNESS_CUTOFF
counts.loc[mask]

In [None]:
# index of the features passing the cutoff, sorted by the counter's index columns
counts_selected = counts.loc[mask].set_index(counter.idx_names)
selected_evidence = counts_selected.sort_index().index
selected_evidence

In [None]:
# values of the counter's `dumps` mapping (used as file paths below)
dumps = list(counter.dumps.values())


def load_evidence_dump(fpath, index_col=counter.idx_names):
    """Read a single evidence dump from a CSV file.

    Parameters
    ----------
    fpath
        Location of the CSV file, passed on to `pandas.read_csv`.
    index_col
        Column(s) to use as index. Defaults to `counter.idx_names` as it
        was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Content of the file, indexed by `index_col`.
    """
    return pd.read_csv(fpath, index_col=index_col)


# preview the first dump
load_evidence_dump(dumps[0])

# counter.load_dump(dumps[0])

In [None]:
years = ['2018', '2019']
# keep the dumps whose parent folder name contains one of the years;
# the result is grouped by year in the order listed above
selected_dumps = [dump
                  for year_folder in years
                  for dump in dumps
                  if year_folder in dump.parent.stem]
print("Total number of files:", len(selected_dumps))
selected_dumps[-10:]

In [None]:
# %%time
# # current limit ~4000 files on erda (16GB of memory)
# N = min(len(selected_dumps), N)

# data_intensity = {}
# # data_genes = {}
# support = {}

# load_dump = load_evidence_dump
# selected_features = selected_evidence

# for fpath in tqdm(selected_dumps[:N]):
#     sample_name = fpath.stem
#     dump = load_dump(fpath)
#     sequences_available = dump.index.intersection(selected_features)
#     support[sample_name] = len(sequences_available)
#     data_intensity[sample_name] = dump.loc[sequences_available,
#                                                'Intensity'].to_dict()

In [None]:
%%time
# N = 100
# never ask for more samples than there are selected dumps
N = min(len(selected_dumps), N)

data_intensity = {}

# aliases pointing to the evidence-specific loader and feature selection;
# `load_dump` is looked up as a global further down in this cell
load_dump = load_evidence_dump
selected_features = selected_evidence

import functools
from typing import List, Callable

def process_folders(fpaths: List[Path],
                    selected_features: pd.Index,
                    load_folder: Callable) -> pd.DataFrame:
    """Collect the intensities of the selected features for a set of files.

    Parameters
    ----------
    fpaths : List[Path]
        Files to process. The stem of each path is used as sample name.
    selected_features : pd.Index
        Features to keep. Only entries also present in the index of a
        loaded file are collected for that file.
    load_folder : Callable
        Called with a single path; has to return a DataFrame with an
        'Intensity' column.

    Returns
    -------
    pd.DataFrame
        One row per sample and one column per collected feature. Features
        missing in a sample are left as missing values.
    """
    data_intensity = {}
    print("started new process.")
    for fpath in fpaths:
        print(fpath)
        # use the loader handed in by the caller instead of a global one, so
        # the function also works when executed outside of this namespace
        dump = load_folder(fpath)
        sequences_available = dump.index.intersection(selected_features)
        data_intensity[fpath.stem] = dump.loc[sequences_available,
                                              'Intensity'].to_dict()
    # samples are the outer keys, therefore transpose to get them as rows
    return pd.DataFrame.from_dict(data_intensity).T


# Fix the feature selection and the loader, so that the remaining callable
# only takes the paths of one chunk. The first argument of `partial` has to be
# the function itself, not the list of paths.
process_folders_peptides = functools.partial(process_folders,
                                             selected_features=selected_evidence,
                                             load_folder=load_evidence_dump)
# Only process the dumps selected by year, capped at N samples.
collected_data_intensities = data_objects.collect_in_chuncks(paths=selected_dumps[:N],
                                                             process_chunk_fct=process_folders_peptides,
                                                             chunks=200)

# NOTE(review): assumes `collect_in_chuncks` returns the per-chunk DataFrames
# as an iterable which `pd.concat` accepts -- confirm against its definition.
collected_dfs = pd.concat(collected_data_intensities)
df_intensities = collected_dfs

In [None]:
# label both axes, then order the samples before writing the wide table
df_intensities.index.name = 'Sample ID'
df_intensities.columns.names = counter.idx_names
df_intensities.sort_index(inplace=True)

# file name: years joined by underscores, shape added by `config.insert_shape`
base_name = "df_intensities_evidence_" + '_'.join(years)
fname_template = base_name + '{}.csv'
fname = config.FOLDER_DATA / config.insert_shape(df_intensities, fname_template)
print(f"{fname = }")
df_intensities.to_csv(fname)
df_intensities

In [None]:
# File name for the long-format table. It is derived from `df_intensities` as
# it is when this cell runs, so the shape inserted by `config.insert_shape`
# is the one of the frame at this point.
# The underscore after "long" keeps the years separated from the prefix.
base_name = "df_intensities_evidence_long_" + '_'.join(years)
fname = config.FOLDER_DATA / config.insert_shape(df_intensities, base_name + '{}.csv')
print(f"{fname = }")

In [None]:
# move column levels 0 and 1 into the row index (long format)
df_intensities = df_intensities.stack([0,1])
# NOTE(review): after stacking two column levels the index presumably is a
# MultiIndex; confirm that casting it to a categorical dtype is supported by
# the pandas version in use. Also note that the dtype class itself is passed,
# not an instance.
df_intensities.index = df_intensities.index.astype(pd.CategoricalDtype)
df_intensities.name = 'Intensity'

In [None]:
# write the current `df_intensities` to the path stored in `fname`
df_intensities.to_csv(fname)
df_intensities

## Aggregated Peptides

In [None]:
# counter object for aggregated peptides, created from FNAME_C_PEPTIDES
peptide_counter = data_objects.PeptideCounter(FNAME_C_PEPTIDES)

In [None]:
peptide_counts = peptide_counter.get_df_counts()
# rows whose 'proportion' reaches the completeness cutoff
# (this rebinds `mask`, which was used for the evidence counts before)
mask = peptide_counts['proportion'] >= FEAT_COMPLETNESS_CUTOFF
peptide_counts.loc[mask]

In [None]:
# index built from the 'Sequence' column of the rows passing the cutoff
selected_peptides = peptide_counts.loc[mask].set_index('Sequence').index
selected_peptides

In [None]:
# rebinds `dumps`: from here on it holds the aggregated peptide dumps
dumps = list(peptide_counter.dumps.values())

# load the first dump to preview it
peptides = data_objects.load_agg_peptide_dump(dumps[0])
peptides.head()

In [None]:
%%time
N = min(len(dumps), N)

data_intensity = {}
support = {}
# again with multiprocessing? await functions?
for fp_training_sample in tqdm(dumps[:N]):
    sample_name = fp_training_sample.stem
    peptides = data_objects.load_agg_peptide_dump(fp_training_sample)
    sequences_available = peptides.index.intersection(selected_peptides)
    support[sample_name] = len(sequences_available)
    data_intensity[sample_name] = peptides.loc[sequences_available,
                                               'Intensity'].to_dict()

In [None]:
# samples as rows, collected peptides as columns
df_intensities = pd.DataFrame.from_dict(data_intensity).T
df_intensities.index.name = 'Sample ID'
df_intensities.sort_index(inplace=True)
fname_agg_peptides = config.insert_shape(df_intensities, 'df_intensities{}.csv')
df_intensities.to_csv(config.FOLDER_DATA / fname_agg_peptides)
df_intensities

In [None]:
# number of selected peptides found per sample, stored next to the intensities
fname_support = config.insert_shape(df_intensities, 'support_agg_peptides{}.json')
with open(config.FOLDER_DATA / fname_support, 'w') as f:
    json.dump(support, f)