# Build a set of training data

Use a set of (most) common peptides to create initial data sets

In [None]:
import yaml
import json
import random  # shuffle, seed
import functools
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

import src.config as config
from src.config import FOLDER_PROCESSED
import vaep.io
from vaep.io import data_objects

from src.config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES

FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES

In [None]:
N: int = 10_000  # Number of max samples
RANDOM_SEED: int = 42  # Random seed for reproducibility

FEAT_COMPLETNESS_CUTOFF = 0.25  # Minimal proportion of samples which have to share a feature

# Year strings that are matched against the parent-folder names of the dumps
YEARS = ['2018', '2019']

# Identifier columns of the long-format table, in order
IDX_COLS_LONG = ['Sample ID', 'Sequence', 'Charge']

# dtype mapping for DataFrame.astype: every identifier column is a categorical
TYPES_EVIDENCE = dict.fromkeys(IDX_COLS_LONG, 'category')

In [None]:
## Charged Peptides

# Counter object built from the evidence counts file
counter = data_objects.EvidenceCounter(FNAME_C_EVIDENCE)
counts = (
    counter.get_df_counts()
    .convert_dtypes()
    .astype({'Charge': int})
)
# Boolean mask of features whose proportion reaches the completeness cutoff
mask = FEAT_COMPLETNESS_CUTOFF <= counts['proportion']
counts.loc[mask]

In [None]:
# Index of the features passing the cutoff, keyed by the counter's index
# columns and sorted
selected_evidence = counts.loc[mask].set_index(counter.idx_names)
selected_evidence = selected_evidence.sort_index().index
selected_evidence

In [None]:
dumps = [*counter.dumps.values()]


def load_evidence_dump(fpath, index_col=counter.idx_names):
    """Read a single evidence dump from a CSV file.

    Parameters
    ----------
    fpath
        Location of the CSV file; forwarded to `pandas.read_csv`.
    index_col
        Column(s) to use as index. Defaults to `counter.idx_names` as it was
        when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The table returned by `pandas.read_csv`.
    """
    return pd.read_csv(fpath, index_col=index_col)


# Try the loader on the first dump
load_evidence_dump(dumps[0])

In [None]:
from typing import List


def select_files_by_parent_folder(fpaths: List, years: List):
    """Select the files whose parent folder name contains one of `years`.

    Parameters
    ----------
    fpaths
        Path objects of the candidate files (`.parent.stem` is inspected).
    years
        Strings that are searched for (substring match) in the name of the
        parent folder of each file.

    Returns
    -------
    list
        Selected paths, grouped in the order of `years` and, within a year,
        in the order of `fpaths`. Every entry of `fpaths` is selected at most
        once, even if its folder name matches several entries of `years`.
    """
    selected = []
    # Positions in `fpaths` which were already picked. Tracking positions
    # (not values) keeps genuine duplicates in `fpaths` untouched while
    # preventing one file from being added once per matching year.
    picked = set()
    for year_folder in years:
        # several passes over fpaths, but not a bottleneck
        for pos, dump in enumerate(fpaths):
            if pos in picked:
                continue
            if year_folder in dump.parent.stem:
                picked.add(pos)
                selected.append(dump)
    return selected

# Restrict the evidence dumps to the folders of the configured years
selected_dumps = select_files_by_parent_folder(fpaths=dumps, years=YEARS)
print("Total number of files:", len(selected_dumps))
# Show the last ten selected files
selected_dumps[-10:]

In [None]:
from typing import Callable, List, Optional


def process_folders(fpaths: List[Path],
                    selected_features: pd.Index,
                    load_folder: Callable,
                    id_col='Sample ID',
                    dtypes: Optional[dict] = None) -> pd.DataFrame:
    """Collect the intensities of selected features from several dumps.

    Parameters
    ----------
    fpaths
        Paths of the dumps to load. The stem of each path is used as the
        sample name.
    selected_features
        Index of the features to keep; intersected with the index of every
        loaded dump.
    load_folder
        Callable taking one path and returning a DataFrame which has an
        'Intensity' column.
    id_col
        Name of the column which stores the sample name.
    dtypes
        Column to dtype mapping passed to `DataFrame.astype`. If None,
        'Sample ID' and 'Sequence' are cast to 'category'.

    Returns
    -------
    pandas.DataFrame
        Long-format table with the former index levels of the dumps as
        columns, the 'Intensity' column and the `id_col` column.

    Raises
    ------
    ValueError
        Raised by `pandas.concat` if `fpaths` is empty.
    """
    if dtypes is None:
        # built per call instead of sharing one mutable default dict
        dtypes = {'Sample ID': 'category', 'Sequence': 'category'}

    print(f"started new process with {len(fpaths)} files.")
    data_intensity = []
    for i, fpath in enumerate(fpaths):
        # progress message for every tenth file
        if i % 10 == 0:
            print(f"File ({i}): {fpath}")
        sample_name = fpath.stem
        dump = load_folder(fpath)
        # only the selected features which are present in this dump
        sequences_available = dump.index.intersection(selected_features)
        dump = dump.loc[sequences_available, 'Intensity'].reset_index()
        dump[id_col] = sample_name
        dump = dump.astype(dtypes)
        data_intensity.append(dump)

    data_intensity = pd.concat(data_intensity, copy=False, ignore_index=True)
    # cast again: concatenating categoricals with differing categories does
    # not keep the categorical dtype
    data_intensity = data_intensity.astype(dtypes)
    return data_intensity

# experiment:
# process_folders(dumps[:2],
#                 selected_features=selected_evidence,
#                 load_folder=load_evidence_dump,
#                 dtypes=TYPES_EVIDENCE).dtypes


In [None]:
%%time
# N = 100
# Apply the configured maximum number of samples: previously N was computed
# here but all selected dumps were processed regardless.
N = min(len(selected_dumps), N)
selected_dumps = selected_dumps[:N]

LOAD_DUMP = load_evidence_dump
SELECTED_FEATURES = selected_evidence

# Fix the feature selection and the loader, leaving only the file paths open
# (presumably supplied per chunk by collect_in_chuncks -- confirm in vaep.io).
process_folders_peptides = functools.partial(process_folders,
                                             selected_features=SELECTED_FEATURES,
                                             load_folder=LOAD_DUMP)
collected_dfs = data_objects.collect_in_chuncks(paths=selected_dumps,
                                                process_chunk_fct=process_folders_peptides,
                                                chunks=200)

# one would need to aggregate categories first to keep them during aggregation?
collected_dfs = pd.concat(collected_dfs, copy=False, ignore_index=True)
# cast the identifier columns (again) after joining the chunk results
collected_dfs = collected_dfs.astype(TYPES_EVIDENCE)
df_intensities = collected_dfs
df_intensities

In [None]:
# Display the dtype of each column of the collected table
df_intensities.dtypes

In [None]:
# Move the identifier columns listed in IDX_COLS_LONG into the index
df_intensities = df_intensities.set_index(IDX_COLS_LONG)

In [None]:
# N: number of selected dump files, M: number of selected features;
# both are used as the shape inserted into the output file names
N, M = len(selected_dumps), len(selected_evidence)
N, M

In [None]:
# Sort the table by its index before writing it to disk
df_intensities.sort_index(inplace=True)
# NOTE(review): the resulting name has no separator between "long" and the
# first year -- confirm whether this is intended before changing it, as other
# code may rely on the file name.
base_name = "df_intensities_evidence_long" + '_'.join(YEARS)
fname = config.insert_shape(df_intensities, base_name + '{}.csv', shape=(N, M))
fname = config.FOLDER_DATA / fname
print(f"{fname = }")
df_intensities.to_csv(fname)
df_intensities

In [None]:
# Same base name and shape as used for the CSV export, stored as a pickle
fname = config.insert_shape(df_intensities, base_name + '{}.pkl', shape=(N, M))
fname = config.FOLDER_DATA / fname
print(f"{fname = }")
df_intensities.to_pickle(fname)

## Aggregated Peptides

In [None]:
# Counter object for the aggregated peptides, built from FNAME_C_PEPTIDES
peptide_counter = data_objects.PeptideCounter(FNAME_C_PEPTIDES)

In [None]:
peptide_counts = peptide_counter.get_df_counts()
# Boolean mask of peptides whose proportion reaches the completeness cutoff
mask = FEAT_COMPLETNESS_CUTOFF <= peptide_counts['proportion']
peptide_counts.loc[mask]

In [None]:
# Index of the sequences which pass the completeness cutoff
selected_peptides = peptide_counts.loc[mask]
selected_peptides = selected_peptides.set_index('Sequence').index
selected_peptides

In [None]:
# From here on `dumps` refers to the aggregated peptide dumps
dumps = [*peptide_counter.dumps.values()]

# Load the first dump and show its first rows
peptides = data_objects.load_agg_peptide_dump(dumps[0])
peptides.head()

In [None]:
%%time
# NOTE(review): N is reassigned in an earlier cell (N = len(selected_dumps)),
# so it may no longer hold the configured maximum number of samples -- confirm.
N = min(len(dumps), N)

data_intensity = {}  # sample name -> {sequence: intensity}
support = {}  # sample name -> number of selected peptides found
# again with multiprocessing? await functions?
for fp_training_sample in tqdm(dumps[:N]):
    peptides = data_objects.load_agg_peptide_dump(fp_training_sample)
    sequences_available = peptides.index.intersection(selected_peptides)
    sample_name = fp_training_sample.stem
    support[sample_name] = len(sequences_available)
    intensities = peptides.loc[sequences_available, 'Intensity']
    data_intensity[sample_name] = intensities.to_dict()

In [None]:
# One row per sample, one column per peptide sequence, rows sorted by sample
df_intensities = (
    pd.DataFrame.from_dict(data_intensity)
    .transpose()
    .rename_axis('Sample ID')
    .sort_index()
)
df_intensities.to_csv(
    config.FOLDER_DATA
    / config.insert_shape(df_intensities, 'df_intensities{}.csv')
)
df_intensities

In [None]:
# Store the per-sample number of selected peptides next to the intensities
_support_fname = config.insert_shape(df_intensities, 'support_agg_peptides{}.json')
with open(config.FOLDER_DATA / _support_fname, 'w') as f:
    json.dump(support, f)