# Build a set of training data

Use a set of the (most) common peptides to create the initial data sets.

In [None]:
import yaml
import json
import random  # shuffle, seed
import functools
from pathlib import Path
import logging

import pandas as pd
from tqdm.notebook import tqdm

import src.config as config
from src.config import FOLDER_PROCESSED
import vaep.io
from vaep.io import data_objects

from src.config import FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES

# Echo the FNAME_C_* constants imported from src.config as the cell output.
FNAME_C_PEPTIDES, FNAME_C_EVIDENCE, FNAME_C_PG, FNAME_C_GENES

In [None]:
from typing import List
def select_files_by_parent_folder(fpaths:List, years:List):
    """Select the paths whose parent folder name contains one of the given years.

    Parameters
    ----------
    fpaths : List
        Path-like objects exposing `.parent.stem` (e.g. `pathlib.Path`).
    years : List
        Substrings looked up in the stem of each path's parent folder.

    Returns
    -------
    list
        Matching paths, grouped in the order of `years`. A path whose parent
        folder matches several entries of `years` is listed once per match.
    """
    # One pass over `fpaths` per year: simple, and not a bottleneck.
    return [fpath
            for year_folder in years
            for fpath in fpaths
            if year_folder in fpath.parent.stem]

def load_evidence_dump(fpath, index_col=['Sequence', 'Charge']):
    """Read a CSV dump into a DataFrame indexed by `index_col`.

    Parameters
    ----------
    fpath
        Anything `pd.read_csv` accepts as its first argument.
    index_col
        Column(s) used as index, by default `Sequence` and `Charge`.

    Returns
    -------
    pd.DataFrame
        The parsed dump.
    """
    return pd.read_csv(fpath, index_col=index_col)

def load_agg_peptide_dump(fpath):
    """Read a peptide dump from CSV, using its first column as the index.

    NOTE(review): `d_dtypes_training_sample` is not defined in the visible part
    of this notebook -- confirm it is available before calling this function.
    """
    return pd.read_csv(Path(fpath), index_col=0, dtype=d_dtypes_training_sample)

In [None]:
# --- General settings -------------------------------------------------------
# NOTE(review): RANDOM_SEED is not referenced in the visible cells.
RANDOM_SEED: int = 42  # Random seed for reproducibility

FEAT_COMPLETNESS_CUTOFF = 0.25 # Minimal proportion of samples which have to share a feature

# A dump is selected if the name of its parent folder contains one of these strings.
YEARS = ['2017','2018', '2019', '2020']


# --- Configuration: evidence ------------------------------------------------
# NOTE(review): the next two cells reassign the same names (NAME, BASE_NAME,
# TYPES_DUMP, ...); presumably only one configuration cell is meant to be run.
NAME = 'evidence'
BASE_NAME = f"df_intensities_{NAME}_long"  # prefix of the output file name

# dtypes applied to the collected long-format intensities
TYPES_DUMP = {'Sample ID': 'category',
              'Sequence': 'category',
              'Charge': 'category',}

# if non-empty, the table of counts is cast before filtering (see the counts cell)
TYPES_COUNT = {'Charge': int}

# columns set as index of the long-format table
IDX_COLS_LONG = ['Sample ID', 'Sequence', 'Charge'] # in order

# function used to load a single dump file
LOAD_DUMP = load_evidence_dump

# counter class and the argument it is instantiated with
CounterClass = data_objects.EvidenceCounter
FNAME_COUNTER = FNAME_C_EVIDENCE

In [None]:
# --- Configuration: peptides ------------------------------------------------
# Reassigns the names set by the evidence configuration cell above.
NAME = 'peptides'
BASE_NAME = f"df_intensities_{NAME}_long"  # prefix of the output file name

# dtypes applied to the collected long-format intensities
TYPES_DUMP = {'Sample ID': 'category',
                  'Sequence': 'category',
                  }

# empty: the table of counts is not cast for this configuration
TYPES_COUNT = {}

# columns set as index of the long-format table
IDX_COLS_LONG = ['Sample ID', 'Sequence'] # in order

# function used to load a single dump file (library version, not the local helper)
LOAD_DUMP = data_objects.load_agg_peptide_dump

# counter class and the argument it is instantiated with
CounterClass = data_objects.PeptideCounter
FNAME_COUNTER = FNAME_C_PEPTIDES

In [None]:
# --- Configuration: proteinGroups -------------------------------------------
# Reassigns the names set by the previous configuration cells.
NAME = 'proteinGroups'
BASE_NAME = f"df_intensities_{NAME}_long"  # prefix of the output file name

# dtypes applied to the collected long-format intensities
TYPES_DUMP = {'Sample ID': 'category',
              'Gene names': 'category',
                  }

# empty: the table of counts is not cast for this configuration
TYPES_COUNT = {}

# columns set as index of the long-format table
IDX_COLS_LONG = ['Sample ID', 'Gene names'] # in order


def load_pg_dump(folder):
    """Read a dump from CSV using `pg_cols.Gene_names` as the index column.

    NOTE(review): `logger`, `pg_cols` and `use_cols` are not defined in the
    visible part of this notebook, and this helper is not used below
    (LOAD_DUMP is set to `data_objects.pg_idx_gene_fct`) -- confirm or remove.
    Despite its name, `folder` is passed to `pd.read_csv` as the file to read.
    """
    logger.debug(f"Load: {folder}")
    df = pd.read_csv(folder, index_col=pg_cols.Gene_names, usecols=use_cols)
    return df

# function used to load a single dump file
LOAD_DUMP = data_objects.pg_idx_gene_fct



# counter class and the argument it is instantiated with
CounterClass = data_objects.GeneCounter
FNAME_COUNTER = FNAME_C_GENES

In [None]:
## Feature counts (the kind of feature depends on the configuration cell that was run)

# Load the table of counts from the configured counter.
counter = CounterClass(FNAME_COUNTER)
counts = counter.get_df_counts()

if TYPES_COUNT:
    # Cast with the configured dtypes rather than a hard-coded {'Charge': int},
    # so this cell follows whichever configuration is active.
    counts = counts.convert_dtypes().astype(TYPES_COUNT)
# Keep the features shared by at least FEAT_COMPLETNESS_CUTOFF of the samples.
mask = counts['proportion'] >= FEAT_COMPLETNESS_CUTOFF
counts.loc[mask]

In [None]:
# Index of the features passing the completeness cutoff, sorted and keyed by
# the counter's index names.
selected_features = (
    counts.loc[mask]
    .set_index(counter.idx_names)
    .sort_index()
    .index
)
# selected_features.name = 'Gene names' # needs to be fixed
selected_features

In [None]:
# Restrict the dumps known to the counter to those whose parent folder matches YEARS.
selected_dumps = select_files_by_parent_folder([*counter.dumps.values()], years=YEARS)
print(f"Total number of files: {len(selected_dumps)}")
selected_dumps[-10:]  # show the last ten selected dumps

In [None]:
# Load the first selected dump with the configured loader and show it as cell output.
LOAD_DUMP(selected_dumps[0])

In [None]:
from typing import Callable, List, Optional
from pandas.errors import EmptyDataError


def process_folders(fpaths: List[Path],
                    selected_features: pd.Index,
                    load_folder: Callable,
                    id_col='Sample ID',
                    dtypes: Optional[dict] = None) -> pd.DataFrame:
    """Collect the intensities of the selected features from several dumps.

    Each dump is loaded with `load_folder`, restricted to the index entries
    that are also in `selected_features`, and tagged with the stem of its file
    name in the `id_col` column. Dumps that are empty or missing are skipped
    with a warning.

    Parameters
    ----------
    fpaths : List[Path]
        Files to load; strings are accepted as well.
    selected_features : pd.Index
        Features to keep; intersected with the index of each dump.
    load_folder : Callable
        Called with one entry of `fpaths`; has to return a DataFrame with an
        'Intensity' column.
    id_col : str
        Name of the column holding the sample name.
    dtypes : dict, optional
        Mapping passed to `DataFrame.astype`. Defaults to casting 'Sample ID'
        and 'Sequence' to 'category'.

    Returns
    -------
    pd.DataFrame
        Long-format table: the former index column(s), 'Intensity' and `id_col`.

    Raises
    ------
    ValueError
        If none of the files could be loaded (including an empty `fpaths`).
    """
    if dtypes is None:
        # Built per call instead of sharing one mutable default between calls.
        dtypes = {'Sample ID': 'category', 'Sequence': 'category'}
    print(f"started new process with {len(fpaths)} files.")
    data_intensity = []
    for i, fpath in enumerate(fpaths):
        if not i % 10:
            print(f"File ({i}): {fpath}")
        try:
            dump = load_folder(fpath)
        except EmptyDataError:
            logging.warning(f'Empty dump: {fpath}')
            continue
        except FileNotFoundError:
            logging.warning(f'Missing dump: {fpath}')
            continue
        sequences_available = dump.index.intersection(selected_features)
        dump = dump.loc[sequences_available, 'Intensity'].reset_index()
        dump[id_col] = Path(fpath).stem  # sample name
        dump = dump.astype(dtypes)
        data_intensity.append(dump)

    if not data_intensity:
        # pd.concat would raise a ValueError as well, but without any context.
        raise ValueError(
            f"No data could be loaded from the {len(fpaths)} file(s) provided.")
    data_intensity = pd.concat(data_intensity, copy=False, ignore_index=True)
    # Cast again after concatenating the per-dump frames.
    data_intensity = data_intensity.astype(dtypes)
    return data_intensity

# # experiment
# process_folders(selected_dumps[:2],
#                 selected_features=selected_features,
#                 load_folder=LOAD_DUMP,
#                 dtypes=TYPES_DUMP)


In [None]:
%%time

# Bind the configuration so that only the chunk of file paths is left as argument.
process_folders_peptides = functools.partial(
    process_folders,
    selected_features=selected_features,
    load_folder=LOAD_DUMP,
    dtypes=TYPES_DUMP,
)
collected_dfs = data_objects.collect_in_chuncks(
    paths=selected_dumps,
    process_chunk_fct=process_folders_peptides,
    chunks=200,
    n_workers=1,  # to debug, don't multiprocess
)

# one would need to aggregate categories first to keep them during aggregation?
# Until then the dtypes are applied once more after concatenating.
collected_dfs = pd.concat(collected_dfs, copy=False, ignore_index=True)
collected_dfs = collected_dfs.astype(TYPES_DUMP)
df_intensities = collected_dfs
df_intensities

In [None]:
# Show the column dtypes after the cast with TYPES_DUMP.
df_intensities.dtypes

In [None]:
# Move the configured index columns into the index of the long-format table.
df_intensities = df_intensities.set_index(IDX_COLS_LONG)

In [None]:
# N: number of selected dump files, M: number of selected features
N, M = len(selected_dumps), len(selected_features)
N, M

In [None]:
# File name: BASE_NAME followed by the years, joined with underscores.
base_name = f"{BASE_NAME}_{'_'.join(YEARS)}"
# The '{}' placeholder is handed to config.insert_shape together with (N, M);
# presumably it is filled with the shape -- confirm in src.config.
fname = config.FOLDER_DATA / config.insert_shape(
    df_intensities,
    f'{base_name}{{}}.pkl',
    shape=(N, M),
)
print(f"{fname = }")
df_intensities.to_pickle(fname)

In [None]:
# df_intensities = df_intensities.unstack(['Sample ID'])
# df_intensities

In [None]:
# df_intensities.sort_index(inplace=True)
# base_name = "df_intensities_evidence_long" + '_'.join(YEARS)
# fname = config.FOLDER_DATA / config.insert_shape(df_intensities, base_name + '{}.csv', shape=(N,M))
# print(f"{fname = }")
# df_intensities.to_csv(fname)
# df_intensities