# Build a set of training data

Use a set of the most common peptides to create initial data sets.

In [None]:
import random  # shuffle, seed

import pandas as pd
from tqdm.notebook import tqdm

import src.config as config
from src.config import FOLDER_PROCESSED
from src.data_objects import PeptideCounter

In [None]:
# Seed for the `random` module so that the sample shuffle can be repeated.
RANDOM_SEED = 42
# Number of features (most common peptides) to select.
M = 5_000
# Maximum number of samples to read.
N = 10_000

In [None]:
# Collect the processed sample files (CSV only).
# `iterdir()` yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted first; otherwise seeding the shuffle below would not make
# the resulting order reproducible between machines or runs.
training_data = sorted(
    fpath for fpath in FOLDER_PROCESSED.iterdir() if fpath.suffix == '.csv'
)
random.seed(RANDOM_SEED)
random.shuffle(training_data)
training_data[:10]

In [None]:
# Pick the M most frequent entries from the peptide counter.
# NOTE(review): `counter` presumably is a collections.Counter keyed by
# peptide sequence, so `most_common` returns (key, count) pairs -- confirm.
peptide_counter = PeptideCounter()
peptide_counts = peptide_counter.counter
selected_peptides = peptide_counts.most_common(M)
selected_peptides[:10]

In [None]:
# Turn the (key, count) pairs into a mapping for fast membership look-ups.
# `dict()` accepts both a list of pairs and an existing dict, so re-running
# this cell does not fail the way unpacking `for k, v in ...` over a dict would.
selected_peptides = dict(selected_peptides)

In [None]:
from pathlib import Path

# Column dtypes used when reading a training sample CSV.
# 'Sequence' is deliberately left without a dtype (it was commented out in the
# original mapping) -- presumably because it is read as the index; confirm.
_string_columns = ('Proteins', 'Leading razor protein', 'Gene names')
d_dtypes_training_sample = {
    column: pd.StringDtype() for column in _string_columns
}
# Nullable integer dtype, so missing intensities do not force a float column.
d_dtypes_training_sample['Intensity'] = pd.Int64Dtype()


def load_training_sample(fpath):
    """Read one training sample CSV into a DataFrame.

    Parameters
    ----------
    fpath : str or path-like
        Location of the CSV file; it is wrapped in `pathlib.Path`.

    Returns
    -------
    pandas.DataFrame
        File contents with the first column as index and the column dtypes
        given by `d_dtypes_training_sample`.
    """
    return pd.read_csv(Path(fpath),
                       index_col=0,
                       dtype=d_dtypes_training_sample)

# peptides = load_training_sample(training_data[0])

In [None]:
%%time
# Never ask for more samples than there are files.
N = min(len(training_data), N)

# The selected peptides do not change between samples: build the Index once
# instead of letting `intersection` convert the dict keys on every iteration.
selected_sequences = pd.Index(list(selected_peptides.keys()))

data_intensity = {}  # sample name -> {index label: intensity}
data_genes = {}      # sample name -> {index label: gene names}
support = {}         # sample name -> number of selected peptides found
# TODO: loading the files could be parallelised (multiprocessing / async).
for fp_training_sample in tqdm(training_data[:N]):
    sample_name = fp_training_sample.stem
    peptides = load_training_sample(fp_training_sample)
    # Keep only the selected peptides that are present in this sample.
    sequences_available = peptides.index.intersection(selected_sequences)
    support[sample_name] = len(sequences_available)
    # Select the rows once and read both columns from that selection.
    selected_rows = peptides.loc[sequences_available,
                                 ['Intensity', 'Gene names']]
    data_intensity[sample_name] = selected_rows['Intensity'].to_dict()
    data_genes[sample_name] = selected_rows['Gene names'].to_dict()

In [None]:
# `from_dict` puts the samples into columns; transpose so that every sample
# is one row, then order the rows by sample name.
df_intensities = pd.DataFrame.from_dict(data_intensity).T.sort_index()
# The file name is built by the project helper `config.build_df_fname`.
fname_intensities = config.build_df_fname(df_intensities, 'df_intensities')
df_intensities.to_csv(config.FOLDER_DATA / fname_intensities)
df_intensities

In [None]:
# `from_dict` puts the samples into columns; transpose so that every sample
# is one row. The rows are ordered by sample name, as is done for
# `df_intensities`, so both saved tables list the samples in the same order.
df_genes = pd.DataFrame.from_dict(data_genes).T
df_genes.sort_index(inplace=True)
df_genes.to_csv(config.FOLDER_DATA /
                config.build_df_fname(df_genes, 'df_genes'))
df_genes