# Count peptides over all files

In [None]:
import os
import sys
import logging
from pathlib import Path
import random

import pandas as pd
import ipywidgets as widgets

sys.path.append('/home/jovyan/work/vaep/')
from vaep.io.mq import MaxQuantOutputDynamic

from src.logging import setup_logger_w_file
from src.data_objects import MqAllSummaries 

##################
##### CONFIG #####
##################
from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED

from config import FOLDER_DATA # project folder for storing the data
print(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}")

##################
### Logging ######
##################

# Remove any handlers already attached to the root logger
# (e.g. the one installed by the Jupyter notebook).
root_logger = logging.getLogger()
root_logger.handlers = []

# Configure the package logger via the project helper, which returns the logger to use.
logger = setup_logger_w_file(logging.getLogger('vaep'), fname_base='log_00_mq_count_peptides')

# Log one line per handler that is now attached to the package logger.
handler_lines = "\n".join(f"- {log_!r}" for log_ in logger.handlers)
logger.info('Start with handlers: \n' + handler_lines)

# folders = [folder for folder in  Path(FOLDER_MQ_TXT_DATA).iterdir()]
# w_file = widgets.Dropdown(options=[folder for folder in folders], description='View files')
# w_file

In [None]:
# NOTE(review): `threshold_ms2_identified` is assigned here but is not passed to
# `get_files_w_min_MS2()` below - presumably the method applies its own default
# threshold; confirm against `MqAllSummaries`.
threshold_ms2_identified = 10_000

mq_all_summaries = MqAllSummaries()
folders = mq_all_summaries.get_files_w_min_MS2()
folders[:10]  # preview the first ten selected entries

## Random example

In [None]:
import random

# Use the fully qualified option name: the short form 'max_columns' is resolved by
# pattern matching and raises an OptionError in pandas versions that define more than
# one option ending in 'max_columns'.
pd.set_option('display.max_columns', 60)

# Pick one of the selected folders at random and show its peptides table.
random_folder = random.choice(folders)
mq_output = MaxQuantOutputDynamic(random_folder)
print(f"peptides.txt from {random_folder!s}")
mq_output.peptides

In [None]:
mq_output.peptides.Intensity # as is in peptides.txt, comma separated thousands

## Count peptides

In [None]:
from collections import Counter
import multiprocessing
from tqdm.notebook import tqdm

import vaep.io.mq as mq

import numpy as np
N_WORKERS = 8

# #ToDo
# class PeptideCounter():
    
# # add directly dumping of folder? unique peptides?    
# # reduce storage for potential download? which columns to retain
# # check df for redundant information (same feature value for all entries)
def count_peptides(folders):
    """Count how often each peptide is found across the given folders.

    For every folder the file ``peptides.txt`` is read, keeping only the
    sequence, intensity and "Potential contaminant" columns. Rows with an
    intensity equal to zero or flagged with '+' as potential contaminant are
    skipped; the index values of the remaining rows are counted.

    Parameters
    ----------
    folders : iterable
        Folder paths supporting the ``/`` operator (e.g. ``pathlib.Path``),
        each containing a ``peptides.txt`` file.

    Returns
    -------
    collections.Counter
        Maps each retained index value (first selected column, presumably the
        peptide sequence) to the number of times it was seen.
    """
    selected_columns = [mq.mq_col.SEQUENCE, mq.mq_col.INTENSITY, "Potential contaminant"]
    counts = Counter()
    for mq_folder in folders:
        table = pd.read_table(mq_folder / 'peptides.txt', usecols=selected_columns, index_col=0)
        # Keep rows that have a non-zero intensity AND are not flagged as contaminant.
        has_intensity = table[mq.mq_col.INTENSITY] != 0
        not_contaminant = table["Potential contaminant"] != '+'
        keep = has_intensity & not_contaminant
        counts.update(table.loc[keep, mq.mq_col.INTENSITY].index)
    return counts

# combine multiprocessing?
# def sum_over_files(folders, n_workers=N_WORKERS):
# Number of chunks `folders` is split into. Each chunk is one task handed to a
# worker process and corresponds to one tick of the progress bar.
N_CHUNKS = 100

with multiprocessing.Pool(N_WORKERS) as p:
    folder_chunks = np.array_split(folders, N_CHUNKS)
    list_of_sample_dicts = list(
        tqdm(p.imap(count_peptides, folder_chunks),
             total=len(folder_chunks),
             desc='Count peptides')
    )

# Merge the per-chunk counters into one overall counter.
c = Counter()
for d in tqdm(list_of_sample_dicts, desc='combine counters'):
    c.update(d)

In [None]:
# Number of processed folders - presumably the upper bound for any single peptide count.
n_folders = len(folders)
print("Potential maximum:", n_folders)

# Show the 100 most frequently counted entries.
c.most_common(100)

Define the missing-value pattern based on the most abundant peptides (the hope is to have only a few cases in the pattern).

In [None]:
from vaep.io import dump_json

# FOLDER_PROCESSED comes from the project config; wrapping it in Path makes the `/`
# join work whether it is configured as a Path or as a plain string.
dump_json(c, filename=Path(FOLDER_PROCESSED) / 'count_all_peptides.json')

## Create peptide intensity dumps for each MQ output folder

All folders are stored in a list

Check if the output folder already contains parsed files

maybe this should

In [None]:
# import json

# import config

# with open(config.FN_FASTA_DB) as f:
#     data_fasta = json.load(f)
# print(f'Number of proteins in fasta file DB: {len(data_fasta)}')

Some files to investigate in more detail

```
20130408_QE6_LC5_KBS_MNT_QC_HeLa_02  # reversed protein leading razor protein
``` 

In [None]:
# %%time
# FOLDER_PROCESSED = Path(FOLDER_PROCESSED)
# set_previously_loaded =  {folder.name for folder in FOLDER_PROCESSED.iterdir()}

# FORCE = True

# for folder in folders:
#     if folder.name in set_previously_loaded and not FORCE and (folder / '0_completness_all_genes.json').exists():
#         pass
#     else:
#         logger.info('\n\nProcess: {folder.name}')
#         print(f"Process: {folder.name}")
#         mq_output = MaxQuantOutputDynamic(folder)
#         peptide_extractor = ExtractFromPeptidesTxt(
#             out_folder=FOLDER_PROCESSED, mq_output_object=mq_output, fasta_db=data_fasta)
#         completeness_per_gene = peptide_extractor()

In [None]:
def load_peptides(folders):
    """Load the peptide intensities from one or several MaxQuant output folders.

    For every folder the file ``peptides.txt`` is read, keeping only the
    sequence, intensity and "Potential contaminant" columns. Rows with an
    intensity equal to zero or flagged with '+' as potential contaminant are
    dropped.

    Parameters
    ----------
    folders : path-like or iterable of path-like
        A single folder or an iterable of folders, each containing a
        ``peptides.txt`` file.

    Returns
    -------
    dict
        ``{folder.stem: {index value: intensity}}`` with one entry per folder.
        The index is the first selected column (presumably the peptide sequence).
    """
    # A single folder is accepted as well, so wrap it into a list.
    if isinstance(folders, (str, os.PathLike)):
        folders = [folders]

    usecols = [mq.mq_col.SEQUENCE, mq.mq_col.INTENSITY, "Potential contaminant"]
    intensities = {}
    for folder in folders:
        folder = Path(folder)
        peptides = pd.read_table(folder / 'peptides.txt', usecols=usecols, index_col=0)
        # Rows to drop: no measured intensity or flagged as potential contaminant.
        mask = (peptides[mq.mq_col.INTENSITY] == 0) | (peptides["Potential contaminant"] == '+')
        intensities[folder.stem] = peptides.loc[~mask, mq.mq_col.INTENSITY].to_dict()
    return intensities

## Theoretical Peptides from used fasta-file

> `01_explore_FASTA.ipynb` (formerly `01_FASTA_tryptic_digest.ipynb`)