# Count peptides over all files

In [None]:
import os
import sys
import logging
from pathlib import Path
import random

import pandas as pd
import ipywidgets as widgets

### Logging setup ######
from vaep.logging import setup_nb_logger
setup_nb_logger()

### vaep imports ######
from vaep.io.mq import MaxQuantOutputDynamic
from vaep.io.data_objects import MqAllSummaries
from vaep.io.data_objects import PeptideCounter

##################
##### CONFIG #####
##################
# Project-specific folder locations are defined centrally in src.config.
# FOLDER_MQ_TXT_DATA is logged below and later passed to get_files_w_min_MS2.
from src.config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED

# NOTE(review): FOLDER_PROCESSED and FOLDER_DATA are not referenced in the
# visible cells of this notebook -- confirm whether they are still needed.
from src.config import FOLDER_DATA # project folder for storing the data
# Log the configured MaxQuant txt-data folder for traceability of the run.
logging.info(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}")

Use samples previously loaded.

In [None]:
# Build the list of MaxQuant output folders to process from the summaries
# of previously loaded samples.
mq_all_summaries = MqAllSummaries()
# Threshold handed to get_files_w_min_MS2.
# NOTE(review): presumably the minimum number of identified MS2 spectra a
# sample needs in order to be kept -- confirm in MqAllSummaries.
threshold_ms2_identified = 15_000
# `relativ_to` (sic) is the keyword spelling of the project API.
folders = mq_all_summaries.get_files_w_min_MS2(threshold=threshold_ms2_identified, relativ_to=FOLDER_MQ_TXT_DATA)
# Preview the first ten selected folders.
folders[:10]

## Random example

In [None]:
import random

# Use the fully qualified option name: the bare pattern 'max_columns' is
# ambiguous on pandas >= 1.4 (it also matches 'styler.render.max_columns')
# and raises an OptionError there.
pd.set_option('display.max_columns', 60)
# Pick one of the selected MaxQuant output folders at random as an example.
random_folder = folders[random.randrange(len(folders))]
mq_output = MaxQuantOutputDynamic(random_folder)
print(f"peptides.txt from {random_folder!s}")
# Display the peptides table of the chosen folder.
mq_output.peptides

In [None]:
# Take the columns at positions 33 to 44 of the peptides table as a small
# example frame for the serialisation tests below.
# NOTE(review): the slice is positional, so the selected columns depend on
# the column order of the loaded peptides.txt -- confirm this is intended.
use_columns = mq_output.peptides.columns[33:45]
# convert_dtypes lets pandas pick the best-fitting (nullable) dtypes.
df = mq_output.peptides[use_columns].convert_dtypes() #.to_json('test.json')
df

In [None]:
# Serialise the example frame to an indented JSON string keyed by row index.
df_json_string = df.to_json(orient='index', indent=4)
# Preview the first 1000 characters of the dump.
df_json_string[:1000]

In [None]:
# Serialise the same frame to a CSV string (no path given, so to_csv
# returns the text instead of writing a file).
df_csv = df.to_csv()
# Preview the first 1000 characters of the dump.
df_csv[:1000]

In [None]:
from io import StringIO

# Round trip: parse the JSON dump back into a DataFrame.
# The string is wrapped in StringIO because passing literal JSON to
# pd.read_json is deprecated since pandas 2.1; a file-like object works on
# older and newer versions alike.
pd.read_json(StringIO(df_json_string), orient='index')

In [None]:
# Show the Intensity column of the example peptides table.
mq_output.peptides.Intensity # as is in peptides.txt, comma separated thousands

## Count peptides

In [None]:
# Create the counter object, pointing it at the JSON file given below.
# NOTE(review): the path is hard-coded relative to the working directory,
# although FOLDER_PROCESSED is imported from src.config in this notebook --
# confirm whether the config value should be used here instead.
peptide_counter = PeptideCounter('data/processed/count_all_peptides.json')
# Display the counter's representation.
peptide_counter

In [None]:
# Report the current state of the counter: its ten most common entries and
# the number of already loaded items. If either attribute is missing, an
# AttributeError is raised and the fallback message is printed instead.
try:
    top_ten = peptide_counter.counter.most_common(10)
    n_loaded = len(peptide_counter.loaded)
except AttributeError:
    print('New file created.')
else:
    print(top_ten, n_loaded, sep='\n')

In [None]:
%%time
# Run the counter over all selected folders; the cell magic above reports
# how long this takes. The result `c` is queried with most_common(...) in
# the following cells.
c = peptide_counter.sum_over_files(folders=folders)

In [None]:
# Show the ten most common entries of the result returned by sum_over_files.
c.most_common(10) # peptide_counter.counter.most_common(10)

In [None]:
# To share as python file: dump the N most common entries of `c` into a
# small, importable module that rebuilds them as a pandas DataFrame.
N = 1000
# The file name carries the number of entries actually written. It was
# previously hard-coded to 10 although N entries are dumped.
with open(f'most_common_{N}_peptides.py', 'w', encoding='utf-8') as f:
    f.write('import pandas as pd\n\n')

    # One entry of c.most_common(N) per line, forming a Python list literal.
    # pprint.pformat from the standard library would be an alternative:
    # https://docs.python.org/3/library/pprint.html
    f.write("most_common = [\n  ")
    f.write(',\n  '.join(str(t) for t in c.most_common(N)))
    f.write("\n]\n\n")

    #peptide_counter.loaded()

    # Final line of the generated module: build the DataFrame from the list.
    f.write("pd.DataFrame.from_records(most_common, index='Sequence', columns=['Sequence', 'counts'])\n")

Define the missing-value pattern based on the most abundant peptides (the hope is that the pattern then contains only a few cases).

## Create peptide intensity dumps for each MQ output folder

All folders are stored in a list

- [ ] find where this is done

## Theoretical Peptides from used fasta-file

> `01_explore_FASTA.ipynb` (formerly `01_FASTA_tryptic_digest.ipynb`)