# Count peptides over all files

In [None]:
import os
import sys
import logging
from pathlib import Path
import random
import yaml

import pandas as pd
import ipywidgets as widgets

### Logging setup ######
from vaep.logging import setup_nb_logger
setup_nb_logger()

### vaep imports ######
from vaep.io.mq import MaxQuantOutputDynamic
from vaep.io.data_objects import MqAllSummaries
from vaep.io.data_objects import PeptideCounter
import vaep.pandas

##################
##### CONFIG #####
##################
from src.config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED

from src.config import FOLDER_DATA # project folder for storing the data
logging.info(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}")

Use samples previously loaded.

In [None]:
# Locations of the YAML configuration files read below.
ELIGABLE_FILES_YAML = Path('config/eligable_files.yaml')
MAP_FOLDER_PATH = Path('config/file_paths')

# The entry stored under the 'files' key of the first YAML file.
with ELIGABLE_FILES_YAML.open() as yaml_file:
    eligible_config = yaml.safe_load(yaml_file)
files = eligible_config['files']

# Mapping whose values are used as MaxQuant output folder paths further below.
with MAP_FOLDER_PATH.open() as yaml_file:
    folders_dict = yaml.safe_load(yaml_file)


## Random example

In [None]:
import random

# Use the fully qualified option name: the short pattern 'max_columns' also
# matches 'styler.render.max_columns' in newer pandas versions and then raises
# an OptionError.
pd.set_option('display.max_columns', 60)

# Pick one (folder name, path) pair at random. `random.sample` needs a
# sequence and rejects dict views on Python >= 3.11, so the items are
# materialised into a list first.
random_folder, random_path = random.choice(list(folders_dict.items()))
mq_output = MaxQuantOutputDynamic(random_path)
print(f"peptides.txt from {random_folder!s}")
mq_output.peptides

In [None]:
# Take the columns at positions 33..44 of the peptides table as a small example
# and let pandas convert them to the best matching dtypes.
use_columns = mq_output.peptides.columns[33:45]
df = mq_output.peptides[use_columns].convert_dtypes() #.to_json('test.json')
df

In [None]:
# Serialise the example table to an indented JSON string keyed by index entry
# and preview its first 1000 characters.
df_json_string = df.to_json(orient='index', indent=4)
df_json_string[:1000]

In [None]:
# Same table as a CSV string (no path given, so the text is returned);
# preview its first 1000 characters.
df_csv = df.to_csv()
df_csv[:1000]

In [None]:
from io import StringIO

# Round trip: parse the JSON string created above back into a DataFrame.
# Passing a literal JSON string to `pd.read_json` is deprecated in recent
# pandas versions, so the string is wrapped in a file-like StringIO object.
pd.read_json(StringIO(df_json_string), orient='index')

In [None]:
mq_output.peptides.Intensity # as is in peptides.txt, comma separated thousands

## Count aggregated peptides

In [None]:
# Counter object for peptides over all files.
# NOTE(review): presumably the given JSON path is where counts are persisted
# between runs -- confirm in vaep.io.data_objects.PeptideCounter.
peptide_counter = PeptideCounter('data/processed/count_all_peptides.json')
peptide_counter

In [None]:
# Show the ten most common entries and the number of loaded items, if the
# counter object already has these attributes.
try:
    top_ten = peptide_counter.counter.most_common(10)
    n_loaded = len(peptide_counter.loaded)
except AttributeError:
    # Attributes are missing on the counter object.
    print('New file created.')
else:
    print(top_ten, n_loaded, sep='\n')

- By default, the `count_peptides` function (the default processing function of `PeptideCounter`) creates a peptide intensity dump for each MQ output folder.

In [None]:
%%time
# One Path object per value in folders_dict; these folders are handed to the
# peptide counter for aggregation.
folders = list(map(Path, folders_dict.values()))
c = peptide_counter.sum_over_files(folders=folders)

In [None]:
# Ten most frequent entries of the result returned by `sum_over_files`.
c.most_common(10) # peptide_counter.counter.most_common(10)

In [None]:
# To share as python file
# Writes an importable module defining `most_common` (a list of the N most
# common entries of the counter `c`) and ending with an expression that builds
# a DataFrame from it.
N = 1000
# The file name reflects the number of entries actually written (N); it was
# previously hard-coded to 10 although N entries are dumped.
with open(FOLDER_PROCESSED / f'most_common_{N}_peptides.py', 'w') as f:
    f.write('import pandas as pd\n\n')

    # TODO: format the list with the standard library instead, see
    # https://docs.python.org/3/library/pprint.html
    f.write("most_common = [\n  ")
    f.write(',\n  '.join(str(t) for t in c.most_common(N)))
    f.write("\n]\n\n")

    f.write("pd.DataFrame.from_records(most_common, index='Sequence', columns=['Sequence', 'counts'])\n")

## Peptides by charge

- count peptides by charge state (which are aggregated in `peptides.txt`)

In [None]:
# Accessor for the evidence column names; the index is reset first so that the
# index levels are part of the columns handed to the accessor. Used below via
# attributes such as `evidence_cols.Charge` or `evidence_cols.Intensity`.
evidence_cols = vaep.pandas.get_colums_accessor(mq_output.evidence.reset_index())
evidence_cols # vaep.mq get this list

In [None]:
# Append the charge column as an additional level to the existing index.
evidence = mq_output.evidence.set_index(evidence_cols.Charge, append=True)
evidence

In [None]:
# NOTE(review): presumably reports the proportion of unique index entries --
# confirm against vaep.pandas.prop_unique_index.
vaep.pandas.prop_unique_index(evidence)

Using the protein AA sequence and its charge as identifiers does not yield a unique index.

First potential contaminants and peptides with zero intensity (or missing intensity) can be removed from the table.

These are apparently peptides identified by an MS2 spectrum, but which could not be quantified by MS1 scans.

In [None]:
# Rows without an intensity value, broken down by their `Type` entry.
mask =  evidence[evidence_cols.Intensity].isna()
evidence.loc[mask, evidence_cols.Type].value_counts()

In [None]:
def select_evidence(df_evidence):
    """Filter an evidence table down to rows with a usable intensity.

    Rows are removed if the potential-contaminant column equals '+', if the
    intensity equals 0, or if the intensity is missing. The
    potential-contaminant column itself is dropped from the result.

    Column names are taken from the notebook-level `evidence_cols` accessor.

    Parameters
    ----------
    df_evidence : pandas.DataFrame
        Evidence table containing at least the potential-contaminant and
        intensity columns.

    Returns
    -------
    pandas.DataFrame
        Filtered table without the potential-contaminant column.
    """
    contaminant_col = evidence_cols.Potential_contaminant
    intensity_col = evidence_cols.Intensity

    is_contaminant = df_evidence[contaminant_col] == '+'
    has_zero_intensity = df_evidence[intensity_col] == 0
    to_remove = is_contaminant | has_zero_intensity

    kept = df_evidence.loc[~to_remove]
    kept = kept.drop(contaminant_col, axis=1)
    return kept.dropna(subset=[intensity_col])

# Columns kept for the analysis; the potential-contaminant column is needed by
# `select_evidence` for filtering and is dropped by it afterwards.
use_cols = [evidence_cols.mz, evidence_cols.Protein_group_IDs, evidence_cols.Intensity, evidence_cols.Score, evidence_cols.Potential_contaminant]
evidence_selected = select_evidence(evidence[use_cols])
evidence_selected

In [None]:
# Sort in descending order by sequence, charge and score.
evidence_selected = evidence_selected.sort_values(by=['Sequence', 'Charge', 'Score'], ascending=False)
evidence_selected

In [None]:
# Select per (Sequence, Charge) combination based on the Score column.
# NOTE(review): presumably keeps the row with the maximum score per group --
# confirm against vaep.pandas.select_max_by.
evidence_selected = vaep.pandas.select_max_by(evidence_selected.reset_index(), [evidence_cols.Sequence, evidence_cols.Charge], evidence_cols.Score)
evidence_selected

In [None]:
from collections import Counter

# Count how often each index entry of the (unfiltered) evidence table occurs
# and show the ten most frequent ones.
c = Counter(evidence.index)
c.most_common(10)

In [None]:
# All rows belonging to the most frequent index entry.
example = evidence.loc[c.most_common(10)[0][0]]

# NOTE(review): presumably shows only the columns whose values differ between
# these rows -- confirm against vaep.pandas.show_columns_with_variation.
vaep.pandas.show_columns_with_variation(example)

- `Type`: only `MULTI-MSMS` and `MULTI-SECPEP` are quantified (does this mean a matching MS1 spectrum?)

In [None]:
# Number of evidence rows per `Type` value.
evidence[evidence_cols.Type].value_counts()

Some peptides can be assigned to different protein group IDs (razor peptides)
 - option: discard non-unique peptides (and Protein group IDs can be already a combination of several isotopes)
 - option: select on `Score` or `Intensity` (is there a relationship?)
 - option: select based on `Number of isotopic peaks`

In [None]:
# Number of evidence rows per protein group ID entry.
evidence[evidence_cols.Protein_group_IDs].value_counts()

### Count peptides based on evidence files

In [None]:
from typing import List
from tqdm.notebook import tqdm

# Columns identifying an entry in the evidence table: sequence and charge.
idx_columns_evidence = [evidence_cols.Sequence, evidence_cols.Charge]

def create_parent_folder_name(folder):
    """Return the first four characters of the folder's stem.

    Used as the name of the parent folder under which dumps are grouped.

    Parameters
    ----------
    folder : str or pathlib.Path
        Path to a folder (or file). Strings are accepted as well and are
        converted to a `Path` first.

    Returns
    -------
    str
        Up to four leading characters of the final path component without
        its suffix.
    """
    # Normalise first so that plain string paths work too.
    return Path(folder).stem[:4]

def count_evidence(folders:List[Path],
                   select_by:str='Score',
                   dump=True,
                   parent_folder_fct:callable=create_parent_folder_name,
                   outfolder=FOLDER_PROCESSED / 'evidence_dumps'):
    """Count index entries of the selected evidence over several folders.

    For every folder, `evidence.txt` is read (restricted to
    `idx_columns_evidence` and `use_cols`), filtered with `select_evidence`,
    reduced with `vaep.pandas.select_max_by` using `select_by` as selection
    column, and sorted by index. The index entries of the result are added
    to a running counter.

    Parameters
    ----------
    folders : list of Path
        Folders each containing an `evidence.txt` file.
    select_by : str
        Column passed as `selection_column` to `select_max_by`.
    dump : bool
        If true, write the selected evidence of each folder to a CSV file
        named after the folder's stem.
    parent_folder_fct : callable or None
        Maps a folder to the name of a sub-folder of `outfolder` in which the
        dump is stored. If None, dumps are written directly into `outfolder`.
    outfolder : path-like
        Root folder for the dumps; created if it does not exist.

    Returns
    -------
    collections.Counter
        Counts of the index entries over all folders.
    """
    dump_root = Path(outfolder)
    dump_root.mkdir(exist_ok=True, parents=True)
    counts = Counter()
    for mq_folder in tqdm(folders):
        mq_folder = Path(mq_folder)
        columns_to_read = idx_columns_evidence + use_cols
        selected = pd.read_table(mq_folder / 'evidence.txt',
                                 usecols=columns_to_read)
        selected = select_evidence(selected)
        selected = vaep.pandas.select_max_by(selected,
                                             index_columns=idx_columns_evidence,
                                             selection_column=select_by)
        selected = selected.sort_index()
        counts.update(selected.index)

        if not dump:
            continue

        # Decide where the dump of this folder is stored.
        if parent_folder_fct is None:
            target_dir = dump_root
        else:
            target_dir = dump_root / parent_folder_fct(mq_folder)
            target_dir.mkdir(exist_ok=True)
        fname = target_dir / f"{mq_folder.stem}.csv"
        logging.info(f"Dump to file: {fname}")
        selected.to_csv(fname)

    return counts

# Run the evidence-based counting over all folders with the default settings
# and show the ten most frequent entries.
counts_evidence = count_evidence(folders)
counts_evidence.most_common(10)

## Theoretical Peptides from used fasta-file

> `01_explore_FASTA.ipynb` (formerly `01_FASTA_tryptic_digest.ipynb`)