# MaxQuant (MQ) Output-Files

Files compared:
1. `Summary.txt`
2. `mqpar.xml`
3. `peptides.txt`
4. `proteins.txt`

There are many more files, and several of them seem to be available multiple times in different formats.

In [None]:
import os
import sys
import logging
from pathlib import Path
import random

import pandas as pd
import ipywidgets as widgets

# sys.path.append('/home/jovyan/work/vaep/')
from vaep.io import PathsList
from vaep.io.mq import MaxQuantOutputDynamic
from vaep.io.mq import ExtractFromPeptidesTxt
import vaep.io.mq as mq

import src
from src.file_utils import process_files
from src.file_utils import load_summary, load_mqpar_xml
from src.logging import setup_logger_w_file

##################
##### CONFIG #####
##################
from src.config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED
from src.config import FOLDER_KEY  # defines how filenames are parsed for use as indices

from src.config import FOLDER_DATA # project folder for storing the data
# Report the configured MaxQuant txt-data folder (FOLDER_MQ_TXT_DATA from src.config).
print(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}")

##################
### Logging ######
##################

# Remove every handler that is attached to the root logger (e.g. by Jupyter).
root_logger = logging.getLogger()
root_logger.handlers = []

# Hand the 'vaep' logger to the project helper; the returned logger is the one
# used from here on.
logger = setup_logger_w_file(
    logging.getLogger('vaep'),
    fname_base='log_00_maxquant_file_reader',
)

# Log the handlers now attached to the logger, one per line.
logger.info(
    'Start with handlers: \n'
    + "\n".join(f"- {log_!r}" for log_ in logger.handlers)
)

In [None]:
# Wrap the configured MaxQuant txt-data location in a Path object.
# NOTE(review): this name is not referenced again in the cells visible here.
folders_mq_txt_data = Path(FOLDER_MQ_TXT_DATA)

In [None]:
# Every entry found directly inside the MaxQuant txt-data folder.
folders = list(Path(FOLDER_MQ_TXT_DATA).iterdir())
# Dropdown to pick one of the entries; it receives its own copy of the list.
w_file = widgets.Dropdown(options=list(folders), description='View files')
w_file

In [None]:
# Build the MaxQuant output accessor for the entry currently selected in the dropdown.
mq_output = MaxQuantOutputDynamic(w_file.value)
mq_output  # last expression of the cell, so the notebook renders it

Results will be saved in a subfolder under `vaep/project/data` using the name of the specified input-folder per default. Change to your liking:

> Go to the block you are interested in!

## MQ Summary files

In [None]:
# First row of the summary table as a plain dict, for a quick look at the available fields.
mq_output.summary.iloc[0].to_dict()

### File Handler

In [None]:
# load_summary??

### Summaries

In [None]:
# Aggregate all summary files into one table; nothing happens when none were found.
# NOTE(review): `paths_summaries` is not defined in the cells visible here.
if paths_summaries.files:
    df, names, failed = process_files(
        handler_fct=load_summary,
        filepaths=paths_summaries.files,
        key=FOLDER_KEY,
        relative_to=paths_summaries.folder,
    )
    # Label the columns with the names returned alongside the table.
    df.columns = names
    print(f"Number of failed reads: {len(failed)}")
    display(df)

In [None]:
# Persist the table held in `df` as CSV and as pickle in the processed-data folder
# (skipped when no summary files were found).
if paths_summaries.files:
    df.to_csv(os.path.join(FOLDER_PROCESSED, 'all_summary_txt.csv'))
    df.to_pickle(os.path.join(FOLDER_PROCESSED, 'all_summary_txt.pkl'))

- SIL - MS2 based on precursor which was a set of peaks
- PEAK - MS2 scan based on a single peak on precursor spectrum
- ISO - isotopic pattern detection


In [None]:
# Overview of the 'MS' and 'MS/MS Identified' counts (skipped when no summary files were found).
if paths_summaries.files:
    # Select the two count rows, transpose so they become columns, and cast to integers.
    MS_spectra = df.loc[['MS', 'MS/MS Identified']].T.astype('int64')
    # Only entries with at least one identified MS/MS spectrum enter the description.
    mask  = MS_spectra['MS/MS Identified'] > 0
    display(MS_spectra.loc[mask].describe())
    # The CSV holds all entries, including those excluded by the mask above.
    MS_spectra.to_csv(os.path.join(FOLDER_PROCESSED, 'overview_stats.csv'))

## MaxQuant Parameter File

In [None]:
# Keep every listed file whose path contains '.xml'.
# NOTE(review): `all_files` is not defined in the cells visible here.
paths_parameters = PathsList(
    files=[fname for fname in all_files.files if '.xml' in fname],
    folder=all_files.folder,
)
# Dropdown to choose one of the parameter files.
w_file = widgets.Dropdown(
    options=paths_parameters.files,
    description='Select a file',
)
w_file

### Parameter Files

In [None]:
# IPython introspection (notebook-only syntax): `??` displays the source of `load_mqpar_xml`.
load_mqpar_xml??

In [None]:
# Template for the dump of the aggregated parameter table; the file suffix is
# filled in via str.format below.
# NOTE(review): the stem 'peptide_intensities' looks copied from the peptide
# section -- confirm that the mqpar.xml dump is really meant to use this name.
fname_mqpar_xml = os.path.join(FOLDER_PROCESSED, 'peptide_intensities.{}')

# Aggregate all parameter files into one table; nothing happens when none were found.
if paths_parameters.files:
    df, col_names, failed = process_files(
        handler_fct=load_mqpar_xml,
        filepaths=paths_parameters.files,
        key=FOLDER_KEY,
        relative_to=paths_parameters.folder,
    )
    # Label the columns with the names returned alongside the table.
    df.columns = col_names
    print(f"Number of failed reads: {len(failed)}")
    # Use the fully qualified option key: the abbreviated 'max_rows' is ambiguous
    # (and raises OptionError) on pandas versions that also register
    # 'styler.render.max_rows'.
    pd.set_option('display.max_rows', 160)
    display(df)
    df.to_pickle(fname_mqpar_xml.format("pkl"))

In [None]:
# Remove the name `df` so the table it refers to can be garbage-collected.
del df

## Peptides

In [None]:
# Keep every listed file whose path contains 'peptides.txt'.
paths_peptides = PathsList(
    files=[fname for fname in all_files.files if 'peptides.txt' in fname],
    folder=all_files.folder,
)
# paths_peptides

In [None]:
# Use the fully qualified option key: the abbreviated 'max_columns' is ambiguous
# (and raises OptionError) on pandas versions that also register
# 'styler.render.max_columns'.
pd.set_option('display.max_columns', 60)

# Show the peptides table of one randomly chosen folder. The pick is drawn from
# `folders` itself; the previous index was bounded by the length of a different
# list (paths_peptides.files) and could run past the end of `folders` or never
# reach some of its entries.
mq_output = MaxQuantOutputDynamic(folder=random.choice(folders))
mq_output.peptides

In [None]:
mq_output.peptides.Intensity # as is in peptides.txt, comma separated thousands

### Create peptide intensity dumps for each MQ output folder

All folders are stored in a list

In [None]:
# folders[:10]

Check if the output folder contains already parsed files

In [None]:
import json

# NOTE(review): the other configuration values in this notebook come from
# `src.config`; confirm that a top-level `config` module is importable here.
import config

# Read the JSON document at config.FN_FASTA_DB and parse it in one go.
data_fasta = json.loads(Path(config.FN_FASTA_DB).read_text())
print(f'Number of proteins in fasta file DB: {len(data_fasta)}')

In [None]:
# 20170509_QE4_LC12_IAH_QC_MNT_HeLa_01 # trigger re-execution
# 20130408_QE6_LC5_KBS_MNT_QC_HeLa_02  # reversed protein leading razor protein

In [None]:
%%time
# Ensure the output location is a Path so it can be iterated below.
FOLDER_PROCESSED = Path(FOLDER_PROCESSED)
# Names of all entries that already exist in the output location.
set_previously_loaded =  {folder.name for folder in FOLDER_PROCESSED.iterdir()}

# When True, every folder is processed again even if it was handled before.
FORCE = True

for folder in folders:
    # A folder is skipped only if its name already exists in the output location,
    # re-processing is not forced and the marker JSON file is present.
    # NOTE(review): the marker file is looked up inside the *input* folder --
    # confirm that this is where it is written.
    if folder.name in set_previously_loaded and not FORCE and (folder / '0_completness_all_genes.json').exists():
        pass
    else:
        # f-string so that the folder name (not the literal placeholder) is logged.
        logger.info(f'\n\nProcess: {folder.name}')
        print(f"Process: {folder.name}")
        mq_output = MaxQuantOutputDynamic(folder)
        peptide_extractor = ExtractFromPeptidesTxt(
            out_folder=FOLDER_PROCESSED, mq_output_object=mq_output, fasta_db=data_fasta)
        # Calling the extractor performs the extraction; its return value is kept
        # for inspection in later cells.
        completeness_per_gene = peptide_extractor()

In [None]:
# %debug

## Theoretical Peptides from used fasta-file

> `01_explore_FASTA.ipynb` (formerly `01_FASTA_tryptic_digest.ipynb`)