# MaxQuant (MQ) Output-Files

Files compared:
1. `Summary.txt`
2. `mqpar.xml`
3. `peptides.txt`
4. `proteins.txt`

There are many more files; several of them seem to be available multiple times in different formats.

In [None]:
# # easiest way to add custom packages on erda.dk
# import sys
# sys.path.append('/home/jovyan/work/vaep/')

In [None]:
import os
from pathlib import Path
import random

import pandas as pd
import ipywidgets as widgets

from vaep.io import search_files, search_subfolders, PathsList, dump_json
from vaep.io.mq import MaxQuantOutputDynamic
from vaep.io.mq import ExtractFromPeptidesTxt, MaxQuantOutputDynamic
import vaep.io.mq as mq

import src
import src.file_utils as file_io
from src.file_utils import check_for_key
from src.file_utils import process_files
from src.file_utils import load_summary, load_mqpar_xml

##################
##### CONFIG #####
##################
from config import FOLDER_MQ_TXT_DATA, FOLDER_PROCESSED
from config import FOLDER_KEY  # defines how filenames are parsed for use as indices

from config import FOLDER_DATA # project folder for storing the data
print(f"Search Raw-Files on path: {FOLDER_MQ_TXT_DATA}")

In [None]:
import logging
from datetime import datetime

# Drop every handler attached to the root logger (removes the Jupyter
# notebook root logger handler).
logging.getLogger().handlers = []

# Everything in this notebook is logged through the 'vaep' logger.
logger = logging.getLogger('vaep')
logger.setLevel(logging.INFO)
logger.handlers = []  # start from a clean slate when the cell is re-executed

# The log file name carries the start time (yymmdd_HHMM) of this run.
date_log_file = datetime.now().strftime("%y%m%d_%H%M")
c_format = logging.Formatter('%(name)s - %(levelname)-8s %(message)s ')

# One handler for the console, one for the log file; both share level and format.
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler(f"log_01_maxquant_file_processing_{date_log_file}.txt")
for handler in (c_handler, f_handler):
    handler.setLevel(logging.INFO)
    handler.setFormatter(c_format)
    logger.addHandler(handler)

In [None]:
folders_mq_txt_data = Path(FOLDER_MQ_TXT_DATA)

In [None]:
# Every entry directly below the MaxQuant txt-data directory can be selected.
folders = list(Path(FOLDER_MQ_TXT_DATA).iterdir())
# The dropdown gets its own copy of the list.
w_file = widgets.Dropdown(description='View files', options=list(folders))
w_file

In [None]:
mq_output = MaxQuantOutputDynamic(w_file.value)
mq_output

In [None]:
print(f"Results will be saved in subfolders in\n\t{str(FOLDER_PROCESSED.absolute())}"
      "\nusing the name of the specified input-folder per default. Change to your liking.")
# FOLDER_PROCESSED = Path('')

> Go to the block you are interested in!

In [None]:
mq_output.folder.stem

### Summaries Data

In [None]:
# Summaries collected in earlier runs are cached as JSON in the processed folder.
fp_summaries = Path(FOLDER_PROCESSED) / 'all_summaries.json'

d_summaries = {}  # stays empty when no cache file exists yet
if fp_summaries.exists():
    df_summaries = pd.read_json(fp_summaries, orient='index')
    display(df_summaries.head())
    d_summaries = df_summaries.to_dict(orient='index')

print(f"Use {len(d_summaries)} previously loaded files.")

In [None]:
# Collect the first row of each folder's summary table in `d_summaries`,
# keyed by the folder name. Folders without a summary are either removed
# (when they are empty) or reported as errors.
empty_folders = []
i = len(d_summaries)
N_MAX = 5000  # stop once the running counter `i` exceeds this value

# Iterate over a snapshot: empty folders are removed from `folders` below, and
# removing items from a list while looping over it skips the following item.
for folder in list(folders):
    folder_name = folder.stem
    if folder_name in d_summaries:
        continue  # summary already available from a previous run

    try:
        mq_output = MaxQuantOutputDynamic(folder)
        d_summaries[folder_name] = mq_output.summary.iloc[0].to_dict()
    except FileNotFoundError:
        # Inspect `folder` itself rather than `mq_output`: if the constructor
        # is what raised, `mq_output` still refers to an earlier folder and
        # the wrong directory would be checked and deleted.
        if not any(folder.iterdir()):
            folder.rmdir()  # only succeeds on an empty directory
            logger.warning(f'Remove empty folder: {folder}')
            folders.remove(folder)
            empty_folders.append(f"{folder_name}\n")
        else:
            logger.error(f"{folder}, No summary and not empty.")
        continue

    i += 1
    if i % 50 == 0:
        logger.info(f'Processed: {len(d_summaries):5}')
    if i > N_MAX:
        break

# Keep a persistent record of the folders that were deleted.
if empty_folders:
    print(empty_folders)
    with open('log_empty_folder.txt', 'a') as f:
        f.writelines(empty_folders)
print(f"In total processed: {len(d_summaries):5}")

In [None]:
# One row per folder; let pandas display every column of the summary table.
df = pd.DataFrame.from_dict(d_summaries, orient='index')
pd.set_option('display.max_columns', df.shape[1])

In [None]:
# Let pandas pick a dtype for every column of the summary table.
df = df.convert_dtypes()
# Columns that ended up with the 'string' dtype are stored as categoricals.
l_string_columns = df.columns[df.dtypes == 'string']
df[l_string_columns] = df[l_string_columns].astype('category')
# Print the resulting column dtypes and memory usage.
df.info()

In [None]:
# Store the summary table as JSON and, next to it, as a pickle with the same stem.
df.to_json(fp_summaries, orient='index')
df.to_pickle(fp_summaries.with_suffix('.pkl'))

- SIL - MS2 based on precursor which was a set of peaks
- PEAK - MS2 scan based on a single peak on precursor spectrum
- ISO - isotopic pattern detection


In [None]:
class col_summary:
    # Names of the summary columns used in this cell.
    MS = 'MS'
    MS2 = 'MS/MS Identified'


# The interactive view is only built when the summary table `df` is available.
if 'df' in globals():
    MS_spectra = df[[col_summary.MS, col_summary.MS2]]

    def compute_summary(threshold_ms2_identified):
        # Describe only the rows whose MS2 value lies above the chosen threshold.
        above_threshold = MS_spectra[col_summary.MS2] > threshold_ms2_identified
        selected = MS_spectra.loc[above_threshold]
        display(selected.describe())

    # The slider runs from zero up to the largest MS2 value in the table.
    w_ions_range = widgets.IntSlider(
        value=0.0,
        min=.0,
        max=df[col_summary.MS2].max(),
    )
    interactive_summary = widgets.interactive(
        compute_summary, threshold_ms2_identified=w_ions_range)
    display(interactive_summary)

## MaxQuant Parameter File

In [None]:
# paths_parameters = PathsList(files=[file for file in all_files.files if '.xml' in file], folder=all_files.folder)
# w_file = widgets.Dropdown(options=paths_parameters.files, description='Select a file')
# w_file

### Parameter Files

In [None]:
# load_mqpar_xml??

In [None]:
# fname_mqpar_xml = os.path.join(FOLDER_PROCESSED, 'peptide_intensities.{}')

# if paths_parameters.files:
#     df, col_names, failed = process_files(handler_fct=load_mqpar_xml, filepaths=paths_parameters.files, key=FOLDER_KEY, relative_to=paths_parameters.folder) 
#     df.columns = col_names
#     print(f"Number of failed reads: {len(failed)}")
#     pd.set_option('max_rows', 160)
#     display(df)
#     df.to_pickle(fname_mqpar_xml.format("pkl"))

In [None]:
# del df

## Peptides

In [None]:
# Use the fully qualified option key. The short pattern 'max_columns' is
# ambiguous in pandas versions that also provide 'styler.render.max_columns',
# where it raises an OptionError.
pd.set_option('display.max_columns', 60)

mq_output.peptides

In [None]:
mq_output.peptides.Intensity # as is in peptides.txt, comma separated thousands

### Create peptide intensity dumps for each MQ outputfolder

All folders are stored in a list

In [None]:
folders[:10]

Check if the output folder contains already parsed files

In [None]:
import json

import config

# The FASTA database is read as JSON from the path given in the config.
with open(config.FN_FASTA_DB) as fasta_file:
    data_fasta = json.load(fasta_file)

print(f'Number of proteins in fasta file DB: {len(data_fasta)}')

Some files to investigate in more detail

```
20130408_QE6_LC5_KBS_MNT_QC_HeLa_02  # reversed protein leading razor protein
``` 

In [None]:
%%time
# Run the peptide extraction for every MaxQuant output folder and write the
# results below FOLDER_PROCESSED.
FOLDER_PROCESSED = Path(FOLDER_PROCESSED)
set_previously_loaded = {folder.name for folder in FOLDER_PROCESSED.iterdir()}

FORCE = True  # True: process every folder again, even if it was handled before

for folder in folders:
    # NOTE(review): the marker file is looked up inside the *input* folder,
    # not below FOLDER_PROCESSED -- confirm that this is the intended location.
    if (not FORCE
            and folder.name in set_previously_loaded
            and (folder / '0_completness_all_genes.json').exists()):
        continue  # already processed, nothing to do

    # f-string prefix added: the folder name was previously logged literally
    # as '{folder.name}'.
    logger.info(f'\n\nProcess: {folder.name}')
    print(f"Process: {folder.name}")
    mq_output = MaxQuantOutputDynamic(folder)
    peptide_extractor = ExtractFromPeptidesTxt(
        out_folder=FOLDER_PROCESSED, mq_output_object=mq_output, fasta_db=data_fasta)
    completeness_per_gene = peptide_extractor()

## Theoretical Peptides from used fasta-file

> `01_explore_FASTA.ipynb` (formerly `01_FASTA_tryptic_digest.ipynb`)