# MaxQuant (MQ) Output-Files

Files compared:
1. `Summary.txt`
2. `mqpar.xml`
3. `peptides.txt`
4. `proteinGroups.txt`

There are many more files, and several of them seem to be available multiple times in different formats.

In [None]:
import os
from pathlib import Path

import pandas as pd
import ipywidgets as widgets

import src
import src.file_utils as file_io
from src.file_utils import search_files, search_subfolders, check_for_key
from src.file_utils import process_files
from src.file_utils import load_summary, load_mqpar_xml, load_peptide_intensities, load_protein_intensities

##################
##### CONFIG #####
##################

# Location of the helper module imported above as `src.file_utils`
# (NOTE(review): not referenced in the visible cells of this notebook).
FILEPATH_UTILS = 'src/file_utils.py'


from config import FOLDER_RAW_DATA  # folder that is searched for raw-file sub-folders
from config import FOLDER_KEY  # defines how filenames are parsed for use as indices

from config import FOLDER_DATA # project folder for storing the data
print(f"Search Raw-Files on path: {FOLDER_RAW_DATA}")

In [None]:
# Search FOLDER_RAW_DATA for sub-folders (depth=1) and offer them in a dropdown.
folders = search_subfolders(
    path=FOLDER_RAW_DATA,
    depth=1,
)
w_folder = widgets.Dropdown(
    description='Select a folder',
    options=folders,
)
# Last expression of the cell: renders the widget.
w_folder

Results will be saved in a subfolder under `vaep/project/data` using the name of the specified input-folder per default. Change to your liking:

In [None]:
# The output folder is named after the folder selected in the dropdown
# (its `.name` attribute) and is placed below FOLDER_DATA.
FOLDER_PROCESSED = os.path.join(FOLDER_DATA, w_folder.value.name)
print(f"Save all output of this notebook to : {FOLDER_PROCESSED}")
# Create the folder; an already existing folder is not an error.
os.makedirs(FOLDER_PROCESSED, exist_ok=True)

> Go to the block you are interested in!

## MQ Summary files

In [None]:
# Search the selected folder for 'summary.txt' files and list the hits.
paths_summaries = search_files(
    path=w_folder.value,
    query='summary.txt',
)
w_file = widgets.Dropdown(
    description='View files',
    options=paths_summaries.files,
)
w_file

### File Handler

In [None]:
load_summary??

### Summaries

In [None]:
# Only run when the search returned at least one summary file.
if paths_summaries.files:
    df, names, failed = process_files(
        handler_fct=load_summary,
        filepaths=paths_summaries.files,
        key=FOLDER_KEY,
        relative_to=paths_summaries.folder,
    )
    # Label the columns with the names returned by `process_files`.
    df.columns = names
    print(f"Number of failed reads: {len(failed)}")
    display(df)

In [None]:
if paths_summaries.files:
    # Store the combined summaries twice: as CSV and as pickle.
    _fname_summary = os.path.join(FOLDER_PROCESSED, 'all_summary_txt')
    df.to_csv(f'{_fname_summary}.csv')
    df.to_pickle(f'{_fname_summary}.pkl')

- SIL - MS2 based on precursor which was a set of peaks
- PEAK - MS2 scan based on a single peak on precursor spectrum
- ISO - isotopic pattern detection


In [None]:
if paths_summaries.files:
    # Select the two count rows and transpose, so that they become columns.
    _rows = ['MS', 'MS/MS Identified']
    MS_spectra = df.loc[_rows].T.astype('int64')
    # Describe only entries with at least one identified MS/MS spectrum ...
    mask = MS_spectra['MS/MS Identified'] > 0
    display(MS_spectra.loc[mask].describe())
    # ... but store the unfiltered overview.
    MS_spectra.to_csv(os.path.join(FOLDER_PROCESSED, 'overview_stats.csv'))

## MaxQuant Parameter File

In [None]:
# Search the selected folder for files matching '.xml' and list the hits.
paths_parameters = search_files(
    path=w_folder.value,
    query='.xml',
)
w_file = widgets.Dropdown(
    description='Select a file',
    options=paths_parameters.files,
)
w_file

### Parameter Files

In [None]:
load_mqpar_xml??

In [None]:
# Only run when the search returned at least one parameter file.
if paths_parameters.files:
    df, col_names, failed = process_files(
        handler_fct=load_mqpar_xml,
        filepaths=paths_parameters.files,
        key=FOLDER_KEY,
        relative_to=paths_parameters.folder,
    )
    df.columns = col_names
    print(f"Number of failed reads: {len(failed)}")
    # Use the fully qualified option name: the short pattern 'max_rows' also
    # matches 'styler.render.max_rows' in newer pandas versions, which makes
    # `set_option` raise an OptionError (pattern matched multiple keys).
    pd.set_option('display.max_rows', 160)
    display(df)

## Peptides

In [None]:
%%time 
paths_peptides = search_files(path=w_folder.value, query='peptides.txt')

In [None]:
import random
# `randrange(n)` draws from 0 .. n-1. `randint(0, n)` would include `n`
# itself, which is not a valid index into `paths_peptides.files`.
random.randrange(len(paths_peptides.files))

In [None]:
# Load one randomly chosen peptides.txt, indexed by the peptide sequence.
# `random.choice` always returns an existing entry; drawing an index with
# `random.randint(0, len(files))` can yield `len(files)` and raise IndexError.
df = pd.read_table(
    os.path.join(paths_peptides.folder, random.choice(paths_peptides.files)),
    index_col='Sequence',
)
# Fully qualified option name: the short pattern 'max_columns' also matches
# 'styler.render.max_columns' in newer pandas versions (OptionError).
pd.set_option('display.max_columns', 60)
# types = dict(df.dtypes)
df

### File-Handler

In [None]:
load_peptide_intensities??

### Load Peptide Intensities

In [None]:
import logging

# Let the root logger emit messages from level INFO upwards.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Filename template for the stored peptide intensities; the '{}' placeholder
# is filled in via `str.format`.
fname_peptides = os.path.join(FOLDER_PROCESSED, 'peptide_intensities.{}')

def get_intensities(paths_peptides,
                    fname_peptides=os.path.join(FOLDER_PROCESSED, 'peptide_intensities.{}')
                    ):
    """Load peptide intensities, reusing previously stored results if available.

    Parameters
    ----------
    paths_peptides
        Search result providing the attributes ``files`` (files to process)
        and ``folder`` (passed to ``process_files`` as ``relative_to``).
    fname_peptides : str
        Filename template of the stored results; ``'{}'`` is replaced by
        ``'pkl'``. The default is evaluated once, when the function is defined.

    Returns
    -------
    pandas.DataFrame
        The stored intensities if nothing is left to process, otherwise the
        newly processed intensities (transposed, so that the names returned by
        ``process_files`` form the index), outer-joined with the stored ones
        when a stored file exists.

    Raises
    ------
    Exception
        Any error other than a missing file while reading the stored pickle is
        propagated instead of silently triggering a full re-processing.
    """
    fname_pkl = fname_peptides.format('pkl')
    # Keep the `try` minimal and catch only the expected "no cache yet" case;
    # a bare `except` would also hide typos, corrupt files or Ctrl-C.
    try:
        peptides = pd.read_pickle(fname_pkl)
    except FileNotFoundError:
        logging.info(f'No file such file: {fname_pkl}')
        peptides = None
        paths_peptides_to_do = paths_peptides.files
    else:
        logging.info(f"Previously processed files: {len(peptides)}")
        # NOTE(review): assumes the stored index holds the folder of each
        # processed file and that `paths_peptides.files` contains plain
        # strings of the form '<folder>/peptides.txt' -- confirm.
        set_files_already_processed = {os.path.join(_folder, 'peptides.txt') for _folder in peptides.index}
        paths_peptides_to_do = list(set(paths_peptides.files) - set_files_already_processed)
        if not paths_peptides_to_do:
            # Nothing new to load: do not call the file processing with an
            # empty list of files.
            logging.info('All files have been processed previously.')
            return peptides
    #ToDo: add more functionality   names
    _peptides, _names, _failed = process_files(handler_fct=load_peptide_intensities,
                                               filepaths=paths_peptides_to_do,
                                               key=FOLDER_KEY,
                                               relative_to=paths_peptides.folder)
    if _failed:
        logging.info(f'Failed: {", ".join(_failed)}')
    _peptides.columns = _names

    if peptides is not None:
        # Stored data has one row per entry; join on the transposed frame so
        # that the new entries are added as rows of the result.
        return peptides.T.join(_peptides, how='outer').T
    return _peptides.T
    
peptides = get_intensities(paths_peptides)

In [None]:
# Shallow memory footprint (deep=False) of the intensity table, summed over
# index and columns and reported in units of 1,000,000 bytes.
_n_bytes = peptides.memory_usage(deep=False).sum()
print(f"Peptide intesities take up {_n_bytes / 1000000:7.2f} MB of memory")

In [None]:
# Display floats without decimals and with a thousands separator.
pd.set_option('display.float_format', '{:,.0f}'.format)
peptides

In [None]:
# Store the intensities twice: as CSV and as pickle.
_fname_intensities = os.path.join(FOLDER_PROCESSED, 'peptide_intensities')
peptides.to_csv(f'{_fname_intensities}.csv')
peptides.to_pickle(f'{_fname_intensities}.pkl')

In [None]:
peptides.sort_values(by='AAAAAAAAAPAAAATAPTTAATTAATAAQ')

In [None]:
# Count the non-missing values per column and order the columns by that
# count, highest first.
peptides_ordered_by_availability = (
    peptides
    .notna()
    .sum()
    .sort_values(ascending=False)
)
peptides_ordered_by_availability

In [None]:
# Compute the maximum once with the vectorised Series method; the Python
# builtin `max` iterates the Series element by element (and was evaluated
# twice) and raises ValueError on an empty Series.
_max_identified = peptides_ordered_by_availability.max()
N_peptides_notna = (peptides_ordered_by_availability >= _max_identified).sum()
print(f'A total of {N_peptides_notna} peptides have been identified in {_max_identified} samples (max identification).')

In [None]:
# Number of leading columns that are used as sort keys below.
N_CONSIDER_FIRST = 2000
# Reorder the columns by availability (most frequently available first) ...
peptides = peptides[peptides_ordered_by_availability.index]
# ... and sort the rows by the first N_CONSIDER_FIRST of these columns.
_sort_keys = list(peptides_ordered_by_availability.index[:N_CONSIDER_FIRST])
peptides.sort_values(by=_sort_keys)

### Identified Peptides by sample (reconstructed)

In [None]:
#In case no summary.txt is available or for comparison
ms_ms_identified = peptides.notna().sum(axis=1).to_frame(name='MS/MS Identified')
ms_ms_identified.describe()

### Peptide sequences
- average length, max, min, etc.
- overlap

In [None]:
peptides.columns.to_series()

### Comparison Intensities (e.g. between MaxQuant v1.6.0.1 and v1.6.1.12, if you have it)

In [None]:
# peptides_1601  = peptides.loc['MQ1.6.0.1_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200327']
# peptides_16112 =  peptides.loc['MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200330']

In [None]:
# mask_diff = peptides_1601 == peptides_16112
# mask_diff.sum()

In [None]:
# differences = pd.DataFrame([peptides_1601[~mask_diff], peptides_16112[~mask_diff]])
# differences = differences.dropna(axis=1, how='all')
# differences

Amount of _overall_ assigned intensity is not the same.

In [None]:
# differences.sum(axis=1)

## Proteins

In [None]:
paths_proteins = search_files(path=w_folder.value, query='proteinGroups.txt')

In [None]:
# Inspect a single run. The third file is used when available; with fewer
# than three files the last one is taken instead of failing on a fixed index.
_idx_run = min(2, len(paths_proteins.files) - 1)
run_protein = pd.read_table(os.path.join(paths_proteins.folder, paths_proteins.files[_idx_run]))
run_protein

### Most proteins are grouped
- How many proteins are grouped together for one intensity value?
- Are proteins uniquely placed into one of the protein groups?

In [None]:
# Frequency of each value in "Number of proteins", ordered by that value.
_group_sizes = run_protein["Number of proteins"].value_counts().sort_index()
ax = _group_sizes.plot(kind='bar', title='Counts of protein-groups\' sizes')
ax.set_xlabel('Number of proteins in group (group size)')
ax.set_ylabel('Frequency')

In [None]:
# Split the ';'-separated "Protein IDs" of each row and keep them as a set.
_protein_ids = run_protein["Protein IDs"].str.split(';')
protein_index = _protein_ids.apply(set)
# Total number of IDs summed over all rows.
protein_index.apply(len).sum()

In [None]:
protein_index.apply(len)

In [None]:
# Union of the protein IDs of all rows, built with a single call instead of
# re-creating the accumulated set once per row (quadratic in the worst case).
protein_index_set = set().union(*protein_index)
len(protein_index_set)

Unique entries

### Load Protein Intensities
- by all proteins
- by majority proteins

In [None]:
dtypes_proteins = run_protein.dtypes.to_dict()
dtypes_proteins

In [None]:
load_protein_intensities??

In [None]:
proteins, col_names, failed = process_files(handler_fct=load_protein_intensities, filepaths=paths_proteins.files[:20], key=FOLDER_KEY, relative_to=paths_proteins.folder) 

In [None]:
proteins.columns = col_names

In [None]:
proteins

In [None]:
proteins.isna().sum()

## Verify: ~500 proteins have no unique peptides
Theoretical analysis has established that roughly 500 of the ~20,000 human proteins have no unique peptides when trypsin is used as the protease.

- ask Marie

## Theoretical Peptides from used fasta-file