# MaxQuant (MQ) Output-Files

Files compared:
1. `summary.txt`
2. `mqpar.xml`
3. `peptides.txt`
4. `proteinGroups.txt`

There are many more files; several of them seem to be available multiple times in different formats.

In [None]:
import os
from pathlib import Path

import logging
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

import pandas as pd
import ipywidgets as widgets

import src
import src.file_utils as file_io
from src.file_utils import search_files, search_subfolders, check_for_key, PathsList
from src.file_utils import process_files
from src.file_utils import load_summary, load_mqpar_xml, load_peptide_intensities, load_protein_intensities

##################
##### CONFIG #####
##################
from config import FOLDER_RAW_DATA, FOLDER_PROCESSED
from config import FOLDER_KEY  # defines how filenames are parsed for use as indices

from config import FOLDER_DATA # project folder for storing the data
print(f"Search Raw-Files on path: {FOLDER_RAW_DATA}")

In [None]:
# Sub-folders of the raw-data folder (search depth 1, root excluded).
folders = search_subfolders(
    path=FOLDER_RAW_DATA,
    depth=1,
    exclude_root=True,
)
folders[:10]  # preview of the first ten entries

By default, results will be saved in a subfolder under `vaep/project/data`, named after the specified input folder. Change to your liking:

In [None]:
all_files = search_files(path=FOLDER_DATA, query='')

In [None]:
all_files.folder

In [None]:
all_files.files[:10]

> Go to the block you are interested in!

## MQ Summary files

In [None]:
# Keep only the paths which contain 'summary.txt'.
paths_summaries = PathsList(
    [fname for fname in all_files.files if 'summary.txt' in fname],
    folder=all_files.folder,
)
# Dropdown to browse the selected files.
w_file = widgets.Dropdown(
    options=paths_summaries.files,
    description='View files',
)
w_file

### File Handler

In [None]:
load_summary??

### Summaries

In [None]:
if paths_summaries.files:
    # Apply the summary handler to all summary files found.
    df, names, failed = process_files(
        handler_fct=load_summary,
        filepaths=paths_summaries.files,
        key=FOLDER_KEY,
        relative_to=paths_summaries.folder,
    )
    # Label the columns with the names returned by `process_files`.
    df.columns = names
    print(f"Number of failed reads: {len(failed)}")
    display(df)

In [None]:
if paths_summaries.files:
    # Store the collected summaries both as csv and as pickle.
    df.to_csv(Path(FOLDER_PROCESSED) / 'all_summary_txt.csv')
    df.to_pickle(Path(FOLDER_PROCESSED) / 'all_summary_txt.pkl')

- SIL - MS2 based on precursor which was a set of peaks
- PEAK - MS2 scan based on a single peak on precursor spectrum
- ISO - isotopic pattern detection


In [None]:
if paths_summaries.files:
    # Select the two spectra-count rows, transpose and cast to integers.
    MS_spectra = df.loc[['MS', 'MS/MS Identified']].transpose().astype('int64')
    # Describe only the entries with at least one identified MS/MS spectrum.
    mask = MS_spectra['MS/MS Identified'].gt(0)
    display(MS_spectra.loc[mask].describe())
    # The csv contains all entries, not only the masked ones.
    MS_spectra.to_csv(Path(FOLDER_PROCESSED) / 'overview_stats.csv')

## MaxQuant Parameter File

In [None]:
# Keep only the paths which contain '.xml'.
paths_parameters = PathsList(
    files=[fname for fname in all_files.files if '.xml' in fname],
    folder=all_files.folder,
)
# Dropdown to browse the selected files.
w_file = widgets.Dropdown(
    options=paths_parameters.files,
    description='Select a file',
)
w_file

### Parameter Files

In [None]:
load_mqpar_xml??

In [None]:
# File name template for the collected MaxQuant parameters; the placeholder is
# the file extension. The name must not contain 'peptide_intensities': the
# peptide section counts every file in FOLDER_PROCESSED whose name contains
# 'peptide_intensities' and 'pkl' as a peptide dump.
fname_mqpar_xml = os.path.join(FOLDER_PROCESSED, 'mqpar_xml.{}')

if paths_parameters.files:
    # Apply the mqpar.xml handler to all parameter files found.
    df, col_names, failed = process_files(
        handler_fct=load_mqpar_xml,
        filepaths=paths_parameters.files,
        key=FOLDER_KEY,
        relative_to=paths_parameters.folder,
    )
    df.columns = col_names
    print(f"Number of failed reads: {len(failed)}")
    # Use the full option name: the short pattern 'max_rows' matches more than
    # one option in recent pandas versions and is rejected as ambiguous.
    pd.set_option('display.max_rows', 160)
    display(df)
    df.to_pickle(fname_mqpar_xml.format("pkl"))

In [None]:
del df

## Peptides

In [None]:
paths_peptides = PathsList(files = [file for file in all_files.files if 'peptides.txt' in file], folder=all_files.folder)

In [None]:
import random
# randrange excludes the upper bound, so the result is always a valid index
# into paths_peptides.files (randint would include len(...) itself).
random.randrange(len(paths_peptides.files))

In [None]:
# Preview one randomly chosen peptides file. random.choice always picks an
# existing entry; indexing with randint(0, len(...)) could run past the end.
df = pd.read_table(os.path.join(paths_peptides.folder,
                                random.choice(paths_peptides.files)),
                   index_col='Sequence')
# Full option name: the short pattern 'max_columns' is ambiguous in recent
# pandas versions.
pd.set_option('display.max_columns', 60)
# types = dict(df.dtypes)
df

In [None]:
# Run the peptide handler on one randomly chosen file. random.choice always
# picks an existing entry; randint(0, len(...)) could index past the end.
s_intensities = load_peptide_intensities(os.path.join(paths_peptides.folder,
                                random.choice(paths_peptides.files)))
s_intensities

### File-Handler

In [None]:
load_peptide_intensities??

### Load Peptide Intensities

In [None]:
import pathlib
FOLDER_PROCESSED = Path(FOLDER_PROCESSED)

In [None]:
# Previously dumped files: names containing 'peptide_intensities' and 'pkl'.
peptide_intensities_ = [
    fname
    for fname in os.listdir(FOLDER_PROCESSED)
    if 'peptide_intensities' in fname and 'pkl' in fname
]
# The number of existing dumps serves as running index for the next dump.
i = len(peptide_intensities_)
peptide_intensities_

In [None]:
import pickle

fname_files_loaded = FOLDER_PROCESSED / 'peptides_files_processed.pkl'

# files_previously_loaded = set()
# for fname_dump in peptide_intensities_:
#     loaded = pd.read_pickle(FOLDER_PROCESSED / fname_dump)
#     files_previously_loaded |= set(loaded.index)
#     del loaded

# with open(fname_files_loaded, "wb") as f:
#     pickle.dump(files_previously_loaded, f)

In [None]:
# Load the set of already processed files, if it was dumped before.
# NOTE(review): pickle.load must only be used on files you created yourself.
try:
    with open(fname_files_loaded, "rb") as f:
        files_previously_loaded = pickle.load(f)
except FileNotFoundError:
    logging.info("No files were processed so far.")
    # None signals downstream that there is no record of processed files.
    files_previously_loaded = None
else:
    logging.info(f"Previously processed files: {len(files_previously_loaded)}")

In [None]:
# ToDo: Pull out query-process of previously loaded files. Only load a set of files.
def get_intensities(paths_,
                    fname_MQ_txt,
                    fnames_dumped:set=None):
    """Load the peptide intensities of all files which were not dumped before.

    paths_: namedtuple
        Custom path object with file paths to consider. Its `files` are the
        candidates, its `folder` is passed on as `relative_to`.
    fname_MQ_txt: str
        Name of the MaxQuant output file, e.g. 'peptides.txt'. It is joined to
        each entry of `fnames_dumped` to rebuild the paths already processed.
    fnames_dumped: set
        Set of filenames previously dumped. If None, all files in `paths_`
        are processed.

    Returns the transposed result of `process_files`, after its columns were
    labelled with the names `process_files` returned.
    """
    if fnames_dumped is not None:
        logging.info(f"Previously processed files: {len(fnames_dumped)}")
        set_files_already_processed = {os.path.join(_folder, fname_MQ_txt) for _folder in fnames_dumped}
        # sorted: a plain set difference has an arbitrary order between runs
        paths_to_do = sorted(set(paths_.files) - set_files_already_processed)
    else:
        logging.info('No previous processed files provided.')
        # use the argument, not the notebook-global `paths_peptides`
        paths_to_do = paths_.files

    #ToDo: add more functionality   names
    _peptides, _names, _failed = process_files(handler_fct=load_peptide_intensities,
                                               filepaths=paths_to_do,
                                               key=FOLDER_KEY,
                                               relative_to=paths_.folder)
    if _failed:
        # str(): the failed entries are not guaranteed to be strings
        logging.info(f'Failed: {", ".join(map(str, _failed))}')

    _peptides.columns = _names

    return _peptides.T
    
peptides_new = get_intensities(paths_peptides, fname_MQ_txt= 'peptides.txt', fnames_dumped=files_previously_loaded)

In [None]:
fname_peptides = 'peptide_intensities_{i}.{format}'
peptides_new.to_pickle(FOLDER_PROCESSED / fname_peptides.format(i=i, format='pkl'))

In [None]:
# Start a fresh record on the first run (None). A plain truthiness check would
# also skip an empty set, and the record of processed files would then never
# be written.
if files_previously_loaded is None:
    files_previously_loaded = set()

logging.info(f"Add newly loaded files to set of processed files. No. {len(set(peptides_new.index))}")
files_previously_loaded |= set(peptides_new.index)

with open(fname_files_loaded, "wb") as f:
    pickle.dump(files_previously_loaded, f)
logging.info(f"Dumped set of previously loaded files to {fname_files_loaded}")

In [None]:
peptides_new.to_csv(os.path.join(FOLDER_PROCESSED, f'peptide_intensities_{i}.csv'))

In [None]:
# print(f"Peptide intesities take up {peptides.memory_usage(deep=False).sum() / 1000000:7.2f} MB of memory")

In [None]:
# pd.options.display.float_format = '{:,.0f}'.format
# peptides

In [None]:
# peptides.sort_values(by='AAAAAAAAAPAAAATAPTTAATTAATAAQ')

In [None]:
# peptides_ordered_by_availability = peptides.notna().sum().sort_values(ascending=False)
# peptides_ordered_by_availability

In [None]:
# N_peptides_notna = (peptides_ordered_by_availability >= max(peptides_ordered_by_availability)).sum()
# print(f'A total of {N_peptides_notna} peptides have been identified in {max(peptides_ordered_by_availability)} samples (max identification) out of {len(peptides)}.')

In [None]:
# N_CONSIDER_FIRST = 2000
# peptides = peptides[peptides_ordered_by_availability.index]
# peptides.sort_values(by=list(peptides_ordered_by_availability.index[:N_CONSIDER_FIRST]))

### Identified Peptides by sample (reconstructed)

In [None]:
# #In case no summary.txt is available or for comparison
# ms_ms_identified = peptides.notna().sum(axis=1).to_frame(name='MS/MS Identified')
# ms_ms_identified.describe()

### Peptide sequences
- average length, max, min, etc.
- overlap

In [None]:
# `peptides` is only assigned in commented-out cells; the frame loaded in this
# notebook is `peptides_new`.
peptides_new.columns.to_series()

### Comparison Intensities (e.g. between MaxQuant v1.6.0.1 and v1.6.1.12, if you have it)

In [None]:
# peptides_1601  = peptides.loc['MQ1.6.0.1_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200327']
# peptides_16112 =  peptides.loc['MQ1.6.1.12_20190103_QE8_nLC0_LiNi_QC_MNT_15cm_Hela_01_200330']

In [None]:
# mask_diff = peptides_1601 == peptides_16112
# mask_diff.sum()

In [None]:
# differences = pd.DataFrame([peptides_1601[~mask_diff], peptides_16112[~mask_diff]])
# differences = differences.dropna(axis=1, how='all')
# differences

The amount of _overall_ assigned intensity is not the same.

In [None]:
# differences.sum(axis=1)

## Proteins

In [None]:
# Keep only the paths which contain 'proteinGroups.txt'.
paths_proteins = PathsList(
    [fname for fname in all_files.files if 'proteinGroups.txt' in fname],
    folder=all_files.folder,
)

In [None]:
run_protein = pd.read_table(os.path.join(paths_proteins.folder, paths_proteins.files[2]))
run_protein

### Most proteins are grouped
- How many proteins are grouped together for one intensity value?
- Are proteins uniquely placed into one of the protein groups?

In [None]:
# Bar chart of how often each value of "Number of proteins" occurs.
ax = (
    run_protein["Number of proteins"]
    .value_counts()
    .sort_index()
    .plot.bar(title="Counts of protein-groups' sizes")
)
ax.set_xlabel('Number of proteins in group (group size)')
ax.set_ylabel('Frequency')

In [None]:
# One set of protein IDs per row, split on ';'.
protein_index = run_protein["Protein IDs"].str.split(';').map(set)
# Total number of IDs summed over all rows.
protein_index.map(len).sum()

In [None]:
protein_index.apply(len)

In [None]:
# Union of all per-row ID sets, i.e. the distinct protein IDs.
protein_index_set = set().union(*protein_index)
len(protein_index_set)

Unique entries

### Load Protein Intensities
- by all proteins
- by majority proteins

In [None]:
dtypes_proteins = run_protein.dtypes.to_dict()
dtypes_proteins

In [None]:
load_protein_intensities??

In [None]:
# Apply the protein handler to the first 20 proteinGroups files only.
proteins, col_names, failed = process_files(
    handler_fct=load_protein_intensities,
    filepaths=paths_proteins.files[:20],
    key=FOLDER_KEY,
    relative_to=paths_proteins.folder,
)

In [None]:
proteins.columns = col_names

In [None]:
proteins

In [None]:
proteins.isna().sum()

## Verify: ~500 proteins have no unique peptides
Theoretical analysis has established that roughly 500 of the ~20,000 human proteins have no unique peptides when trypsin is used as the protease.

- ask Marie

## Theoretical Peptides from used fasta-file