In [3]:
import pandas as pd
import numpy as np
import sqlite3
import json
import matplotlib.pyplot as plt
%matplotlib inline
from os.path import expanduser

In [21]:
# Notebook-level configuration.
# run_name is matched against the 'Raw file' column when the MaxQuant results are filtered.
run_name = 'P3856_YHE211_1_Slot1-1_1_5104'
# Retention-time window of interest (exclusive bounds). The MaxQuant 'Retention time'
# column is multiplied by 60.0 before being compared with these values.
# NOTE(review): presumably seconds, with the column in minutes — confirm.
RT_LOWER = 1650
RT_UPPER = 2200

#### prepare the MaxQuant results

In [53]:
# Load the MaxQuant evidence table from the user's home directory.
# The file is tab-delimited, so pass a literal tab character as the separator.
# A regex separator (the previous two-character backslash-t pattern) forces pandas
# onto the slower pure-Python parser; a single-character separator lets the
# default C parser be used instead.
mq_results_df = pd.read_csv('{}/evidence-P3856.txt'.format(expanduser('~')), sep='\t')

In [54]:
# Build one boolean mask per selection criterion, then keep only the rows
# that satisfy every one of them.

# decoys are indicated by a '+' in the Reverse column, so keep rows where it is empty
not_decoy = mq_results_df['Reverse'].isna()
# identifications must carry an intensity
has_intensity = mq_results_df['Intensity'] > 0
# potential contaminants are flagged in their own column; keep the unflagged rows
not_contaminant = mq_results_df['Potential contaminant'].isna()
# restrict to the run being analysed
from_this_run = mq_results_df['Raw file'] == run_name
# MS/MS identifications only
is_msms = mq_results_df['Type'] == 'TIMS-MULTI-MSMS'
# discard identifications with a large mass error
small_mass_error = mq_results_df['Mass error [ppm]'].abs() <= 20.0
# retention time is scaled by 60.0 before comparison with the (exclusive) RT window
scaled_rt = mq_results_df['Retention time'] * 60.0
inside_rt_window = (scaled_rt > RT_LOWER) & (scaled_rt < RT_UPPER)

mq_results_df = mq_results_df[
    not_decoy
    & has_intensity
    & not_contaminant
    & from_this_run
    & is_msms
    & small_mass_error
    & inside_rt_window
]

In [55]:
# Definition of uniqueness in MaxQuant output with MBR on: these columns are used
# together as the grouping key when collecting unique peptide identifications.
unique_peptide_key = ['Sequence','Modifications','Charge']

In [56]:
# Collect one record per unique peptide identification in the filtered MaxQuant results.
# Each record stores the group's actual key values (a tuple ordered like
# unique_peptide_key), not the list of column names used for grouping.
mq_sequences_l = [
    {'unique_peptide_key': group_name}
    for group_name, _ in mq_results_df.groupby(unique_peptide_key)
]
# naming the column explicitly keeps the frame's schema stable even when no groups exist
unique_identifications_mq_df = pd.DataFrame(mq_sequences_l, columns=['unique_peptide_key'])

In [57]:
# number of rows in the unique-identification table (one row per group of unique_peptide_key)
len(unique_identifications_mq_df)

7293

#### prepare the MSFragger results

In [22]:
# Load the Fragger analysis that was run without MBR, because with MBR on there
# doesn't seem to be a way of distinguishing between identification and matching.
# NOTE(review): EXPERIMENT_DIR is not defined in the cells visible here — confirm it
# is set before this cell runs on a fresh kernel.
fragger_results_path = '{}/MSstats-without-MBR.csv'.format(EXPERIMENT_DIR)
fragger_results_df = pd.read_csv(fragger_results_path)  # comma is read_csv's default separator


In [23]:
# discard rows that have no intensity value
fragger_results_df = fragger_results_df.dropna(subset=['Intensity'])

In [24]:
# peek at five randomly chosen rows; no random_state is given, so the rows shown vary between runs
fragger_results_df.sample(n=5)

Unnamed: 0,ProteinName,PeptideSequence,PrecursorCharge,FragmentIon,ProductCharge,IsotopeLabelType,Condition,BioReplicate,Run,Intensity
212564,sp|P26368|U2AF2_HUMAN,LFIGGLPNYLNDDQVK,2,,,L,YHE114,4,P3856_YHE114_4_Slot1-1_1_5118,83068.33
500722,sp|Q6P2Q9|PRP8_HUMAN,GPGNPVPGPLAPLPDYMSEEK,2,,,L,YHE211,2,P3856_YHE211_2_Slot1-1_1_5105,195592.89
188000,sp|P21281|VATB2_HUMAN,GPVVLAEDFLDIMGQPINPQC[57.0215]R,3,,,L,YHE211,10,P3856_YHE211_10_Slot1-1_1_5113,28382.742
642703,sp|Q9P258|RCC2_HUMAN,NHTLALTETGSVFAFGENK,2,,,L,YHE114,3,P3856_YHE114_3_Slot1-1_1_5117,68730.63
373177,sp|P63010|AP2B1_HUMAN,LAPPLVTLLSGEPEVQYVALR,3,,,L,YHE010,7,P3856_YHE010_7_Slot1-1_1_5099,31515.666


In [25]:
# Count the number of unique peptide identifications in each run.
# A peptide is identified by its (PeptideSequence, PrecursorCharge) pair; each such
# pair contributes one count to every run in which it appears.
fragger_counts_d = {}
for _, group_df in fragger_results_df.groupby(['PeptideSequence', 'PrecursorCharge']):
    # The loop variable is deliberately NOT called run_name: that name holds the
    # notebook-level run selection used when filtering the MaxQuant results, and
    # rebinding it here would silently change which run that filter selects.
    for fragger_run_name in group_df['Run'].unique():
        fragger_counts_d[fragger_run_name] = fragger_counts_d.get(fragger_run_name, 0) + 1

In [26]:
# Order the runs by group (second '_'-separated field), then numerically by the
# run number within the group (third field).
ordered_run_names = sorted(
    fragger_counts_d,
    key=lambda full_name: (full_name.split('_')[1], int(full_name.split('_')[2])),
)
# Shorten each name to the fields between the leading prefix and '_Slot',
# e.g. 'P3856_YHE211_1_Slot1-1_1_5104' -> 'YHE211_1', and pair it with its count.
sorted_counts_l = [
    ('_'.join(full_name.split('_Slot')[0].split('_')[1:]), fragger_counts_d[full_name])
    for full_name in ordered_run_names
]
fragger_sorted_counts_df = pd.DataFrame(sorted_counts_l, columns=['run_name','count'])