In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import json
from os.path import expanduser

In [2]:
# mass of a proton; subtracted once per charge when converting an m/z to a neutral mass
PROTON_MASS = 1.00727647
# mass (Da) added to a peptide's mass for every cysteine ('C') in its sequence
# NOTE(review): 57.021464 looks like cysteine carbamidomethylation - confirm this matches the search settings
ADD_C_CYSTEINE_DA = 57.021464

In [3]:
def calculate_monoisotopic_mass_from_mz(monoisotopic_mz, charge):
    """Convert a monoisotopic m/z and its charge state to a neutral monoisotopic mass.

    The m/z is multiplied by the charge, then one PROTON_MASS per charge is subtracted.
    """
    charged_mass = monoisotopic_mz * charge
    proton_contribution = PROTON_MASS * charge
    return charged_mass - proton_contribution

In [4]:
# identifiers of the experiment and of the single run whose features are analysed
experiment_name = 'P3830'
run_name = 'P3830_YeastUPS2_01_Slot1-1_1_5082'
# root directory that holds one sub-directory per experiment
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere
experiment_base_dir = '/media/big-ssd/experiments'

In [5]:
# directory of this experiment under the base directory
EXPERIMENT_DIR = "{}/{}".format(experiment_base_dir, experiment_name)
# where the percolator results are read from
MQ_PERCOLATOR_OUTPUT_DIR = '{}/percolator-output-maxquant'.format(EXPERIMENT_DIR)
# where the per-run feature pickles are read from
MQ_MGF_DIR = '{}/mgf-maxquant'.format(EXPERIMENT_DIR)

In [6]:
# identifications with a percolator q-value above this threshold are discarded
MAXIMUM_Q_VALUE = 0.01

In [7]:
# load the percolator output (tab-separated target PSMs) and tidy it in a single chain
MQ_PERCOLATOR_OUTPUT_FILE_NAME = "{}/{}.percolator.target.psms.txt".format(MQ_PERCOLATOR_OUTPUT_DIR, experiment_name)
mq_psms_df = (
    pd.read_csv(MQ_PERCOLATOR_OUTPUT_FILE_NAME, sep='\t')
    # expose the 'scan' column under the name used as the join key later on
    .rename(columns={'scan': 'mq_index'})
    # the PSM table's own 'charge' column is not used downstream
    .drop(['charge'], axis=1)
    # remove the poor quality identifications
    .loc[lambda psms: psms['peptide mass'] > 0]
)

In [8]:
# inspect the columns available in the PSM table
mq_psms_df.columns

Index(['file_idx', 'mq_index', 'spectrum precursor m/z',
       'spectrum neutral mass', 'peptide mass', 'percolator score',
       'percolator q-value', 'percolator PEP', 'total matches/spectrum',
       'sequence', 'protein id', 'flanking aa'],
      dtype='object')

In [9]:
# load the MQ features that the percolator results will be merged with - obtained with the -v flag of the APL-to-MGF converter
# NOTE(review): read_pickle can execute arbitrary code; only load pickles produced by a trusted pipeline
MQ_FEATURES_NAME = '{}/{}.pkl'.format(MQ_MGF_DIR, run_name)
mq_features_df = pd.read_pickle(MQ_FEATURES_NAME)

In [10]:
# inspect the columns available in the feature table
mq_features_df.columns

Index(['monoisotopic_mass', 'charge', 'monoisotopic_mz', 'intensity',
       'scan_apex', 'rt_apex', 'raw_file', 'mq_index', 'ms2_peaks'],
      dtype='object')

In [11]:
# attach the PSMs to the features by mq_index and keep only the features that were identified
# NOTE(review): the join key is mq_index alone although the PSM table also carries 'file_idx';
# if the percolator output spans several runs, confirm mq_index is unique across them
mq_identifications_df = (
    pd.merge(mq_features_df, mq_psms_df, how='left', on='mq_index')
    # features without a matching PSM have no sequence after the left join
    .dropna(subset=['sequence'])
)


In [12]:
# theoretical mass = reported peptide mass plus a fixed addition for every cysteine in the sequence
cysteine_counts = mq_identifications_df['sequence'].str.count('C')
theoretical_mass = mq_identifications_df['peptide mass'] + (cysteine_counts * ADD_C_CYSTEINE_DA)
# signed difference between the measured feature mass and the theoretical mass
mass_delta = mq_identifications_df['monoisotopic_mass'] - theoretical_mass
mq_identifications_df = mq_identifications_df.assign(
    theoretical_peptide_mass=theoretical_mass,
    # the same difference expressed relative to the theoretical mass, in parts per million
    mass_accuracy_ppm=mass_delta / theoretical_mass * 10**6,
    mass_error=mass_delta,
)


In [13]:
# confirm the merged table now carries the feature, PSM and derived mass columns
mq_identifications_df.columns

Index(['monoisotopic_mass', 'charge', 'monoisotopic_mz', 'intensity',
       'scan_apex', 'rt_apex', 'raw_file', 'mq_index', 'ms2_peaks', 'file_idx',
       'spectrum precursor m/z', 'spectrum neutral mass', 'peptide mass',
       'percolator score', 'percolator q-value', 'percolator PEP',
       'total matches/spectrum', 'sequence', 'protein id', 'flanking aa',
       'theoretical_peptide_mass', 'mass_accuracy_ppm', 'mass_error'],
      dtype='object')

In [14]:
# keep identifications that pass the q-value threshold and whose protein id does not mention YEAST
passes_q_value = mq_identifications_df['percolator q-value'] <= MAXIMUM_Q_VALUE
is_yeast = mq_identifications_df['protein id'].str.contains('YEAST')
mq_identifications_df = mq_identifications_df[passes_q_value & ~is_yeast]

In [15]:
# count the distinct 'protein id' strings (a multi-protein group string counts as one entry)
unique_protein_ids = mq_identifications_df['protein id'].unique()
print('number of unique proteins: {}'.format(len(unique_protein_ids)))

number of unique proteins: 20


#### using the Top3 approach to determine protein quantification

In [16]:
# UPS2 proteins grouped by the amount (in fmoles) assigned to each; eight accessions per tier
ups2_tiers = (
    (50000, ('P00915', 'P00918', 'P01031', 'P69905', 'P68871', 'P41159', 'P02768', 'P62988')),
    (5000, ('P04040', 'P00167', 'P01133', 'P02144', 'P15559', 'P62937', 'Q06830', 'P63165')),
    (500, ('P00709', 'P06732', 'P12081', 'P61626', 'Q15843', 'P02753', 'P16083', 'P63279')),
    (50, ('P01008', 'P61769', 'P55957', 'O76070', 'P08263', 'P01344', 'P01127', 'P10599')),
    (5, ('P99999', 'P06396', 'P09211', 'P01112', 'P01579', 'P02787', 'O00762', 'P51965')),
    (0.5, ('P08758', 'P02741', 'P05413', 'P10145', 'P02788', 'P10636-8', 'P00441', 'P01375')),
)

# one record per protein, in tier order
ups2_proteins_l = [
    {'uniprot': accession, 'fmoles': fmoles}
    for fmoles, accessions in ups2_tiers
    for accession in accessions
]

# lookup from uniprot accession to fmoles
ups2_d = {record['uniprot']: record['fmoles'] for record in ups2_proteins_l}

In [17]:
# list the distinct protein ids that survived the filtering
mq_identifications_df['protein id'].unique()

array(['P02768ups|ALBU_HUMAN_UPS', 'P00918ups|CAH2_HUMAN_UPS',
       'P00915ups|CAH1_HUMAN_UPS', 'P15559ups|NQO1_HUMAN_UPS',
       'P68871ups|HBB_HUMAN_UPS', 'P41159ups|LEP_HUMAN_UPS',
       'P63165ups|SUMO1_HUMAN_UPS', 'P04040ups|CATA_HUMAN_UPS',
       'P02144ups|MYG_HUMAN_UPS', 'P00167ups|CYB5_HUMAN_UPS',
       'P69905ups|HBA_HUMAN_UPS', 'P01031ups|CO5_HUMAN_UPS',
       'P12081ups|SYHC_HUMAN_UPS', 'P01579ups|IFNG_HUMAN_UPS',
       'Q06830ups|PRDX1_HUMAN_UPS', 'P00709ups|LALBA_HUMAN_UPS',
       'P62937ups|PPIA_HUMAN_UPS,P62988ups|UBIQ_HUMAN_UPS,O76070ups|SYUG_HUMAN_UPS',
       'P62988ups|UBIQ_HUMAN_UPS', 'P06732ups|KCRM_HUMAN_UPS',
       'P01133ups|EGF_HUMAN_UPS'], dtype=object)

In [18]:
# find the three most intense peptides for each protein and derive a signal response factor
# (mean intensity of the three divided by the known amount in pmoles)
# NOTE(review): rows are ranked as they are, so one peptide sequence can take more than one of the
# three slots if it was identified more than once (e.g. at several charge states) - confirm this is intended
top3_l = []
for group_name,group_df in mq_identifications_df.groupby('protein id'):
    df = group_df.sort_values(by=['intensity'], ascending=False)
    top3_df = df.head(n=3)
    if len(top3_df) < 3:
        print('didn\'t have three peptides for {}'.format(group_name))
        continue
    # the accession is the text before the first 'ups|'; for a multi-protein group this is the first protein listed
    uniprot_id = group_name.split('ups|')[0]
    if uniprot_id not in ups2_d:
        # without a known amount no response factor can be computed; skip instead of raising KeyError
        print('no known UPS2 amount for {}'.format(group_name))
        continue
    # the table is in fmoles; convert to pmoles
    pmoles = ups2_d[uniprot_id] / 1000
    average_intensity = top3_df.intensity.mean()
    signal_response_factor = average_intensity / pmoles
    # number of identification rows for this protein, not only the three used
    peptide_count = len(df)
    attribs_d = top3_df[['mq_index','sequence','charge','intensity']].to_dict('records')
    top3_l.append({'protein':group_name, 'pmoles':pmoles, 'average_intensity':average_intensity, 'signal_response_factor':signal_response_factor, 'peptide_count':peptide_count, 'attribs_d':attribs_d})
top3_with_df = pd.DataFrame(top3_l)

didn't have three peptides for P00167ups|CYB5_HUMAN_UPS
didn't have three peptides for P00709ups|LALBA_HUMAN_UPS
didn't have three peptides for P01133ups|EGF_HUMAN_UPS
didn't have three peptides for P01579ups|IFNG_HUMAN_UPS
didn't have three peptides for P06732ups|KCRM_HUMAN_UPS
didn't have three peptides for P62937ups|PPIA_HUMAN_UPS,P62988ups|UBIQ_HUMAN_UPS,O76070ups|SYUG_HUMAN_UPS
didn't have three peptides for P62988ups|UBIQ_HUMAN_UPS
didn't have three peptides for P63165ups|SUMO1_HUMAN_UPS


In [19]:
# Serum albumin (P02768) serves as the internal reference: its response factor is applied to every protein
albumin_rows = top3_with_df[top3_with_df['protein'].str.startswith('P02768')]
universal_signal_response_factor = albumin_rows.iloc[0]['signal_response_factor']
universal_signal_response_factor

130899.33333333334

In [20]:
# estimate each protein's amount by dividing its mean top-3 intensity by the reference response factor
top3_with_df['calculated_pmoles'] = top3_with_df['average_intensity'].div(universal_signal_response_factor)

In [21]:
# show the known (pmoles) and estimated (calculated_pmoles) amounts side by side
top3_with_df

Unnamed: 0,protein,pmoles,average_intensity,signal_response_factor,peptide_count,attribs_d,calculated_pmoles
0,P00915ups|CAH1_HUMAN_UPS,50.0,1901467.0,38029.333333,15,"[{'mq_index': 13373, 'sequence': 'HDTSLKPISVSY...",14.526175
1,P00918ups|CAH2_HUMAN_UPS,50.0,1739333.0,34786.666667,16,"[{'mq_index': 16798, 'sequence': 'YDPSLKPLSVSY...",13.287564
2,P01031ups|CO5_HUMAN_UPS,50.0,559199.0,11183.98,3,"[{'mq_index': 16942, 'sequence': 'CCYDGACVNNDE...",4.271977
3,P02144ups|MYG_HUMAN_UPS,5.0,177288.7,35457.733333,5,"[{'mq_index': 7284, 'sequence': 'VEADIPGHGQEVL...",1.354389
4,P02768ups|ALBU_HUMAN_UPS,50.0,6544967.0,130899.333333,74,"[{'mq_index': 15443, 'sequence': 'VFDEFKPLVEEP...",50.0
5,P04040ups|CATA_HUMAN_UPS,5.0,216976.7,43395.333333,7,"[{'mq_index': 8854, 'sequence': 'GAGAFGYFEVTHD...",1.657584
6,P12081ups|SYHC_HUMAN_UPS,0.5,56167.0,112334.0,3,"[{'mq_index': 5825, 'sequence': 'HGAEVIDTPVFEL...",0.429085
7,P15559ups|NQO1_HUMAN_UPS,5.0,145679.0,29135.8,4,"[{'mq_index': 15586, 'sequence': 'LKDPANFQYPAE...",1.112909
8,P41159ups|LEP_HUMAN_UPS,50.0,1225630.0,24512.6,14,"[{'mq_index': 13196, 'sequence': 'VTGLDFIPGLHP...",9.363149
9,P68871ups|HBB_HUMAN_UPS,50.0,1219700.0,24394.0,11,"[{'mq_index': 8044, 'sequence': 'VLGAFSDGLAHLD...",9.317847


In [22]:
# coefficient of variation of the signal response factor: standard deviation divided by mean
response_factors = top3_with_df['signal_response_factor']
cv = response_factors.std() / response_factors.mean()
cv

0.889434727594015

#### calculate correlation

In [24]:
# correlation between the known and the estimated amounts (Series.corr with its default method)
top3_with_df['pmoles'].corr(top3_with_df['calculated_pmoles'])

0.5208846217405785