In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import json
from os.path import expanduser

In [2]:
experiment_name = 'P3830'
feature_detection_method = 'pasef'

In [3]:
MAXIMUM_Q_VALUE = 0.01

In [4]:
BASE_RESULTS_DIR = '/media/big-ssd'

#### identifications with correction for saturation

In [5]:
IDENTIFICATIONS_WITH_CS_DIR = '{}/results-P3830/P3830-results-cs-true-fmdw-true-2021-05-17-15-49-22/identifications-pasef'.format(BASE_RESULTS_DIR)
IDENTIFICATIONS_WITH_CS_FILE = '{}/exp-{}-identifications-{}-recalibrated.pkl'.format(IDENTIFICATIONS_WITH_CS_DIR, experiment_name, feature_detection_method)

In [6]:
# load the identifications that had saturated points in the mono
with open(IDENTIFICATIONS_WITH_CS_FILE, 'rb') as handle:
    d = pickle.load(handle)
identifications_with_cs_df = d['identifications_df']
identifications_with_cs_df = identifications_with_cs_df[(identifications_with_cs_df['percolator q-value'] <= MAXIMUM_Q_VALUE) & (~identifications_with_cs_df['protein id'].str.contains('YEAST'))]

In [7]:
s1 = identifications_with_cs_df.mono_intensity_adjustment_outcome.value_counts(normalize=False).rename('counts', inplace=True)
s2 = round(identifications_with_cs_df.mono_intensity_adjustment_outcome.value_counts(normalize=True)*100,1).rename('%', inplace=True)
pd.concat([s1, s2], axis=1)

Unnamed: 0,counts,%
monoisotopic_not_saturated,217,80.1
monoisotopic_saturated_adjusted,53,19.6
no_nonsaturated_isotopes,1,0.4


In [8]:
print('number of unique proteins: {}'.format(len(identifications_with_cs_df['protein id'].unique())))

number of unique proteins: 19


#### identifications without correction for saturation

In [9]:
IDENTIFICATIONS_WITHOUT_CS_DIR = '{}/results-P3830/P3830-results-cs-false-fmdw-true-2021-05-17-20-15-03/identifications-pasef'.format(BASE_RESULTS_DIR)
IDENTIFICATIONS_WITHOUT_CS_FILE = '{}/exp-{}-identifications-{}-recalibrated.pkl'.format(IDENTIFICATIONS_WITHOUT_CS_DIR, experiment_name, feature_detection_method)

In [10]:
# load the identifications that had saturated points in the mono
with open(IDENTIFICATIONS_WITHOUT_CS_FILE, 'rb') as handle:
    d = pickle.load(handle)
identifications_without_cs_df = d['identifications_df']
identifications_without_cs_df = identifications_without_cs_df[(identifications_without_cs_df['percolator q-value'] <= MAXIMUM_Q_VALUE) & (~identifications_without_cs_df['protein id'].str.contains('YEAST'))]

In [11]:
print('number of unique proteins: {}'.format(len(identifications_without_cs_df['protein id'].unique())))

number of unique proteins: 19


#### using the Top3 approach to determine protein quanitification

In [12]:
ups2_proteins_l = []

ups2_proteins_l.append({'uniprot':'P00915', 'fmoles':50000})
ups2_proteins_l.append({'uniprot':'P00918', 'fmoles':50000})
ups2_proteins_l.append({'uniprot':'P01031', 'fmoles':50000})
ups2_proteins_l.append({'uniprot':'P69905', 'fmoles':50000})
ups2_proteins_l.append({'uniprot':'P68871', 'fmoles':50000})
ups2_proteins_l.append({'uniprot':'P41159', 'fmoles':50000})
ups2_proteins_l.append({'uniprot':'P02768', 'fmoles':50000})
ups2_proteins_l.append({'uniprot':'P62988', 'fmoles':50000})

ups2_proteins_l.append({'uniprot':'P04040', 'fmoles':5000})
ups2_proteins_l.append({'uniprot':'P00167', 'fmoles':5000})
ups2_proteins_l.append({'uniprot':'P01133', 'fmoles':5000})
ups2_proteins_l.append({'uniprot':'P02144', 'fmoles':5000})
ups2_proteins_l.append({'uniprot':'P15559', 'fmoles':5000})
ups2_proteins_l.append({'uniprot':'P62937', 'fmoles':5000})
ups2_proteins_l.append({'uniprot':'Q06830', 'fmoles':5000})
ups2_proteins_l.append({'uniprot':'P63165', 'fmoles':5000})

ups2_proteins_l.append({'uniprot':'P00709', 'fmoles':500})
ups2_proteins_l.append({'uniprot':'P06732', 'fmoles':500})
ups2_proteins_l.append({'uniprot':'P12081', 'fmoles':500})
ups2_proteins_l.append({'uniprot':'P61626', 'fmoles':500})
ups2_proteins_l.append({'uniprot':'Q15843', 'fmoles':500})
ups2_proteins_l.append({'uniprot':'P02753', 'fmoles':500})
ups2_proteins_l.append({'uniprot':'P16083', 'fmoles':500})
ups2_proteins_l.append({'uniprot':'P63279', 'fmoles':500})

ups2_proteins_l.append({'uniprot':'P01008', 'fmoles':50})
ups2_proteins_l.append({'uniprot':'P61769', 'fmoles':50})
ups2_proteins_l.append({'uniprot':'P55957', 'fmoles':50})
ups2_proteins_l.append({'uniprot':'O76070', 'fmoles':50})
ups2_proteins_l.append({'uniprot':'P08263', 'fmoles':50})
ups2_proteins_l.append({'uniprot':'P01344', 'fmoles':50})
ups2_proteins_l.append({'uniprot':'P01127', 'fmoles':50})
ups2_proteins_l.append({'uniprot':'P10599', 'fmoles':50})

ups2_proteins_l.append({'uniprot':'P99999', 'fmoles':5})
ups2_proteins_l.append({'uniprot':'P06396', 'fmoles':5})
ups2_proteins_l.append({'uniprot':'P09211', 'fmoles':5})
ups2_proteins_l.append({'uniprot':'P01112', 'fmoles':5})
ups2_proteins_l.append({'uniprot':'P01579', 'fmoles':5})
ups2_proteins_l.append({'uniprot':'P02787', 'fmoles':5})
ups2_proteins_l.append({'uniprot':'O00762', 'fmoles':5})
ups2_proteins_l.append({'uniprot':'P51965', 'fmoles':5})

ups2_proteins_l.append({'uniprot':'P08758', 'fmoles':0.5})
ups2_proteins_l.append({'uniprot':'P02741', 'fmoles':0.5})
ups2_proteins_l.append({'uniprot':'P05413', 'fmoles':0.5})
ups2_proteins_l.append({'uniprot':'P10145', 'fmoles':0.5})
ups2_proteins_l.append({'uniprot':'P02788', 'fmoles':0.5})
ups2_proteins_l.append({'uniprot':'P10636-8', 'fmoles':0.5})
ups2_proteins_l.append({'uniprot':'P00441', 'fmoles':0.5})
ups2_proteins_l.append({'uniprot':'P01375', 'fmoles':0.5})

ups2_d = {}
for p in ups2_proteins_l:
    ups2_d[p['uniprot']] = p['fmoles']

#### with correction

In [13]:
identifications_with_cs_df['protein id'].unique()

array(['P02768ups|ALBU_HUMAN_UPS', 'P00915ups|CAH1_HUMAN_UPS',
       'P00918ups|CAH2_HUMAN_UPS', 'P68871ups|HBB_HUMAN_UPS',
       'P41159ups|LEP_HUMAN_UPS', 'P69905ups|HBA_HUMAN_UPS',
       'P01031ups|CO5_HUMAN_UPS', 'P62988ups|UBIQ_HUMAN_UPS',
       'Q06830ups|PRDX1_HUMAN_UPS', 'P04040ups|CATA_HUMAN_UPS',
       'P02144ups|MYG_HUMAN_UPS', 'P00167ups|CYB5_HUMAN_UPS',
       'P12081ups|SYHC_HUMAN_UPS',
       'P62937ups|PPIA_HUMAN_UPS,P62988ups|UBIQ_HUMAN_UPS,O76070ups|SYUG_HUMAN_UPS',
       'P15559ups|NQO1_HUMAN_UPS', 'P06732ups|KCRM_HUMAN_UPS',
       'P00709ups|LALBA_HUMAN_UPS', 'P63165ups|SUMO1_HUMAN_UPS',
       'P62937ups|PPIA_HUMAN_UPS'], dtype=object)

In [14]:
# find the three most intense peptides for each protein
top3_l = []
for group_name,group_df in identifications_with_cs_df.groupby('protein id'):
    df = group_df.sort_values(by=['feature_intensity'], ascending=False, inplace=False)
    top3_df = df.head(n=3)
    if len(top3_df) == 3:
        uniprot_id = group_name.split('ups|')[0]
        pmoles = ups2_d[uniprot_id] / 1000
        average_intensity = top3_df.feature_intensity.mean()
        signal_response_factor = average_intensity / pmoles
        peptide_count = len(df)
        attribs_d = top3_df[['feature_id','sequence','charge','feature_intensity']].to_dict('records')
        top3_l.append({'protein':group_name, 'pmoles':pmoles, 'average_intensity':average_intensity, 'signal_response_factor':signal_response_factor, 'peptide_count':peptide_count, 'attribs_d':attribs_d})
    else:
        print('didn\'t have three peptides for {}'.format(group_name))
top3_with_df = pd.DataFrame(top3_l)

didn't have three peptides for P00167ups|CYB5_HUMAN_UPS
didn't have three peptides for P00709ups|LALBA_HUMAN_UPS
didn't have three peptides for P06732ups|KCRM_HUMAN_UPS
didn't have three peptides for P62937ups|PPIA_HUMAN_UPS
didn't have three peptides for P62988ups|UBIQ_HUMAN_UPS


In [15]:
# using Serum albumin (P02768) as an internal reference
universal_signal_response_factor = top3_with_df[top3_with_df.protein.str.startswith('P02768')].iloc[0].signal_response_factor
universal_signal_response_factor

2705.826666666667

In [16]:
top3_with_df['calculated_pmoles'] = top3_with_df.average_intensity / universal_signal_response_factor

In [17]:
top3_with_df

Unnamed: 0,protein,pmoles,average_intensity,signal_response_factor,peptide_count,attribs_d,calculated_pmoles
0,P00915ups|CAH1_HUMAN_UPS,50.0,57896.0,1157.92,21,"[{'feature_id': 829701, 'sequence': 'HDTSLKPIS...",21.396788
1,P00918ups|CAH2_HUMAN_UPS,50.0,55843.0,1116.86,19,"[{'feature_id': 2161501, 'sequence': 'YDPSLKPL...",20.638055
2,P01031ups|CO5_HUMAN_UPS,50.0,43721.666667,874.433333,4,"[{'feature_id': 613701, 'sequence': 'CCYDGACVN...",16.15834
3,P02144ups|MYG_HUMAN_UPS,5.0,6799.666667,1359.933333,6,"[{'feature_id': 1141101, 'sequence': 'VEADIPGH...",2.512972
4,P02768ups|ALBU_HUMAN_UPS,50.0,135291.333333,2705.826667,127,"[{'feature_id': 4736101, 'sequence': 'MPCAEDYL...",50.0
5,P04040ups|CATA_HUMAN_UPS,5.0,46716.0,9343.2,8,"[{'feature_id': 3697701, 'sequence': 'GPLLVQDV...",17.264964
6,P12081ups|SYHC_HUMAN_UPS,0.5,3963.666667,7927.333333,3,"[{'feature_id': 4697301, 'sequence': 'LLNQLQYC...",1.464863
7,P15559ups|NQO1_HUMAN_UPS,5.0,15619.0,3123.8,4,"[{'feature_id': 834301, 'sequence': 'ALIVLAHSE...",5.772358
8,P41159ups|LEP_HUMAN_UPS,50.0,38552.333333,771.046667,29,"[{'feature_id': 3952601, 'sequence': 'VTGLDFIP...",14.247895
9,"P62937ups|PPIA_HUMAN_UPS,P62988ups|UBIQ_HUMAN_...",5.0,3851.333333,770.266667,5,"[{'feature_id': 12101, 'sequence': 'GSSHHHHHHS...",1.423348


In [18]:
# calculate the CV for the signal response factor
cv = top3_with_df.signal_response_factor.std() / top3_with_df.signal_response_factor.mean()
cv

1.1695607526938105

#### without correction

In [19]:
# find the three most intense peptides for each protein
top3_l = []
for group_name,group_df in identifications_without_cs_df.groupby('protein id'):
    df = group_df.sort_values(by=['feature_intensity'], ascending=False, inplace=False)
    top3_df = df.head(n=3)
    if len(top3_df) == 3:
        uniprot_id = group_name.split('ups|')[0]
        pmoles = ups2_d[uniprot_id] / 1000
        average_intensity = top3_df.feature_intensity.mean()
        signal_response_factor = average_intensity / pmoles
        peptide_count = len(df)
        attribs_d = top3_df[['feature_id','sequence','charge','feature_intensity']].to_dict('records')
        top3_l.append({'protein':group_name, 'pmoles':pmoles, 'average_intensity':average_intensity, 'signal_response_factor':signal_response_factor, 'peptide_count':peptide_count, 'attribs_d':attribs_d})
    else:
        print('didn\'t have three peptides for {}'.format(group_name))
top3_without_df = pd.DataFrame(top3_l)

didn't have three peptides for P00167ups|CYB5_HUMAN_UPS
didn't have three peptides for P00709ups|LALBA_HUMAN_UPS
didn't have three peptides for P06732ups|KCRM_HUMAN_UPS
didn't have three peptides for P62937ups|PPIA_HUMAN_UPS
didn't have three peptides for P62988ups|UBIQ_HUMAN_UPS


In [20]:
# using Serum albumin (P02768) as an internal reference
universal_signal_response_factor = top3_without_df[top3_without_df.protein.str.startswith('P02768')].iloc[0].signal_response_factor

In [21]:
top3_without_df['calculated_pmoles'] = top3_without_df.average_intensity / universal_signal_response_factor

In [22]:
# calculate the CV for the signal response factor
cv = top3_without_df.signal_response_factor.std() / top3_without_df.signal_response_factor.mean()
cv

1.199136606177974

#### calculate correlation

In [23]:
combined_df = pd.merge(top3_with_df, top3_without_df, how='inner', left_on=['protein'], right_on=['protein'], suffixes=['_with','_without'])

In [24]:
combined_df.sample(n=3)

Unnamed: 0,protein,pmoles_with,average_intensity_with,signal_response_factor_with,peptide_count_with,attribs_d_with,calculated_pmoles_with,pmoles_without,average_intensity_without,signal_response_factor_without,peptide_count_without,attribs_d_without,calculated_pmoles_without
8,P41159ups|LEP_HUMAN_UPS,50.0,38552.333333,771.046667,29,"[{'feature_id': 3952601, 'sequence': 'VTGLDFIP...",14.247895,50.0,34207.666667,684.153333,29,"[{'feature_id': 3952601, 'sequence': 'VTGLDFIP...",16.38743
6,P12081ups|SYHC_HUMAN_UPS,0.5,3963.666667,7927.333333,3,"[{'feature_id': 4697301, 'sequence': 'LLNQLQYC...",1.464863,0.5,3963.666667,7927.333333,3,"[{'feature_id': 4697301, 'sequence': 'LLNQLQYC...",1.898823
10,P63165ups|SUMO1_HUMAN_UPS,5.0,1993.0,398.6,3,"[{'feature_id': 3621302, 'sequence': 'YIAWPLQG...",0.736559,5.0,1993.0,398.6,3,"[{'feature_id': 3621302, 'sequence': 'YIAWPLQG...",0.954761


In [25]:
combined_df['pmoles_with'].corr(combined_df['calculated_pmoles_with'])

0.6671905312003801

In [26]:
combined_df['pmoles_without'].corr(combined_df['calculated_pmoles_without'])

0.6756979027475872