In [18]:
import pandas as pd
from os.path import expanduser
import numpy as np

In [2]:
# --- run configuration ---
# root directory; joined with experiment_name to form the experiment directory
experiment_base_dir = '/media/big-ssd/experiments'
# experiment identifier, embedded in directory and file names
experiment_name = 'P3856'
# name of a single run (not referenced by the cells visible in this section)
run_name = 'P3856_YHE211_1_Slot1-1_1_5104'
# label embedded in the identifications directory and file names
precursor_definition_method = 'pasef'

In [3]:
# directory for this experiment: <base dir>/<experiment name>
EXPERIMENT_DIR = f"{experiment_base_dir}/{experiment_name}"
# directory the percolator output is read from
MQ_PERCOLATOR_OUTPUT_DIR = f"{EXPERIMENT_DIR}/percolator-output-pasef-maxquant"

In [4]:
# Inclusive upper bound on the percolator q-value: a (sequence, charge) group is
# kept only when its smallest q-value is <= this threshold.
MAXIMUM_Q_VALUE = 0.01

In [5]:
# Path of the tab-separated percolator PSM table for this experiment.
MQ_PERCOLATOR_OUTPUT_FILE_NAME = f"{MQ_PERCOLATOR_OUTPUT_DIR}/{experiment_name}.percolator.target.psms.txt"

# Read the table, expose the 'scan' column as 'mq_index', and drop the rows
# with a non-positive peptide mass (treated here as poor-quality identifications).
mq_psms_df = (
    pd.read_csv(MQ_PERCOLATOR_OUTPUT_FILE_NAME, sep='\t')
    .rename(columns={'scan': 'mq_index'})
    .loc[lambda psms: psms['peptide mass'] > 0]
)

In [6]:
# For every (sequence, charge) pair in the percolator output, keep the PSM with
# the lowest percolator q-value, provided that q-value is <= MAXIMUM_Q_VALUE.
# The result has one row per retained pair: 'sequence_key' ("<sequence>-<charge>")
# and the 'mq_index' of the best PSM.
mq_sequences_l = []
for group_name, group_df in mq_psms_df.groupby(['sequence', 'charge']):
    sequence, charge = group_name
    # Re-index positionally so the best row is located by position; a label-based
    # .loc lookup would return several rows if index labels were duplicated.
    q_values = group_df['percolator q-value'].reset_index(drop=True)
    # A group whose q-values are all NaN gives NaN here, fails the comparison
    # and is skipped.
    if q_values.min() <= MAXIMUM_Q_VALUE:
        # idxmin skips NaN and returns the first occurrence of the minimum
        mq_index = group_df['mq_index'].iloc[q_values.idxmin()]
        mq_sequences_l.append({'sequence_key': '{}-{}'.format(sequence, charge), 'mq_index': mq_index})
# Explicit columns keep the schema stable even when no group passes the
# threshold, so a later merge on 'sequence_key' does not fail with a KeyError.
mq_sequences_df = pd.DataFrame(mq_sequences_l, columns=['sequence_key', 'mq_index'])

In [7]:
# spot-check three randomly chosen rows
mq_sequences_df.sample(3)

Unnamed: 0,sequence_key,mq_index
1849,FISEQLSQSGFHDIK-2,10342
1543,EQFSDGVGYSWIDGLK-2,11879
4487,MQEAMTQEVSDVFSDTTTPIK-2,20563


In [8]:
# load my identifications
BASE_DIR = '{}/P3856-results-cs-true-fmdw-true-2021-04-24-19-58-32'.format(expanduser('~'))
IDENTS_DIR = '{}/identifications-{}'.format(BASE_DIR, precursor_definition_method)
IDENTS_NAME = '{}/exp-{}-identifications-{}-recalibrated.pkl'.format(IDENTS_DIR, experiment_name, precursor_definition_method)

In [9]:
# The pickle holds a dict-like object; only its 'identifications_df' entry is used.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
identifications_before_unique_df = pd.read_pickle(IDENTS_NAME)['identifications_df']

In [22]:
# For every (sequence, charge) pair in the loaded identifications, keep the one
# with the lowest percolator q-value, provided that q-value is <= MAXIMUM_Q_VALUE.
# The result has one row per retained pair: 'sequence_key' ("<sequence>-<charge>")
# and the integer 'feature_id' of the best identification.
sequences_l = []
for group_name, group_df in identifications_before_unique_df.groupby(['sequence', 'charge']):
    sequence, charge = group_name
    # Re-index positionally so the best row is located by position; the frame is
    # loaded from a pickle and its index labels are not guaranteed to be unique,
    # in which case a label-based .loc lookup would return several rows.
    q_values = group_df['percolator q-value'].reset_index(drop=True)
    # A group whose q-values are all NaN gives NaN here, fails the comparison
    # and is skipped.
    if q_values.min() <= MAXIMUM_Q_VALUE:
        # idxmin skips NaN and returns the first occurrence of the minimum
        feature_id = group_df['feature_id'].iloc[q_values.idxmin()]
        sequences_l.append({'sequence_key': '{}-{}'.format(sequence, charge), 'feature_id': int(feature_id)})
# Explicit columns keep the schema stable even when no group passes the
# threshold, so a later merge on 'sequence_key' does not fail with a KeyError.
sequences_df = pd.DataFrame(sequences_l, columns=['sequence_key', 'feature_id'])

In [23]:
# spot-check three randomly chosen rows
sequences_df.sample(3)

Unnamed: 0,sequence_key,feature_id
682,DVNAAIATIK-1,1556901
1742,IATDPFVGNLTFFR-2,4240803
701,DYGTDTILQDFHYDEKPLIAK-3,3080501


In [24]:
# Left join on the shared key: every percolator-output sequence is kept, and
# feature_id is missing (NaN) where the loaded identifications have no match.
common_df = mq_sequences_df.merge(sequences_df, how='left', on='sequence_key')

In [25]:
# spot-check five randomly chosen rows of the joined table
common_df.sample(5)

Unnamed: 0,sequence_key,mq_index,feature_id
4569,NAAFGQSGGAGSDSNSPGNVQPNSAPSVESHPVLEK-3,23121,
6815,VNVTSAEGLQSVLETNLGPK-2,17107,
2802,HVDENDTLYSLVSR-2,8321,
7212,YFTLGLPTGSTPLGCYK-2,13551,3599007.0
5059,QITQVYGFYDECQTK-2,13651,


In [27]:
# four random rows that have no feature_id after the left join
common_df.loc[common_df['feature_id'].isna()].sample(n=4)

Unnamed: 0,sequence_key,mq_index,feature_id
563,AVLPLLDAQQPCYLLYR-2,16715,
1984,FSNTGEDWYVLVGVAK-2,11514,
2017,FVADGDIGSGSVIIKPFVDMEHPETSIK-4,22460,
235,AIGVQNAYFPMFVSSR-2,11547,
