In [None]:
import pandas as pd
import numpy as np
import os
import glob
import statistics
import itertools
import re
import pickle
from pyteomics import mgf, mass
from collections import Counter

# 0. Initial parameters

In [None]:
# Fragmentation method used
Frag_method = 'HCD' # HCD, CID, ETD, ECD

# Ion types for theoretical spectrum generation
ion_type = ['Precur', 'y', 'b', 'a', 'z', 'c', 'INT', 'IM']

# Generate each ion type
# The y/b/a series is switched on for HCD and CID, the z/c series for ETD and ECD.
# Any other value is rejected here: without the final branch annot_yb / annot_zc would stay
# undefined and the notebook would only fail later with a NameError.
if Frag_method in ('HCD', 'CID'):
    annot_yb     = True # y-, b-, a-ion
    annot_zc     = False # z-, c-ion
elif Frag_method in ('ETD', 'ECD'):
    annot_yb     = False # y-, b-, a-ion
    annot_zc     = True # z-, c-ion
else:
    raise ValueError("Unsupported Frag_method %r (expected 'HCD', 'CID', 'ETD' or 'ECD')" % (Frag_method,))
annot_precur = True # precursor ion
annot_INT    = True # internal ion
annot_IM     = True # immonium ion

# Lookup from ion type name (see ion_type) to its on/off switch
annot_dict = {
    'Precur' : annot_precur,
    'y'      : annot_yb,
    'b'      : annot_yb,
    'a'      : annot_yb,
    'z'      : annot_zc,
    'c'      : annot_zc,
    'INT'    : annot_INT,
    'IM'     : annot_IM
}

# MS2 mass tolerance (ppm)
# Used as |theoretical - experimental| < theoretical * ms2_ppm * 1e-6 when peaks are matched.
ms2_ppm = 15

# Signal-to-noise (SNR) filter for MS2 spectrum
apply_SNR = True # Apply SNR filter
SNR       = 2 # SNR threshold (multiplier applied to the noise baseline)
low       = 0.05 # Fraction (0.05 = 5%) of the lowest-intensity peaks averaged as baseline noise level

# Maximum charge state of sequence ions
max_charge = 2 # 2, 3, ... 'max' ('max' = highest precursor 'Charge' in the PSM table)

# Neutral loss from precursor & sequence ion
# Key = label used in ion annotations, 'mass' = mass subtracted, 'AA' = residue symbols that
# allow the loss (lowercase 'r' is the citrulline symbol defined in Mod_symbol).
NL_seq = {
    'NH3' : { 'mass' : mass.calculate_mass(formula = 'NH3'),   'AA' : {'R', 'K', 'Q', 'N', 'r'} },
    'H2O' : { 'mass' : mass.calculate_mass(formula = 'H2O'),   'AA' : {'S', 'T', 'E', 'D'} },
    '43'  : { 'mass' : mass.calculate_mass(formula = 'HNCO'),  'AA' : {'r'} }, # Citrullination specific NL
    '42'  : { 'mass' : mass.calculate_mass(formula = 'CN2H2'), 'AA' : {'R'} }, # Arg specific NL
    '59'  : { 'mass' : mass.calculate_mass(formula = 'CN3H5'), 'AA' : {'R'} } # Arg specific NL
}

# Neutral loss from internal ion
# Same table as NL_seq plus a CO loss. The CO entry uses the empty string as its only "residue":
# '' is a substring of every string, so NL_condition accepts a lone CO loss for any internal ion.
# NOTE(review): when CO is combined with another loss NL_condition returns None, because the
# residue counter never holds a count for '' - confirm that excluding such combinations is intended.
NL_INT = {
    **NL_seq, 
    **{ 'CO' : { 'mass' : mass.calculate_mass(formula = 'CO'), 'AA' : {''} } } # CO loss from C-terminus of internal ion
}

# Maximum number of neutral loss from a single ion
max_NL = 3

# Common masses
# NOTE(review): `proton` is computed from the formula 'H'; presumably this is the neutral hydrogen
# atom mass rather than the bare proton mass - confirm against pyteomics. It is used below both as
# the charge carrier and as the hydrogen of the terminal water (OH + proton).
proton = mass.calculate_mass(formula = 'H')
OH     = mass.calculate_mass(formula = 'OH')
CO     = mass.calculate_mass(formula = 'CO')
NH3    = mass.calculate_mass(formula = 'NH3')

# Symbols for common modifications
# Key = modification text as written in the 'Peptide' column, value = the single-character symbol
# that replaces it. Mod_to_symbol applies the replacements in this insertion order, so the
# residue-specific keys (e.g. 'K+144.102') must stay ahead of the bare N-terminal keys
# (e.g. '+144.102') that are substrings of them.
# NOTE(review): Unimod lists Glu->pyro-Glu as a water loss (-18.011) and Gln->pyro-Glu as an
# ammonia loss (-17.027); the two pyro-Glu keys below appear to carry swapped deltas - confirm
# against the search engine output.
Mod_symbol = {
    'C+57.021'  : 'c', # C(Carbamidomethyl)
    'M+15.995'  : 'm', # M(Oxidation)
    'R+0.984'   : 'r', # R(Citrullination)
    'N+0.984'   : 'n', # N(Deamidation)
    'Q+0.984'   : 'q', # Q(Deamidation)
    'E-17.027'  : '@', # Pyro-Glu from Glu
    'Q-18.011'  : '#', # Pyro-Glu from Gln
    'K+144.102' : 'i', # K(iTRAQ 4-plex)
    'K+304.205' : 'u', # K(iTRAQ 8-plex)
    'K+229.163' : 't', # K(TMT)
    '+144.102'  : '%', # N-term(iTRAQ 4-plex)
    '+304.205'  : '&', # N-term(iTRAQ 8-plex)
    '+229.163'  : '^', # N-term(TMT)
    '+42.011'   : 'a'  # N-term(Acetyl)
}
# Symbols whose key has nothing in front of the first '+', i.e. the N-terminal modifications above
Nterm_mod_symbol = [symbol for mod, symbol in Mod_symbol.items() if len(mod.split('+')[0]) == 0] # N-term modification symbols

# Amino acid monoisotopic mass
# Keys are the one-letter residue codes plus the modification symbols from Mod_symbol.
# Modified-residue symbols hold the mass of the whole modified residue, while the N-terminal
# symbols ('%', '&', '^', 'a') hold only the mass shift of the modification itself.
# Characters missing from this table contribute 0 wherever masses are summed with aa.get(x, 0).
# NOTE(review): '@' and '#' follow the deltas used in the Mod_symbol keys ('E-17.027', 'Q-18.011');
# if those deltas are swapped, these two masses need correcting as well - confirm.
aa = {
    'c' : 160.030649, # C(Carbamidomethyl)
    'm' : 147.0354,   # M(Oxidation)
    'r' : 157.085127, # R(Citrullination)
    'n' : 115.026943, # N(Deamidation)
    'q' : 129.042594, # Q(Deamidation)
    '@' : 112.015593, # Pyro-Glu from Glu
    '#' : 110.047578, # Pyro-Glu from Gln
    'i' : 272.196963, # K(iTRAQ 4-plex)
    'u' : 432.299963, # K(iTRAQ 8-plex)
    't' : 357.257963, # K(TMT)
    '%' : 144.102,    # N-term(iTRAQ 4-plex)
    '&' : 304.205,    # N-term(iTRAQ 8-plex)
    '^' : 229.163,    # N-term(TMT)
    'a' : 42.011,     # N-term(Acetyl)
    'A' : 71.037114,
    'R' : 156.101111,
    'N' : 114.042927,
    'D' : 115.026943,
    'C' : 103.009185,
    'E' : 129.042593,
    'Q' : 128.058578,
    'G' : 57.021464,
    'H' : 137.058912,
    'I' : 113.084064,
    'L' : 113.084064,
    'K' : 128.094963,
    'M' : 131.040485,
    'F' : 147.068414,
    'P' : 97.052764,
    'S' : 87.032028,
    'T' : 101.047679,
    'W' : 186.079313,
    'Y' : 163.06332,
    'V' : 99.068414
}

# 1. Input files

In [None]:
# Set current working directory
# The glob patterns used for the spectrum and search-result files are relative, so they are
# resolved against this directory.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
PATH = "F:/Project/DGIST_김민식_교수/Citrullination/Biological_data/Human_brain_Citrullinome/"
os.chdir(PATH)

## 1.1 Spectrum file (mgf)

In [None]:
# Read spectrum files
# Every '*.mgf' file in the working directory is loaded into one dataframe (one row per spectrum).
spec_files = glob.glob('*.mgf')
if not spec_files:
    # Fail with a clear message instead of the generic "No objects to concatenate" from pd.concat
    raise FileNotFoundError("No '*.mgf' spectrum files found in %s" % (os.getcwd()))

dfs = []
for spec_file in spec_files:
    # Use the reader as a context manager so the file handle is closed once the spectra are loaded
    with mgf.read(spec_file, convert_arrays = 1, read_charges = False) as reader:
        df_temp = pd.DataFrame(reader)
    dfs.append(df_temp)

df_exp = pd.concat(dfs).reset_index(drop = True)
df_exp['params'] = df_exp['params'].apply(lambda x: x.get('title')) # Get scan title
df_exp = df_exp.rename(columns = {'params': 'Title'}) # Change column name

In [None]:
# Remove redundant spectra
# keep = False drops every spectrum whose title occurs more than once (no copy is retained).
df_exp.drop_duplicates('Title', keep = False, ignore_index = True, inplace = True)

In [None]:
# Retain only nonzero intensity
# nonzero_intensity is a per-spectrum 0/1 mask. Multiplying by it zeroes the m/z and intensity of
# zero-intensity peaks, and the comprehension then drops those zeros, leaving plain Python lists.
nonzero_intensity = df_exp['intensity array'].apply(lambda x: (x > 0).astype(int))
df_exp['intensity array'] = (df_exp['intensity array']*nonzero_intensity).apply(lambda x: [i for i in x if i != 0])
df_exp['m/z array']       = (df_exp['m/z array']      *nonzero_intensity).apply(lambda x: [i for i in x if i != 0])

In [None]:
# Remove spectrum with <20 peaks
# The row index is not reset here; it is reset later when df_exp is merged with the PSM table.
df_exp = df_exp[df_exp['m/z array'].str.len() >= 20]

In [None]:
# Apply SNR filter
if apply_SNR == True:
    # Noise baseline = mean of the lowest `low` fraction of the intensities. At least one peak is
    # always averaged, so a small `low` (or a short spectrum) cannot hand an empty list to
    # statistics.mean(). Peaks at or below baseline * SNR are zeroed by the 0/1 mask and dropped.
    signal_intensity = df_exp['intensity array'].apply(lambda x: (x > statistics.mean(sorted(x)[:max(1, round(len(x)*low))])*SNR).astype(int))
    df_exp['intensity array'] = (df_exp['intensity array']*signal_intensity).apply(lambda x: [i for i in x if i != 0])
    df_exp['m/z array']       = (df_exp['m/z array']      *signal_intensity).apply(lambda x: [i for i in x if i != 0])

In [None]:
# Create a column of mz-intensity pairs
# Each spectrum becomes a list of [m/z, intensity] lists, paired by position in the two arrays.
df_exp['mz_intensity'] = df_exp[['m/z array', 'intensity array']].apply(lambda x: [list(t) for t in zip(x['m/z array'], x['intensity array'])], axis = 1)


## 1.2 Search result file

In [None]:
# Read search result files
# The glob pattern is a literal file name, so at most one CSV is read; pd.concat raises if the
# file is missing from the working directory.
search_files = glob.glob('Human_brain_Citrullinome_paper_multi_Cit_PSMs.csv')
dfs = []
for search_file in search_files:
    df_temp = pd.read_csv(search_file)
    dfs.append(df_temp)

df = pd.concat(dfs).reset_index(drop = True)

In [None]:
# Remove redundant PSMs
# keep = False drops every PSM whose 'Title' occurs more than once (no copy is retained).
df.drop_duplicates('Title', keep = False, ignore_index = True, inplace = True)

In [None]:
# Define a function to convert modifications to specific symbols
def Mod_to_symbol(peptide):
    """Return `peptide` with every modification text replaced by its one-character symbol.

    The replacements are applied one after another in the insertion order of Mod_symbol.
    """
    converted = peptide
    for modification in Mod_symbol:
        converted = converted.replace(modification, Mod_symbol[modification])
    return converted

In [None]:
# Define a function to remove N-term modification symbols
def Nterm_mod_remove(peptide):
    """Return `peptide` with every occurrence of each N-terminal modification symbol removed."""
    stripped = peptide
    for nterm_symbol in Nterm_mod_symbol:
        stripped = stripped.replace(nterm_symbol, '')
    return stripped

In [None]:
# Dataframe generation
# One row per PSM: 'Peptide' is the raw search-engine string, 'mod_Peptide' has modifications
# replaced by symbols, and 'seq_Peptide' additionally has the N-terminal symbols removed.
df_pep = pd.DataFrame(data = df['Title'].tolist(), columns = ['Title'] )
df_pep.insert(loc = 1, column='Peptide', value = list(df['Peptide']))
df_pep['mod_Peptide'] = df_pep['Peptide'].apply(lambda x: Mod_to_symbol(x)) # Convert modification to specific symbols
df_pep['seq_Peptide'] = df_pep['mod_Peptide'].apply(lambda x: Nterm_mod_remove(x)) # Remove N-term modification symbol
df_pep['Pep_length'] = df_pep['seq_Peptide'].str.len()
df_pep.insert(loc = 4, column='Charge', value = list(df['Charge']))

In [None]:
# Remove sequence without Cit
# 'r' is the citrullinated-arginine symbol from Mod_symbol.
df_pep = df_pep[df_pep.mod_Peptide.str.contains("r")]
df_pep = df_pep.reset_index(drop = True)

In [None]:
# Get theoretical precursor m/z
# Neutral mass = sum of residue/modification masses + OH + H; m/z = (mass + Charge*proton)/Charge.
# ','.join(j) only interleaves commas between the characters; they are not keys of `aa` and add 0.
df_pep['mz_Precursor'] = ([sum(aa.get(x, 0) for x in ','.join(j)) + OH + proton for j in df_pep['mod_Peptide'].astype(str).fillna('')] + df_pep['Charge']*proton)/df_pep['Charge']


In [None]:
# Count the number of Cit
df_pep['Cit_Count'] = df_pep['seq_Peptide'].str.count("r")

In [None]:
# Keep only PSMs with corresponding spectra
# Inner merge on 'Title', then sort by 'Title' so both tables end up in the same row order.
df_pep = df_pep.merge(df_exp['Title'], on = 'Title').sort_values('Title', ascending = True).reset_index(drop = True)

In [None]:
# Keep only spectra with corresponding PSMs
# After this merge row r of df_exp and row r of df_pep refer to the same 'Title'; the matching
# loop in section 3 relies on that positional correspondence.
df_exp = df_exp.merge(df_pep['Title'], on = 'Title').sort_values('Title', ascending = True).reset_index(drop = True)

# 2. Theoretical spectrum generation

## 2.1 Ion sequence

In [None]:
# Define functions to generate ion sequence
def getCombinations_seq (lst): # For regular sequence ion
    """Yield prefixes lst[0:stop] of `lst`, once for every (start, stop) pair with start < stop.

    A prefix of length k is therefore yielded k times; callers de-duplicate the result.
    """
    total = len(lst)
    for start in range(total):
        for stop in range(start + 1, total + 1):
            yield lst[0:stop]

def getCombinations_INT (lst): # For internal ion
    """Yield every contiguous, non-empty slice lst[start:stop] of `lst`, ordered by start then stop."""
    total = len(lst)
    for start in range(total):
        for stop in range(start + 1, total + 1):
            yield lst[start:stop]

In [None]:
## Get ion sequence
# Builds df_pep_mz: the peptide table plus one 'seq_<ion>_<n>' column per fragment, where each
# cell holds the residue string of that fragment (None/NaN where a peptide has fewer fragments).
# Regular sequence ion
df_pep_mz = df_pep.copy()

if annot_yb == True:
    # y ion
    # The first residue is dropped and the rest reversed, so each stored string is a C-terminal
    # fragment written back to front. set() removes the repeated prefixes yielded by
    # getCombinations_seq and the outer sorted() orders them (shorter prefixes sort first).
    Cterm_combo = df_pep['seq_Peptide'].str[1:].apply(lambda x: [i for i in sorted(list(set(sorted(getCombinations_seq (x[::-1]), key = len))))])
    df_pep_mz   = pd.concat([df_pep_mz, pd.DataFrame(Cterm_combo.values.tolist(), 
                                                columns = ['seq_y_' + str(i) for i in range(1, len(max(Cterm_combo, key = len)) + 1)])], axis=1)
    # b ion
    # Nterm_mod holds the N-terminal modification symbols of each peptide; they are prepended to
    # every N-terminal fragment so that their mass is included in the b/a ion m/z.
    Nterm_mod = df_pep['mod_Peptide'].apply(lambda x: ''.join([AA for AA in x if AA in Nterm_mod_symbol]))
    Nterm_combo = df_pep['seq_Peptide'].str[0:-1].apply(lambda x: [i for i in sorted(list(set(sorted(getCombinations_seq (x), key = len))))])
    Nterm_combo_mod = pd.concat([Nterm_mod, Nterm_combo], axis = 1).apply(lambda x: [x['mod_Peptide'] + ion for ion in x['seq_Peptide']], axis = 1)
    df_pep_mz   = pd.concat([df_pep_mz, pd.DataFrame(Nterm_combo_mod.values.tolist(), 
                                                   columns = ['seq_b_' + str(i) for i in range(1, len(max(Nterm_combo_mod, key = len)) + 1)])], axis=1)
    # a ion
    # Same fragment strings as the b ions; only the m/z formula differs (see a_mz).
    df_pep_mz   = pd.concat([df_pep_mz, pd.DataFrame(Nterm_combo_mod.values.tolist(), 
                                                   columns = ['seq_a_' + str(i) for i in range(1, len(max(Nterm_combo_mod, key = len)) + 1)])], axis=1)

if annot_zc == True:
    # Get non-sensical z-, c-ions
    # P_index: 0-based positions of 'P' in each peptide (NaN when there is none).
    # z_nonsense / c_nonsense: fragment lengths excluded for z ions (Pep_length - index) and
    # c ions (index) respectively.
    P_index = df_pep['seq_Peptide'].apply(lambda x: [index for index, AA in enumerate(x) if AA == 'P'] if 'P' in x else np.nan)
    z_nonsense = pd.DataFrame([df_pep['Pep_length'], P_index]).T.apply(lambda x: [x['Pep_length'] - index for index in x['seq_Peptide']] if isinstance(x['seq_Peptide'], list) else np.nan, axis = 1)
    c_nonsense = P_index.apply(lambda x: [index for index in x] if isinstance(x, list) else np.nan)
    
    # z ion
    # Excluded fragments are replaced by NaN in place, so the remaining fragments keep their column.
    Cterm_combo = df_pep['seq_Peptide'].str[1:].apply( lambda x: [i for i in sorted(list(set(sorted(getCombinations_seq(x[::-1]), key = len))))])
    Cterm_combo = pd.DataFrame([Cterm_combo, z_nonsense]).T.apply(lambda x: [ion if len(ion) not in x[1] else np.nan for ion in x[0]] if isinstance(x[1], list) else x[0], axis = 1) # Remove non-sensical z-ions
    df_pep_mz   = pd.concat([df_pep_mz, pd.DataFrame(Cterm_combo.values.tolist(), 
                                                columns = ['seq_z_' + str(i) for i in range(1, len(max(Cterm_combo, key = len)) + 1)])], axis = 1)
    # c ion
    # Nterm_combo (without N-term symbols) is kept for labelling; Nterm_combo_mod (with them) is
    # what is stored in the 'seq_c_*' columns and later used for the m/z calculation.
    Nterm_mod = df_pep['mod_Peptide'].apply(lambda x: ''.join([AA for AA in x if AA in Nterm_mod_symbol]))
    Nterm_combo = df_pep['seq_Peptide'].str[0:-1].apply(lambda x: [i for i in sorted(list(set(sorted(getCombinations_seq(x), key = len))))])
    Nterm_combo_mod = pd.concat([Nterm_mod, Nterm_combo], axis = 1).apply(lambda x: [x['mod_Peptide'] + ion for ion in x['seq_Peptide']], axis = 1)
    Nterm_combo = pd.DataFrame([Nterm_combo, c_nonsense]).T.apply(lambda x: [ion if len(ion) not in x[1] else np.nan for ion in x[0]] if isinstance(x[1], list) else x[0], axis = 1) # Remove non-sensical c-ions
    Nterm_combo_mod = pd.DataFrame([Nterm_combo_mod, c_nonsense]).T.apply(lambda x: [ion if len(Nterm_mod_remove(ion)) not in x[1] else np.nan for ion in x[0]] if isinstance(x[1], list) else x[0], axis = 1) # Remove non-sensical c-ions
    df_pep_mz   = pd.concat([df_pep_mz, pd.DataFrame(Nterm_combo_mod.values.tolist(), 
                                                   columns = ['seq_c_' + str(i) for i in range(1, len(max(Nterm_combo_mod, key = len)) + 1)])], axis=1)
# Internal ion
# All contiguous sub-sequences of length > 1 taken from the peptide without its first and last residue.
if annot_INT == True:
    INT_combo   = df_pep['seq_Peptide'].str[1:-1].apply(lambda x: [i for i in sorted(getCombinations_INT(x), key = len) if len(i) > 1])
    df_pep_mz   = pd.concat([df_pep_mz, pd.DataFrame(INT_combo.values.tolist(), 
                                                   columns = ['seq_INT_' + str(i) for i in range(1, len(max(INT_combo, key = len)) + 1)])], axis=1)


In [None]:
## Get ion annotation
# Builds df_pep_label with the same 'seq_<ion>_<n>' columns as df_pep_mz, but each cell holds the
# annotation text (e.g. 'y3') instead of the fragment sequence. This cell reuses Cterm_combo,
# Nterm_combo and INT_combo from the previous cell, so it must run directly after it.
# Regular sequence ion
df_pep_label = df_pep.copy()

if annot_yb == True:
    # y ion
    # The label number is the fragment length (number of residues in the fragment string).
    Cterm_combo_label = Cterm_combo.apply(lambda x: ['y' + str(len(i)) for i in x])
    df_pep_label = pd.concat([df_pep_label, pd.DataFrame(Cterm_combo_label.values.tolist(), 
                                                columns = ['seq_y_' + str(i) for i in range(1, len(max(Cterm_combo, key = len)) + 1)])], axis=1)
    # b ion
    # Nterm_combo (without N-term symbols) is used so the symbols do not inflate the length.
    Nterm_combo_label = Nterm_combo.apply(lambda x: ['b' + str(len(i)) for i in x])
    df_pep_label = pd.concat([df_pep_label, pd.DataFrame(Nterm_combo_label.values.tolist(), 
                                                columns = ['seq_b_' + str(i) for i in range(1, len(max(Nterm_combo, key = len)) + 1)])], axis=1)
    # a ion
    Nterm_combo_label = Nterm_combo.apply(lambda x: ['a' + str(len(i)) for i in x])
    df_pep_label = pd.concat([df_pep_label, pd.DataFrame(Nterm_combo_label.values.tolist(), 
                                                columns = ['seq_a_' + str(i) for i in range(1, len(max(Nterm_combo, key = len)) + 1)])], axis=1)

if annot_zc == True:
    # z ion
    # NaN placeholders (excluded fragments) stay NaN so labels remain aligned with the sequences.
    Cterm_combo_label = Cterm_combo.apply(lambda x: ['z' + str(len(i)) if pd.notnull(i) else np.nan for i in x])
    df_pep_label = pd.concat([df_pep_label, pd.DataFrame(Cterm_combo_label.values.tolist(), 
                                                columns = ['seq_z_' + str(i) for i in range(1, len(max(Cterm_combo, key = len)) + 1)])], axis=1)
    # c ion
    Nterm_combo_label = Nterm_combo.apply(lambda x: ['c' + str(len(i)) if pd.notnull(i) else np.nan for i in x])
    df_pep_label = pd.concat([df_pep_label, pd.DataFrame(Nterm_combo_label.values.tolist(), 
                                                columns = ['seq_c_' + str(i) for i in range(1, len(max(Nterm_combo, key = len)) + 1)])], axis=1)

# Internal ion
# Internal ions are annotated with their own sequence string.
if annot_INT == True:
    df_pep_label = pd.concat([df_pep_label, pd.DataFrame(INT_combo.values.tolist(), 
                                                           columns = ['seq_INT_' + str(i) for i in range(1, len(max(INT_combo, key=len)) + 1)])], axis=1)

# Precursor ion
if annot_precur == True:
    # Label only the rows that actually have a precursor m/z. The previous test `!= np.nan` is
    # True for every value (NaN compares unequal to everything), so it selected all rows.
    has_precursor = df_pep_label['mz_Precursor'].notna()
    # Cast to object first so that writing text into the numeric column is explicit
    df_pep_label['mz_Precursor'] = df_pep_label['mz_Precursor'].astype(object)
    df_pep_label.loc[has_precursor, 'mz_Precursor'] = 'Precursor'

## 2.2 Singly charged ion

In [None]:
# Define functions to calculate m/z for each ion type
def _residue_mass_sum(x):
    # Sum of the `aa` masses of the characters of x. The ',' separators added by join are not
    # keys of `aa` and therefore contribute 0, as do any other unknown characters.
    total = 0
    for symbol in ','.join(x):
        total = total + aa.get(symbol, 0)
    return total

def y_mz(x):
    """Singly charged y-ion m/z of fragment string x: residues + 2*proton + OH."""
    return _residue_mass_sum(x) + 2*(proton) + OH

def b_mz(x):
    """Singly charged b-ion m/z of fragment string x: residues + proton."""
    return _residue_mass_sum(x) + proton

def a_mz(x):
    """Singly charged a-ion m/z of fragment string x: residues + proton - CO."""
    return _residue_mass_sum(x) + proton - CO

def z_mz(x):
    """Singly charged z-ion m/z of fragment string x: y-ion value - NH3 + proton."""
    return _residue_mass_sum(x) + 2*(proton) + OH - NH3 + proton

def c_mz(x):
    """Singly charged c-ion m/z of fragment string x: residues + proton + NH3."""
    return _residue_mass_sum(x) + proton + NH3

def INT_mz(x):
    """Singly charged internal-ion m/z of fragment string x: residues + proton."""
    return _residue_mass_sum(x) + proton

# Dispatch table: ion type name -> m/z function
mz_dict = dict(
    y   = y_mz,
    b   = b_mz,
    a   = a_mz,
    z   = z_mz,
    c   = c_mz,
    INT = INT_mz
)

In [None]:
# Calculate m/z for each ion type
# For every enabled sequence/internal ion type, the matching 'seq_<ion>_*' columns are converted
# to m/z with the function from mz_dict and appended as 'mz_<ion>_*' columns (NaN stays NaN).
for ion in ion_type:
    if ion in ['y', 'b', 'a', 'z', 'c', 'INT'] and annot_dict.get(ion) == True:
        df_temp = df_pep_mz.filter(like = 'seq_%s' % (ion)).apply(lambda x: [mz_dict.get(ion)(i) if pd.notnull(i) else np.nan for i in x])
        df_temp.rename(columns = lambda x: x.replace('seq', 'mz'), inplace = True)
        df_pep_mz = pd.concat([df_pep_mz, df_temp], axis = 1)

# Update annotation df
# The label columns are duplicated under the same 'mz_*' names, so df_pep_label mirrors the
# column layout of df_pep_mz ('seq_Peptide' is excluded because it is not an ion column).
df_label_temp = df_pep_label.filter(like='seq').drop(columns = 'seq_Peptide')
df_label_temp.rename(columns = lambda x: x.replace('seq', 'mz'), inplace = True)
df_pep_label = pd.concat([df_pep_label, df_label_temp], axis = 1)
df_pep_label.fillna(value = np.nan, inplace = True)

# Replace None to NaN
df_pep_mz.fillna(value = np.nan, inplace = True)
df_pep_label.fillna(value = np.nan, inplace = True)

## 2.3 Neutral loss variant

In [None]:
# Define a function for generating NL combinations
def NL_combo(x, max_NL):
    """List every combination of 1..max_NL neutral losses together with its total mass.

    x is a dict mapping a loss name to a dict with a 'mass' entry. Combinations are multisets
    (order does not matter, repeats allowed). Returns (labels, masses): `labels` are the sorted
    combinations joined with '-', `masses` the summed 'mass' values in the same order.
    """
    loss_names = sorted(x.keys())

    # Collect the multisets of every allowed size; each tuple is already internally sorted
    multisets = set()
    for size in range(1, max_NL + 1):
        multisets.update(itertools.combinations_with_replacement(loss_names, size))

    l = ['-'.join(names) for names in sorted(multisets)]
    L = [sum(x.get(part)['mass'] for part in label.split('-')) for label in l]

    return l, L

In [None]:
# Define a function for checking whether NL condition is met for each ion
def NL_condition(NL_AA_list, sequence):
    """Check whether `sequence` can supply a distinct residue for every requested neutral loss.

    NL_AA_list holds one list of permitted residue symbols per neutral loss; `sequence` is the
    fragment string. Returns True when every loss can be assigned its own residue, otherwise
    None (callers compare the result with `== True`).

    Losses tied to a single residue are assigned first. The remaining losses are then matched
    one by one against the residues still available. Counting the pooled residues, as was done
    before, accepted e.g. NH3 + H2O on a fragment holding two S and no NH3-losing residue.
    """
    available = Counter(sequence)

    # If one-to-one NL type exists, take care of it first
    for NL_group in NL_AA_list:
        if len(NL_group) == 1:
            residue = NL_group[0]
            if residue not in sequence:
                return
            if len(NL_AA_list) == 1:
                return True
            # Several losses requested: this one consumes one copy of its residue.
            # NOTE(review): the '' placeholder of the CO loss is never counted, so CO combined
            # with any other loss ends here with None - confirm that this is intended.
            if available[residue] < 1:
                return
            available[residue] -= 1

    # Check if remaining NL types is feasible in each ion
    shared_groups = [NL_group for NL_group in NL_AA_list if len(NL_group) > 1]

    # If there are only one-to-one NL types, return True
    if len(shared_groups) == 0:
        return True

    # In case of one-to-multiple NL types: give each loss its own residue (backtracking search;
    # the number of losses is bounded by max_NL, so the search stays tiny)
    def _assignable(position):
        if position == len(shared_groups):
            return True
        for residue in set(shared_groups[position]):
            if available[residue] > 0:
                available[residue] -= 1
                feasible = _assignable(position + 1)
                available[residue] += 1
                if feasible:
                    return True
        return False

    if _assignable(0):
        return True

In [None]:
%%time
# Create NL variant m/z dataframes and append to the original dataframe
# For every enabled ion type and every neutral-loss combination, a copy of the ion's m/z columns
# is produced with the loss mass subtracted (NaN where the fragment cannot undergo the loss) and
# appended under '<column>-<NL>'. The annotation dataframe receives matching '<label>-<NL>' columns.
df_pep_mz_temp    = df_pep_mz.copy()
df_pep_label_temp = df_pep_label.copy()

for ion in ['Precur', 'y', 'b', 'a', 'z', 'c', 'INT']:
    if annot_dict.get(ion) == True:

        NL_list = []
        if ion == 'Precur':
            NL_list  = NL_combo(NL_seq, max_NL)
            df_seq   = df_pep_mz_temp[['seq_Peptide']]
            df_mz    = df_pep_mz_temp[['mz_Precursor']]
            df_label = df_pep_label_temp[['mz_Precursor']]
        elif ion in ['y', 'b', 'a', 'z', 'c']:
            NL_list  = NL_combo(NL_seq, max_NL)
            df_seq   = df_pep_mz_temp[[col for col in df_pep_mz_temp.columns if 'seq_%s' %(ion) in col]]
            # One entry per column, placeholders included. Dropping the non-string entries here
            # would shift the later fragments to the left whenever a placeholder sits in the
            # middle of a row (excluded z/c ions) and misalign the mask with the m/z columns.
            df_seq_temp = pd.Series(df_seq.values.tolist())
            df_mz    = df_pep_mz_temp[[col for col in df_pep_mz_temp.columns if 'mz_%s'  %(ion) in col]]
            df_label = df_pep_label_temp[[col for col in df_pep_label_temp.columns if 'mz_%s' %(ion) in col]]

        elif ion == 'INT':
            NL_list  = NL_combo(NL_INT, max_NL)
            df_seq   = df_pep_mz_temp[[col for col in df_pep_mz_temp.columns if 'seq_%s' %(ion) in col]]
            df_seq_temp = pd.Series(df_seq.values.tolist())
            df_mz    = df_pep_mz_temp[[col for col in df_pep_mz_temp.columns if 'mz_%s'  %(ion) in col]]
            df_label = df_pep_label_temp[[col for col in df_pep_mz_temp.columns if 'mz_%s' %(ion) in col]]

        # For each NL type get NL variant m/z
        if NL_list != []: # if the ion types are to be included
            # NL_list is (labels, masses) in matching order
            for NL, NL_mz in zip(NL_list[0], NL_list[1]):

                # Residue sets required by each individual loss of this combination
                NL_AA_list = []
                for sub_NL in NL.split('-'):

                    if ion in ['Precur', 'y', 'b', 'a', 'z', 'c']:
                        temp1 = list(NL_seq.get(sub_NL)['AA'])
                    elif ion == 'INT':
                        temp1 = list(NL_INT.get(sub_NL)['AA'])
                    NL_AA_list.append(temp1)

                # m/z dataframe
                if ion == 'Precur':
                    df_bool     = df_seq.applymap(lambda x: 1 if NL_condition(NL_AA_list, str(x)) == True else np.nan)
                    df_mz_temp  = pd.DataFrame(df_bool.values*df_mz.values, columns = df_mz.columns, index = df_mz.index)
                    df_mz_temp  = df_mz_temp.replace({0:np.nan})
                    df_mz_temp['mz_Precursor'] = df_mz_temp.mz_Precursor*df_pep_mz_temp.Charge # Convert m/z to mass
                    df_mz_temp  = df_mz_temp - NL_mz
                    df_mz_temp['mz_Precursor'] = df_mz_temp.mz_Precursor/df_pep_mz_temp.Charge # Convert mass to m/z
                    df_mz_temp.rename(columns = lambda x: x + '-%s' % (NL), inplace = True)
                    df_pep_mz_temp = pd.concat([df_pep_mz_temp, df_mz_temp], axis = 1)
                else:
                    # 1 where the fragment exists and can undergo the loss, NaN otherwise
                    df_bool     = pd.DataFrame(df_seq_temp.apply(lambda x: [1 if isinstance(i, str) and NL_condition(NL_AA_list, i) == True else np.nan for i in x]).tolist())
                    df_mz_temp  = pd.DataFrame(df_bool.values*df_mz.values, columns = df_mz.columns, index = df_mz.index)
                    df_mz_temp  = df_mz_temp - NL_mz
                    df_mz_temp.rename(columns = lambda x: x + '-%s' % (NL), inplace = True)
                    df_pep_mz_temp = pd.concat([df_pep_mz_temp, df_mz_temp], axis = 1)

                # annotation dataframe
                # int * str keeps the label where the mask is 1 and yields '' where it is 0;
                # the empty strings are then turned into NaN before the '-<NL>' suffix is added.
                df_label_temp = (df_bool==1).astype(int) * df_label.values
                df_label_temp = df_label_temp.replace(r'^\s*$', np.nan, regex=True)
                df_label_temp.columns = df_label.columns
                df_label_temp = pd.DataFrame(np.where(df_label_temp.notnull(), df_label_temp.astype(str) + '-' + NL, np.nan), columns = df_label_temp.columns)
                df_label_temp.rename(columns = lambda x: x + '-%s' % (NL), inplace = True)
                df_pep_label_temp = pd.concat([df_pep_label_temp, df_label_temp], axis = 1)

In [None]:
# Copy temporary dataframe back into original dataframe
df_pep_mz    = df_pep_mz_temp.copy()
df_pep_label = df_pep_label_temp.copy()

## 2.4 Multiply charged ion

In [None]:
# Define a function for checking whether multiple charge state condition is met for each ion
def multi_charge_condition(charge, sequence):
    """Return 1 when `sequence` may carry `charge` charges, otherwise None.

    The fragment qualifies when its number of 'R', 'K' and 'H' residues plus one is at least
    `charge`. A null `sequence` yields None.
    """
    if not pd.notnull(sequence):
        return None

    # Count the number of basic residues
    basic_sites = sum(sequence.count(residue) for residue in ('R', 'K', 'H'))

    # Condition for multiple charge state
    if basic_sites + 1 >= charge:
        return 1
    return None

In [None]:
%%time
# Create multiple charge state m/z dataframe and append to the original dataframe
# max_charge is either an integer or the string 'max'. The string has to be tested first:
# comparing 'max' with an integer raises a TypeError.
if max_charge == 'max' or max_charge > 1:
    # Create temporary m/z and annotation dataframe
    df_seq   = df_pep_mz[[col for col in df_pep_mz.columns       if any(['seq_y' in col, 'seq_b' in col, 'seq_a' in col, 'seq_z' in col, 'seq_c' in col]) == True]]
    df_mz    = df_pep_mz[[col for col in df_pep_mz.columns       if any(['mz_y' in col, 'mz_b' in col, 'mz_a' in col, 'mz_z' in col, 'mz_c' in col]) == True]]
    df_label = df_pep_label[[col for col in df_pep_label.columns if any(['mz_y' in col, 'mz_b' in col, 'mz_a' in col, 'mz_z' in col, 'mz_c' in col]) == True]]

    # Create temporary m/z and annotation dataframe
    df_pep_mz_temp    = df_pep_mz.copy()
    df_pep_label_temp = df_pep_label.copy()

    # For each charge state get m/z
    if max_charge == 'max':
        charge_boundary = int(df_pep_mz['Charge'].max()) # plain int so that range() accepts it
    else:
        charge_boundary = max_charge

    for charge in range(2, charge_boundary + 1): # Generate multiply charged sequence ions (from +2 to predefined max. charge)

        # m/z dataframe
        # df_bool1 is 1 where a fragment may carry `charge` charges; rows whose precursor charge
        # is below `charge` are zeroed completely.
        df_charge = df_pep_mz['Charge'].copy()
        df_charge.loc[df_charge < charge] = 0
        df_bool1 = df_seq.apply(lambda x: [multi_charge_condition(charge, ion) if isinstance(ion, str) else 0 for ion in x])
        df_bool1 = pd.concat([df_charge, df_bool1], axis = 1)
        df_bool1.loc[df_bool1['Charge'] == 0, df_bool1.columns] = 0
        df_bool1 = df_bool1.drop(columns = 'Charge')

        # Every m/z column (neutral-loss variants included) takes the mask of its parent
        # 'seq_*' column, found by cutting the column name at the first '-'.
        df_mz_temp  = df_mz.copy()
        for column in df_mz_temp.columns:
            df_mz_temp[column] = df_bool1['%s' % (column.split('-')[0].replace('mz', 'seq'))]
        df_mz_temp  = df_mz_temp.replace({0:np.nan})
        df_mz_temp  = pd.DataFrame(df_mz_temp.values*df_mz.values, columns = df_mz_temp.columns, index = df_mz_temp.index)
        # Singly charged m/z -> m/z at `charge`: add (charge - 1) protons, divide by charge
        df_mz_temp  = (df_mz_temp + proton*(charge - 1))/charge
        df_mz_temp.rename(columns = lambda x: x + '%s' % ("+"*charge ), inplace = True)
        df_pep_mz_temp = pd.concat([df_pep_mz_temp, df_mz_temp], axis = 1)

        # annotation dataframe
        # Labels are kept wherever the m/z exists and get one '+' per charge appended.
        df_bool2 = df_mz_temp.notnull().astype(int)  
        df_label_temp = pd.DataFrame(df_bool2.values*df_label.values,
                                     columns = df_label.columns, index = df_label.index)
        df_label_temp = df_label_temp.replace(r'^\s*$', np.nan, regex=True)
        df_label_temp = df_label_temp.replace({0:np.nan})
        df_label_temp = pd.DataFrame(np.where(df_label_temp.notnull(), df_label_temp.astype(str) + '+'*charge, np.nan), columns = df_label_temp.columns)
        df_label_temp.columns = df_label_temp.columns + '+'*charge
        df_pep_label_temp = pd.concat([df_pep_label_temp, df_label_temp], axis = 1)    

    # Copy temporary dataframe back into original dataframe
    df_pep_mz    = df_pep_mz_temp.copy()
    df_pep_label = df_pep_label_temp.copy()

## 2.5 Immonium ion

In [None]:
# Immonium ions (IM, IM-NH3)
# One 'mz_IM_<n>' column per residue position (residue mass + proton - CO) plus an
# 'mz_IM-NH3_<n>' column with NH3 subtracted; the annotation dataframe gets matching labels.
# Nterm_mod comes from the ion sequence cell (section 2.1).
if annot_IM == True:
    # m/z
    df_IM_mz = pd.DataFrame([[aa.get(AA) + proton - CO for AA in seq] for seq in df_pep_mz.seq_Peptide.tolist()])
    # The first residue is blanked for peptides that carry an N-terminal modification
    df_IM_mz[0] = pd.concat([Nterm_mod, df_IM_mz[0]], axis = 1).apply(lambda x: np.nan if len(x['mod_Peptide']) > 0 else x[0], axis = 1)
    df_IM_mz.columns = ['mz_IM_%s' % (n+1) for n in range(df_pep_mz['Pep_length'].max())]    
    df_IM_NH3_mz = df_IM_mz - NH3
    df_IM_NH3_mz.columns = [col.replace('IM', 'IM-NH3') for col in df_IM_mz.columns]

    df_pep_mz = df_pep_mz.join([df_IM_mz, df_IM_NH3_mz])

    # annotation
    # Labels have the form 'IM(<residue symbol>)' and 'IM(<residue symbol>)-NH3'
    df_IM_label = pd.DataFrame([['IM(%s)' % (AA) for AA in seq] for seq in df_pep_label.seq_Peptide.tolist()])
    df_IM_label[0] = pd.concat([Nterm_mod, df_IM_label[0]], axis = 1).apply(lambda x: np.nan if len(x['mod_Peptide']) > 0 else x[0], axis = 1)
    df_IM_label.columns = ['mz_IM_%s' % (n+1) for n in range( df_pep_label['Pep_length'].max())]   
    df_IM_NH3_label = pd.DataFrame(np.where((df_IM_label.notnull()), df_IM_label.astype(str) + '-NH3', np.nan))
    df_IM_NH3_label.columns = [col.replace('IM', 'IM-NH3') for col in df_IM_label.columns]

    df_pep_label = df_pep_label.join([df_IM_label, df_IM_NH3_label])

In [None]:
# Drop empty columns
# Columns that are NaN for every peptide are removed from both dataframes independently.
df_pep_mz    = df_pep_mz.dropna(how = 'all', axis = 1)
df_pep_label = df_pep_label.dropna(how = 'all', axis = 1)

# 3. Retain matching experimental peaks

In [None]:
# Experimental spectrum
# df_exp_temp2: one row per spectrum, one column per peak, each cell an [m/z, intensity] list
# (missing cells become NaN). df_exp_mz: the per-spectrum list of experimental m/z values.
df_exp_temp  = df_exp.copy()
df_exp_temp2 = pd.DataFrame(df_exp_temp['mz_intensity'].values.tolist())
df_exp_temp2.fillna(value = np.nan, inplace = True)
df_exp_mz = df_exp_temp['m/z array'].copy()

# Theoretical spectrum
# Both selections take every column whose name contains "mz"; they are compared by position.
df_theo_mz    = df_pep_mz.filter(like = "mz") # Theoretical ion m/z
df_theo_label = df_pep_label.filter(like = "mz") # Theoretical ion annotation

In [None]:
%%time
# For each PSM, compare experimental vs. theoretical spectrum and retain only matching ions
# Per spectrum r the loop appends:
#   appended_order[r] - 0/1 flag per experimental peak: kept by the coarse nominal-mass pre-filter
#   appended_mz[r]    - 0/1 flag per pre-filtered peak: matched by a theoretical ion within ms2_ppm
#   appended_label[r] - one list of annotations per matched peak, in the peak order of the spectrum
appended_mz    = []
appended_label = []
appended_order = []

for r in range(len(df_exp_mz)):
    # Retain only annotated experimental peaks
    exp_array = np.array(df_exp_mz[r])
    exp_array = exp_array[~np.isnan(exp_array)] # Get non-NaN experimental peaks
    theo_array = np.array(df_theo_mz.loc[r])
    theo_array = theo_array[~np.isnan(theo_array)] # Get non-NaN theoretical peaks
    
    exp_array_int  = exp_array.astype(int)  # Get nominal experimental m/z
    theo_array_int = theo_array.astype(int) # Get nominal theoretical m/z
    # Coarse pre-filter on nominal m/z that only shrinks the pairwise comparison below.
    # Neighbouring nominal values are accepted too, otherwise a pair that straddles an integer
    # boundary (e.g. 499.999 vs 500.001) would be discarded although it lies within tolerance.
    theo_nominal = np.concatenate((theo_array_int - 1, theo_array_int, theo_array_int + 1))
    exp_nominal  = np.concatenate((exp_array_int - 1, exp_array_int, exp_array_int + 1))
    exp_array_overlap = np.isin(exp_array_int, theo_nominal) * exp_array # Check only overlapping nominal m/z
    exp_array = exp_array_overlap[exp_array_overlap > 0] # Retain only overlapping experimental m/z
    theo_array_overlap = np.isin(theo_array_int, exp_nominal) * theo_array # Check only overlapping nominal m/z
    theo_array = theo_array_overlap[theo_array_overlap > 0] # Retain only overlapping theoretical m/z
    exp_order = (exp_array_overlap > 0).astype(int) # Keep original array order
    appended_order.append(exp_order)
    
    # Pairwise comparison: rows = theoretical ions, columns = experimental peaks
    array_exp_mz  = np.tile(exp_array, (theo_array.shape[0], 1)) # Experimental peak array
    array_theo_mz = np.transpose(np.tile(theo_array, (exp_array.shape[0], 1))) # Theoretical peak array
    array_eCheck  = abs(array_theo_mz - array_exp_mz) < (array_theo_mz*ms2_ppm * 1e-6) # Check for m/z match within mass tolerance
    array_match   = 1*(array_eCheck.sum(axis = 0) > 0)
    appended_mz.append(array_match)
    
    # Get annotation for the retained experimental peaks
    # The non-NaN labels of row r are assumed to line up one-to-one with the non-NaN theoretical
    # m/z values of the same row, so the same pre-filter mask applies to both.
    theo_label = np.array(df_theo_label.loc[r].dropna())
    theo_label = theo_label[theo_array_overlap > 0]
    
    # Collect the annotations per matched experimental peak straight from its column of the
    # match matrix. Summing the matched m/z per theoretical ion, as was done before, produced a
    # meaningless m/z whenever one ion matched several peaks and could leave the number of
    # label groups different from the number of matched peaks.
    peak_labels = [list(set(theo_label[array_eCheck[:, j]].tolist())) for j in np.flatnonzero(array_match)]
    appended_label.append(np.array(pd.Series(peak_labels, dtype = object)))
    
    print(str(r + 1) + str('/%s spectra processed') %(len(df_exp_mz)), end = "\r")

In [None]:
# Remove non-overlapped experimental peaks
# Each cell of df_exp_temp2 is an [m/z, intensity] list; multiplying a list by 0 gives [] and by
# 1 leaves it unchanged. Emptied cells become NaN and every row is then packed to the left.
all_data_order = (pd.DataFrame(appended_order) > 0).astype(int)
df_exp_temp2_order = (df_exp_temp2*all_data_order).applymap(lambda x: np.nan if x == [] else x) 
df_exp_temp2_order = df_exp_temp2_order.apply(lambda row: pd.Series(row.dropna().values), axis = 1)

# Retain only matched experimental peaks
# Same list-times-flag trick, now with the ppm-match flags of the pre-filtered peaks.
all_data_mz = (pd.DataFrame(appended_mz) > 0).astype(int)
df_obs = pd.DataFrame(df_exp_temp2_order.values*all_data_mz.values, columns = df_exp_temp2_order.columns, index = df_exp_temp2_order.index)
df_obs = df_obs.applymap(lambda x: np.nan if x == [] else x)
df_obs = df_obs.apply(lambda row: pd.Series(row.dropna().values), axis = 1)

# Merge m/z-intensity-annotation
# list + list concatenates, so each cell becomes [m/z, intensity, label, label, ...]. The n-th
# matched peak of a spectrum is paired with the n-th label list by position.
all_data_label = pd.DataFrame([df for df in appended_label])
all_data_label = all_data_label.reset_index(drop = True)
all_data_label.columns = range(all_data_label.shape[1])
df_mz_label = (df_obs + all_data_label).dropna(how = 'all', axis = 1)
df_mz_label.columns = ['peak_%s' % (i+1) for i in range(len(df_mz_label.columns))] # Rename columns

In [None]:
# Append peptide info columns
# The spectrum title is attached by row position, then the peptide columns are joined on 'Title'.
df_mz_label = pd.concat([df_exp_temp['Title'], df_mz_label], axis = 1)
df_mz_label = df_pep_mz[['Title', 'Peptide', 'mod_Peptide', 'Pep_length', 'Charge', 'mz_Precursor', 'Cit_Count']].merge(df_mz_label, on = 'Title')

# 4. Cit diagnostic ion analysis

In [None]:
# Make a new dataframe
# Working copy for the diagnostic ion analysis; df_mz_label itself is left untouched.
df_mz_label_uniq = df_mz_label.copy()

In [None]:
# Define a function to filter out secondary ions w/o primary ions (for precursor & sequence ion only)
def Secondary_filter(Primary_ion_list, Secondary_ion_list):
    """Drop precursor/sequence secondary ions whose primary ion was not observed.

    The parent of a secondary ion is the part of its label before the first '-'.
    The parent's ion type is that parent with all digits removed. A secondary ion
    whose parent type is one of 'Precursor', 'y', 'b', 'a', 'z', 'c' is kept only
    if the parent itself appears in `Primary_ion_list`; any other secondary ion
    is always kept.

    Parameters
    ----------
    Primary_ion_list : list of str
        Labels of the primary ions.
    Secondary_ion_list : list of str
        Labels of the secondary ions to filter.

    Returns
    -------
    list of str
        The retained secondary-ion labels, in their original order.
    """
    precur_seq_types = ['Precursor', 'y', 'b', 'a', 'z', 'c']

    retained = []
    for sec_label in Secondary_ion_list:
        parent = sec_label.split('-')[0]
        parent_type = re.sub(r'[0-9]+', '', parent)
        # Precursor/sequence secondary ion without its primary ion -> discard
        if parent_type in precur_seq_types and parent not in Primary_ion_list:
            continue
        retained.append(sec_label)

    return retained

In [None]:
# Filter out secondary ions without primary ions (precursor & sequence ion only)
# Per peak cell keep everything from the third element on (the annotation labels); non-list cells become NaN.
# NOTE(review): the first two elements are presumably m/z and intensity (x[1] is read as intensity further down) - confirm.
label = df_mz_label_uniq.filter(like = 'peak').applymap(lambda x: x[2:] if isinstance(x, list) else np.nan)
appended_list = []

# Flatten the labels of all peaks of a row into one list per row.
# label.loc[r, :] is a label-based lookup, so this relies on the frame having the default 0..n-1 row index.
for r in range(len(label)):
    list_temp = label.loc[r, :].values.tolist()
    list_temp = [x for x in list_temp if isinstance(x, list)]
    list_temp = [item for sublist in list_temp for item in sublist]
    appended_list.append(list_temp)
    
# Primary ions are the labels without '-', secondary ions the labels containing '-'
Primary_ion   = pd.Series(appended_list).apply(lambda x: [ion for ion in x if '-' not in ion])
Secondary_ion = pd.Series(appended_list).apply(lambda x: [ion for ion in x if '-' in ion])

# Append a total annotation column
# Per row: primary ions followed by the secondary ions that pass Secondary_filter
Total_ion = Primary_ion + pd.DataFrame([Primary_ion, Secondary_ion]).T.apply(lambda x: Secondary_filter(x[0], x[1]), axis = 1)

In [None]:
# Define a function to remove annotations not in the filtered total annotation list
def Annot_filter(peaks, Total_label):
    """Blank out peaks that carry none of the retained annotations.

    Parameters
    ----------
    peaks : list
        One entry per peak; a usable entry is a list whose elements from
        index 2 onwards are annotation labels. Any other entry is treated
        as "no peak".
    Total_label : iterable of str
        Annotation labels retained for the spectrum.

    Returns
    -------
    list
        Same length as `peaks`. A peak list is passed through unchanged when at
        least one of its labels is in `Total_label`; otherwise the position
        holds np.nan.
    """
    filtered = []
    for peak in peaks:
        if isinstance(peak, list) and len(set(peak[2:]).intersection(Total_label)) > 0:
            filtered.append(peak)
        else:
            filtered.append(np.nan)
    return filtered

In [None]:
# Filtered peak annotation
# Normalise every peak cell: keep lists, turn anything else into NaN
all_peaks = df_mz_label_uniq.filter(like = 'peak').applymap(lambda x: x if isinstance(x, list) else np.nan)
appended_list = []

# Collect the peak cells row by row. NaN placeholders are kept (not dropped) so that list positions
# still correspond to the peak_* columns when the frame is rebuilt below.
# all_peaks.loc[r, :] is label-based and therefore relies on the default 0..n-1 row index.
for r in range(len(all_peaks)):
    list_temp = all_peaks.loc[r, :].values.tolist()
    list_temp = [x if isinstance(x, list) else np.nan for x in list_temp]
    appended_list.append(list_temp)
    
# Update original dataframe
# Keep the non-peak columns and join (on the row index) a frame of the peaks after Annot_filter,
# which blanks every peak that has no label in that row's Total_ion list. The peak_* column names are reused.
df_mz_label_uniq = df_mz_label_uniq[[col for col in df_mz_label_uniq.columns if 'peak' not in col]].join(pd.DataFrame(pd.DataFrame([appended_list, Total_ion]).T.apply(lambda x: Annot_filter(x[0], x[1]), axis = 1).tolist(), columns = df_mz_label_uniq.filter(like = 'peak').columns))


In [None]:
# Define a function to retain only unique 43 NL (and the label helper it relies on)
def _is_seq_ion_label(label):
    """Return True when `label` matches the sequence-ion pattern used in this notebook.

    The part of the label before the first '-' must reduce to a single character
    once its digits are removed, and the label must contain one of the letters
    y, b, a, z or c.
    """
    ion_part = re.sub(r'[0-9]+', '', label.split('-')[0])
    return len(ion_part) == 1 and any(ion in label for ion in ('y', 'b', 'a', 'z', 'c'))


def uniq_43(labels):
    """Keep the 43 NL annotations of one peak only when they are not contested.

    Parameters
    ----------
    labels : list of str
        All annotation labels assigned to a single experimental peak.

    Returns
    -------
    list of str or float
        The 43 NL labels to retain, or np.nan when nothing is retained.

        * No label contains '43' (including an empty list): np.nan.
        * Every label contains '43': the precursor labels if there are any,
          otherwise the sequence-ion labels if there are any, otherwise all
          (de-duplicated) labels.
        * Mixed: np.nan if any non-43 label is a precursor, sequence-ion or
          immonium ('IM(') annotation; otherwise the de-duplicated 43 labels.

    De-duplicated results keep the order in which the labels were passed in.
    """
    # NOTE(review): '43' is matched as a plain substring, so a label that contains the digits 43
    # for another reason (e.g. as an ion index) would also count as a 43 NL - confirm the label format.
    labels_43     = [label for label in labels if '43' in label]
    labels_43_non = [label for label in labels if '43' not in label]

    # If there is no 43 NL annotation, return NaN
    if len(labels_43) == 0:
        return np.nan

    # If all labels are 43 NL annotations: precursor > sequence ion > everything else
    if len(labels_43_non) == 0:
        Precur_43 = [label for label in labels if 'Precur' in label]
        if len(Precur_43) > 0:
            return Precur_43
        seq_43 = [label for label in labels if _is_seq_ion_label(label)]
        if len(seq_43) > 0:
            return seq_43
        # dict.fromkeys de-duplicates while preserving order (a set would make the output order arbitrary)
        return list(dict.fromkeys(labels))

    # If non-43 NL is of precursor or sequence ion type, remove 43 NL.
    # The precursor test must stand on its own: a precursor label never satisfies the
    # single-letter sequence-ion pattern, so it cannot be folded into that check.
    if any('Precur' in label or _is_seq_ion_label(label) for label in labels_43_non):
        return np.nan

    # If non-43 NL is an immonium ion, remove 43 NL
    if any('IM(' in label for label in labels_43_non):
        return np.nan

    # Otherwise, retain 43 NL
    return list(dict.fromkeys(labels_43))

In [None]:
# Define a function to retain only unique INT
def uniq_INT(labels):
    """Keep the internal-ion (INT) annotations of one peak only when nothing else competes.

    A label counts as INT here when it contains 'r', contains neither 'IM(' nor
    'Precur', and its part before the first '-' is 2 or 3 characters long.

    Parameters
    ----------
    labels : list of str
        All annotation labels assigned to a single experimental peak.

    Returns
    -------
    list of str or float
        The INT labels (in input order) when there is at least one and every
        label of the peak is an INT label; np.nan otherwise.
    """
    def _is_INT(label):
        if 'r' not in label or 'IM(' in label or 'Precur' in label:
            return False
        return len(label.split('-')[0]) in (2, 3)

    INT = [label for label in labels if _is_INT(label)]

    # No di/tripeptide annotation at all
    if not INT:
        return np.nan

    # At least one non-INT annotation competes for this peak
    if set(labels) - set(INT):
        return np.nan

    return INT

In [None]:
# Retain unique 43 NL and INT
# For every peak cell, run uniq_43 / uniq_INT on its labels (x[2:]); non-list cells become NaN.
# Each row is then reduced to the list results only, giving a Series with one entry per row in which
# every element is the label list retained for one peak.
unique_NL  = pd.Series(df_mz_label_uniq.filter(like = 'peak').applymap(lambda x: uniq_43(x[2:])  if isinstance(x, list) else np.nan).values.tolist()).apply(lambda x: [label for label in x if isinstance(label, list)])
unique_INT = pd.Series(df_mz_label_uniq.filter(like = 'peak').applymap(lambda x: uniq_INT(x[2:]) if isinstance(x, list) else np.nan).values.tolist()).apply(lambda x: [label for label in x if isinstance(label, list)])


In [None]:
## Unique Cit DI annotations
# Get unique NL annotations
# Flatten the per-peak label lists into one list per row, then store comma-joined strings
# (an empty string when a row has no label of that kind).
Total_NL_label = unique_NL.apply(lambda x: [label for sublist in x for label in sublist])
df_mz_label_uniq['Total_NL_label'] = Total_NL_label.apply(lambda x: ','.join(x))
# Precursor NL: labels containing 'Precur'
df_mz_label_uniq['precNL_label']   = Total_NL_label.apply(lambda x: ','.join([label for label in x if 'Precur' in label]))
# Sequence-ion NL: labels containing y/b/a/z/c whose part before the first '-' is a single character once digits are removed
df_mz_label_uniq['seqNL_label']    = Total_NL_label.apply(lambda x: ','.join([label for label in x if any(['y' in label, 'b' in label, 'a' in label, 'z' in label, 'c' in label]) == True and len(re.sub(r'[0-9]+', '', label.split('-')[0])) == 1]))
# Internal-ion NL: labels containing 'r' but not 'Precur'
df_mz_label_uniq['intNL_label']    = Total_NL_label.apply(lambda x: ','.join([label for label in x if all(['r' in label, 'Precur' not in label]) == True]))

# Get unique INT annotations
Total_INT_label = unique_INT.apply(lambda x: [label for sublist in x for label in sublist])
df_mz_label_uniq['Total_INT_label']  = Total_INT_label.apply(lambda x: ','.join(x))
# Di-/tripeptide: the part of the label before the first '-' is 2 / 3 characters long
df_mz_label_uniq['Dipeptide_label']  = Total_INT_label.apply(lambda x: ','.join([label for label in x if len(label.split('-')[0]) == 2]))
df_mz_label_uniq['Tripeptide_label'] = Total_INT_label.apply(lambda x: ','.join([label for label in x if len(label.split('-')[0]) == 3]))

In [None]:
## IM(Cit)-NH3 filtering using intensity ratio with IM(R)-NH3
# Get IM(Cit)-NH3 intensity
# Per peak cell take x[1] when 'IM(r)-NH3' is among its labels; per row keep the first such value (NaN if none)
IMCit_NH3 = pd.Series(df_mz_label_uniq.filter(like = 'peak').applymap(lambda x: x[1] if isinstance(x, list) and 'IM(r)-NH3' in x[2:] else np.nan).values.tolist())
IMCit_NH3 = IMCit_NH3.apply(lambda x: [intensity for intensity in x if pd.notnull(intensity)]).apply(lambda x: x[0] if len(x) > 0 else np.nan)
# 1 when an IM(Cit)-NH3 peak with intensity > 0 exists in the row, else 0 (NaN compares as False)
IMCit_NH3_presence = (IMCit_NH3 > 0).astype(int)

# Get IM(R)-NH3 intensity
IMR_NH3 = pd.Series(df_mz_label_uniq.filter(like = 'peak').applymap(lambda x: x[1] if isinstance(x, list) and 'IM(R)-NH3' in x[2:] else np.nan).values.tolist())
IMR_NH3 = IMR_NH3.apply(lambda x: [intensity for intensity in x if pd.notnull(intensity)]).apply(lambda x: x[0] if len(x) > 0 else np.nan)

# Discard IM(Cit)-NH3 when IM(R)-NH3 is strictly more intense (ratio IMR/IMCit > 1).
# Equal intensities are retained, and so are rows without an IM(R)-NH3 peak (NaN ratio compares as False).
IM_filter = ((IMR_NH3/IMCit_NH3) > 1).astype(int)
IMCit_NH3_presence_filtered = IMCit_NH3_presence - (IMCit_NH3_presence*IM_filter)

# Get IM(Cit)-NH3 annotation
# The joined label string is multiplied by the 0/1 flag, so it becomes '' for rows that fail the filter
df_mz_label_uniq['IM_NH3_label'] = Total_ion.apply(lambda x: ','.join([label for label in x if label == 'IM(r)-NH3']))*IMCit_NH3_presence_filtered

In [None]:
## Unique Cit DI counts
# Get unique NL counts: number of comma-separated labels in each label column (0 for an empty string)
for nl_label_col, nl_count_col in [('Total_NL_label', 'Total_NL_count'),
                                   ('precNL_label',   'precNL_count'),
                                   ('seqNL_label',    'seqNL_count'),
                                   ('intNL_label',    'intNL_count')]:
    df_mz_label_uniq[nl_count_col] = df_mz_label_uniq[nl_label_col].apply(lambda joined: 0 if len(joined) == 0 else len(joined.split(',')))

# Get unique INT counts: number of peaks with a retained INT annotation per row;
# a peak is classed as di-/tripeptide by the length of its first label's part before '-'
df_mz_label_uniq['Total_INT_count'] = unique_INT.str.len()
for int_count_col, int_length in [('Dipeptide_count', 2), ('Tripeptide_count', 3)]:
    peaks_of_length = unique_INT.apply(lambda peaks: [peak for peak in peaks if len(peak[0].split('-')[0]) == int_length])
    df_mz_label_uniq[int_count_col] = peaks_of_length.str.len()

# Get IM(Cit)-NH3 counts: 1 when the label string is non-empty, else 0
df_mz_label_uniq['IM_NH3_count'] = df_mz_label_uniq['IM_NH3_label'].apply(lambda joined: int(len(joined) > 0))

In [None]:
# Final output
# Keep only the non-peak columns. An explicit copy is taken because columns are later assigned
# directly into Final_result; without it those assignments target a selection of df_mz_label_uniq
# and pandas emits a SettingWithCopyWarning (this path is reached when Frag_method is not 'HCD').
Final_result = df_mz_label_uniq[[col for col in df_mz_label_uniq.columns if 'peak' not in col]].copy()

# 5. Assessment of Cit status using logistic regression model

In [None]:
# Define a function to calculate Cit probability using EN model developed in the paper
def EN_model(NL_count, INT_count, IM_NH3_count):
    """Citrullination probability from the three diagnostic-ion counts.

    Evaluates a logistic model with fixed coefficients:
    logit = -1.9270 + 0.9759*NL_count + 0.6519*INT_count + 2.7804*IM_NH3_count,
    prob  = exp(logit)/(exp(logit) + 1).

    Parameters
    ----------
    NL_count : number
        Count of NL annotations.
    INT_count : number
        Count of INT annotations.
    IM_NH3_count : number
        Count of IM(Cit)-NH3 annotations.

    Returns
    -------
    float
        Probability between 0 and 1.
    """
    # Regression coefficients
    coeff = {
        'Intercept' : -1.9270,
        'NL'        : 0.9759,
        'INT'       : 0.6519,
        'IM'        : 2.7804
    }

    # Logit function
    logit = coeff['Intercept'] + coeff['NL']*NL_count + coeff['INT']*INT_count + coeff['IM']*IM_NH3_count
    odds  = np.exp(logit)

    return odds/(odds + 1)

In [None]:
if Frag_method == 'HCD': # Apply model for HCD data only
    # Evaluate EN_model row by row on the three count columns
    Cit_prob = Final_result[['Total_NL_count', 'Total_INT_count', 'IM_NH3_count']].apply(lambda x: EN_model(x['Total_NL_count'], x['Total_INT_count'], x['IM_NH3_count']), axis = 1)
    Final_result = Final_result.assign(Cit_probability = Cit_prob) # Citrullination status probability
    # Predicted as citrullinated (1) when the probability is at least 0.5, otherwise 0
    Final_result = Final_result.assign(Cit_prediction = np.where(Cit_prob >= 0.5, 1, 0)) # Citrullination status prediction

In [None]:
# Attach the per-row ion lists. assign() returns a new frame (same style as the model cell above)
# instead of writing columns in place into Final_result, which may be a selection of
# df_mz_label_uniq and would then raise pandas' SettingWithCopyWarning.
Final_result = Final_result.assign(Total_ion = Total_ion, Primary_ion = Primary_ion)

In [None]:
# Save result
# Relative path, so the file is written to the current working directory (overwriting an existing file of that name)
Final_result.to_csv("Cit_DI_analysis_result.csv") # As csv file