# Script to quantify PSMs in Iodination Data
Modification of interest: +125.8966 Da on H/Y

Want:

    1.) Number of times a specific H/Y is iodinated (fasta numbering)
    2.) Number of times a specific H/Y is not iodinated (fasta numbering)
    3.) Percent iodinated

### Imports

In [5]:
import pandas as pd
import re

### Quantification on psm.tsv

In [7]:
# Load the tab-separated psm.tsv table for a single sample

filepath = "/Users/nithesh/Documents/Iodo_script/20240227_FPI_apomyoglobin_NvD_plus-minusHis-correctDB/2024_02_25_KB_NoFAIMS_MV_004_A2/psm.tsv"
psm = pd.read_table(filepath)

In [8]:
# Show the column labels of the loaded PSM table
psm.columns

Index(['Spectrum', 'Spectrum File', 'Peptide', 'Modified Peptide', 'Prev AA',
       'Next AA', 'Peptide Length', 'Charge', 'Retention', 'Observed Mass',
       'Calibrated Observed Mass', 'Observed M/Z', 'Calibrated Observed M/Z',
       'Calculated Peptide Mass', 'Calculated M/Z', 'Delta Mass',
       'Expectation', 'Hyperscore', 'Nextscore', 'PeptideProphet Probability',
       'Number of Enzymatic Termini', 'Number of Missed Cleavages',
       'Protein Start', 'Protein End', 'Intensity', 'Assigned Modifications',
       'Observed Modifications', 'Is Unique', 'Protein', 'Protein ID',
       'Entry Name', 'Gene', 'Protein Description', 'Mapped Genes',
       'Mapped Proteins'],
      dtype='object')

In [9]:
# Columns carried through the rest of the analysis
col_subs = [
    'Peptide',
    "Modified Peptide",
    "Protein Start",
    "Protein End",
    "Assigned Modifications",
    "Entry Name",
]

psm_subs = psm.loc[:, col_subs]

In [10]:
# Data Wrangling

# Remove peptides with no mods.
# Only a missing "Assigned Modifications" value marks a PSM as unmodified, so restrict
# dropna to that column instead of discarding rows with a NaN in any selected column.
psm_clean = psm_subs.dropna(subset=["Assigned Modifications"]).reset_index(drop=True)

In [11]:
# Display the PSMs that remain after dropping rows with missing values
psm_clean

Unnamed: 0,Peptide,Modified Peptide,Protein Start,Protein End,Assigned Modifications,Entry Name
0,HLKTEAEMK,HLKTEAEM[147]K,49,57,8M(15.9949),MYG_HORSE
1,HLKTEAEMK,HLKTEAEM[147]K,49,57,8M(15.9949),MYG_HORSE
2,HLKTEAEMK,HLKTEAEM[147]K,49,57,8M(15.9949),MYG_HORSE
3,HLKTEAEMK,HLKTEAEM[147]K,49,57,8M(15.9949),MYG_HORSE
4,HLKTEAEMK,HLKTEAEM[147]K,49,57,8M(15.9949),MYG_HORSE
...,...,...,...,...,...,...
1043,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE
1044,YLEFISDAIIHVLHSK,Y[415]LEFISDAIIHVLHSK,104,119,1Y(251.7933),MYG_HORSE
1045,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE
1046,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE


In [12]:
def mod_of_interest(df, residues=("H", "Y")):
    """Return the index labels of rows carrying a modification on a residue of interest.

    Every entry of the comma-separated "Assigned Modifications" string is inspected
    (e.g. "8M(15.9949), 11H(125.8966)"), not just the first one, so a PSM whose H/Y
    modification is listed after another modification is still selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "Assigned Modifications" column of strings shaped like
        "<position><residue>(<mass>)".
    residues : iterable of str, default ("H", "Y")
        One-letter residue codes to look for.

    Returns
    -------
    list
        Index labels of the matching rows, in row order, suitable for ``df.loc[...]``.
        For a default RangeIndex these equal the row positions.
    """
    residue_class = "".join(residues)
    if not residue_class:
        # An empty character class is not a valid regex; nothing can match anyway.
        return []

    # "<digits><residue>(" marks a residue-level modification; terminal modifications
    # such as "N-term(42.0106)" carry no position digits and therefore never match.
    pattern = r"\d+[" + residue_class + r"]\("

    # astype(str) turns missing values into "nan", which cannot match the pattern.
    has_target = df["Assigned Modifications"].astype(str).str.contains(pattern, regex=True)

    return df.index[has_target.to_numpy(dtype=bool)].tolist()

In [13]:
# Collect the rows of psm_clean selected by mod_of_interest
index_list = mod_of_interest(psm_clean)

In [14]:
# Keep only the selected rows and tag each surviving PSM with a running identifier
psm_clean = psm_clean.loc[index_list].assign(unique_pep=lambda kept: range(len(kept)))

In [223]:
def extract_values(row):
    """Parse an "Assigned Modifications" string into (position, residue, mass) tuples.

    Each residue-level entry looks like "<position><residue>(<mass>)", for example
    "1Y(125.8966)". The mass may be negative and may lack a fractional part.
    Entries without a leading position (such as "N-term(42.0106)") are skipped.

    Parameters
    ----------
    row : str
        The modification string of one PSM. Non-string input (e.g. NaN) yields [].

    Returns
    -------
    list of tuple of str
        One ``(position, residue, mass)`` tuple per parsed entry; all three are strings.
    """
    if not isinstance(row, str):
        return []

    # Optional sign and optional decimals so mass shifts such as -17.0265 or 16 are kept.
    pattern = r'(\d+)([A-Z])\((-?\d+(?:\.\d+)?)\)'
    return re.findall(pattern, row)

In [16]:
# Parse every modification string into a list of (position, residue, mass) tuples
psm_clean['Extracted Values'] = psm_clean['Assigned Modifications'].map(extract_values)

In [17]:
# Build the long table: one row per parsed modification, keyed by the PSM identifier
long_records = []
for pep_id, parsed_mods in zip(psm_clean['unique_pep'], psm_clean['Extracted Values']):
    for position, residue, mass in parsed_mods:
        long_records.append((pep_id, position, residue, float(mass)))

long_psm_clean = pd.DataFrame(long_records, columns=['unique_pep', 'pep_loc', 'AA', 'Mod'])

In [18]:
# Attach the per-PSM columns to every parsed modification via the shared identifier
merge_psm = long_psm_clean.merge(psm_clean, how='inner', on='unique_pep')

In [19]:
# Display the merged long-format table (one row per parsed modification)
merge_psm

Unnamed: 0,unique_pep,pep_loc,AA,Mod,Peptide,Modified Peptide,Protein Start,Protein End,Assigned Modifications,Entry Name,Extracted Values
0,0,1,Y,125.8966,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]"
1,1,1,Y,125.8966,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]"
2,2,1,Y,125.8966,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]"
3,3,1,H,125.8966,HGTVVLTALGGILK,H[263]GTVVLTALGGILK,65,78,1H(125.8966),MYG_HORSE,"[(1, H, 125.8966)]"
4,4,1,Y,251.7933,YLEFISDAIIHVLHSK,Y[415]LEFISDAIIHVLHSK,104,119,1Y(251.7933),MYG_HORSE,"[(1, Y, 251.7933)]"
...,...,...,...,...,...,...,...,...,...,...,...
428,393,1,Y,125.8966,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]"
429,394,1,Y,251.7933,YLEFISDAIIHVLHSK,Y[415]LEFISDAIIHVLHSK,104,119,1Y(251.7933),MYG_HORSE,"[(1, Y, 251.7933)]"
430,395,1,Y,125.8966,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]"
431,396,1,Y,125.8966,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]"


In [20]:
# Cast the parsed position and mass to integers, then derive protein-level coordinates.
# NOTE(review): astype(int) truncates the mass (125.8966 -> 125) rather than rounding it.
merge_psm = merge_psm.astype({'pep_loc': int, 'Mod': int})
# Peptide positions are 1-based, hence the -1 when offsetting by the protein start
merge_psm['fasta_loc'] = merge_psm['Protein Start'] + merge_psm['pep_loc'] - 1
merge_psm['AA_fasta'] = merge_psm['AA'] + merge_psm['fasta_loc'].astype(str)

In [21]:
# Display the merged table with the added fasta_loc and AA_fasta columns
merge_psm

Unnamed: 0,unique_pep,pep_loc,AA,Mod,Peptide,Modified Peptide,Protein Start,Protein End,Assigned Modifications,Entry Name,Extracted Values,fasta_loc,AA_fasta
0,0,1,Y,125,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]",104,Y104
1,1,1,Y,125,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]",104,Y104
2,2,1,Y,125,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]",104,Y104
3,3,1,H,125,HGTVVLTALGGILK,H[263]GTVVLTALGGILK,65,78,1H(125.8966),MYG_HORSE,"[(1, H, 125.8966)]",65,H65
4,4,1,Y,251,YLEFISDAIIHVLHSK,Y[415]LEFISDAIIHVLHSK,104,119,1Y(251.7933),MYG_HORSE,"[(1, Y, 251.7933)]",104,Y104
...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,393,1,Y,125,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]",104,Y104
429,394,1,Y,251,YLEFISDAIIHVLHSK,Y[415]LEFISDAIIHVLHSK,104,119,1Y(251.7933),MYG_HORSE,"[(1, Y, 251.7933)]",104,Y104
430,395,1,Y,125,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]",104,Y104
431,396,1,Y,125,YLEFISDAIIHVLHSK,Y[289]LEFISDAIIHVLHSK,104,119,1Y(125.8966),MYG_HORSE,"[(1, Y, 125.8966)]",104,Y104


In [91]:
def iodo_quant(df, entry_name, residues=None):
    """Count how often each modified site (AA_fasta label) occurs for one protein entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with "Entry Name" and "AA_fasta" columns (and an "AA" column
        when ``residues`` is given).
    entry_name : str
        Value of "Entry Name" to subset on.
    residues : iterable of str, optional
        When given, only rows whose "AA" value is in this collection are counted.
        By default every modified residue in ``df`` is counted, including residues
        that are merely co-modified on a selected PSM.

    Returns
    -------
    pandas.DataFrame
        Indexed by AA_fasta label with a single "Iodo_freq" column, sorted by
        descending count.
    """
    entry = df.loc[df['Entry Name'] == entry_name]  # Subset unique entry
    if residues is not None:
        entry = entry.loc[entry['AA'].isin(list(residues))]

    iodo_freq = entry['AA_fasta'].value_counts()

    # Name the counts explicitly: the Series returned by value_counts() is called
    # "AA_fasta" on pandas < 2.0 but "count" on pandas >= 2.0, so renaming by the old
    # column label silently fails on newer versions. Dropping the index name keeps the
    # frame identical across versions.
    return iodo_freq.rename("Iodo_freq").rename_axis(None).to_frame()

In [93]:
# Preview the per-site modification counts for the MYG_HORSE entry
iodo_quant(merge_psm, "MYG_HORSE")

Unnamed: 0,Iodo_freq
Y104,158
H65,89
H94,39
H83,31
H82,27
H25,20
M132,16
Y147,13
H120,12
H117,6


In [95]:
# Keep the per-site counts for the MYG_HORSE entry for the percentage calculation
iodo_quant_results = iodo_quant(merge_psm, "MYG_HORSE")

In [219]:
def total_pep_appearances(iodo_quant_out, merge_df, clean_df):
    """Add coverage totals and percent-modified columns to the per-site counts.

    For every site label in ``iodo_quant_out.index`` the protein position is looked up
    in ``merge_df`` (first row with that "AA_fasta" label), and the number of rows in
    ``clean_df`` whose [Protein Start, Protein End] range contains that position is
    counted. Rows of ``clean_df`` are counted regardless of their entry name.

    Parameters
    ----------
    iodo_quant_out : pandas.DataFrame
        Indexed by AA_fasta label with an "Iodo_freq" column. Modified in place.
    merge_df : pandas.DataFrame
        Must contain "AA_fasta" and "fasta_loc" columns.
    clean_df : pandas.DataFrame
        Must contain "Protein Start" and "Protein End" columns; it defines what is
        counted as the total.

    Returns
    -------
    pandas.DataFrame
        The same ``iodo_quant_out`` object with "Tot_freq" and "% Mod" added.
        "% Mod" is NaN for sites whose total is 0.
    """
    # Build the label -> position lookup once; keeping the first occurrence mirrors
    # taking the first matching row for each site.
    site_to_loc = merge_df.drop_duplicates("AA_fasta").set_index("AA_fasta")["fasta_loc"]

    starts = clean_df['Protein Start']
    ends = clean_df['Protein End']

    tot_freq = []
    for site in iodo_quant_out.index:
        if site not in site_to_loc.index:
            # Handle case where there is no match
            tot_freq.append(0)
            continue

        position = site_to_loc[site]
        covers_site = (starts <= position) & (ends >= position)
        tot_freq.append(int(covers_site.sum()))

    # Combine the totals with iodo_quant_out
    iodo_quant_out["Tot_freq"] = tot_freq

    # Mask zero totals so the percentage becomes NaN instead of an infinite value
    denominator = iodo_quant_out["Tot_freq"].where(iodo_quant_out["Tot_freq"] > 0)
    iodo_quant_out["% Mod"] = (iodo_quant_out["Iodo_freq"] / denominator * 100).round(2)
    return iodo_quant_out


In [221]:
# Add the coverage total and percent-modified columns to the per-site counts.
# NOTE(review): psm_clean has already had rows with missing values dropped and been
# narrowed to the rows selected by mod_of_interest, so Tot_freq here counts only those
# PSMs. Confirm this is the intended denominator (psm_subs still holds every PSM).
total_pep_appearances(iodo_quant_results, merge_psm, psm_clean)

Unnamed: 0,Iodo_freq,Tot_freq,% Mod
Y104,158,165,95.76
H65,89,94,94.68
H94,39,83,46.99
H83,31,84,36.9
H82,27,84,32.14
H25,20,20,100.0
M132,16,47,34.04
Y147,13,14,92.86
H120,12,47,25.53
H117,6,163,3.68
