The purpose of this notebook is to calculate, for each scan, the lengths of the runs of consecutive matched b and y peaks, and the fraction of annotated y peaks that belong to a consecutive series (a run of two or more peaks).

In [1]:
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('..')
import data_loader as dl

In [2]:
def enumerate_y_peaks(row):
    """Return the ion numbers of the matched y peaks for one row.

    The "Matched Ion Series" value is split on ";" into segments. The
    segment containing "y" is stripped of its surrounding brackets and split
    on ","; for each entry the first run of digits before the "+" is taken
    as the peak number (the code expects entries shaped like "y3+1").

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a "Matched Ion Series" entry.

    Returns
    -------
    list of int
        Peak numbers in the order they appear in the series. Empty when the
        value is not a string, when no y segment is present, or when no
        entry carries a number.
    """
    series = row["Matched Ion Series"]
    if not isinstance(series, str):
        # Missing values (e.g. NaN from an empty cell) carry no peaks.
        return []

    segments = series.split(";")
    if len(segments) == 2:
        # Two segments: the second one is checked first, then the first.
        candidates = (segments[1], segments[0])
    else:
        # Any other count: only the first segment is considered.
        candidates = (segments[0],)

    peaks = next((segment for segment in candidates if "y" in segment), None)
    if peaks is None:
        # No y segment at all; previously this raised when two segments
        # were present and neither contained "y".
        return []

    enumeration = []
    for peak in peaks.strip('][').split(","):
        number = re.search(r'\d+', peak.split("+")[0])
        if number is not None:  # skip entries without a peak number
            enumeration.append(int(number.group(0)))
    return enumeration

In [3]:
def enumerate_b_peaks(row):
    """Return the ion numbers of the matched b peaks for one row.

    The "Matched Ion Series" value is split on ";" into segments. The
    segment containing "b" is stripped of its surrounding brackets and split
    on ","; for each entry the first run of digits before the "+" is taken
    as the peak number (the code expects entries shaped like "b3+1").

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a "Matched Ion Series" entry.

    Returns
    -------
    list of int
        Peak numbers in the order they appear in the series. Empty when the
        value is not a string, when no b segment is present, or when no
        entry carries a number.
    """
    series = row["Matched Ion Series"]
    if not isinstance(series, str):
        # Missing values (e.g. NaN from an empty cell) carry no peaks.
        return []

    segments = series.split(";")
    if len(segments) == 2:
        # Two segments: the second one is checked first, then the first.
        candidates = (segments[1], segments[0])
    else:
        # Any other count: only the first segment is considered.
        candidates = (segments[0],)

    peaks = next((segment for segment in candidates if "b" in segment), None)
    if peaks is None:
        # No b segment at all; previously this raised when two segments
        # were present and neither contained "b".
        return []

    enumeration = []
    for peak in peaks.strip('][').split(","):
        number = re.search(r'\d+', peak.split("+")[0])
        if number is not None:  # skip entries without a peak number
            enumeration.append(int(number.group(0)))
    return enumeration

In [4]:
def count_consecutive(row, col_name):
    """Return the lengths of the runs of consecutive numbers in ``row[col_name]``.

    The list is walked in its stored order; a run continues while each
    value is exactly one greater than the value before it.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row holding the list of peak numbers.
    col_name : str
        Key of the list to analyse.

    Returns
    -------
    list of int
        One entry per run, in order. An isolated value counts as a run of
        length 1. Note that an empty input list also yields ``[1]``.
    """
    numbers = row[col_name]
    run_lengths = []
    current_run = 1
    # Pair every value with its successor; an empty or single-element list
    # produces no pairs at all.
    for previous, following in zip(numbers, numbers[1:]):
        if previous + 1 == following:
            current_run += 1
            continue
        # Run broken: record it and start counting a new one.
        run_lengths.append(current_run)
        current_run = 1
    # The run still open when the list ends has not been recorded yet.
    run_lengths.append(current_run)
    return run_lengths

In [5]:
def get_max_consec(row, col_name):
    """Return the largest value in ``row[col_name]``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row holding the list of run lengths.
    col_name : str
        Key of the list to inspect.

    Returns
    -------
    The maximum of the list, or 0 when the list is empty (an empty list
    previously raised ValueError).
    """
    return max(row[col_name], default=0)

In [6]:
def perc_ladder_peaks(row):
    """Return the fraction of y peaks that sit in a run longer than one.

    Reads ``row['Ypeak_enum']`` (the y peak numbers) and
    ``row['consecutive_y_peaks']`` (the run lengths). Runs of length 1 are
    ignored; the remaining run lengths are summed and divided by the number
    of y peaks.

    Returns
    -------
    float, or -1 as a sentinel when the row has no y peaks.
    """
    y_numbers = row['Ypeak_enum']
    run_lengths = row['consecutive_y_peaks']

    if len(y_numbers) == 0:
        return -1  # this is for when there are no Y peaks

    peaks_in_ladders = sum(length for length in run_lengths if length != 1)
    return peaks_in_ladders / len(y_numbers)

In [7]:
def calc_consecutive_peaks(df):
    """Derive the b/y peak-number and consecutive-run columns for every row.

    Starts from the 'scan', 'Matched Ion Series', 'QValue' and 'peptide'
    columns of ``df`` and appends, in this order: 'Bpeak_enum',
    'consecutive_b_peaks', 'Ypeak_enum', 'consecutive_y_peaks',
    'max_consecutive_b', 'max_consecutive_y' and 'perc_in_ladder'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four starting columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the starting and derived columns; ``df`` itself is
        not modified.
    """
    def per_row(func, *extra):
        # Deferred row-wise apply so that each step sees the columns added
        # by the steps before it.
        return lambda table: table.apply(func, axis=1, args=extra)

    selected = df[['scan', 'Matched Ion Series', 'QValue', 'peptide']]

    # All intermediate columns are kept in the result.
    return (
        selected
        .assign(Bpeak_enum=per_row(enumerate_b_peaks))
        .assign(consecutive_b_peaks=per_row(count_consecutive, 'Bpeak_enum'))
        .assign(Ypeak_enum=per_row(enumerate_y_peaks))
        .assign(consecutive_y_peaks=per_row(count_consecutive, 'Ypeak_enum'))
        .assign(max_consecutive_b=per_row(get_max_consec, "consecutive_b_peaks"))
        .assign(max_consecutive_y=per_row(get_max_consec, "consecutive_y_peaks"))
        .assign(perc_in_ladder=per_row(perc_ladder_peaks))
    )

In [8]:
# Run names: six replicates for each of the two sample amounts, with the
# 2ng runs listed before the 0.2ng runs.
all_files = [
    amount + "_rep" + str(replicate)
    for amount in ("2ng", "0.2ng")
    for replicate in range(1, 7)
]

In [9]:
# Write one "<name>_peaks_data.csv" per input file into peaks_files/.
output_dir = "peaks_files"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for file in all_files:
    df = dl.clean_metamorph(file)
    data = calc_consecutive_peaks(df)
    data.to_csv(os.path.join(output_dir, file + "_peaks_data.csv"))