In [1]:
import pandas as pd
import data_loader as dl
import pyteomics.mzml
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
def _count_per_minute(frame, column_name):
    """Count the rows of ``frame`` per 'minute' value, ordered by minute."""
    return (
        frame['minute']
        .value_counts()
        .sort_index()  # value_counts orders by frequency; order by minute instead
        .rename_axis('minute')
        .reset_index(name=column_name)
    )


def get_data_for_one(file_name):
    """Build a per-minute table of scan counts for one run.

    Parameters
    ----------
    file_name
        Passed unchanged to ``dl.load_mzml`` and ``dl.load_joined_psm_mzml``.

    Returns
    -------
    pandas.DataFrame
        Sorted by 'minute', with the columns 'minute', 'ms1', 'ms2' and
        '1_percent_FDR':
        * 'ms1' / 'ms2': number of rows with ``ms_level`` 1 / 2 in that minute,
        * '1_percent_FDR': number of joined PSM rows with ``QValue <= .01``.
        Only minutes that have at least one ms1 row are kept (left joins).
        A minute with no ms2 / PSM rows takes the value of the preceding
        minute; it stays NaN if no earlier minute has a value.
    """
    # NOTE(review): the recorded notebook output reports that data_loader has
    # no attribute 'load_mzml' -- confirm the loader name against data_loader.
    mzml_df = dl.load_mzml(file_name)
    ms1 = mzml_df[mzml_df["ms_level"]==1]
    ms2 = mzml_df[mzml_df["ms_level"]==2]

    # PSMs joined to their scans, restricted to the 1% cutoff
    mm_df = dl.load_joined_psm_mzml(file_name)
    mm_df_cut = mm_df[mm_df['QValue'] <=.01]

    line_df = _count_per_minute(ms1, 'ms1')
    other_counts = (
        _count_per_minute(ms2, 'ms2'),
        _count_per_minute(mm_df_cut, '1_percent_FDR'),
    )
    for counts in other_counts:
        # The left frame is already in minute order and a left merge keeps that
        # order, so the forward fill takes the previous *minute's* value.
        line_df = pd.merge(line_df, counts, on='minute', how='left').ffill()

    return line_df.sort_values("minute")

In [3]:
def get_data_for_all_files(list_of_file_names):
    """Combine the per-minute counts of several runs into one wide table.

    Each file is loaded with ``get_data_for_one`` and its count columns get
    the 1-based position of the file as a suffix ('ms1_1', 'ms2_1',
    '1_percent_FDR_1', 'ms1_2', ...). Any number of files (at least one) is
    accepted; for six files the column names match the earlier hard-coded
    six-file version.

    Parameters
    ----------
    list_of_file_names
        Sequence of names understood by ``get_data_for_one``.

    Returns
    -------
    pandas.DataFrame
        One row per 'minute' of the first file, sorted by 'minute'. Runs are
        left-joined onto the first one, so minutes that appear only in later
        files are dropped. Gaps in later files are forward-filled.

    Raises
    ------
    ValueError
        If ``list_of_file_names`` is empty.
    """
    if len(list_of_file_names) == 0:
        raise ValueError("list_of_file_names must contain at least one file name")

    result = None
    for replicate, file_name in enumerate(list_of_file_names, start=1):
        per_file = get_data_for_one(file_name)
        suffix = '_' + str(replicate)
        # Suffix before merging so the columns never collide on merge.
        per_file = per_file.rename(
            {column: column + suffix for column in per_file.columns if column != 'minute'},
            axis=1,
        )
        if result is None:
            result = per_file
        else:
            result = pd.merge(result, per_file, on='minute', how='left').ffill()

    return result.sort_values("minute")


In [4]:
def get_data_for_all_files_bulk(list_of_file_names):
    """Combine the per-minute counts of the bulk runs into one wide table.

    Each file is loaded with ``get_data_for_one`` and its count columns get
    the 1-based position of the file as a suffix ('ms1_1', 'ms2_1',
    '1_percent_FDR_1', 'ms1_2', ...). Any number of files (at least one) is
    accepted; for three files the column names match the earlier hard-coded
    three-file version.

    Parameters
    ----------
    list_of_file_names
        Sequence of names understood by ``get_data_for_one``.

    Returns
    -------
    pandas.DataFrame
        One row per 'minute' of the first file, sorted by 'minute'. Runs are
        left-joined onto the first one, so minutes that appear only in later
        files are dropped. Gaps in later files are forward-filled.

    Raises
    ------
    ValueError
        If ``list_of_file_names`` is empty.
    """
    if len(list_of_file_names) == 0:
        raise ValueError("list_of_file_names must contain at least one file name")

    suffixed_frames = []
    for replicate, file_name in enumerate(list_of_file_names, start=1):
        per_file = get_data_for_one(file_name)
        # Suffix before merging so the columns never collide on merge.
        new_names = {
            column: column + '_' + str(replicate)
            for column in per_file.columns
            if column != 'minute'
        }
        suffixed_frames.append(per_file.rename(new_names, axis=1))

    result = suffixed_frames[0]
    for per_file in suffixed_frames[1:]:
        result = pd.merge(result, per_file, on='minute', how='left').ffill()

    return result.sort_values("minute")

In [5]:
def plot_psm_over_time(df,title):
    """Draw the 'ms1', 'ms2' and '1_percent_FDR' columns of ``df`` against 'minute'.

    Parameters
    ----------
    df
        Table with the columns 'minute', 'ms1', 'ms2' and '1_percent_FDR'.
    title
        Text shown as the figure title.

    Notes
    -----
    The tick-label size is set through ``plt.rcParams``, which is global: it
    stays in effect for figures created after this call.
    """
    # Set before the axes exist: rcParams are only guaranteed to apply to
    # axes (and ticks) created afterwards.
    plt.rcParams['xtick.labelsize']=20
    plt.rcParams['ytick.labelsize']=20

    a4_dims = (30, 8.27) #dimensions for bigger plot
    fig, ax = plt.subplots(figsize=a4_dims)

    # (column to plot, legend entry); the label is attached to its own line
    # instead of being matched to artists by position.
    series = (("ms1", 'ms1'), ("ms2", 'ms2'), ("1_percent_FDR", '1% cutoff'))
    for column, legend_label in series:
        sns.lineplot(data=df, x="minute", y=column, label=legend_label, ax=ax)

    ax.legend(fontsize='15')
    ax.set_title(title,fontsize='15')
    plt.show()

# Bulk data

In [6]:
# Names of the three bulk replicates, in replicate order.
list_of_file_names = [
    'bulk_rep1',
    'bulk_rep2',
    'bulk_rep3',
]
# Wide per-minute table: one set of count columns per replicate.
all_bulk = get_data_for_all_files_bulk(list_of_file_names)

AttributeError: module 'data_loader' has no attribute 'load_mzml'

In [None]:
# Supplemental figure 2A: the ms2_N columns of the bulk replicates per minute.
# NOTE(review): the plotted columns are the per-minute MS2 scan counts
# (ms2_N), while the y label reads "Number of PSMs" -- confirm the label.

# Set before the axes exist: rcParams are only guaranteed to apply to axes
# created afterwards. The change is global and persists for later cells.
plt.rcParams['xtick.labelsize']=20
plt.rcParams['ytick.labelsize']=20

a4_dims = (30, 8.27) #dimensions for bigger plot
fig, ax = plt.subplots(figsize=a4_dims)
for replicate in range(1, 4):
    # The label is attached to its own line rather than matched by position.
    sns.lineplot(data=all_bulk, x="minute", y="ms2_" + str(replicate),
                 label='rep' + str(replicate), ax=ax)
ax.legend(fontsize='20')
ax.set_ylabel('Number of PSMs', fontsize=30)
ax.set_xlabel('Minute', fontsize=30)
# Limit the y axis *before* saving so the PNG matches the displayed figure.
ax.set_ylim(0, 700)
fig.savefig('Figures/supplemental_figure2A.png', dpi=600)
plt.show()

# 2ng data

In [None]:
# Names of the six 2ng replicates, in replicate order.
list_of_file_names = [
    '2ng_rep1',
    '2ng_rep2',
    '2ng_rep3',
    '2ng_rep4',
    '2ng_rep5',
    '2ng_rep6',
]
# Wide per-minute table: one set of count columns per replicate.
all_2ng = get_data_for_all_files(list_of_file_names)

In [None]:
# Supplemental figure 2B: the ms2_N columns of the 2ng replicates per minute.
# NOTE(review): the plotted columns are the per-minute MS2 scan counts
# (ms2_N), while the y label reads "Number of PSMs" -- confirm the label.

# Set before the axes exist: rcParams are only guaranteed to apply to axes
# created afterwards. The change is global and persists for later cells.
plt.rcParams['xtick.labelsize']=20
plt.rcParams['ytick.labelsize']=20

a4_dims = (30, 8.27) #dimensions for bigger plot
fig, ax = plt.subplots(figsize=a4_dims)
for replicate in range(1, 7):
    # The label is attached to its own line rather than matched by position.
    sns.lineplot(data=all_2ng, x="minute", y="ms2_" + str(replicate),
                 label='rep' + str(replicate), ax=ax)
ax.legend(fontsize='20')
ax.set_ylabel('Number of PSMs', fontsize=30)
ax.set_xlabel('Minute', fontsize=30)
ax.set_ylim(0, 700)
fig.savefig('Figures/supplemental_figure2B.png', dpi=600)
plt.show()

# 0.2ng data

In [None]:
# Names of the six 0.2ng replicates, in replicate order.
list_of_file_names = [
    '0.2ng_rep1',
    '0.2ng_rep2',
    '0.2ng_rep3',
    '0.2ng_rep4',
    '0.2ng_rep5',
    '0.2ng_rep6',
]
# Wide per-minute table: one set of count columns per replicate.
all_02ng = get_data_for_all_files(list_of_file_names)

In [None]:
# Supplemental figure 2C: the ms2_N columns of the 0.2ng replicates per minute.
# NOTE(review): the plotted columns are the per-minute MS2 scan counts
# (ms2_N), while the y label reads "Number of PSMs" -- confirm the label.

# Set before the axes exist: rcParams are only guaranteed to apply to axes
# created afterwards. The change is global and persists for later cells.
plt.rcParams['xtick.labelsize']=20
plt.rcParams['ytick.labelsize']=20

a4_dims = (30, 8.27) #dimensions for bigger plot
fig, ax = plt.subplots(figsize=a4_dims)
for replicate in range(1, 7):
    # The label is attached to its own line rather than matched by position.
    sns.lineplot(data=all_02ng, x="minute", y="ms2_" + str(replicate),
                 label='rep' + str(replicate), ax=ax)
ax.legend(fontsize='20')
ax.set_ylabel('Number of PSMs', fontsize=30)
ax.set_xlabel('Minute', fontsize=30)
ax.set_ylim(0, 700)
fig.savefig('Figures/supplemental_figure2C.png', dpi=600)
plt.show()