In [32]:
#import basic modules
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
from scipy.special import erfc

In [33]:
#Collect every mass spec proteinGroup file. These contain the normalized heavy to light ratios
#for all identified proteins.
#glob returns paths in arbitrary (filesystem dependent) order, so sort them to keep the
#notebook reproducible from run to run.
allnames = sorted(glob.glob('massspec/protein*.txt'))

In [34]:
#We can also load in the proteinGroup files for only the targets we will be summarizing.
#glob returns paths in arbitrary (filesystem dependent) order; sorting makes the order in which
#the files are processed (and therefore the row order of any output) deterministic.
all_filtered = sorted(glob.glob('massspec/filtered/*'))

In [38]:
#Create an (initially empty) output dataframe with a single 'pval' column to hold p-values.
out_pval = pd.DataFrame(columns=['pval'])
#Format the output look of each dataframe.
#None disables column-width truncation; the old -1 sentinel was deprecated in pandas 1.0
#and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
#Show floats with 9 decimal places so that very small p-values stay readable.
pd.set_option('display.float_format', '{:10,.9f}'.format)

In [39]:
#We will loop through all enriched proteins displayed in the figures in the Reg-Seq paper.
#For each filtered file we compute a multiple-testing corrected p-value for the smallest
#per-protein p-value and append it to the file 'test_pval'.
#NOTE(review): calc_test_stat is not defined in the visible cells; it is assumed to be defined
#in an earlier cell - confirm that it exists when running on a fresh kernel.
for z,filtered_name in enumerate(all_filtered):
    #load in file with proteins and enrichments
    indf = pd.read_csv(filtered_name)
    #the heavy to light ratio is read from the last column of the file.
    indf_ratio_col = indf.columns[-1]
    #protein name stored in the first row of the file.
    row = indf.loc[0,:]
    name = row['Protein names']

    #We use the p-value calculation of Cox and Mann.
    #first we get the column of all enrichment ratios.
    q = indf.loc[:,indf_ratio_col]

    #drop any ratios that are (numerically) zero, because their logarithm is undefined.
    q = q[q > 1e-8]
    #we need to log transform ratios in order to do the p-value calculation.
    allratios = np.log(q.to_numpy(dtype=float))
    #we estimate the S.D. robustly from percentiles: for a normal distribution the 15.87th and
    #84.13th percentiles lie one S.D. below and above the median (50th percentile).
    #rlow is computed for completeness but is not passed to calc_test_stat below.
    rlow,r0,r1 = np.percentile(allratios,[15.87,50,84.13])
    #using the calculated S.D. we can compute the test stat z from the Cox and Mann paper.
    test_stats = calc_test_stat(allratios,r0=r0,r1=r1)
    #now we calculate the p-values for each ratio to be an outlier
    #(upper tail probability of a standard normal evaluated at the test statistic).
    p = .5*erfc(test_stats/np.sqrt(2))
    #we will look at the lowest p-value to see if the most enriched protein is an outlier
    pval = p.min()
    #to correct for multiple hypothesis testing we multiply by the number of enrichment ratios.
    pval = pval*len(allratios)
    #write the corrected p-value to file, one 'column name,p-value' line per input file.
    #The file is opened in append mode, so re-running this cell adds further lines to test_pval.
    with open('test_pval','a') as f:
        f.write(indf_ratio_col + ',' + str(pval) + '\n')