In [21]:
#import basic modules
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

The purpose of this script is to assign p-values to the enrichments of the identified transcription factors.

In [15]:
#load in all mass spec proteinGroup files. These contain the normalized heavy to light ratios for all
#identified proteins.
#glob returns paths in arbitrary (filesystem dependent) order, so sort them to make the order in which
#the runs are processed reproducible from one machine or run to the next.
allnames = sorted(glob.glob('../massspec/protein*'))

In [16]:
#we can also load in the proteinGroup files for only the targets we will be assessing.
#glob returns paths in arbitrary (filesystem dependent) order; sorting keeps the order in which the
#targets are processed (and therefore the order of any per-target output) reproducible.
all_filtered = sorted(glob.glob('../massspec/filtered/*'))

In [22]:
#We will compile all ratios from all mass spec runs. We will organize these by the protein name.
#Each per-dataset frame is collected in a list and concatenated once at the end: calling pd.concat
#inside the loop re-copies every previously collected row on each pass.
ratio_frames = []
for n in allnames:
    #load in files delimited by tabs.
    tempdf = pd.read_csv(n,sep='\t')
    #keep only the columns whose name contains 'normalized ' and does not contain 'Heavy'.
    good_cols = [x for x in tempdf.columns if 'normalized ' in x and 'Heavy' not in x]
    for col in good_cols:
        #create minimal data frame with only protein and gene name and the column containing data for one
        #dataset, then remove any rows with missing entries. These will occur because not every protein
        #will be identified in every run. dropna returns a new frame, so tempdf itself is left untouched.
        ratio_frames.append(tempdf[['Protein names','Gene names',col]].dropna())

#join together the results from each of the individual datasets. Columns are sorted by name (sort=True)
#and a row holds NaN in every ratio column other than the one it was read from.
#pd.concat raises on an empty list, so fall back to an empty frame when no files or columns matched.
fulldf = pd.concat(ratio_frames,axis=0,sort=True) if ratio_frames else pd.DataFrame()

In [23]:
#we will put a 0 in the place of every not a number entry, so that a row contributes nothing to the
#ratio columns of the datasets it was not read from.
fulldf2 = fulldf.fillna(value=0)

#collapse to one row per protein by summing each ratio column over all rows with the same protein name.
#numeric_only=True restricts the sum to numeric columns: from pandas 2.0 on the default no longer drops
#the string 'Gene names' column but concatenates it, which would leave a string in every row of fulldf3.
fulldf3 = fulldf2.groupby('Protein names').sum(numeric_only=True)

In [24]:
#set up the empty result tables: log-space mean and variance for the target protein and for the
#background proteins, plus one table for the p-values.
out_df = pd.DataFrame(columns=['mean','var'])
#same layout as out_df, but an independent object so the two tables never share rows.
out_background_df = out_df.copy()
out_pval = pd.DataFrame(columns=['pval'])

In [26]:
def _sample_std(values):
    """Return the corrected (ddof=1) standard deviation of ``values``.

    scipy.stats.ttest_ind_from_stats expects corrected sample standard deviations.
    A single observation has no spread estimate and gets zero weight, (n - 1), in the
    pooled variance of the equal-variance test, so 0.0 is returned for it instead of NaN.
    """
    if len(values) == 1:
        return 0.0
    return float(np.std(values, ddof=1))


#now we will go through each set where we think we have identified a transcription factor and check to see
#if it is significantly enriched.
for filtered_name in all_filtered:
    indf = pd.read_csv(filtered_name)
    #the column with the enrichments will be the last column, so we will save that for use later.
    indf_ratio_col = indf.columns[-1]
    #the files are sorted so that the identified transcription factor will be the first entry. We will
    #save this row separately.
    row = indf.loc[0,:]
    #The name of the identified factor will be in the 'Protein names' column of the first row.
    name = row['Protein names']

    #take the row of fulldf3, which contains all massspec data, that belongs to the protein of interest.
    #The result is indexed by dataset (ratio column) name.
    q = fulldf3.loc[name]
    #get out the enrichment ratio of the identified TF
    theratio = float(row[indf_ratio_col])
    #now we get all the ratios for the protein of interest from datasets where there is no enrichment,
    #i.e. every dataset except the one being tested. The variability in these ratios allows us to
    #estimate the variation in the measured ratio for this protein.
    goodrows = (q.index != indf_ratio_col)
    q = q[goodrows]
    #We will filter out any samples with enrichment equal to (numerically) 0, as this only occurs when the
    #protein wasn't identified at all in one sample and so these ratios can't be relied upon.
    q = q[q > 1e-8]
    #we will do the estimate in log space, as the uncertainty in ratio estimation will likely be on the log scale.
    logq = np.log(q)
    #the 'mean' is the log of the single measured enrichment; the variance comes from the other datasets.
    tempmean = np.log(theratio)
    tempvar = np.var(logq)

    #save in dataframe
    out_df.loc[indf_ratio_col,:] = [tempmean,tempvar]

    #do the same estimation for the background proteins (every row after the first).
    background_df = indf.loc[1:,:]
    background_log = np.log(background_df.loc[:,indf_ratio_col])
    background_tempmean = np.mean(background_log)
    background_tempvar = np.var(background_log)

    out_background_df.loc[str(indf_ratio_col),:] = [float(background_tempmean),float(background_tempvar)]

    #use a t test to estimate the p-value that the target protein is not significantly enriched compared to
    #background. ttest_ind_from_stats takes standard deviations (corrected, ddof=1), not variances, so the
    #spreads are passed through _sample_std; the saved 'var' summary values above are left as they were.
    statistic,pval = scipy.stats.ttest_ind_from_stats(
        tempmean,_sample_std(logq),len(q),
        background_tempmean,_sample_std(background_log),len(background_df.index))

    #save each p-value to a file.
    #NOTE: the file is opened in append mode, so re-running this cell adds the same entries again.
    with open('../massspec/test_pval','a') as f:
        f.write(indf_ratio_col + ',' + str(pval) + '\n')
    out_pval.loc[indf_ratio_col,:] = [pval]