In [1]:
#import basic modules
import glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import peak_utils
import regseq.find_region as find_region

We will now load in all models so we can check which parts are significant. Only an example subset is stored on the github repo. You can download this data from the github repo or as a .tar.gz file from the website under the datasets section.


In [2]:
allnames = glob.glob('/home/bill/next100genes/false_positive/*v20.txt')

In [3]:
allnames

['/home/bill/next100genes/false_positive/rspA_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/ftsK_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/znuCB_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/ompR_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/xylF_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/marR_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/araAB_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/uvrD_information_footprint_v20.txt',
 '/home/bill/next100genes/false_positive/dicC_information_footprint_v20.txt']

In [4]:
alldf_RNAP = pd.io.parsers.read_csv('RNAP/comb_RNAP_sigma.csv')

In [5]:
def find_RNAP(s):
    '''Identify if a given putative regulatory region contains an RNAP binding site.
    To do this we cross reference the location of the putative binding site with a list of identified
    RNAP sites.'''
    gene = s['gene']
    try:
        RNAP_sites = get_RNAP_sites(alldf_RNAP[alldf_RNAP['gene'] == gene])
    except:
        RNAP_sites = []
    is_RNAP = False
    start = s['start']
    end = s['end']
    for RNAP in RNAP_sites:
        '''Our RNAP sites are listed based on the location of the RNAP minus 35. We check whether that
        or the minus 10 region (which is why we have to possible regions), exist in the region.'''
        if ((start - 2 < RNAP) and (end > RNAP + 5)) or ((start < RNAP + 29) and (end > RNAP + 21)):
            is_RNAP = True    
    return is_RNAP


In [6]:
def get_RNAP_sites(s):
    '''This function turns the entry in the dataframe of the RNAP sites into a list.'''
    outsites = []
    for i,row in s.iterrows():
        q = row['sites']
        q2 = q.split()
        q3 = [int(x) for x in q2]
        outsites = outsites + q3
    return list(set(outsites))

In [7]:
#all possible growth conditions.
possible_growths = ['fructose','42','acetate','LB','heat','Anaero','arabinose','xylara','0cAMP','SS','arabionse','metal','Fe','xanth2','M9','Tre','deoR','phoP']

In [8]:
threshs = [0.00025]

In [9]:
#we set an averaging size of 15, as this is a typical size for a binding site.
windowsize = 15
def do_sum2(s,length_info=160):
    '''this function does a summation 15 base pairs'''
    outarr = np.zeros(length_info - windowsize)
    for i in range(length_info - windowsize):
        outarr[i] = s[i:(i+windowsize)].sum()
    return outarr


#we define some column names. The only purpose of that these are the columns that are used in 
#the any energy matrices.
val_cols = ['val_A','val_C','val_G','val_T']

for thresh in threshs:
    counter = 0
    outdf = pd.DataFrame(columns=['gene','growth','feat_num','start','end','type'])    
    for name in allnames:
        try:
            noleader = name.split('/')[-1]
            '''parse file name to get gene name'''
            for x in possible_growths:
                if x in noleader and '500cAMP' not in x and noleader.split('dataset')[0] != '':
                    growth = x
                    if growth == 'acetate' or growth == '42' or growth == 'fructose' or growth == '_comb':
                        gene = noleader.split('dataset')[0]
                    else:
                        gene = noleader.split(growth)[0]
            gene = gene.split('_')[-1]
            
            #load in information footprint
            df = pd.io.parsers.read_csv(name,delim_whitespace=True)
            length_info = len(df['info'])
            em = np.abs(np.array(list(df['info'])))
            em_noabs = np.array(list(df['info']))
            
            #invert information values.
            em_noabs = em_noabs*-1

            #split the information footprint into two groups of repressor like and activator like bases.
            pos_mat = np.zeros(length_info)
            neg_mat = np.zeros(length_info)
            for q in range(len(em_noabs)):
                if em_noabs[q] > 0:
                    pos_mat[q] = np.abs(em_noabs[q])
                else:
                    neg_mat[q] = np.abs(em_noabs[q])

            #sum over 15 base pair windows
            summedarr2 = do_sum2(pos_mat)
            summedarr2_neg = do_sum2(neg_mat)

            #initialize array where we will store whether or not the average outcome beats threshold.
            is_thresh = np.zeros(length_info-windowsize)
            is_thresh_neg = np.zeros(length_info-windowsize)
            for i in range(length_info-windowsize):
                is_thresh[i] = summedarr2[i] > thresh*windowsize
                is_thresh_neg[i] = summedarr2_neg[i] > thresh*windowsize
                is_thresh_neg[i] = is_thresh_neg[i]*-1
    
            outdf_temp = find_region.select_region(is_thresh,gene,growth)
            outdf_temp2 = find_region.select_region(is_thresh_neg,gene,growth)
            for i,row in outdf_temp.iterrows():
                start = row['start']
                end = row['end']
                newstart,newend = find_region.find_edges(pos_mat,start,end)
                outdf.loc[counter,['gene','growth','feat_num','start','end','type']] = [row['gene'],growth,row['feat_num'],newstart,newend,row['type']]
                counter = counter + 1
            for i,row in outdf_temp2.iterrows():
                start = row['start']
                end = row['end']
                newstart,newend = find_region.find_edges(neg_mat,start,end)
                outdf.loc[counter,['gene','growth','feat_num','start','end','type']] = [row['gene'],growth,row['feat_num'],newstart,newend,row['type']]
                counter = counter + 1
        except Exception as e:
            print(e)
    outdf['RNAP'] = outdf.apply(find_RNAP,axis=1)
    outdf.to_csv('/home/bill/next100genes/false_positive/all_features_auto20_for_check_split_' + str(thresh) + '.csv')
    z = find_region.merge_growths(outdf,15)
    z['contains RNAP'] = z.apply(find_RNAP,axis=1)
    z.to_csv('/home/bill/next100genes/false_positive/all_features_auto_merged_20_scaled_for_RNAP_check_' + str(thresh) + '.csv')

name 'gene' is not defined
name 'gene' is not defined
name 'gene' is not defined
name 'gene' is not defined
name 'gene' is not defined
name 'gene' is not defined
name 'gene' is not defined
name 'gene' is not defined
name 'gene' is not defined


ValueError: Wrong number of items passed 6, placement implies 1