# Table of Contents

This notebook generates data used to contruct `binding peak` tables and `venn diagrams` for ChIPPro database. But before you run the notebook, make sure of the following: 

1. Do not concatanate TF files with differernt conditions. The goal is to have one TF with one condidtion on each dashboard.
2. Make sure to rename TF in the gff file to match the TF convention and not gene convension.

In [1]:
#basics:
import numpy as np     
import pandas as pd
import urllib                      
import gzip
import os
from os import path
from collections import defaultdict

# Binding Peak Tables

## Generate a data objects

### Prerequisit
Ensure the following paths reflect your local **input** and **output** directories

In [2]:
data_dir = './'+'input_data/'

In [3]:
in_data = data_dir + 'gff_data/'

List of organisms with available curated gff files

In [4]:
org_list = [file for file in os.listdir(in_data)]
org_list

['putida', 'staph', 'salmonella', 'ecoli']

In [5]:
in_dir = in_data + org_list[3]+'/'

Change the index of organisms accordingly:

In [6]:
out_data_venn = '../venn/data/' 
out_data_table = '../binding_site_table/data/' 

List of organism with curated gff files

In [7]:
filelist = [file for file in os.listdir(in_dir)]
filelist

['ygbi_M9_curated.gff',
 'ydcr_M9_curated.gff',
 'ydcI_ph8_curated.gff',
 'ygav_M9_curated.gff',
 'yeht_M9_curated.gff',
 'ybaq_M9_curated.gff',
 'yfie_M9_curated.gff',
 'rpob_fe_curated.gff',
 'yheo_M9_curated.gff',
 'ybhd_M9_curated.gff',
 'yidz_M9_curated.gff',
 'gadx_M9_curated.gff',
 'ybcm_oxidative_curated.gff',
 'ycan_M9_curated.gff',
 'ydcI_ph7_curated.gff',
 'ydcn_M9_curated.gff',
 'yqhc_M9_curated.gff',
 'rpob_dpd_curated.gff',
 'fur_fe_curated.gff',
 'ydip_M9_curated.gff',
 'fur_dpd_curated.gff',
 'yfed_M9_curated.gff',
 'ycit_M9_curated.gff',
 'yiag_M9_curated.gff',
 'gade_M9_curated.gff',
 'yeam_M9_curated.gff',
 'yebk_M9_curated.gff',
 'ydcI_ph5_curated.gff',
 'ybdo_M9_curated.gff',
 'yjdc_M9_curated.gff',
 'cra_acetate_curated.gff',
 'gadw_M9_curated.gff',
 'yiep_M9_curated.gff',
 'yhjb_M9_curated.gff',
 'cra_M9_curated.gff',
 'ybao_M9_curated.gff',
 'ygfi_M9_curated.gff',
 'yiau_M9_curated.gff',
 'ybef_M9_curated.gff',
 'yidl_M9_curated.gff',
 'yhjc_M9_curated.gff',
 'y

In [8]:
database = defaultdict(list)

In [9]:
filelist = [file for file in os.listdir(in_dir)]

for i in range(0,len(filelist)):
    df = pd.read_csv(path.join(in_dir,filelist[i]),index_col=0, 
                                 delimiter='\t', header=None, 
                                 names = ['ref','condition', 'condition_name', 
                                          "binding_peak_start",'binding_peak_end', 
                                          'binding_peak_strength', 'direction', '.','ID'])

    df = df.set_index(pd.Series(range(1,len(df)+1)))
    database[i] = df

In [10]:
database[0].head()

Unnamed: 0,condition,condition_name,binding_peak_start,binding_peak_end,binding_peak_strength,direction,.,ID
1,,_filtered_0.95,2861321,2861365,12.52768,+,.,


For the next block of code to work, make sure the files are named in the correct format: `<TF>_<CONDITION>_cutared.gff`

In [11]:
TF_list =[]
TF_condition = []
TF_gene_name = []
for i in range (0,len(filelist)): 
    split_file_name = filelist[i].split('_')
    
    first_upper_case = split_file_name[0].replace(split_file_name[0][0], split_file_name[0][0].upper())
    
    if len(first_upper_case) == 4:
        last_upper_case = first_upper_case.replace(first_upper_case[-1], first_upper_case[-1].upper())
    else: last_upper_case = first_upper_case
    
    TF_list.append(last_upper_case)
    
    TF_condition.append(split_file_name[1])
    
    TF_gene_name.append(last_upper_case.lower())

Check the convention for all TF, gene and conditions:

In [12]:
TF_list[:5]

['YgbI', 'YdcR', 'YdcI', 'YgaV', 'YehT']

In [13]:
TF_condition[:5]

['M9', 'M9', 'ph8', 'M9', 'M9']

In [14]:
TF_gene_name[:5]

['ygbi', 'ydcr', 'ydci', 'ygav', 'yeht']

## Load gene info + Biocyc TRN 

In [15]:
# Ecoli data
DF_gene_info = pd.read_csv(path.join(data_dir,'ecoli_gene_info.csv'),index_col=0)
precise2_TRN = pd.read_csv(path.join(data_dir,'ecoli_trn_precise2.csv'),index_col=0)

#Staph data 
staph_TRN = pd.read_csv(path.join(data_dir,'staph_trn.csv'),index_col=0)
staph_gene_info = pd.read_csv(path.join(data_dir,'staph_gene_info.csv'),index_col=0)

In [16]:
# Define start codon position (to account for +/- strands)
staph_gene_info['start_codon_pos'] = [row.start if row.strand == '+' else row.stop for idx,row in staph_gene_info.iterrows()]
staph_gene_info.head()

Unnamed: 0,old_locus_tag,gene_product,start,strand,cog,gene_name,operon,stop,regulator,start_codon_pos
USA300HOU_RS00005,USA300HOU_0001,chromosomal replication initiator protein DnaA,543,+,"Replication, recombination and repair",dnaA,dnaA,1905,,543
USA300HOU_RS00010,USA300HOU_0002,DNA polymerase III subunit beta,2182,+,"Replication, recombination and repair",dnaN,dnaN,3316,,2182
USA300HOU_RS00015,USA300HOU_0003,RNA-binding protein,3696,+,Function unknown,yaaA,"yaaA,recF,gyrB,gyrA",3942,,3696
USA300HOU_RS00020,USA300HOU_0004,DNA replication and repair protein RecF,3938,+,"Replication, recombination and repair",recF,"yaaA,recF,gyrB,gyrA",5051,,3938
USA300HOU_RS00025,USA300HOU_0005,DNA gyrase subunit B,5060,+,"Replication, recombination and repair",gyrB,"yaaA,recF,gyrB,gyrA",6995,,5060


In [17]:
# Define start codon position (to account for +/- strands)
DF_gene_info['start_codon_pos'] = [row.start if row.strand == '+' else row.stop for idx,row in DF_gene_info.iterrows()]
DF_gene_info.head()

Unnamed: 0,start,stop,strand,gene_name,length,operon,cog,start_codon_pos
b0001,189,255,+,thrL,66,thrLABC,No COG Annotation,189
b0002,336,2799,+,thrA,2463,thrLABC,No COG Annotation,336
b0003,2800,3733,+,thrB,933,thrLABC,Amino acid transport and metabolism,2800
b0004,3733,5020,+,thrC,1287,thrLABC,Amino acid transport and metabolism,3733
b0005,5233,5530,+,yaaX,297,yaaX,Function unknown,5233


## Generate Binding peaks 

This function assigns bindings peaks for each TF and fins the corresponsing gene targets using the `gene_info.csv` table. 

In [20]:
def annotate_peaks(TF_name, TF_condition, peak_df,margin,gene_info):
    
    res_df = peak_df.copy()
#     TF = peak_df.condition_name[1][:4]
    for i,row in res_df.iterrows():
        pos = row['binding_peak_start']
        # Identify genes within MARGIN nt of binding peak
        close_genes = gene_info[(gene_info.start_codon_pos > pos-margin) 
                                & (gene_info.start_codon_pos < pos+margin)]
        for strand,group in close_genes.groupby('strand'):
            #Remove genes that are completely transcribed before binding peak
            if strand == '+':
                group = group[group.stop > pos]
            else:
                group = group[group.start < pos]

            operon = group.operon.unique()
            # Ensure that we're only identifying one operon on either side of binding peak
#             if len(operon) > 1:
#                 print (operon)

            # Get all genes in operon
            bnums = gene_info[gene_info.operon.isin(operon)].index
            
            ## Add gene information to dataframe
            if strand == '+':
                res_df.loc[i,'TU_p'] = ','.join(operon)
                res_df.loc[i,'genes_p'] = ','.join(bnums)
            else:
                res_df.loc[i,'TU_m'] = ','.join(operon)
                res_df.loc[i,'genes_m'] = ','.join(bnums)
    res_df['index'] = [TF_name +'-' + str(i) for i in range(1,peak_df.shape[0]+1)]
    res_df['condition'] = [ TF_name.lower() + " + " + TF_condition for i in peak_df.condition_name]
#     [peak_df.condition_name[2][:4]+' + '+peak_df.condition_name[1][5:8]]*peak_df.shape[0]
    cols = ['index','condition','binding_peak_start','binding_peak_end',
            'binding_peak_strength','TU_p','genes_p','TU_m','genes_m']
    return res_df.reindex(columns = cols)

## Validate Binding Peaks 

this function validates the accuracy of every gene target from the already indetified gene list for every binding site, identified from the previous function  

In [21]:
def validate2_peak_info(df,gene_info):
#     print(df.shape)
    locusTag = defaultdict(list)
    geneName = defaultdict(list)
    for i,row in df.iterrows():
        BP = row['binding_peak_start']
        idx_name = i
        genes = [row['genes_p'] , row['genes_m']]
        if ((genes[0] == '') & (genes[1] == '')):
            locusTag[idx_name].append('')
            geneName[idx_name].append('')
        for gene in genes:
            if gene == '':
                continue 
            gene_list = gene.split(',')
            for g in gene_list: 
                name = DF_gene_info.loc[g].gene_name 
                strand = DF_gene_info.loc[g].strand 
                start = DF_gene_info.loc[g].start
                stop = DF_gene_info.loc[g].stop
                if ((start > BP) & (stop > BP) & (strand == '+')) | ((start < BP) & (stop < BP) & (strand == '-')):
                        locusTag[idx_name].append(g)
                        geneName[idx_name].append(name)
                elif ((start < BP) & (stop > BP)) | ((start > BP) & (stop < BP)):
                    locusTag[idx_name].append(g)
                    geneName[idx_name].append(name)
                    
    for k, v in locusTag.items():
        if ((len(v) == 1) & (v[0] == '')): 
            locusTag[k] = ''
            continue
        genes = ','.join(locusTag[k])
        locusTag[k] = genes

    for k, v in geneName.items():
        if ((len(v) == 1) & (v[0] == '')): 
            geneName[k] = ''
            continue
        genes = ','.join(geneName[k])
        geneName[k] = genes

    df_complete = df.copy()
#     print(df_complete.shape)
#     print(len(locusTag.values()))
    df_complete['target_locus'] = locusTag.values()
    df_complete['target_genes'] = geneName.values()
    df_final = df_complete.loc[:,['index','condition','binding_peak_start',
                                  'binding_peak_end','binding_peak_strength',
                                  'target_locus','target_genes']]
    return df_final

In [22]:
def validate_peak_info(df,gene_info):
    locusTag = defaultdict(list)
    geneName = defaultdict(list)
    for i,row in df.iterrows():
        BP = row['binding_peak_start']
        idx_name = i
        genes = [row['genes_p'] , row['genes_m']]
        if ((genes[0] == '') & (genes[1] == '')):
            locusTag[idx_name].append('')
            geneName[idx_name].append('')
        for gene in genes:
            if gene == '':
                continue 
            gene_list = gene.split(',')
            for g in gene_list: 
                name = DF_gene_info.loc[g].gene_name 
                strand = DF_gene_info.loc[g].strand 
                start = DF_gene_info.loc[g].start
                stop = DF_gene_info.loc[g].stop
                if ((start > BP) & (stop > BP) & (strand == '+')) | ((start < BP) & (stop < BP) & (strand == '-')):
                        locusTag[idx_name].append(g)
                        geneName[idx_name].append(name)
                elif ((start < BP) & (stop > BP)) | ((start > BP) & (stop < BP)):
                    locusTag[idx_name].append(g)
                    geneName[idx_name].append(name)
                    
    for k, v in locusTag.items():
        if ((len(v) == 1) & (v[0] == '')): 
            locusTag[k] = ''
            continue
        genes = ','.join(locusTag[k])
        locusTag[k] = genes

    for k, v in geneName.items():
        if ((len(v) == 1) & (v[0] == '')): 
            geneName[k] = ''
            continue
        genes = ','.join(geneName[k])
        geneName[k] = genes

    df_complete = df.loc[:,['index','condition','binding_peak_start',
                            'binding_peak_end','binding_peak_strength']]
#     print(len(locusTag.values()))
    df_complete['target_locus'] = locusTag.values()
    df_complete['target_genes'] = geneName.values()
    return df_complete

# Venn Diagrams

Make sure to update the list of `TF_names` as Ye adds more gff files into the dropbox

In [18]:
TRN_data = defaultdict(list)
TF_name = TF_list

for i in TF_name: 
    TRN_data[i] = [x for x in  list(precise2_TRN.gene_name[precise2_TRN.index == i])  if  str(x) != 'nan']

In [19]:
TRN_data

defaultdict(list,
            {'YgbI': ['ygbI'],
             'YdcR': ['atpI',
              'yicI',
              'dinD',
              'yhhI',
              'yhhT',
              'aaeR',
              'glcA',
              'ygeH',
              'oxc',
              'cbl',
              'yobF',
              'ydjF',
              'ldtE',
              'ydfJ',
              'ydcC',
              'ydcR',
              'ynaE',
              'ylaC',
              'yjhF'],
             'YdcI': ['iraP',
              'gltP',
              'yicG',
              'lldP',
              'yhdV',
              'gutM',
              'yojI',
              'yobA',
              'yobF',
              'dtpA',
              'trkG',
              'wrbA',
              'ybhI',
              'sdhC',
              'gltA',
              'tomB',
              'ddlA',
              'nhaA',
              'isrC',
              'flu',
              'smrB'],
             'YgaV': ['aslB', 'yqcC', 'yqaB', 'ygaV', 'y

In [51]:
staph_TRN_data = defaultdict(list)
TF_name = TF_list

for i in TF_name: 
    staph_TRN_data[i] = [x for x in  list(staph_TRN.gene_name[staph_TRN.index == i])  if  str(x) != 'nan']

In [53]:
def Venn_data_gen(Peak_DF): 
    TF_name = Peak_DF['index'][1][:4]
    gene_list = [i for i in Peak_DF.target_genes if i != '']
    chip_data= ','.join(list(gene_list)).split(',')
    reg_data = staph_TRN_data[TF_name]
    all_genes = [i  for i in chip_data if i not in reg_data] + reg_data
    
    
    TF = TF_name
    reg_genes=reg_data
    reg_only = []
    chip_genes=chip_data
    chip_only = []
    shared_genes=[]
    for i in all_genes: 
        if (i in reg_data) & (i not in chip_data):
            reg_only.append(i)
        elif (i in chip_data) & (i not in reg_data):
            chip_only.append(i)
        elif (i in chip_data) & (i in reg_data):
            shared_genes.append(i)
            
    values = [TF,
          len(reg_genes),
          len(reg_only),
          len(chip_genes),
          len(chip_only),
          len(shared_genes),
          len(all_genes)]
    
    index_name = ['TF',
              'reg_genes',
              'reg_only',
              'chip_genes',
              'chip_only',
              'shared_genes',
              'all_genes']
    
    genes = ['; '.join(precise2_TRN.source[precise2_TRN.index == TF_name].unique()),
         reg_genes,
         reg_only,
         chip_genes,
         chip_only,
         shared_genes,
         all_genes]
    
    same1 = defaultdict(list)
    for i in range(0,len(index_name)):
        same1[index_name[i]].append(values[i])

    finall = pd.DataFrame.from_dict(same1, orient='index', columns = ['value'])
    finall['list'] = genes
    for i, row in finall.iterrows(): 
        if row.value == 0:
            finall.list[i] = ''
    return finall

# Generate E. coli Data 

In [27]:
#Test 
i = 8
peak_annot_DF = annotate_peaks(TF_list[i], TF_condition[i],database[i],500,DF_gene_info).fillna('')
final_annot_DF = validate2_peak_info(peak_annot_DF,DF_gene_info)
venn_files = Venn_data_gen(final_annot_DF)
final_annot_DF.to_json(out_data_table+TF_list[i]+TF_condition[i]+'_binding_table.json',orient='records')
venn_files.to_json(path.join(out_data_venn, TF_list[i]+TF_condition[i]+'_venn.json'),orient='records')

In [28]:
x = list(range (0, 5)) + list(range (6, 18))+ list(range (19, 42)) + list(range (43, 51))

In [29]:
for i in x: 
    peak_annot_DF = annotate_peaks(TF_list[i], TF_condition[i],database[i],500,DF_gene_info).fillna('')
    final_annot_DF = validate2_peak_info(peak_annot_DF,DF_gene_info)
    venn_files = Venn_data_gen(final_annot_DF)
    final_annot_DF.to_json(out_data_table+TF_list[i]+'_'+TF_condition[i]+'_binding_table.json',orient='records')
    venn_files.to_json(path.join(out_data_venn, TF_list[i]+'_'+TF_condition[i]+'_venn.json'),orient='records')

# Generate S. aureus Data (Forthcoming)

In [58]:
peak_annot_DF = annotate_peaks(TF_list[0], TF_condition[0],database[0],500,staph_gene_info).fillna('')
final_annot_DF = validate2_peak_info(peak_annot_DF,staph_gene_info)
venn_files = Venn_data_gen(final_annot_DF)
final_annot_DF.to_json(out_data_table+TF_list[0]+TF_condition[0]+'_binding_table.json',orient='records')
venn_files.to_json(path.join(out_data_venn, TF_list[0]+'_'+TF_condition[0]+'_venn.json'),orient='records')

TypeError: sequence item 1: expected str instance, float found