# Data Generation for Plots Notebook

This notebook generates data used to contruct `binding peak` tables and `venn diagrams` for ChIP-pro, as well as other  visuals. But before you run the notebook, make sure of the following: 

> Make sure to rename TF in the gff file to match the TF convention and not gene convension.

In [1]:
import numpy as np 
import pandas as pd
import glob
import urllib                      
import gzip
from collections import defaultdict

pd.set_option('display.max_columns', 500)

# Functions

## Generate Binding peaks 

This function assigns bindings peaks for each TF and fins the corresponsing gene targets using the `gene_info.csv` table. 

In [2]:
def annotate_peaks(TF_name, TF_condition, peak_df,margin,gene_info):
    
    res_df = peak_df.copy()
#     TF = peak_df.condition_name[1][:4]
    for i,row in res_df.iterrows():
        pos = row['binding_peak_start']
        # Identify genes within MARGIN nt of binding peak
        close_genes = gene_info[(gene_info.start_codon_pos > pos-margin) 
                                & (gene_info.start_codon_pos < pos+margin)]
        for strand,group in close_genes.groupby('strand'):
            #Remove genes that are completely transcribed before binding peak
            if strand == '+':
                group = group[group.stop > pos]
            else:
                group = group[group.start < pos]

            operon = group.operon.unique()
            # Ensure that we're only identifying one operon on either side of binding peak
#             if len(operon) > 1:
#                 print (operon)

            # Get all genes in operon
            bnums = gene_info[gene_info.operon.isin(operon)].index
            
            ## Add gene information to dataframe
            if strand == '+':
                res_df.loc[i,'TU_p'] = ','.join(operon)
                res_df.loc[i,'genes_p'] = ','.join(bnums)
            else:
                res_df.loc[i,'TU_m'] = ','.join(operon)
                res_df.loc[i,'genes_m'] = ','.join(bnums)
    res_df['index'] = [TF_name +'-' + str(i) for i in range(1,peak_df.shape[0]+1)]
    res_df['condition'] = [ TF_name.lower() + " + " + TF_condition for i in peak_df.condition_name]
#     [peak_df.condition_name[2][:4]+' + '+peak_df.condition_name[1][5:8]]*peak_df.shape[0]
    cols = ['index','condition','binding_peak_start','binding_peak_end',
            'binding_peak_strength','TU_p','genes_p','TU_m','genes_m']
    return res_df.reindex(columns = cols)

## Validate Binding Peaks 

this function validates the accuracy of every gene target from the already indetified gene list for every binding site, identified from the previous function  

In [3]:
def validate_peak_info(df,gene_info):
    locusTag = defaultdict(list)
    geneName = defaultdict(list)
    for i,row in df.iterrows():
        BP = row['binding_peak_start']
        idx_name = i
        genes = [row['genes_p'] , row['genes_m']]
        if ((genes[0] == '' or str(genes[0]) == 'nan') & (genes[1] == '' or str(genes[1]) == 'nan')):
            locusTag[idx_name].append('')
            geneName[idx_name].append('')
        for gene in genes:
            if gene == '' or str(gene) == 'nan':
                continue 
            gene_list = gene.split(',')
            for g in gene_list: 
                name = DF_gene_info.loc[g].gene_name 
                strand = DF_gene_info.loc[g].strand 
                start = DF_gene_info.loc[g].start
                stop = DF_gene_info.loc[g].stop
                if ((start > BP) & (stop > BP) & (strand == '+')) | ((start < BP) & (stop < BP) & (strand == '-')):
                        locusTag[idx_name].append(g)
                        geneName[idx_name].append(name)
                elif ((start < BP) & (stop > BP)) | ((start > BP) & (stop < BP)):
                    locusTag[idx_name].append(g)
                    geneName[idx_name].append(name)
                    
    for k, v in locusTag.items():
        if ((len(v) == 1) & (v[0] == '')): 
            locusTag[k] = ''
            continue
        genes = ','.join(locusTag[k])
        locusTag[k] = genes

    for k, v in geneName.items():
        if ((len(v) == 1) & (v[0] == '')): 
            geneName[k] = ''
            continue
        genes = ','.join(geneName[k])
        geneName[k] = genes

    df_complete = df.loc[:,['index','condition','binding_peak_start',
                            'binding_peak_end','binding_peak_strength']]
    df_complete['target_locus'] = locusTag.values()
    df_complete['target_genes'] = geneName.values()
    return df_complete

# Venn Diagrams

Make sure to update the list of `TF_names` as Ye adds more gff files into the dropbox

In [4]:
def Venn_data_gen(Peak_DF): 
    TF_name = Peak_DF['index'][1][:4]
    gene_list = [i for i in Peak_DF.target_genes if i != '']
    chip_data= ','.join(list(gene_list)).split(',')
    reg_data = TRN_data[TF_name]
    all_genes = [i  for i in chip_data if i not in reg_data] + reg_data
    
    
    TF = TF_name
    reg_genes=reg_data
    reg_only = []
    chip_genes=chip_data
    chip_only = []
    shared_genes=[]
    for i in all_genes: 
        if (i in reg_data) & (i not in chip_data):
            reg_only.append(i)
        elif (i in chip_data) & (i not in reg_data):
            chip_only.append(i)
        elif (i in chip_data) & (i in reg_data):
            shared_genes.append(i)
            
    values = [TF,
          len(reg_genes),
          len(reg_only),
          len(chip_genes),
          len(chip_only),
          len(shared_genes),
          len(all_genes)]
    
    index_name = ['TF',
              'reg_genes',
              'reg_only',
              'chip_genes',
              'chip_only',
              'shared_genes',
              'all_genes']
    
    genes = ['; '.join(precise2_TRN.source[precise2_TRN.index == TF_name].unique()),
         reg_genes,
         reg_only,
         chip_genes,
         chip_only,
         shared_genes,
         all_genes]
    
    same1 = defaultdict(list)
    for i in range(0,len(index_name)):
        same1[index_name[i]].append(values[i])

    finall = pd.DataFrame.from_dict(same1, orient='index', columns = ['value'])
    finall['list'] = genes
    for i, row in finall.iterrows(): 
        if row.value == 0:
            finall.list[i] = ''
    return finall

# Binding Width Histograms

In [5]:
def binding_width_gen(TF, peak_df,out_dir): 
    widths = pd.DataFrame(peak_df.binding_peak_end - peak_df.binding_peak_start + 1, columns=["binding_width"])
    widths.to_csv(out_dir+TF+'_widths.csv')

# Peak Position Scatter

In [6]:
def peak_position_gen(TF, final_annot_DF,out_dir): 
    peak_scatter_df = pd.DataFrame(columns=final_annot_DF.columns.to_list()+['gene'] +['normalized_dist'])
    counter=0

    for i in range(len(final_annot_DF)):
        peak = final_annot_DF.loc[i+1,:]
        peak_center = np.mean([peak.binding_peak_start, peak.binding_peak_end])

        for gene in peak.target_genes.split(","):
            gene_info = DF_gene_info[DF_gene_info.gene_name == gene]
            try:
                if gene_info.strand[0] == "-":
                    dist = float((gene_info.stop - peak_center)/gene_info.length)
                else:
                    dist = float((peak_center - gene_info.start)/gene_info.length)
                peak_scatter_df.loc[counter] = peak.to_list()+[gene, dist]
                counter+=1
            except:
                continue

    peak_scatter_df['binding_peak_center'] = peak_scatter_df.binding_peak_start/2 + peak_scatter_df.binding_peak_end/2
    peak_scatter_df.to_csv(out_dir+TF+'_positions.csv')

# E. coli

In [65]:
org_folder = "../data/e_coli/"

In [66]:
TF_list = pd.read_csv(org_folder+'TF_list.csv', index_col=0)

TF_list

Unnamed: 0_level_0,TF,Organism,Strain,Media,Supplement,genome_id,organism_id,num_binding_sites,num_samples,num_cond,project,binding tables,BWcond1_1_name,BWcond1_1,BWcond1_2_name,BWcond1_2,BWcond2_1_name,BWcond2_1,BWcond2_2_name,BWcond2_2,BWcond3_1_name,BWcond3_1,BWcond3_2_name,BWcond3_2,BWcond4_1_name,BWcond4_1,BWcond4_2_name,BWcond4_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
0,BaeR,Escherichia coli,K-12 MG1655,LB,EtOH,NC_000913_3,e_coli,,4,1,TCS,EtOH|baer_EtOH_binding_table.json,bio-rep1 – R1,BaeR_R1_S31_R1.bw,bio-rep1 – R2,BaeR_R1_S31_R2.bw,bio-rep2 – R1,BaerR_R2_S32_R1.bw,bio-rep2 – R2,BaerR_R2_S32_R2.bw,,,,,,,,
1,CpxR,Escherichia coli,K-12 MG1655,LB,EtOH,NC_000913_3,e_coli,,2,1,TCS,EtOH|cpxr_EtOH_binding_table.json,R1,CpxRR1_S1_R1.bw,R2,CpxRR2_S2_R2.bw,,,,,,,,,,,,
2,Cra,Escherichia coli,K-12 MG1655,M9,"Fructose, Galactose, or Acetate",NC_000913_3,e_coli,,8,4,known_TF,M9|cra_M9_binding_table.json;acetate|cra_aceta...,Glu – R1,cra_glu_1.bw,Glu – R2,cra_glu_2.bw,Fru – R1,cra_fru_1.bw,Fru – R2,cra_fru_2.bw,Gal – R1,cra_gal_1.bw,Gal – R2,cra_gal_2.bw,Ace – R1,cra_ace_1.bw,Ace – R2,cra_ace_2.bw
3,Fur,Escherichia coli,K-12 MG1655,M9,Fe or DPD,NC_000913_3,e_coli,,4,2,known_TF,dpd|fur_dpd_binding_table.json;fe|fur_fe_bindi...,Fe – R1,fur_fe_1.bw,Fe – R2,fur_fe_2.bw,DPD – R1,fur_dpd_1.bw,DPD – R2,fur_dpd_2.bw,,,,,,,,
4,GadE,Escherichia coli,K-12 MG1655,M9,,NC_000913_3,e_coli,,2,1,known_TF,M9|gade_M9_binding_table.json,R1,gade_1.bw,R2,gade_2.bw,,,,,,,,,,,,
5,GadW,Escherichia coli,K-12 MG1655,M9,,NC_000913_3,e_coli,,2,1,known_TF,M9|gadw_M9_binding_table.json,R1,gadw_1.bw,R2,gadw_2.bw,,,,,,,,,,,,
6,GadX,Escherichia coli,K-12 MG1655,M9,,NC_000913_3,e_coli,,2,1,known_TF,M9|gadx_M9_binding_table.json,R1,gadx_1.bw,R2,gadx_2.bw,,,,,,,,,,,,
7,KdpE,Escherichia coli,K-12 MG1655,TMA,KCl,NC_000913_3,e_coli,,2,1,TCS,KCl|kdpe_KCl_binding_table.json,R1,KdpeE_R2_S34_R1.bw,R2,KdpeE_R2_S34_R2.bw,,,,,,,,,,,,
8,PhoB,Escherichia coli,K-12 MG1655,M9P,,NC_000913_3,e_coli,,4,1,TCS,M9P|phob_M9P_binding_table.json,bio-rep1 – R1,PhoB_1_S16_R1.bw,bio-rep1 – R2,PhoB_1_S16_R2.bw,bio-rep2 – R1,PhoB_2_S6_R1.bw,bio-rep2 – R2,PhoB_2_S6_R2.bw,,,,,,,,
9,RpoB,Escherichia coli,K-12 MG1655,M9,Fe or DPD,NC_000913_3,e_coli,,4,2,known_TF,dpd|rpob_dpd_binding_table.json;fe|rpob_fe_bin...,Fe – R1,rpob_fe1.bw,Fe – R2,rpob_fe2.bw,DPD – R1,rpob_dpd1.bw,DPD – R2,rpob_dpd2.bw,,,,,,,,


In [67]:
strain = 'NC_000913_3'
DF_gene_info = pd.read_csv(org_folder+strain+'/annotation/gene_info.csv',index_col=0)
DF_gene_info['start_codon_pos'] = [row.start if row.strand == '+' else row.stop for idx,row in DF_gene_info.iterrows()]
TRN = pd.read_csv(org_folder+strain+'/annotation/trn.csv',index_col=0)

DF_gene_info

Unnamed: 0,start,stop,strand,gene_name,length,operon,cog,start_codon_pos
b0001,189,255,+,thrL,66,thrLABC,No COG Annotation,189
b0002,336,2799,+,thrA,2463,thrLABC,No COG Annotation,336
b0003,2800,3733,+,thrB,933,thrLABC,Amino acid transport and metabolism,2800
b0004,3733,5020,+,thrC,1287,thrLABC,Amino acid transport and metabolism,3733
b0005,5233,5530,+,yaaX,297,yaaX,Function unknown,5233
...,...,...,...,...,...,...,...,...
b4399,4636695,4638120,+,creC,1425,creABCD,Signal transduction mechanisms,4636695
b4400,4638177,4639530,+,creD,1353,creABCD,Defense mechanisms,4638177
b4401,4639589,4640306,-,arcA,717,arcA,Signal transduction mechanisms,4640306
b4402,4640401,4640542,+,yjjY,141,yjjY,Function unknown,4640401


In [75]:
strain = 'NC_000913_3'
genes = pd.read_csv(org_folder+strain+'/annotation/genes.gff',index_col=0,
                                 delimiter='\t', header=None, 
                                 names = ['condition', 'ref', 'condition_name', 
                                          "start",'stop', 
                                          '.', 'strand', '-','details'])

genes

Unnamed: 0_level_0,ref,condition_name,start,stop,.,strand,-,details
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NC_000913.3,RefSeq,CDS,190,255,.,+,0,gene thrL;locus_tag b0001;product thr_operon_l...
NC_000913.3,RefSeq,CDS,337,2799,.,+,0,gene thrA;locus_tag b0002;product fused_aspart...
NC_000913.3,RefSeq,CDS,2801,3733,.,+,0,gene thrB;locus_tag b0003;product homoserine_k...
NC_000913.3,RefSeq,CDS,3734,5020,.,+,0,gene thrC;locus_tag b0004;product threonine_sy...
NC_000913.3,RefSeq,CDS,5234,5530,.,+,0,gene yaaX;locus_tag b0005;product DUF2502_doma...
...,...,...,...,...,...,...,...,...
NC_000913.3,RefSeq,CDS,4636696,4638120,.,+,0,gene creC;locus_tag b4399;product sensory_hist...
NC_000913.3,RefSeq,CDS,4638178,4639530,.,+,0,gene creD;locus_tag b4400;product putative_inn...
NC_000913.3,RefSeq,CDS,4639590,4640306,.,-,0,gene arcA;locus_tag b4401;product DNA-binding_...
NC_000913.3,RefSeq,CDS,4640402,4640542,.,+,0,gene yjjY;locus_tag b4402;product uncharacteri...


In [26]:
for TF_index in range(len(TF_list)):

    TF_name= TF_list.TF[TF_index].lower()
    strain = TF_list.genome_id[TF_index]

    #find files and format
    curated_loc = glob.glob(org_folder+strain+"/curated_input/"+TF_name+"_*.gff")
    curated_loc.sort()
    curated_files = [i.split("/")[-1] for i in curated_loc]
    curated_cond = [i.split("_")[1] for i in curated_files]
    table_string =";".join([curated_cond[i]+"|"+TF_name+"_"+curated_cond[i]+'_binding_table.json' for i in range(len(curated_loc))])
    if table_string!='':
        TF_list.loc[TF_index, 'binding tables'] = table_string
    else:
        TF_list.loc[TF_index, 'binding tables'] = "NA"
    TF_list.loc[TF_index, 'num_cond'] = max(1,len(curated_loc))
    
    #write plot data files to folders
#     for file_index in range(len(curated_files)):
#         try:
#             file = curated_files[file_index]
#             df = pd.read_csv(org_folder+strain+"/curated_input/"+file,index_col=0, 
#                                      delimiter='\t', header=None, 
#                                      names = ['ref','condition', 'condition_name', 
#                                               "binding_peak_start",'binding_peak_end', 
#                                               'binding_peak_strength', 'direction', '.','ID'])
#             df = df.set_index(pd.Series(range(1,len(df)+1)))
#             peak_annot_DF = annotate_peaks(TF_list.TF[TF_index], curated_cond[file_index], df, 500, DF_gene_info)
#             final_annot_DF = validate_peak_info(peak_annot_DF,DF_gene_info)
#             final_annot_DF.to_json(org_folder+strain+"/table/"+TF_name+"_"+curated_cond[file_index]+'_binding_table.json',orient='records')
#             binding_width_gen(TF_list.TF[TF_index], df, org_folder+strain+"/binding_widths/")
#             peak_position_gen(TF_list.TF[TF_index], final_annot_DF, org_folder+strain+"/positions/")
#         except:
#             print(TF_name, file)
#             continue

In [27]:
TF_list.to_csv(org_folder+'TF_list.csv')

In [None]:
for i in range (0, len(database)): 
    peak_annot_DF = annotate_peaks(TF_list[i], TF_condition[i],database[i],500,DF_gene_info).fillna('')
    final_annot_DF = validate_peak_info(peak_annot_DF,DF_gene_info)
    venn_files = Venn_data_gen(final_annot_DF)
    final_annot_DF.to_json(out_data_table+TF_list[i]+'_binding_table.json',orient='records')
    venn_files.to_json(path.join(out_data_venn, TF_list[i]+'_venn.json'),orient='records')
    binding_width_gen(TF_list[i], database[i], '../../data/e_coli/NC_000913_3/binding_widths/')
    peak_position_gen(TF_list[i], final_annot_DF, '../../data/e_coli/NC_000913_3/positions/')

In [None]:
file = 'phob_M9P_curated.gff'
df = pd.read_csv(org_folder+strain+"/curated_input/"+file,index_col=0, 
                                 delimiter='\t', header=None, 
                                 names = ['ref','condition', 'condition_name', 
                                          "binding_peak_start",'binding_peak_end', 
                                          'binding_peak_strength', 'direction', '.','ID'])
df = df.set_index(pd.Series(range(1,len(df)+1)))
df.head()

# Other Organisms

In [49]:
org_folder = "../data/all_other/"

In [50]:
TF_list = pd.read_csv(org_folder+'TF_list.csv', index_col=0)

TF_list

Unnamed: 0_level_0,TF,Organism,Strain,Media,Supplement,genome_id,organism_id,num_binding_sites,num_samples,num_cond,project,binding tables,BWcond1_1_name,BWcond1_1,BWcond1_2_name,BWcond1_2,BWcond2_1_name,BWcond2_1,BWcond2_2_name,BWcond2_2,BWcond3_1_name,BWcond3_1,BWcond3_2_name,BWcond3_2,BWcond4_1_name,BWcond4_1,BWcond4_2_name,BWcond4_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
0,Fur,Klebsiella pneumoniae,MGH 78578,,,CP_000647_1,k_pneumoniae,,2,1,MGH78578,,R1,klebfurfinal1.bw,R2,klebfurfinal2.bw,,,,,,,,,,,,
0,Fur,Pseudomonas putida,KT2440,,,NC_002947_4,p_putida,,1,1,2457T,,R1,putidafurfinal1_202003.bw,,,,,,,,,,,,,,
1,Sigma D,Pseudomonas putida,KT2440,,,NC_002947_4,p_putida,,2,1,2457T,,R1,putidaSigmaD_202003.bw,R2,putidaSigmaD_202005.bw,,,,,,,,,,,,
0,Fur,Salmonella enterica,Typhimurium LT2,,Fe,NC_003197_2,s_enterica,,1,1,LT2,,,,R2,Salmonella_Fur_Fe_2.bw,,,,,,,,,,,,
1,YdcI,Salmonella enterica,Typhimurium LT2,,,NC_003197_2,s_enterica,,6,3,LT2,,MES5 – R1,salmone_ydcI_MES5_1.bw,MES5 – R2,salmone_ydcI_MES5_2.bw,MES5 – R3,salmone_ydcI_MES5_3.bw,MES5 – R4,salmone_ydcI_MES5_4.bw,ph5 – R1,salmone_ydcI_ph55_1.bw,ph5 – R2,salmone_ydcI_ph55_2.bw,,,,
0,Fur,Shigella flexneri,"2a, 2457T",,,AE_014073_1,s_flexneri,,2,1,2457T,,R1,shigella_fur_1.bw,R2,shigella_fur_2.bw,,,,,,,,,,,,
0,CodY,Staphylococcus aureus,LAC chromosome,,,CP_035369_1,s_aureus,,2,1,LAC,,R1,StaphLAC_new_cody1.bw,R2,StaphLAC_new_cody2.bw,,,,,,,,,,,,
1,SigS,Staphylococcus aureus,USA300_TCH1516,,,NC_010079_1,s_aureus,,8,4,USA300_TCH1516,,M1 – R1,StaphTCH_sigs_M1.bw,M1 – R2,Staph_SigS_M1_R2.bw,M2 – R1,StaphTCH_sigs_M2.bw,M2 – R2,Staph_SigS_M2_R2.bw,M3 – R1,StaphTCH_sigs_M3.bw,M3 – R2,Staph_SigS_M3_R2.bw,M4 – R1,StaphTCH_sigs_M4.bw,M4 – R2,Staph_SigS_M4_R2.bw
2,VraR,Staphylococcus aureus,USA300_TCH1516,,,NC_010079_1,s_aureus,,2,1,USA300_TCH1516,,R1,StaphTCH_vraR_7H7C11purified.bw,R2,StaphTCH_vraR_13F12F6purified.bw,,,,,,,,,,,,
0,Fur,Yersinia pseudotuberculosis,IP 32953,,,NZ_CP009712_1,y_pseudotuberculosis,,2,1,2457T,,R1,yersifurfinal1.bw,R2,yersifurfinal2.bw,,,,,,,,,,,,


In [None]:
for TF_index in range(len(TF_list)):

    TF_name= TF_list.TF.iloc[TF_index].lower()
    org_id = TF_list.organism_id.iloc[TF_index]
    genome_id = TF_list.genome_id.iloc[TF_index]

    #find files and format
    curated_loc = glob.glob("../data/"+org_id+"/"+genome_id+"/"+"/curated_input/"+TF_name+"_*.gff")
    curated_loc.sort()
    curated_files = [i.split("/")[-1] for i in curated_loc]
    curated_cond = [i.split("_")[1] for i in curated_files]
    table_string =";".join([curated_cond[i]+"|"+TF_name+"_"+curated_cond[i]+'_binding_table.json' for i in range(len(curated_loc))])
    if table_string!='':
        TF_list['binding tables'].iloc[TF_index] = table_string
    else:
        TF_list['binding tables'].iloc[TF_index] = "NA"
    TF_list['num_cond'].iloc[TF_index] = max(1,len(curated_loc))
    
#     write plot data files to folders
    for file_index in range(len(curated_files)):
        try:
            file = curated_files[file_index]
            df = pd.read_csv("../data/"+org_id+"/"+genome_id+"/"+"/curated_input/"+file,index_col=0, 
                                     delimiter='\t', header=None, 
                                     names = ['ref','condition', 'condition_name', 
                                              "binding_peak_start",'binding_peak_end', 
                                              'binding_peak_strength', 'direction', '.','ID'])
            df = df.set_index(pd.Series(range(1,len(df)+1)))
            peak_annot_DF = annotate_peaks(TF_list.TF[TF_index], curated_cond[file_index], df, 500, DF_gene_info)
            final_annot_DF = validate_peak_info(peak_annot_DF,DF_gene_info)
            final_annot_DF.to_json(org_folder+strain+"/table/"+TF_name+"_"+curated_cond[file_index]+'_binding_table.json',orient='records')
            binding_width_gen(TF_list.TF[TF_index], df, org_folder+strain+"/binding_widths/")
            peak_position_gen(TF_list.TF[TF_index], final_annot_DF, org_folder+strain+"/positions/")
        except:
            print(TF_name, file)
            continue

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, val

rpob rpob_dpd_curated.gff
rpob rpob_fe_curated.gff


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, val

yiag yiag_M9_curated.gff


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [62]:
TF_list

Unnamed: 0_level_0,TF,Organism,Strain,Media,Supplement,genome_id,organism_id,num_binding_sites,num_samples,num_cond,project,binding tables,BWcond1_1_name,BWcond1_1,BWcond1_2_name,BWcond1_2,BWcond2_1_name,BWcond2_1,BWcond2_2_name,BWcond2_2,BWcond3_1_name,BWcond3_1,BWcond3_2_name,BWcond3_2,BWcond4_1_name,BWcond4_1,BWcond4_2_name,BWcond4_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
0,Fur,Klebsiella pneumoniae,MGH 78578,,,CP_000647_1,k_pneumoniae,,2,1,MGH78578,,R1,klebfurfinal1.bw,R2,klebfurfinal2.bw,,,,,,,,,,,,
0,Fur,Pseudomonas putida,KT2440,,,NC_002947_4,p_putida,,1,1,2457T,,R1,putidafurfinal1_202003.bw,,,,,,,,,,,,,,
1,Sigma D,Pseudomonas putida,KT2440,,,NC_002947_4,p_putida,,2,1,2457T,M9|sigma d_M9_binding_table.json,R1,putidaSigmaD_202003.bw,R2,putidaSigmaD_202005.bw,,,,,,,,,,,,
0,Fur,Salmonella enterica,Typhimurium LT2,,Fe,NC_003197_2,s_enterica,,1,1,LT2,,,,R2,Salmonella_Fur_Fe_2.bw,,,,,,,,,,,,
1,YdcI,Salmonella enterica,Typhimurium LT2,,,NC_003197_2,s_enterica,,6,3,LT2,MES2|ydci_MES2_binding_table.json;MES5|ydci_ME...,MES5 – R1,salmone_ydcI_MES5_1.bw,MES5 – R2,salmone_ydcI_MES5_2.bw,MES5 – R3,salmone_ydcI_MES5_3.bw,MES5 – R4,salmone_ydcI_MES5_4.bw,ph5 – R1,salmone_ydcI_ph55_1.bw,ph5 – R2,salmone_ydcI_ph55_2.bw,,,,
0,Fur,Shigella flexneri,"2a, 2457T",,,AE_014073_1,s_flexneri,,2,1,2457T,,R1,shigella_fur_1.bw,R2,shigella_fur_2.bw,,,,,,,,,,,,
0,CodY,Staphylococcus aureus,LAC chromosome,,,CP_035369_1,s_aureus,,2,1,LAC,M9|cody_M9_binding_table.json,R1,StaphLAC_new_cody1.bw,R2,StaphLAC_new_cody2.bw,,,,,,,,,,,,
1,SigS,Staphylococcus aureus,USA300_TCH1516,,,NC_010079_1,s_aureus,,8,1,USA300_TCH1516,,M1 – R1,StaphTCH_sigs_M1.bw,M1 – R2,Staph_SigS_M1_R2.bw,M2 – R1,StaphTCH_sigs_M2.bw,M2 – R2,Staph_SigS_M2_R2.bw,M3 – R1,StaphTCH_sigs_M3.bw,M3 – R2,Staph_SigS_M3_R2.bw,M4 – R1,StaphTCH_sigs_M4.bw,M4 – R2,Staph_SigS_M4_R2.bw
2,VraR,Staphylococcus aureus,USA300_TCH1516,,,NC_010079_1,s_aureus,,2,1,USA300_TCH1516,,R1,StaphTCH_vraR_7H7C11purified.bw,R2,StaphTCH_vraR_13F12F6purified.bw,,,,,,,,,,,,
0,Fur,Yersinia pseudotuberculosis,IP 32953,,,NZ_CP009712_1,y_pseudotuberculosis,,2,1,2457T,,R1,yersifurfinal1.bw,R2,yersifurfinal2.bw,,,,,,,,,,,,


In [63]:
TF_list.to_csv(org_folder+'TF_list.csv')