# Data Generation for Plots Notebook

This notebook generates data used to contruct `binding peak` tables and `venn diagrams` for ChIP-pro, as well as other  visuals. But before you run the notebook, make sure of the following: 

> Make sure to rename TF in the gff file to match the TF convention and not gene convension.

In [1]:
import numpy as np 
import pandas as pd
import glob
import urllib                      
import gzip
from collections import defaultdict
from zipfile import ZipFile

pd.set_option('display.max_columns', 500)

# Functions

## Generate Binding peaks 

This function assigns bindings peaks for each TF and fins the corresponsing gene targets using the `gene_info.csv` table. 

In [2]:
def annotate_peaks(TF_name, TF_condition, peak_df,margin,gene_info):
    
    res_df = peak_df.copy()
#     TF = peak_df.condition_name[1][:4]
    for i,row in res_df.iterrows():
        pos = row['binding_peak_start']
        # Identify genes within MARGIN nt of binding peak
        close_genes = gene_info[(gene_info.start_codon_pos > pos-margin) 
                                & (gene_info.start_codon_pos < pos+margin)]
        for strand,group in close_genes.groupby('strand'):
            #Remove genes that are completely transcribed before binding peak
            if strand == '+':
                group = group[group.stop > pos]
            else:
                group = group[group.start < pos]

            operon = group.operon.unique()
            # Ensure that we're only identifying one operon on either side of binding peak
#             if len(operon) > 1:
#                 print (operon)

            # Get all genes in operon
            bnums = gene_info[gene_info.operon.isin(operon)].index
            
            ## Add gene information to dataframe
            if strand == '+':
                res_df.loc[i,'TU_p'] = ','.join(operon)
                res_df.loc[i,'genes_p'] = ','.join(bnums)
            else:
                res_df.loc[i,'TU_m'] = ','.join(operon)
                res_df.loc[i,'genes_m'] = ','.join(bnums)
    res_df['index'] = [TF_name +'-' + TF_condition + "-" + str(i) for i in range(1,peak_df.shape[0]+1)]
    res_df['condition'] = [ TF_name.lower() + " + " + TF_condition for i in peak_df.condition_name]
#     [peak_df.condition_name[2][:4]+' + '+peak_df.condition_name[1][5:8]]*peak_df.shape[0]
    cols = ['index','condition','binding_peak_start','binding_peak_end',
            'binding_peak_strength','TU_p','genes_p','TU_m','genes_m']
    return res_df.reindex(columns = cols)

## Validate Binding Peaks 

this function validates the accuracy of every gene target from the already indetified gene list for every binding site, identified from the previous function  

In [3]:
# helper for closest gene

def closest_locus_nc000913(peak_start, peak_stop, gene_info):
    df_pos = gene_info[gene_info.strand=="+"]
    df_pos = df_pos[df_pos.stop >= peak_start]
    try:
        gene_pos = df_pos.iloc[0,:]['gene_name']
        dist_pos = abs(peak_stop - df_pos.iloc[0,:]['start'])
    except:
        dist_pos = np.inf
    
    df_neg = gene_info[gene_info.strand=="-"]
    df_neg = df_neg[df_neg.start <= peak_stop]
    try:
        gene_neg = df_neg.iloc[-1,:]['gene_name']
        dist_neg = abs(peak_start - df_neg.iloc[-1,:]['stop'])
    except:
        dist_neg = np.inf
    
    if dist_pos <= dist_neg:
        return(gene_pos)
    else:
        return(gene_neg)

In [4]:
def validate_peak_info(df,gene_info):
    locusTag = defaultdict(list)
    geneName = defaultdict(list)
    for i,row in df.iterrows():
        BP = row['binding_peak_start']
        idx_name = i
        genes = [row['genes_p'] , row['genes_m']]
        if ((genes[0] == '' or str(genes[0]) == 'nan') & (genes[1] == '' or str(genes[1]) == 'nan')):
            locusTag[idx_name].append('')
            geneName[idx_name].append('')
        for gene in genes:
            if gene == '' or str(gene) == 'nan':
                continue 
            gene_list = gene.split(',')
            for g in gene_list: 
                name = DF_gene_info.loc[g].gene_name 
                strand = DF_gene_info.loc[g].strand 
                start = DF_gene_info.loc[g].start
                stop = DF_gene_info.loc[g].stop
                if ((start >= BP) & (stop >= BP) & (strand == '+')) | ((start <= BP) & (stop <= BP) & (strand == '-')):
                        locusTag[idx_name].append(g)
                        geneName[idx_name].append(name)
                elif ((start <= BP) & (stop >= BP)) | ((start >= BP) & (stop <= BP)):
                    locusTag[idx_name].append(g)
                    geneName[idx_name].append(name)
    
    for k, v in locusTag.items():
        if ((len(v) == 1) & (v[0] == '')): 
            locusTag[k] = ''
            continue
        genes = ','.join(locusTag[k])
        locusTag[k] = genes

    for k, v in geneName.items():
        if ((len(v) == 1) & (v[0] == '')): 
            geneName[k] = ''
            continue
        genes = ','.join(geneName[k])
        geneName[k] = genes

    df_complete = df.loc[:,['index','condition','binding_peak_start',
                            'binding_peak_end','binding_peak_strength']]
    df_complete['target_locus'] = locusTag.values()
    df_complete['target_genes'] = geneName.values()
    df_complete['closest_gene'] = [closest_locus_nc000913(df_complete.binding_peak_start.iloc[i], 
                                           df_complete.binding_peak_end.iloc[i],
                                           gene_info) for i in range(len(df_complete))]
    return df_complete

# Venn Diagrams

Make sure to update the list of `TF_names` as Ye adds more gff files into the dropbox

In [5]:
def Venn_data_gen2(Peak_DF, TRN_data, gene_info): 
    TF_name = Peak_DF['index'][1].split("-")[0].split(" ")[0]
    

    # TRN locus
    reg_genes = set(TRN_data.loc[TF_name].gene_id.to_list())

    # chip locus
    gene_list = [i for i in Peak_DF.target_locus if i != '']
    chip_genes = set(','.join(list(gene_list)).split(','))
    
    #both
    both_genes = set(reg_genes & chip_genes)
    
    # Get gene and operon counts
    reg_gene_count = len(reg_genes)
    chip_gene_count = len(chip_genes)
    both_gene_count = len(both_genes)
    
    # Add adjustments for venn plotting (add '2' for alternates)
    reg_gene_count2 = 0; chip_gene_count2 = 0; both_gene_count2 = 0
    if reg_genes == chip_genes:
        reg_gene_count = 0; chip_gene_count = 0; both_gene_count = 0
        reg_gene_count2 = 0; chip_gene_count2 = 0; both_gene_count2 = len(reg_genes)
    elif all(item in chip_genes for item in reg_genes):
        reg_gene_count = 0; both_gene_count = 0
        reg_gene_count2 = len(reg_genes); chip_gene_count2 = 0; both_gene_count2 = 0
    elif all(item in reg_genes for item in chip_genes):
        chip_gene_count = 0; both_gene_count = 0
        reg_gene_count2 = 0; chip_gene_count2 = len(chip_genes); both_gene_count2 = 0
        
    res = pd.DataFrame([TF_name, reg_gene_count, chip_gene_count, both_gene_count,
                        reg_gene_count2, chip_gene_count2, both_gene_count2], 
                        columns=['Value'],
                        index=['TF', 'reg_genes', 'chip_genes', 'both_genes',
                                'reg_genes2', 'chip_genes2', 'both_genes2'])
    
    # Add gene lists
    just_reg = reg_genes - both_genes
    just_chip = chip_genes - both_genes
    for i, l in zip(['reg_genes', 'chip_genes', 'both_genes'],[just_reg, just_chip, both_genes]):
        gene_list = np.array([gene_info.loc[g,'gene_name'] if g in gene_info.index else g for g in l])
        gene_list = np.array2string(gene_list, separator = ' ')
        res.loc[i, 'list'] = gene_list
    
    # Add data sources
    res.loc['TF', 'list'] = '; '.join(TRN_data.source[TRN_data.index == TF_name].unique())
    
    return res

# Binding Width Histograms

In [6]:
def binding_width_gen(TF, peak_df,out_dir): 
    widths = pd.DataFrame(peak_df.binding_peak_end - peak_df.binding_peak_start + 1, columns=["binding_width"])
    widths.to_csv(out_dir+TF+'_widths.csv')

# Peak Position Scatter

In [7]:
def peak_position_gen(TF, final_annot_DF,out_dir): 
    peak_scatter_df = pd.DataFrame(columns=final_annot_DF.columns.to_list()+['gene'] +['normalized_dist'])
    counter=0

    for i in range(len(final_annot_DF)):
        peak = final_annot_DF.loc[i+1,:]
        peak_center = np.mean([peak.binding_peak_start, peak.binding_peak_end])

        gene = peak.closest_gene
        gene_info = DF_gene_info[DF_gene_info.gene_name == gene]
        try:
            if gene_info.strand[0] == "-":
                dist = float((gene_info.stop - peak_center)/gene_info.length)
            else:
                dist = float((peak_center - gene_info.start)/gene_info.length)
            peak_scatter_df.loc[counter] = peak.to_list()+[gene, dist]
            counter+=1
        except:
            continue

    peak_scatter_df['binding_peak_center'] = peak_scatter_df.binding_peak_start/2 + peak_scatter_df.binding_peak_end/2
    peak_scatter_df.to_csv(out_dir+TF+'_positions.csv')

# E. coli - NC_000913.3

In [48]:
org_folder = "../data/e_coli/"

In [49]:
TF_list = pd.read_csv(org_folder+'TF_list.csv', index_col=0)
TF_list.drop(["curated information"], axis=1, inplace=True)

TF_list

Unnamed: 0_level_0,TF,Organism,Strain,Media,Supplement,genome_id,organism_id,num_binding_sites,num_samples,num_cond,project,binding tables,accession,doi,peak_intensity_measure,BWcond1_1_name,BWcond1_1,BWcond1_2_name,BWcond1_2,BWcond2_1_name,BWcond2_1,BWcond2_2_name,BWcond2_2,BWcond3_1_name,BWcond3_1,BWcond3_2_name,BWcond3_2,BWcond4_1_name,BWcond4_1,BWcond4_2_name,BWcond4_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,BaeR,Escherichia coli,K-12 MG1655,LB,5% EtOH,NC_000913.3,e_coli,,4,1.0,TCS,EtOH|baer_EtOH_binding_table.json,GSE143856,https://doi.org/10.1128/mSystems.00980-20,MACE S/N,bio-rep1 – R1,BaeR_R1_S31_R1.bw,bio-rep1 – R2,BaeR_R1_S31_R2.bw,bio-rep2 – R1,BaerR_R2_S32_R1.bw,bio-rep2 – R2,BaerR_R2_S32_R2.bw,,,,,,,,
1,CpxR,Escherichia coli,K-12 MG1655,LB,5% EtOH,NC_000913.3,e_coli,,2,1.0,TCS,EtOH|cpxr_EtOH_binding_table.json,GSE143856,https://doi.org/10.1128/mSystems.00980-20,MACE S/N,R1,CpxRR1_S1_R1.bw,R2,CpxRR2_S2_R2.bw,,,,,,,,,,,,
2,Cra,Escherichia coli,K-12 MG1655,M9,"0.2% Fructose, Galactose, Acetate, or Glucose",NC_000913.3,e_coli,,8,4.0,known_TF,M9|cra_M9_binding_table.json;acetate|cra_aceta...,GSE65643,https://doi.org/10.1093/nar/gky069,MACE S/N,Glu – R1,cra_glu_1.bw,Glu – R2,cra_glu_2.bw,Fru – R1,cra_fru_1.bw,Fru – R2,cra_fru_2.bw,Gal – R1,cra_gal_1.bw,Gal – R2,cra_gal_2.bw,Ace – R1,cra_ace_1.bw,Ace – R2,cra_ace_2.bw
3,Fur,Escherichia coli,K-12 MG1655,M9,"0.1 mM FeCl2 or 0.2 mM 2,2-dipyridyl (DPD)",NC_000913.3,e_coli,,4,2.0,known_TF,dpd|fur_dpd_binding_table.json;fe|fur_fe_bindi...,GSE54901,https://doi.org/10.1038/ncomms5910,MACE S/N,Fe – R1,fur_fe_1.bw,Fe – R2,fur_fe_2.bw,DPD – R1,fur_dpd_1.bw,DPD – R2,fur_dpd_2.bw,,,,,,,,
4,GadE,Escherichia coli,K-12 MG1655,M9,pH 5.5 (adjusted with HCl),NC_000913.3,e_coli,,2,1.0,known_TF,M9|gade_M9_binding_table.json,GSE66482,https://doi.org/10.1038/ncomms8970,MACE S/N,R1,gade_1.bw,R2,gade_2.bw,,,,,,,,,,,,
5,GadW,Escherichia coli,K-12 MG1655,M9,pH 5.5 (adjusted with HCl),NC_000913.3,e_coli,,2,1.0,known_TF,M9|gadw_M9_binding_table.json,GSE66482,https://doi.org/10.1038/ncomms8970,MACE S/N,R1,gadw_1.bw,R2,gadw_2.bw,,,,,,,,,,,,
6,GadX,Escherichia coli,K-12 MG1655,M9,pH 5.5 (adjusted with HCl),NC_000913.3,e_coli,,2,1.0,known_TF,M9|gadx_M9_binding_table.json,GSE66482,https://doi.org/10.1038/ncomms8970,MACE S/N,R1,gadx_1.bw,R2,gadx_2.bw,,,,,,,,,,,,
7,KdpE,Escherichia coli,K-12 MG1655,Tris-maleic acid (TMA),0.1 mM KCl,NC_000913.3,e_coli,,2,1.0,TCS,KCl|kdpe_KCl_binding_table.json,GSE143856,https://doi.org/10.1128/mSystems.00980-20,MACE S/N,R1,KdpeE_R2_S34_R1.bw,R2,KdpeE_R2_S34_R2.bw,,,,,,,,,,,,
8,PhoB,Escherichia coli,K-12 MG1655,M9P,Low phosphate concentration (10% of the M9 med...,NC_000913.3,e_coli,,4,1.0,TCS,M9P|phob_M9P_binding_table.json,GSE143856,https://doi.org/10.1128/mSystems.00980-20,MACE S/N,bio-rep1 – R1,PhoB_1_S16_R1.bw,bio-rep1 – R2,PhoB_1_S16_R2.bw,bio-rep2 – R1,PhoB_2_S6_R1.bw,bio-rep2 – R2,PhoB_2_S6_R2.bw,,,,,,,,
9,RpoB,Escherichia coli,K-12 MG1655,M9,"0.1 mM FeCl2 or 0.2 mM 2,2-dipyridyl (DPD)",NC_000913.3,e_coli,,4,2.0,known_TF,dpd|rpob_dpd_binding_table.json;fe|rpob_fe_bin...,GSE54901,https://doi.org/10.1038/ncomms5910,MACE S/N,Fe – R1,rpob_fe1.bw,Fe – R2,rpob_fe2.bw,DPD – R1,rpob_dpd1.bw,DPD – R2,rpob_dpd2.bw,,,,,,,,


In [53]:
strain = 'NC_000913.3'
DF_gene_info = pd.read_csv(org_folder+strain+'/annotation/gene_info.csv',index_col=0)
DF_gene_info['start_codon_pos'] = [row.start if row.strand == '+' else row.stop for idx,row in DF_gene_info.iterrows()]
TRN = pd.read_csv(org_folder+strain+'/annotation/trn.csv',index_col=0)

DF_gene_info.head()

Unnamed: 0,start,stop,strand,gene_name,length,operon,cog,start_codon_pos
b0001,189,255,+,thrL,66,thrLABC,No COG Annotation,189
b0002,336,2799,+,thrA,2463,thrLABC,No COG Annotation,336
b0003,2800,3733,+,thrB,933,thrLABC,Amino acid transport and metabolism,2800
b0004,3733,5020,+,thrC,1287,thrLABC,Amino acid transport and metabolism,3733
b0005,5233,5530,+,yaaX,297,yaaX,Function unknown,5233


In [54]:
for TF_index in range(len(TF_list)):

    TF_name= TF_list.TF[TF_index].lower()
    print(TF_name)
    strain = TF_list.genome_id[TF_index]
    
    if strain != 'NC_000913.3':
        continue

    #find files and format
    curated_loc = glob.glob(org_folder+strain+"/curated_input/"+TF_name+"_*.gff")
    curated_loc.sort()
    curated_files = [i.split("/")[-1] for i in curated_loc] # change split to "/" on linux
    curated_cond = [i.split("_")[1] for i in curated_files]
    table_string =";".join([curated_cond[i]+"|"+TF_name+"_"+curated_cond[i]+'_binding_table.json' for i in range(len(curated_loc))])
    if table_string!='':
        TF_list.loc[TF_index, 'binding tables'] = table_string
    else:
        TF_list.loc[TF_index, 'binding tables'] = "NA"
    TF_list.loc[TF_index, 'num_cond'] = max(1,len(curated_loc))
    TF_list.Supplement.fillna("--", inplace=True)
    
    # write plot data files to folders
    try:
        df_list = []
        final_annot_DF_list =[]
        for file_index in range(len(curated_files)):
            file = curated_files[file_index]
            df = pd.read_csv(org_folder+strain+"/curated_input/"+file,index_col=0, 
                                     delimiter='\t', header=None, 
                                     names = ['ref','condition', 'condition_name', 
                                              "binding_peak_start",'binding_peak_end', 
                                              'binding_peak_strength', 'direction', '.','ID'])
            df = df.set_index(pd.Series(range(1,len(df)+1)))
            df_list.append(df)
            peak_annot_DF = annotate_peaks(TF_list.TF[TF_index], curated_cond[file_index], df, 500, DF_gene_info)
            final_annot_DF = validate_peak_info(peak_annot_DF,DF_gene_info)
            final_annot_DF.to_json(org_folder+strain+"/table/"+TF_name+"_"+curated_cond[file_index]+'_binding_table.json',orient='records')
            final_annot_DF_list.append(final_annot_DF)

        # merge conditions then do plots
        df = pd.concat(df_list, axis=0); df.index = np.arange(1, len(df) + 1)
        final_annot_DF = pd.concat(final_annot_DF_list, axis =0); final_annot_DF.index = np.arange(1, len(final_annot_DF.index) + 1)
        binding_width_gen(TF_list.TF[TF_index], df, org_folder+strain+"/binding_widths/")
        if any(final_annot_DF.binding_peak_strength > 0):
            peak_position_gen(TF_list.TF[TF_index], final_annot_DF, org_folder+strain+"/positions/")
            
    #venn diagrams
        try:
            if strain == 'NC_000913.3' and TF_list.TF[TF_index].split(" ")[0] in TRN.index:
                Venn_data_gen2(final_annot_DF, TRN, DF_gene_info).to_csv(org_folder+strain+"/venn/"+TF_list.TF[TF_index]+'_venn.csv')
        except:
            continue
    except:
        continue
        
#     #compress bw files
#     bw_files = TF_list.iloc[TF_index,-15:-1:2].dropna().to_list()
#     zipObj = ZipFile(org_folder+strain+"/bw/compressed/"+TF_list.TF[TF_index]+"_bw.zip", 'w')
#     for bw in bw_files:
#         zipObj.write(org_folder+strain+"/bw/"+bw, bw)
#     zipObj.close()

baer
cpxr
cra
fur
gade
gadw
gadx
kdpe
phob
rpob
ybao
ybaq
ydci
yddm
yheo
yiag
yiep
yihl
yihw
yjdc
yjhi
ynej
ynfl
ypdc
yqhc
zrar
fur
fur
fur
dps
fis
gyra
gyrb
h-ns
hupa
hupb
infa
mukb
rdgc
rob
stpa
topa
tus
narl
puur
rcsa
rpob (cra and crp ko exps)
flhc
flhd
flia
fnr
rpod
argr
ompr
rpoh


In [55]:
new_TF_list = TF_list.copy()
new_TF_list['binding tables'].fillna('NA',inplace=True)
new_TF_list.insert(12, "curated information", new_TF_list['binding tables'] != 'NA')
new_TF_list.to_csv(org_folder+'TF_list.csv')

# Other Organisms

In [9]:
def other_gff_to_geneinfo(org, strain):
    genes = pd.read_csv('../data'+'/'+org+'/'+strain+'/annotation/genes.gff',index_col=0, delimiter='\t', header=None, 
                        names = ['genome', 'ref', 'type', 'start','stop','.', 'strand', '-','details'])
    genes["locus_tag"] = [gene_detail[0].split(" ")[1] for gene_detail in genes.details.str.split("; ")]
    genes.set_index('locus_tag', inplace=True)
    genes = genes.drop(['.', '-','ref','type'], axis=1)

    return(genes)

In [10]:
# helper for closest gene

def closest_locus(peak_start, peak_stop, gene_info):
    gene_info['gene_name'] = [i.split(";")[0].split(" ")[-1] for i in gene_info.details]
    df_pos = gene_info[gene_info.strand=="+"]
    df_pos = df_pos[df_pos.stop >= peak_start]
    try:
        gene_pos = df_pos.iloc[0,:]['gene_name']
        dist_pos = abs(peak_stop - df_pos.iloc[0,:]['start'])
    except:
        dist_pos = np.inf
    
    df_neg = gene_info[gene_info.strand=="-"]
    df_neg = df_neg[df_neg.start <= peak_stop]
    try:
        gene_neg = df_neg.iloc[-1,:]['gene_name']
        dist_neg = abs(peak_start - df_neg.iloc[-1,:]['stop'])
    except:
        dist_neg = np.inf
    
    if dist_pos <= dist_neg:
        return(gene_pos)
    else:
        return(gene_neg)

In [11]:
def other_table_gen(TF_name, TF_condition, peak_df, gene_info):
    
    table = peak_df.copy()
    table["index"] = [TF_name+"-"+TF_condition+"-"+str(i+1) for i in range(len(table))]
    table["condition"] = [TF_name.lower() + " + " + TF_condition for i in range(len(table))]
    table.drop(["condition_name",".", "ID"], axis=1, inplace=True)
    table["target_locus"] = [closest_locus(table.binding_peak_start.iloc[i], 
                                           table.binding_peak_end.iloc[i],
                                           gene_info) for i in range(len(table))]
    table["target_genes"] = table["target_locus"]
    table["closest_gene"] = table["target_locus"]

    return(table)

In [12]:
def other_peak_position_gen(TF, final_annot_DF,out_dir, DF_gene_info): 
    peak_scatter_df = pd.DataFrame(columns=final_annot_DF.columns.to_list()+['gene'] +['normalized_dist'])
    counter=0

    for i in range(len(final_annot_DF)):
        peak = final_annot_DF.iloc[i,:]
        peak_center = np.mean([peak.binding_peak_start, peak.binding_peak_end])

        for gene in peak.target_genes.split(","):
            gene_info = DF_gene_info[DF_gene_info.index == gene]
            try:
                if gene_info.strand.iloc[0] == "-":
                    dist = float((gene_info.stop - peak_center)/(gene_info.stop-gene_info.start+1))
                else:
                    dist = float((peak_center - gene_info.start)/(gene_info.stop-gene_info.start+1))
                peak_scatter_df.loc[counter] = peak.to_list()+[gene, dist]
                counter+=1
            except:
                continue
    peak_scatter_df['binding_peak_center'] = peak_scatter_df.binding_peak_start/2 + peak_scatter_df.binding_peak_end/2
    peak_scatter_df.drop(['direction'], axis=1, inplace=True)
    cols = ['index', 'condition', 'binding_peak_start', 'binding_peak_end', 'binding_peak_strength', 'target_locus', 
            'target_genes', 'closest_gene', 'gene', 'normalized_dist', 'binding_peak_center']
    peak_scatter_df = peak_scatter_df[cols]
    peak_scatter_df.to_csv(out_dir+TF+'_positions.csv')

In [34]:
org_folder = "../data/all_other/"

In [45]:
TF_list = pd.read_csv(org_folder+'TF_list.csv', index_col=0)
TF_list.drop(["curated information"], axis=1, inplace=True)

TF_list

Unnamed: 0_level_0,TF,Organism,Strain,Media,Supplement,genome_id,organism_id,num_binding_sites,num_samples,num_cond,project,binding tables,accession,doi,peak_intensity_measure,BWcond1_1_name,BWcond1_1,BWcond1_2_name,BWcond1_2,BWcond2_1_name,BWcond2_1,BWcond2_2_name,BWcond2_2,BWcond3_1_name,BWcond3_1,BWcond3_2_name,BWcond3_2,BWcond4_1_name,BWcond4_1,BWcond4_2_name,BWcond4_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,Fur,Klebsiella pneumoniae,MGH 78578,M9,Fe,CP000647.1,k_pneumoniae,,2,1,MGH78578,Fe|fur_Fe_binding_table.json,GSE167285,,MACE S/N,R1,klebfurfinal1.bw,R2,klebfurfinal2.bw,,,,,,,,,,,,
0,Fur,Pseudomonas putida,KT2440,M9,Fe,NC_002947.4,p_putida,,1,1,2457T,fe|fur_fe_binding_table.json,GSE167285,,MACE S/N,R1,putidafurfinal1_202003.bw,,,,,,,,,,,,,,
1,Sigma D,Pseudomonas putida,KT2440,M9,--,NC_002947.4,p_putida,,2,1,2457T,M9|sigma d_M9_binding_table.json,GSE167285,,MACE S/N,R1,putidaSigmaD_202003.bw,R2,putidaSigmaD_202005.bw,,,,,,,,,,,,
0,Fur,Salmonella enterica,Typhimurium LT2,M9,Fe,NC_003197.2,s_enterica,,1,1,LT2,Fe|fur_Fe_binding_table.json,GSE167285,,MACE S/N,,,R2,Salmonella_Fur_Fe_2.bw,,,,,,,,,,,,
1,YdcI,Salmonella enterica,Typhimurium LT2,M9,"pH5, MES",NC_003197.2,s_enterica,,6,3,LT2,MES2|ydci_MES2_binding_table.json;MES5|ydci_ME...,GSE167285,,MACE S/N,MES5 – R1,salmone_ydcI_MES5_1.bw,MES5 – R2,salmone_ydcI_MES5_2.bw,MES5 – R3,salmone_ydcI_MES5_3.bw,MES5 – R4,salmone_ydcI_MES5_4.bw,ph5 – R1,salmone_ydcI_ph55_1.bw,ph5 – R2,salmone_ydcI_ph55_2.bw,,,,
0,Fur,Shigella flexneri,"2a, 2457T",M9,Fe,AE014073.1,s_flexneri,,2,1,2457T,Fe|fur_Fe_binding_table.json,GSE167285,,MACE S/N,R1,shigella_fur_1.bw,R2,shigella_fur_2.bw,,,,,,,,,,,,
0,CodY,Staphylococcus aureus,LAC chromosome,RPMI,10% LB,CP035369.1,s_aureus,,2,1,LAC,M9|cody_M9_binding_table.json,GSE159856,https://doi.org/10.1101/2021.01.08.426013,MACE S/N,R1,StaphLAC_new_cody1.bw,R2,StaphLAC_new_cody2.bw,,,,,,,,,,,,
1,SigS,Staphylococcus aureus,USA300_TCH1516,RPMI,10% LB,NC_010079,s_aureus,,8,1,USA300_TCH1516,RPMI|sigs_RPMI_binding_table.json,GSE159856,,MACE S/N,M1 – R1,StaphTCH_sigs_M1.bw,M1 – R2,Staph_SigS_M1_R2.bw,M2 – R1,StaphTCH_sigs_M2.bw,M2 – R2,Staph_SigS_M2_R2.bw,M3 – R1,StaphTCH_sigs_M3.bw,M3 – R2,Staph_SigS_M3_R2.bw,M4 – R1,StaphTCH_sigs_M4.bw,M4 – R2,Staph_SigS_M4_R2.bw
2,VraR,Staphylococcus aureus,USA300_TCH1516,RPMI,10% LB,NC_010079,s_aureus,,2,1,USA300_TCH1516,RPMI|vrar_RPMI_binding_table.json,GSE159856,,MACE S/N,R1,StaphTCH_vraR_7H7C11purified.bw,R2,StaphTCH_vraR_13F12F6purified.bw,,,,,,,,,,,,
0,Fur,Yersinia pseudotuberculosis,IP 32953,BHI,Fe,NZ_CP009712.1,y_pseudotuberculosis,,2,1,2457T,Fe|fur_Fe_binding_table.json,GSE167285,,MACE S/N,R1,yersifurfinal1.bw,R2,yersifurfinal2.bw,,,,,,,,,,,,


In [46]:
for TF_index in range(len(TF_list)):

    TF_name= TF_list.TF.iloc[TF_index].lower()
    org_id = TF_list.organism_id.iloc[TF_index]
    genome_id = TF_list.genome_id.iloc[TF_index]

    #find files and format
    curated_loc = glob.glob("../data/"+org_id+"/"+genome_id+"/"+"/curated_input/"+TF_name+"_*.gff")
    curated_loc.sort()
    curated_files = [i.split("/")[-1] for i in curated_loc] 
    curated_cond = [i.split("_")[1] for i in curated_files]
    table_string =";".join([curated_cond[i]+"|"+TF_name+"_"+curated_cond[i]+'_binding_table.json' for i in range(len(curated_loc))])
    if table_string!='':
        TF_list['binding tables'].iloc[TF_index] = table_string
    else:
        TF_list['binding tables'].iloc[TF_index] = "NA"
    TF_list['num_cond'].iloc[TF_index] = max(1,len(curated_loc))
    TF_list.Supplement.fillna("--", inplace=True)
    gene_info = other_gff_to_geneinfo(org_id, genome_id)
    
 # write plot data files to folders
    try:
        df_list = []
        final_annot_DF_list =[]
        for file_index in range(len(curated_files)):
            file = curated_files[file_index]
            df = pd.read_csv("../data/"+org_id+"/"+genome_id+"/"+"/curated_input/"+file,index_col=0, 
                                     delimiter='\t', header=None, 
                                     names = ['ref','condition', 'condition_name', 
                                              "binding_peak_start",'binding_peak_end', 
                                              'binding_peak_strength', 'direction', '.','ID'])
            df = df.set_index(pd.Series(range(1,len(df)+1)))
            df_list.append(df)
            final_annot_DF = other_table_gen(TF_list.TF.iloc[TF_index], curated_cond[file_index], df, gene_info)
            final_annot_DF.to_json("../data/"+org_id+"/"+genome_id+"/table/"+TF_name+"_"+curated_cond[file_index]+'_binding_table.json',orient='records')
            final_annot_DF_list.append(final_annot_DF)
        # merge conditions then do plots
        df = pd.concat(df_list, axis=0); df.index = np.arange(1, len(df) + 1)
        final_annot_DF = pd.concat(final_annot_DF_list, axis =0); final_annot_DF.index = np.arange(1, len(final_annot_DF.index) + 1)
        binding_width_gen(TF_list.TF.iloc[TF_index], df, "../data/"+org_id+"/"+genome_id+"/binding_widths/")
        if any(final_annot_DF.binding_peak_strength > 0):
            other_peak_position_gen(TF_list.TF.iloc[TF_index], final_annot_DF, "../data/"+org_id+"/"+genome_id+"/positions/", gene_info)
        try:
            TRN = pd.read_csv("../data/"+org_id+"/"+genome_id+"/annotation/trn.csv", index_col=0)
            Venn_data_gen2(final_annot_DF, TRN, DF_gene_info).to_csv("../data/"+org_id+"/"+genome_id+"/venn/"+TF_list.TF.iloc[TF_index]+'_venn.csv')

        except:
            continue
    except:
        print(TF_name)
        continue
        
    #compress bw files
    bw_files = TF_list.iloc[TF_index,-15:-1:2].dropna().to_list()
    zipObj = ZipFile("../data/"+org_id+"/"+genome_id+"/bw/compressed/"+TF_list.TF.iloc[TF_index]+"_bw.zip", 'w')
    for bw in bw_files:
        zipObj.write("../data/"+org_id+"/"+genome_id+"/bw/"+bw, bw)
    zipObj.close()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


amrz
mksb
parb
smc


In [47]:
new_TF_list = TF_list.copy()
new_TF_list.insert(12, "curated information", TF_list['binding tables'] != 'NA')
new_TF_list.to_csv(org_folder+'TF_list.csv')

#individual file updates
for org in new_TF_list.organism_id.unique():
    new_TF_list[new_TF_list.organism_id == org].to_csv("../data/"+org+"/TF_list.csv")

# E. coli - other strains

In [21]:
org_folder = "../data/e_coli/"

TF_list = pd.read_csv(org_folder+'TF_list.csv', index_col=0)
TF_list.drop(["curated information"], axis=1, inplace=True)

TF_list.head()

Unnamed: 0_level_0,TF,Organism,Strain,Media,Supplement,genome_id,organism_id,num_binding_sites,num_samples,num_cond,project,binding tables,accession,doi,BWcond1_1_name,BWcond1_1,BWcond1_2_name,BWcond1_2,BWcond2_1_name,BWcond2_1,BWcond2_2_name,BWcond2_2,BWcond3_1_name,BWcond3_1,BWcond3_2_name,BWcond3_2,BWcond4_1_name,BWcond4_1,BWcond4_2_name,BWcond4_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,BaeR,Escherichia coli,K-12 MG1655,LB,5% EtOH,NC_000913.3,e_coli,,4,1,TCS,EtOH|baer_EtOH_binding_table.json,GSE143856,https://doi.org/10.1128/mSystems.00980-20,bio-rep1 – R1,BaeR_R1_S31_R1.bw,bio-rep1 – R2,BaeR_R1_S31_R2.bw,bio-rep2 – R1,BaerR_R2_S32_R1.bw,bio-rep2 – R2,BaerR_R2_S32_R2.bw,,,,,,,,
1,CpxR,Escherichia coli,K-12 MG1655,LB,5% EtOH,NC_000913.3,e_coli,,2,1,TCS,EtOH|cpxr_EtOH_binding_table.json,GSE143856,https://doi.org/10.1128/mSystems.00980-20,R1,CpxRR1_S1_R1.bw,R2,CpxRR2_S2_R2.bw,,,,,,,,,,,,
2,Cra,Escherichia coli,K-12 MG1655,M9,"0.2% Fructose, Galactose, Acetate, or Glucose",NC_000913.3,e_coli,,8,4,known_TF,M9|cra_M9_binding_table.json;acetate|cra_aceta...,GSE65643,https://doi.org/10.1093/nar/gky069,Glu – R1,cra_glu_1.bw,Glu – R2,cra_glu_2.bw,Fru – R1,cra_fru_1.bw,Fru – R2,cra_fru_2.bw,Gal – R1,cra_gal_1.bw,Gal – R2,cra_gal_2.bw,Ace – R1,cra_ace_1.bw,Ace – R2,cra_ace_2.bw
3,Fur,Escherichia coli,K-12 MG1655,M9,"0.1 mM FeCl2 or 0.2 mM 2,2-dipyridyl (DPD)",NC_000913.3,e_coli,,4,2,known_TF,dpd|fur_dpd_binding_table.json;fe|fur_fe_bindi...,GSE54901,https://doi.org/10.1038/ncomms5910,Fe – R1,fur_fe_1.bw,Fe – R2,fur_fe_2.bw,DPD – R1,fur_dpd_1.bw,DPD – R2,fur_dpd_2.bw,,,,,,,,
4,GadE,Escherichia coli,K-12 MG1655,M9,pH 5.5 (adjusted with HCl),NC_000913.3,e_coli,,2,1,known_TF,M9|gade_M9_binding_table.json,GSE66482,https://doi: 10.1038/ncomms8970,R1,gade_1.bw,R2,gade_2.bw,,,,,,,,,,,,


In [22]:
for TF_index in range(len(TF_list)):
    
    TF_name= TF_list.TF.iloc[TF_index].lower()
    org_id = TF_list.organism_id.iloc[TF_index]
    genome_id = TF_list.genome_id.iloc[TF_index]
    
    if genome_id == 'NC_000913.3':
            continue
            
    #find files and format
    curated_loc = glob.glob("../data/"+org_id+"/"+genome_id+"/"+"/curated_input/"+TF_name+"_*.gff")
    curated_loc.sort()
    curated_files = [i.split("/")[-1] for i in curated_loc]
    curated_cond = [i.split("_")[1] for i in curated_files]
    table_string =";".join([curated_cond[i]+"|"+TF_name+"_"+curated_cond[i]+'_binding_table.json' for i in range(len(curated_loc))])
    if table_string!='':
        TF_list['binding tables'].iloc[TF_index] = table_string
    else:
        TF_list['binding tables'].iloc[TF_index] = "NA"
    TF_list['num_cond'].iloc[TF_index] = max(1,len(curated_loc))
    gene_info = other_gff_to_geneinfo(org_id, genome_id)
    TF_list.Supplement.fillna("--", inplace=True)
    
 # write plot data files to folders
    try:
        df_list = []
        final_annot_DF_list =[]
        for file_index in range(len(curated_files)):
            file = curated_files[file_index]
            df = pd.read_csv("../data/"+org_id+"/"+genome_id+"/"+"/curated_input/"+file,index_col=0, 
                                     delimiter='\t', header=None, 
                                     names = ['ref','condition', 'condition_name', 
                                              "binding_peak_start",'binding_peak_end', 
                                              'binding_peak_strength', 'direction', '.','ID'])
            df = df.set_index(pd.Series(range(1,len(df)+1)))
            df_list.append(df)
            final_annot_DF = other_table_gen(TF_list.TF.iloc[TF_index], curated_cond[file_index], df, gene_info)
            final_annot_DF.to_json("../data/"+org_id+"/"+genome_id+"/table/"+TF_name+"_"+curated_cond[file_index]+'_binding_table.json',orient='records')
            final_annot_DF_list.append(final_annot_DF)
        # merge conditions then do plots
        df = pd.concat(df_list, axis=0); df.index = np.arange(1, len(df) + 1)
        final_annot_DF = pd.concat(final_annot_DF_list, axis =0); final_annot_DF.index = np.arange(1, len(final_annot_DF.index) + 1)
        binding_width_gen(TF_list.TF.iloc[TF_index], df, "../data/"+org_id+"/"+genome_id+"/binding_widths/")
        if any(final_annot_DF.binding_peak_strength > 0):
            other_peak_position_gen(TF_list.TF.iloc[TF_index], final_annot_DF, "../data/"+org_id+"/"+genome_id+"/positions/", gene_info)
        try:
            TRN = pd.read_csv("../data/"+org_id+"/"+genome_id+"/annotation/trn.csv", index_col=0)
            Venn_data_gen2(final_annot_DF, TRN, DF_gene_info).to_csv("../data/"+org_id+"/"+genome_id+"/venn/"+TF_list.TF.iloc[TF_index]+'_venn.csv')
            
        except:
            continue
    except:
        continue

#     #compress bw files
#     bw_files = TF_list.iloc[TF_index,-15:-1:2].dropna().to_list()
#     zipObj = ZipFile("../data/"+org_id+"/"+genome_id+"/bw/compressed/"+TF_list.TF.iloc[TF_index]+"_bw.zip", 'w')
#     for bw in bw_files:
#         zipObj.write("../data/"+org_id+"/"+genome_id+"/bw/"+bw, bw)
#     zipObj.close()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [23]:
new_TF_list = TF_list.copy()
new_TF_list['binding tables'].fillna('NA',inplace=True)
new_TF_list.insert(12, "curated information", new_TF_list['binding tables'] != 'NA')
new_TF_list.to_csv(org_folder+'TF_list.csv')