# Search Output

This notebook generates the JSON files used to search ChIP-pro's genes and transcription factors (TFs).

In [1]:
import os
import pandas as pd

## TFs

In [3]:
# Build one table of every TF across all organism folders, with a dashboard
# link per TF, for the TF search index.
prefix = '../ChiPdb/data/'
suffix = '/TF_list.csv'

# every entry found directly under the data folder is tried as an organism
organisms = [f.name for f in os.scandir(prefix)]
cols_of_interest = ['TF', 'Organism', 'Strain', 'Media', 'Supplement']

# Collect one frame per organism and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every pass.
tf_frames = []

for o in organisms:
    try:
        tfs = pd.read_csv(prefix + o + suffix, index_col = 0)
    except NotADirectoryError:
        # entry is a plain file (e.g. a previously written *_search.json),
        # not an organism folder
        continue

    # build link: organism, TF name, genome id and the TF's row index
    # (assigning the whole column at once also works for an empty TF list)
    tfs['link'] = [
        'tf_dashboard.html?organism=' + o
        + '&tf=' + tf
        + '&genome=' + genome
        + '&i=' + str(i)
        for i, tf, genome in zip(tfs.index, tfs['TF'], tfs['genome_id'])
    ]

    # take info of interest & save
    tf_frames.append(tfs[cols_of_interest + ['link']])

if tf_frames:
    all_tfs = pd.concat(tf_frames, ignore_index = True)
else:
    # no organism folders found: keep the expected (empty) schema
    all_tfs = pd.DataFrame(columns = cols_of_interest + ['link'])

all_tfs

Unnamed: 0,TF,Organism,Strain,Media,Supplement,link
0,AtoC,Escherichia coli,K-12 MG1655,M9,acetoacetate,tf_dashboard.html?organism=e_coli&tf=AtoC&geno...
1,BaeR,Escherichia coli,K-12 MG1655,LB,EtOH,tf_dashboard.html?organism=e_coli&tf=BaeR&geno...
2,BtsR,Escherichia coli,K-12 MG1655,01xLB,Pyruvate,tf_dashboard.html?organism=e_coli&tf=BtsR&geno...
3,CpxR,Escherichia coli,K-12 MG1655,LB,EtOH,tf_dashboard.html?organism=e_coli&tf=CpxR&geno...
4,Cra,Escherichia coli,K-12 MG1655,M9,"Fructose, Galactose, or Acetate",tf_dashboard.html?organism=e_coli&tf=Cra&genom...
5,CusR,Escherichia coli,K-12 MG1655,LB,CuSO4,tf_dashboard.html?organism=e_coli&tf=CusR&geno...
6,Fur,Escherichia coli,K-12 MG1655,M9,Fe or DPD,tf_dashboard.html?organism=e_coli&tf=Fur&genom...
7,GadE,Escherichia coli,K-12 MG1655,M9,,tf_dashboard.html?organism=e_coli&tf=GadE&geno...
8,GadW,Escherichia coli,K-12 MG1655,M9,,tf_dashboard.html?organism=e_coli&tf=GadW&geno...
9,GadX,Escherichia coli,K-12 MG1655,M9,,tf_dashboard.html?organism=e_coli&tf=GadX&geno...


In [4]:
# write the combined TF table to disk; orient='records' emits one JSON object per row
all_tfs.to_json('../ChiPdb/data/tfs_search.json', orient='records')

## Genes

In [8]:
# Build one row per (target gene, binding site) across every TF's binding
# table, for the gene search index.
prefix = '../ChiPdb/data/'
suffix = '_binding_table.json'

gene_cols = ['Gene', 'Locus',
        'Organism', 'Strain', 'Binding_site_id',
        'Condition', 'Peak_start', 'Peak_end',
        'Peak_strength', 'link']

# Accumulate plain dict records and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and made this loop quadratic.
gene_records = []

for j, tf_row in all_tfs.iterrows():

    # get folder info from the link & read in file
    # after the split: file[1] = organism, file[3] = TF, file[5] = genome id
    file = tf_row.link.replace('&', '=').split('=')
    f = prefix + file[1] + '/' + file[5] + '/table/' + file[3] + suffix

    # read in; TFs without a readable binding table are skipped
    try:
        bt = pd.read_json(f)
    except (FileNotFoundError, ValueError):
        continue

    for i, row in bt.iterrows():

        # get genes & loci (comma-separated, paired by position)
        genes = row.target_genes.split(',')
        loci = row.target_locus.split(',')

        # rearrange data for new table
        for g, l in zip(genes, loci):
            if g == '':
                # binding site with no annotated target gene
                continue
            gene_records.append({
                'Gene':g,
                'Locus':l,
                'Organism':tf_row.Organism,
                'Strain':tf_row.Strain,
                'Binding_site_id':row['index'],
                'Condition':row.condition,
                'Peak_start':row.binding_peak_start,
                'Peak_end':row.binding_peak_end,
                'Peak_strength':row.binding_peak_strength,
                'link':tf_row.link})

# passing columns keeps the expected schema even when no records were found
all_genes = pd.DataFrame(gene_records, columns = gene_cols)

# mergesort is stable, so rows sharing a Peak_start keep their insertion
# order and the written JSON is reproducible between runs
all_genes = all_genes.sort_values('Peak_start', kind = 'mergesort')

In [9]:
# show the assembled gene table (sorted by Peak_start) as the cell output
all_genes

Unnamed: 0,Gene,Locus,Organism,Strain,Binding_site_id,Condition,Peak_start,Peak_end,Peak_strength,link
0,thrL,b0001,Escherichia coli,K-12 MG1655,AtoC-1,atoc + acetoacetate,151,186,8.93,tf_dashboard.html?organism=e_coli&tf=AtoC&geno...
1,thrA,b0002,Escherichia coli,K-12 MG1655,AtoC-1,atoc + acetoacetate,151,186,8.93,tf_dashboard.html?organism=e_coli&tf=AtoC&geno...
2,thrB,b0003,Escherichia coli,K-12 MG1655,AtoC-1,atoc + acetoacetate,151,186,8.93,tf_dashboard.html?organism=e_coli&tf=AtoC&geno...
3,thrC,b0004,Escherichia coli,K-12 MG1655,AtoC-1,atoc + acetoacetate,151,186,8.93,tf_dashboard.html?organism=e_coli&tf=AtoC&geno...
1938,thrC,b0004,Escherichia coli,K-12 MG1655,YjdC-1,yjdc + M9,201,249,2.91,tf_dashboard.html?organism=e_coli&tf=YjdC&geno...
...,...,...,...,...,...,...,...,...,...,...
821,yjtD,b4403,Escherichia coli,K-12 MG1655,BtsR-52,btsr + pyruvate,4640469,4640512,3.20,tf_dashboard.html?organism=e_coli&tf=BtsR&geno...
822,arcA,b4401,Escherichia coli,K-12 MG1655,BtsR-52,btsr + pyruvate,4640469,4640512,3.20,tf_dashboard.html?organism=e_coli&tf=BtsR&geno...
683,arcA,b4401,Escherichia coli,K-12 MG1655,BaeR-178,baer + EtOH,4640576,4640626,1.92,tf_dashboard.html?organism=e_coli&tf=BaeR&geno...
682,yjtD,b4403,Escherichia coli,K-12 MG1655,BaeR-178,baer + EtOH,4640576,4640626,1.92,tf_dashboard.html?organism=e_coli&tf=BaeR&geno...


In [10]:
# write the gene table to disk; orient='records' emits one JSON object per row
all_genes.to_json('../ChiPdb/data/genes_search.json', orient='records')