# Search Output

This notebook generates the JSON files (`tfs_search.json` and `genes_search.json`) used for searching through proChIPdb's genes and transcription factors (TFs).

In [1]:
import os
import pandas as pd
import glob

## TFs

In [2]:
prefix = '../data/'
suffix = '/TF_list.csv'

# Organism folders are the sub-directories of the data folder. Plain files
# (e.g. the search JSON files this notebook writes into the same folder) are
# filtered out up front instead of relying on NotADirectoryError, which is
# platform-dependent. The 'all_other' folder is excluded from the search index.
organisms = [
    f.name for f in os.scandir(prefix)
    if f.is_dir() and f.name != 'all_other'
]

cols_of_interest = ['TF', 'Organism', 'Strain', 'Media', 'Supplement']

# One frame per organism, concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0 and re-copied the whole
# table on every call.)
tf_frames = []

for o in organisms:
    # The first CSV column is used as the row index; its value becomes the
    # `i` parameter of the dashboard link below.
    tfs = pd.read_csv(prefix + o + suffix, index_col = 0)

    # build link: tf_dashboard.html?organism=<o>&tf=<TF>&genome=<genome_id>&i=<index>
    # NOTE: the values are intentionally not URL-encoded; the "Genes" cell
    # recovers organism / TF / genome by splitting this string on '&' and '='.
    tfs['link'] = [
        'tf_dashboard.html?organism=' + o
        + '&tf=' + tf
        + '&genome=' + genome
        + '&i=' + str(i)
        for i, tf, genome in zip(tfs.index, tfs['TF'], tfs['genome_id'])
    ]

    # take info of interest
    tf_frames.append(tfs[cols_of_interest + ['link']])

# save: a single concat; fall back to an empty, correctly-shaped table when
# no organism folder was found (pd.concat raises on an empty list).
if tf_frames:
    all_tfs = pd.concat(tf_frames, ignore_index = True)
else:
    all_tfs = pd.DataFrame(columns = cols_of_interest + ['link'])

all_tfs

Unnamed: 0,TF,Organism,Strain,Media,Supplement,link
0,CodY,Staphylococcus aureus,LAC chromosome,RPMI,10% LB,tf_dashboard.html?organism=s_aureus&tf=CodY&ge...
1,SigS,Staphylococcus aureus,USA300_TCH1516,RPMI,10% LB,tf_dashboard.html?organism=s_aureus&tf=SigS&ge...
2,VraR,Staphylococcus aureus,USA300_TCH1516,RPMI,10% LB,tf_dashboard.html?organism=s_aureus&tf=VraR&ge...
3,AmrZ,Pseudomonas fluorescens,F113,SA (low Fe),--,tf_dashboard.html?organism=p_fluorescens&tf=Am...
4,Fur,Shigella flexneri,"2a, 2457T",M9,Fe,tf_dashboard.html?organism=s_flexneri&tf=Fur&g...
...,...,...,...,...,...,...
95,Smc,Corynebacterium glutamicum,ATCC 13032,BHI,--,tf_dashboard.html?organism=c_glutamicum&tf=Smc...
96,EspR,Mycobacterium tuberculosis,H37Rv,7H9 Broth (Difco),"0.2% gly, ADC, 0.05% Tween 80",tf_dashboard.html?organism=m_tuberculosis&tf=E...
97,Fur,Yersinia pseudotuberculosis,IP 32953,BHI,Fe,tf_dashboard.html?organism=y_pseudotuberculosi...
98,Fur,Salmonella enterica,Typhimurium LT2,M9,Fe,tf_dashboard.html?organism=s_enterica&tf=Fur&g...


In [3]:
# Export the combined TF table as a JSON array with one object per row.
all_tfs.to_json(path_or_buf='../data/tfs_search.json', orient='records')

## Genes

In [4]:
prefix = '../data/'
suffix = '_binding_table.json'  # NOTE(review): not used by the glob pattern below

# Column order of the gene search table.
gene_columns = ['Gene', 'Locus',
        'Organism', 'Strain', 'Binding_site_id',
        'Condition', 'Peak_start', 'Peak_end',
        'Peak_strength', 'link']

# Rows are collected as plain dicts and turned into a DataFrame once at the
# end. (DataFrame.append was removed in pandas 2.0 and, called once per gene,
# re-copied the whole table on every row.)
gene_records = []

for j, tf_row in all_tfs.iterrows():

    # get folder info from the link & read in file
    # link looks like: tf_dashboard.html?organism=<o>&tf=<TF>&genome=<id>&i=<n>
    # so splitting on '&' / '=' gives [..., <o>, 'tf', <TF>, 'genome', <id>, 'i', <n>]
    file = tf_row.link.replace('&', '=').split('=')
    organism_dir, tf_name, genome_id = file[1], file[3], file[5]

    # Only the '*' between the TF name and 'binding_table.json' is meant as a
    # wildcard; escape the literal part so glob characters in names match as-is.
    table_stem = prefix + organism_dir + '/' + genome_id + '/table/' + tf_name.lower()
    path = glob.escape(table_stem) + "_*binding_table.json"
    all_matches = glob.glob(path)

    for f in all_matches:
        # read in; unreadable tables are skipped, but reported instead of
        # being dropped silently
        try:
            bt = pd.read_json(f)
        except (FileNotFoundError, ValueError) as e:
            print('Skipping unreadable binding table ' + f + ': ' + str(e))
            continue

        for i, row in bt.iterrows():

            # get genes & loci (comma-separated, paired by position)
            genes = row.target_genes.split(',')
            loci = row.target_locus.split(',')

            # rearrange data for new table: one record per (gene, locus) pair
            for g, l in zip(genes, loci):
                # binding sites without a target gene yield an empty name
                if g == '':
                    continue
                gene_records.append({
                    'Gene':g,
                    'Locus':l,
                    'Organism':tf_row.Organism,
                    'Strain':tf_row.Strain,
                    'Binding_site_id':row['index'],
                    'Condition':row.condition,
                    'Peak_start':row.binding_peak_start,
                    'Peak_end':row.binding_peak_end,
                    'Peak_strength':row.binding_peak_strength,
                    'link':tf_row.link})

# Passing `columns` keeps the column order and yields an empty, correctly
# shaped table when no binding site was found.
all_genes = pd.DataFrame(gene_records, columns = gene_columns)
all_genes = all_genes.sort_values('Peak_start')

In [5]:
# Display the gene search table (sorted by Peak_start in the previous cell).
all_genes

Unnamed: 0,Gene,Locus,Organism,Strain,Binding_site_id,Condition,Peak_start,Peak_end,Peak_strength,link
87764,PP_0010,PP_0010,Pseudomonas putida,KT2440,Sigma D-M9-1,sigma d + M9,43,82,8.09000,tf_dashboard.html?organism=p_putida&tf=Sigma D...
88877,dnaA,dnaA,Pseudomonas aeruginosa,PAO1,SphR-LB-1,sphr + LB,66,149,1.98077,tf_dashboard.html?organism=p_aeruginosa&tf=Sph...
7159,thrC,b0004,Escherichia coli,K-12 MG1655,RpoB-dpd-1,rpob + dpd,100,150,3.68000,tf_dashboard.html?organism=e_coli&tf=RpoB&geno...
7156,thrL,b0001,Escherichia coli,K-12 MG1655,RpoB-dpd-1,rpob + dpd,100,150,3.68000,tf_dashboard.html?organism=e_coli&tf=RpoB&geno...
7157,thrA,b0002,Escherichia coli,K-12 MG1655,RpoB-dpd-1,rpob + dpd,100,150,3.68000,tf_dashboard.html?organism=e_coli&tf=RpoB&geno...
...,...,...,...,...,...,...,...,...,...,...
87697,SCO7825,SCO7825,Streptomyces coelicolor,A3(2) M145,ScbR2-SMM-444,scbr2 + SMM,8641432,8645750,2.34000,tf_dashboard.html?organism=s_coelicolor&tf=Scb...
87090,SCO7828,SCO7828,Streptomyces coelicolor,A3(2) M145,HrdB-MS-1541,hrdb + MS,8645864,8646219,2.65645,tf_dashboard.html?organism=s_coelicolor&tf=Hrd...
87722,SCO7833,SCO7833,Streptomyces coelicolor,A3(2) M145,ScbR2-SMM-469,scbr2 + SMM,8651620,8652701,4.41000,tf_dashboard.html?organism=s_coelicolor&tf=Scb...
87687,SCO7844,SCO7844,Streptomyces coelicolor,A3(2) M145,ScbR2-SMM-434,scbr2 + SMM,8660800,8662596,2.48000,tf_dashboard.html?organism=s_coelicolor&tf=Scb...


In [6]:
# Export the gene search table as a JSON array with one object per row.
all_genes.to_json(path_or_buf='../data/genes_search.json', orient='records')