# Search Output

This notebook generates the JSON files used to search ChIP-pro's genes and TFs.

In [1]:
import os
import pandas as pd

## TFs

In [3]:
# Build one table of every TF across all organism folders, adding a dashboard
# link per TF. The result (`all_tfs`) is reused by the gene-search cell below.
prefix = '../data/'
suffix = '/TF_list.csv'

# every entry under the data folder is treated as a candidate organism folder
organisms = [f.name for f in os.scandir(prefix)]
cols_of_interest = ['TF', 'Organism', 'Strain', 'Media', 'Supplement']

# Collect one frame per organism and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop copies all previous rows on every iteration.
tf_frames = []

for o in organisms:
    try:
        tfs = pd.read_csv(prefix + o + suffix, index_col = 0)
    except NotADirectoryError:
        # entry is a plain file sitting next to the organism folders
        continue

    # build link: organism folder, TF name, genome id and the row's index
    # label (taken from the first CSV column) become query parameters.
    # Assigning the whole column at once also works when the CSV has no rows.
    tfs['link'] = [
        'tf_dashboard.html?organism=' + o
        + '&tf=' + tf
        + '&genome=' + genome
        + '&i=' + str(i)
        for i, tf, genome in zip(tfs.index, tfs['TF'], tfs['genome_id'])
    ]

    # take info of interest
    tf_frames.append(tfs[cols_of_interest + ['link']])

# pd.concat raises on an empty list, so fall back to an empty frame
# with the expected columns when no organism folder was found
if tf_frames:
    all_tfs = pd.concat(tf_frames, ignore_index = True)
else:
    all_tfs = pd.DataFrame(columns = cols_of_interest + ['link'])

all_tfs

Unnamed: 0,TF,Organism,Strain,Media,Supplement,link
0,CodY,Staphylococcus aureus,LAC chromosome,,,tf_dashboard.html?organism=s_aureus&tf=CodY&ge...
1,SigS,Staphylococcus aureus,USA300_TCH1516,,,tf_dashboard.html?organism=s_aureus&tf=SigS&ge...
2,VraR,Staphylococcus aureus,USA300_TCH1516,,,tf_dashboard.html?organism=s_aureus&tf=VraR&ge...
3,Fur,Shigella flexneri,"2a, 2457T",,,tf_dashboard.html?organism=s_flexneri&tf=Fur&g...
4,Fur,Klebsiella pneumoniae,MGH 78578,,,tf_dashboard.html?organism=k_pneumoniae&tf=Fur...
...,...,...,...,...,...,...
62,Fur,Pseudomonas putida,KT2440,,,tf_dashboard.html?organism=p_putida&tf=Fur&gen...
63,Sigma D,Pseudomonas putida,KT2440,,,tf_dashboard.html?organism=p_putida&tf=Sigma D...
64,Fur,Yersinia pseudotuberculosis,IP 32953,,,tf_dashboard.html?organism=y_pseudotuberculosi...
65,Fur,Salmonella enterica,Typhimurium LT2,,Fe,tf_dashboard.html?organism=s_enterica&tf=Fur&g...


In [5]:
# Write the combined TF table as a JSON array with one object per TF row.
all_tfs.to_json(path_or_buf='../data/tfs_search.json', orient='records')

## Genes

In [6]:
# Build one table with a row per (target gene, binding site) across every TF
# in `all_tfs`, reading each TF's binding table from disk.
prefix = '../data/'
suffix = '_binding_table.json'

gene_cols = ['Gene', 'Locus',
        'Organism', 'Strain', 'Binding_site_id',
        'Condition', 'Peak_start', 'Peak_end',
        'Peak_strength', 'link']

# Rows are accumulated as plain dicts and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole
# frame on every call, which is quadratic inside these nested loops.
gene_records = []
# paths of binding tables that could not be read, reported after the loop
skipped_tables = []

for j, tf_row in all_tfs.iterrows():

    # get folder info from the link & read in file.
    # The link is 'tf_dashboard.html?organism=<o>&tf=<tf>&genome=<g>&i=<i>';
    # turning '&' into '=' and splitting puts organism at [1], tf at [3]
    # and genome at [5].
    file = tf_row.link.replace('&', '=').split('=')
    f = prefix + file[1] + '/' + file[5] + '/table/' + file[3] + suffix

    # read in
    try:
        bt = pd.read_json(f)
    except (FileNotFoundError, ValueError):
        # keep going, but remember the path so the gap is visible
        skipped_tables.append(f)
        continue

    for i, row in bt.iterrows():

        # get genes & loci (comma-separated strings, paired by position)
        genes = row.target_genes.split(',')
        loci = row.target_locus.split(',')

        # rearrange data for new table
        for g, l in zip(genes, loci):
            # an empty gene name means the site has no target gene listed
            if g == '':
                continue
            gene_records.append({
                'Gene':g,
                'Locus':l,
                'Organism':tf_row.Organism,
                'Strain':tf_row.Strain,
                'Binding_site_id':row['index'],
                'Condition':row.condition,
                'Peak_start':row.binding_peak_start,
                'Peak_end':row.binding_peak_end,
                'Peak_strength':row.binding_peak_strength,
                'link':tf_row.link})

# passing `columns` keeps the expected header even when no rows were collected
all_genes = pd.DataFrame(gene_records, columns = gene_cols)
all_genes = all_genes.sort_values('Peak_start')

# surface skipped files instead of silently producing a short (or empty) table
if skipped_tables:
    print('Skipped {} of {} binding tables that could not be read, e.g. {}'.format(
        len(skipped_tables), len(all_tfs), skipped_tables[0]))

In [7]:
# display the assembled gene search table for inspection
all_genes

Unnamed: 0,Gene,Locus,Organism,Strain,Binding_site_id,Condition,Peak_start,Peak_end,Peak_strength,link


In [10]:
# Write the gene search table as a JSON array with one object per row.
# NOTE(review): this goes to '../ChiPdb/data/' while the TF search file is
# written to '../data/' — confirm the destination is intended.
all_genes.to_json(path_or_buf='../ChiPdb/data/genes_search.json', orient='records')