# Operon Links - RegulonDB

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Input-Files" data-toc-modified-id="Input-Files-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Input Files</a></span></li><li><span><a href="#Gene-Mapping" data-toc-modified-id="Gene-Mapping-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Gene Mapping</a></span></li><li><span><a href="#Operon-Matching" data-toc-modified-id="Operon-Matching-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Operon Matching</a></span></li><li><span><a href="#Quick-error-fix" data-toc-modified-id="Quick-error-fix-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Quick error fix</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

## Input Files

In [2]:
# Load the RegulonDB transcription-unit table. The file is tab-separated and
# read without a header row; the first 37 lines are skipped (presumably the
# file's comment preamble -- confirm against the downloaded file). The first
# column becomes the index and the remaining six columns are named below.
operon_columns = ['TU', 'operon', 'genes', 'promoter', 'evidence', 'confidence']
operon_df = pd.read_csv(
    'regulondb_TUs.txt',
    sep = '\t',
    header = None,
    skiprows = 37,
    index_col = 0,
)
operon_df.columns = operon_columns
operon_df

Unnamed: 0_level_0,TU,operon,genes,promoter,evidence,confidence
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ECK120030224,C0299,C0299,C0299,,,
ECK120030164,C0362,C0362,C0362,,[IHBCE|W|Inferred by a human based on computat...,Weak
ECK120027910,C0465,C0465,C0465,,[AISGDTU|W|Automated inference that a single-g...,Weak
ECK120027906,C0614,C0614,C0614,,[AISGDTU|W|Automated inference that a single-g...,Weak
ECK120030163,C0664,C0664,C0664,,[IHBCE|W|Inferred by a human based on computat...,Weak
...,...,...,...,...,...,...
ECK120009821,zraP,zraP,zraP,zraPp,,
ECK120009822,zraSR,zraSR,"zraS,zraR",zraSp,"[ITCR|W|Inferred through co-regulation],[IEP|W...",Weak
ECK120021094,zupT,zupT,zupT,zupTp,[AISGDTU|W|Automated inference that a single-g...,Weak
ECK120021095,zur,zur,zur,,[AISGDTU|W|Automated inference that a single-g...,Weak


In [3]:
# Source 1: RegulonDB gene product set (tab-separated, no header row, first 41
# lines skipped). Column 1 is used as the key and column 5 as the value
# (presumably gene name -> 'forward'/'reverse' -- confirm against the file).
regulondb_gene_info = pd.read_csv(
    'RegulonDB_GeneProductSet.txt',
    sep = '\t',
    header = None,
    skiprows = 41,
    index_col = 0,
)
strand_dict = {
    name: strand
    for name, strand in zip(regulondb_gene_info[1], regulondb_gene_info[5])
}

# Source 2: local gene_info.csv, whose strand column uses '+' / '-' symbols.
gene_info = pd.read_csv('gene_info.csv', index_col = 0)
strand_dict1 = dict(zip(gene_info.gene_name, gene_info.strand))

# Translate the symbols into the 'forward' / 'reverse' labels.
sub_strand = {'+':'forward', '-':'reverse'}
strand_dict2 = {gene: sub_strand[symbol] for gene, symbol in strand_dict1.items()}

# Merge the two sources; on a name clash the gene_info.csv entry wins.
strand_dict.update(strand_dict2)

In [4]:
# Hand-entered strands for gene names that are added (or overridden) on top of
# the two files loaded above. The trailing notes record the current gene name.
strand_dict.update({
    'yaiV': 'forward',
    'ydaC': 'reverse',  # now rcbA
    'ydaQ': 'reverse',  # now xisR
    'ydcX': 'forward',  # now ortT
    'acrS': 'reverse',  # now envR
    'yjhX': 'reverse',  # now topAI
    'sfmZ': 'reverse',  # unclear but everything in the neighborhood is reverse.
    'croE': 'forward',  # now ymfT
    'stfP': 'forward',  # now ycfK
    'insP': 'forward',  # now yncK
    'yneM': 'forward',  # now mgtS
    'intK': 'reverse',  # now ydfW
    'ynhF': 'reverse',  # now cydH
    'yegK': 'reverse',  # now pphC
    'cbdX': 'forward',  # now appX
})

In [5]:
# Gene search index: one row per (gene, binding site) pair, covering every
# organism in the file (see the Organism column).
gene_search_path = '../ChiPdb/data/genes_search.json'
gene_search = pd.read_json(gene_search_path)
gene_search

Unnamed: 0,Gene,Locus,Synonyms,Organism,Strain,Binding_site_id,Condition,Peak_start,Peak_end,Peak_strength,link
0,SVEN_RS00030,SVEN_RS00030,,Streptomyces venezuelae,ATCC 10712,Lsr2-MS-1,lsr2 + MS,3811,4024,-1.880621,tf_dashboard.html?organism=s_venezuelae&tf=Lsr...
1,SVEN_RS01020,SVEN_RS01020,,Streptomyces venezuelae,ATCC 10712,Lsr2-MS-2,lsr2 + MS,228117,228485,-1.782210,tf_dashboard.html?organism=s_venezuelae&tf=Lsr...
2,SVEN_RS01220,SVEN_RS01220,,Streptomyces venezuelae,ATCC 10712,Lsr2-MS-3,lsr2 + MS,268339,269459,-1.243085,tf_dashboard.html?organism=s_venezuelae&tf=Lsr...
3,SVEN_RS01230,SVEN_RS01230,,Streptomyces venezuelae,ATCC 10712,Lsr2-MS-4,lsr2 + MS,269662,269930,-1.308862,tf_dashboard.html?organism=s_venezuelae&tf=Lsr...
4,SVEN_RS02195,SVEN_RS02195,,Streptomyces venezuelae,ATCC 10712,Lsr2-MS-5,lsr2 + MS,501613,502566,-1.668116,tf_dashboard.html?organism=s_venezuelae&tf=Lsr...
...,...,...,...,...,...,...,...,...,...,...,...
32450,gntK,b3437,ECK3422,Escherichia coli,K-12 MG1655,FlhC-LB-46,flhc + LB,3578756,3578798,9.620000,tf_dashboard.html?organism=e_coli&tf=BaeR&geno...
32451,gntR,b3438,ECK3423,Escherichia coli,K-12 MG1655,FlhC-LB-46,flhc + LB,3578756,3578798,9.620000,tf_dashboard.html?organism=e_coli&tf=BaeR&geno...
32452,cspA,b3556,ECK3543,Escherichia coli,K-12 MG1655,FlhC-LB-47,flhc + LB,3719983,3720012,1.450000,tf_dashboard.html?organism=e_coli&tf=BaeR&geno...
32453,aslB,b3800,"atsB,ECK3793,gppB",Escherichia coli,K-12 MG1655,FlhC-LB-48,flhc + LB,3982479,3982527,2.020000,tf_dashboard.html?organism=e_coli&tf=BaeR&geno...


In [6]:
# Map every synonym to its primary gene name. Synonyms is a comma-separated
# string; rows with an empty string contribute nothing. When a synonym occurs
# on several rows, the last row wins.
syn_dict = {
    synonym: gene
    for synonyms, gene in zip(gene_search.Synonyms, gene_search.Gene)
    if len(synonyms) > 0
    for synonym in synonyms.split(',')
}

## Gene Mapping

In [7]:
# Map each row id of operon_df to the set of gene names in its
# comma-separated `genes` column.
operon_dict = {
    op_id: set(gene_list.split(','))
    for op_id, gene_list in operon_df['genes'].items()
}

In [8]:
# Union of every gene name that appears in any operon set.
genes_odf = set().union(*operon_dict.values())

In [9]:
# Sanity check: list every E. coli gene in the search index for which no strand
# is known. A gene counts as known when its name, or the part of its name
# before the first underscore, is a key of strand_dict.
# Explicit membership tests replace the former nested bare `except:` blocks,
# which would also have hidden unrelated errors (including KeyboardInterrupt).
gec = gene_search.loc[gene_search.Organism == 'Escherichia coli']
bad_genes = [
    g for g in gec.Gene
    if g not in strand_dict
    # non-string entries (e.g. missing values) have no prefix to fall back on
    and not (isinstance(g, str) and g.partition('_')[0] in strand_dict)
]
bad_genes

[]

## Operon Matching

In [10]:
from tqdm.notebook import tqdm

In [12]:
# iterate through this folder
folder = '../ChiPdb/data/e_coli/NC_000913.3/table/'

# use these to build the new links
# unfortunately have to use search functionality since actual
#    operon ids are not available :/
link_pre = 'http://regulondb.ccg.unam.mx/search?term='
link_suf = '&organism=ECK12&type=All#OPERON'

html_pre = '<a href="'
html_mid = '" target="_blank">'
html_suf = '</a>'

# Empty-text links that this cell used to emit and then strip again with
# str.replace. They came from matching an EMPTY strand group: the empty set is
# a subset of every operon, so some operon was always "found". Empty groups are
# now skipped, so these patterns are no longer generated; the names are kept
# only in case other cells still reference them.
bad1 = ',<a href="http://regulondb.ccg.unam.mx/search?term=ECK120009818&organism=ECK12&type=All#OPERON" target="_blank"></a>'
bad2 = '<a href="http://regulondb.ccg.unam.mx/search?term=ECK120009818&organism=ECK12&type=All#OPERON" target="_blank"></a>,'


def _split_by_strand(target_genes):
    """Split a comma-separated gene string into (forward, reverse) name lists.

    A name that is not a key of ``strand_dict`` but contains an underscore is
    replaced by the part before its first underscore. Anything whose strand is
    not 'forward' goes into the reverse list. Input order is kept and
    duplicates are dropped, so the output is reproducible between runs.

    Raises KeyError when a gene has no entry in ``strand_dict``.
    """
    forward, reverse = [], []
    for g in target_genes.split(','):
        if g not in strand_dict and '_' in g:
            g = g.partition('_')[0]
        bucket = forward if strand_dict[g] == 'forward' else reverse
        if g not in bucket:
            bucket.append(g)
    return forward, reverse


def _pick_operon(genes, closest_gene):
    """Return the operon_df id to link for ``genes``, or None if nothing fits.

    1. Prefer operons that contain every gene; among those keep the smallest
       and break ties by ('confidence', 'promoter') in ascending order
       (missing values sort last, the pandas default).
    2. Otherwise consider operons lying entirely inside ``genes``; keep the
       largest, prefer one that contains ``closest_gene``, then apply the same
       tie-break.
    """
    gene_set = set(genes)

    # first, look for operons that contain this gene set
    containing = {
        op_id: len(op_set)
        for op_id, op_set in operon_dict.items()
        if gene_set <= op_set
    }
    if containing:
        smallest = min(containing.values())
        candidates = [op_id for op_id, size in containing.items() if size == smallest]
        match_df = operon_df.loc[candidates].sort_values(['confidence', 'promoter'])
        return match_df.index[0]

    # if no operon contains it, fall back to operons fully covered by it
    covered = {
        op_id: len(op_set)
        for op_id, op_set in operon_dict.items()
        if gene_set >= op_set
    }
    if not covered:
        return None

    largest = max(covered.values())
    candidates = [op_id for op_id, size in covered.items() if size == largest]
    match_df = operon_df.loc[candidates]
    # Exact membership in the operon's gene set. The earlier str.contains call
    # did regex/substring matching on the comma-joined string and wrote into a
    # slice of operon_df.
    match_df = match_df.assign(
        contains_closest=[closest_gene in operon_dict[op_id] for op_id in match_df.index]
    )
    match_df = match_df.sort_values(
        ['contains_closest', 'confidence', 'promoter'],
        ascending = [False, True, True],
    )
    return match_df.index[0]


def _targets_to_html(target_genes, closest_gene):
    """Build the target_genes_html value for one table row.

    Each non-empty strand group becomes one RegulonDB search link whose text is
    the group's comma-joined gene names, or plain text when no operon matches.
    Groups are joined with commas, forward strand first.
    """
    parts = []
    for genes in _split_by_strand(target_genes):
        if not genes:
            # nothing on this strand: emit nothing (see the note on bad1/bad2)
            continue
        text = ','.join(genes)
        op_id = _pick_operon(genes, closest_gene)
        if op_id is None:
            parts.append(text)
        else:
            parts.append(html_pre + link_pre + op_id + link_suf + html_mid + text + html_suf)
    return ','.join(parts)


tables = os.listdir(folder)
for t in tqdm(tables, desc = 'tables'):
    try:
        bt = pd.read_json(folder + t)
    except (ValueError, OSError):
        # not a readable JSON table (stray file, sub-directory, ...): skip it
        continue

    # Collect the new column in a list and assign it once, rather than writing
    # cell by cell with bt.loc.
    html_column = []
    # leave=False clears each per-table bar when it finishes, so the cell
    # output is not one widget per table.
    for _, row in tqdm(bt.iterrows(), total = len(bt), desc = t, leave = False):
        if len(row.target_genes) > 0:
            html_column.append(_targets_to_html(row.target_genes, row.closest_gene))
        else:
            html_column.append(row.target_genes)
    bt['target_genes_html'] = html_column

    # output (overwrites the input table in place)
    bt.to_json(folder + t, orient = 'records')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…



