# proChIPdb Links

This notebook systematically gathers external database links for each transcription factor (TF) in proChIPdb, instead of collecting them ad hoc, and writes one link table per TF.

Databases needed:
- EcoCyc
- RegulonDB
- UniProt
- PDB
- iModulonDB

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-in-data" data-toc-modified-id="Read-in-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read in data</a></span></li><li><span><a href="#Uniprot" data-toc-modified-id="Uniprot-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Uniprot</a></span></li><li><span><a href="#iModulonDB" data-toc-modified-id="iModulonDB-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>iModulonDB</a></span></li><li><span><a href="#Output" data-toc-modified-id="Output-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Output</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
from pymodulon.core import IcaData
from pymodulon.io import *

## Read in data

In [2]:
# Input locations, all relative to this notebook.
search_path = '../ChiPdb/data/tfs_search.json'
imdb_path = '../../iModulonDB/final_im_objects/e_coli_precise2_imdb.json'
link_path = 'e_coli_link_df.csv'

# Existing link table; the first CSV column becomes the index.
link_df = pd.read_csv(link_path, index_col=0)

# Model loaded through pymodulon's JSON loader, and the TF search table.
ica_data = load_json_model(imdb_path)
tf_info = pd.read_json(search_path)

## Uniprot

In [6]:
# Tab-separated EcoCyc gene download; the first column is used as the index
# (the UniProt cell looks rows up by gene name).
ecocyc_df = pd.read_csv(
    'ecocyc_gene_download.tsv',
    sep='\t',
    index_col=0,
)

In [23]:
# Fill link_df['UniProt'] with a UniProt URL for every TF that can be matched
# to a row of ecocyc_df. TFs that cannot be matched are collected in bad_tfs
# and their UniProt cell is left untouched.
bad_tfs = []

# Names (after lower-casing the first letter) that must be translated before
# they are looked up in ecocyc_df.
tf_rename = {'ybaO':'decR', 'yihW':'csqR', 'h-NS':'hns',
            'rpoB (Cra and Crp KO exps)':'rpoB', 'arcA-1':'arcA',
            'arcA-2':'arcA'}

for tf in link_df.index:
    # Only non-empty string labels can be converted to a gene name.
    if not isinstance(tf, str) or not tf:
        bad_tfs.append(tf)
        continue

    # ecocyc_df is queried with the first letter lower-cased ('UvrY' -> 'uvrY').
    tf_lower = tf[0].lower() + tf[1:]
    tf_lower = tf_rename.get(tf_lower, tf_lower)

    # Explicit membership test instead of a bare `except:` so that unrelated
    # failures (e.g. a missing 'UniProt' column) are not silently swallowed.
    if tf_lower not in ecocyc_df.index:
        bad_tfs.append(tf)
        continue

    up_id = ecocyc_df.loc[tf_lower, 'UniProt']

    # A NaN id (no UniProt entry) or a Series (duplicated gene name in the
    # index) cannot be turned into a single URL; skip it as before.
    if not isinstance(up_id, str):
        bad_tfs.append(tf)
        continue

    link_df.loc[tf, 'UniProt'] = 'https://www.uniprot.org/uniprot/' + up_id

In [24]:
# Index labels of the rows that still have no UniProt link.
missing_uniprot = link_df['UniProt'].isna()
link_df.index[missing_uniprot]

Index(['mixed-TFs-pool1a', 'mixed-TFs-pool1b', 'mixed-TFs-pool1c',
       'mixed-TFs-pool2', 'mixed-TFs-pool3', 'mixed-TFs-pool4',
       'mixed-TFs-pool5'],
      dtype='object')

## iModulonDB

In [3]:
# get ims for each tf
# Each `regulator` entry is a string of TF names joined by '+' or '/'; every
# TF named in it is associated with that iModulon (k). Non-string entries
# (e.g. NaN for iModulons without a regulator) are skipped.
tf_im_lists = {}
for k, regstr in zip(ica_data.imodulon_table.index, ica_data.imodulon_table.regulator):
    if not isinstance(regstr, str):
        continue

    for tf in regstr.replace('/', '+').split('+'):
        tf_im_lists.setdefault(tf, []).append(k)

# One row per TF, columns 0..n-1 holding its iModulons in encounter order;
# shorter rows are padded with missing values. Building the frame once avoids
# enlarging it cell by cell inside the loop.
tf_to_im = pd.DataFrame.from_dict(tf_im_lists, orient='index')

# remove unnecessary
# Keep only TFs present in link_df, then drop columns that are entirely
# missing. `axis`/`how` are passed by keyword: pandas 2.x no longer accepts
# them positionally.
rel_tfs = tf_to_im.loc[tf_to_im.index.isin(link_df.index)].dropna(axis=1, how='all')

# make the links
# Position of each iModulon name within ica_data.imodulon_table; this number
# is what goes into the `k` query parameter of the URL.
imname2num = {
    name: position
    for position, name in enumerate(ica_data.imodulon_table.index)
}

def im_link(im_name):
    """Return the iModulonDB page URL for the iModulon called `im_name`.

    Raises KeyError if `im_name` is not a key of `imname2num`.
    """
    base_url = 'https://imodulondb.org/iModulon.html?organism=e_coli&dataset=precise2&k='
    return base_url + str(imname2num[im_name])

## Output

In [27]:
# save the link_df
# Written back to `link_path`, the same file link_df was read from in the
# "Read in data" cell, so the input is overwritten with the updated links.
# Reusing the variable keeps the read and write locations in sync.
link_df.to_csv(link_path)

In [7]:
# Destination folder for the per-TF link tables.
output_folder = '../ChiPdb/data/e_coli/outlinks/'

# Write one '<tf>_links.csv' per row of link_df: the row's existing links
# (missing values blanked) plus one entry per iModulon found for that TF.
for tf, row in link_df.iterrows():
    res = row.fillna('')

    # iModulons recorded for this TF, or nothing if the TF has none.
    tf_ims = rel_tfs.loc[tf].dropna() if tf in rel_tfs.index else []
    for k in tf_ims:
        res[k] = im_link(k)

    out_file = output_folder + tf + '_links.csv'
    res.to_csv(out_file, header = None)

In [8]:
# Show `res` as left by the output loop, i.e. the link table of the last TF
# that was written.
res

EcoCyc          https://ecocyc.org/gene?orgid=ECOLI&id=EG11140
RegulonDB    http://regulondb.ccg.unam.mx/regulon?term=ECK1...
UniProt                 https://www.uniprot.org/uniprot/P0AED5
PDB                                                           
Name: UvrY, dtype: object