# Import Packages

In [120]:
from collections import Counter, OrderedDict
from itertools import chain, combinations
from pathlib import Path
import sys

import networkx as nx
import pandas as pd
from tqdm.notebook import tqdm

sys.path.append('../../bitome2')
sys.path.append('../')

from bitome.core import Bitome

# Load E. coli K-12 MG1655 Bitome

In [2]:
K12_DATA_PATH = Path('../data/bitome2/mg1655')

In [3]:
# from GenBank annotation
origin = (3925743, 3925975)
# from 1. Duggin, I. G. & Bell, S. D. J. Mol. Biol. (2009). with the following terA/terC sequences:
# ter_a = 'AATTAGTATGTTGTAACTAAAGT'
# ter_c = 'ATATAGGATGTTGTAACTAATAT'
terminus = (1341745, 1609180)

In [4]:
mg1655 = Bitome(
    Path(K12_DATA_PATH, 'NC_000913.3.gb'),
    origin=origin, terminus=terminus,
    gene_table=Path(K12_DATA_PATH, 'gene_info_supp.csv'),
    tu_table=Path(K12_DATA_PATH, 'tu.csv'),
    operon_table=Path(K12_DATA_PATH, 'operon.csv'),
    tss_table=Path(K12_DATA_PATH, 'tss.csv'),
    # TODO PARSE TTS data
    tts_table=None,
    tfbs_table=Path(K12_DATA_PATH, 'tfbs.csv'),
    terminator_table=Path(K12_DATA_PATH, 'terminator.csv'),
    attenuator_table=Path(K12_DATA_PATH, 'attenuator.csv'),
    rbs_table=Path(K12_DATA_PATH, 'rbs.csv'),
    riboswitch_table=Path(K12_DATA_PATH, 'riboswitch.csv')
)

# List 81 TFs of Interest

Swapped IciA for ArgP (alternative name for same TF)

In [5]:
tfs_of_interest = [
    'Lrp', 'TyrR', 'PdhR', 'Nac', 'NtrC', 'GadE', 'InfA', 'Tus', 'StpA', 'Rob', 'CspE', 'HupA', 'HupB',
    'Cra', 'SdiA', 'LsrR', 'ArgR', 'PurR', 'AppY', 'ArgP', 'GadX', 'OxyR', 'SoxS', 'ArcA', 'Mlc',
    'YdcI', 'YiaJ', 'YjhU', 'YagI', 'YeiE', 'YbiH', 'YafC', 'YieP', 'YddM', 'YiaG', 'YheO', 'YbaQ',
    'YbaO', 'YchA', 'YihY', 'YjdC', 'YhjC', 'YdhB', 'YahB', 'YcjW', 'YidZ', 'YqhC', 'YfeD', 'YggD',
    'YihW', 'YcfQ', 'YfeC', 'YebK', 'YedW', 'YciT', 'YgeR', 'YnfL', 'YiaU', 'YbeF', 'YjhI', 'YneJ',
    'YbcM', 'YcaN', 'YbhD', 'YdiP', 'YdcN', 'YbdO', 'YfiE', 'YidL', 'YihL', 'YehT', 'YpdC', 'YgaV',
    'YhjB', 'YeeY', 'YgfI', 'YidP', 'YdcR', 'YgbI', 'YjjJ', 'YeaM'
]
print(len(tfs_of_interest))

81


# Get Binding Sites from Bitome

In [6]:
bitome_tfbs_df = mg1655.tfbs_table[mg1655.tfbs_table['left'].notna()]
bitome_tfbs_df['source'] = 'RegulonDB'
bitome_tfbs_df.head()

Unnamed: 0_level_0,left,right,mode,final_state,tf,strand,source
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ECK120011181,612650,612668,repressor,Fur-Fe<SUP>2+</SUP>,Fur,,RegulonDB
ECK120011181,612650,612668,repressor,Fur-Fe<SUP>2+</SUP>,Fur,,RegulonDB
ECK120011184,4516744,4516762,repressor,Fur-Fe<SUP>2+</SUP>,Fur,,RegulonDB
ECK120011188,1619005,1619019,activator,Fis,Fis,,RegulonDB
ECK120011197,3305958,3305975,activator,TyrR-tyrosine,TyrR,,RegulonDB


# Get Binding Sites from Ye's yTF ChIP-Exo Data

In [7]:
gao_s4_df = pd.read_excel(Path('..', 'data', 'gao_dataset_s4.xlsx'), sheet_name='pre_formatted')
gao_s4_df = gao_s4_df.rename(columns={'Start': 'left', 'End': 'right', 'protein': 'tf'})
gao_s4_df = gao_s4_df.drop(columns=['S/N', 'b_number'])
gao_s4_df['mode'] = None
gao_s4_df['final_state'] = None
gao_s4_df['strand'] = None
gao_s4_df['source'] = 'Gao'
gao_s4_df['locus_tag'] = [f'gaos4_{i}' for i in range(gao_s4_df.shape[0])]
gao_s4_df = gao_s4_df.set_index('locus_tag')
gao_s4_df.head()

Unnamed: 0_level_0,left,right,tf,mode,final_state,strand,source
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gaos4_0,96929,96970,YidZ,,,,Gao
gaos4_1,149309,149359,YidZ,,,,Gao
gaos4_2,213955,213985,YidZ,,,,Gao
gaos4_3,296762,296809,YidZ,,,,Gao
gaos4_4,365657,365697,YidZ,,,,Gao


In [8]:
gao_s5_df = pd.read_excel(Path('..', 'data', 'gao_dataset_s5.xlsx'), sheet_name='pre_formatted_bitome')
gao_s5_df = gao_s5_df.rename(columns={'TF': 'tf'})
def locus_to_l(locus):
    l, r = locus.split(':')[-1].split('-')
    return int(l)
def locus_to_r(locus):
    l, r = locus.split(':')[-1].split('-')
    return int(r)
gao_s5_df['left'] = gao_s5_df['Locus'].apply(locus_to_l)
gao_s5_df['right'] = gao_s5_df['Locus'].apply(locus_to_r)
gao_s5_df = gao_s5_df.drop(columns=['Binding Sites', 'b_number', 'Nearest gene', 'Transcription Unit ',
                                   'Description', 'Location', 'Locus'])
gao_s5_df['mode'] = None
gao_s5_df['final_state'] = None
gao_s5_df['strand'] = None
gao_s5_df['source'] = 'Gao'
gao_s5_df['locus_tag'] = [f'gaos5_{i}' for i in range(gao_s5_df.shape[0])]
gao_s5_df = gao_s5_df.set_index('locus_tag')
gao_s5_df.head()

Unnamed: 0_level_0,tf,left,right,mode,final_state,strand,source
locus_tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gaos5_0,YeiE,71350,72115,,,,Gao
gaos5_1,YeiE,79463,80864,,,,Gao
gaos5_2,YeiE,85629,87354,,,,Gao
gaos5_3,YeiE,361149,362403,,,,Gao
gaos5_4,YeiE,430352,431237,,,,Gao


# Combine TFBS and Filter for TFs of Interest

In [9]:
# Don't actually use S5...it seems to not have precise binding locations
all_tfbs_df = pd.concat([bitome_tfbs_df, gao_s4_df])
all_tfbs_df_interest = all_tfbs_df[all_tfbs_df['tf'].isin(tfs_of_interest)]
all_tfbs_df_interest = all_tfbs_df_interest.drop_duplicates(subset=['left', 'right', 'tf'])
all_tfbs_df_interest = all_tfbs_df_interest.drop(columns='strand')

tfs_included = all_tfbs_df_interest['tf'].unique()
tfs_excluded = list(set(tfs_of_interest) - set(tfs_included))

print(f'{all_tfbs_df_interest.shape[0]} TFBS for all TFs of interest')
print()
print(f'{len(tfs_included)} TFs with binding site data:')
print(tfs_included)
print()
print(f'{len(tfs_excluded)} TFs with no binding site data:')
print(tfs_excluded)

1107 TFBS for all TFs of interest

55 TFs with binding site data:
['TyrR' 'ArgP' 'Mlc' 'Nac' 'NtrC' 'SoxS' 'OxyR' 'ArcA' 'Lrp' 'ArgR' 'GadE'
 'PurR' 'Cra' 'Rob' 'YiaJ' 'PdhR' 'GadX' 'SdiA' 'YqhC' 'StpA' 'AppY'
 'LsrR' 'YidZ' 'YcjW' 'YdcN' 'YfeC' 'YciT' 'YfeD' 'YdhB' 'YdcR' 'YebK'
 'YbcM' 'YedW' 'YiaU' 'YhjC' 'YihW' 'YneJ' 'YcaN' 'YgaV' 'YdiP' 'YjhI'
 'YbdO' 'YeaM' 'YfiE' 'YahB' 'YbeF' 'YcfQ' 'YihL' 'YidL' 'YehT' 'YgfI'
 'YbhD' 'YgbI' 'YhjB' 'YnfL']

26 TFs with no binding site data:
['YagI', 'YpdC', 'YchA', 'InfA', 'YjdC', 'YiaG', 'YafC', 'CspE', 'YieP', 'YjhU', 'YeiE', 'YeeY', 'YgeR', 'YdcI', 'YihY', 'Tus', 'YbaQ', 'YjjJ', 'HupA', 'YggD', 'YddM', 'YidP', 'YbaO', 'HupB', 'YheO', 'YbiH']


# Find Overlaps

In [10]:
overlap_finding_df = all_tfbs_df_interest.copy()

overlap_dfs = []

while overlap_finding_df.shape[0] != 0:
    tfbs_row = overlap_finding_df.iloc[0, :]
    l, r = tfbs_row.left, tfbs_row.right
    
    overlap_df = overlap_finding_df[
        ((overlap_finding_df['left'] < l) & (l < overlap_finding_df['right'])) |
        ((overlap_finding_df['left'] < r) & (r < overlap_finding_df['right'])) | 
        ((overlap_finding_df['left'] > l) & (overlap_finding_df['right'] < r))
    ]
    if not overlap_df.empty:
        overlap_df = overlap_df.append(tfbs_row)
        overlap_dfs.append(overlap_df)
    drop_idx = [tfbs_row.name] + list(overlap_df.index)
    overlap_finding_df = overlap_finding_df.drop(index=drop_idx)
    
overlap_df = pd.concat(overlap_dfs)

# Summarize Overlap Info

In [123]:
print(f'{overlap_df.shape[0]} TFBS are involved in overlaps')
print()

tf_counts = Counter(overlap_df['tf'])
for tf, count in sorted(tf_counts.items(), key=lambda tup: tup[1], reverse=True):
    print(f'{tf}: {count}')

192 TFBS are involved in overlaps

ArcA: 36
SoxS: 16
Lrp: 16
YqhC: 11
GadX: 9
YdcN: 9
Rob: 8
YebK: 8
YidZ: 7
YdhB: 7
YhjC: 6
YedW: 5
OxyR: 4
ArgP: 3
Nac: 3
Cra: 3
NtrC: 3
YiaU: 3
YjhI: 3
GadE: 2
YfeC: 2
SdiA: 2
YciT: 2
YbeF: 2
YcaN: 2
YneJ: 2
YdiP: 2
YihW: 2
YeaM: 2
YgaV: 2
Mlc: 1
YbhD: 1
YcjW: 1
YfiE: 1
YhjB: 1
YidL: 1
YehT: 1
YcfQ: 1
YahB: 1
YihL: 1


# Write Results to File

In [124]:
for i, df in enumerate(overlap_dfs):
    with open(Path('..', 'data', 'tfbs_overlap.csv'),'a') as f:
        df.to_csv(f, header=(i==0))
        f.write("\n")

# Compute Optimal Pools for ChIP-Exo

In [133]:
tf_blacklists = {}

for tf in tfs_included:
    overlap_dfs_with_tf = [odf for odf in overlap_dfs if tf in list(odf['tf'])]
    if overlap_dfs_with_tf:
        tf_blacklists[tf] = list(set(pd.concat(overlap_dfs_with_tf)['tf'].unique()) - set([tf]))
    else:
        tf_blacklists[tf] = []
        
sorted_blacklists = OrderedDict(sorted(tf_blacklists.items(), key=lambda tup: len(tup[1]), reverse=True))

blacklists_copy = sorted_blacklists.copy()
pools = []

while len(blacklists_copy) > 0:
    
    pool = []
    for tf in blacklists_copy.keys():
        if not pool:
            pool.append(tf)
        else:
            full_blacklist = list(chain(*[blacklists_copy[pool_tf] for pool_tf in pool]))
            if tf not in full_blacklist:
                pool.append(tf)
    pools.append(pool)
    for pool_tf in pool:
        del blacklists_copy[pool_tf]
        
for i, pool in enumerate(pools):
    print(f'Pool {i+1}: {pool}')

Pool 1: ['YdhB', 'Lrp', 'Cra', 'YjhI', 'OxyR', 'YiaU', 'YcfQ', 'YidL', 'Mlc', 'Nac', 'YcjW', 'YgaV', 'YihL', 'TyrR', 'NtrC', 'ArgR', 'PurR', 'YiaJ', 'PdhR', 'SdiA', 'StpA', 'AppY', 'LsrR', 'YfeD', 'YdcR', 'YbcM', 'YbdO', 'YgfI', 'YgbI', 'YnfL']
Pool 2: ['YdcN', 'YhjC', 'YneJ', 'SoxS', 'YfeC', 'YdiP', 'ArgP', 'GadE', 'YahB', 'YeaM']
Pool 3: ['YqhC', 'Rob', 'YcaN', 'YciT', 'YfiE', 'GadX', 'YehT']
Pool 4: ['YidZ', 'YedW', 'ArcA', 'YhjB']
Pool 5: ['YebK', 'YbeF']
Pool 6: ['YihW', 'YbhD']
