# Import Packages and Construct Bitome

In [90]:
from pathlib import Path
import re
import sys

import pandas as pd

sys.path.append('../../bitome2')

from bitome.core import Bitome

In [57]:
# Directory holding the MG1655 input files (GenBank record and annotation tables)
K12_DATA_PATH = Path('..', 'data', 'bitome2', 'mg1655')

In [58]:
# Replication origin coordinates, from the GenBank annotation
origin = (3925743, 3925975)
# Replication terminus coordinates, from
# 1. Duggin, I. G. & Bell, S. D. J. Mol. Biol. (2009). with the following terA/terC sequences:
# ter_a = 'AATTAGTATGTTGTAACTAAAGT'
# ter_c = 'ATATAGGATGTTGTAACTAATAT'
terminus = (1341745, 1609180)

In [120]:
# Assemble the MG1655 Bitome from the GenBank record plus the supplementary annotation tables,
# all of which live under K12_DATA_PATH
mg1655 = Bitome(
    K12_DATA_PATH / 'NC_000913.3.gb',
    name='MG1655',
    origin=origin,
    terminus=terminus,
    gene_table=K12_DATA_PATH / 'gene_info_supp.csv',
    tu_table=K12_DATA_PATH / 'tu.csv',
    operon_table=K12_DATA_PATH / 'operon.csv',
    tss_table=K12_DATA_PATH / 'tss.csv',
    # TODO: parse TTS data; no TTS table is passed until that is done
    tts_table=None,
    tfbs_table=K12_DATA_PATH / 'tfbs.csv',
    terminator_table=K12_DATA_PATH / 'terminator.csv',
    attenuator_table=K12_DATA_PATH / 'attenuator.csv',
    rbs_table=K12_DATA_PATH / 'rbs.csv',
    riboswitch_table=K12_DATA_PATH / 'riboswitch.csv',
)

# Promoter Table Example

In [121]:
# Transcription unit used throughout the worked examples
tu_id = 'ECK120009218'

# Row of the TU table describing that transcription unit
tu_row = mg1655.tu_table.loc[tu_id]

# Strand of the example TU. Later cells pass `tu_strand` to get_tu_first_gene and
# promoter_motif_search, but no visible cell assigned it, so they failed with a NameError.
# NOTE(review): assumes the TU table exposes a `strand` column (1 / -1) -- confirm.
tu_strand = tu_row.strand

# get the first gene of this TU
def get_tu_first_gene(tu_id, strand):
    """
    Return the ID of the gene that comes first in a transcription unit

    :param tu_id: key into mg1655._tu_to_genes identifying the transcription unit
    :param strand: 1 picks the gene with the smallest 'left' value; any other value picks the gene
        with the largest 'right' value
    :return: the gene_table index label of the selected gene
    """
    gene_ids = mg1655._tu_to_genes[tu_id]
    candidate_genes = mg1655.gene_table.loc[gene_ids]

    # choose the sort key and direction so that the wanted gene lands in the first row
    sort_column, ascending = ('left', True) if strand == 1 else ('right', False)
    ordered_genes = candidate_genes.sort_values(by=sort_column, ascending=ascending)
    return ordered_genes.index[0]

# Resolve the first gene of the example TU and fetch its row from the gene table
first_gene_id = get_tu_first_gene(tu_row.name, tu_strand)
first_gene_row = mg1655.gene_table.loc[first_gene_id]

# Gather the genome-organization features for that gene, then attach the TU's sigma factor entry
gene_info_dict = mg1655.genome_organization_for_gene(
    first_gene_row,
    primary_tu=True,
)
gene_info_dict['sigma_factor'] = tu_row['sigma_factor']
gene_info_dict

{'genome_loc_sin': 0.8090169943749473,
 'genome_loc_cos': 0.5877852522924732,
 'rep_region': 'lagging',
 'origin_dist': 1506716,
 'tu_len': 4319.0,
 'tu_gc': 0.5448020375086826,
 'tu_order': 1.0,
 'tss_dist': 26.0,
 'tss_base': 'A',
 'utr_len': 26.0,
 'utr_gc': 0.3846153846153846,
 'box_10_seq': 'TATGCT',
 'box_10_tss_dist': 12.0,
 'box_10_ext_gc': 0.33333333333333337,
 'box_35_seq': 'ATGTCA',
 'box_35_tss_dist': 36.0,
 'spacer_len': 18.0,
 'spacer_gc': 0.38888888888888884,
 'tu_tm': 47.122741534424165,
 'tu_gatc': False,
 'sigma_factor': 'Sigma70, Sigma38'}

# Regulatory States

In [122]:
# Directory holding the raw RegulonDB TXT files read by read_regulon_db_file
REGULONDB_PATH = Path('..', 'data', 'bitome2', 'mg1655', 'regulondb10.6')

In [123]:
def read_regulon_db_file(filename, base_path=None) -> pd.DataFrame:
    """
    Given a filename for a raw TXT file from RegulonDB, parses into a DataFrame

    The file is expected to begin with a block of '#'-prefixed comment lines. Comment lines that start
    with '# <number>' supply the column names: the name is whatever follows the number and the two
    separator characters after it (presumably a layout such as '# 3) PHRASE' -- only the separator
    width is relied upon). Everything after the leading comment block is read as tab-separated data
    without a header row.

    :param Union[str, Path] filename: the filename of the RegulonDB TXT file to read into a pandas DataFrame
    :param Optional[Union[str, Path]] base_path: directory containing the file; when None (the default)
        the module-level REGULONDB_PATH is used
    :return pd.DataFrame regulon_df: a pandas DataFrame parsed from a RegulonDB raw TXT file, with
        duplicate rows removed; empty (but carrying the parsed column names) when the file holds
        nothing except comment lines
    """

    base_dir = REGULONDB_PATH if base_path is None else base_path
    full_filename = Path(base_dir, filename)

    with open(full_filename, 'r') as f:
        lines = f.readlines()

    # number of leading comment lines to skip; falls back to the file length so that an empty or
    # comment-only file cannot be indexed past its end
    n_header = next(
        (idx for idx, line in enumerate(lines) if not line.startswith('#')),
        len(lines)
    )

    # measure the '# <number>' prefix instead of assuming a one-digit number, so names of columns 10
    # and above do not keep a stray leading character; strip only the newline so a final line
    # without one keeps its last character
    names = []
    for line in lines:
        numbered = re.match(r'# \d+', line)
        if numbered:
            names.append(line[numbered.end() + 2:].rstrip('\n'))

    # nothing but comments: there are no rows for read_csv to parse
    if n_header == len(lines):
        return pd.DataFrame(columns=names)

    df = pd.read_csv(full_filename, index_col=None, skiprows=n_header, sep='\t', header=None, names=names)

    return df.drop_duplicates()

In [124]:
def to_strand_int(strand):
    """
    Convert a strand label into its integer form

    :param strand: the label to convert; 'forward', '+', 1 and 'FWD' map to the forward strand,
        'reverse', '-', -1 and 'REV' map to the reverse strand
    :return Optional[int] strand_int: 1 for a forward label, -1 for a reverse label, None for anything else
    """
    labels_by_strand = (
        (1, ('forward', '+', 1, 'FWD')),
        (-1, ('reverse', '-', -1, 'REV')),
    )
    for strand_int, labels in labels_by_strand:
        if strand in labels:
            return strand_int
    return None

In [126]:
# Regulatory phrases, reduced to the ID, phrase text and function columns
reg_phrase_df = read_regulon_db_file('reg_phrase.txt')
reg_phrase_df = reg_phrase_df.loc[:, ['REG_PHRASE_ID', 'PHRASE', 'REG_PHRASE_FUNCTION']]
# Link table read from reg_phrase_ri_link.txt; joined to the phrases on REG_PHRASE_ID below
reg_phrase_link_df = read_regulon_db_file('reg_phrase_ri_link.txt')

# Left joins: every phrase keeps its row, then every TFBS row keeps its row
reg_phrase_merged_df = pd.merge(
    reg_phrase_df,
    reg_phrase_link_df,
    on='REG_PHRASE_ID',
    how='left',
)
tfbs_reg_phrase = pd.merge(
    mg1655.tfbs_table,
    reg_phrase_merged_df,
    on='REGULATORY_INTERACTION_ID',
    how='left',
)
tfbs_reg_phrase.head()

Unnamed: 0,left,right,mode,REGULATORY_INTERACTION_ID,PROMOTER_ID,final_state,tf,strand,REG_PHRASE_ID,PHRASE,REG_PHRASE_FUNCTION,TYPE
0,612650,612668,repressor,ECK120033853,ECK120010221,Fur-Fe<SUP>2+</SUP>,Fur,,ECK12T292271,"[Fur,-]",repressor,proximal
1,612650,612668,repressor,ECK120033883,ECK120010299,Fur-Fe<SUP>2+</SUP>,Fur,,ECK12T292271,"[Fur,-]",repressor,proximal
2,4516744,4516762,repressor,ECK120033831,ECK120010548,Fur-Fe<SUP>2+</SUP>,Fur,,ECK12T292271,"[Fur,-]",repressor,proximal
3,1619005,1619019,activator,ECK120034760,ECK120010245,Fis,Fis,,ECK12T292285,"[Fis,+]",activator,proximal
4,3305958,3305975,activator,ECK120031815,ECK120011032,TyrR-tyrosine,TyrR,,ECK12T292433,"[TyrR,+]",activator,remote


In [142]:
# Promoter ID fixed by hand for this example
prom_id = 'ECK120010122'

# TFBS/phrase rows attached to that promoter
phrase_for_gal = tfbs_reg_phrase.loc[tfbs_reg_phrase['PROMOTER_ID'] == prom_id]

# Print one phrase per regulatory phrase ID, taken from the first row of each group
for phrase, phrase_df in phrase_for_gal.groupby('REG_PHRASE_ID'):
    print(phrase_df['PHRASE'].iloc[0])

# Displays the group left bound by the final loop iteration
phrase_df

[GalR,-]
[GalS,-]
[CRP,+]
[H-NS,-]
[HU,-]


Unnamed: 0,left,right,mode,REGULATORY_INTERACTION_ID,PROMOTER_ID,final_state,tf,strand,REG_PHRASE_ID,PHRASE,REG_PHRASE_FUNCTION,TYPE
1630,792059,792092,repressor,ECK120030995,ECK120010122,HU,HU,,ECK12T292364,"[HU,-]",repressor,proximal


# Promoter Sweep Variables

## Motifs

Only a single sigma factor motif is shown here, but the same search works for any motif that has a PSSM.

In [101]:
# Build the sigma factor motifs; the result is indexed by sigma factor name and then by
# promoter box (e.g. ['Sigma70']['-10']), and each entry exposes a `.pssm`
sigma_motif_dict = mg1655.create_sigma_motifs()

In [102]:
# Search around the example TU's TSS with the Sigma70 -10 PSSM and order the hits by their
# 'right' coordinate (n_up / n_down presumably bound the searched window -- confirm)
mg1655.promoter_motif_search(
    tu_row.tss,
    tu_strand,
    sigma_motif_dict['Sigma70']['-10'].pssm,
    n_up=200,
    n_down=0,
    n_best_matches=200,
).sort_values(by='right')

Unnamed: 0,match_sequence,log_odds,left,right
147,ATTTCA,-7.88548,792081.0,792086.0
16,TATTTC,1.290253,792082.0,792087.0
56,TTATTT,-2.124794,792083.0,792088.0
79,GTTATT,-4.060738,792084.0,792089.0
61,GGTTAT,-2.798393,792085.0,792090.0
...,...,...,...,...
155,CCTCTC,-8.610154,792272.0,792277.0
95,CCCTCT,-4.975958,792273.0,792278.0
191,ACCCTC,-10.79639,792274.0,792279.0
24,CACCCT,0.199219,792275.0,792280.0


## Shape

In [103]:
# Shape lookup table; the first CSV column becomes the index, which get_dna_shape
# queries with 5-mer sequences
fivemer_shape_df = pd.read_csv('../data/5mer_shape_lookup.csv', index_col=0)

In [104]:
def get_dna_shape(seq):
    """
    Given a sequence, look up all 14 shape params for each of its 5-mers

    :param seq: the sequence to slide a 5-position window across, one window per start position
    :return pd.DataFrame seq_fivemer_shape_df: the rows of fivemer_shape_df for each window, in
        sequence order; empty when the sequence is shorter than 5
    """
    window = 5
    n_windows = len(seq) - window + 1
    fivemers = [seq[start:start + window] for start in range(n_windows)]
    return fivemer_shape_df.loc[fivemers]

In [105]:
# Shape parameters for the sequence fetched between (TSS - 200) and the TSS of the example TU
# NOTE(review): the third get_sequence argument (presumably the strand) is hard-coded to -1
# rather than taken from the TU -- confirm this is the intended window and strand
get_dna_shape(str(mg1655.get_sequence(tu_row.tss - 200, tu_row.tss, -1)))

Unnamed: 0,Opening,Rise,Stretch,EP,Tilt,Shear,ProT,Buckle,Shift,HelT,Stagger,Slide,MGW,Roll
ATACC,-0.57,3.340,-0.03,-7.03,-0.320,0.11,-8.36,3.27,0.130,34.625,0.00,-1.495,5.46,1.400
TACCA,0.27,3.345,-0.04,-5.40,-0.610,0.27,-5.96,2.66,0.045,34.340,0.07,-1.600,5.13,-1.565
ACCAT,0.26,3.465,-0.05,-5.41,0.500,0.27,-4.33,0.80,-0.210,34.715,0.06,-1.660,4.97,1.410
CCATA,-0.60,3.245,-0.02,-6.63,0.270,0.09,-6.69,-2.86,-0.185,33.115,-0.07,-1.420,5.88,0.200
CATAA,-1.00,3.310,-0.02,-6.88,0.390,-0.10,-8.06,4.86,-0.040,33.010,-0.01,-1.390,6.02,2.320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AAGGC,0.28,3.450,-0.03,-5.64,-0.505,-0.28,-1.50,3.50,0.200,33.455,0.14,-1.845,4.51,-2.685
AGGCG,0.40,3.420,-0.04,-5.21,0.580,-0.30,-2.82,0.23,0.015,35.325,0.02,-1.725,4.77,-2.130
GGCGA,0.42,3.370,-0.03,-5.06,0.165,0.25,-2.99,0.17,-0.070,34.400,0.05,-1.580,5.40,0.810
GCGAT,0.44,3.370,-0.03,-5.24,0.485,-0.26,-6.96,-2.84,-0.130,34.155,-0.08,-1.555,5.28,1.745
