# Import Packages and Construct Bitome

In [65]:
from itertools import product
import os
from pathlib import Path
import sys

import numpy as np
import pandas as pd

sys.path.append('../../bitome2')

from bitome.core import Bitome

In [2]:
# directory holding the MG1655 GenBank record and the annotation tables used to build the Bitome
K12_DATA_PATH = Path('../data/bitome2/mg1655')

In [3]:
# replication origin, taken from the GenBank annotation
# NOTE(review): presumably a (left, right) pair of genome coordinates - confirm
origin = (3925743, 3925975)
# terminus, from Duggin, I. G. & Bell, S. D., J. Mol. Biol. (2009), using the following
# terA/terC sequences:
# ter_a = 'AATTAGTATGTTGTAACTAAAGT'
# ter_c = 'ATATAGGATGTTGTAACTAATAT'
# NOTE(review): presumably the pair spans from terA to terC - confirm
terminus = (1341745, 1609180)

In [4]:
# assemble the MG1655 Bitome from the GenBank record plus the per-feature annotation tables,
# all of which live under K12_DATA_PATH
mg1655 = Bitome(
    K12_DATA_PATH / 'NC_000913.3.gb',
    name='MG1655',
    origin=origin,
    terminus=terminus,
    gene_table=K12_DATA_PATH / 'gene_info_supp.csv',
    tu_table=K12_DATA_PATH / 'tu.csv',
    operon_table=K12_DATA_PATH / 'operon.csv',
    tss_table=K12_DATA_PATH / 'tss.csv',
    # TODO: parse TTS data; no TTS table is supplied until then
    tts_table=None,
    tfbs_table=K12_DATA_PATH / 'tfbs.csv',
    terminator_table=K12_DATA_PATH / 'terminator.csv',
    attenuator_table=K12_DATA_PATH / 'attenuator.csv',
    rbs_table=K12_DATA_PATH / 'rbs.csv',
    riboswitch_table=K12_DATA_PATH / 'riboswitch.csv',
)

# Shape Processing Example

In [5]:
# toy feature table: four nested intervals on strand 1, all centred on position 105,
# labelled 'a' (narrowest) through 'd' (widest)
feature_table = pd.DataFrame(
    {
        'left': [100, 90, 80, 70],
        'right': [110, 120, 130, 140],
        'strand': [1] * 4,
    },
    index=['a', 'b', 'c', 'd'],
)

# shape_res is indexed by feature label in the cells that follow
shape_res = mg1655.get_dna_shape(feature_table)

In [6]:
# display six of the shape columns for feature 'a'
shape_res['a'][['Buckle', 'Opening', 'ProT', 'Shear', 'Stagger', 'Stretch']]

Unnamed: 0,Buckle,Opening,ProT,Shear,Stagger,Stretch
100,-0.87,-1.44,-12.2,-0.16,-0.08,-0.03
101,3.61,-1.05,-11.32,-0.1,-0.23,-0.02
102,0.32,-1.68,-10.7,0.12,-0.16,0.0
103,7.24,-2.26,-13.29,0.15,-0.11,-0.02
104,7.97,-2.05,-14.89,0.17,-0.1,-0.01
105,3.92,-0.67,-14.95,0.18,-0.1,-0.01
106,-3.92,-0.67,-14.95,-0.18,-0.1,-0.01
107,-7.97,-2.05,-14.89,-0.17,-0.1,-0.01
108,-7.24,-2.26,-13.29,-0.15,-0.11,-0.02
109,-0.84,-1.11,-12.37,-0.08,-0.25,-0.02


In [7]:
# display the same six columns for feature 'b', restricted to the index labels 100 through 110
# so the rows can be compared against feature 'a' above
shape_res['b'].loc[100:110][['Buckle', 'Opening', 'ProT', 'Shear', 'Stagger', 'Stretch']]

Unnamed: 0,Buckle,Opening,ProT,Shear,Stagger,Stretch
100,-0.87,-1.44,-12.2,-0.16,-0.08,-0.03
101,3.61,-1.05,-11.32,-0.1,-0.23,-0.02
102,0.32,-1.68,-10.7,0.12,-0.16,0.0
103,7.24,-2.26,-13.29,0.15,-0.11,-0.02
104,7.97,-2.05,-14.89,0.17,-0.1,-0.01
105,3.92,-0.67,-14.95,0.18,-0.1,-0.01
106,-3.92,-0.67,-14.95,-0.18,-0.1,-0.01
107,-7.97,-2.05,-14.89,-0.17,-0.1,-0.01
108,-7.24,-2.26,-13.29,-0.15,-0.11,-0.02
109,-0.84,-1.11,-12.37,-0.08,-0.25,-0.02


# Create All 5-mers

In [5]:
# enumerate all 4**5 = 1024 DNA 5-mers, in the order itertools.product yields them
# (AAAAA, AAAAC, ..., TTTTT)
fivemers = list(map(''.join, product('ACGT', repeat=5)))

# one row per 5-mer, labelled by the 5-mer itself; every row gets left=1, right=5, strand=1
fivemer_df = pd.DataFrame(
    {'seq': fivemers, 'left': 1, 'right': 5, 'strand': 1},
    index=fivemers,
)
fivemer_df.head()

Unnamed: 0,seq,left,right,strand
AAAAA,AAAAA,1,5,1
AAAAC,AAAAC,1,5,1
AAAAG,AAAAG,1,5,1
AAAAT,AAAAT,1,5,1
AAACA,AAACA,1,5,1


# Call Special get_dna_shape

In [6]:
# run the shape calculation on the 5-mer table; the result is not used here - the raw output
# files are read back by hand in the next section
# NOTE(review): seq_column / parse_result semantics are inferred from their names - confirm
mg1655.get_dna_shape(fivemer_df, seq_column=True, parse_result=False)

# Parse Output Files

In [66]:
shape_output_dir = Path('../../bitome2/bitome/shape_work_dir/shape_tmp')


def _parse_shape_file(shape_path):
    """
    Read one FASTA-style shape output file into a {record name: [float, ...]} dict

    A record starts with a '>' header line. The comma-separated values that follow may be
    wrapped over several lines, so values are accumulated until the next header. 'NA'
    placeholders and blank lines are ignored.

    :param shape_path: path of the shape output file to read
    :return dict: record name -> list of the record's numeric values, in file order
    :raises ValueError: if values appear before the first '>' header
    """
    record_values = {}
    current_values = None
    with open(shape_path, 'r') as shape_result_file:
        for raw_line in shape_result_file:
            line_strip = raw_line.strip()
            if not line_strip:
                continue
            if line_strip.startswith('>'):
                # a repeated header replaces the earlier record
                current_values = []
                record_values[line_strip[1:]] = current_values
            elif current_values is None:
                raise ValueError(f'{shape_path}: shape values found before any ">" header')
            else:
                # extend rather than overwrite so wrapped continuation lines are kept
                current_values.extend(
                    float(token)
                    for token in line_strip.split(',')
                    if token.strip() not in ('', 'NA')
                )
    return record_values


shape_to_fivemer_lookup = {}

for shape_filename in os.listdir(shape_output_dir):
    # skip if it's just the initial input
    if shape_filename.endswith('fa'):
        continue

    # the shape name is the final dot-separated piece of the file name
    shape_name = shape_filename.split('.')[-1]
    shape_records = _parse_shape_file(Path(shape_output_dir, shape_filename))

    # taking the mean here converts from gap to central bp; a record with no numeric
    # values at all becomes NaN
    shape_to_fivemer_lookup[shape_name] = {
        fivemer: (np.mean(values) if values else np.nan)
        for fivemer, values in shape_records.items()
    }

# create a final DataFrame to store our lookup: one row per 5-mer (in `fivemers` order), one
# float column per shape; building it in one step avoids thousands of single-cell
# assignments and an object-dtype table
fivemer_shape_df = pd.DataFrame(
    {
        shape: pd.Series(specific_shape_dict, dtype=float)
        for shape, specific_shape_dict in shape_to_fivemer_lookup.items()
    },
    index=fivemers
)

In [67]:
# display the finished lookup: rows are 5-mers, columns are shape names
fivemer_shape_df

Unnamed: 0,Opening,Rise,Stretch,EP,Tilt,Shear,ProT,Buckle,Shift,HelT,Stagger,Slide,MGW,Roll
AAAAA,-3.71,3.25,0.03,-10.1,-0.97,0.18,-16.51,7.8,-0.14,37.875,-0.22,-0.98,3.38,-5.07
AAAAC,-1.9,3.2,-0.01,-8.97,-1.405,0.15,-14.47,9.07,0.01,37.04,-0.06,-1.06,4.05,-4.21
AAAAG,-2.3,3.235,0.01,-10.39,-1.475,0.17,-14.68,7.39,0.045,37.1,-0.13,-1.14,3.68,-5.35
AAAAT,-2.05,3.275,-0.01,-10.8,-1.28,0.17,-14.89,7.97,0.03,37.305,-0.1,-1.15,3.63,-4.34
AAACA,-1.3,3.28,-0.01,-8.05,-0.96,0.15,-13.05,11.53,0.25,35.905,0.13,-1.3,4.65,-2.935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGT,-1.7,3.3,-0.03,-7.86,-0.225,-0.09,-12.03,-2.71,0.315,35.59,-0.25,-1.095,5.21,1.765
TTTTA,-2.26,3.175,-0.02,-8.39,0.855,-0.15,-13.29,-7.24,0.18,36.16,-0.11,-1.05,4.89,-2.605
TTTTC,-2.28,3.2,-0.01,-9.65,1.08,-0.16,-13.16,-6,0.21,36.66,-0.16,-1.035,4.35,-3.995
TTTTG,-2.39,3.19,-0.03,-8.76,1.27,-0.15,-13.79,-7.51,0.12,36.215,-0.11,-1.04,4.76,-2.475


In [88]:
# save the 5-mer lookup table as CSV (the 5-mer index is written as the first column)
fivemer_shape_df.to_csv('../data/5mer_shape_lookup.csv')

# Shape Function

In [70]:
def get_dna_shape(seq, shape_lookup=None):
    """
    Given a sequence, look up the shape params of each overlapping 5-mer window

    :param str seq: DNA sequence; upper- or lower-case letters are accepted
    :param pd.DataFrame shape_lookup: table indexed by upper-case 5-mer with one column per
        shape param; defaults to the notebook-level fivemer_shape_df
    :return pd.DataFrame: one row per 5-mer window, in sequence order (len(seq) - 4 rows),
        with the same columns as the lookup; empty if seq is shorter than 5
    :raises KeyError: from the .loc lookup if a window is missing from the lookup's index
    """
    if shape_lookup is None:
        shape_lookup = fivemer_shape_df

    # the lookup is keyed by upper-case 5-mers, so normalise the case first
    seq = seq.upper()
    seq_fivemers = [seq[i:i+5] for i in range(len(seq) - 5 + 1)]

    # too short for even one window: return an empty frame that keeps the lookup's columns
    if not seq_fivemers:
        return shape_lookup.iloc[0:0]

    return shape_lookup.loc[seq_fivemers]

# Testing

In [79]:
test_seqs = ['ATCGCGTAAGCTCG', 'TATATGCTCGATA', 'TATCTTTCTCTAGCTAG', 'TCGTAGGGCTTAGC']
# NOTE(review): 'right': 5 is carried over from the 5-mer table although these sequences are
# longer; presumably left/right are ignored when seq_column=True - confirm
test_df = pd.DataFrame(
    data={'seq': test_seqs, 'left': 1, 'right': 5, 'strand': 1},
    index=test_seqs
)

mg1655.get_dna_shape(test_df, seq_column=True, parse_result=False)

shape_output_dir = Path('../../bitome2/bitome/shape_work_dir/shape_tmp')


def _parse_shape_file(shape_path):
    """
    Read one FASTA-style shape output file into a {record name: [float, ...]} dict

    A record starts with a '>' header line. The comma-separated values that follow may be
    wrapped over several lines, so values are accumulated until the next header. 'NA'
    placeholders and blank lines are ignored.

    :param shape_path: path of the shape output file to read
    :return dict: record name -> list of the record's numeric values, in file order
    :raises ValueError: if values appear before the first '>' header
    """
    record_values = {}
    current_values = None
    with open(shape_path, 'r') as shape_result_file:
        for raw_line in shape_result_file:
            line_strip = raw_line.strip()
            if not line_strip:
                continue
            if line_strip.startswith('>'):
                # a repeated header replaces the earlier record
                current_values = []
                record_values[line_strip[1:]] = current_values
            elif current_values is None:
                raise ValueError(f'{shape_path}: shape values found before any ">" header')
            else:
                # extend rather than overwrite so wrapped continuation lines are kept
                current_values.extend(
                    float(token)
                    for token in line_strip.split(',')
                    if token.strip() not in ('', 'NA')
                )
    return record_values


shape_to_test_lookup = {}

for shape_filename in os.listdir(shape_output_dir):
    # skip if it's just the initial input
    if shape_filename.endswith('fa'):
        continue

    # the shape name is the final dot-separated piece of the file name
    shape_name = shape_filename.split('.')[-1]

    # unlike the 5-mer lookup, keep the full per-position value list for each sequence (no
    # averaging) so it can be compared position by position with the 5-mer-based function
    shape_to_test_lookup[shape_name] = _parse_shape_file(Path(shape_output_dir, shape_filename))

# create a final DataFrame to store the results: one row per test sequence, one column per
# shape, each cell holding that sequence's list of values; object Series are used so the
# lists are stored whole instead of being assigned cell by cell through .loc
test_shape_df = pd.DataFrame(
    {
        shape: pd.Series(specific_shape_dict, dtype=object)
        for shape, specific_shape_dict in shape_to_test_lookup.items()
    },
    index=test_seqs
)

In [87]:
# display the per-shape value lists parsed from the output files for the first test sequence
test_shape_df.iloc[0]

Opening    [0.44, 0.4, 0.46, 0.56, -0.99, -1.14, -1.23, 0...
Rise       [3.32, 3.4, 3.4, 3.45, 3.29, 3.3, 3.1, 3.41, 3...
Stretch    [-0.03, -0.03, -0.03, -0.03, -0.03, -0.02, -0....
EP         [-5.24, -4.75, -4.86, -5.03, -6.77, -7.23, -7....
Tilt       [-0.45, -0.36, 0.08, 0.24, 0.1, 1.35, -1.73, -...
Shear      [0.26, -0.25, 0.27, -0.27, -0.11, 0.1, 0.14, -...
ProT       [-6.96, -4.27, -4.73, -7.22, -7.71, -10.31, -8...
Buckle     [2.84, -1.22, 1.59, -4.59, -0.6, -0.91, 6.15, ...
Shift      [0.21, 0.07, 0.0, -0.04, -0.23, -0.11, -0.06, ...
HelT       [35.82, 32.57, 35.78, 33.17, 33.75, 34.3, 34.8...
Stagger    [-0.08, 0.03, 0.0, 0.0, 0.1, -0.17, 0.05, 0.12...
Slide      [-1.49, -1.58, -1.56, -1.64, -1.48, -1.29, -1....
MGW        [5.28, 5.42, 5.2, 5.32, 6.11, 5.62, 5.01, 4.14...
Roll       [-0.66, 4.33, -1.36, 4.41, -2.01, 6.12, -3.06,...
Name: ATCGCGTAAGCTCG, dtype: object

In [86]:
# look up shapes for every test sequence through the 5-mer table, then show the result for
# the first sequence so it can be compared with the values parsed from the output files
test_shape_dfs = list(map(get_dna_shape, test_seqs))
test_shape_dfs[0]

Unnamed: 0,Opening,Rise,Stretch,EP,Tilt,Shear,ProT,Buckle,Shift,HelT,Stagger,Slide,MGW,Roll
ATCGC,0.44,3.37,-0.03,-5.24,-0.485,0.26,-6.96,2.84,0.13,34.155,-0.08,-1.555,5.28,1.745
TCGCG,0.4,3.385,-0.03,-4.75,-0.225,-0.25,-4.27,-1.22,0.035,34.155,0.03,-1.555,5.42,1.585
CGCGT,0.46,3.435,-0.03,-4.86,0.37,0.27,-4.73,1.59,0.0,34.55,0.0,-1.605,5.2,1.56
GCGTA,0.56,3.365,-0.03,-5.03,0.175,-0.27,-7.22,-4.59,-0.135,33.535,0.0,-1.555,5.32,1.12
CGTAA,-0.99,3.29,-0.03,-6.77,0.705,-0.11,-7.71,-0.6,-0.17,33.92,0.1,-1.365,6.11,2.2
GTAAG,-1.14,3.21,-0.02,-7.23,-0.175,0.1,-10.31,-0.91,-0.07,34.605,-0.17,-1.295,5.62,1.33
TAAGC,-1.23,3.245,-0.03,-7.49,-1.895,0.14,-8.92,6.15,0.135,33.51,0.05,-1.515,5.01,-2.715
AAGCT,0.37,3.5,-0.02,-6.03,-0.92,-0.25,-1.56,3.03,0.175,35.095,0.12,-1.815,4.14,-3.795
AGCTC,0.35,3.43,-0.02,-5.79,1.0,0.25,-0.83,-2.49,-0.115,34.495,0.12,-1.78,4.63,-3.055
GCTCG,-0.64,3.265,-0.03,-7.17,0.985,-0.1,-6.78,-1.11,-0.04,33.36,-0.04,-1.6,5.2,-1.72
