# Processing DNAShape Features for Bitome

## Setup

In [4]:
from pathlib import Path

import pandas as pd

In [5]:
seq_beginning = 'AGCTTTTCATTCTGACTGCA'
seq_end = 'CGCCTTAGTAAGTATTTTTC'

## Minor Groove Width

In [6]:
mgw = pd.read_csv(Path('ecolik12-MGW.txt'), sep='\t', skiprows=1, usecols=[1, 2, 3], names=['start', 'end', 'MGW'])
mgw

Unnamed: 0,start,end,MGW
0,2,3,4.14
1,3,4,4.03
2,4,5,3.68
3,5,6,4.35
4,6,7,5.40
...,...,...,...
4641643,4641645,4641646,5.30
4641644,4641646,4641647,3.79
4641645,4641647,4641648,3.63
4641646,4641648,4641649,3.38


Based on web server test, this feature is calculated for each actual base, and the first and last two are ommitted; so the second base position is correct

In [7]:
mgw_series = mgw.drop(columns='start').rename(columns={'end': 'base'}).set_index('base').squeeze()
mgw_series

base
3          4.14
4          4.03
5          3.68
6          4.35
7          5.40
           ... 
4641646    5.30
4641647    3.79
4641648    3.63
4641649    3.38
4641650    4.35
Name: MGW, Length: 4641648, dtype: float64

## ORChlD2

In [8]:
orchid2 = pd.read_csv(Path('ecolik12-ORChlD2.txt'), sep='\t', skiprows=1, usecols=[1, 2, 3], names=['start', 'end', 'ORChLD'])
orchid2

Unnamed: 0,start,end,ORChLD
0,0,1,-0.308000
1,1,2,0.737000
2,2,3,0.596667
3,3,4,-0.108625
4,4,5,0.374125
...,...,...,...
4641647,4641647,4641648,-0.325125
4641648,4641648,4641649,-0.115750
4641649,4641649,4641650,-0.612833
4641650,4641650,4641651,-0.588500


So this is one number per base, and has all bases; but the rest don't have the first and last two; so drop here too

In [45]:
orchid2_series = orchid2.drop(columns='start').rename(columns={'end': 'base'}).set_index('base').squeeze()
orchid2_series = orchid2_series.iloc[2:-2]
orchid2_series

base
3          0.596667
4         -0.108625
5          0.374125
6          0.260833
7         -0.210375
             ...   
4641646   -0.214500
4641647    0.485500
4641648   -0.325125
4641649   -0.115750
4641650   -0.612833
Name: ORChLD, Length: 4641648, dtype: float64

## Helical Twist

In [10]:
helt = pd.read_csv(Path('ecolik12-HelT.txt'), sep='\t', skiprows=1, usecols=[1, 2, 3], names=['start', 'end', 'HelT'])
helt

Unnamed: 0,start,end,HelT
0,1,2,38.05
1,2,3,32.48
2,3,4,36.87
3,4,5,37.01
4,5,6,36.11
...,...,...,...
4641644,4641645,4641646,33.31
4641645,4641646,4641647,37.45
4641646,4641647,4641648,37.47
4641647,4641648,4641649,37.37


This is actually applicable to the gap between bases, not the specific base pairs; also, using the web server, I determined that the base "pairs" at each end of the sequence are excluded, so the "end" column is actually the start

In [39]:
helt_series = helt.drop(columns='start').rename(columns={'end': 'base'}).set_index('base').squeeze()
helt_series

base
2          38.05
3          32.48
4          36.87
5          37.01
6          36.11
           ...  
4641646    33.31
4641647    37.45
4641648    37.47
4641649    37.37
4641650    36.32
Name: HelT, Length: 4641649, dtype: float64

We also want this to just be associated with each base; average between the two flanking values to get this:
NOTE: the label "base" for the index is wrong, it's actually START of pair; just doing this for later

In [49]:
helt_series_interp = pd.Series(
    (helt_series.iloc[:-1].values + helt_series.iloc[1:].values) / 2,
    index=helt_series.index[1:]
).rename('HelT')
helt_series_interp

base
3          35.265
4          34.675
5          36.940
6          36.560
7          35.625
            ...  
4641646    33.960
4641647    35.380
4641648    37.460
4641649    37.420
4641650    36.845
Name: HelT, Length: 4641648, dtype: float64

## Propeller Twist

In [32]:
prot = pd.read_csv(Path('ecolik12-ProT.txt'), sep='\t', skiprows=1, usecols=[1, 2, 3], names=['start', 'end', 'ProT'])
prot

Unnamed: 0,start,end,ProT
0,2,3,-1.56
1,3,4,-10.58
2,4,5,-14.68
3,5,6,-13.16
4,6,7,-10.81
...,...,...,...
4641643,4641645,4641646,-10.45
4641644,4641646,4641647,-12.63
4641645,4641647,4641648,-14.89
4641646,4641648,4641649,-16.51


Same as MGW, one value per base, but skips the first and last 2; so end is actually base

In [33]:
prot_series = prot.drop(columns='start').rename(columns={'end': 'base'}).set_index('base').squeeze()
prot_series

base
3          -1.56
4         -10.58
5         -14.68
6         -13.16
7         -10.81
           ...  
4641646   -10.45
4641647   -12.63
4641648   -14.89
4641649   -16.51
4641650   -13.16
Name: ProT, Length: 4641648, dtype: float64

## Roll

In [34]:
roll = pd.read_csv(Path('ecolik12-Roll.txt'), sep='\t', skiprows=1, usecols=[1, 2, 3], names=['start', 'end', 'Roll'])
roll

Unnamed: 0,start,end,Roll
0,1,2,-4.35
1,2,3,-3.60
2,3,4,-5.77
3,4,5,-4.28
4,5,6,-2.65
...,...,...,...
4641644,4641645,4641646,-5.34
4641645,4641646,4641647,-4.52
4641646,4641647,4641648,-4.33
4641647,4641648,4641649,-4.69


Same as HelT, is based on 2 adjacent bases, and the first and last are omitted; same caveat re: name of index as HelT

In [41]:
roll_series = roll.drop(columns='start').rename(columns={'end': 'base'}).set_index('base').squeeze()
roll_series

base
2         -4.35
3         -3.60
4         -5.77
5         -4.28
6         -2.65
           ... 
4641646   -5.34
4641647   -4.52
4641648   -4.33
4641649   -4.69
4641650   -3.67
Name: Roll, Length: 4641649, dtype: float64

We also want this to just be associated with each base; average between the two flanking values to get this:

In [48]:
roll_series_interp = pd.Series(
    (roll_series.iloc[:-1].values + roll_series.iloc[1:].values) / 2,
    index=roll_series.index[1:]
).rename('Roll')
roll_series_interp

base
3         -3.975
4         -4.685
5         -5.025
6         -3.465
7         -1.190
           ...  
4641646    0.445
4641647   -4.930
4641648   -4.425
4641649   -4.510
4641650   -4.180
Name: Roll, Length: 4641648, dtype: float64

## Write to H5

In [50]:
keys = ['MGW', 'ORChLD2', 'HelT', 'ProT', 'Roll']
series = [mgw_series, orchid2_series, helt_series_interp, prot_series, roll_series_interp]

for key, series in zip(keys, series):
    series.to_hdf(f'~/Projects/bitome-kb/data/shape/{key}.h5', key, format='table', complevel=9)

In [51]:
helt_test = pd.read_hdf('shape_HelT.h5')
helt_test

FileNotFoundError: File shape_HelT.h5 does not exist