In [1]:
# Run parameters for this notebook.
# tf_name is used as the sub-directory of in_dir that holds the input file.
tf_name = 'BHLHE40'
# Input file name; the loading cell asserts that it ends with '.txt'.
file_name = 'Mus_musculus|M00251_1.94d|Badis09|Bhlhb2_1274.3=v2.txt'
# Root directory of the input data (joined with tf_name and file_name).
in_dir = 'data'
# NOTE(review): not referenced in the cells visible here; presumably selects
# the GPU device for a later training cell -- confirm.
gpu_id = 2

In [2]:
import os
import numpy as np
import pandas as pd
from dgutils.pandas import add_column, add_columns

In [3]:
# Only '.txt' input files are accepted.
assert file_name.endswith('.txt')

# The input lives at <in_dir>/<tf_name>/<file_name>.
_file_name = os.path.join(in_dir, tf_name, file_name)
print("Loading input data file: %s" % _file_name)
# Column names are supplied here, so the first line of the file is read as
# data. Columns are split on runs of whitespace; sep=r'\s+' is the documented
# equivalent of delim_whitespace=True, which newer pandas releases deprecate.
df = pd.read_csv(_file_name,
                 names=['intensity', 'sequence'], sep=r'\s+')

Loading input data file: data/BHLHE40/Mus_musculus|M00251_1.94d|Badis09|Bhlhb2_1274.3=v2.txt


In [4]:
def _process_seq(s):
    if 'GTCTGTGTTCCGTTGTCCGTGCTG' in s:
        return s.replace('GTCTGTGTTCCGTTGTCCGTGCTG', '')
    else:
        return None


# Sequences of length 60 still contain the primer and must be trimmed.
# Only the first row is inspected to decide this.
first_sequence = df.iloc[0]['sequence']
if len(first_sequence) == 60:
    df = add_column(df, 'sequence', ['sequence'], _process_seq)
    # _process_seq returns None when the primer is absent; count those rows,
    # then drop them.
    n_without_primer = df['sequence'].isna().sum()
    print("dropping %d rows" % n_without_primer)
    print("before %d" % len(df))
    df = df.dropna(subset=['sequence'])
    print("after %d" % len(df))

# Add the natural log of the intensity as a new column.
df = add_column(df, 'log_intensity', ['intensity'], lambda x: np.log(x))
# Turn +/-inf into NaN first, so that the dropna below removes every row
# whose log intensity is not a finite number.
df = df.replace([np.inf, -np.inf], np.nan)
print("Drop NaN's in log intensity, before: %d" % len(df))
df = df.dropna(subset=['log_intensity'])
print("after %d" % len(df))

# Keep only the rows whose sequence length equals the median length.
# NOTE(review): with an even number of rows whose two middle lengths differ,
# the median is fractional and no row would match -- confirm this cannot
# happen for the input files in use.
seq_lengths = df.sequence.str.len()
median_length = seq_lengths.median()
print("lengths: %s" % seq_lengths.unique())
print("median length: %d" % median_length)
print("Drop non-median length rows, before: %d" % len(df))
df = df[seq_lengths == median_length]
print("After: %d" % len(df))

Drop NaN's in log intensity, before: 41834
after 41721
lengths: [36]
median length: 36
Drop non-median length rows, before: 41721
After: 41721




In [5]:
# split training/validation + testing
# The mask is drawn from a dedicated, seeded generator so that the split is
# identical on every run of the notebook. Each row lands in the training set
# with probability 0.8, so the 80/20 proportion is approximate, not exact.
SPLIT_SEED = 0
_split_rng = np.random.RandomState(SPLIT_SEED)
train_mask = _split_rng.rand(len(df)) < 0.8
df_train = df[train_mask]
df_test = df[~train_mask]

In [6]:
# encode data
# One-hot lookup table indexed by base code:
# row 0 = N (all zeros), 1 = A, 2 = C, 3 = G, 4 = T.
IN_MAP = np.asarray([[0, 0, 0, 0],
                     [1, 0, 0, 0],
                     [0, 1, 0, 0],
                     [0, 0, 1, 0],
                     [0, 0, 0, 1]])
_data_input = []
_data_output = []
# Walk the two needed columns directly rather than using iterrows(), which
# builds a full Series object for every row.
for seq, _val in zip(df_train['sequence'], df_train['log_intensity']):
    # Rewrite each base as its digit code so it can index into IN_MAP.
    seq = seq.upper().replace('A', '1').replace('C', '2').replace('G', '3').replace('T', '4').replace('N', '0')
    # A list comprehension instead of a bare map(): on Python 3 map() returns
    # a lazy iterator, which np.asarray would wrap as a 0-d object array
    # instead of an array of codes.
    x = np.asarray([int(base) for base in seq], dtype='int8')
    _data_input.append(IN_MAP[x])
    _data_output.append([_val])

# X_train: (n_sequences, sequence_length, 4); Y_train: (n_sequences, 1).
X_train = np.stack(_data_input, axis=0)
Y_train = np.stack(_data_output, axis=0)

In [7]:
# Sanity check: show any infinite target values (an empty array means none).
Y_train[np.isinf(Y_train)]

array([], dtype=float64)