# Gene Expression Prediction with Keras

## Import and Set Up Data

In [1]:
import sys

from Bio.Align import AlignInfo, MultipleSeqAlignment
import Bio.motifs as motifs
from Bio.SeqFeature import FeatureLocation
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_addons as tfa

sys.path.append('../bitome-kb')
from bitome.core import Bitome

TensorFlow Addons offers no support for the nightly versions of TensorFlow. Some things might work, some other might not. 
If you encounter a bug, do not file an issue on GitHub.


In [2]:
# Load the pre-built Bitome knowledgebase object from disk.
# NOTE(review): the file is a pickle; unpickling executes arbitrary code, so only
# load a bitome.pkl that comes from a trusted source.
bitome = Bitome.init_from_file('../bitome-kb/bitome.pkl')

# Expression table; the first CSV column becomes the row index.
log_tpm_df = pd.read_csv('data/log_tpm_qc.csv', index_col=0)

# Per-row mean over the first two columns only
# (presumably the M9 reference samples - TODO confirm against the CSV header).
log_tpm_df_m9 = log_tpm_df.iloc[:, :2].mean(axis='columns')

# Per-row median over every column; this is the regression target used below.
median_expression = log_tpm_df.median(axis='columns')

In [3]:
# Index the genes by locus tag once instead of rescanning bitome.genes for every
# expression row. setdefault keeps the FIRST gene seen for a tag, matching the
# previous "take genes[0]" behaviour.
genes_by_locus_tag = {}
for gene in bitome.genes:
    genes_by_locus_tag.setdefault(gene.locus_tag, gene)

# Parallel lists: entry i of each list describes the same gene.
locus_tags = []
tu_tups = []
expressions = []

# Series.items() replaces Series.iteritems(), which was deprecated in pandas 1.5
# and removed in pandas 2.0.
for locus_tag, expression in median_expression.items():

    gene = genes_by_locus_tag.get(locus_tag)
    if gene is None:
        # expression row with no matching gene in the knowledgebase
        continue

    gene_tus = list(gene.transcription_units)
    if not gene_tus:
        # a gene without any transcription unit gives us no promoter to model
        continue

    locus_tags.append(locus_tag)
    tu_tups.append(gene_tus)
    # use the value yielded by the iteration; re-indexing the Series by label
    # would return a Series (not a scalar) if the index held duplicate tags
    expressions.append(expression)

In [4]:
# Window extracted around each transcription start site (TSS):
# N_UP bases upstream, the TSS itself, and N_DOWN bases downstream.
N_UP = 200
N_DOWN = 50
WINDOW_LEN = N_UP + N_DOWN + 1

# A TSS must lie beyond this coordinate to be used, so the window cannot start
# before the beginning of the sequence. Derived from the window constants rather
# than hard-coded; with the values above it equals the previous literal 250.
MIN_TSS = N_UP + N_DOWN

# Column order of the per-position count matrix.
BASES = 'ATCG'

consensus_mats = []
expressions_to_use = []

for tus, expression in zip(tu_tups, expressions):

    # extract the sequence around the TSS of every transcription unit of this gene
    sequences = []
    for tu in tus:

        strand = tu.location.strand
        on_forward_strand = strand == 1

        # prefer the annotated TSS; otherwise fall back to the strand-appropriate
        # end of the transcription unit (start on +, end on -)
        if tu.tss is not None:
            tss = tu.tss
        elif on_forward_strand:
            tss = tu.location.start.position
        else:
            tss = tu.location.end.position

        # check the bound BEFORE building the location so that a window with a
        # negative start coordinate is never constructed
        # NOTE(review): there is no matching check against the end of the sequence;
        # a TSS within the window length of the sequence end would yield a
        # truncated window - confirm this cannot occur for this genome.
        if not tss > MIN_TSS:
            continue

        # "upstream" is to the left on the + strand and to the right on the - strand
        if on_forward_strand:
            left_pad, right_pad = N_UP, N_DOWN
        else:
            left_pad, right_pad = N_DOWN, N_UP

        # the -1 on the start shifts the left edge one extra base so the window
        # holds WINDOW_LEN bases; extract() applies the strand given here
        seq_loc = FeatureLocation(
            int(tss - left_pad - 1),
            int(tss + right_pad),
            int(strand)
        )
        sequences.append(seq_loc.extract(bitome.sequence))

    if not sequences:
        # no usable TSS for this gene: drop it from both X and y
        continue
    expressions_to_use.append(expression)

    # build a Biopython motif from this gene's windows and take its per-position
    # base counts (one count list per base)
    motif = motifs.create(sequences)
    base_counts = motif.counts

    # rows = window positions (-N_UP .. +N_DOWN), columns = bases in BASES order
    sequence_mat = np.array([base_counts[base] for base in BASES]).T
    if sequence_mat.shape != (WINDOW_LEN, len(BASES)):
        raise ValueError(
            f'expected a count matrix of shape {(WINDOW_LEN, len(BASES))}, '
            f'got {sequence_mat.shape}'
        )
    consensus_mats.append(sequence_mat)

# X: (n_genes, WINDOW_LEN, 4) count matrices; y: matching median expression values
X = np.array(consensus_mats)
y = expressions_to_use

In [5]:
# Set aside 10% of the examples as a lockbox set that is not touched during
# model development.
X_traindev, X_lockbox, y_traindev, y_lockbox = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

# Split the remaining 90% again: 1/9 of it (10% of the full data) becomes the
# validation set. All four pieces are converted to TensorFlow tensors.
X_train, X_val, y_train, y_val = map(
    tf.convert_to_tensor,
    train_test_split(X_traindev, y_traindev, random_state=42, test_size=(1/9)),
)

## Define Model Architecture

In [15]:
# Seed TensorFlow's global random generator so that weight initialisation is
# repeatable from one run of the notebook to the next.
tf.random.set_seed(42)

# Two stacked bidirectional LSTMs over the promoter window, followed by a small
# dense head that regresses a single expression value.
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(32, return_sequences=True),
        # (positions, bases): derived from the window constants instead of a
        # hard-coded 251 so the model follows any change to N_UP / N_DOWN
        input_shape=(N_UP + N_DOWN + 1, 4),
    ),
    # the second LSTM returns only its final state, collapsing the sequence axis
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(10, activation='relu'),
    # linear output unit: unbounded regression target
    tf.keras.layers.Dense(1, activation='linear'),
])

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_5 (Bidirection (None, 251, 64)           9472      
_________________________________________________________________
bidirectional_6 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_8 (Dense)              (None, 10)                650       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 11        
Total params: 34,965
Trainable params: 34,965
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train on mean absolute error with Adam at its default settings; MAE and MSE are
# also tracked as metrics for both the training and validation data.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanAbsoluteError(),
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(),
        tf.keras.metrics.MeanSquaredError(),
    ],
)

In [17]:
# Fit for 10 epochs, evaluating on the validation split after every epoch;
# `hist` keeps the returned training history object.
hist = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [87]:
# Predicted expression for every example in the validation split.
y_pred = model.predict(x=X_val)