# RNN with OneHot tensor
Combine three previous achievements.
Still hold off on the stratified split.
This time use tensors instead of numpy arrays.

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
tf.keras.backend.set_floatx('float32')  # was 64, now we're running out of memory. make this 32


Load data from FASTA files.

In [2]:
MIN_SEQ_LEN=200
MAX_SEQ_LEN=25000
DEFLINE='>'
# Assume file was preprocessed to contain one line per seq.
# Returned structure is ndarray of ndarray i.e no python lists inside.
def load_fasta(filename):
    seqs=[]
    with open (filename,'r') as infile:
        for line in infile:
            if line[0]!=DEFLINE and len(line)>=MIN_SEQ_LEN and len(line)<=MAX_SEQ_LEN:
                line=line.rstrip()
                chars=np.array(list(line))
                seqs.append(chars.reshape(-1, 1)) # reshaped changes (any,) to (any,1)
    nparray=np.array(seqs)
    return nparray

ncfile='ncRNA.fasta' 
print("Load "+ncfile)
nc_seqs = load_fasta(ncfile)
pcfile='pcRNA.fasta' 
print("Load "+pcfile)
pc_seqs = load_fasta(pcfile)

Load ncRNA.fasta
Load pcRNA.fasta


Encode DNA letters with 4-bit one-hot encoding.

In [3]:
encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
print("Fit")
#seq=tf.reshape(nc_seqs[0],shape=(-1, 1)) # tensor flow version
seq=nc_seqs[0].reshape(-1, 1)
encoder.fit(seq)
encoder.categories_

Fit


[array(['A', 'C', 'G', 'T'], dtype='<U1')]

In [4]:
encoded=encoder.transform(seq) # returns a numpy.ndarray
print(type(encoded))
encoded

<class 'numpy.ndarray'>


array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

In [5]:
print("non-coding")
nc_list=[]
for seq in nc_seqs:
    encoded=encoder.transform(seq)  
    nc_list.append(encoded)
    
nc_all=np.array(nc_list).reshape(-1,1)
nc_all.shape

non-coding


(17711, 1)

In [6]:
print("protein-coding")
pc_list=[]
for seq in pc_seqs:
    encoded=encoder.transform(seq)
    pc_list.append(encoded)

pc_all=np.array(pc_list).reshape(-1,1)
pc_all.shape

protein-coding


(20152, 1)

Add labels. Create train and validation sets.

In [7]:
nc_labels=np.zeros(shape=(17711,1))
pc_labels=np.ones(shape=(20152,1))

#nc_labeled=np.concatenate((nc_all,nc_labels),axis=1)
#pc_labeled=np.concatenate((pc_all,pc_labels),axis=1)
all_seqs=np.concatenate((nc_all,pc_all),axis=0)
all_labels=np.concatenate((nc_labels,pc_labels),axis=0)

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit

splitter = ShuffleSplit(n_splits=1, test_size=0.2, random_state=37863)
for train_index,test_index in splitter.split(all_seqs):
    train_seqs =   all_seqs[train_index]
    train_labels = all_labels[train_index]
    test_seqs =    all_seqs[test_index]
    test_labels =  all_labels[test_index]
 
print("train")
print(train_seqs.shape,train_labels.shape)
print("test")
print(test_seqs.shape,test_labels.shape)

train
(30290, 1) (30290, 1)
test
(7573, 1) (7573, 1)


In [8]:
# Now that values are shuffled, partition gives random sample.
X_train=train_seqs[:25000]
X_valid=train_seqs[25000:]
y_train=train_labels[:25000]
y_valid=train_labels[25000:]
print("Training Xy data")
print(X_train.shape,y_train.shape)
print("Validation Xy data")
print(X_valid.shape,y_valid.shape)
X_train[0]

Training Xy data
(25000, 1) (25000, 1)
Validation Xy data
(5290, 1) (5290, 1)


array([array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])], dtype=object)

In [9]:
# The SimpleRNN won't accept our numpy arrays. Here is the error.
# ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

# For variable length sequences, it seems we must use a TensorFlow Ragged Tensor, which is much slower.
# https://www.tensorflow.org/guide/tensor#ragged_tensors

X_train = tf.ragged.constant(X_train)  # This takes a long time and hogs memory
print(type(X_train))
print(X_train.shape)
y_train = tf.ragged.constant(y_train)
print(type(y_train))
print(y_train.shape)
X_valid = tf.ragged.constant(X_valid)
print(type(X_valid))
print(X_valid.shape)
y_valid = tf.ragged.constant(y_valid)
print(type(y_valid))
print(y_valid.shape)

# This page offers a speedup for a slightly different problem and I cannot understand it.
# https://stackoverflow.com/questions/44353509/tensorflow-tf-constant-initializer-is-very-slow

# https://www.tensorflow.org/api_docs/python/tf/RaggedTensor#documenting_raggedtensor_shapes_2
# For example, the shape of a 3-D RaggedTensor that stores the fixed-size word embedding for each word in a sentence, for each sentence in a batch, could be written as [num_sentences, (num_words), embedding_size].

<class 'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor'>
(25000, None, None, None)
<class 'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor'>
(25000, None)
<class 'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor'>
(5290, None, None, None)
<class 'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor'>
(5290, None)


In [10]:
# tf.shape(X_train)
# ValueError: TypeError: object of type 'RaggedTensor' has no len()


In [11]:
seq_len=None  # none indicates variable length
input_features=4  # one hot encoding of DNA means 4 categories
rnn2 = keras.models.Sequential([
    keras.layers.SimpleRNN(4, return_sequences=True, input_shape=[seq_len,input_features]),
    keras.layers.SimpleRNN(1),
])
bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
rnn2.compile(loss=bc, optimizer="Adam",metrics=["accuracy"])
rnn2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, None, 4)           36        
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 1)                 6         
Total params: 42
Trainable params: 42
Non-trainable params: 0
_________________________________________________________________


In [12]:
history = rnn2.fit(X_train,y_train,epochs=5,validation_data=(X_valid,y_valid))
# ValueError: Input 0 of layer sequential_1 is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: [None, None, None, None]

Epoch 1/5


ValueError: in user code:

    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:885 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:176 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: [None, None, None, None]


In [None]:
pd.DataFrame(history.history).plot(figsize=(8,5))
plt.grid(True)
plt.gca().set_ylim(0,1)
plt.show()