# RNN with OneHot tensor
Combine three previous achievements.
Still hold off on the stratified split.
This time use tensors instead of numpy arrays.

In [23]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
tf.keras.backend.set_floatx('float64')


Load data from FASTA files.

In [24]:
MIN_SEQ_LEN=200
MAX_SEQ_LEN=25000
DEFLINE='>'
# Assume file was preprocessed to contain one line per seq.
# Returned structure is ndarray of ndarray i.e no python lists inside.
def load_fasta(filename):
    seqs=[]
    with open (filename,'r') as infile:
        for line in infile:
            if line[0]!=DEFLINE and len(line)>=MIN_SEQ_LEN and len(line)<=MAX_SEQ_LEN:
                line=line.rstrip()
                chars=np.array(list(line))
                seqs.append(chars.reshape(-1, 1)) # reshaped changes (any,) to (any,1)
    nparray=np.array(seqs)
    return nparray

ncfile='ncRNA.fasta' 
print("Load "+ncfile)
nc_seqs = load_fasta(ncfile)
pcfile='pcRNA.fasta' 
print("Load "+pcfile)
pc_seqs = load_fasta(pcfile)

Load ncRNA.fasta
Load pcRNA.fasta


Encode DNA letters with 4-bit one-hot encoding.

In [30]:
encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
print("Fit")
#seq=tf.reshape(nc_seqs[0],shape=(-1, 1)) # tensor flow version
seq=nc_seqs[0].reshape(-1, 1)
encoder.fit(seq)
encoder.categories_

Fit


[array(['A', 'C', 'G', 'T'], dtype='<U1')]

In [31]:
encoded=encoder.transform(seq) # returns a numpy.ndarray
print(type(encoded))
encoded

<class 'numpy.ndarray'>


array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

In [34]:
print("non-coding")
nc_list=[]
for seq in nc_seqs:
    encoded=encoder.transform(seq)  
    nc_list.append(encoded)
    
nc_all=np.array(nc_list).reshape(-1,1)
nc_all.shape

non-coding


(17711, 1)

In [35]:
print("protein-coding")
pc_list=[]
for seq in pc_seqs:
    encoded=encoder.transform(seq)
    pc_list.append(encoded)

pc_all=np.array(pc_list).reshape(-1,1)
pc_all.shape

protein-coding


(20152, 1)

Add labels. Create train and validation sets.

In [36]:
nc_labels=np.zeros(shape=(17711,1))
pc_labels=np.ones(shape=(20152,1))

#nc_labeled=np.concatenate((nc_all,nc_labels),axis=1)
#pc_labeled=np.concatenate((pc_all,pc_labels),axis=1)
all_seqs=np.concatenate((nc_all,pc_all),axis=0)
all_labels=np.concatenate((nc_labels,pc_labels),axis=0)

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit

splitter = ShuffleSplit(n_splits=1, test_size=0.2, random_state=37863)
for train_index,test_index in splitter.split(all_seqs):
    train_seqs =   all_seqs[train_index]
    train_labels = all_labels[train_index]
    test_seqs =    all_seqs[test_index]
    test_labels =  all_labels[test_index]
 
print("train")
print(train_seqs.shape,train_labels.shape)
print("test")
print(test_seqs.shape,test_labels.shape)

train
(30290, 1) (30290, 1)
test
(7573, 1) (7573, 1)


In [37]:
# Now that values are shuffled, partition gives random sample.
X_train=train_seqs[:25000]
X_valid=train_seqs[25000:]
y_train=train_labels[:25000]
y_valid=train_labels[25000:]
print("Training Xy data")
print(X_train.shape,y_train.shape)
print("Validation Xy data")
print(X_valid.shape,y_valid.shape)
X_train[0]

Training Xy data
(25000, 1) (25000, 1)
Validation Xy data
(5290, 1) (5290, 1)


array([array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])], dtype=object)

In [38]:
#X_train = X_train.reshape(25000,MAX_SEQ_LEN,4)
#y_train = y_train.reshape(25000,MAX_SEQ_LEN,1)
#X_valid = X_valid.reshape(5290,MAX_SEQ_LEN,4)
#y_valid = y_valid.reshape(5290,MAX_SEQ_LEN,1)
X_train=tf.convert_to_tensor(X_train)
y_train=tf.convert_to_tensor(y_train)
X_valid=tf.convert_to_tensor(X_valid)
y_valid=tf.convert_to_tensor(y_valid)
# Documentation of this method says it supports numpy array.
# But the error indicates otherwise:
# ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [None]:
seq_len=None  # none indicates variable length
input_features=4  # one hot encoding of DNA means 4 categories
rnn2 = keras.models.Sequential([
    keras.layers.SimpleRNN(4, return_sequences=True, input_shape=[seq_len,input_features]),
    keras.layers.SimpleRNN(1),
])
bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
rnn2.compile(loss=bc, optimizer="Adam",metrics=["accuracy"])
history = rnn2.fit(X_train,y_train,epochs=5,validation_data=(X_valid,y_valid))

In [None]:
pd.DataFrame(history.history).plot(figsize=(8,5))
plt.grid(True)
plt.gca().set_ylim(0,1)
plt.show()