# GRU 301
Start with CNN 316 but use RNN like GRU 220.
Use one-hot (like CNN) rather than K-mer embedding (like RNN).

In [1]:
# Input FASTA files (bare names; the data directory is prepended below).
# Training uses the "long" files and validation uses the "short" files.
TRAIN_NC_FILENAME='ncRNA.gc36.long.fasta'
TRAIN_PC_FILENAME='pcRNA.gc36.long.fasta'
VALID_NC_FILENAME='ncRNA.gc36.short.fasta'
VALID_PC_FILENAME='pcRNA.gc36.short.fasta'
#NC_FILENAME='ncRNA.tiny50.fasta'
#PC_FILENAME='pcRNA.tiny50.fasta'
#NC_FILENAME='ncRNA.gc34.processed.fasta'
#PC_FILENAME='pcRNA.gc34.processed.fasta'

MODEL_FILE='JUNK1'  # change this if you want to keep models
DATAPATH=''

# Detect Google Colab by attempting the import. Only a failed import means
# "not on Colab"; any other problem (e.g. the Drive mount failing or being
# interrupted) is allowed to surface instead of silently switching to the
# local data directory.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    DATAPATH='data/'  # must end in "/"
else:
    IN_COLAB = True
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"

# Turn the bare file names into full paths under the chosen data directory.
TRAIN_NC_FILENAME = DATAPATH+TRAIN_NC_FILENAME
TRAIN_PC_FILENAME = DATAPATH+TRAIN_PC_FILENAME
VALID_NC_FILENAME = DATAPATH+VALID_NC_FILENAME
VALID_PC_FILENAME = DATAPATH+VALID_PC_FILENAME
MODEL_FILE=DATAPATH+MODEL_FILE

EPOCHS=200    # upper bound on epochs per fit
SPLITS=5      # number of training rounds in do_cross_validation
ONEHOT=5      # width of one encoded position (last dimension of the model input)
NEURONS=64    # units per GRU layer and per hidden Dense layer
DROP=0.10     # dropout rate
ACT="tanh"    # activation for the GRU and hidden Dense layers
MINLEN=201    # length bounds passed to tools.make_slice
MAXLEN=2000   # also the padded sequence length fed to the model
LIMIT=5588    # This is the size of our smallest set (protein coding short)

Mounted at /content/drive/


In [2]:
# Load our own tools
# TO DO: don't go to GitHub if the file is already local.
GITHUB = True
if GITHUB:
    #!pip install requests  # Uncomment this if necessary. Seems to be pre-installed.
    import requests
    r = requests.get(
        'https://raw.githubusercontent.com/ShepherdCode/ShepherdML/master/Strings/tools_fasta.py',
        timeout=30)
    # Stop on an HTTP error (404, 5xx, ...) rather than saving the error page
    # as tools_fasta.py, which would only show up later as a confusing failure
    # of the import below.
    r.raise_for_status()
    # Python reads source files as UTF-8 by default, so write it that way.
    with open('tools_fasta.py', 'w', encoding='utf-8') as f:
        f.write(r.text)
    # TO DO: delete the file after import
import tools_fasta as tools
tools.yahoo()  # If this prints "Yahoo!" then the import was successful.

TOOLS_CHANGED = False   # set to True to re-run with a new version of tools
if TOOLS_CHANGED:
    from importlib import reload
    tools=reload(tools)
    print(dir(tools))   # run this to see EVERYTHING in the tools module

Yahoo!


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow import keras
import time
# Default float type for Keras; `dt` is also passed explicitly to the Dense
# layers created in build_model().
dt = 'float32'
keras.backend.set_floatx(dt)

Build model

In [4]:
def compile_model(model):
    """Compile `model` for binary classification and hand it back.

    The model is compiled in place with binary cross-entropy evaluated on
    probabilities (from_logits=False), the "adam" optimizer named by string,
    and accuracy as the reported metric.

    Args:
        model: an object exposing a Keras-style compile() method.

    Returns:
        The same `model` object, after compile() has been called on it.
    """
    # Kept for reference: an alternative Adam setup with exponential decay,
    #   learn rate = initial_learning_rate * decay_rate ^ (step / decay_steps)
    #adam_default_learn_rate = 0.001
    #schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    #    initial_learning_rate = adam_default_learn_rate*10,
    #    decay_steps=10000, decay_rate=0.99, staircase=True)
    #alrd = tf.keras.optimizers.Adam(learning_rate=schedule)
    #model.compile(loss=bc, optimizer=alrd, metrics=["accuracy"])

    print("COMPILE...")
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
    print("...COMPILED")
    return model

def build_model():
    """Assemble the stacked-GRU classifier and return it compiled.

    Layer order: a Masking layer over [MAXLEN, ONEHOT] inputs, four GRU
    layers of NEURONS units (the first three return full sequences, the last
    returns only its final output), two Dense+Dropout pairs, and a single
    sigmoid output unit.

    Returns:
        The Sequential model after compile_model() has been applied to it.
    """
    # Value handed to the Masking layer as mask_value.
    # to do: get one_hot('N') from the one_hot() function.
    pad_vector = [0,0,0,0,1]

    # Layers are instantiated in model order, so Keras hands out the same
    # automatic layer names as when each layer is created one by one.
    stack = [keras.layers.Masking(mask_value=pad_vector,
                                  input_shape=[MAXLEN,ONEHOT])]
    for keep_sequence in (True, True, True, False):
        stack.append(keras.layers.GRU(NEURONS, return_sequences=keep_sequence,
                                      activation=ACT, dropout=DROP))
    for _ in range(2):
        stack.append(keras.layers.Dense(NEURONS, activation=ACT, dtype=dt))
        stack.append(keras.layers.Dropout(DROP))
    stack.append(keras.layers.Dense(1, activation="sigmoid", dtype=dt))

    rnn = keras.models.Sequential()
    for layer in stack:
        rnn.add(layer)
    return compile_model(rnn)

Cross validation

In [5]:
def do_cross_validation(X_train,y_train,given_model,X_valid,y_valid):
    """Train SPLITS fresh copies of `given_model` and summarize their accuracy.

    Every round fits on the same (X_train, y_train) and validates on the same
    (X_valid, y_valid); the data is NOT re-partitioned between rounds, so the
    rounds differ only through randomness in training (a newly cloned model
    each time, shuffle=True).

    For each round the checkpoint with the best val_accuracy is saved under
    MODEL_FILE + ".cv.<round>.best", reloaded, and evaluated on the
    validation data. The training history is plotted after every round.

    Args:
        X_train, y_train: training inputs and labels passed to fit().
        given_model: template model; it is cloned and recompiled each round
            and is never trained itself.
        X_valid, y_valid: validation inputs and labels.

    Returns:
        list of float: the reloaded best model's validation accuracy, in
        percent, one entry per round (empty if SPLITS < 1).
    """
    cv_scores = []
    # When not using ShuffleSplit, make sure to randomize train data.
    #splitter = ShuffleSplit(n_splits=SPLITS, test_size=0.1, random_state=37863)
    #for train_index,valid_index in splitter.split(X):
    for fold in range(1, SPLITS+1):
        # Avoid continually improving the same model.
        model = compile_model(keras.models.clone_model(given_model))
        bestname=MODEL_FILE+".cv."+str(fold)+".best"
        # Note: stopping watches val_loss while the checkpoint keeps the
        # epoch with the highest val_accuracy.
        es = keras.callbacks.EarlyStopping(monitor='val_loss',
            patience=10, verbose=1)
        mc = keras.callbacks.ModelCheckpoint(
            filepath=bestname, save_best_only=True,
            monitor='val_accuracy', mode='max')
        mycallbacks = [es,mc]
        print("FIT")
        start_time=time.time()
        history=model.fit(X_train, y_train, # batch_size=10, default=32 works nicely
                epochs=EPOCHS, verbose=1,  # verbose=1 for ascii art, verbose=0 for none
                callbacks=mycallbacks, shuffle=True,
                validation_data=(X_valid,y_valid) )
        elapsed_time=time.time()-start_time
        # Early stopping can end training before EPOCHS, so report the number
        # of epochs actually recorded rather than the configured maximum.
        epochs_run=len(history.history['loss'])
        print("Fold %d, %d epochs, %d sec"%(fold,epochs_run,elapsed_time))
        pd.DataFrame(history.history).plot(figsize=(8,5))
        plt.grid(True)
        plt.gca().set_ylim(0,1)
        plt.show()
        # Score the checkpointed best model, not the last-epoch weights.
        best_model=keras.models.load_model(bestname)
        scores = best_model.evaluate(X_valid, y_valid, verbose=0)
        print("%s: %.2f%%" % (best_model.metrics_names[1], scores[1]*100))
        cv_scores.append(scores[1] * 100)

    print()
    if not cv_scores:
        # np.amax() raises on an empty list; nothing to summarize.
        print("No folds were run (SPLITS=%d)." % SPLITS)
        return cv_scores
    print("%d-way Cross Validation max %.2f%%, mean %.2f%% (+/- %.2f%%)" %
          (len(cv_scores), np.amax(cv_scores), np.mean(cv_scores), np.std(cv_scores)))
    return cv_scores

## Train on RNA lengths 201-2000 (MINLEN to MAXLEN)

In [6]:
# Build the template model once; do_cross_validation() clones it per round.
print ("Compile the model")
model=build_model()
print ("Summarize the model")
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()  # Print this only once
#model.save(MODEL_FILE+'.model')


Compile the model
COMPILE...
...COMPILED
Summarize the model
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (None, 2000, 5)           0         
_________________________________________________________________
gru (GRU)                    (None, 2000, 64)          13632     
_________________________________________________________________
gru_1 (GRU)                  (None, 2000, 64)          24960     
_________________________________________________________________
gru_2 (GRU)                  (None, 2000, 64)          24960     
_________________________________________________________________
gru_3 (GRU)                  (None, 64)                24960     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_____________________________________________________________

In [7]:
def load_data(nc_filename,pc_filename,limit=None,random_state=None):
  """Load two FASTA files and return one-hot encoded inputs plus labels.

  Args:
    nc_filename: FASTA file loaded with tools.load_fasta(..., 0)
      (presumably 0 is the class label for non-coding -- confirm in tools).
    pc_filename: FASTA file loaded with tools.load_fasta(..., 1)
      (presumably 1 is the class label for protein-coding -- confirm in tools).
    limit: if given, each set larger than `limit` is reduced to a random
      subset of exactly `limit` rows. Side effect: randomizes the row order
      of any set that gets reduced. The limit is applied before the
      MINLEN/MAXLEN slice, so the slice may reduce the count further.
    random_state: optional seed forwarded to DataFrame.sample() so the subset
      is reproducible; None (the default) keeps the sampling unseeded.

  Returns:
    (X, y): X is a numpy array of the one-hot encoded, length-normalized
    sequences (shape (N, 2000, 5) in the recorded run); y is the numpy
    array of labels from tools.separate_X_and_y().
  """
  nc_seq=tools.load_fasta(nc_filename,0)
  pc_seq=tools.load_fasta(pc_filename,1)
  if limit is not None:
    # Choose a random subset of size = limit.
    # Comparing counts directly (rather than computing limit/len) also
    # avoids a ZeroDivisionError when an input set is empty.
    if limit<len(nc_seq):
      nc_seq=nc_seq.sample(n=limit,random_state=random_state)
    if limit<len(pc_seq):
      pc_seq=pc_seq.sample(n=limit,random_state=random_state)
  combined=pd.concat((nc_seq,pc_seq),axis=0)
  # May not need to slice by length if input files already sliced
  subset=tools.make_slice(combined,MINLEN,MAXLEN)
  # randset=subset.sample(frac=1) # unnecessary with fit(shuffle)
  (X1,y1)=tools.separate_X_and_y(subset)
  # X1 is pandas df of ("list" of one sequence)
  X2=X1.to_numpy()  # numpy ndarray of ("list" of one sequence)
  X3=[elem[0] for elem in X2]  # python list: first column of every row
  X4=tools.uniform_length(X3,MAXLEN)
  X5= [tools.one_hot(x) for x in X4]
  # one-hot leaves ONEHOT=5. TO DO: set 5 automatically
  X6=np.asarray(X5)
  y6=y1.to_numpy()
  return X6,y6

# Training data: capped at LIMIT sequences per class.
print("Load train data from files.")
X_train, y_train = load_data(TRAIN_NC_FILENAME, TRAIN_PC_FILENAME, limit=LIMIT)
print("X_train", X_train.shape)
print(X_train[1])  # one encoded sequence, as a sanity check

# Validation data: same cap, read from the validation files.
print("Load valid data from files.")
X_valid, y_valid = load_data(VALID_NC_FILENAME, VALID_PC_FILENAME, limit=LIMIT)
print("X_valid", X_valid.shape)

Load train data from files.
X_train (11176, 2000, 5)
[[1 0 0 0 0]
 [0 0 0 1 0]
 [0 0 0 1 0]
 ...
 [0 0 0 0 1]
 [0 0 0 0 1]
 [0 0 0 0 1]]
Load valid data from files.
X_valid (11176, 2000, 5)


In [None]:
# Long-running cell: trains SPLITS clones of `model`, each for up to EPOCHS epochs.
print("Cross valiation")
do_cross_validation(X_train, y_train, model, X_valid, y_valid)
print("Done")

Cross valiation
COMPILE...
...COMPILED
FIT
Epoch 1/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 14/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 15/200
Epoch 16/200
Epoch 17/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 22/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 23/200
Epoch 24/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 25/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 26/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 27/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 28/200
Epoch 29/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 30/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 31/200
Epoch 32/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 33/200
Epoch 34/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 35/200
Epoch 36/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 37/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 38/200
Epoch 39/200
Epoch 40/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 41/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 42/200
Epoch 43/200
Epoch 44/200




INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/data/JUNK1.cv.1.best/assets


Epoch 45/200