# MLP 312
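Loads pre-trained models and evaluates them; nothing is trained here.
This does not need GPU.
This requires that you run MLP_305 first (takes about 5 min), because the model files loaded below use the `MLP305` prefix.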
This notebook is a modification of MLP_309.

In this notebook, we move some Python code
out of the notebook and into a module called `tools_fasta`, which is imported as `tools`.

In [1]:
# Bare file names; the data directory is prepended below once we know
# whether we are running on Google Colab or locally.
NC_FILENAME='ncRNA.gc34.processed.fasta'
PC_FILENAME='pcRNA.gc34.processed.fasta'
MODEL_FILE='MLP305'   # load not save!
DATAPATH=''

# Only a failed import means "not on Colab". The mount itself runs outside
# the try body so that a genuine mount failure (or a keyboard interrupt) is
# reported instead of silently falling back to the local data directory.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    DATAPATH='data/'  # must end in "/"
else:
    IN_COLAB = True
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
NC_FILENAME = DATAPATH+NC_FILENAME
PC_FILENAME = DATAPATH+PC_FILENAME
MODEL_FILE=DATAPATH+MODEL_FILE

EPOCHS=200
SPLITS=5
K=3
VOCABULARY_SIZE=4**K+1   # e.g. K=3 => 64 DNA K-mers + 'NNN'
EMBED_DIMEN=16
NEURONS=16

Load our own tools module.

In [2]:
# Fetch the helper module from GitHub (when GITHUB is True), write it next to
# the notebook, and import it under the short name `tools`.
GITHUB = True
if GITHUB:
    #!pip install requests  # Uncomment this if necessary. Seems to be pre-installed.
    import requests
    r = requests.get(
        'https://raw.githubusercontent.com/ShepherdCode/ShepherdML/master/Strings/tools_fasta.py',
        timeout=30)  # do not hang forever on a stalled connection
    # Fail here on 404/5xx rather than writing an error page into
    # tools_fasta.py and getting a confusing error from the import below.
    r.raise_for_status()
    with open('tools_fasta.py', 'w', encoding='utf-8') as f:
        f.write(r.text)
    # TO DO: delete the file after import
import tools_fasta as tools
tools.yahoo()  # If this prints "Yahoo!" then the import was successful.

# A module is only imported once per kernel; reload it explicitly to pick up
# a newly downloaded version without restarting.
TOOLS_CHANGED = False   # set to True to re-run with a new version of tools
if TOOLS_CHANGED:
    from importlib import reload
    tools=reload(tools)
    print(dir(tools))   # run this to see EVERYTHING in the tools module

Yahoo!
['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'load_fasta', 'make_frequencies', 'make_kmer_table', 'make_kmers', 'np', 'pd', 'separate_X_and_y', 'strings_to_vectors', 'yahoo']


Load keras and tensorflow etc.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow import keras
#from tensorflow.keras.models import load_model
#from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import Dense
from keras.layers import LayerNormalization
import time
# Use 32 bit data types to save on RAM. The same `dt` string is passed as the
# dtype of every layer created in build_model.
dt = 'float32'
tf.keras.backend.set_floatx(dt)

Define the model

In [4]:
def compile_model(model):
    """Compile `model` for binary classification and return it.

    The model is compiled with binary cross-entropy loss (on probabilities,
    not logits), the accuracy metric, and an Adam optimizer whose learning
    rate starts at ten times 0.001 and is multiplied by 0.99 after every
    10000 steps (staircase decay).
    """
    base_rate = 0.001  # Adam's default learning rate
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=base_rate*10,
        decay_steps=10000,
        decay_rate=0.99,
        staircase=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    print("COMPILE...")
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=["accuracy"])
    print("...COMPILED")
    return model

def build_model(maxlen):
    """Build and compile a three-layer MLP binary classifier.

    Two hidden Dense layers of NEURONS units with ELU activation feed a
    single sigmoid output unit. The first layer declares an input width of
    VOCABULARY_SIZE.

    `maxlen` is accepted but never used by the body.
    """
    activation = "elu"
    # Layers are created in network order, then stacked in the same order.
    stack = [
        keras.layers.Dense(NEURONS, activation=activation, dtype=dt,
                           input_dim=VOCABULARY_SIZE),
        keras.layers.Dense(NEURONS, activation=activation, dtype=dt),
        keras.layers.Dense(1, activation="sigmoid", dtype=dt),
    ]
    network = keras.models.Sequential()
    for layer in stack:
        network.add(layer)
    return compile_model(network)

Partition sequences

In [5]:
def make_slice(data_set,min_len,max_len):
    """Return the rows of `data_set` whose `seqlen` column lies in
    [min_len, max_len], both bounds inclusive.

    The filter is applied with DataFrame.query, so the original index
    labels of the selected rows are kept.
    """
    length_filter = 'seqlen <= {} & seqlen>= {}'.format(max_len, min_len)
    return data_set.query(length_filter)

## Cross validation

In [6]:
from sklearn.metrics import confusion_matrix
def do_evaluation(X,y,splits=None):
    """Evaluate each saved per-fold model on (X, y) and print a summary.

    For fold = 1..splits, loads the model stored at
    MODEL_FILE + ".cv.<fold>.best", prints its score as a percentage and the
    confusion matrix of its predictions thresholded at 0.5, then prints the
    mean and standard deviation of the scores across folds.

    Parameters
    ----------
    X, y : features and labels; every fold's model is evaluated on this
        same full set (no per-fold hold-out is carved out here).
    splits : number of saved fold models to load. Defaults to the
        notebook-level SPLITS constant rather than a hard-coded 5.

    Returns None; all results are printed.
    """
    if splits is None:
        splits = SPLITS
    cv_scores = []
    for fold in range(1, splits+1):
        bestname = MODEL_FILE+".cv."+str(fold)+".best"
        best_model = keras.models.load_model(bestname)
        scores = best_model.evaluate(X, y, verbose=0)
        # scores[0] is the loss; scores[1] is the first compiled metric.
        # Assumes the saved models were compiled with accuracy as that
        # metric, as compile_model does -- confirm against MLP_305.
        print("%.2f%% "%(scores[1]*100), end=' ')
        cv_scores.append(scores[1] * 100)

        # Turn the sigmoid outputs into hard 0/1 labels.
        y_pred = best_model.predict(X)
        y_pred = [1 if n >= 0.5 else 0 for n in y_pred]
        cm = confusion_matrix(y, y_pred)
        print("confusion")
        print(cm)
    print()
    print("mean %.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))

## Load RNA of lengths 200-1Kb and evaluate the saved models

In [7]:
# Sequence-length window (inclusive) kept for evaluation.
MINLEN=200
MAXLEN=1000

print("Load data from files.")
# The second argument is presumably the class label attached to each file
# (0 for the nc file, 1 for the pc file) -- verify in tools_fasta.
nc_seq=tools.load_fasta(NC_FILENAME,0)
pc_seq=tools.load_fasta(PC_FILENAME,1)
train_set=pd.concat([nc_seq,pc_seq],axis=0)
# Drop the references to the per-file frames now that they are combined.
nc_seq=pc_seq=None
print("Ready: train_set")

print("Data prep")
subset=make_slice(train_set,MINLEN,MAXLEN)

print("Data reshape")
# One array to two: X and y
X_train,y_train=tools.make_kmers(K,MAXLEN,subset)
X_train=tools.make_frequencies(K,X_train)
print("Ready")

Load data from files.
Ready: train_set
Data prep
Data reshape
Ready


In [8]:
# Score every saved fold model against the prepared X_train / y_train.
print("Evaluation")
do_evaluation(X_train, y_train)
print("Done")

Evaluation
85.45%  confusion
[[9720  603]
 [1740 4041]]
86.36%  confusion
[[9578  745]
 [1452 4329]]
85.90%  confusion
[[9614  709]
 [1561 4220]]
85.97%  confusion
[[9239 1084]
 [1176 4605]]
86.62%  confusion
[[9558  765]
 [1389 4392]]

mean 86.06% (+/- 0.40%)
Done
