# CNN 317
Balanced GenCode 36.
Cross validation (instead of separate train/test sets used before).

In [1]:
NC_FILENAME='ncRNA.gc36.balance.fasta'
PC_FILENAME='pcRNA.gc36.balance.fasta'
#NC_FILENAME='ncRNA.tiny50.fasta'
#PC_FILENAME='pcRNA.tiny50.fasta'
#NC_FILENAME='ncRNA.gc34.processed.fasta'
#PC_FILENAME='pcRNA.gc34.processed.fasta'

MODEL_FILE='JUNK1'  # change this if you want to keep models
DATAPATH=''

try:
    from google.colab import drive
    IN_COLAB = True
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
except:
    IN_COLAB = False
    DATAPATH='data/'  # must end in "/"
NC_FILENAME = DATAPATH+NC_FILENAME
PC_FILENAME = DATAPATH+PC_FILENAME
MODEL_FILE=DATAPATH+MODEL_FILE

EPOCHS=200
SPLITS=5
K=1
VOCABULARY_SIZE=4**K+1   # e.g. K=3 => 64 DNA K-mers + 'NNN'
EMBED_DIMEN=2
FILTERS=32
KERNEL=3
NEURONS=32
DROP=0.25
ACT="tanh"
MINLEN=201
MAXLEN=2000
LIMIT=None # 5588 is the size of our smallest set (protein coding short)

In [2]:
# Load our own tools
# TO DO: don't go to GitHub if the file is already local.
GITHUB = True
if GITHUB:
    #!pip install requests  # Uncomment this if necessary. Seems to be pre-installed.
    import requests
    r = requests.get('https://raw.githubusercontent.com/ShepherdCode/ShepherdML/master/Strings/tools_fasta.py')
    with open('tools_fasta.py', 'w') as f:
        f.write(r.text)
    # TO DO: delete the file after import
import tools_fasta as tools
tools.yahoo()  # If this prints "Yahoo!" the the import was successful.

TOOLS_CHANGED = False   # set to True to re-run with a new version of tools
if TOOLS_CHANGED:
  from importlib import reload 
  tools=reload(tools)
  print(dir(tools))   # run this to see EVERYTHING in the tools module

Yahoo!


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score
import tensorflow as tf
from tensorflow import keras
import time
dt='float32'
tf.keras.backend.set_floatx(dt)

Build model

In [4]:
def compile_model(model):
    ## learn rate = initial_learning_rate * decay_rate ^ (step / decay_steps)
    #adam_default_learn_rate = 0.001
    #schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    #    initial_learning_rate = adam_default_learn_rate*10,
    #    decay_steps=10000, decay_rate=0.99, staircase=True)
    #alrd = tf.keras.optimizers.Adam(learning_rate=schedule)
    #model.compile(loss=bc, optimizer=alrd, metrics=["accuracy"])

    print("COMPILE...")
    bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(loss=bc, optimizer="adam", metrics=["accuracy"])
    print("...COMPILED")
    return model

def build_model():
    SHAPE=(MAXLEN,5)  # MAXLEN bases = time steps, 5 features = one hots
    clayer1 = keras.layers.Conv1D(FILTERS,KERNEL,activation=ACT,padding="same",
                                 input_shape=SHAPE)
    clayer2 = keras.layers.Conv1D(FILTERS,KERNEL,activation=ACT,padding="same")
    clayer3 = keras.layers.MaxPooling1D(2)
    clayer4 = keras.layers.Conv1D(FILTERS,KERNEL,activation=ACT,padding="same")
    clayer5 = keras.layers.Conv1D(FILTERS,KERNEL,activation=ACT,padding="same")
    clayer6 = keras.layers.MaxPooling1D(2)
    clayer7 = keras.layers.Flatten()

    dlayer1 = keras.layers.Dense(NEURONS, activation=ACT,dtype=dt, input_shape=[1000])
    dlayer2 = keras.layers.Dropout(DROP)
    dlayer3 = keras.layers.Dense(NEURONS, activation=ACT,dtype=dt)
    dlayer4 = keras.layers.Dropout(DROP)
    output_layer = keras.layers.Dense(1, activation="sigmoid", dtype=dt)

    cnn = keras.models.Sequential()
    cnn.add(clayer1)
    cnn.add(clayer2)
    cnn.add(clayer3)
    cnn.add(clayer4)
    cnn.add(clayer5)
    cnn.add(clayer6)
    cnn.add(clayer7)
    cnn.add(dlayer1)
    cnn.add(dlayer2)
    cnn.add(dlayer3)
    cnn.add(dlayer4)
    cnn.add(output_layer)
    mlpc = compile_model(cnn)
    return mlpc

Cross validation

In [5]:
# This version was used for fixed train/test e.g. long/short.
def do_cross_validation(X_train,y_train,given_model,X_valid,y_valid):
    cv_scores = []
    fold=0
    # When not using ShuffleSplit, make sure to randomize train data.
    #splitter = ShuffleSplit(n_splits=SPLITS, test_size=0.1, random_state=37863)
    #for train_index,valid_index in splitter.split(X):
    while fold < SPLITS:
        fold += 1
        # Avoid continually improving the same model.
        model = compile_model(keras.models.clone_model(given_model))
        bestname=MODEL_FILE+".cv."+str(fold)+".best"
        es = keras.callbacks.EarlyStopping(monitor='val_loss',  
            patience=10, verbose=1)
        mc = keras.callbacks.ModelCheckpoint(
            filepath=bestname, save_best_only=True, 
            monitor='val_accuracy', mode='max')  
        mycallbacks = [es,mc]  
        print("FIT")
        start_time=time.time()
        history=model.fit(X_train, y_train, # batch_size=10, default=32 works nicely
                epochs=EPOCHS, verbose=1,  # verbose=1 for ascii art, verbose=0 for none
                callbacks=mycallbacks, shuffle=True,
                validation_data=(X_valid,y_valid) )
        end_time=time.time()
        elapsed_time=(end_time-start_time)                        
        print("Fold %d, %d epochs, %d sec"%(fold,EPOCHS,elapsed_time))
        pd.DataFrame(history.history).plot(figsize=(8,5))
        plt.grid(True)
        plt.gca().set_ylim(0,1)
        plt.show()
        best_model=keras.models.load_model(bestname)
        scores = best_model.evaluate(X_valid, y_valid, verbose=0)
        print("%s: %.2f%%" % (best_model.metrics_names[1], scores[1]*100))
        cv_scores.append(scores[1] * 100)  

    print()
    print("%d-way Cross Validation max %.2f%%, mean %.2f%% (+/- %.2f%%)" % 
          (fold, np.amax(cv_scores), np.mean(cv_scores), np.std(cv_scores)))

In [6]:
# This does actual cross validation of one big X
def do_cross_val(model,splits,X,y,verbose=False):
    skf = StratifiedKFold(n_splits=splits, random_state=456, shuffle=True)
    confusion = np.zeros(shape=[2,2],dtype=np.int8)
    for train_index, test_index in skf.split(X, y):
        X_train,y_train = X[train_index],y[train_index]
        model.fit(X_train, y_train)
        X_test,y_test = X[test_index],y[test_index]
        y_pred = model.predict(X_test)
        # Labels attribute says generate all 4 counts even if none of any category.
        # https://stackoverflow.com/questions/46229965/how-to-make-sklearn-metrics-confusion-matrix-to-always-return-tp-tn-fp-fn
        cf = confusion_matrix(y_test,y_pred,labels=[0,1])
        confusion = np.add(confusion,cf)
        if verbose:
            ba = balanced_accuracy_score(y_test,y_pred)
            acc = accuracy_score(y_test,y_pred)
            # unintuitive order but from documentation
            tn, fp, fn, tp = cf.ravel() 
            print(" Array indices. Train:",train_index, " Test:",test_index)
            print("  y_test=",y_test," y_pred=",y_pred)
            print("  acc=%.2f ba=%.2f tp=%d fp=%d fn=%d tn=%d"%(acc,ba,tp,fp,fn,tn))
    return confusion
def print_confusion(label1,cm1,label2,cm2):
    tn, fp, fn, tp = cm1.ravel()
    acc = (tp+tn)/(tp+tn+fp+fn)
    tpr = tp / (tp+fn)
    tnr = tn / (tn+fp)
    bal = (tpr+tnr)/2
    f1 = (2*tp)/(2*tp+fp+fn)
    TL=' tp fp | %2d %2d | acc=%.2f'%(tp,fp,acc)
    BL=' fn tn | %2d %2d | bal=%.2f'%(fn,tn,bal)
    tn, fp, fn, tp = cm2.ravel()
    acc = (tp+tn)/(tp+tn+fp+fn)
    tpr = tp / (tp+fn)
    tnr = tn / (tn+fp)
    bal = (tpr+tnr)/2
    f1 = (2*tp)/(2*tp+fp+fn)
    TR=' tp fp | %2d %2d | acc=%.2f'%(tp,fp,acc)
    BR=' fn tn | %2d %2d | bal=%.2f'%(fn,tn,bal)
    print("%-30s %-30s"%(label1,label2))
    print("%-30s %-30s"%(TL,TR))
    print("%-30s %-30s"%(BL,BR))

## Train on RNA lengths 200-1Kb

In [7]:
print ("Compile the model")
model=build_model()
print ("Summarize the model")
print(model.summary())  # Print this only once
#model.save(MODEL_FILE+'.model')


Compile the model
COMPILE...
...COMPILED
Summarize the model
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 2000, 32)          512       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2000, 32)          3104      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 1000, 32)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1000, 32)          3104      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1000, 32)          3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 500, 32)           0         
_____________________________________________________________

In [8]:
#test
if False:
    nc_filename=NC_FILENAME
    pc_filename=PC_FILENAME
    nc_seq=tools.load_fasta(nc_filename,0)
    pc_seq=tools.load_fasta(pc_filename,1)
    train_set=pd.concat((nc_seq,pc_seq),axis=0)
    print(type(train_set))
    train_set
    subset=train_set
    (X1,y1)=tools.separate_X_and_y(subset)
    print(type(X1))
    print(type(y1))
    y_np = y1.to_numpy()
    y_npa=[elem[0] for elem in y_np]  
    print(type(y_npa))
    print(type(y_npa[0]))
    y_npa[0:10]
    y_npa[-10:]


In [9]:
def load_data(nc_filename,pc_filename,limit=None):
    nc_seq=tools.load_fasta(nc_filename,0)
    pc_seq=tools.load_fasta(pc_filename,1)
    if limit is not None:
        # Choose a random subset of size = limit.
        # Side effect: randomizes the data order.
        nfrac=1.0*limit/len(nc_seq)
        pfrac=1.0*limit/len(pc_seq)
        if nfrac<1.0:
          nc_seq=nc_seq.sample(frac=nfrac)
        if pfrac<1.0:
          pc_seq=pc_seq.sample(frac=pfrac)
    train_set=pd.concat((nc_seq,pc_seq),axis=0)
    # May not need to slice by length if input files already sliced
    ### subset=tools.make_slice(train_set,MINLEN,MAXLEN)
    subset=train_set
    # randset=subset.sample(frac=1) # unneccessary with fit(shuffle)
    (X1,y1)=tools.separate_X_and_y(subset)
    # X1 is pandas df of ("list" of one sequence)
    X2=X1.to_numpy()  # numpy ndarray of ("list" of one sequence)
    X3=[elem[0] for elem in X2]  # numpy dnarray of str
    X4=tools.uniform_length(X3,MAXLEN)
    X5= [tools.one_hot(x) for x in X4]
    X6=np.asarray(X5)
    y2=y1.to_numpy()
    y3=[elem[0] for elem in y2]  
    return X6,y3

print("Load train data from files.")
X,y = load_data(NC_FILENAME,PC_FILENAME,LIMIT)
print("X.shape",X.shape)

Load train data from files.
X.shape (35528, 2000, 5)


In [10]:
print(type(y))
print(y)

<class 'list'>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [11]:
print("Validation run")
confusion = do_cross_val(build_model(),5,X, y)

Validation run
COMPILE...
...COMPILED


TypeError: only integer scalar arrays can be converted to a scalar index