# CNN Demo

## Computing Environment Setup
TO DO: add a CoLab badge

In [1]:
PC_SEQUENCES=1000
NC_SEQUENCES=1000
BASES=55
ALPHABET=4
INPUT_SHAPE_2D = (BASES,ALPHABET,1) #2D inputs
INPUT_SHAPE = (BASES,ALPHABET) #1D inputs
CELLS = 16
FILTERS = 16
WIDTH = 3
STRIDE_2D = (1,1)
STRIDE = 1
EPOCHS=5  # use 5 for software testing, 50 for model testing
SPLITS=5
FOLDS=1  # max = num splits
TEST_PORTION = 0.1

In [2]:
import sys
try:
    from google.colab import drive
    IN_COLAB = True
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    r = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/blob/main/SimTools/RNA_gen.py')
    with open('RNA_gen.py', 'w') as f:
        f.write(r.text)  # delete the file later?
except:
    print("On my PC, use relative paths.")
    IN_COLAB = False
    DATAPATH='data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs

On my PC, use relative paths.


In [3]:
from os import listdir
import time # datetime
import csv
from zipfile import ZipFile

import numpy as np
import pandas as pd
from scipy import stats  # mode

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

from keras.models import Sequential
from keras.layers import Dense,Embedding
from keras.layers import Conv1D,Conv2D
from keras.layers import Flatten,MaxPooling1D,MaxPooling2D

import matplotlib.pyplot as plt
from matplotlib import colors
mycmap = colors.ListedColormap(['red','blue'])  # list color for label 0 then 1
np.set_printoptions(precision=2)

from SimTools.RNA_gen import *
if not assert_imported_RNA_gen():
    print("ERROR: Cannot use RNA_gen.")

## Data Preparation

In [11]:
# print(datetime.datetime.now())
t = time.time()
time.strftime('%Y-%m-%d %H:%M:%S %Z', time.localtime(t))

'2021-05-06 13:52:31 EDT'

In [12]:
def get_all_sequences():
    cgen = Collection_Generator()
    sgen = Sequence_Oracle()
    lgen = Length_Oracle()
    lgen.set_mean(BASES)
    cgen.set_seq_oracle(sgen)
    cgen.set_len_oracle(lgen)
    pc_seqs=cgen.get_sequences(PC_SEQUENCES)
    nc_seqs=cgen.get_sequences(NC_SEQUENCES)
    return pc_seqs,nc_seqs
pc_seqs,nc_seqs = get_all_sequences()

In [13]:
def prepare_for_learning(pcs,ncs):
    NUM_SAMPLES=PC_SEQUENCES+NC_SEQUENCES
    samples = nc_seqs + pc_seqs
    X_shape = (NUM_SAMPLES,BASES,ALPHABET)
    Y_shape = (NUM_SAMPLES,1)
    y=np.concatenate((np.zeros(NC_SEQUENCES,dtype=np.int8),
                      np.ones(PC_SEQUENCES,dtype=np.int8)))
    X=np.zeros(X_shape,dtype=np.int8)
    base_to_dim = {'A':0, 'C':1, 'G':2, 'T':3}
    for s in range(0,NUM_SAMPLES):  # TO DO: speed this up by avoiding loops
        sample = samples[s]
        for b in range(0,BASES): # use len(sample) if length varies
            base = sample[b]
            d = base_to_dim[base]   # TO DO: error on non-ACGT
            X[s,b,d]=1
    X,y = shuffle(X,y,random_state=4200)
    return X,y
X,y = prepare_for_learning(pc_seqs,nc_seqs)
print("X shape:",X.shape)
print("y shape:",y.shape)


X shape: (2000, 55, 4)
y shape: (2000,)


## Model build, train, test

In [19]:
def make_DNN():
    print("make_DNN")
    print("input shape:",INPUT_SHAPE)
    EMBED_DIM = 3 # for 4-letter one-hot inputs, encode each letter as 3D vector
    dnn = Sequential()
    dnn.add(Embedding(ALPHABET,EMBED_DIM,input_length=BASES)) 
    dnn.add(Conv1D( 
            filters=FILTERS,kernel_size=WIDTH,strides=STRIDE,
            activation=None, padding="same"))
    dnn.add(Conv1D(
            filters=FILTERS,kernel_size=WIDTH,strides=STRIDE,
            activation=None, padding="same"))
    dnn.add(MaxPooling1D(strides=STRIDE,padding="same"))
    dnn.add(Flatten())
    dnn.add(Dense(1))   
    dnn.compile(optimizer='adam')
    dnn.build(input_shape=INPUT_SHAPE)
    #ln_rate = tf.keras.optimizers.Adam(learning_rate = LN_RATE)
    #bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
    #model.compile(loss=bc, optimizer=ln_rate, metrics=["accuracy"])
    return dnn
model = make_DNN()
print(model.summary())

make_DNN
input shape: (55, 4)
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 55, 3)             12        
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 55, 16)            160       
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 55, 16)            784       
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 55, 16)            0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 880)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 881       
Total params: 1,837
Trainable params: 1,837
Non-trainable params: 0
______________________

In [20]:
def do_cross_validation(X,y):
    cv_scores = []
    fold=0
    splitter = KFold(n_splits=SPLITS, random_state=37863)
    for train_index,valid_index in splitter.split(X):
        if fold < FOLDS:
            fold += 1
            X_train=X[train_index] # use iloc[] for dataframe
            y_train=y[train_index]
            X_valid=X[valid_index]
            y_valid=y[valid_index]        
            # Avoid continually improving the same model.
            print("MODEL")
            model = model = make_DNN()
            #bestname=MODEL_FILE+".cv."+str(fold)+".best"
            #mycallbacks = [keras.callbacks.ModelCheckpoint(
            #    filepath=bestname, save_best_only=True, 
            #    monitor='val_accuracy', mode='max')]   
            print("FIT")
            start_time=time.time()
            history=model.fit(X_train, y_train, 
                    epochs=EPOCHS, verbose=1, # callbacks=mycallbacks,
                    validation_data=(X_valid,y_valid))
            # THE VALIDATION ABOVE IS JUST FOR SHOW
            end_time=time.time()
            elapsed_time=(end_time-start_time)                        
            print("Fold %d, %d epochs, %d sec"%(fold,EPOCHS,elapsed_time))
            pd.DataFrame(history.history).plot(figsize=(8,5))
            plt.grid(True)
            plt.gca().set_ylim(0,1)
            plt.show()
            #best_model=keras.models.load_model(bestname)
            # THE VALIDATION BELOW IS FOR KEEPS
            #scores = best_model.evaluate(X_VALID, Y_VALID, verbose=0)
            #print("%s: %.2f%%" % (best_model.metrics_names[1], scores[1]*100))
            #cv_scores.append(scores[1] * 100)  
    print()
    print("%d-way Cross Validation mean %.2f%% (+/- %.2f%%)" % (fold, np.mean(cv_scores), np.std(cv_scores)))

In [21]:
do_cross_validation(X,y)

MODEL
make_DNN
input shape: (55, 4)
FIT
Epoch 1/5


ValueError: in user code:

    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:754 train_step
        y_pred = self(x, training=True)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1012 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/sequential.py:375 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py:424 call
        return self._run_internal_graph(
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/functional.py:560 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:219 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer max_pooling1d_6 is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (32, 55, 4, 16)
