# CNN Demo

Computing Environment Setup

In [8]:
PC_SEQUENCES=1000
NC_SEQUENCES=1000
BASES=55
ALPHABET=4
EPOCHS=5  # use 5 for software testing, 50 for model testing
INPUT_SHAPE_2D = (BASES,ALPHABET,1) #2D inputs
INPUT_SHAPE = (BASES,ALPHABET) #1D inputs
CELLS = 16
FILTERS = 16
WIDTH = 3
STRIDE_2D = (1,1)
STRIDE = 1

In [9]:
import sys
try:
    from google.colab import drive
    IN_COLAB = True
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    r = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/blob/main/SimTools/RNA_gen.py')
    with open('RNA_gen.py', 'w') as f:
        f.write(r.text)  # delete the file later?
except:
    print("On my PC, use relative paths.")
    IN_COLAB = False
    DATAPATH='data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs

On my PC, use relative paths.


In [10]:
from os import listdir
import datetime
import csv
from zipfile import ZipFile

import numpy as np
import pandas as pd
from scipy import stats  # mode

from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense,Embedding
from keras.layers import Conv1D,Conv2D
from keras.layers import Flatten,MaxPooling1D,MaxPooling2D

import matplotlib.pyplot as plt
from matplotlib import colors
mycmap = colors.ListedColormap(['red','blue'])  # list color for label 0 then 1
np.set_printoptions(precision=2)

from SimTools.RNA_gen import *
if not assert_imported_RNA_gen():
    print("ERROR: Cannot use RNA_gen.")

Data Preparation

In [11]:
print(datetime.datetime.now())

2021-05-06 13:07:22.261924


In [12]:
def get_all_sequences():
    cgen = Collection_Generator()
    sgen = Sequence_Oracle()
    lgen = Length_Oracle()
    lgen.set_mean(BASES)
    cgen.set_seq_oracle(sgen)
    cgen.set_len_oracle(lgen)
    pc_seqs=cgen.get_sequences(PC_SEQUENCES)
    nc_seqs=cgen.get_sequences(NC_SEQUENCES)
    return pc_seqs,nc_seqs
pc_seqs,nc_seqs = get_all_sequences()

In [13]:
def prepare_for_learning(pcs,ncs):
    NUM_SAMPLES=PC_SEQUENCES+NC_SEQUENCES
    samples = nc_seqs + pc_seqs
    X_shape = (NUM_SAMPLES,BASES,ALPHABET)
    Y_shape = (NUM_SAMPLES,1)
    y=np.concatenate((np.zeros(NC_SEQUENCES,dtype=np.int8),
                      np.ones(PC_SEQUENCES,dtype=np.int8)))
    X=np.zeros(X_shape,dtype=np.int8)
    base_to_dim = {'A':0, 'C':1, 'G':2, 'T':3}
    for s in range(0,NUM_SAMPLES):  # TO DO: speed this up by avoiding loops
        sample = samples[s]
        for b in range(0,BASES): # use len(sample) if length varies
            base = sample[b]
            d = base_to_dim[base]   # TO DO: error on non-ACGT
            X[s,b,d]=1
    return X,y
X,y = prepare_for_learning(pc_seqs,nc_seqs)
print("X shape:",X.shape)
print("y shape:",y.shape)


X shape: (2000, 55, 4)
y shape: (2000,)


In [16]:
def make_DNN():
    print("make_DNN")
    print("input shape:",INPUT_SHAPE)
    EMBED_DIM = 3 # for 4-letter one-hot inputs, encode each letter as 3D vector
    dnn = Sequential()
    dnn.add(Embedding(ALPHABET,EMBED_DIM,input_length=BASES)) 
    dnn.add(Conv1D( 
            filters=FILTERS,kernel_size=WIDTH,strides=STRIDE,
            activation=None, padding="same"))
    dnn.add(Conv1D(
            filters=FILTERS,kernel_size=WIDTH,strides=STRIDE,
            activation=None, padding="same"))
    dnn.add(MaxPooling1D())
    dnn.add(Flatten())
    dnn.add(Dense(1))   
    dnn.compile(optimizer='adam')
    dnn.build(input_shape=INPUT_SHAPE)
    #ln_rate = tf.keras.optimizers.Adam(learning_rate = LN_RATE)
    #bc=tf.keras.losses.BinaryCrossentropy(from_logits=False)
    #model.compile(loss=bc, optimizer=ln_rate, metrics=["accuracy"])
    return dnn
model = make_DNN()
print(model.summary())

make_DNN
input shape: (55, 4)
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 55, 3)             12        
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 55, 16)            160       
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 55, 16)            784       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 27, 16)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 432)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 433       
Total params: 1,389
Trainable params: 1,389
Non-trainable params: 0
______________________

In [None]:
cors = []
overall = 0
cnt = 0
one_site_weather = load_weather_for_site(SITE)
for BLDG in SITE_BUILDINGS:
    print("Building",BLDG)
    one_bldg_meter = load_meter_for_building(BLDG,SMOOTHING_WINDOW)
    count_bad = one_bldg_meter[PREDICTED_VARIABLE].isna().sum()
    MAX_BAD = 500
    if count_bad<=MAX_BAD:
        # Must get rid of Nan labels, else loss hits NaN during training.
        print(" Count bad values before pseudofill:",count_bad)
        pseudovalue = one_bldg_meter[PREDICTED_VARIABLE].mean()
        one_bldg_meter = one_bldg_meter.fillna(pseudovalue)
        count_bad = one_bldg_meter[PREDICTED_VARIABLE].isna().sum()
        print(" Count bad values after pseudofill:",count_bad)
        # Smoothing window applies to inputs
        X,y = prepare_for_learning(one_site_weather,one_bldg_meter)
        split = len(X)//2   # year 1 vs year 2
        X_train = np.asarray(X[0:split])
        y_train = np.asarray(y[0:split])
        X_test = np.asarray(X[split:])
        # Smoothing does not apply to truth
        one_bldg_meter = load_meter_for_building(BLDG,0)
        one_bldg_meter = one_bldg_meter.fillna(pseudovalue)
        X_raw,y_raw = prepare_for_learning(one_site_weather,one_bldg_meter)
        y_test = np.asarray(y_raw[split:])
        # Train and predict
        model = make_DNN()
        print(model.summary())
        example=411
        print("Example y train:\n",y_train[example].astype(int))
        model.fit(X_train,y_train,epochs=EPOCHS)
        y_pred = model.predict(X_test)
        # Reporting
        rmse = mean_squared_error(y_test,y_pred,squared=False)
        mean = one_bldg_meter[PREDICTED_VARIABLE].mean()
        cors.append([mean,rmse,rmse/mean,BLDG])
        cnt += 1
        print("i,mean,rmse,rmse/mean,bldg:",cnt,mean,rmse,rmse/mean,BLDG)
        overall += rmse/mean
        for hr in range(0,24,2):
            print("Example prediction:\n",hr,y_pred[example+hr].astype(int))