# CNN + RNN 
Compare to RNN 229.


In [13]:
from os import listdir
import csv
from zipfile import ZipFile
import numpy as np
import pandas as pd
from scipy import stats  # mode

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import SimpleRNN
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Dense
from keras.losses import MeanSquaredError
from keras.layers import Conv2D
from keras.layers import Flatten
from keras.layers import MaxPooling2D
from keras.layers import TimeDistributed

import matplotlib.pyplot as plt
from matplotlib import colors
mycmap = colors.ListedColormap(['red','blue'])  # list color for label 0 then 1
np.set_printoptions(precision=2)

In [14]:
# Constants
EPOCHS=50  # use 5 for software testing, 50 for model testing
SITE = 'Eagle'
PREDICTORS = ['hour','month','doy','meter','cloudCoverage', 'airTemperature', 'dewTemperature', 'precipDepth1HR', 'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed']
# For our CNN, min predictive features is six.
NUM_PREDICTORS=len(PREDICTORS)
print("PREDICTORS=",NUM_PREDICTORS,PREDICTORS)
PREDICTED_VARIABLE = 'meter'  
STEPS_HISTORY = 24
STEPS_FORWARD = 12 
STEPS_FUTURE =  12 
METER_FILE='steam.csv'
WEATHER_FILE='weather.csv'
EXAMPLE='Eagle_lodging_Edgardo'
SITE_BUILDINGS = None
SMOOTHING_WINDOW=3
SCALING=1
CELLS = 16
FILTERS = 16
WIDTH = 3
STRIDE = (1,1)
INPUT_SHAPE = (STEPS_FORWARD,NUM_PREDICTORS,1) 

PREDICTORS= 12 ['hour', 'month', 'doy', 'meter', 'cloudCoverage', 'airTemperature', 'dewTemperature', 'precipDepth1HR', 'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed']


In [15]:
DATAPATH=''
try:
    # On Google Drive, set path to my drive / data directory.
    from google.colab import drive
    IN_COLAB = True
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
except:
    # On home computer, set path to local data directory.
    IN_COLAB = False
    DATAPATH='data/'  # must end in "/"

ZIP_FILE='BuildingData.zip'
ZIP_PATH = DATAPATH+ZIP_FILE
MODEL_FILE='Model'  # will be used later to save models

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [16]:
def scale(df):
    scaler=StandardScaler()
    #scaler=MinMaxScaler()
    scaled=scaler.fit_transform(df.values)
    scaled = pd.DataFrame(scaled,index=df.index,columns=df.columns)
    return scaled

In [17]:
def read_zip_to_panda(zip_filename,csv_filename):
    zip_handle = ZipFile(zip_filename)
    csv_handle = zip_handle.open(csv_filename)
    panda = pd.read_csv(csv_handle)
    return panda
def fix_date_type(panda):
    # Convert the given timestamp column to the pandas datetime data type.
    panda['timestamp'] = pd.to_datetime(panda['timestamp'], infer_datetime_format = True)
    indexed = panda.set_index(['timestamp'])
    return indexed


In [18]:
DATE_PARSE=True  # must be true if we use one of these as predictor
def load_weather_for_site(site):
    wet_df = read_zip_to_panda(ZIP_PATH,WEATHER_FILE)
    wet_df = fix_date_type(wet_df)
    site_df = wet_df.loc[wet_df['site_id'] == site]
    # Drop the site, which is constant (we selected for one site).
    site_df = site_df.drop(['site_id'],axis=1)
    if DATE_PARSE:
        site_df.insert(0,'hour',0)
        site_df.insert(1,'month',0)
        site_df.insert(2,'doy',0)
        L=len(site_df)
        for i in range(0,L):
            dt=site_df.index[i]
            hour=dt.hour
            month=dt.month
            doy=dt.dayofyear
            site_df.iat[i,0] = hour
            site_df.iat[i,1] = month
            site_df.iat[i,2] = doy
    #if SCALING==1:
    #    site_df = scale(site_df) # could break if any column is empty
    return site_df

one_site_weather = load_weather_for_site(SITE)
one_site_weather.tail()

Unnamed: 0_level_0,hour,month,doy,airTemperature,cloudCoverage,dewTemperature,precipDepth1HR,precipDepth6HR,seaLvlPressure,windDirection,windSpeed
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-12-31 18:00:00,18,12,365,-11.1,0.0,-20.6,0.0,,1026.2,330.0,2.6
2017-12-31 20:00:00,20,12,365,-12.2,0.0,-21.1,0.0,,1027.0,320.0,1.5
2017-12-31 21:00:00,21,12,365,-12.8,0.0,-21.1,0.0,,1027.2,310.0,2.6
2017-12-31 22:00:00,22,12,365,-12.8,0.0,-20.6,0.0,,1027.4,330.0,3.1
2017-12-31 23:00:00,23,12,365,-12.8,0.0,-20.6,0.0,,1027.4,320.0,4.6


In [19]:
def load_meter_for_building(bldg,smooth=0):
    all_df = read_zip_to_panda(ZIP_PATH,METER_FILE)
    all_df = fix_date_type(all_df)
    global SITE_BUILDINGS
    SITE_BUILDINGS = [x for x in all_df.columns if x.startswith(SITE)]
    site_series = all_df[bldg]
    site_df = site_series.to_frame()
    #site_df = all_df.loc[all_df['site_id'] == site]
    # Change column name from building name to meter.
    site_df = site_df.rename(columns={bldg : PREDICTED_VARIABLE})
    if smooth>0:
        site_df = site_df.rolling(smooth).mean()
    return site_df

one_bldg_meter = load_meter_for_building(EXAMPLE)
print(type(one_bldg_meter))
one_bldg_meter.tail()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,meter
timestamp,Unnamed: 1_level_1
2017-12-31 19:00:00,92.2957
2017-12-31 20:00:00,277.5584
2017-12-31 21:00:00,280.5331
2017-12-31 22:00:00,289.3302
2017-12-31 23:00:00,164.3474


In [20]:
# Make X out of weather + meter features, then select wanted columns.
# Make y out of meter features, then select meter column only.
# Apply scaler to X only.
# Pull out time steps for X from the past: STEPS_HISTORY.
# Pull out proper number of time steps for X: STEPS_FORWARD.
# Pull out proper number of time steps for y: STEPS_FUTURE.
# Make X inefficiently with 3-deep nested for loop because
# a) want to replace NaN with previous value
# b) need to add the RGB dimension for CNN.
def prepare_for_learning(wdf,mdf):
    df = pd.concat([wdf,mdf],axis=1)
    if SCALING==1:
        df = scale(df) # could break if any column is empty
    num_samples = len(df) - STEPS_FUTURE - STEPS_HISTORY
    predictor_series = df[PREDICTORS].values  # selected features
    predicted_series = mdf[PREDICTED_VARIABLE].values  # meter
    #
    X_shape = (num_samples,STEPS_FUTURE,NUM_PREDICTORS,1) # RGB = 1
    Y_shape = (num_samples,STEPS_FUTURE)
    X=np.zeros(X_shape)
    y=np.zeros(Y_shape)
    for sam in range (0,num_samples): 
        prev_val = 0
        one_sample = predictor_series[sam:sam+STEPS_FORWARD]
        for time in range (0,STEPS_FORWARD): 
            one_period = one_sample[time]
            for feat in range (0,NUM_PREDICTORS):
                val = one_period[feat]
                if np.isnan(val):
                    val = prev_val
                else:
                    prev_val = val
                X[sam,time,feat,0] = val  # RGB dim = 0
        for time in range (0,STEPS_FUTURE):  
            y[sam,time]=predicted_series[sam+STEPS_HISTORY+time]
    return X,y 
print(one_bldg_meter.head())
X,y = prepare_for_learning(one_site_weather,one_bldg_meter)
print("X shape:",X.shape)
print("y shape:",y.shape)


                       meter
timestamp                   
2016-01-01 00:00:00  31.7661
2016-01-01 01:00:00  27.4004
2016-01-01 02:00:00  38.4989
2016-01-01 03:00:00  59.1697
2016-01-01 04:00:00  39.9556
X shape: (17508, 12, 12, 1)
y shape: (17508, 12)


In [21]:
print("X columns:",PREDICTORS)
print("X example:\n",X[100].astype(int))
print("y example:\n",y[100].astype(int))

X columns: ['hour', 'month', 'doy', 'meter', 'cloudCoverage', 'airTemperature', 'dewTemperature', 'precipDepth1HR', 'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed']
X example:
 [[[-1]
  [-1]
  [-1]
  [ 1]
  [ 0]
  [-2]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 1]]

 [[ 0]
  [-1]
  [-1]
  [ 2]
  [ 0]
  [-2]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 1]]

 [[ 0]
  [-1]
  [-1]
  [ 2]
  [ 0]
  [-2]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [-1]
  [ 1]]

 [[ 0]
  [-1]
  [-1]
  [ 0]
  [ 0]
  [-2]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 0]]

 [[ 0]
  [-1]
  [-1]
  [ 2]
  [ 0]
  [-2]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 0]]

 [[ 0]
  [-1]
  [-1]
  [ 3]
  [ 0]
  [-2]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 0]]

 [[ 0]
  [-1]
  [-1]
  [ 3]
  [ 0]
  [-2]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 0]]

 [[ 0]
  [-1]
  [-1]
  [ 3]
  [ 0]
  [-1]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 0]]

 [[ 0]
  [-1]
  [-1]
  [ 3]
  [ 0]
  [-1]
  [-2]
  [ 0]
  [ 0]
  [ 2]
  [ 1]
  [ 0]]

 [[ 0]
  [-1]
  [-1]
  [ 2]
  [ 

In [22]:
def make_DNN():
    print("make_DNN")
    print("input shape:",INPUT_SHAPE)
    dnn = Sequential()
    dnn.add(Conv2D( input_shape=INPUT_SHAPE,
            filters=FILTERS,kernel_size=WIDTH,strides=STRIDE,
            activation=None, padding="valid"))
    dnn.add(Conv2D(
            filters=FILTERS,kernel_size=WIDTH,strides=STRIDE,
            activation=None, padding="valid"))
    dnn.add(MaxPooling2D())
    dnn.add(TimeDistributed(Flatten()))
    #dnn.add(GRU(CELLS,return_sequences=True))
    dnn.add(GRU(CELLS,return_sequences=False))
    dnn.add(Dense(STEPS_FUTURE))   
    dnn.compile(optimizer='adam',loss=MeanSquaredError())
    dnn.build(input_shape=INPUT_SHAPE)
    return dnn    

In [23]:
cors = []
overall = 0
cnt = 0
one_site_weather = load_weather_for_site(SITE)
for BLDG in SITE_BUILDINGS:
    print("Building",BLDG)
    one_bldg_meter = load_meter_for_building(BLDG,SMOOTHING_WINDOW)
    count_bad = one_bldg_meter[PREDICTED_VARIABLE].isna().sum()
    MAX_BAD = 500
    if count_bad<=MAX_BAD:
        # Must get rid of Nan labels, else loss hits NaN during training.
        print(" Count bad values before pseudofill:",count_bad)
        pseudovalue = one_bldg_meter[PREDICTED_VARIABLE].mean()
        one_bldg_meter = one_bldg_meter.fillna(pseudovalue)
        count_bad = one_bldg_meter[PREDICTED_VARIABLE].isna().sum()
        print(" Count bad values after pseudofill:",count_bad)
        # Smoothing window applies to inputs
        X,y = prepare_for_learning(one_site_weather,one_bldg_meter)
        split = len(X)//2   # year 1 vs year 2
        X_train = np.asarray(X[0:split])
        y_train = np.asarray(y[0:split])
        X_test = np.asarray(X[split:])
        # Smoothing does not apply to truth
        one_bldg_meter = load_meter_for_building(BLDG,0)
        one_bldg_meter = one_bldg_meter.fillna(pseudovalue)
        X_raw,y_raw = prepare_for_learning(one_site_weather,one_bldg_meter)
        y_test = np.asarray(y_raw[split:])
        # Train and predict
        model = make_DNN()
        print(model.summary())
        example=411
        print("Example y train:\n",y_train[example].astype(int))
        model.fit(X_train,y_train,epochs=EPOCHS)
        y_pred = model.predict(X_test)
        # Reporting
        rmse = mean_squared_error(y_test,y_pred,squared=False)
        mean = one_bldg_meter[PREDICTED_VARIABLE].mean()
        cors.append([mean,rmse,rmse/mean,BLDG])
        cnt += 1
        print("i,mean,rmse,rmse/mean,bldg:",cnt,mean,rmse,rmse/mean,BLDG)
        overall += rmse/mean
        for hr in range(0,24,2):
            print("Example prediction:\n",hr,y_pred[example+hr].astype(int))


Building Eagle_office_Lamont
 Count bad values before pseudofill: 17
 Count bad values after pseudofill: 0
make_DNN
input shape: (12, 12, 1)
Model: "sequential_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_78 (Conv2D)           (None, 10, 10, 16)        160       
_________________________________________________________________
conv2d_79 (Conv2D)           (None, 8, 8, 16)          2320      
_________________________________________________________________
max_pooling2d_39 (MaxPooling (None, 4, 4, 16)          0         
_________________________________________________________________
time_distributed_39 (TimeDis (None, 4, 64)             0         
_________________________________________________________________
gru_78 (GRU)                 (None, 16)                3936      
_________________________________________________________________
dense_39 (Dense)             (None, 12)     



i,mean,rmse,rmse/mean,bldg: 7 0.0 0.0005283642390658173 inf Eagle_office_Henriette
Example prediction:
 0 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 2 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 4 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 6 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 8 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 10 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 12 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 14 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 16 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 18 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 20 [0 0 0 0 0 0 0 0 0 0 0 0]
Example prediction:
 22 [0 0 0 0 0 0 0 0 0 0 0 0]
Building Eagle_health_Reba
 Count bad values before pseudofill: 36
 Count bad values after pseudofill: 0
make_DNN
input shape: (12, 12, 1)
Model: "sequential_46"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_92 (Conv2D)          

In [24]:
print("History",STEPS_HISTORY,"Future",STEPS_FUTURE)
print("Column 1: Mean usage.")
print("Column 2: RMSE of LinearRegression(X=Weather, y=Usage).")
print("Column 3: RMSE/mean normalized to help understand RMSE.")
print("Column 4: Building.")
for cor in sorted(cors):
    print("%10.2f %10.2f %5.2f   %s"%(cor[0],cor[1],cor[2],cor[3]))  
overall = overall/cnt
print ("overall = ",overall)  

History 24 Future 12
Column 1: Mean usage.
Column 2: RMSE of LinearRegression(X=Weather, y=Usage).
Column 3: RMSE/mean normalized to help understand RMSE.
Column 4: Building.
      0.00       0.00   inf   Eagle_office_Henriette
      0.11       0.04  0.34   Eagle_education_Wesley
     15.76      43.70  2.77   Eagle_education_Jewell
     35.89       9.90  0.28   Eagle_office_Mandi
     36.93       9.71  0.26   Eagle_office_Lamont
     43.44      55.50  1.28   Eagle_lodging_Blake
     46.46      24.03  0.52   Eagle_education_Eileen
     56.51      76.82  1.36   Eagle_office_Dallas
     57.05      35.62  0.62   Eagle_education_Petra
     62.02      56.46  0.91   Eagle_office_Sheree
     81.97      68.55  0.84   Eagle_lodging_Edgardo
     87.21      23.41  0.27   Eagle_office_Phyllis
     91.28      76.31  0.84   Eagle_lodging_Trina
     92.83      57.10  0.62   Eagle_lodging_Dawn
    101.60      65.47  0.64   Eagle_office_Freida
    103.18      29.08  0.28   Eagle_education_Shana
    121.