# LSTM 
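We previously used linear regression
to predict future air temperature from past air temperature.
Here we use a recurrent neural network for the same task
(note: the model below is currently built from Keras SimpleRNN layers, not LSTM layers).
Where linear regression viewed each vector as one point,
the recurrent model views each vector as a time series.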

In [1]:
# Locate the data directory: Google Drive when running on Colab,
# a local "data/" directory otherwise.
DATAPATH=''
try:
    # This import only succeeds inside Google Colab.
    from google.colab import drive
except ImportError:
    # On home computer, set path to local data directory.
    IN_COLAB = False
    DATAPATH='data/'  # must end in "/"
else:
    # On Google Drive, set path to my drive / data directory.
    # A failure to mount is no longer swallowed: it surfaces here instead of
    # silently falling back to a local path that does not exist on Colab.
    IN_COLAB = True
    PATH='/content/drive/'
    drive.mount(PATH)
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"

# The zip archive holds one CSV per table.
ZIP_FILE='BuildingData.zip'
ZIP_PATH = DATAPATH+ZIP_FILE
STEAM_FILE='steam.csv'
WEATHER_FILE='weather.csv'
MODEL_FILE='Model'  # will be used later to save models

In [2]:
from os import listdir
import csv
from zipfile import ZipFile
import numpy as np
import pandas as pd
from scipy import stats  # mode

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from keras.models import Sequential
from keras.layers import SimpleRNN
from keras.layers import TimeDistributed
from keras.layers import Dense

import matplotlib.pyplot as plt
from matplotlib import colors
# Two-entry colormap for plots that color points by a 0/1 label.
mycmap = colors.ListedColormap(['red','blue'])  # list color for label 0 then 1
# Print numpy arrays with two digits of precision.
np.set_printoptions(precision=2)

In [3]:
def read_zip_to_panda(zip_filename,csv_filename):
    """Read one CSV member of a zip archive into a pandas dataframe.

    Parameters:
        zip_filename: path of the zip archive.
        csv_filename: name of the CSV member inside the archive.
    Returns:
        The dataframe produced by pd.read_csv with default options.
    """
    # Context managers close both the archive and the member handle,
    # even if parsing raises.
    with ZipFile(zip_filename) as zip_handle:
        with zip_handle.open(csv_filename) as csv_handle:
            return pd.read_csv(csv_handle)
def fix_date_type(panda):
    """Return a copy of the dataframe indexed by its parsed 'timestamp' column.

    Parameters:
        panda: dataframe with a 'timestamp' column that pd.to_datetime can parse.
    Returns:
        A new dataframe whose index is the parsed timestamps (index name
        'timestamp') and whose columns are the remaining columns.
        The input dataframe is left unmodified.
    """
    # infer_datetime_format is deprecated in pandas 2.x, so it is not passed.
    timestamps = pd.to_datetime(panda['timestamp'])
    # assign() builds a new frame, so the caller's column is not overwritten.
    indexed = panda.assign(timestamp=timestamps).set_index(['timestamp'])
    return indexed
def get_site_timeseries(panda,site):
    """Extract the rows of one site and index them by their 'date' column.

    Parameters:
        panda: dataframe with a 'site_id' column and a 'date' column.
        site: value of 'site_id' to keep.
    Returns:
        A new dataframe holding only the rows of that site, with the 'date'
        column removed and its values used as a DatetimeIndex.
    """
    # NOTE(review): this reads a column named 'date', whereas fix_date_type()
    # works on 'timestamp' -- confirm which table this helper is meant for.
    rows_for_site = panda[panda['site_id']==site]
    date_index = pd.DatetimeIndex(rows_for_site['date'].values)
    without_date = rows_for_site.drop('date',axis=1)
    return without_date.set_index(date_index)

In [20]:
# Experiment configuration.
SITE = 'Eagle'   # prefix used to select weather rows and building columns
METER = 'steam'  # name given to the building's meter column
BLDG = 'Eagle_education_Peter'   # one example
STEPS_HISTORY = 7  # past time steps per sample (7 days when downsampled to one step per day)
STEPS_FUTURE = 1    # 1 day
PREDICTOR_VARIABLE = 'airTemperature'  # for starters
PREDICTED_VARIABLE = 'airTemperature'  # for starters

In [21]:
# Load the weather and steam tables from the zip archive and index both by timestamp.
wet_df = fix_date_type(read_zip_to_panda(ZIP_PATH,WEATHER_FILE))
stm_df = fix_date_type(read_zip_to_panda(ZIP_PATH,STEAM_FILE))
# Keep only the weather rows of the chosen site.
site_specific_weather = wet_df[wet_df['site_id'] == SITE]
# In the steam table, each column whose name starts with the site is one building.
all_buildings = list(filter(lambda name: name.startswith(SITE), stm_df.columns))

In [22]:
DOWNSAMPLE = True
def smooth(df,downsample=None,window=24):
    """Average away the daily cycle of an hourly, datetime-indexed dataframe.

    For smoothing the 24 hour cycle, we do not want exponential smoothing.

    Parameters:
        df: dataframe with a DatetimeIndex.
        downsample: True to resample into bins of `window` hours (fewer rows);
            False to use a rolling mean (same time step, warm-up rows dropped);
            None (default) to follow the module-level DOWNSAMPLE flag.
        window: number of hourly steps to average, default 24.
    Returns:
        The smoothed dataframe.
    """
    if downsample is None:
        downsample = DOWNSAMPLE
    if downsample:
        # This alternate method samples down to 1/window time steps.
        # Lowercase 'h' is used because the 'H' alias is deprecated in pandas 2.2+.
        return df.resample("%dh"%window).mean()
    # This method does not reduce the number of time steps.
    # The first window-1 rows have an incomplete window and come out as NaN,
    # so exactly those rows are dropped (the original dropped one valid row too).
    smoothed = df.rolling(window=window).mean()
    return smoothed.iloc[window-1:]

# Correlation is low when buildings have many NaN and 0 meter readings.
# We will ignore buildings that have >max bad meter readings.
def is_usable_column(df,column_name,max_bad=500):
    """Tell whether a meter column has few enough bad readings to be used.

    A reading is bad when it is zero or missing (NaN).

    Parameters:
        df: dataframe holding the column.
        column_name: name of the column to inspect.
        max_bad: largest tolerated number of bad readings (default 500).
    Returns:
        True when the number of bad readings is at most max_bad.
    """
    readings = df[column_name]
    # Count NaN as well as zero, so the result does not depend on the caller
    # having replaced NaN by 0 beforehand.
    bad = (readings.isna() | readings.eq(0)).sum()
    return bool(bad<=max_bad)

def prepare_for_learning(df,predictor=None,predicted=None,steps_history=None,steps_future=None):
    """Cut a time-ordered dataframe into overlapping (history, future) windows.

    Parameters:
        df: dataframe whose rows are consecutive time steps.
        predictor: column used as model input; defaults to PREDICTOR_VARIABLE.
        predicted: column used as model target; defaults to PREDICTED_VARIABLE.
        steps_history: rows per input window; defaults to STEPS_HISTORY.
        steps_future: rows per target window; defaults to STEPS_FUTURE.
    Returns:
        (X, y): two lists of one-column dataframes of equal length.
        X[k] holds steps_history consecutive rows of the predictor and
        y[k] holds the steps_future rows of the predicted column that follow.
        Both lists are empty when df is too short for a single window.
    """
    # This is very slow. Is there a faster way? See...
    # https://stackoverflow.com/questions/27852343/split-python-sequence-time-series-array-into-subsequences-with-overlap
    if predictor is None:
        predictor = PREDICTOR_VARIABLE
    if predicted is None:
        predicted = PREDICTED_VARIABLE
    if steps_history is None:
        steps_history = STEPS_HISTORY
    if steps_future is None:
        steps_future = STEPS_FUTURE
    X=[]
    y=[]
    predictor_series = df[predictor]
    predicted_series = df[predicted]
    # The last usable start of a target window is len(df)-steps_future,
    # hence the +1 on the exclusive bound (the original skipped that window).
    for i in range(steps_history,len(df)-steps_future+1):
        # .iloc makes the positional slicing explicit regardless of index type.
        one_predictor = predictor_series.iloc[i-steps_history:i]
        one_predicted = predicted_series.iloc[i:i+steps_future]
        X.append(one_predictor.to_frame())
        y.append(one_predicted.to_frame())
    return X,y  # both are list of dataframe


In [23]:
def make_RNN():
    rnn = Sequential([
        SimpleRNN(20,return_sequences=True,input_shape=[None,1]),
        SimpleRNN(20,return_sequences=True),
        TimeDistributed(Dense(10))
    ])
    rnn.compile()
    return rnn

In [24]:
cors = []
# Test on only Peter just during code development
for BLDG in all_buildings:
    # Get steam usage for one building.
    bldg_specific_steam = stm_df[[BLDG]]
    # Concatenate steam usage with weather.
    one_bldg_df = pd.concat([bldg_specific_steam,site_specific_weather],axis=1)
    # Drop the site, which is constant (we selected for one site).
    one_bldg_df = one_bldg_df.drop(['site_id'],axis=1)
    # The original steam table used column name = building name.
    # We are processing one building, so rename to the column 'steam'.
    one_bldg_df = one_bldg_df.rename(columns={BLDG : METER})
    # In order to filter bad buildings, count sum of NaN + zero.
    one_bldg_df = one_bldg_df.fillna(0)
    
    if is_usable_column(one_bldg_df,METER):
        one_bldg_df = smooth(one_bldg_df) # moving average: 24hr
        X,y = prepare_for_learning(one_bldg_df)
        if True:
            X = one_bldg_df.drop(METER,axis=1)
            y = one_bldg_df[METER]
            # Ideally, split Year1 = train, Year2 = test.
            # Some data is incomplete, so split 1st half and 2nd half.
            split = len(X)//2 
            X_train = X.iloc[0:split]
            y_train = y.iloc[0:split]
            X_test = X.iloc[split:]
            y_test = y.iloc[split:]
            print(X_train.describe())
            print(X_train.shape())
            model = make_RNN()
            model.fit(X_train,y_train)
            y_pred = model.predict(X_test)
            # Keep a table for reporting later.
            rmse = mean_squared_error(y_test,y_pred,squared=False)
            mean = one_bldg_df[METER].mean()
            cor = one_bldg_df.corr().iloc[0][3] # corr(steam,dew_temp)
            cors.append([cor,mean,rmse,rmse/mean,BLDG])
if True:
    print("History",STEPS_HISTORY,"Future",STEPS_FUTURE)
    print("Column 1: Correlation of steam usage to dew temp.")
    print("          Using dew temp as leading weather correlate.")
    print("Column 2: Mean steam usage.")
    print("          Using mean to help understand the RMSE.")
    print("Column 3: RMSE of LinearRegression(X=Weather, y=SteamUsage).")
    print("Column 4: RMSE/mean normalized to help understand RMSE.")
    print("Column 5: Building.")
    for cor in sorted(cors):
        print("%7.4f %10.2f %10.2f %5.2f   %s"%(cor[0],cor[1],cor[2],cor[3],cor[4]))    

       airTemperature  cloudCoverage  dewTemperature  precipDepth1HR  \
count      365.000000     365.000000      365.000000      365.000000   
mean        13.277329       0.348744        5.803596        0.741667   
std          9.598063       0.511221       10.306974        1.943793   
min        -12.245833       0.000000      -22.837500       -0.958333   
25%          5.987500       0.000000       -2.683333        0.000000   
50%         12.491667       0.250000        5.816667        0.000000   
75%         22.166667       0.500000       15.008333        0.208333   
max         30.425000       7.125000       24.120833       14.416667   

       precipDepth6HR  seaLvlPressure  windDirection   windSpeed  
count      365.000000      365.000000     365.000000  365.000000  
mean         0.897831     1002.914829     181.314155    3.064247  
std          2.408688       38.203056      67.867051    1.288354  
min         -0.166667      544.475000      39.166667    0.708333  
25%          0.0

ValueError: in user code:

    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:754 train_step
        y_pred = self(x, training=True)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:219 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_6 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 8)


## Useful Links

Jason Brownlee  
https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
https://machinelearningmastery.com/suitability-long-short-term-memory-networks-time-series-forecasting/
https://machinelearningmastery.com/autoregression-models-time-series-forecasting-python/