# Development 3 - Heat Prediction with SARIMAX
## Monthly dataset based on mean values of meters

In [1]:
# Import Python modules
import warnings
import itertools
import numpy as np
from numpy import concatenate, savetxt, unique, array, subtract
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import pandas as pd
from pandas import merge, DataFrame, Series
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
import matplotlib
from math import sqrt
from statistics import mean


In [2]:
# Load and format the datasets.
# `squeeze=True` is no longer passed to read_csv: the keyword was deprecated in
# pandas 1.4 and removed in pandas 2.0 (it raises TypeError there). It had no
# effect on these files, which all have more than one column. The regex
# separator '[,]' with the python engine is replaced by the default comma
# parsing, and dates are converted explicitly with pd.to_datetime below.
df = pd.read_csv('CHL_Monthly_Stacked.csv', header=0)
# NOTE(review): df1a is not used by any cell visible here; kept in case a later
# cell relies on it.
df1a = pd.DataFrame(index=['Month'], columns=['prediction', 'actual'])
# First month of the evaluation window used by the model cell.
pred_date = '2019-03-01'
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
# Positional rename: assumes the file's column order is meter, date, consumption.
df.columns = ['MeterNo', 'Month', 'kWh']

# Monthly DDH values, joined onto the meter readings by month.
df_DDH = pd.read_csv('DDH_Monthly.csv', header=0)
df_DDH['Date'] = pd.to_datetime(df_DDH['Date'], dayfirst=True)
df_DDH.columns = ['Month', 'DDH']
df = merge(df, df_DDH, on='Month')

# Per-meter attributes (provides the Area_m2 column used below), joined by meter.
df_meters = pd.read_csv('CHL-Meters.csv', header=0)
df = merge(df, df_meters, on='MeterNo')

# Index by month and drop the meter identifier by name (previously dropped by
# position, which silently depended on the column order after the merges).
y = df.set_index('Month').drop(columns='MeterNo').sort_index()

# Create the summary datasets and merge: one row per month holding the mean of
# each column across all meters.
ya = y.groupby(y.index)['kWh'].mean()
yb = y.groupby(y.index)['DDH'].mean()
yc = y.groupby(y.index)['Area_m2'].mean()
z = merge(ya, yb, left_index=True, right_index=True)
z = merge(z, yc, left_index=True, right_index=True)
print(z)


                   kWh  DDH    Area_m2
Month                                 
2013-01-01  474.596491  318  72.519298
2013-02-01  467.385965  311  72.519298
2013-03-01  455.115789  380  72.519298
2013-04-01  389.592982  254  72.519298
2013-05-01  280.585965  168  72.519298
...                ...  ...        ...
2020-10-01  342.726316  183  72.519298
2020-11-01  425.635088  220  72.519298
2020-12-01  554.003509  326  72.519298
2021-01-01  560.175439  358  72.519298
2021-02-01  515.698246  261  72.519298

[98 rows x 3 columns]


In [3]:
# SARIMAX model configuration.
# The first column of z is the target series; every remaining column is passed
# as an exogenous regressor.
# NOTE(review): the printed z shows Area_m2 with the same value on every visible
# row; a constant regressor probably adds no information - confirm it is wanted.
endog = z.iloc[:, 0]
exog = z.iloc[:, 1:]
mod = sm.tsa.statespace.SARIMAX(endog=endog,
                                order=(1, 1, 1),
                                seasonal_order=(0, 1, 1, 12),
                                exog=exog,
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()

# NOTE(review): the model is fitted on the whole series, including the months
# from pred_date onward, and dynamic=False requests one-step-ahead predictions.
# The figures below are therefore in-sample predictions, not out-of-sample
# forecasts.
pred = results.get_prediction(start=pd.to_datetime(pred_date), dynamic=False)
y_forecasted = pred.predicted_mean
y_forecasted = y_forecasted.to_frame()
y_truth = z[pred_date:]

# Pair each prediction with the observed target value. The target column is
# selected by name instead of dropping columns at fixed positions, so this keeps
# working if the number of exogenous columns in z changes.
df1 = merge(y_forecasted, y_truth[[endog.name]], left_index=True, right_index=True)
df1.columns = ['prediction', 'actual']
# Signed error: positive means the model over-predicted.
df1['error'] = (df1['prediction'] - df1['actual']).round(2)
print(df1)



            prediction      actual  error
2019-03-01  389.618161  432.150877 -42.53
2019-04-01  353.944754  385.712281 -31.77
2019-05-01  314.603398  302.915789  11.69
2019-06-01  240.461316  227.119298  13.34
2019-07-01  202.568557  211.235088  -8.67
2019-08-01  199.133824  194.691228   4.44
2019-09-01  235.215084  259.235088 -24.02
2019-10-01  313.906126  321.540351  -7.63
2019-11-01  438.546205  418.789474  19.76
2019-12-01  525.382704  533.392982  -8.01
2020-01-01  555.918246  553.136842   2.78
2020-02-01  521.435975  537.428070 -15.99
2020-03-01  494.063053  463.421053  30.64
2020-04-01  372.493933  360.564912  11.93
2020-05-01  288.691126  304.842105 -16.15
2020-06-01  242.187705  246.343860  -4.16
2020-07-01  231.526083  228.463158   3.06
2020-08-01  211.730911  209.263158   2.47
2020-09-01  255.263290  239.087719  16.18
2020-10-01  286.687657  342.726316 -56.04
2020-11-01  451.335385  425.635088  25.70
2020-12-01  549.996145  554.003509  -4.01
2021-01-01  589.002150  560.175439

In [4]:
# Keep only the rows of df1 in which no column holds a missing value
df2 = df1.dropna(axis=0, how='any')
print(df2)

            prediction      actual  error
2019-03-01  389.618161  432.150877 -42.53
2019-04-01  353.944754  385.712281 -31.77
2019-05-01  314.603398  302.915789  11.69
2019-06-01  240.461316  227.119298  13.34
2019-07-01  202.568557  211.235088  -8.67
2019-08-01  199.133824  194.691228   4.44
2019-09-01  235.215084  259.235088 -24.02
2019-10-01  313.906126  321.540351  -7.63
2019-11-01  438.546205  418.789474  19.76
2019-12-01  525.382704  533.392982  -8.01
2020-01-01  555.918246  553.136842   2.78
2020-02-01  521.435975  537.428070 -15.99
2020-03-01  494.063053  463.421053  30.64
2020-04-01  372.493933  360.564912  11.93
2020-05-01  288.691126  304.842105 -16.15
2020-06-01  242.187705  246.343860  -4.16
2020-07-01  231.526083  228.463158   3.06
2020-08-01  211.730911  209.263158   2.47
2020-09-01  255.263290  239.087719  16.18
2020-10-01  286.687657  342.726316 -56.04
2020-11-01  451.335385  425.635088  25.70
2020-12-01  549.996145  554.003509  -4.01
2021-01-01  589.002150  560.175439

In [5]:
# Mean squared error between the observed and the predicted values
mse = mean_squared_error(y_true=df2['actual'], y_pred=df2['prediction'])
print(mse)

458.4794729408605


In [6]:
# Root-mean-squared error: the square root of mse, which puts the error back in
# the same units as the 'actual' / 'prediction' values
rmse = np.sqrt(mse)
print(rmse)

21.412133778324392


In [7]:
# Mean of the actual values over the rows kept in df2
print(df2['actual'].mean())

367.81549707602335


In [None]:
# PRMSE: RMSE divided by the mean actual value, i.e. the error as a fraction of
# the average observation (multiply by 100 for a percentage)
rmse/df2['actual'].mean()

In [None]:
# Summary statistics
def Summary(x):
    """Return the min, max, mean, sum and count of ``x``, rounded to 2 decimals.

    Parameters
    ----------
    x : object exposing ``min``, ``max``, ``mean``, ``sum`` and ``count``
        methods (for example a pandas Series).

    Returns
    -------
    pandas.Series
        Indexed by 'min', 'max', 'mean', 'sum' and 'count', in that order.
    """
    stats = {
        'min': x.min(),
        'max': x.max(),
        'mean': x.mean(),
        'sum': x.sum(),
        'count': x.count(),
    }
    return Series(stats).round(2)
# Apply Summary to each column of df2 (DataFrame.apply works column-wise by
# default); the resulting table is shown as the cell output
df2.apply(Summary)