In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import lightgbm as lgb

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import time

import datetime as datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Configure cufflinks for offline plotting; cufflinks supplies the `.iplot()` method
# that is called on pandas objects later in this notebook.
cf.set_config_file(offline=True)

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file below the Kaggle input directory.
# `os` is already imported in the first cell, so it is not re-imported here, and the
# unused directory list gets a real name instead of `_` (which IPython uses for the
# last cell output).
for input_dir, _subdirs, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_dir, input_file))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Kaggle input directory holding the pickle files that the following cells load.
path_file = r'/kaggle/input/create-pickle-for-dataset/'

# Load dataset

In [None]:
# Merged dataset; its 'kW' columns are summed into the total demand further down.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
merged_pickle = os.path.join(path_file, 'df_merged.pickle.gz')
df_data = pd.read_pickle(merged_pickle)
df_data

In [None]:
# Weather table; the modeling cells join it onto the demand data by calendar date.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
weather_pickle = os.path.join(path_file, 'df_weather.pickle.gz')
df_weather = pd.read_pickle(weather_pickle)
df_weather

In [None]:
# Holiday table loaded from the pickle.
# Note: the name `df_holiday_encode` is assigned again later in this notebook from the
# scraped holiday calendar, which replaces the frame loaded here.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
holiday_pickle = os.path.join(path_file, 'df_holiday_encode.pickle.gz')
df_holiday_encode = pd.read_pickle(holiday_pickle)
df_holiday_encode

# Process dataset (power meter data at 15-minute intervals)

In [None]:
# Keep only the columns whose name contains 'kW', add them up row by row into a single
# 'total_demand' series, then average the readings that fall within each hour.
# NOTE(review): `sum(axis=1)` skips NaN, so a row in which every selected column is missing
# sums to 0 rather than NaN - confirm that this is intended.
is_kw_column = df_data.columns.str.contains('kW')
df_powerMeter = (
    df_data.loc[:, is_kw_column]
    .sum(axis=1)
    .rename('total_demand')
    .resample('H')
    .mean()
)
df_powerMeter

In [None]:
# Interactive chart of the hourly total demand (`.iplot()` comes from cufflinks).
df_powerMeter.iplot()

# Regression & visualizations

## The first forecasting model with only time-series features

In [None]:
# Prepare data for modeling: hourly demand plus calendar features only.
# The frame is built directly on the timestamp index. The original detour through
# `reset_index()` / `set_index('Date')` and the temporary `date` column (created and then
# dropped without being used) produced the same result.
df_temp = (
    df_powerMeter
    .dropna()                      # discard hours without a demand value
    .rename('total_demand_meas')
    .to_frame()
)

# Add timestamp features taken from the datetime index
df_temp['weekday'] = df_temp.index.weekday   # 0 = Monday ... 6 = Sunday
df_temp['hour'] = df_temp.index.hour

df_temp

In [None]:
# Weekly profiles of building energy
df_plot = df_temp.copy()
df_plot['date'] = pd.to_datetime(df_plot.index.date)
df_plot.pivot_table(columns=['weekday','hour'], index='date', values='total_demand_meas').T.plot(figsize=(15,5),color='black',alpha=0.1,legend=False)

In [None]:
# Chronological split: 2018-07 .. 2019-06 for training, everything from 2019-07 on for testing.
traindata = df_temp.loc['2018-7':'2019-6'].copy()
testdata = df_temp.loc['2019-7':].copy()

target_col = 'total_demand_meas'
pred_col = 'total_demand_pred'

# Features are every column except the target and the prediction column. The prediction
# column is written back into `df_temp` below, so excluding it explicitly keeps this cell
# idempotent: re-running it must not hand an earlier run's predictions to the model.
feature_cols = [col for col in df_temp.columns if col not in (target_col, pred_col)]

train_labels = traindata[target_col]
test_labels = testdata[target_col]

train_features = traindata[feature_cols]
test_features = testdata[feature_cols]

# First model: calendar features (weekday, hour) only, LightGBM defaults.
LGB_model = lgb.LGBMRegressor()
LGB_model.fit(train_features, train_labels)

testdata[pred_col] = LGB_model.predict(test_features)

# Keep the test-period predictions next to the measurements in the full frame as well.
df_temp.loc[testdata.index, pred_col] = testdata[pred_col]

# Score every test row that has both a measurement and a prediction, and use that same
# row set for all four metrics.
scored = testdata.dropna(subset=[target_col, pred_col])
measured = scored[target_col]
predicted = scored[pred_col]
residuals = measured - predicted
n_obs = measured.count()
mean_measured = measured.mean()

# Absolute errors
errors = (predicted - measured).abs()

# MAPE: mean absolute percentage error.
# NOTE(review): a measured value of exactly 0 makes this infinite.
MAPE = 100 * (errors / measured).mean()
# NMBE: normalised mean bias error (positive = the model under-predicts on average).
NMBE = 100 * residuals.sum() / (n_obs * mean_measured)
# CVRSME: coefficient of variation of the RMSE, usually abbreviated CV(RMSE); the existing
# variable name and printed label are kept unchanged.
CVRSME = 100 * np.sqrt((residuals ** 2).sum() / (n_obs - 1)) / mean_measured
RSQUARED = r2_score(measured, predicted)

print("MAPE: " + str(round(MAPE, 2)))
print("NMBE: " + str(round(NMBE, 2)))
print("CVRSME: " + str(round(CVRSME, 2)))
print("R SQUARED: " + str(round(RSQUARED, 2)))

testdata[[target_col, pred_col]].iplot()

## The second forecasting model with time-series and weather features

In [None]:
# Prepare data for modeling
df_temp = df_powerMeter.reset_index().copy()
df_temp = df_temp.dropna()

# Add timestamp features
df_temp['weekday'] = df_temp['Date'].dt.weekday
df_temp['hour'] = df_temp['Date'].dt.hour
df_temp['date'] =pd.to_datetime(df_temp['Date'].dt.date)

# Add weather features
df_temp = df_temp.merge(df_weather.reset_index(), left_on='date', right_on='index')

df_temp = df_temp.set_index('Date').drop(['date', 'index'],axis=1)

df_temp = df_temp.rename(columns={'total_demand':'total_demand_meas'})

df_temp

In [None]:
# Scatter plot of energy consumption against outdoor temperature, using daily means and
# one panel each for weekdays and weekends.
df_plot = df_temp.resample('D').mean()
# Saturday (5) and Sunday (6) count as weekend; every other day is labelled 'weekday'.
df_plot['weekday/weekend'] = np.where(df_plot['weekday'] > 4, 'weekend', 'weekday')

# `sns.relplot` is a figure-level function that creates its own figure, so the former
# `plt.figure(figsize=(10,10))` call only left an empty figure in the output and has been
# removed. Use relplot's `height` / `aspect` arguments to change the size.
# Note: despite its name, `ax` holds the FacetGrid returned by relplot, not an Axes.
ax = sns.relplot(x="Avg Temp", y="total_demand_meas", col="weekday/weekend",
                 kind="scatter", data=df_plot, alpha=0.8)

In [None]:
# Chronological split: 2018-07 .. 2019-06 for training, everything from 2019-07 on for testing.
traindata = df_temp.loc['2018-7':'2019-6'].copy()
testdata = df_temp.loc['2019-7':].copy()

target_col = 'total_demand_meas'
pred_col = 'total_demand_pred'

# Features are every column except the target and the prediction column. The prediction
# column is written back into `df_temp` below, so excluding it explicitly keeps this cell
# idempotent: re-running it must not hand an earlier run's predictions to the model.
feature_cols = [col for col in df_temp.columns if col not in (target_col, pred_col)]

train_labels = traindata[target_col]
test_labels = testdata[target_col]

train_features = traindata[feature_cols]
test_features = testdata[feature_cols]

# Second model: calendar + weather features, LightGBM defaults.
LGB_model = lgb.LGBMRegressor()
LGB_model.fit(train_features, train_labels)

testdata[pred_col] = LGB_model.predict(test_features)

# Keep the test-period predictions next to the measurements in the full frame as well.
df_temp.loc[testdata.index, pred_col] = testdata[pred_col]

# Score every test row that has both a measurement and a prediction, and use that same
# row set for all four metrics. (Previously NMBE / CVRSME / R2 also dropped rows with a
# missing feature value while MAPE did not, so the metrics could cover different rows.)
scored = testdata.dropna(subset=[target_col, pred_col])
measured = scored[target_col]
predicted = scored[pred_col]
residuals = measured - predicted
n_obs = measured.count()
mean_measured = measured.mean()

# Absolute errors
errors = (predicted - measured).abs()

# MAPE: mean absolute percentage error.
# NOTE(review): a measured value of exactly 0 makes this infinite.
MAPE = 100 * (errors / measured).mean()
# NMBE: normalised mean bias error (positive = the model under-predicts on average).
NMBE = 100 * residuals.sum() / (n_obs * mean_measured)
# CVRSME: coefficient of variation of the RMSE, usually abbreviated CV(RMSE); the existing
# variable name and printed label are kept unchanged.
CVRSME = 100 * np.sqrt((residuals ** 2).sum() / (n_obs - 1)) / mean_measured
RSQUARED = r2_score(measured, predicted)

print("MAPE: " + str(round(MAPE, 2)))
print("NMBE: " + str(round(NMBE, 2)))
print("CVRSME: " + str(round(CVRSME, 2)))
print("R SQUARED: " + str(round(RSQUARED, 2)))

testdata[[target_col, pred_col]].iplot()

## The third forecasting model with time-series, weather, and holiday features

In [None]:
def _scrape_thai_holidays(year: int) -> pd.DataFrame:
    """Download the Thailand holiday table for `year` from timeanddate.com.

    Takes the first HTML table on the page, keeps the top level of its column header,
    drops completely empty rows and returns the 'Date', 'Name' and 'Type' columns with
    'Date' parsed to datetimes (the year is prepended to the page's date text before
    parsing).

    Requires network access; `pd.read_html` raises if the page cannot be fetched or
    contains no table.
    """
    url = 'https://www.timeanddate.com/holidays/thailand/' + str(year)
    table = pd.read_html(url)[0]
    table.columns = table.columns.get_level_values(0)
    # Explicit copy: the column subset is modified on the next line, and assigning into
    # an un-copied slice is what triggers pandas' SettingWithCopyWarning.
    table = table.dropna(how='all')[['Date', 'Name', 'Type']].copy()
    table['Date'] = pd.to_datetime(str(year) + ' ' + table['Date'])
    return table


df_holiday_2018 = _scrape_thai_holidays(2018)
df_holiday_2019 = _scrape_thai_holidays(2019)

# One row per calendar day between the first and last listed holiday; days that are not
# holidays get NaN in 'Name' / 'Type'.
df_holiday = (
    pd.concat([df_holiday_2018, df_holiday_2019], axis=0, ignore_index=True)
    .drop_duplicates(subset=['Date'])
    .set_index('Date')
    .asfreq('D')
)

# Saturdays and Sundays are labelled 'weekend'; this also overwrites a holiday that
# falls on a weekend.
is_weekend = df_holiday.index.weekday >= 5
df_holiday.loc[is_weekend, ['Name', 'Type']] = 'weekend'
df_holiday.columns = 'holiday_' + df_holiday.columns

# Expose the day as a regular `date` column so it can be used as a merge key.
df_holiday = df_holiday.reset_index().rename(columns={'Date': 'date'})

df_holiday

In [None]:
# Integer-encode the two holiday label columns. Values are cast to str first, so missing
# entries become the string 'nan' and receive a code of their own. A fresh LabelEncoder is
# fitted per column, so the codes of the two columns are independent of each other.
holiday_label_cols = ['holiday_Name', 'holiday_Type']
df_holiday_encode = df_holiday.assign(**{
    label_col: LabelEncoder().fit_transform(df_holiday[label_col].astype('str'))
    for label_col in holiday_label_cols
})
df_holiday_encode

In [None]:
# Prepare data for modeling: hourly demand + calendar, weather and holiday features.
df_temp = (
    df_powerMeter
    .reset_index()
    .dropna()
    # Timestamp features; `date` (the timestamp truncated to its day) is only the join key.
    .assign(
        weekday=lambda frame: frame['Date'].dt.weekday,
        hour=lambda frame: frame['Date'].dt.hour,
        date=lambda frame: pd.to_datetime(frame['Date'].dt.date),
    )
    # Weather features (default inner merge on the calendar day)
    .merge(df_weather.reset_index(), left_on='date', right_on='index')
    # Holiday features (default inner merge on the calendar day)
    .merge(df_holiday_encode, on='date')
    .set_index('Date')
    .drop(columns=['date', 'index'])
    .rename(columns={'total_demand': 'total_demand_meas'})
)

df_temp

In [None]:
# Chronological split: 2018-07 .. 2019-06 for training, everything from 2019-07 on for testing.
traindata = df_temp.loc['2018-7':'2019-6'].copy()
testdata = df_temp.loc['2019-7':].copy()

target_col = 'total_demand_meas'
pred_col = 'total_demand_pred'

# Features are every column except the target and the prediction column. The prediction
# column is written back into `df_temp` below, so excluding it explicitly keeps this cell
# idempotent: re-running it must not hand an earlier run's predictions to the model.
feature_cols = [col for col in df_temp.columns if col not in (target_col, pred_col)]

train_labels = traindata[target_col]
test_labels = testdata[target_col]

train_features = traindata[feature_cols]
test_features = testdata[feature_cols]

# Third model: calendar + weather + holiday features, LightGBM defaults.
LGB_model = lgb.LGBMRegressor()
LGB_model.fit(train_features, train_labels)

testdata[pred_col] = LGB_model.predict(test_features)

# Keep the test-period predictions next to the measurements in the full frame as well.
df_temp.loc[testdata.index, pred_col] = testdata[pred_col]

# Score every test row that has both a measurement and a prediction, and use that same
# row set for all four metrics. (Previously NMBE / CVRSME / R2 also dropped rows with a
# missing feature value while MAPE did not, so the metrics could cover different rows.)
scored = testdata.dropna(subset=[target_col, pred_col])
measured = scored[target_col]
predicted = scored[pred_col]
residuals = measured - predicted
n_obs = measured.count()
mean_measured = measured.mean()

# Absolute errors
errors = (predicted - measured).abs()

# MAPE: mean absolute percentage error.
# NOTE(review): a measured value of exactly 0 makes this infinite.
MAPE = 100 * (errors / measured).mean()
# NMBE: normalised mean bias error (positive = the model under-predicts on average).
NMBE = 100 * residuals.sum() / (n_obs * mean_measured)
# CVRSME: coefficient of variation of the RMSE, usually abbreviated CV(RMSE); the existing
# variable name and printed label are kept unchanged.
CVRSME = 100 * np.sqrt((residuals ** 2).sum() / (n_obs - 1)) / mean_measured
RSQUARED = r2_score(measured, predicted)

print("MAPE: " + str(round(MAPE, 2)))
print("NMBE: " + str(round(NMBE, 2)))
print("CVRSME: " + str(round(CVRSME, 2)))
print("R SQUARED: " + str(round(RSQUARED, 2)))

testdata[[target_col, pred_col]].iplot()

## Aggregated by different levels

In [None]:
# Daily means of measured vs. predicted demand over the test period
# (`testdata` is whatever the most recently run model cell produced).
testdata[['total_demand_meas', 'total_demand_pred']].resample('D').mean().iplot()

In [None]:
# Monthly means of measured vs. predicted demand over the test period, as a bar chart
# (`testdata` is whatever the most recently run model cell produced).
testdata[['total_demand_meas', 'total_demand_pred']].resample('M').mean().iplot(kind='bar')