Importing libraries and packages

In [3]:
#Importing libraries and packages

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import json
from datetime import timedelta

Loading Files

In [4]:
#Loading Cleaned Consumption Data, Weather Data and Tariff Data

# Single root for every input file, so the notebook can be pointed at another
# machine or checkout by editing one line instead of four.
CODE_DIR = r'/Users/rosaliapujols/Desktop/Dissertation/Code'
PREPROCESSING_DIR = f'{CODE_DIR}/Final Code/Pre-processing'

# Consumption readings for household groups D and N.
energy_cleaned_d = pd.read_csv(f'{PREPROCESSING_DIR}/new_energy_d.csv')
energy_cleaned_n = pd.read_csv(f'{PREPROCESSING_DIR}/new_energy_n.csv')
weather = pd.read_csv(f'{CODE_DIR}/Final Code/Weather/weather_cleaned.csv')
tariff = pd.read_csv(f'{CODE_DIR}/Dataset-UKDA-7857-csv/csv/data_collection/data_tables/tariff_d.csv')

In [5]:
#Loading Bank Holidays Data

# One JSON file per year; the file names differ only by the year they cover.
BANK_HOLIDAYS_DIR = '/Users/rosaliapujols/Desktop/Dissertation/Code/bank_holidays'
BANK_HOLIDAY_YEARS = range(2011, 2015)

file_paths = [
    f'{BANK_HOLIDAYS_DIR}/bank_holidays_{year}_england-and-wales.json'
    for year in BANK_HOLIDAY_YEARS
]

dataframes_list = []

for file_path in file_paths:
    # Only the read needs the file handle; release it before building the frame.
    with open(file_path, 'r') as json_file:
        json_data = json.load(json_file)

    df = pd.DataFrame(json_data)
    # Keep only the holiday title and its date.
    df = df.drop(['notes', 'bunting'], axis=1)
    # Dates are parsed as day/month/year strings.
    df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

    dataframes_list.append(df)

# Stack the yearly tables into one frame with a fresh 0..n-1 index.
bank_holidays = pd.concat(dataframes_list, ignore_index=True)

print(bank_holidays)

                          title       date
0        bank_holidays.new_year 2011-01-03
1     bank_holidays.good_friday 2011-04-22
2   bank_holidays.easter_monday 2011-04-25
3       bank_holidays.early_may 2011-05-02
4          bank_holidays.spring 2011-05-30
5     bank_holidays.late_august 2011-08-29
6      bank_holidays.boxing_day 2011-12-26
7       bank_holidays.christmas 2011-12-27
8        bank_holidays.new_year 2012-01-02
9     bank_holidays.good_friday 2012-04-06
10  bank_holidays.easter_monday 2012-04-09
11      bank_holidays.early_may 2012-05-07
12         bank_holidays.spring 2012-05-28
13    bank_holidays.late_august 2012-08-27
14      bank_holidays.christmas 2012-12-25
15     bank_holidays.boxing_day 2012-12-26
16       bank_holidays.new_year 2013-01-01
17    bank_holidays.good_friday 2013-03-29
18  bank_holidays.easter_monday 2013-04-01
19      bank_holidays.early_may 2013-05-06
20         bank_holidays.spring 2013-05-27
21    bank_holidays.late_august 2013-08-26
22      ban

Converting GMT columns to datetime

In [6]:
#Converting GMT column to datetime

energy_cleaned_d['GMT'] = pd.to_datetime(energy_cleaned_d['GMT'], format='%Y-%m-%d %H:%M:%S')

# Each frame parses its OWN GMT column. Assigning D's timestamps to N would
# overwrite N's time axis and make any later N-vs-D GMT comparison trivially true.
energy_cleaned_n['GMT'] = pd.to_datetime(energy_cleaned_n['GMT'], format='%Y-%m-%d %H:%M:%S')

tariff['GMT'] = pd.to_datetime(tariff['GMT'], format='%Y-%m-%d %H:%M:%S')

Removing all rows from 2012 and the first row of 2013 (which really represents the last reading of 2012), and keeping only the first row of 2014 (which represents the last reading of 2013).

In [7]:
#On households type N

# Row masks by calendar year of the GMT stamp.
gmt_year_n = energy_cleaned_n['GMT'].dt.year
is_2013_n = gmt_year_n == 2013
is_2014_n = gmt_year_n == 2014

# 2013 rows, minus the midnight stamp of 1 January.
energy_2013_n = energy_cleaned_n[is_2013_n & (energy_cleaned_n['GMT'] != '2013-01-01 00:00:00')]

# From 2014, keep only rows carrying the first 2014 timestamp encountered.
first_timestamp_2014_n = energy_cleaned_n[is_2014_n].iloc[0]
energy_2014_n = energy_cleaned_n[is_2014_n & (energy_cleaned_n['GMT'] == first_timestamp_2014_n['GMT'])]

result_n = pd.concat([energy_2013_n, energy_2014_n]).reset_index(drop=True)
energy_cleaned_n = result_n

In [8]:
#On households type D

# Row masks by calendar year of the GMT stamp.
gmt_year_d = energy_cleaned_d['GMT'].dt.year
is_2013_d = gmt_year_d == 2013
is_2014_d = gmt_year_d == 2014

# 2013 rows, minus the midnight stamp of 1 January.
energy_2013_d = energy_cleaned_d[is_2013_d & (energy_cleaned_d['GMT'] != '2013-01-01 00:00:00')]

# From 2014, keep only rows carrying the first 2014 timestamp encountered.
first_timestamp_2014_d = energy_cleaned_d[is_2014_d].iloc[0]
energy_2014_d = energy_cleaned_d[is_2014_d & (energy_cleaned_d['GMT'] == first_timestamp_2014_d['GMT'])]

result_d = pd.concat([energy_2013_d, energy_2014_d]).reset_index(drop=True)
energy_cleaned_d = result_d

In [9]:
# Remove the holiday flag and the calendar helper columns from both frames;
# only GMT and the per-household readings are needed from here on.
energy_cleaned_d = energy_cleaned_d.drop(
    columns=["Holiday", "Date", "Year", "Time", "Month", "Day", "Hour", "DayOfWeek"]
)
energy_cleaned_n = energy_cleaned_n.drop(
    columns=["Holiday", "Date", "Year", "Time", "Month", "Day", "Hour", "DayOfWeek"]
)

In [10]:
#Checking if the GMT column is in datetime format

# Print the GMT dtype of D first, then N.
for energy_frame in (energy_cleaned_d, energy_cleaned_n):
    print(energy_frame['GMT'].dtype)

datetime64[ns]
datetime64[ns]


In [11]:
#Checking that the GMT column is the same in both dataframes
# NOTE(review): this check is only informative if each frame's GMT was parsed
# from its own source column; if one frame's GMT was copied from the other,
# the result is True by construction.

gmt_match = energy_cleaned_n['GMT'].equals(energy_cleaned_d['GMT'])
print(gmt_match)

True


In [12]:
#Getting shapes of dataframes

# Row and column counts for group D.
rows_energy_d = len(energy_cleaned_d.index)
columns_energy_d = len(energy_cleaned_d.columns)
print("Shape of DataFrame 'energy_d':", (rows_energy_d, columns_energy_d))

# Row and column counts for group N.
rows_energy_n = len(energy_cleaned_n.index)
columns_energy_n = len(energy_cleaned_n.columns)
print("Shape of DataFrame 'energy_n':", (rows_energy_n, columns_energy_n))

Shape of DataFrame 'energy_d': (17520, 1026)
Shape of DataFrame 'energy_n': (17520, 4174)


Printing Dataframes

In [13]:
# Display the combined bank-holiday table (title and date per holiday).
bank_holidays

Unnamed: 0,title,date
0,bank_holidays.new_year,2011-01-03
1,bank_holidays.good_friday,2011-04-22
2,bank_holidays.easter_monday,2011-04-25
3,bank_holidays.early_may,2011-05-02
4,bank_holidays.spring,2011-05-30
5,bank_holidays.late_august,2011-08-29
6,bank_holidays.boxing_day,2011-12-26
7,bank_holidays.christmas,2011-12-27
8,bank_holidays.new_year,2012-01-02
9,bank_holidays.good_friday,2012-04-06


In [14]:
# Inspect the filtered half-hourly readings for group D (one column per household).
energy_cleaned_d

Unnamed: 0,GMT,D0000,D0001,D0002,D0003,D0004,D0005,D0006,D0007,D0008,...,D1015,D1016,D1017,D1018,D1019,D1020,D1021,D1022,D1023,D1024
0,2013-01-01 00:30:00,0.404,0.248,0.227,0.078,0.202,0.066,0.161,0.137,0.176,...,0.032,0.005,0.236,0.054,0.196,0.838,0.081,0.166,0.110,0.058
1,2013-01-01 01:00:00,0.185,0.206,0.231,0.076,0.188,0.057,0.227,0.164,0.187,...,0.046,0.005,0.289,0.060,0.218,0.446,0.039,0.145,0.100,0.087
2,2013-01-01 01:30:00,0.151,0.171,0.236,0.077,0.039,0.052,0.138,0.136,0.341,...,0.016,0.005,0.239,0.057,0.181,0.410,0.059,0.133,0.234,0.040
3,2013-01-01 02:00:00,0.139,0.068,0.239,0.077,0.024,0.070,0.040,0.103,0.087,...,0.016,0.005,0.120,0.263,0.174,0.282,0.049,0.049,0.056,0.083
4,2013-01-01 02:30:00,0.105,0.083,0.251,0.077,0.102,0.038,0.020,0.094,0.063,...,0.016,0.005,0.097,0.047,0.188,0.239,0.034,0.039,0.106,0.053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17515,2013-12-31 22:00:00,0.234,0.256,0.145,0.094,0.006,0.248,0.387,0.203,0.222,...,0.020,0.024,0.268,0.047,0.381,0.284,0.074,0.168,0.073,0.100
17516,2013-12-31 22:30:00,0.182,0.117,0.177,0.090,0.006,0.232,0.327,0.174,0.182,...,0.044,0.025,0.351,0.064,0.361,0.662,0.085,0.178,0.077,0.098
17517,2013-12-31 23:00:00,0.153,0.078,0.158,0.055,0.025,0.212,0.164,0.187,0.219,...,0.036,0.201,0.249,0.041,0.267,0.640,0.048,0.163,0.113,0.089
17518,2013-12-31 23:30:00,0.166,0.025,0.162,0.049,0.012,0.241,0.132,0.195,0.196,...,0.037,0.024,0.246,0.069,0.291,0.295,0.055,0.191,0.078,0.093


In [15]:
# Inspect the filtered half-hourly readings for group N (one column per household).
energy_cleaned_n

Unnamed: 0,GMT,N0000,N0001,N0002,N0003,N0004,N0005,N0006,N0007,N0008,...,N4163,N4164,N4165,N4166,N4167,N4168,N4169,N4170,N4171,N4172
0,2013-01-01 00:30:00,0.732,0.115,0.034,0.116,0.091,0.404,0.000,0.086,0.091,...,0.062,0.048,0.043,0.010,0.248,0.102,0.041000,0.102,0.048,0.051
1,2013-01-01 01:00:00,0.250,0.152,0.051,0.092,0.092,0.245,0.000,0.082,0.047,...,0.132,0.014,0.042,0.020,0.263,0.093,0.034000,0.055,0.082,0.092
2,2013-01-01 01:30:00,0.234,0.115,0.050,0.100,0.093,0.114,0.000,0.070,0.043,...,0.172,0.031,0.024,0.095,0.201,0.101,0.036000,0.108,0.094,0.048
3,2013-01-01 02:00:00,0.214,0.114,0.048,0.100,0.108,0.059,0.000,0.085,0.096,...,0.214,0.011,0.042,0.215,0.072,0.114,0.040000,0.053,0.096,0.084
4,2013-01-01 02:30:00,0.235,0.152,0.044,0.100,0.376,0.091,0.000,0.085,0.060,...,0.182,0.041,0.042,0.020,0.118,0.127,0.106000,0.044,0.059,0.058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17515,2013-12-31 22:00:00,0.203,0.495,0.020,0.742,0.146,0.128,0.099,0.309,0.120,...,0.047,0.037,0.098,0.226,0.955,0.083,0.067005,0.488,0.010,0.211
17516,2013-12-31 22:30:00,0.116,0.290,0.045,0.265,0.124,0.158,0.155,0.187,0.097,...,0.022,0.074,0.089,0.018,1.006,0.086,0.064775,0.150,0.010,0.165
17517,2013-12-31 23:00:00,0.122,0.267,0.055,0.213,0.130,0.152,0.106,0.214,0.132,...,0.055,0.012,0.106,0.018,0.398,0.090,0.066131,0.105,0.043,0.133
17518,2013-12-31 23:30:00,0.149,0.335,0.054,0.091,0.156,0.126,0.036,0.261,0.131,...,0.017,0.015,0.112,0.407,0.152,0.093,0.063400,0.106,0.060,0.156


Turning both dataframes into hourly readings to use weather variables and to predict peak load hour in a day

In [16]:
# Setting "GMT" column as the DataFrame index.
# Guarded so the cell can be re-run: once "GMT" is the index it is no longer a
# column, and an unconditional set_index("GMT") would raise KeyError.
if "GMT" in energy_cleaned_n.columns:
    energy_cleaned_n = energy_cleaned_n.set_index("GMT")

# Resampling the DataFrame to hourly readings by adding the half-hourly values.
# closed='right', label='right': each bin is (t - 1h, t] and is stamped with t,
# so the readings at hh:30 and (hh+1):00 are summed under (hh+1):00.
energy_cleaned_n_hourly = energy_cleaned_n.resample('H', closed='right', label='right').sum()

# Resetting the index to get the "GMT" column back and keeping the datetime format
energy_cleaned_n_hourly = energy_cleaned_n_hourly.reset_index()

energy_cleaned_n_hourly

Unnamed: 0,GMT,N0000,N0001,N0002,N0003,N0004,N0005,N0006,N0007,N0008,...,N4163,N4164,N4165,N4166,N4167,N4168,N4169,N4170,N4171,N4172
0,2013-01-01 01:00:00,0.982,0.267,0.085,0.208,0.183,0.649,0.000,0.168,0.138,...,0.194,0.062,0.085,0.030,0.511,0.195,0.075000,0.157,0.130,0.143
1,2013-01-01 02:00:00,0.448,0.229,0.098,0.200,0.201,0.173,0.000,0.155,0.139,...,0.386,0.042,0.066,0.310,0.273,0.215,0.076000,0.161,0.190,0.132
2,2013-01-01 03:00:00,0.417,0.268,0.061,0.233,0.435,0.168,0.000,0.157,0.099,...,0.363,0.097,0.070,0.030,0.213,0.224,0.288000,0.117,0.106,0.135
3,2013-01-01 04:00:00,0.526,0.267,0.033,0.249,0.165,0.169,0.000,0.166,0.130,...,0.341,0.152,0.080,0.021,0.122,0.306,0.088000,0.108,0.095,0.126
4,2013-01-01 05:00:00,0.799,0.230,0.087,0.184,0.147,0.153,0.000,0.152,0.127,...,0.344,0.066,0.065,0.039,0.219,0.185,0.068000,0.132,0.094,0.208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 20:00:00,0.419,0.591,0.181,1.036,0.259,0.292,0.208,1.072,0.201,...,0.078,0.202,0.204,2.815,2.110,0.178,0.136993,0.307,0.119,0.385
8756,2013-12-31 21:00:00,0.493,0.460,0.205,1.023,0.250,0.287,0.257,0.477,0.189,...,0.077,0.178,0.224,2.670,1.660,0.160,0.135263,0.242,0.065,0.403
8757,2013-12-31 22:00:00,0.410,0.787,0.050,1.220,0.332,0.278,0.195,0.525,0.227,...,0.079,0.151,0.217,0.469,1.766,0.182,0.133179,0.642,0.023,0.369
8758,2013-12-31 23:00:00,0.238,0.557,0.100,0.478,0.254,0.310,0.261,0.401,0.229,...,0.077,0.086,0.195,0.036,1.404,0.176,0.130906,0.255,0.053,0.298


In [17]:
# Setting "GMT" column as the DataFrame index.
# Guarded so the cell can be re-run: once "GMT" is the index it is no longer a
# column, and an unconditional set_index("GMT") would raise KeyError.
if "GMT" in energy_cleaned_d.columns:
    energy_cleaned_d = energy_cleaned_d.set_index("GMT")

# Resampling the DataFrame to hourly readings by adding the half-hourly values.
# closed='right', label='right': each bin is (t - 1h, t] and is stamped with t,
# so the readings at hh:30 and (hh+1):00 are summed under (hh+1):00.
energy_cleaned_d_hourly = energy_cleaned_d.resample('H', closed='right', label='right').sum()

# Resetting the index to get the "GMT" column back and keeping the datetime format
energy_cleaned_d_hourly = energy_cleaned_d_hourly.reset_index()

energy_cleaned_d_hourly

Unnamed: 0,GMT,D0000,D0001,D0002,D0003,D0004,D0005,D0006,D0007,D0008,...,D1015,D1016,D1017,D1018,D1019,D1020,D1021,D1022,D1023,D1024
0,2013-01-01 01:00:00,0.589,0.454,0.458,0.154,0.390,0.123,0.388,0.301,0.363,...,0.078,0.010,0.525,0.114,0.414,1.284,0.120,0.311,0.210,0.145
1,2013-01-01 02:00:00,0.290,0.239,0.475,0.154,0.063,0.122,0.178,0.239,0.428,...,0.032,0.010,0.359,0.320,0.355,0.692,0.108,0.182,0.290,0.123
2,2013-01-01 03:00:00,0.227,0.166,0.497,0.154,0.135,0.121,0.055,0.180,0.132,...,0.062,0.010,0.214,0.112,0.382,0.514,0.092,0.058,0.179,0.216
3,2013-01-01 04:00:00,0.201,0.132,0.621,0.155,0.057,0.109,0.052,0.154,0.136,...,0.031,0.057,0.193,0.110,0.338,0.301,0.095,0.064,0.086,0.119
4,2013-01-01 05:00:00,0.191,0.121,0.552,0.154,0.055,0.126,0.052,0.143,0.153,...,0.062,0.051,0.612,0.110,0.375,0.227,0.114,0.068,0.013,0.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 20:00:00,0.669,0.344,0.337,0.187,0.036,0.475,1.221,0.484,0.683,...,0.127,0.063,0.296,0.106,1.419,0.265,0.117,0.277,0.124,1.246
8756,2013-12-31 21:00:00,0.573,0.308,0.317,0.186,0.033,0.482,0.683,0.537,0.641,...,0.142,0.155,0.508,0.104,0.813,0.381,0.165,0.342,0.295,0.353
8757,2013-12-31 22:00:00,0.378,0.483,0.321,0.187,0.032,0.468,0.679,0.730,0.407,...,0.048,0.048,0.519,0.105,0.789,0.520,0.141,0.331,0.138,0.176
8758,2013-12-31 23:00:00,0.335,0.195,0.335,0.145,0.031,0.444,0.491,0.361,0.401,...,0.080,0.226,0.600,0.105,0.628,1.302,0.133,0.341,0.190,0.187


Subtracting 1 hour from GMT so that each timestamp marks the start of the hour its reading covers. After resampling, every row is stamped with the end of its interval: for example, the row stamped 2013-01-01 02:00:00 actually holds the reading for hour 1 (01:00–02:00) of 2013-01-01.

In [18]:
# Move every hourly stamp back one hour, N first and then D.
# NOTE: running this cell a second time shifts the stamps again.
for hourly_frame in (energy_cleaned_n_hourly, energy_cleaned_d_hourly):
    hourly_frame['GMT'] = hourly_frame['GMT'] - pd.Timedelta(hours=1)

In [19]:
# Inspect hourly group D after the one-hour GMT shift.
energy_cleaned_d_hourly

Unnamed: 0,GMT,D0000,D0001,D0002,D0003,D0004,D0005,D0006,D0007,D0008,...,D1015,D1016,D1017,D1018,D1019,D1020,D1021,D1022,D1023,D1024
0,2013-01-01 00:00:00,0.589,0.454,0.458,0.154,0.390,0.123,0.388,0.301,0.363,...,0.078,0.010,0.525,0.114,0.414,1.284,0.120,0.311,0.210,0.145
1,2013-01-01 01:00:00,0.290,0.239,0.475,0.154,0.063,0.122,0.178,0.239,0.428,...,0.032,0.010,0.359,0.320,0.355,0.692,0.108,0.182,0.290,0.123
2,2013-01-01 02:00:00,0.227,0.166,0.497,0.154,0.135,0.121,0.055,0.180,0.132,...,0.062,0.010,0.214,0.112,0.382,0.514,0.092,0.058,0.179,0.216
3,2013-01-01 03:00:00,0.201,0.132,0.621,0.155,0.057,0.109,0.052,0.154,0.136,...,0.031,0.057,0.193,0.110,0.338,0.301,0.095,0.064,0.086,0.119
4,2013-01-01 04:00:00,0.191,0.121,0.552,0.154,0.055,0.126,0.052,0.143,0.153,...,0.062,0.051,0.612,0.110,0.375,0.227,0.114,0.068,0.013,0.124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,0.669,0.344,0.337,0.187,0.036,0.475,1.221,0.484,0.683,...,0.127,0.063,0.296,0.106,1.419,0.265,0.117,0.277,0.124,1.246
8756,2013-12-31 20:00:00,0.573,0.308,0.317,0.186,0.033,0.482,0.683,0.537,0.641,...,0.142,0.155,0.508,0.104,0.813,0.381,0.165,0.342,0.295,0.353
8757,2013-12-31 21:00:00,0.378,0.483,0.321,0.187,0.032,0.468,0.679,0.730,0.407,...,0.048,0.048,0.519,0.105,0.789,0.520,0.141,0.331,0.138,0.176
8758,2013-12-31 22:00:00,0.335,0.195,0.335,0.145,0.031,0.444,0.491,0.361,0.401,...,0.080,0.226,0.600,0.105,0.628,1.302,0.133,0.341,0.190,0.187


In [20]:
# Inspect hourly group N after the one-hour GMT shift.
energy_cleaned_n_hourly

Unnamed: 0,GMT,N0000,N0001,N0002,N0003,N0004,N0005,N0006,N0007,N0008,...,N4163,N4164,N4165,N4166,N4167,N4168,N4169,N4170,N4171,N4172
0,2013-01-01 00:00:00,0.982,0.267,0.085,0.208,0.183,0.649,0.000,0.168,0.138,...,0.194,0.062,0.085,0.030,0.511,0.195,0.075000,0.157,0.130,0.143
1,2013-01-01 01:00:00,0.448,0.229,0.098,0.200,0.201,0.173,0.000,0.155,0.139,...,0.386,0.042,0.066,0.310,0.273,0.215,0.076000,0.161,0.190,0.132
2,2013-01-01 02:00:00,0.417,0.268,0.061,0.233,0.435,0.168,0.000,0.157,0.099,...,0.363,0.097,0.070,0.030,0.213,0.224,0.288000,0.117,0.106,0.135
3,2013-01-01 03:00:00,0.526,0.267,0.033,0.249,0.165,0.169,0.000,0.166,0.130,...,0.341,0.152,0.080,0.021,0.122,0.306,0.088000,0.108,0.095,0.126
4,2013-01-01 04:00:00,0.799,0.230,0.087,0.184,0.147,0.153,0.000,0.152,0.127,...,0.344,0.066,0.065,0.039,0.219,0.185,0.068000,0.132,0.094,0.208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,0.419,0.591,0.181,1.036,0.259,0.292,0.208,1.072,0.201,...,0.078,0.202,0.204,2.815,2.110,0.178,0.136993,0.307,0.119,0.385
8756,2013-12-31 20:00:00,0.493,0.460,0.205,1.023,0.250,0.287,0.257,0.477,0.189,...,0.077,0.178,0.224,2.670,1.660,0.160,0.135263,0.242,0.065,0.403
8757,2013-12-31 21:00:00,0.410,0.787,0.050,1.220,0.332,0.278,0.195,0.525,0.227,...,0.079,0.151,0.217,0.469,1.766,0.182,0.133179,0.642,0.023,0.369
8758,2013-12-31 22:00:00,0.238,0.557,0.100,0.478,0.254,0.310,0.261,0.401,0.229,...,0.077,0.086,0.195,0.036,1.404,0.176,0.130906,0.255,0.053,0.298


Summing all households in one column for both dfs

In [21]:
#Summing the consumption of all households in group N

# Every column after the leading "GMT" column is summed row-wise.
household_columns_n = energy_cleaned_n_hourly.columns[1:]
total_consumption_n = energy_cleaned_n_hourly[household_columns_n].sum(axis=1)

# Keep only the timestamp and the group total.
energy_cleaned_n_hourly = pd.DataFrame({
    "GMT": energy_cleaned_n_hourly["GMT"],
    "Total Household Consumption (N)": total_consumption_n,
})

print(energy_cleaned_n_hourly)

                     GMT  Total Household Consumption (N)
0    2013-01-01 00:00:00                      1657.483601
1    2013-01-01 01:00:00                      1422.045237
2    2013-01-01 02:00:00                      1200.551314
3    2013-01-01 03:00:00                      1061.568405
4    2013-01-01 04:00:00                      1000.474013
...                  ...                              ...
8755 2013-12-31 19:00:00                      2979.858073
8756 2013-12-31 20:00:00                      2700.407931
8757 2013-12-31 21:00:00                      2447.194261
8758 2013-12-31 22:00:00                      2232.695632
8759 2013-12-31 23:00:00                      2001.119964

[8760 rows x 2 columns]


In [22]:
#Summing the consumption of all households in group D

# Every column after the leading "GMT" column is summed row-wise.
household_columns_d = energy_cleaned_d_hourly.columns[1:]
total_consumption_d = energy_cleaned_d_hourly[household_columns_d].sum(axis=1)

# Keep only the timestamp and the group total.
energy_cleaned_d_hourly = pd.DataFrame({
    "GMT": energy_cleaned_d_hourly["GMT"],
    "Total Household Consumption (D)": total_consumption_d,
})

print(energy_cleaned_d_hourly)

                     GMT  Total Household Consumption (D)
0    2013-01-01 00:00:00                       393.709000
1    2013-01-01 01:00:00                       327.556000
2    2013-01-01 02:00:00                       286.207000
3    2013-01-01 03:00:00                       250.964000
4    2013-01-01 04:00:00                       233.412000
...                  ...                              ...
8755 2013-12-31 19:00:00                       743.251693
8756 2013-12-31 20:00:00                       645.912531
8757 2013-12-31 21:00:00                       586.851605
8758 2013-12-31 22:00:00                       508.774572
8759 2013-12-31 23:00:00                       453.855276

[8760 rows x 2 columns]


Converting Tariff dataframe to Hourly

In [23]:
#Removing all rows from 2012, first row of 2013 which really represents the last reading of 2012, 
#and keeping only the first row of 2014 which represents the last reading of 2013.

# Row masks by calendar year of the GMT stamp.
tariff_year = tariff['GMT'].dt.year
tariff_is_2013 = tariff_year == 2013
tariff_is_2014 = tariff_year == 2014

# 2013 rows, minus the midnight stamp of 1 January.
tariff_2013 = tariff[tariff_is_2013 & (tariff['GMT'] != '2013-01-01 00:00:00')]

# From 2014, keep only rows carrying the first 2014 timestamp encountered.
first_timestamp_2014_tariff = tariff[tariff_is_2014].iloc[0]
tariff_2014 = tariff[tariff_is_2014 & (tariff['GMT'] == first_timestamp_2014_tariff['GMT'])]

result_tariff = pd.concat([tariff_2013, tariff_2014]).reset_index(drop=True)
tariff = result_tariff

In [24]:
# Confirm the tariff GMT column is still datetime after filtering.
print(tariff['GMT'].dtype)

datetime64[ns]


In [25]:
# Row and column counts of the filtered tariff table.
rows_tariff = len(tariff.index)
columns_tariff = len(tariff.columns)
print("Shape of DataFrame 'tariff':", (rows_tariff, columns_tariff))

Shape of DataFrame 'tariff': (17520, 3)


In [26]:
# Setting "GMT" column as the DataFrame index.
# Guarded so the cell can be re-run: once "GMT" is the index it is no longer a
# column, and an unconditional set_index("GMT") would raise KeyError.
if "GMT" in tariff.columns:
    tariff = tariff.set_index("GMT")

# Resampling the DataFrame to hourly readings by AVERAGING the two half-hourly
# prices (a price is a rate, so it is not summed like consumption).
# closed='right', label='right': each bin is (t - 1h, t] and is stamped with t.
# numeric_only=True leaves non-numeric columns out of the mean.
tariff_hourly = tariff.resample('H', closed='right', label='right').mean(numeric_only=True)
tariff_hourly = tariff_hourly.reset_index()
tariff_hourly

Unnamed: 0,GMT,Price
0,2013-01-01 01:00:00,0.1176
1,2013-01-01 02:00:00,0.1176
2,2013-01-01 03:00:00,0.1176
3,2013-01-01 04:00:00,0.1176
4,2013-01-01 05:00:00,0.1176
...,...,...
8755,2013-12-31 20:00:00,0.1176
8756,2013-12-31 21:00:00,0.1176
8757,2013-12-31 22:00:00,0.1176
8758,2013-12-31 23:00:00,0.1176


In [27]:
#Checking the same unique prices remain in dataframe for low, default and high tariff
# The hourly price is the mean of two half-hourly prices, so an hour that
# straddled a tariff change would show up here as an extra, in-between value.

unique_prices = tariff_hourly["Price"].unique()
print(unique_prices)

[0.1176 0.0399 0.672 ]


In [28]:
# Move every hourly tariff stamp back one hour.
# NOTE: running this cell a second time shifts the stamps again.
tariff_hourly['GMT'] = tariff_hourly['GMT'].sub(pd.Timedelta(hours=1))

In [29]:
# Inspect the hourly tariff table after the one-hour GMT shift.
tariff_hourly

Unnamed: 0,GMT,Price
0,2013-01-01 00:00:00,0.1176
1,2013-01-01 01:00:00,0.1176
2,2013-01-01 02:00:00,0.1176
3,2013-01-01 03:00:00,0.1176
4,2013-01-01 04:00:00,0.1176
...,...,...
8755,2013-12-31 19:00:00,0.1176
8756,2013-12-31 20:00:00,0.1176
8757,2013-12-31 21:00:00,0.1176
8758,2013-12-31 22:00:00,0.1176


In [30]:
# Re-inspect the hourly group D totals next to the tariff table shown above.
energy_cleaned_d_hourly

Unnamed: 0,GMT,Total Household Consumption (D)
0,2013-01-01 00:00:00,393.709000
1,2013-01-01 01:00:00,327.556000
2,2013-01-01 02:00:00,286.207000
3,2013-01-01 03:00:00,250.964000
4,2013-01-01 04:00:00,233.412000
...,...,...
8755,2013-12-31 19:00:00,743.251693
8756,2013-12-31 20:00:00,645.912531
8757,2013-12-31 21:00:00,586.851605
8758,2013-12-31 22:00:00,508.774572


Merging Holiday Dataframe

In [31]:
#Creating a copy of the dataframes
energy_cleaned_d_hourly = energy_cleaned_d_hourly.copy()
energy_cleaned_n_hourly = energy_cleaned_n_hourly.copy()

# Split GMT into a calendar date and a time of day, D first and then N.
for hourly_frame in (energy_cleaned_d_hourly, energy_cleaned_n_hourly):
    gmt_parts = hourly_frame['GMT'].dt
    hourly_frame.loc[:, 'Date'] = gmt_parts.date
    hourly_frame.loc[:, 'Time'] = gmt_parts.time

In [32]:
#Creating additional columns for year, month, day, hour and day of week

def add_additional_columns(df):
    """Add calendar-part columns derived from the ``GMT`` column.

    Writes ``Year``, ``Month``, ``Day``, ``Hour`` and ``DayOfWeek`` onto
    ``df`` itself (the frame is modified in place) and returns that same
    frame. ``DayOfWeek`` comes from pandas' ``dt.dayofweek`` (Monday=0).
    """
    gmt_parts = df['GMT'].dt
    # (new column name, datetime accessor attribute), in insertion order.
    column_sources = (
        ('Year', 'year'),
        ('Month', 'month'),
        ('Day', 'day'),
        ('Hour', 'hour'),
        ('DayOfWeek', 'dayofweek'),
    )
    for column_name, accessor_attribute in column_sources:
        df[column_name] = getattr(gmt_parts, accessor_attribute)

    return df

# The helper writes the new columns onto the frame it receives and returns that
# same frame, so the reassignment keeps each name bound to the same object.
energy_cleaned_d_hourly = add_additional_columns(energy_cleaned_d_hourly)
energy_cleaned_n_hourly = add_additional_columns(energy_cleaned_n_hourly)

In [33]:
#Converting the "Date" column to datetime

# Convert N first, then D, so "Date" can be matched against the bank-holiday dates.
for hourly_frame in (energy_cleaned_n_hourly, energy_cleaned_d_hourly):
    hourly_frame["Date"] = pd.to_datetime(hourly_frame["Date"])

In [34]:
#Checking the date columns are in datetime format across all dataframes
# (bank_holidays["date"] and the "Date" column of both consumption frames;
# these are the columns compared when flagging holidays).

datetypeholidays= bank_holidays["date"].dtype
print("Data type of bank holidays':", datetypeholidays)

datetypeN= energy_cleaned_n_hourly["Date"].dtype
print("Data type of N:", datetypeN)

datetypeD= energy_cleaned_d_hourly["Date"].dtype
print("Data type of D:", datetypeD)

Data type of bank holidays': datetime64[ns]
Data type of N: datetime64[ns]
Data type of D: datetime64[ns]


In [35]:
#Checking that all dates from bank holidays are available in energy consumption dataframes
# Only the group D frame is tested here.

is_2013_holiday = bank_holidays["date"].dt.year == 2013
bank_holidays_2013 = bank_holidays[is_2013_holiday]

holiday_dates_present = bank_holidays_2013["date"].isin(energy_cleaned_d_hourly["Date"])
all_dates_available = holiday_dates_present.all()

if not all_dates_available:
    print("Some dates from bank_holidays for the year 2013 are not available in energy_cleaned_d_hourly.")
else:
    print("All dates from bank_holidays for the year 2013 are available in energy_cleaned_d_hourly.")

All dates from bank_holidays for the year 2013 are available in energy_cleaned_d_hourly.


In [36]:
#Creating a column to indicate whether a day is a bank holiday or not

# Every hourly row of a bank-holiday date is flagged True (N first, then D).
for hourly_frame in (energy_cleaned_n_hourly, energy_cleaned_d_hourly):
    hourly_frame["IsHoliday"] = hourly_frame["Date"].isin(bank_holidays["date"])

In [37]:
# Inspect hourly group N with the calendar columns and the IsHoliday flag.
energy_cleaned_n_hourly

Unnamed: 0,GMT,Total Household Consumption (N),Date,Time,Year,Month,Day,Hour,DayOfWeek,IsHoliday
0,2013-01-01 00:00:00,1657.483601,2013-01-01,00:00:00,2013,1,1,0,1,True
1,2013-01-01 01:00:00,1422.045237,2013-01-01,01:00:00,2013,1,1,1,1,True
2,2013-01-01 02:00:00,1200.551314,2013-01-01,02:00:00,2013,1,1,2,1,True
3,2013-01-01 03:00:00,1061.568405,2013-01-01,03:00:00,2013,1,1,3,1,True
4,2013-01-01 04:00:00,1000.474013,2013-01-01,04:00:00,2013,1,1,4,1,True
...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,2979.858073,2013-12-31,19:00:00,2013,12,31,19,1,False
8756,2013-12-31 20:00:00,2700.407931,2013-12-31,20:00:00,2013,12,31,20,1,False
8757,2013-12-31 21:00:00,2447.194261,2013-12-31,21:00:00,2013,12,31,21,1,False
8758,2013-12-31 22:00:00,2232.695632,2013-12-31,22:00:00,2013,12,31,22,1,False


In [38]:
# Inspect hourly group D with the calendar columns and the IsHoliday flag.
energy_cleaned_d_hourly

Unnamed: 0,GMT,Total Household Consumption (D),Date,Time,Year,Month,Day,Hour,DayOfWeek,IsHoliday
0,2013-01-01 00:00:00,393.709000,2013-01-01,00:00:00,2013,1,1,0,1,True
1,2013-01-01 01:00:00,327.556000,2013-01-01,01:00:00,2013,1,1,1,1,True
2,2013-01-01 02:00:00,286.207000,2013-01-01,02:00:00,2013,1,1,2,1,True
3,2013-01-01 03:00:00,250.964000,2013-01-01,03:00:00,2013,1,1,3,1,True
4,2013-01-01 04:00:00,233.412000,2013-01-01,04:00:00,2013,1,1,4,1,True
...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,743.251693,2013-12-31,19:00:00,2013,12,31,19,1,False
8756,2013-12-31 20:00:00,645.912531,2013-12-31,20:00:00,2013,12,31,20,1,False
8757,2013-12-31 21:00:00,586.851605,2013-12-31,21:00:00,2013,12,31,21,1,False
8758,2013-12-31 22:00:00,508.774572,2013-12-31,22:00:00,2013,12,31,22,1,False


Merging Weather with Consumption Datasets

In [39]:
# Inspect the weather table as loaded from CSV.
weather

Unnamed: 0,Datetime,Hourly Global Radiation (KJ/m2),Hourly Maximum Gust (kn),Hourly Mean Wind Direction (o),Hourly Mean Windspeed (kn),Hourly Pressure at Mean Sea Level (hPa),Hourly Rainfall Total (mm),Hourly Relative Humidity (%),Hourly Snow Depth (cm),Hourly Temperature (C),Hourly Total Cloud Cover (oktas)\n,Hourly Visibility (dm)
0,2013-01-01 00:00:00,0.0,13.2,232.0,7.8,999.400,0.00,92.085714,0.0,8.914286,8.000000,2133.333333
1,2013-01-01 01:00:00,0.0,13.6,236.0,8.0,999.675,0.00,91.042857,0.0,8.785714,8.000000,1766.666667
2,2013-01-01 02:00:00,0.0,13.2,246.0,7.6,999.975,0.20,92.685714,0.0,8.514286,8.000000,1333.333333
3,2013-01-01 03:00:00,0.0,18.6,286.0,9.0,1000.725,0.96,89.871429,0.0,7.285714,7.666667,4100.000000
4,2013-01-01 04:00:00,0.0,15.4,278.0,7.4,1001.300,0.08,88.585714,0.0,6.700000,4.000000,5500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,0.0,12.8,192.0,7.8,1002.900,0.00,85.385714,0.0,5.942857,6.333333,3666.666667
8756,2013-12-31 20:00:00,0.0,13.6,196.0,7.4,1003.175,0.00,84.442857,0.0,5.442857,1.333333,4000.000000
8757,2013-12-31 21:00:00,0.0,13.6,186.0,7.6,1003.450,0.00,83.257143,0.0,5.414286,1.000000,3833.333333
8758,2013-12-31 22:00:00,0.0,15.2,182.0,8.4,1002.925,0.00,88.628571,0.0,5.971429,2.333333,2266.666667


In [40]:
# Column dtypes of the weather table.
# NOTE(review): the recorded output shows the cloud-cover column name ending in
# a literal "\n" ('Hourly Total Cloud Cover (oktas)\n'); consider stripping
# column names where the weather file is produced.
print(weather.dtypes)

Datetime                                    object
Hourly Global Radiation (KJ/m2)            float64
Hourly Maximum Gust (kn)                   float64
Hourly Mean Wind Direction (o)             float64
Hourly Mean Windspeed (kn)                 float64
Hourly Pressure at Mean Sea Level (hPa)    float64
Hourly Rainfall Total (mm)                 float64
Hourly Relative Humidity (%)               float64
Hourly Snow Depth (cm)                     float64
Hourly Temperature (C)                     float64
Hourly Total Cloud Cover (oktas)\n         float64
Hourly Visibility (dm)                     float64
dtype: object


In [41]:
#Re-checking there are no missing values in the weather dataframe

# True if at least one cell anywhere in the table is null.
null_mask = weather.isnull()
is_missing_weather = null_mask.any(axis=0).any()
print("Missing values: ", is_missing_weather)

Missing values:  False


In [42]:
#Rename 'Datetime' column to 'GMT' and set it as the index for merging purposes

# After this cell, 'GMT' is the index of weather and no longer a column.
weather = weather.rename(columns={'Datetime': 'GMT'}).set_index('GMT')

In [43]:
# Convert the index to datetimes so it can be matched against the datetime GMT
# column of the consumption frames when merging.
weather.index = pd.to_datetime(weather.index)

In [44]:
# Inspect the weather table, now indexed by GMT.
weather

Unnamed: 0_level_0,Hourly Global Radiation (KJ/m2),Hourly Maximum Gust (kn),Hourly Mean Wind Direction (o),Hourly Mean Windspeed (kn),Hourly Pressure at Mean Sea Level (hPa),Hourly Rainfall Total (mm),Hourly Relative Humidity (%),Hourly Snow Depth (cm),Hourly Temperature (C),Hourly Total Cloud Cover (oktas)\n,Hourly Visibility (dm)
GMT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-01 00:00:00,0.0,13.2,232.0,7.8,999.400,0.00,92.085714,0.0,8.914286,8.000000,2133.333333
2013-01-01 01:00:00,0.0,13.6,236.0,8.0,999.675,0.00,91.042857,0.0,8.785714,8.000000,1766.666667
2013-01-01 02:00:00,0.0,13.2,246.0,7.6,999.975,0.20,92.685714,0.0,8.514286,8.000000,1333.333333
2013-01-01 03:00:00,0.0,18.6,286.0,9.0,1000.725,0.96,89.871429,0.0,7.285714,7.666667,4100.000000
2013-01-01 04:00:00,0.0,15.4,278.0,7.4,1001.300,0.08,88.585714,0.0,6.700000,4.000000,5500.000000
...,...,...,...,...,...,...,...,...,...,...,...
2013-12-31 19:00:00,0.0,12.8,192.0,7.8,1002.900,0.00,85.385714,0.0,5.942857,6.333333,3666.666667
2013-12-31 20:00:00,0.0,13.6,196.0,7.4,1003.175,0.00,84.442857,0.0,5.442857,1.333333,4000.000000
2013-12-31 21:00:00,0.0,13.6,186.0,7.6,1003.450,0.00,83.257143,0.0,5.414286,1.000000,3833.333333
2013-12-31 22:00:00,0.0,15.2,182.0,8.4,1002.925,0.00,88.628571,0.0,5.971429,2.333333,2266.666667


Final Merged Dataframes for Both Household Types

In [45]:
#Households type N

# Inner join: the frame's GMT column against the GMT index of weather.
merged_n = energy_cleaned_n_hourly.merge(
    weather,
    how='inner',
    left_on='GMT',
    right_index=True,
)

In [46]:
#Households type D

# Inner join: the frame's GMT column against the GMT index of weather.
merged_d = energy_cleaned_d_hourly.merge(
    weather,
    how='inner',
    left_on='GMT',
    right_index=True,
)

In [47]:
# Re-assert datetime dtype on the merge keys.
# merged_n is converted from its OWN GMT column; reading merged_d['GMT'] here
# would replace N's time axis with D's.
merged_n['GMT'] = pd.to_datetime(merged_n['GMT'], format='%Y-%m-%d %H:%M:%S')
tariff_hourly['GMT'] = pd.to_datetime(tariff_hourly['GMT'], format='%Y-%m-%d %H:%M:%S')

In [48]:
#Merging tariff dataframe to households type D

# Inner join on the shared GMT column; adds the hourly "Price" to each row.
merged_d = merged_d.merge(tariff_hourly, how='inner', on='GMT')

In [49]:
# Inspect group D with the weather columns and the tariff Price attached.
merged_d

Unnamed: 0,GMT,Total Household Consumption (D),Date,Time,Year,Month,Day,Hour,DayOfWeek,IsHoliday,...,Hourly Mean Wind Direction (o),Hourly Mean Windspeed (kn),Hourly Pressure at Mean Sea Level (hPa),Hourly Rainfall Total (mm),Hourly Relative Humidity (%),Hourly Snow Depth (cm),Hourly Temperature (C),Hourly Total Cloud Cover (oktas)\n,Hourly Visibility (dm),Price
0,2013-01-01 00:00:00,393.709000,2013-01-01,00:00:00,2013,1,1,0,1,True,...,232.0,7.8,999.400,0.00,92.085714,0.0,8.914286,8.000000,2133.333333,0.1176
1,2013-01-01 01:00:00,327.556000,2013-01-01,01:00:00,2013,1,1,1,1,True,...,236.0,8.0,999.675,0.00,91.042857,0.0,8.785714,8.000000,1766.666667,0.1176
2,2013-01-01 02:00:00,286.207000,2013-01-01,02:00:00,2013,1,1,2,1,True,...,246.0,7.6,999.975,0.20,92.685714,0.0,8.514286,8.000000,1333.333333,0.1176
3,2013-01-01 03:00:00,250.964000,2013-01-01,03:00:00,2013,1,1,3,1,True,...,286.0,9.0,1000.725,0.96,89.871429,0.0,7.285714,7.666667,4100.000000,0.1176
4,2013-01-01 04:00:00,233.412000,2013-01-01,04:00:00,2013,1,1,4,1,True,...,278.0,7.4,1001.300,0.08,88.585714,0.0,6.700000,4.000000,5500.000000,0.1176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,743.251693,2013-12-31,19:00:00,2013,12,31,19,1,False,...,192.0,7.8,1002.900,0.00,85.385714,0.0,5.942857,6.333333,3666.666667,0.1176
8756,2013-12-31 20:00:00,645.912531,2013-12-31,20:00:00,2013,12,31,20,1,False,...,196.0,7.4,1003.175,0.00,84.442857,0.0,5.442857,1.333333,4000.000000,0.1176
8757,2013-12-31 21:00:00,586.851605,2013-12-31,21:00:00,2013,12,31,21,1,False,...,186.0,7.6,1003.450,0.00,83.257143,0.0,5.414286,1.000000,3833.333333,0.1176
8758,2013-12-31 22:00:00,508.774572,2013-12-31,22:00:00,2013,12,31,22,1,False,...,182.0,8.4,1002.925,0.00,88.628571,0.0,5.971429,2.333333,2266.666667,0.1176


In [50]:
# Inspect group N with the weather columns attached.
merged_n

Unnamed: 0,GMT,Total Household Consumption (N),Date,Time,Year,Month,Day,Hour,DayOfWeek,IsHoliday,...,Hourly Maximum Gust (kn),Hourly Mean Wind Direction (o),Hourly Mean Windspeed (kn),Hourly Pressure at Mean Sea Level (hPa),Hourly Rainfall Total (mm),Hourly Relative Humidity (%),Hourly Snow Depth (cm),Hourly Temperature (C),Hourly Total Cloud Cover (oktas)\n,Hourly Visibility (dm)
0,2013-01-01 00:00:00,1657.483601,2013-01-01,00:00:00,2013,1,1,0,1,True,...,13.2,232.0,7.8,999.400,0.00,92.085714,0.0,8.914286,8.000000,2133.333333
1,2013-01-01 01:00:00,1422.045237,2013-01-01,01:00:00,2013,1,1,1,1,True,...,13.6,236.0,8.0,999.675,0.00,91.042857,0.0,8.785714,8.000000,1766.666667
2,2013-01-01 02:00:00,1200.551314,2013-01-01,02:00:00,2013,1,1,2,1,True,...,13.2,246.0,7.6,999.975,0.20,92.685714,0.0,8.514286,8.000000,1333.333333
3,2013-01-01 03:00:00,1061.568405,2013-01-01,03:00:00,2013,1,1,3,1,True,...,18.6,286.0,9.0,1000.725,0.96,89.871429,0.0,7.285714,7.666667,4100.000000
4,2013-01-01 04:00:00,1000.474013,2013-01-01,04:00:00,2013,1,1,4,1,True,...,15.4,278.0,7.4,1001.300,0.08,88.585714,0.0,6.700000,4.000000,5500.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,2979.858073,2013-12-31,19:00:00,2013,12,31,19,1,False,...,12.8,192.0,7.8,1002.900,0.00,85.385714,0.0,5.942857,6.333333,3666.666667
8756,2013-12-31 20:00:00,2700.407931,2013-12-31,20:00:00,2013,12,31,20,1,False,...,13.6,196.0,7.4,1003.175,0.00,84.442857,0.0,5.442857,1.333333,4000.000000
8757,2013-12-31 21:00:00,2447.194261,2013-12-31,21:00:00,2013,12,31,21,1,False,...,13.6,186.0,7.6,1003.450,0.00,83.257143,0.0,5.414286,1.000000,3833.333333
8758,2013-12-31 22:00:00,2232.695632,2013-12-31,22:00:00,2013,12,31,22,1,False,...,15.2,182.0,8.4,1002.925,0.00,88.628571,0.0,5.971429,2.333333,2266.666667


One-hot Encoding for Tariff Types

In [51]:
#One-hot encoding the tariff prices
#
#Adds three 0/1 columns (Low, Default, High) to merged_d and then removes Price.
#Each row is flagged from the Price of the PREVIOUS row (a one-row lag); the
#first row has no previous row and is therefore flagged as Default.
#Rows whose previous Price matches none of the three levels keep all flags at 0.
#NOTE(review): the one-row lag is carried over from the original loop - confirm it is intended.

#Price level that identifies each tariff band (insertion order = column order)
tariff_band_prices = {'Low': 0.0399, 'Default': 0.1176, 'High': 0.672}

#Reading Price first means a re-run after Price was dropped fails here,
#before any existing flag column is overwritten.
#shift(1) works by position, so it does not depend on the index being 0..n-1;
#fill_value puts the first row into the Default band.
previous_price = merged_d['Price'].shift(1, fill_value=tariff_band_prices['Default'])

for band, band_price in tariff_band_prices.items():
    #np.isclose tolerates float round-off that an exact == comparison would miss
    merged_d[band] = np.isclose(previous_price, band_price).astype('int64')

#Price is now represented by the three flag columns
merged_d = merged_d.drop(columns=['Price'])

In [52]:
#Displaying the merged D dataframe to check the Low, Default and High columns
merged_d

Unnamed: 0,GMT,Total Household Consumption (D),Date,Time,Year,Month,Day,Hour,DayOfWeek,IsHoliday,...,Hourly Pressure at Mean Sea Level (hPa),Hourly Rainfall Total (mm),Hourly Relative Humidity (%),Hourly Snow Depth (cm),Hourly Temperature (C),Hourly Total Cloud Cover (oktas)\n,Hourly Visibility (dm),Low,Default,High
0,2013-01-01 00:00:00,393.709000,2013-01-01,00:00:00,2013,1,1,0,1,True,...,999.400,0.00,92.085714,0.0,8.914286,8.000000,2133.333333,0,1,0
1,2013-01-01 01:00:00,327.556000,2013-01-01,01:00:00,2013,1,1,1,1,True,...,999.675,0.00,91.042857,0.0,8.785714,8.000000,1766.666667,0,1,0
2,2013-01-01 02:00:00,286.207000,2013-01-01,02:00:00,2013,1,1,2,1,True,...,999.975,0.20,92.685714,0.0,8.514286,8.000000,1333.333333,0,1,0
3,2013-01-01 03:00:00,250.964000,2013-01-01,03:00:00,2013,1,1,3,1,True,...,1000.725,0.96,89.871429,0.0,7.285714,7.666667,4100.000000,0,1,0
4,2013-01-01 04:00:00,233.412000,2013-01-01,04:00:00,2013,1,1,4,1,True,...,1001.300,0.08,88.585714,0.0,6.700000,4.000000,5500.000000,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8755,2013-12-31 19:00:00,743.251693,2013-12-31,19:00:00,2013,12,31,19,1,False,...,1002.900,0.00,85.385714,0.0,5.942857,6.333333,3666.666667,0,1,0
8756,2013-12-31 20:00:00,645.912531,2013-12-31,20:00:00,2013,12,31,20,1,False,...,1003.175,0.00,84.442857,0.0,5.442857,1.333333,4000.000000,0,1,0
8757,2013-12-31 21:00:00,586.851605,2013-12-31,21:00:00,2013,12,31,21,1,False,...,1003.450,0.00,83.257143,0.0,5.414286,1.000000,3833.333333,0,1,0
8758,2013-12-31 22:00:00,508.774572,2013-12-31,22:00:00,2013,12,31,22,1,False,...,1002.925,0.00,88.628571,0.0,5.971429,2.333333,2266.666667,0,1,0


Saving merged files to CSV

In [53]:
#Writing the merged D dataframe to merged_d.csv (row index is not written)

merged_d.to_csv(path_or_buf='merged_d.csv', index=False)

In [54]:
#Writing the merged N dataframe to merged_n.csv (row index is not written)

merged_n.to_csv(path_or_buf='merged_n.csv', index=False)