Importing Packages and Libraries

In [1]:
#Importing libraries and packages

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import json
from datetime import timedelta

Loading UK Power Network Datasets

In [2]:
#Loading UK Power Network Datasets

energy_d = pd.read_csv(r'/Users/rosaliapujols/Desktop/Dissertation/Code/Dataset-UKDA-7857-csv/csv/data_collection/data_tables/consumption_d.csv')
energyn = pd.read_csv(r'/Users/rosaliapujols/Desktop/Dissertation/Code/Dataset-UKDA-7857-csv/csv/data_collection/data_tables/consumption_n.csv')

Loading Bank Holidays Dataset

In [3]:
#Loading Bank Holidays Dataset

file_paths = [
    '/Users/rosaliapujols/Desktop/Dissertation/Code/bank_holidays/bank_holidays_2011_england-and-wales.json',
    '/Users/rosaliapujols/Desktop/Dissertation/Code/bank_holidays/bank_holidays_2012_england-and-wales.json',
    '/Users/rosaliapujols/Desktop/Dissertation/Code/bank_holidays/bank_holidays_2013_england-and-wales.json',
    '/Users/rosaliapujols/Desktop/Dissertation/Code/bank_holidays/bank_holidays_2014_england-and-wales.json'
]

dataframes_list = []

for file_path in file_paths:
    with open(file_path, 'r') as json_file:
        json_data = json.load(json_file)
        df = pd.DataFrame(json_data)
        df = df.drop(['notes', 'bunting'], axis=1)
        df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

        dataframes_list.append(df)

bank_holidays = pd.concat(dataframes_list, ignore_index=True)

print(bank_holidays)

                          title       date
0        bank_holidays.new_year 2011-01-03
1     bank_holidays.good_friday 2011-04-22
2   bank_holidays.easter_monday 2011-04-25
3       bank_holidays.early_may 2011-05-02
4          bank_holidays.spring 2011-05-30
5     bank_holidays.late_august 2011-08-29
6      bank_holidays.boxing_day 2011-12-26
7       bank_holidays.christmas 2011-12-27
8        bank_holidays.new_year 2012-01-02
9     bank_holidays.good_friday 2012-04-06
10  bank_holidays.easter_monday 2012-04-09
11      bank_holidays.early_may 2012-05-07
12         bank_holidays.spring 2012-05-28
13    bank_holidays.late_august 2012-08-27
14      bank_holidays.christmas 2012-12-25
15     bank_holidays.boxing_day 2012-12-26
16       bank_holidays.new_year 2013-01-01
17    bank_holidays.good_friday 2013-03-29
18  bank_holidays.easter_monday 2013-04-01
19      bank_holidays.early_may 2013-05-06
20         bank_holidays.spring 2013-05-27
21    bank_holidays.late_august 2013-08-26
22      ban

Convert GMT to Datetime

In [4]:
##Checking GMT time format

print(energy_d['GMT'].dtype)

object


In [5]:
#Convert GMT to Datetime

energy_d['GMT'] = pd.to_datetime(energy_d['GMT'], format='%Y-%m-%d %H:%M:%S')
print(energy_d.dtypes)

GMT      datetime64[ns]
D0000           float64
D0001           float64
D0002           float64
D0003           float64
              ...      
D1020           float64
D1021           float64
D1022           float64
D1023           float64
D1024           float64
Length: 1026, dtype: object


In [6]:
energy_d

Unnamed: 0,GMT,D0000,D0001,D0002,D0003,D0004,D0005,D0006,D0007,D0008,...,D1015,D1016,D1017,D1018,D1019,D1020,D1021,D1022,D1023,D1024
0,2011-11-23 09:00:00,,,,,,,,,,...,,,,,,,,,,
1,2011-11-23 09:30:00,,,,,,,,,,...,,,,,,,,,,
2,2011-11-23 10:00:00,,,,,,,,,,...,,,,,,,,,,
3,2011-11-23 10:30:00,,,,,,,,,,...,,,,,,,,,,
4,2011-11-23 11:00:00,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39722,2014-02-27 22:00:00,0.401,0.046,0.457,0.095,0.203,0.232,0.284,0.126,0.528,...,0.089,0.031,0.139,0.085,0.393,0.324,0.127,0.151,0.071,0.123
39723,2014-02-27 22:30:00,0.190,0.184,0.430,0.096,0.176,0.240,0.310,0.126,0.650,...,0.033,0.020,0.132,0.085,0.388,0.268,0.114,0.179,0.059,0.109
39724,2014-02-27 23:00:00,0.119,0.108,0.309,0.087,0.098,0.195,0.236,0.161,0.182,...,0.048,0.021,0.104,0.037,0.312,0.244,0.088,0.222,0.063,0.096
39725,2014-02-27 23:30:00,0.042,0.097,0.353,0.052,0.020,0.225,0.344,0.164,0.077,...,0.037,0.020,0.093,0.067,0.208,0.296,0.083,0.212,0.051,0.252


In [7]:
energy_d['Date'] = energy_d['GMT'].dt.date
energy_d['Time'] = energy_d['GMT'].dt.time

In [8]:
print("Date column type:", energy_d['Date'].dtype)
print("Time column type:", energy_d['Time'].dtype)
print("GMT column type:", energy_d['GMT'].dtype)

Date column type: object
Time column type: object
GMT column type: datetime64[ns]


Adding additional columns

In [9]:
#Adding additional columns to the dataframe pertaining to date and time

def add_additional_columns(df):
    df['Year'] = df['GMT'].dt.year
    df['Month'] = df['GMT'].dt.month
    df['Day'] = df['GMT'].dt.day
    df['Hour'] = df['GMT'].dt.hour
    df['DayOfWeek'] = df['GMT'].dt.dayofweek

    return df

energy_d = add_additional_columns(energy_d)

Keeping last week of 2012 and first day of 2014 for missing values purposes and to aggregate hourly figures later

In [10]:
#Keeping only the data from 2012-12-24 to 2014-01-02 for missing values analysis

energy_d.drop(energy_d[(energy_d['GMT'] < '2012-12-24') | (energy_d['GMT'] > '2014-01-02')].index, inplace=True)

In [11]:
energy_d

Unnamed: 0,GMT,D0000,D0001,D0002,D0003,D0004,D0005,D0006,D0007,D0008,...,D1022,D1023,D1024,Date,Time,Year,Month,Day,Hour,DayOfWeek
19038,2012-12-24 00:00:00,0.081,0.179,0.182,0.079,0.099,0.211,0.163,0.179,0.173,...,0.181,0.086,0.053,2012-12-24,00:00:00,2012,12,24,0,0
19039,2012-12-24 00:30:00,0.096,0.135,0.182,0.079,0.022,0.196,0.029,0.187,0.137,...,0.186,0.069,0.051,2012-12-24,00:30:00,2012,12,24,0,0
19040,2012-12-24 01:00:00,0.033,0.118,0.176,0.079,0.034,0.108,0.035,0.174,0.172,...,0.203,0.074,0.067,2012-12-24,01:00:00,2012,12,24,1,0
19041,2012-12-24 01:30:00,0.064,0.066,0.140,0.079,0.031,0.073,0.016,0.156,0.209,...,0.134,0.071,0.055,2012-12-24,01:30:00,2012,12,24,1,0
19042,2012-12-24 02:00:00,0.038,0.028,,0.079,0.024,0.123,0.034,0.106,0.259,...,0.136,0.066,0.050,2012-12-24,02:00:00,2012,12,24,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36986,2014-01-01 22:00:00,0.239,0.104,0.147,0.093,0.081,0.182,0.614,0.168,0.280,...,0.166,0.108,0.108,2014-01-01,22:00:00,2014,1,1,22,2
36987,2014-01-01 22:30:00,0.244,0.068,0.168,0.094,0.214,0.198,0.302,0.161,0.211,...,0.159,0.117,0.132,2014-01-01,22:30:00,2014,1,1,22,2
36988,2014-01-01 23:00:00,0.188,0.115,0.167,0.099,0.137,0.147,0.318,0.176,0.234,...,0.185,0.107,0.111,2014-01-01,23:00:00,2014,1,1,23,2
36989,2014-01-01 23:30:00,0.156,0.151,0.156,0.056,0.292,0.191,0.523,0.120,0.194,...,0.180,0.116,0.086,2014-01-01,23:30:00,2014,1,1,23,2


In [12]:
#Printing new shape of dataframe

rows_energy, columns_energy = energy_d.shape
print("Shape of DataFrame 'energy':", (rows_energy, columns_energy))

Shape of DataFrame 'energy': (17953, 1033)


Merging UK Power Network Dataset with Bank Holiday

In [13]:
#Merging bank holidays dataset with energy dataset

energy_d['Holiday'] = energy_d['Date'].isin(bank_holidays['date'])

  energy_d['Holiday'] = energy_d['Date'].isin(bank_holidays['date'])


In [14]:
energy_d

Unnamed: 0,GMT,D0000,D0001,D0002,D0003,D0004,D0005,D0006,D0007,D0008,...,D1023,D1024,Date,Time,Year,Month,Day,Hour,DayOfWeek,Holiday
19038,2012-12-24 00:00:00,0.081,0.179,0.182,0.079,0.099,0.211,0.163,0.179,0.173,...,0.086,0.053,2012-12-24,00:00:00,2012,12,24,0,0,False
19039,2012-12-24 00:30:00,0.096,0.135,0.182,0.079,0.022,0.196,0.029,0.187,0.137,...,0.069,0.051,2012-12-24,00:30:00,2012,12,24,0,0,False
19040,2012-12-24 01:00:00,0.033,0.118,0.176,0.079,0.034,0.108,0.035,0.174,0.172,...,0.074,0.067,2012-12-24,01:00:00,2012,12,24,1,0,False
19041,2012-12-24 01:30:00,0.064,0.066,0.140,0.079,0.031,0.073,0.016,0.156,0.209,...,0.071,0.055,2012-12-24,01:30:00,2012,12,24,1,0,False
19042,2012-12-24 02:00:00,0.038,0.028,,0.079,0.024,0.123,0.034,0.106,0.259,...,0.066,0.050,2012-12-24,02:00:00,2012,12,24,2,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36986,2014-01-01 22:00:00,0.239,0.104,0.147,0.093,0.081,0.182,0.614,0.168,0.280,...,0.108,0.108,2014-01-01,22:00:00,2014,1,1,22,2,False
36987,2014-01-01 22:30:00,0.244,0.068,0.168,0.094,0.214,0.198,0.302,0.161,0.211,...,0.117,0.132,2014-01-01,22:30:00,2014,1,1,22,2,False
36988,2014-01-01 23:00:00,0.188,0.115,0.167,0.099,0.137,0.147,0.318,0.176,0.234,...,0.107,0.111,2014-01-01,23:00:00,2014,1,1,23,2,False
36989,2014-01-01 23:30:00,0.156,0.151,0.156,0.056,0.292,0.191,0.523,0.120,0.194,...,0.116,0.086,2014-01-01,23:30:00,2014,1,1,23,2,False


Dealing with missing values

In [15]:
#Checking for missing values

has_missing_values_2013 = energy_d[energy_d['Year'] == 2013].isnull().any().any()
has_missing_values_2014 = energy_d[energy_d['Year'] == 2014].isnull().any().any()

print("Has missing values in 2013:", has_missing_values_2013)
print("Has missing values in 2014:", has_missing_values_2014)


Has missing values in 2013: True
Has missing values in 2014: True


In [16]:
#Printing missing values count per household

energy_d_2013 = energy_d[energy_d['Year'] == 2013]
missing_values_per_household = energy_d_2013.isnull().sum()
missing_values_df = pd.DataFrame({'Household': energy_d_2013.columns, 'Missing Values Count': missing_values_per_household})
print(missing_values_df)

           Household  Missing Values Count
GMT              GMT                     0
D0000          D0000                     1
D0001          D0001                    50
D0002          D0002                    13
D0003          D0003                     7
...              ...                   ...
Month          Month                     0
Day              Day                     0
Hour            Hour                     0
DayOfWeek  DayOfWeek                     0
Holiday      Holiday                     0

[1034 rows x 2 columns]


In [17]:
#Checking number of households with missing values

households_with_missing_values = energy_d_2013.isnull().any().sum()
print("Number of households with missing values in 2013:", households_with_missing_values)

Number of households with missing values in 2013: 783


Calculating total missing data points, total data points and % of missing values

In [18]:
#Calculating total number of missing values in 2013 and its percentage

total_missing_values_2013 = energy_d_2013.isnull().sum().sum()

total_data_points_2013 = energy_d_2013.size

percentage_missing = (total_missing_values_2013 / total_data_points_2013) * 100

print(f"Total number of missing values in 2013: {total_missing_values_2013}")
print(f"Total data points in 2013: {total_data_points_2013}")
print(f"Percentage of missing values in 2013: {percentage_missing:.2f}%")

Total number of missing values in 2013: 106572
Total data points in 2013: 18115680
Percentage of missing values in 2013: 0.59%


Function creation to fill in missing values according to the previous hour, same hour of previous day and same hour of the week prior

In [19]:
import numpy as np

# Function to fill missing values for each household
def fill_missing_values(column):
    for idx, value in column.items():
        if pd.notna(value):
            continue

        # Timestamp of the current row
        timestamp = energy_d.at[idx, 'GMT']

        # Timestamp for the previous hour
        previous_hour = timestamp - pd.Timedelta(hours=1)
        previous_hour_value = energy_d.loc[energy_d['GMT'] == previous_hour, column.name]
        previous_hour_valid_count = previous_hour_value.count()
        previous_hour_value = previous_hour_value.sum()

        # Timestamp for the same time on the previous day
        previous_day = timestamp - pd.Timedelta(days=1)
        previous_day_value = energy_d.loc[energy_d['GMT'] == previous_day, column.name]
        previous_day_valid_count = previous_day_value.count()
        previous_day_value = previous_day_value.sum()

        # Timestamp for the same time on the same day of the previous week
        previous_week = timestamp - pd.Timedelta(weeks=1)
        previous_week_value = energy_d.loc[energy_d['GMT'] == previous_week, column.name]
        previous_week_valid_count = previous_week_value.count()
        previous_week_value = previous_week_value.sum()

        # Mean considering the number of valid values
        total_valid_count = previous_hour_valid_count + previous_day_valid_count + previous_week_valid_count
        if total_valid_count > 0:
            filled_value = (previous_hour_value + previous_day_value + previous_week_value) / total_valid_count
        else:
            filled_value = 0

        energy_d.at[idx, column.name] = filled_value

for column in energy_d.columns[1:]:  
    fill_missing_values(energy_d[column])

Confirming there are no missing values remaining

In [20]:
#Confirming missing values have been filled

has_missing_values_2013 = energy_d[energy_d['Year'] == 2013].isnull().any().any()
has_missing_values_2014 = energy_d[energy_d['Year'] == 2014].isnull().any().any()

print("Has missing values in 2013:", has_missing_values_2013)
print("Has missing values in 2014:", has_missing_values_2014)

Has missing values in 2013: False
Has missing values in 2014: False


Converting cleaned dataframe to csv

In [21]:
#Converting cleaned "N" dataframe to csv

energy_d.to_csv('new_energy_d.csv', index=False)