In [112]:
import pandas as pd
import numpy as np
import glob
import os
from datetime import datetime, date
import holidays

In [113]:
# load zonal demand data 
demand_files = glob.glob('../HourlyZonalDemand/PUB_DemandZonal_*.csv')
dfs = []

for file in demand_files:
    df = pd.read_csv(file, skiprows=3)  # Skip header rows
    dfs.append(df)

hourly_demand = pd.concat(dfs, ignore_index=True)

# Convert date and hour to datetime
hourly_demand['datetime'] = pd.to_datetime(hourly_demand['Date']) + pd.to_timedelta(hourly_demand['Hour'] - 1, unit='h')
hourly_demand = hourly_demand.sort_values('datetime')

# Filter hourly_demand to only include data until 2025-04-04
hourly_demand = hourly_demand[hourly_demand['datetime'] < '2025-04-05']

# drop unrequired columns
hourly_demand = hourly_demand.drop(columns=['Diff', 'Zone Total', "Date", "Hour"]) # we add hour back in again later for consistency  

display(hourly_demand)




Unnamed: 0,Ontario Demand,Northwest,Northeast,Ottawa,East,Toronto,Essa,Bruce,Southwest,Niagara,West,datetime
0,16627,591,1466,1066,1268,5340,1579,89,3157,477,1489,2018-01-01 00:00:00
1,16084,577,1420,985,1300,5211,1516,91,3061,462,1465,2018-01-01 01:00:00
2,15866,613,1417,943,1316,5096,1471,86,3032,446,1441,2018-01-01 02:00:00
3,15725,656,1430,943,1303,4987,1451,81,2911,439,1413,2018-01-01 03:00:00
4,15470,657,1425,932,1343,4926,1422,69,2907,449,1391,2018-01-01 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
63619,16611,641,1360,1007,1122,6004,1150,119,3139,565,1754,2025-04-04 19:00:00
63620,16209,633,1381,991,1058,5859,1120,113,2995,553,1698,2025-04-04 20:00:00
63621,15299,624,1363,954,1009,5537,1052,111,2842,521,1556,2025-04-04 21:00:00
63622,14235,613,1316,876,945,5198,972,108,2665,511,1447,2025-04-04 22:00:00


In [114]:
# hourly ontario demand 

hourly_ontario_demand = hourly_demand[["datetime", "Ontario Demand"]]
hourly_ontario_demand.set_index(['datetime'], inplace=True)
display(hourly_ontario_demand)

Unnamed: 0_level_0,Ontario Demand
datetime,Unnamed: 1_level_1
2018-01-01 00:00:00,16627
2018-01-01 01:00:00,16084
2018-01-01 02:00:00,15866
2018-01-01 03:00:00,15725
2018-01-01 04:00:00,15470
...,...
2025-04-04 19:00:00,16611
2025-04-04 20:00:00,16209
2025-04-04 21:00:00,15299
2025-04-04 22:00:00,14235


In [115]:
# load hourly climate data from different regions into a multi-level dataframe
climate_files = glob.glob('../ClimateData/*Climate.csv')
dfs = []

for file in climate_files:
    # Extract region name from filename (e.g., "NortheastClimate.csv" -> "Northeast")
    region = os.path.basename(file).split('Climate')[0]
    df = pd.read_csv(file, skiprows=3, nrows=63600) 
    # there are a bunch of missing values after 63604, i.e. April 5th data is incomplete so we shouldn't use it
    # data goes until line 63676 after that we have mean data for each day
    df['datetime'] = pd.to_datetime(df['time'])
    df['region'] = region
    df = df.drop(columns=["time"])
    
    # Set multi-index with datetime and region
    df.set_index(['datetime', 'region'], inplace=True)
    dfs.append(df)

# Combine all climate dataframes
hourly_climate = pd.concat(dfs)

# Sort the multi-index
hourly_climate.sort_index(inplace=True)

display(hourly_climate)

Unnamed: 0_level_0,Unnamed: 1_level_0,temperature_2m (°C),cloud_cover (%),cloud_cover_low (%),cloud_cover_mid (%),cloud_cover_high (%),wind_speed_10m (km/h),wind_speed_100m (km/h),weather_code (wmo code),precipitation (mm),rain (mm),snowfall (cm),apparent_temperature (°C),wind_gusts_10m (km/h),relative_humidity_2m (%),dew_point_2m (°C),surface_pressure (hPa)
datetime,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-01-02 00:00:00,Bruce,-6.8,98,96,98,0,32.2,47.4,71,0.2,0.0,0.14,-14.9,58.7,68,-11.6,1004.6
2018-01-02 00:00:00,East,-17.6,25,25,0,0,15.9,26.5,1,0.0,0.0,0.00,-23.7,27.7,78,-20.5,1014.9
2018-01-02 00:00:00,Essa,-22.6,4,4,0,0,5.9,12.5,0,0.0,0.0,0.00,-27.4,18.7,70,-26.6,991.0
2018-01-02 00:00:00,Niagara,-12.4,100,100,100,0,11.7,22.7,73,0.3,0.0,0.21,-17.7,22.0,81,-15.0,1006.5
2018-01-02 00:00:00,Northeast,-24.2,100,0,100,100,11.5,25.5,3,0.0,0.0,0.00,-29.9,22.0,77,-27.0,982.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-04 23:00:00,Northwest,0.1,100,18,99,47,12.4,24.6,3,0.0,0.0,0.00,-4.1,22.3,88,-1.7,987.5
2025-04-04 23:00:00,Ottawa,1.9,100,0,0,100,7.4,15.0,3,0.0,0.0,0.00,-1.5,13.3,79,-1.4,1018.3
2025-04-04 23:00:00,Southwest,5.2,100,0,88,100,14.8,27.5,3,0.0,0.0,0.00,1.0,27.7,71,0.4,983.9
2025-04-04 23:00:00,Toronto,5.0,100,0,100,100,7.2,17.6,3,0.0,0.0,0.00,2.0,14.0,77,1.3,1003.4


In [116]:
# combine hourly demand and climate data

# melt demand data, i.e. make it such that each row is a unique datetime and region combination 
merge_hourly_demand = hourly_demand.melt(
    id_vars=['datetime'],
    value_vars=['Northwest', 'Northeast', 'Ottawa', 'East', 'Toronto', 'Essa', 'Bruce', 'Southwest', 'Niagara', 'West'],
    var_name='region',
    value_name='zonal_demand'
)

# merge 
hourly_data = pd.merge(
    hourly_climate,
    merge_hourly_demand[['datetime', 'region', 'zonal_demand']],
    on=['datetime', 'region'],
    how='left'  # Use 'inner' if you only want matching rows
)
hourly_data.set_index(['datetime', 'region'], inplace=True)

display(hourly_data)



Unnamed: 0_level_0,Unnamed: 1_level_0,temperature_2m (°C),cloud_cover (%),cloud_cover_low (%),cloud_cover_mid (%),cloud_cover_high (%),wind_speed_10m (km/h),wind_speed_100m (km/h),weather_code (wmo code),precipitation (mm),rain (mm),snowfall (cm),apparent_temperature (°C),wind_gusts_10m (km/h),relative_humidity_2m (%),dew_point_2m (°C),surface_pressure (hPa),zonal_demand
datetime,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-02 00:00:00,Bruce,-6.8,98,96,98,0,32.2,47.4,71,0.2,0.0,0.14,-14.9,58.7,68,-11.6,1004.6,69
2018-01-02 00:00:00,East,-17.6,25,25,0,0,15.9,26.5,1,0.0,0.0,0.00,-23.7,27.7,78,-20.5,1014.9,1398
2018-01-02 00:00:00,Essa,-22.6,4,4,0,0,5.9,12.5,0,0.0,0.0,0.00,-27.4,18.7,70,-26.6,991.0,1373
2018-01-02 00:00:00,Niagara,-12.4,100,100,100,0,11.7,22.7,73,0.3,0.0,0.21,-17.7,22.0,81,-15.0,1006.5,416
2018-01-02 00:00:00,Northeast,-24.2,100,0,100,100,11.5,25.5,3,0.0,0.0,0.00,-29.9,22.0,77,-27.0,982.4,1404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-04 23:00:00,Northwest,0.1,100,18,99,47,12.4,24.6,3,0.0,0.0,0.00,-4.1,22.3,88,-1.7,987.5,599
2025-04-04 23:00:00,Ottawa,1.9,100,0,0,100,7.4,15.0,3,0.0,0.0,0.00,-1.5,13.3,79,-1.4,1018.3,829
2025-04-04 23:00:00,Southwest,5.2,100,0,88,100,14.8,27.5,3,0.0,0.0,0.00,1.0,27.7,71,0.4,983.9,2500
2025-04-04 23:00:00,Toronto,5.0,100,0,100,100,7.2,17.6,3,0.0,0.0,0.00,2.0,14.0,77,1.3,1003.4,4902


In [117]:
# get daily climate data 

climate_files = glob.glob('../ClimateData/*Climate.csv')
dfs = []

for file in climate_files:
    # Extract region name from filename (e.g., "NortheastClimate.csv" -> "Northeast")
    region = os.path.basename(file).split('Climate')[0]
    df = pd.read_csv(file, skiprows=63677) 
    df = df.head(-3)
    # need to read rows 63678-66328 
    df['date'] = pd.to_datetime(df['time'])
    df['region'] = region
    # latter two are rows with NaN values
    df = df.drop(columns=["time", "precipitation_probability_mean (undefined)", "precipitation_probability_min (undefined)"])
    df['day_length'] = pd.to_datetime(df['sunset (iso8601)']) - pd.to_datetime(df['sunrise (iso8601)'])
    # Set multi-index with datetime and region
    df.set_index(['date', 'region'], inplace=True)
    dfs.append(df)

# Combine all climate dataframes
daily_climate = pd.concat(dfs)

# Sort the multi-index
daily_climate.sort_index(inplace=True)

display(daily_climate)

Unnamed: 0_level_0,Unnamed: 1_level_0,temperature_2m_mean (°C),cloud_cover_mean (%),cloud_cover_max (%),cloud_cover_min (%),wind_speed_10m_mean (km/h),temperature_2m_min (°C),apparent_temperature_mean (°C),weather_code (wmo code),temperature_2m_max (°C),apparent_temperature_max (°C),...,wind_gusts_10m_mean (km/h),wind_speed_10m_min (km/h),wind_gusts_10m_min (km/h),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),precipitation_hours (h),sunrise (iso8601),sunset (iso8601),day_length
date,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-02,Bruce,-6.7,98.0,100.0,86.0,33.8,-8.3,-14.9,71.0,-5.3,-13.8,...,63.4,28.0,52.6,0.9,0.0,0.63,7.0,2018-01-02T09:01,2018-01-02T17:59,0 days 08:58:00
2018-01-02,East,-13.4,76.0,100.0,0.0,17.0,-22.9,-19.5,3.0,-5.4,-11.7,...,34.1,4.1,9.4,0.0,0.0,0.00,0.0,2018-01-02T08:41,2018-01-02T17:38,0 days 08:57:00
2018-01-02,Essa,-12.6,94.0,100.0,4.0,14.9,-22.6,-18.3,73.0,-5.9,-11.3,...,33.7,3.6,9.4,3.3,0.0,2.31,16.0,2018-01-02T08:56,2018-01-02T17:45,0 days 08:49:00
2018-01-02,Niagara,-10.5,76.0,100.0,1.0,21.3,-14.5,-17.1,73.0,-6.9,-14.3,...,43.5,11.7,22.0,0.6,0.0,0.42,3.0,2018-01-02T08:48,2018-01-02T17:53,0 days 09:05:00
2018-01-02,Northeast,-16.3,94.0,100.0,37.0,12.5,-24.2,-22.0,71.0,-11.7,-18.0,...,27.5,5.0,7.9,0.8,0.0,0.56,8.0,2018-01-02T09:17,2018-01-02T17:42,0 days 08:25:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-04,Northwest,0.2,82.0,100.0,0.0,11.1,-4.1,-4.2,51.0,5.1,-0.1,...,23.3,7.5,11.2,0.1,0.1,0.00,1.0,2025-04-04T07:18,2025-04-04T20:30,0 days 13:12:00
2025-04-04,Ottawa,4.9,49.0,100.0,0.0,11.7,-0.5,1.1,3.0,10.6,7.7,...,27.7,4.6,8.3,0.0,0.0,0.00,0.0,2025-04-04T06:36,2025-04-04T19:35,0 days 12:59:00
2025-04-04,Southwest,5.1,71.0,100.0,0.0,8.3,-0.8,2.1,3.0,10.7,9.3,...,19.9,2.8,14.0,0.0,0.0,0.00,0.0,2025-04-04T06:57,2025-04-04T19:52,0 days 12:55:00
2025-04-04,Toronto,5.3,67.0,100.0,0.0,7.1,-0.3,2.5,3.0,11.4,9.5,...,19.5,0.0,2.2,0.0,0.0,0.00,0.0,2025-04-04T06:52,2025-04-04T19:48,0 days 12:56:00


In [118]:
# daily ontario demand 

# Assuming df contains your data
daily_demand = hourly_demand
daily_demand['date'] = pd.to_datetime(daily_demand['datetime'].dt.date)
daily_demand = daily_demand.drop(columns=["datetime"])

# Select only numeric columns to sum 
numeric_columns = hourly_demand.select_dtypes(include=['number']).columns

daily_demand = daily_demand.groupby('date')[numeric_columns].sum().reset_index()
daily_ontario_demand = daily_demand[["date", "Ontario Demand"]]
daily_ontario_demand.set_index(['date'], inplace=True)
daily_demand = daily_demand.drop(columns=["Ontario Demand"])
display(daily_ontario_demand)
display(daily_demand)

Unnamed: 0_level_0,Ontario Demand
date,Unnamed: 1_level_1
2018-01-01,406340
2018-01-02,430873
2018-01-03,434960
2018-01-04,433198
2018-01-05,455055
...,...
2025-03-31,373406
2025-04-01,375761
2025-04-02,406097
2025-04-03,370669


Unnamed: 0,date,Northwest,Northeast,Ottawa,East,Toronto,Essa,Bruce,Southwest,Niagara,West
0,2018-01-01,15249,34493,26274,34951,134028,36774,1988,77389,11439,36150
1,2018-01-02,14910,33191,26816,35831,147904,36823,1984,84959,12378,38792
2,2018-01-03,15040,34490,25393,32411,149297,36256,3111,86718,12628,41264
3,2018-01-04,15221,35047,25343,31886,150517,37329,3214,85344,12795,39718
4,2018-01-05,15164,36078,31048,32305,157676,40589,2556,87574,13392,40063
...,...,...,...,...,...,...,...,...,...,...,...
2646,2025-03-31,13447,34069,25221,21073,134218,21408,2398,70930,12661,40556
2647,2025-04-01,13220,32852,24542,17732,137583,23485,2725,72793,12767,40585
2648,2025-04-02,13929,33213,25172,23618,147589,27506,2505,78807,13582,42160
2649,2025-04-03,13792,32721,23833,21347,134509,24095,2310,70459,12819,37162


In [119]:
# combine daily demand and climate data

# melt demand data, i.e. make it such that each row is a unique datetime and region combination 
merge_daily_demand = daily_demand.melt(
    id_vars=['date'],
    value_vars=['Northwest', 'Northeast', 'Ottawa', 'East', 'Toronto', 'Essa', 'Bruce', 'Southwest', 'Niagara', 'West'],
    var_name='region',
    value_name='zonal_demand'
)

# merge 
daily_data = pd.merge(
    daily_climate,
    merge_daily_demand[['date', 'region', 'zonal_demand']],
    on=['date', 'region'],
    how='left'  # Use 'inner' if you only want matching rows
)
daily_data.set_index(['date', 'region'], inplace=True)

display(daily_data)

Unnamed: 0_level_0,Unnamed: 1_level_0,temperature_2m_mean (°C),cloud_cover_mean (%),cloud_cover_max (%),cloud_cover_min (%),wind_speed_10m_mean (km/h),temperature_2m_min (°C),apparent_temperature_mean (°C),weather_code (wmo code),temperature_2m_max (°C),apparent_temperature_max (°C),...,wind_speed_10m_min (km/h),wind_gusts_10m_min (km/h),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),precipitation_hours (h),sunrise (iso8601),sunset (iso8601),day_length,zonal_demand
date,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-02,Bruce,-6.7,98.0,100.0,86.0,33.8,-8.3,-14.9,71.0,-5.3,-13.8,...,28.0,52.6,0.9,0.0,0.63,7.0,2018-01-02T09:01,2018-01-02T17:59,0 days 08:58:00,1984
2018-01-02,East,-13.4,76.0,100.0,0.0,17.0,-22.9,-19.5,3.0,-5.4,-11.7,...,4.1,9.4,0.0,0.0,0.00,0.0,2018-01-02T08:41,2018-01-02T17:38,0 days 08:57:00,35831
2018-01-02,Essa,-12.6,94.0,100.0,4.0,14.9,-22.6,-18.3,73.0,-5.9,-11.3,...,3.6,9.4,3.3,0.0,2.31,16.0,2018-01-02T08:56,2018-01-02T17:45,0 days 08:49:00,36823
2018-01-02,Niagara,-10.5,76.0,100.0,1.0,21.3,-14.5,-17.1,73.0,-6.9,-14.3,...,11.7,22.0,0.6,0.0,0.42,3.0,2018-01-02T08:48,2018-01-02T17:53,0 days 09:05:00,12378
2018-01-02,Northeast,-16.3,94.0,100.0,37.0,12.5,-24.2,-22.0,71.0,-11.7,-18.0,...,5.0,7.9,0.8,0.0,0.56,8.0,2018-01-02T09:17,2018-01-02T17:42,0 days 08:25:00,33191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-04,Northwest,0.2,82.0,100.0,0.0,11.1,-4.1,-4.2,51.0,5.1,-0.1,...,7.5,11.2,0.1,0.1,0.00,1.0,2025-04-04T07:18,2025-04-04T20:30,0 days 13:12:00,13973
2025-04-04,Ottawa,4.9,49.0,100.0,0.0,11.7,-0.5,1.1,3.0,10.6,7.7,...,4.6,8.3,0.0,0.0,0.00,0.0,2025-04-04T06:36,2025-04-04T19:35,0 days 12:59:00,22189
2025-04-04,Southwest,5.1,71.0,100.0,0.0,8.3,-0.8,2.1,3.0,10.7,9.3,...,2.8,14.0,0.0,0.0,0.00,0.0,2025-04-04T06:57,2025-04-04T19:52,0 days 12:55:00,68848
2025-04-04,Toronto,5.3,67.0,100.0,0.0,7.1,-0.3,2.5,3.0,11.4,9.5,...,0.0,2.2,0.0,0.0,0.00,0.0,2025-04-04T06:52,2025-04-04T19:48,0 days 12:56:00,131437


In [120]:
datasets = [hourly_data, daily_data, hourly_ontario_demand, daily_ontario_demand]

for df in datasets:
    print(df.index.get_level_values(0))
# hourly_data["datetime"]
# hourly_ontario_demand["datetime"]

DatetimeIndex(['2018-01-02 00:00:00', '2018-01-02 00:00:00',
               '2018-01-02 00:00:00', '2018-01-02 00:00:00',
               '2018-01-02 00:00:00', '2018-01-02 00:00:00',
               '2018-01-02 00:00:00', '2018-01-02 00:00:00',
               '2018-01-02 00:00:00', '2018-01-02 00:00:00',
               ...
               '2025-04-04 23:00:00', '2025-04-04 23:00:00',
               '2025-04-04 23:00:00', '2025-04-04 23:00:00',
               '2025-04-04 23:00:00', '2025-04-04 23:00:00',
               '2025-04-04 23:00:00', '2025-04-04 23:00:00',
               '2025-04-04 23:00:00', '2025-04-04 23:00:00'],
              dtype='datetime64[ns]', name='datetime', length=636000, freq=None)
DatetimeIndex(['2018-01-02', '2018-01-02', '2018-01-02', '2018-01-02',
               '2018-01-02', '2018-01-02', '2018-01-02', '2018-01-02',
               '2018-01-02', '2018-01-02',
               ...
               '2025-04-04', '2025-04-04', '2025-04-04', '2025-04-04',
              

In [121]:
# we need to modify this so that it works for the multi-index merged dataframe, 
# but also perhaps preserve this so that we can use it for the other dataframe that has the daily demand data 

def add_time_features(df):
    datetime_index = df.index.get_level_values(0)
    
    # df['day_of_week'] = df['datetime'].dt.dayofweek
    # df['month'] = df['datetime'].dt.month
    # df['day_of_year'] = df['datetime'].dt.dayofyear
    
    df['day_of_week'] = datetime_index.dayofweek
    df['month'] = datetime_index.month
    df['day_of_year'] = datetime_index.dayofyear
    
    # Add holiday information
    holiday_years = list(range(2018, 2026))
    ca_holidays = holidays.CA(prov='ON', years=holiday_years)
    
    # df['is_holiday'] = df['datetime'].dt.date.isin(ca_holidays)
    # df['is_weekend'] = df['day_of_week'].isin([5, 6])\
    dates = pd.Series(datetime_index.date)
    day_of_week = pd.Series(datetime_index.dayofweek)
    df['is_holiday'] = dates.isin(ca_holidays)
    df['is_weekend'] = day_of_week.isin([5, 6])
    
    return df

def create_lag_features(df, lags=[24, 48, 168]):
    """Create lagged features for the target variable."""
    for col in df.columns:
        for lag in lags:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag)
    
    df = df.dropna() # remove null values 
    
    return df

In [125]:
for dataset in datasets:
    print(dataset.shape)

(635832, 73)
(26332, 145)
(63456, 9)
(2483, 9)


In [123]:
for i in range(len(datasets)):
    datasets[i] = create_lag_features(datasets[i])
    datasets[i] = add_time_features(datasets[i])
    display(datasets[i])

# hourly_data = create_lag_features(hourly_data)
# daily_data = create_lag_features(daily_data)
# hourly_ontario_demand = create_lag_features(hourly_ontario_demand)
# daily_ontario_demand = create_lag_features(daily_ontario_demand)

# hourly_data = add_time_features(hourly_data)
# daily_data = add_time_features(daily_data)
# hourly_ontario_demand = add_time_features(hourly_ontario_demand)
# daily_ontario_demand = add_time_features(daily_ontario_demand)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_week'] = datetime_index.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = datetime_index.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = datetime_index.dayofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

Unnamed: 0_level_0,Unnamed: 1_level_0,temperature_2m (°C),cloud_cover (%),cloud_cover_low (%),cloud_cover_mid (%),cloud_cover_high (%),wind_speed_10m (km/h),wind_speed_100m (km/h),weather_code (wmo code),precipitation (mm),rain (mm),...,surface_pressure (hPa)_lag_48,surface_pressure (hPa)_lag_168,zonal_demand_lag_24,zonal_demand_lag_48,zonal_demand_lag_168,day_of_week,month,day_of_year,is_holiday,is_weekend
datetime,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-02 16:00:00,Toronto,-7.0,97,97,0,1,22.1,35.8,3,0.0,0.0,...,1000.7,1004.6,1435.0,83.0,69.0,1,1,2,,
2018-01-02 16:00:00,West,-15.6,20,20,0,0,30.0,52.4,1,0.0,0.0,...,1010.2,1014.9,639.0,1574.0,1398.0,1,1,2,,
2018-01-02 17:00:00,Bruce,-6.0,86,74,0,46,34.5,49.8,3,0.0,0.0,...,981.8,991.0,1174.0,1575.0,1373.0,1,1,2,,
2018-01-02 17:00:00,East,-6.0,96,10,15,95,30.1,47.9,3,0.0,0.0,...,1003.4,1006.5,3667.0,557.0,416.0,1,1,2,,
2018-01-02 17:00:00,Essa,-5.9,95,95,83,0,22.8,37.6,71,0.2,0.0,...,975.1,982.4,6663.0,1410.0,1404.0,1,1,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-04 23:00:00,Northwest,0.1,100,18,99,47,12.4,24.6,3,0.0,0.0,...,985.7,983.7,1009.0,3080.0,3162.0,4,4,94,,
2025-04-04 23:00:00,Ottawa,1.9,100,0,0,100,7.4,15.0,3,0.0,0.0,...,1004.7,1002.2,1052.0,5914.0,5622.0,4,4,94,,
2025-04-04 23:00:00,Southwest,5.2,100,0,88,100,14.8,27.5,3,0.0,0.0,...,1003.8,1002.9,521.0,1751.0,1913.0,4,4,94,,
2025-04-04 23:00:00,Toronto,5.0,100,0,100,100,7.2,17.6,3,0.0,0.0,...,1003.1,1003.1,1363.0,119.0,123.0,4,4,94,,


  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
  df['day_of_week'] = datetime_index.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_week'] = datetime_index.dayofweek
  df['month'] = datetime_index.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

Unnamed: 0_level_0,Unnamed: 1_level_0,temperature_2m_mean (°C),cloud_cover_mean (%),cloud_cover_max (%),cloud_cover_min (%),wind_speed_10m_mean (km/h),temperature_2m_min (°C),apparent_temperature_mean (°C),weather_code (wmo code),temperature_2m_max (°C),apparent_temperature_max (°C),...,day_length_lag_48,day_length_lag_168,zonal_demand_lag_24,zonal_demand_lag_48,zonal_demand_lag_168,day_of_week,month,day_of_year,is_holiday,is_weekend
date,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-18,Toronto,-6.8,55.0,100.0,2.0,14.7,-10.6,-12.2,3.0,-2.0,-6.5,...,0 days 09:14:00,0 days 08:58:00,33381.0,2187.0,1984.0,3,1,18,,
2018-01-18,West,-6.1,16.0,92.0,0.0,24.5,-9.3,-13.0,3.0,-2.3,-9.1,...,0 days 09:13:00,0 days 08:57:00,15498.0,29327.0,35831.0,3,1,18,,
2018-01-19,Bruce,-0.7,76.0,100.0,2.0,26.5,-2.3,-7.3,3.0,1.2,-5.6,...,0 days 09:06:00,0 days 08:49:00,30785.0,36236.0,36823.0,4,1,19,,
2018-01-19,East,-1.7,86.0,100.0,12.0,15.9,-5.0,-6.7,71.0,0.9,-5.2,...,0 days 09:20:00,0 days 09:05:00,86025.0,12131.0,12378.0,4,1,19,,
2018-01-19,Essa,-2.6,93.0,100.0,5.0,13.1,-6.2,-7.3,71.0,-0.3,-4.8,...,0 days 08:44:00,0 days 08:25:00,148345.0,34705.0,33191.0,4,1,19,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-04,Northwest,0.2,82.0,100.0,0.0,11.1,-4.1,-4.2,51.0,5.1,-0.1,...,0 days 12:40:00,0 days 12:05:00,23618.0,69275.0,72112.0,4,4,94,,
2025-04-04,Ottawa,4.9,49.0,100.0,0.0,11.7,-0.5,1.1,3.0,10.6,7.7,...,0 days 12:41:00,0 days 12:04:00,27506.0,130838.0,134523.0,4,4,94,,
2025-04-04,Southwest,5.1,71.0,100.0,0.0,8.3,-0.8,2.1,3.0,10.7,9.3,...,0 days 12:39:00,0 days 12:04:00,13582.0,38558.0,39445.0,4,4,94,,
2025-04-04,Toronto,5.3,67.0,100.0,0.0,7.1,-0.3,2.5,3.0,11.4,9.5,...,0 days 12:44:00,0 days 12:07:00,33213.0,2398.0,2214.0,4,4,94,,


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_lag_{lag}'] = df[col].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

Unnamed: 0_level_0,Ontario Demand,Ontario Demand_lag_24,Ontario Demand_lag_48,Ontario Demand_lag_168,day_of_week,month,day_of_year,is_holiday,is_weekend
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-08 00:00:00,15521,17015.0,17293.0,16627.0,0,1,8,,
2018-01-08 01:00:00,15129,16531.0,16937.0,16084.0,0,1,8,,
2018-01-08 02:00:00,14916,16240.0,16505.0,15866.0,0,1,8,,
2018-01-08 03:00:00,14880,16103.0,16599.0,15725.0,0,1,8,,
2018-01-08 04:00:00,14919,16031.0,16647.0,15470.0,0,1,8,,
...,...,...,...,...,...,...,...,...,...
2025-04-04 19:00:00,16611,16951.0,18558.0,17732.0,4,4,94,,
2025-04-04 20:00:00,16209,16682.0,17765.0,17260.0,4,4,94,,
2025-04-04 21:00:00,15299,15739.0,16733.0,16358.0,4,4,94,,
2025-04-04 22:00:00,14235,14929.0,15739.0,15380.0,4,4,94,,


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_week'] = datetime_index.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = datetime_index.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_year'] = datetime_index.dayofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

Unnamed: 0_level_0,Ontario Demand,Ontario Demand_lag_24,Ontario Demand_lag_48,Ontario Demand_lag_168,day_of_week,month,day_of_year,is_holiday,is_weekend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-06-18,449568,368424.0,322065.0,406340.0,0,6,169,,
2018-06-19,386619,351924.0,333736.0,430873.0,1,6,170,,
2018-06-20,377181,348860.0,344165.0,434960.0,2,6,171,,
2018-06-21,365664,408330.0,318967.0,433198.0,3,6,172,,
2018-06-22,349240,405836.0,292545.0,455055.0,4,6,173,,
...,...,...,...,...,...,...,...,...,...
2025-03-31,373406,406231.0,443595.0,333870.0,0,3,90,,
2025-04-01,375761,387399.0,454031.0,362343.0,1,4,91,,
2025-04-02,406097,370923.0,447541.0,366050.0,2,4,92,,
2025-04-03,370669,377050.0,445713.0,361809.0,3,4,93,,


In [124]:
# we want to make sure that not too many rows have been dropped from each dataset and the number of columns increase by approx 3

for dataset in datasets:
    print(dataset.shape)

635832
26332
63456
2483
