In [1]:
import datetime
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
raw_data = pd.read_csv('./raw/household_power_consumption.txt', sep=';', low_memory=False)

In [4]:
data = raw_data[['Date', 'Time', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3', 'Global_active_power']].copy()
data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], dayfirst=True)
data.set_index('Datetime', inplace=True)

data.replace("?", np.nan, inplace=True)
data = data.apply(pd.to_numeric, errors='coerce')
data.drop(columns=['Date', 'Time'], inplace=True)

data["normalization"] = data["Global_active_power"] / (
        data["Sub_metering_1"] + data["Sub_metering_2"] + data["Sub_metering_3"] + 10e-12)

data["flexible_demand_response"] = data["Sub_metering_2"] * data["normalization"]
data["household_energy_demand"] = (data["Sub_metering_1"] + data["Sub_metering_3"]) * data["normalization"]

In [5]:
hed_minutely = data.loc[:, ["household_energy_demand"]]
hed_minutely.rename(columns={"household_energy_demand": "energy"}, inplace=True)

fdr_minutely = data.loc[:, ["flexible_demand_response"]]
fdr_minutely.rename(columns={"flexible_demand_response": "energy"}, inplace=True)

hed_hourly = hed_minutely.resample('H').mean()
fdr_hourly = fdr_minutely.resample('H').mean()

In [6]:
for data, resolution, name in zip([hed_minutely, fdr_minutely, hed_hourly, fdr_hourly],
                                ["minutely", "minutely", "hourly", "hourly"],
                            ["household_energy_demand", "flexible_demand_response", "household_energy_demand",
                             "flexible_demand_response"]):
    print(f"{name} {resolution} Max: ", np.max(data['energy']))
    print(f"{name} {resolution} Min: ", np.min(data['energy']))

    episode = data.groupby(pd.Grouper(freq='W'))

    required_length = 10000 if resolution == "minutely" else 165

    episode = pd.concat([group.fillna(0) for name, group in episode if
                             len(group.dropna()) >= required_length and group.index[0] >= datetime(2007, 1, 1) and
                             group.index[-1] <= datetime(2008, 12, 31)]).groupby(pd.Grouper(freq='W'))
    
    print(f"{name} {resolution} episodes: ", len(episode))
    with pd.HDFStore(f'./{resolution}/{name}.h5') as store:
        # Save each DataFrame with a key
        i = 0
        for index, group in episode:
            if not group.empty:
                store[f'eps_{i}'] = group
                i += 1
    print(f"{name} {resolution} episodes: ", i)

household_energy_demand minutely Max:  10.161999999998569
household_energy_demand minutely Min:  0.0
household_energy_demand minutely episodes:  104
                     energy
Datetime                   
2008-03-03 00:00:00     0.0
2008-03-03 00:01:00     0.0
2008-03-03 00:02:00     0.0
2008-03-03 00:03:00     0.0
2008-03-03 00:04:00     0.0
...                     ...
2008-03-09 23:55:00     0.0
2008-03-09 23:56:00     0.0
2008-03-09 23:57:00     0.0
2008-03-09 23:58:00     0.0
2008-03-09 23:59:00     0.0

[10080 rows x 1 columns]
household_energy_demand minutely episodes:  101
flexible_demand_response minutely Max:  8.583999999998824
flexible_demand_response minutely Min:  0.0
flexible_demand_response minutely episodes:  104
                     energy
Datetime                   
2008-03-03 00:00:00   0.000
2008-03-03 00:01:00   0.000
2008-03-03 00:02:00   0.000
2008-03-03 00:03:00   0.000
2008-03-03 00:04:00   0.000
...                     ...
2008-03-09 23:55:00   0.362
2008-03-09