In [1]:
import numpy as np
import pandas as pd
import holidays
import datetime

from sklearn.linear_model import LinearRegression
from sklearn import metrics

from load_data import load_data, split_data

data = load_data("1h")
all_columns = data.keys()
all_columns

Index(['MWh', 'temperature_fore_ch', 'temperature_fore_fr',
       'temperature_fore_de', 'temperature_fore_it', 'solar_fore_de [MW]',
       'solar_fore_it [MW]', 'wind_fore_de [MW]', 'wind_fore_it [MW]', 'CH_AT',
       'CH_DE', 'CH_FR', 'CH_IT', 'AT_CH', 'DE_CH', 'FR_CH', 'IT_CH'],
      dtype='object')

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [3]:
columns_to_drop = [
    "CH_AT",
    "CH_DE",
    "CH_FR",
    "CH_IT",
    "AT_CH",
    "DE_CH",
    "FR_CH",
    "IT_CH",
]

In [4]:
data_filtered = data.drop(columns=columns_to_drop)

In [5]:
target_col = ["MWh"]
data_columns = data_filtered.columns

In [6]:
new_df = data_filtered.groupby(data_filtered.index.date)[data_columns].agg(list)

In [7]:
feature_cols = []
# Create lagged features
for col in target_col:
    for i in range(1, 8):  # for past 7 days (in hours)
        new_df[f'lag_{col}_{i}'] = new_df[col].shift(i)
        feature_cols += [f'lag_{col}_{i}']

In [14]:
new_df[feature_cols]

Unnamed: 0,lag_MWh_1,lag_MWh_2,lag_MWh_3,lag_MWh_4,lag_MWh_5,lag_MWh_6,lag_MWh_7
2019-01-01,,,,,,,
2019-01-02,"[129.716036003, 133.398074458, 135.13385173100...",,,,,,
2019-01-03,"[160.254575486, 157.627130612, 162.555837972, ...","[129.716036003, 133.398074458, 135.13385173100...",,,,,
2019-01-04,"[161.491635231, 164.567867901, 144.99496507, 1...","[160.254575486, 157.627130612, 162.555837972, ...","[129.716036003, 133.398074458, 135.13385173100...",,,,
2019-01-05,"[197.708791105, 214.018616727, 226.156810638, ...","[161.491635231, 164.567867901, 144.99496507, 1...","[160.254575486, 157.627130612, 162.555837972, ...","[129.716036003, 133.398074458, 135.13385173100...",,,
...,...,...,...,...,...,...,...
2021-12-27,"[126.97557193899999, 122.506341252, 117.759608...","[103.814344879, 115.79573829899999, 119.124220...","[96.965601373, 104.161457059, 125.646292993999...","[100.11748447299999, 101.074044227, 112.005609...","[85.25125266, 94.96943925800001, 98.78998129, ...","[104.879053009, 111.447329443, 122.003848153, ...","[118.93402083800001, 122.766563235, 125.508276..."
2021-12-28,"[88.593463178, 95.671567192, 94.462406665, 84....","[126.97557193899999, 122.506341252, 117.759608...","[103.814344879, 115.79573829899999, 119.124220...","[96.965601373, 104.161457059, 125.646292993999...","[100.11748447299999, 101.074044227, 112.005609...","[85.25125266, 94.96943925800001, 98.78998129, ...","[104.879053009, 111.447329443, 122.003848153, ..."
2021-12-29,"[121.391864488, 120.860130201, 125.649269606, ...","[88.593463178, 95.671567192, 94.462406665, 84....","[126.97557193899999, 122.506341252, 117.759608...","[103.814344879, 115.79573829899999, 119.124220...","[96.965601373, 104.161457059, 125.646292993999...","[100.11748447299999, 101.074044227, 112.005609...","[85.25125266, 94.96943925800001, 98.78998129, ..."
2021-12-30,"[115.549291126, 108.695661341, 104.90628235300...","[121.391864488, 120.860130201, 125.649269606, ...","[88.593463178, 95.671567192, 94.462406665, 84....","[126.97557193899999, 122.506341252, 117.759608...","[103.814344879, 115.79573829899999, 119.124220...","[96.965601373, 104.161457059, 125.646292993999...","[100.11748447299999, 101.074044227, 112.005609..."


In [15]:
new_df.index[0]

datetime.date(2019, 1, 1)

In [27]:
train = new_df[(new_df.index<datetime.date(2021, 12, 31))&(new_df.index>datetime.date(2021, 1, 1))]
test = new_df[(new_df.index<datetime.date(2021, 1, 1))&(new_df.index>datetime.date(2019, 1, 2))]

In [28]:
#feature_cols = new_df.columns.drop(target_col)
X_train = np.vstack(train[feature_cols].apply(np.hstack, axis=1).values)
#X_train.reshape(-1, 1)
y_train = np.vstack(train[target_col].apply(np.hstack, axis=1).values)

In [29]:
X_train.shape, y_train.shape

((363, 168), (363, 24))

In [30]:
lin_reg = LinearRegression()  # create the model
lin_reg.fit(X_train, y_train)  # train it

In [31]:
X_test = np.vstack(test[feature_cols].apply(np.hstack, axis=1).values)
y_test = np.vstack(test[target_col].apply(np.hstack, axis=1).values)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 53 and the array at index 1 has size 76

In [66]:
X_test.shape, y_test.shape

((358, 168), (358, 24))

In [67]:
y_pred = lin_reg.predict(X_test)

In [68]:
y_pred.size, y_test.size

(8592, 8592)

In [70]:
metrics.mean_absolute_error(y_test, y_pred)
#Note: here testing set is part of the training !!! BAD PRACTICE

14.387767709478458

# Testing with 2022 data

In [76]:
import yaml
import os

In [84]:
import sys

In [87]:
os.getcwd()

'/srv/train_group1'

In [88]:
os.chdir("../results_group1")

In [89]:
os.getcwd()

'/srv/results_group1'

In [90]:
with open("../results_group1/params.yaml", "r") as yaml_file:
    file_paths = yaml.safe_load(yaml_file)
        
data = pd.read_csv(
    file_paths["active_loss_path"],
    skiprows=1,
    names=["unnamed0", "unnamed1", "timestamps", "MWh"],
    parse_dates=["timestamps"],
    index_col="timestamps",
)
data = data.drop(columns=["unnamed0", "unnamed1"])
data["MWh"] = data["MWh"]


data

Unnamed: 0_level_0,MWh
timestamps,Unnamed: 1_level_1
2022-01-01 00:00:00,146.054792
2022-01-01 01:00:00,139.133354
2022-01-01 02:00:00,147.562500
2022-01-01 03:00:00,157.636204
2022-01-01 04:00:00,163.326766
...,...
2022-12-31 19:00:00,67.876028
2022-12-31 20:00:00,72.765318
2022-12-31 21:00:00,81.277633
2022-12-31 22:00:00,95.496046


In [98]:
new_df = data.groupby(data.index.date)["MWh"].agg(list)
new_df = pd.DataFrame(new_df)
new_df

Unnamed: 0,MWh
2022-01-01,"[146.054792, 139.133354, 147.5625, 157.636204,..."
2022-01-02,"[150.981065, 156.355783, 159.302609, 155.96211..."
2022-01-03,"[215.603282, 184.89184100000003, 146.135674, 1..."
2022-01-04,"[161.05259, 179.75817800000002, 157.233299, 13..."
2022-01-05,"[182.146147, 179.16613199999998, 167.550049, 1..."
...,...
2022-12-27,"[111.53740668, 111.046009, 125.318241, 116.811..."
2022-12-28,"[139.585493, 135.26405614285716, 130.942619285..."
2022-12-29,"[36.27518709523808, 37.1555956190476, 38.03600..."
2022-12-30,"[142.91911671428574, 156.773506, 143.824683, 1..."


In [99]:
for col in target_col:
    for i in range(1, 8):  # for past 7 days (in hours)
        new_df[f'lag_{col}_{i}'] = new_df[col].shift(i)
        feature_cols += [f'lag_{col}_{i}']

In [100]:
new_df

Unnamed: 0,MWh,lag_MWh_1,lag_MWh_2,lag_MWh_3,lag_MWh_4,lag_MWh_5,lag_MWh_6,lag_MWh_7
2022-01-01,"[146.054792, 139.133354, 147.5625, 157.636204,...",,,,,,,
2022-01-02,"[150.981065, 156.355783, 159.302609, 155.96211...","[146.054792, 139.133354, 147.5625, 157.636204,...",,,,,,
2022-01-03,"[215.603282, 184.89184100000003, 146.135674, 1...","[150.981065, 156.355783, 159.302609, 155.96211...","[146.054792, 139.133354, 147.5625, 157.636204,...",,,,,
2022-01-04,"[161.05259, 179.75817800000002, 157.233299, 13...","[215.603282, 184.89184100000003, 146.135674, 1...","[150.981065, 156.355783, 159.302609, 155.96211...","[146.054792, 139.133354, 147.5625, 157.636204,...",,,,
2022-01-05,"[182.146147, 179.16613199999998, 167.550049, 1...","[161.05259, 179.75817800000002, 157.233299, 13...","[215.603282, 184.89184100000003, 146.135674, 1...","[150.981065, 156.355783, 159.302609, 155.96211...","[146.054792, 139.133354, 147.5625, 157.636204,...",,,
...,...,...,...,...,...,...,...,...
2022-12-27,"[111.53740668, 111.046009, 125.318241, 116.811...","[123.330951, 122.83955332, 122.34815564, 121.8...","[130.271508, 104.086217, 77.92486699999999, 76...","[145.03873, 142.614236, 131.0587, 126.593042, ...","[133.77160644000003, 134.252926, 132.223826, 1...","[122.219937, 122.70125656, 123.18257612, 123.6...","[153.126612, 143.11705600000002, 145.626379, 1...","[153.95896599999998, 151.833437, 140.380454, 1..."
2022-12-28,"[139.585493, 135.26405614285716, 130.942619285...","[111.53740668, 111.046009, 125.318241, 116.811...","[123.330951, 122.83955332, 122.34815564, 121.8...","[130.271508, 104.086217, 77.92486699999999, 76...","[145.03873, 142.614236, 131.0587, 126.593042, ...","[133.77160644000003, 134.252926, 132.223826, 1...","[122.219937, 122.70125656, 123.18257612, 123.6...","[153.126612, 143.11705600000002, 145.626379, 1..."
2022-12-29,"[36.27518709523808, 37.1555956190476, 38.03600...","[139.585493, 135.26405614285716, 130.942619285...","[111.53740668, 111.046009, 125.318241, 116.811...","[123.330951, 122.83955332, 122.34815564, 121.8...","[130.271508, 104.086217, 77.92486699999999, 76...","[145.03873, 142.614236, 131.0587, 126.593042, ...","[133.77160644000003, 134.252926, 132.223826, 1...","[122.219937, 122.70125656, 123.18257612, 123.6..."
2022-12-30,"[142.91911671428574, 156.773506, 143.824683, 1...","[36.27518709523808, 37.1555956190476, 38.03600...","[139.585493, 135.26405614285716, 130.942619285...","[111.53740668, 111.046009, 125.318241, 116.811...","[123.330951, 122.83955332, 122.34815564, 121.8...","[130.271508, 104.086217, 77.92486699999999, 76...","[145.03873, 142.614236, 131.0587, 126.593042, ...","[133.77160644000003, 134.252926, 132.223826, 1..."


In [107]:
X2022 = np.vstack(new_df[7:].drop(columns=["MWh"]).apply(np.hstack, axis=1).values)
y2022 = np.vstack(new_df[7:][target_col].apply(np.hstack, axis=1).values)


In [108]:
y2022_pred = lin_reg.predict(X2022)

In [109]:
metrics.mean_absolute_error(y2022, y2022_pred)

22.776128452038787