In [2]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path

from predict_energy_behavior.models.joined_model import JoinedModel
from predict_energy_behavior.cv import MonthlyKFold

from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import StandardScaler

%load_ext autoreload
%autoreload 2

In [2]:
df = pd.read_parquet(
    "/beegfs/ws/0/s4610340-energy_behavior/yahor/kaggle-predict_energy_behavior_of_prosumers/data/processed/train/make_features/df_features.parquet",
    engine="fastparquet"
).dropna()
df = df.loc[df["is_consumption"]==0] # select production
df

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,datetime,date,dayofyear,hour,day,weekday,...,diff_168h_10_metre_u_wind_component_historical,diff_168h_10_metre_u_wind_component_forecast,diff_168h_10_metre_v_wind_component_historical,diff_168h_10_metre_v_wind_component_forecast,diff_168h_humidity_historical,diff_168h_humidity_forecast,diff_168h_fog_historical,diff_168h_fog_forecast,diff_168h_surface_solar_radiation_downwards_forecast,target
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
412848,0,0,1,0,2022-01-16 00:00:00,2022-01-16,16,0,16,7,...,2.903655,7.641448,1.134356,0.862746,-2.098530,90.614426,0.028430,0.406587,0.000000,0.001
412978,0,0,1,0,2022-01-16 01:00:00,2022-01-16,16,1,16,7,...,-0.603822,4.140790,3.241673,4.276222,-2.124906,90.515038,0.014313,0.359051,0.000000,0.011
413108,0,0,1,0,2022-01-16 02:00:00,2022-01-16,16,2,16,7,...,-1.399107,1.606657,-2.564545,0.754472,-3.038066,-1.687471,-0.036185,0.012937,0.000000,0.000
413238,0,0,1,0,2022-01-16 03:00:00,2022-01-16,16,3,16,7,...,-1.129649,2.984911,2.979040,2.609334,-5.545794,-3.043178,-0.081314,-0.008434,0.000000,0.000
413368,0,0,1,0,2022-01-16 04:00:00,2022-01-16,16,4,16,7,...,-1.652590,2.628980,0.027282,2.423260,-6.818796,-3.402164,-0.097776,-0.035603,0.000000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017818,14,1,2,0,2023-05-31 19:00:00,2023-05-31,151,19,31,3,...,-1.105848,5.061428,-0.284668,-0.991004,-1.342231,5.543314,-0.309736,-0.365412,21.849556,35.026
2017948,14,1,2,0,2023-05-31 20:00:00,2023-05-31,151,20,31,3,...,-0.750958,3.834254,-0.188849,-0.764466,-4.415627,2.071079,-0.244341,-0.401858,22.746534,14.867
2018078,14,1,2,0,2023-05-31 21:00:00,2023-05-31,151,21,31,3,...,-1.467191,3.102036,0.381953,-0.375884,-5.425629,1.909982,-0.149826,-0.255360,10.977181,1.865
2018208,14,1,2,0,2023-05-31 22:00:00,2023-05-31,151,22,31,3,...,0.907293,3.925916,-1.620587,-1.667414,-9.103232,-1.070073,-0.285407,-0.361982,0.000000,0.017


In [3]:
cv = MonthlyKFold(["2023-02", "2023-03", "2023-04", "2023-05"], max_offset_h=0)
cv

<predict_energy_behavior.cv.MonthlyKFold at 0x7f76e1c3da80>

In [4]:
features = [
      "temperature_historical",
      "dewpoint_historical",
      "rain_historical",
      "snowfall_historical",
      "cloudcover_total_historical",
      "cloudcover_low_historical",
      "cloudcover_mid_historical",
      "cloudcover_high_historical",
      "surface_solar_radiation_downwards_forecast",
      "windspeed_10m_historical",
      "10_metre_u_wind_component_historical",
      "10_metre_v_wind_component_historical",
      "fog_historical",
      # Historical diff 0h
      "diff_48h_temperature_historical",
      "diff_48h_dewpoint_historical",
      "diff_48h_rain_historical",
      "diff_48h_snowfall_historical",
      "diff_48h_cloudcover_total_historical",
      "diff_48h_cloudcover_low_historical",
      "diff_48h_cloudcover_mid_historical",
      "diff_48h_cloudcover_high_historical",
      "diff_48h_windspeed_10m_historical",
      "diff_48h_surface_solar_radiation_downwards_forecast",
      "diff_48h_10_metre_u_wind_component_historical",
      "diff_48h_10_metre_v_wind_component_historical",
      "diff_48h_fog_historical",
      # General
      "sin(hour)",
      "cos(hour)",
      "sin(dayofyear)",
      "cos(dayofyear)",
      # Target
      "target_per_capacity_48h"
]


In [5]:
df["target_per_capacity"] = np.where(
    df["installed_capacity"] > 0,
    df["target"]/df["installed_capacity"],
    df["target"]
)
df["target_per_capacity"].isna().any()

False

In [6]:
def replace_historical_with_forecast(df: pd.DataFrame) -> pd.DataFrame:
    features = df.columns
    historical_weather_features = [f for f in features if f.endswith("historical")]
    corresponding_weather_features = [
        f.replace("_historical", "_forecast") for f in historical_weather_features
    ]
    df = df.drop(columns=historical_weather_features)
    df = df.rename(
        {
            f_c: f_h
            for f_h, f_c in zip(
                historical_weather_features, corresponding_weather_features
            )
        },
        axis=1,
    )
    return df

In [8]:
for fold_name, df_train, df_val in cv.split(df):
    print(fold_name)
    
    #df_val = replace_historical_with_forecast(df_val[features].copy())

    #scaler = StandardScaler()
    #scaler.fit(df_train[features])
    
    #print("Transform")
    #X_train = scaler.transform(df_train[features])
    #X_val = scaler.transform(df_val[features])
    
    print("Fit")
    model = MLPRegressor(
        hidden_layer_sizes=[len(features)*2, len(features)*2]
    )
    model.fit(df_train[features], df_train["target_per_capacity"])
    
    predictions = model.predict(df_val[features]) * df_val["installed_capacity"]
    print(fold_name, mean_absolute_error(predictions, df_val["target"]))

f-2023-2
Fit
f-2023-2 17.410415314885796
f-2023-3
Fit
f-2023-3 61.97440633432489
f-2023-4
Fit
f-2023-4 99.83621630783871
f-2023-5
Fit
f-2023-5 93.16401111257055


In [8]:
counties = pd.read_parquet("/beegfs/ws/0/s4610340-energy_behavior/yahor/kaggle-predict_energy_behavior_of_prosumers/data/processed/prepare_stations/stations_with_weights.parquet")["county"].unique()
np.sort(counties)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])