In [32]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import VotingRegressor

In [110]:
df = pd.read_parquet(
    "/beegfs/ws/0/s4610340-energy_behavior/yahor/kaggle-predict_energy_behavior_of_prosumers/data/processed/train/make_features/df_features.parquet",
    engine="fastparquet"
).dropna()
df_c = df.loc[df["is_consumption"]==1].copy()

In [111]:
df_c["target_per_eic"] = np.where(
    df_c["eic_count"] > 0,
    df_c["target"] / df_c["eic_count"],
    df_c["target"],
)

df_c["target_per_eic_diff_48h"] = df_c["target_per_eic"] - df_c["target_per_eic_48h"]
df_c["target_per_eic_diff_168h"] = df_c["target_per_eic"] - df_c["target_per_eic_168h"]

lags = [d*24 for d in (2,3,4,5,6,7,9,10)]
for lag in lags:
    df_c[f"target_per_eic_diff_48h_lag{lag}h"] = df_c[f"target_per_eic_{lag}h"] - df_c[f"target_per_eic_{lag+48}h"]

df_c["target_per_eic_diff_48h_median"] = df_c[[f"target_per_eic_diff_48h_lag{lag}h" for lag in lags]].median(axis=1)
df_c["target_per_eic_diff_48h_q25"] = df_c[[f"target_per_eic_diff_48h_lag{lag}h" for lag in lags]].quantile(0.25, axis=1)
df_c["target_per_eic_diff_48h_q75"] = df_c[[f"target_per_eic_diff_48h_lag{lag}h" for lag in lags]].quantile(0.75, axis=1)

lags_168 = [d*24 for d in (2,3,4,5,6,7)]
for lag in lags_168:
    df_c[f"target_per_eic_diff_168h_lag{lag}h"] = df_c[f"target_per_eic_{lag}h"] - df_c[f"target_per_eic_{lag+168}h"]

df_c["target_per_eic_diff_168h_median"] = df_c[[f"target_per_eic_diff_168h_lag{lag}h" for lag in lags_168]].median(axis=1)
df_c["target_per_eic_diff_168h_q25"] = df_c[[f"target_per_eic_diff_168h_lag{lag}h" for lag in lags_168]].quantile(0.25, axis=1)
df_c["target_per_eic_diff_168h_q75"] = df_c[[f"target_per_eic_diff_168h_lag{lag}h" for lag in lags_168]].quantile(0.75, axis=1)

weather_features = [
    "temperature_historical",
    "dewpoint_historical",
    "rain_historical",
    "snowfall_historical",
    "cloudcover_total_historical",
    "cloudcover_low_historical",
    "cloudcover_mid_historical",
    "cloudcover_high_historical",
    "windspeed_10m_historical",
    "surface_solar_radiation_downwards_forecast",
    "10_metre_u_wind_component_historical",
    "10_metre_v_wind_component_historical",
    "humidity_historical",
]

for weather_feature in weather_features:
    df_c[f"diff_48h_{weather_feature}"] = df_c[weather_feature] - df_c[f"{weather_feature}_h48"]
    df_c[f"diff_168h_{weather_feature}"] = df_c[weather_feature] - df_c[f"{weather_feature}_h168"]


In [112]:
df_c_train = df_c.loc[df_c["datetime"] < pd.Timestamp("2023-02-01")]
df_c_val = df_c.loc[df_c["datetime"] > pd.Timestamp("2023-02-01")]

In [115]:
y_c_train = df_c_train["target_per_eic_diff_48h"]
y_c_val = df_c_val["target_per_eic_diff_48h"]

features_48 = [
    # "predictions_production",
    "product_type",
    "is_business",
    "weekday",
    "month",
    "dayofyear",
    "hour",
    "eic_count",
    "is_country_holiday",
    # Historical diff 0h
    "diff_48h_temperature_historical",
    "diff_48h_dewpoint_historical",
    "diff_48h_rain_historical",
    "diff_48h_snowfall_historical",
    "diff_48h_cloudcover_total_historical",
    "diff_48h_windspeed_10m_historical",
    "diff_48h_surface_solar_radiation_downwards_forecast",
    "diff_48h_10_metre_u_wind_component_historical",
    "diff_48h_10_metre_v_wind_component_historical",
    "diff_48h_humidity_historical",
    # # Historical diff 48h
    # "diff_48h_lag_48h_temperature_historical",
    # "diff_48h_lag_48h_dewpoint_historical",
    # "diff_48h_lag_48h_rain_historical",
    # "diff_48h_lag_48h_snowfall_historical",
    # "diff_48h_lag_48h_cloudcover_total_historical",
    # "diff_48h_lag_48h_windspeed_10m_historical",
    # "diff_48h_lag_48h_surface_solar_radiation_downwards_forecast",
    # "diff_48h_lag_48h_10_metre_u_wind_component_historical",
    # "diff_48h_lag_48h_10_metre_v_wind_component_historical",
    # "diff_48h_lag_48h_humidity_historical",
    # Target features
    "target_per_eic_diff_48h_lag48h",
    "target_per_eic_diff_48h_lag72h",
    "target_per_eic_diff_48h_lag96h",
    "target_per_eic_diff_48h_lag120h",
    "target_per_eic_diff_48h_lag144h",
    "target_per_eic_diff_48h_lag168h",
    "target_per_eic_diff_48h_median",
    "target_per_eic_diff_48h_q25",
    "target_per_eic_diff_48h_q75",
]

model_48 = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.06,
    colsample_bytree=0.9,
    colsample_bynode=0.6,
    reg_alpha=3.5,
    reg_lambda=1.5,
    max_depth=16,
    num_leaves=500,
    min_child_samples=50,
    objective="regression_l1",
    device="cpu",
    n_jobs=12,
    verbosity=0,
    gpu_device_id=2,
    random_state=333,
)

model_48.fit(df_c_train[features_48], y_c_train)

predicted_diff_48h = model_48.predict(df_c_val[features_48])
predicted_target_48h = (predicted_diff_48h + df_c_val["target_per_eic_48h"]) * df_c_val["eic_count"]

true_diff_48h = y_c_val
true_target_48h = np.where(
    df_c_val["eic_count"] > 0,
    (true_diff_48h + df_c_val["target_per_eic_48h"]) * df_c_val["eic_count"],
    true_diff_48h + df_c_val["target_per_eic_48h"]
)

mean_absolute_error(true_target_48h, predicted_target_48h)

60.828268316861255

### Model diff with 7d ago

In [116]:
y_c_train = df_c_train["target_per_eic_diff_168h"]
y_c_val = df_c_val["target_per_eic_diff_168h"]

features_168 = [
    # "predictions_production",
    "product_type",
    "is_business",
    "weekday",
    "month",
    "dayofyear",
    "hour",
    "eic_count",
    "is_country_holiday",
    # Historical diff 0h
    "diff_168h_temperature_historical",
    "diff_168h_dewpoint_historical",
    "diff_168h_rain_historical",
    "diff_168h_snowfall_historical",
    "diff_168h_cloudcover_total_historical",
    "diff_168h_windspeed_10m_historical",
    "diff_168h_surface_solar_radiation_downwards_forecast",
    "diff_168h_10_metre_u_wind_component_historical",
    "diff_168h_10_metre_v_wind_component_historical",
    "diff_168h_humidity_historical",
    # # Historical diff 48h
    # "diff_48h_lag_48h_temperature_historical",
    # "diff_48h_lag_48h_dewpoint_historical",
    # "diff_48h_lag_48h_rain_historical",
    # "diff_48h_lag_48h_snowfall_historical",
    # "diff_48h_lag_48h_cloudcover_total_historical",
    # "diff_48h_lag_48h_windspeed_10m_historical",
    # "diff_48h_lag_48h_surface_solar_radiation_downwards_forecast",
    # "diff_48h_lag_48h_10_metre_u_wind_component_historical",
    # "diff_48h_lag_48h_10_metre_v_wind_component_historical",
    # "diff_48h_lag_48h_humidity_historical",
    # Target features
    "target_per_eic_diff_168h_lag48h",
    "target_per_eic_diff_168h_lag72h",
    "target_per_eic_diff_168h_lag96h",
    "target_per_eic_diff_168h_lag120h",
    "target_per_eic_diff_168h_lag144h",
    "target_per_eic_diff_168h_lag168h",
    "target_per_eic_diff_168h_median",
    "target_per_eic_diff_168h_q25",
    "target_per_eic_diff_168h_q75",
    # "target_per_eic_diff_48h_lag48h",
    # "target_per_eic_diff_48h_lag72h",
    # "target_per_eic_diff_48h_lag96h",
    # "target_per_eic_diff_48h_lag120h",
    # "target_per_eic_diff_48h_lag144h",
    # "target_per_eic_diff_48h_lag168h",
    # "target_per_eic_diff_48h_median",
    # "target_per_eic_diff_48h_q25",
    # "target_per_eic_diff_48h_q75",
]

model_168 = lgb.LGBMRegressor(
    n_estimators=500,
    learning_rate=0.06,
    colsample_bytree=0.9,
    colsample_bynode=0.6,
    reg_alpha=3.5,
    reg_lambda=1.5,
    max_depth=16,
    num_leaves=500,
    min_child_samples=50,
    objective="regression_l1",
    device="cpu",
    n_jobs=12,
    verbosity=0,
    gpu_device_id=2,
    random_state=333,
)

model_168.fit(df_c_train[features_168], y_c_train)

predicted_diff_168h = model_168.predict(df_c_val[features_168])
predicted_target_168h = (predicted_diff_168h + df_c_val["target_per_eic_168h"]) * df_c_val["eic_count"]

true_diff_168h = y_c_val
true_target_168h = np.where(
    df_c_val["eic_count"] > 0,
    (true_diff_168h + df_c_val["target_per_eic_168h"]) * df_c_val["eic_count"],
    true_diff_168h + df_c_val["target_per_eic_168h"]
)

mean_absolute_error(true_target_168h, predicted_target_168h)


62.756836927224334

In [117]:
assert np.allclose(true_target_168h,true_target_48h, rtol=0.01)

In [118]:
predictions_avg = (predicted_target_168h + predicted_target_48h) / 2
mean_absolute_error(predictions_avg, true_target_48h)

55.77619076266462