In [42]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path

from predict_energy_behavior.models.joined_model import JoinedModel

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
# Load the engineered feature table, discard every row that has a missing
# value, and keep only the production rows (is_consumption == 0).
df = (
    pd.read_parquet(
        "/beegfs/ws/0/s4610340-energy_behavior/yahor/kaggle-predict_energy_behavior_of_prosumers/data/processed/train/make_features/df_features.parquet",
        engine="fastparquet",
    )
    .dropna()
    .loc[lambda features: features["is_consumption"] == 0]
)
df

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,datetime,date,dayofyear,hour,day,weekday,...,diff_168h_10_metre_u_wind_component_historical,diff_168h_10_metre_u_wind_component_forecast,diff_168h_10_metre_v_wind_component_historical,diff_168h_10_metre_v_wind_component_forecast,diff_168h_humidity_historical,diff_168h_humidity_forecast,diff_168h_fog_historical,diff_168h_fog_forecast,diff_168h_surface_solar_radiation_downwards_forecast,target
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47092,0,0,1,0,2021-09-17 02:00:00,2021-09-17,260,2,17,5,...,-0.253379,-8.680022,0.718309,0.305286,-9.630847,62.298077,-0.339087,-0.161518,0.000000,0.000
47214,0,0,1,0,2021-09-17 03:00:00,2021-09-17,260,3,17,5,...,0.453719,-4.382467,-0.185719,-0.006285,-11.749646,-19.800087,-0.437202,-0.448680,0.000000,0.000
47336,0,0,1,0,2021-09-17 04:00:00,2021-09-17,260,4,17,5,...,0.397895,-4.545776,0.987074,0.005153,-11.411714,-19.661541,-0.531233,-0.545194,-0.003299,0.000
47458,0,0,1,0,2021-09-17 05:00:00,2021-09-17,260,5,17,5,...,0.554464,-4.207518,-2.007018,-0.507044,-12.682578,-21.136198,-0.656662,-0.680925,-0.028096,0.011
47580,0,0,1,0,2021-09-17 06:00:00,2021-09-17,260,6,17,5,...,1.645593,-4.437342,-0.147766,-0.804386,-13.034556,-21.479017,-0.671132,-0.689885,-0.972631,0.035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017818,14,1,2,0,2023-05-31 19:00:00,2023-05-31,151,19,31,3,...,-0.298044,4.869689,0.113850,-1.154504,-2.619718,4.432365,-0.286587,-0.343460,11.855307,35.026
2017948,14,1,2,0,2023-05-31 20:00:00,2023-05-31,151,20,31,3,...,-0.665162,4.048730,0.498659,-1.076261,-3.987781,2.489141,-0.233383,-0.345996,16.528545,14.867
2018078,14,1,2,0,2023-05-31 21:00:00,2023-05-31,151,21,31,3,...,0.315522,3.727382,0.606830,-0.177199,-4.616146,3.756591,-0.101987,-0.230209,10.491406,1.865
2018208,14,1,2,0,2023-05-31 22:00:00,2023-05-31,151,22,31,3,...,0.510262,3.873753,-0.467229,-1.254119,-5.682425,1.990297,-0.142692,-0.284932,0.077438,0.017


In [44]:
# Target divided element-wise by the installed capacity of the same row.
df["target_per_capacity"] = df["target"].div(df["installed_capacity"])

In [54]:
# Summary statistics of the 24h-integrated historical snowfall feature over the rows currently in df.
# NOTE(review): the displayed minimum is about -1e-15, presumably floating-point residue from the
# integration rather than real negative snowfall -- confirm, and clip at 0 if the sign matters.
df["snowfall_integrated_24h_historical"].describe()

count    9.856300e+05
mean     4.030466e-01
std      3.141230e-01
min     -1.332268e-15
25%      1.700000e-01
50%      3.300000e-01
75%      5.500000e-01
max      4.010000e+00
Name: snowfall_integrated_24h_historical, dtype: float64

In [45]:
def _add_forecast_historical_pair(fig, frame, feature: str, row: int, opacity, showlegend: bool = False):
    """Add the blue `<feature>_forecast` and red `<feature>_historical` traces to subplot row `row`."""
    for suffix, color, legendgroup in (("forecast", "blue", 1), ("historical", "red", 2)):
        fig.add_trace(
            go.Scatter(
                x=frame["datetime"],
                y=frame[f"{feature}_{suffix}"],
                marker_color=color,
                name=suffix,
                legendgroup=legendgroup,
                showlegend=showlegend,
                opacity=opacity,
            ),
            row=row, col=1
        )


def viz_weather_and_predictions_for_month(df, year: int, month: int):
    """Plot one calendar month of weather, target and predictions, overlaid with the same month one year earlier.

    The figure has seven stacked subplots sharing the x axis: temperature, rain,
    snowfall, forecast surface solar radiation ("I"), target and predictions
    divided by installed capacity, and the 24h-integrated snowfall and rain.
    Rows of the requested month are drawn at full opacity; rows of the same
    month in the previous year are shifted forward by one year and drawn at
    half opacity on the same axes.

    Args:
        df: Frame with a `datetime` column plus the plotted columns:
            `<feature>_forecast` / `<feature>_historical` for temperature,
            rain, snowfall, snowfall_integrated_24h and rain_integrated_24h,
            `surface_solar_radiation_downwards_forecast`, `target`,
            `predictions` and `installed_capacity`.
        year: Calendar year of the month to plot.
        month: Calendar month (1-12) to plot.

    Returns:
        The plotly figure with all traces added and its height set to 800.

    Raises:
        KeyError: If one of the plotted columns (for example `predictions`)
            is missing from `df`.
    """
    t_start = pd.Timestamp(year=year, month=month, day=1)
    t_end = t_start + pd.DateOffset(months=1)
    one_year = pd.DateOffset(years=1)

    # Half-open window [t_start, t_end): the first sample of the month (00:00 on
    # day 1) belongs to the month, the first sample of the next month does not.
    timestamps = df["datetime"]
    df_this_year = df.loc[(timestamps >= t_start) & (timestamps < t_end)]
    df_last_year = df.loc[
        (timestamps >= (t_start - one_year)) &
        (timestamps < (t_end - one_year))
    ]
    # Shift last year's rows onto this year's time axis so both share the x range.
    df_last_year = df_last_year.assign(datetime=df_last_year["datetime"] + one_year)

    fig = make_subplots(
        cols=1,
        rows=7,
        # The integrated panels plot the *_integrated_24h_* columns, so they are titled 24h.
        subplot_titles=["Temperature", "Rain", "Snowfall", "I", "target", "snowfall int 24h", "rain int 24h"],
        shared_xaxes=True,
    )

    for frame, opacity in zip([df_this_year, df_last_year], [1, 0.5]):
        # Only the temperature pair contributes the forecast/historical legend entries.
        _add_forecast_historical_pair(fig, frame, "temperature", row=1, opacity=opacity, showlegend=True)
        _add_forecast_historical_pair(fig, frame, "rain", row=2, opacity=opacity)
        _add_forecast_historical_pair(fig, frame, "snowfall", row=3, opacity=opacity)

        # I: only a forecast column is plotted for solar radiation.
        fig.add_trace(
            go.Scatter(
                x=frame["datetime"],
                y=frame["surface_solar_radiation_downwards_forecast"],
                marker_color="blue",
                name="forecast",
                legendgroup=1,
                showlegend=False,
                opacity=opacity,
            ),
            row=4, col=1
        )

        # Target and predictions, both divided by installed capacity.
        for column, color in (("target", "green"), ("predictions", "orange")):
            fig.add_trace(
                go.Scatter(
                    x=frame["datetime"],
                    y=frame[column] / frame["installed_capacity"],
                    marker_color=color,
                    name=column,
                    showlegend=True,
                    opacity=opacity,
                ),
                row=5, col=1
            )

        _add_forecast_historical_pair(fig, frame, "snowfall_integrated_24h", row=6, opacity=opacity)
        _add_forecast_historical_pair(fig, frame, "rain_integrated_24h", row=7, opacity=opacity)

    fig.update_layout(height=800)

    return fig

In [46]:
# Load a previously saved JoinedModel from its output directory; the bare `model` line displays its repr.
# NOTE(review): the directory name ends in "f-2022-12" -- presumably the training cut-off or fold;
# confirm, because the April/May 2023 plots further down are only out-of-sample if that is the case.
model = JoinedModel.load(Path("/beegfs/ws/0/s4610340-energy_behavior/yahor/kaggle-predict_energy_behavior_of_prosumers/outputs/train/lgbm-wlf+diff-lgbms-submit/model_joined-f-2022-12"))
model

<predict_energy_behavior.models.joined_model.JoinedModel at 0x7f0fa9e70a00>

In [47]:
# Peek at the two private sub-models of the joined model; per the displayed reprs, `_model_p` comes
# from the production package and `_model_c` from the consumption package.
model._model_p, model._model_c

(<predict_energy_behavior.models.production.two_orders_regression.AvgTwoOrdersRegression at 0x7f0faa13ee90>,
 <predict_energy_behavior.models.consumption.lgbm_regression.LGBMOnMultiDiff at 0x7f0faa13d840>)

In [48]:
# Predict for every row of df and store the result next to the target.
# NOTE(review): df is passed with the `target` and `target_per_capacity` columns still present --
# assumes the model selects its own feature columns; confirm there is no leakage.
df["predictions"] = model.predict(df)

In [49]:
# Predictions divided element-wise by the installed capacity of the same row.
df["predictions_per_capacity"] = df["predictions"].div(df["installed_capacity"])

In [50]:
def calculate_dep(df, x: str, y: str, win, step, val_thr):
    """Rolling median and quartiles of column `y` along column `x` scaled by its maximum.

    `df` is re-indexed by `df[x] / df[x].max()` and sorted by that index, rows
    whose index is not strictly greater than `val_thr` are dropped, and rolling
    statistics of `y` are computed over `win` consecutive rows. Every
    `step`-th row of each result is kept.

    Returns:
        Tuple `(median, q25, q75)` of Series indexed by the scaled `x` values.
    """
    scaled_x = df[x] / df[x].max()
    ordered = df.set_index(scaled_x).sort_index()

    values = ordered[y]
    above_threshold = values.index > val_thr
    values = values.loc[above_threshold]

    # One rolling object serves all three statistics.
    windows = values.rolling(win)
    median = windows.median()[::step]
    upper_quartile = windows.quantile(0.75)[::step]
    lower_quartile = windows.quantile(0.25)[::step]
    return median, lower_quartile, upper_quartile

def explore_feature_vs_target_and_preds(df: pd.DataFrame, x: str, win=10, step=10, val_thr=0.0):
    """Scatter rolling medians of target and predictions per capacity against feature `x`.

    Both `target_per_capacity` and `predictions_per_capacity` are summarised
    with `calculate_dep` using the same `win`, `step` and `val_thr`. The target
    median is drawn as blue markers with a grey band between its rolling 25%
    and 75% quantiles; the prediction median is drawn as purple markers.

    Returns:
        A plotly `go.Figure` holding the four traces.
    """
    target_median, target_q25, target_q75 = calculate_dep(
        df, x, "target_per_capacity", win, step, val_thr
    )
    # Only the median of the predictions is plotted; its quartiles are discarded.
    prediction_median, _, _ = calculate_dep(
        df, x, "predictions_per_capacity", win, step, val_thr
    )

    target_markers = go.Scattergl(
        x=target_median.index,
        y=target_median,
        mode="markers",
        marker_color="blue",
        hoverinfo="skip",
        name="targets_per_capacity",
    )
    # Invisible lower edge of the inter-quartile band.
    band_lower_edge = go.Scatter(
        x=target_q25.index,
        y=target_q25,
        mode='lines',
        marker={"color": "#444"},
        line={"width": 0},
        showlegend=False,
    )
    # Invisible upper edge, filled down to the previous trace to draw the band.
    band_upper_edge = go.Scatter(
        x=target_q75.index,
        y=target_q75,
        mode='lines',
        marker={"color": "#444"},
        line={"width": 0},
        fill='tonexty',
        fillcolor='rgba(68, 68, 68, 0.3)',
        showlegend=False,
    )
    prediction_markers = go.Scattergl(
        x=prediction_median.index,
        y=prediction_median,
        mode="markers",
        marker_color="purple",
        hoverinfo="skip",
        name="predictions_per_capacity",
    )

    return go.Figure([target_markers, band_lower_edge, band_upper_edge, prediction_markers])


# Target and predictions per capacity against the forecast solar radiation,
# using a 150-row rolling window and keeping every 100th point.
explore_feature_vs_target_and_preds(df, "surface_solar_radiation_downwards_forecast", 150, 100)

In [51]:
# Rows to inspect: county 0, product type 1, business, production.
df_insp_p = df.loc[
    df["county"].eq(0)
    & df["product_type"].eq(1)
    & df["is_business"].eq(1)
    & df["is_consumption"].eq(0)
]
df_insp_p

Unnamed: 0_level_0,county,is_business,product_type,is_consumption,datetime,date,dayofyear,hour,day,weekday,...,diff_168h_10_metre_v_wind_component_forecast,diff_168h_humidity_historical,diff_168h_humidity_forecast,diff_168h_fog_historical,diff_168h_fog_forecast,diff_168h_surface_solar_radiation_downwards_forecast,target,target_per_capacity,predictions,predictions_per_capacity
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47100,0,1,1,0,2021-09-17 02:00:00,2021-09-17,260,2,17,5,...,0.305286,-9.630847,62.298077,-0.339087,-0.161518,0.000000,0.000,0.000000,2.123442e-14,1.504920e-17
47222,0,1,1,0,2021-09-17 03:00:00,2021-09-17,260,3,17,5,...,-0.006285,-11.749646,-19.800087,-0.437202,-0.448680,0.000000,0.000,0.000000,0.000000e+00,0.000000e+00
47344,0,1,1,0,2021-09-17 04:00:00,2021-09-17,260,4,17,5,...,0.005153,-11.411714,-19.661541,-0.531233,-0.545194,-0.003299,0.000,0.000000,-1.125397e-04,-7.975886e-08
47466,0,1,1,0,2021-09-17 05:00:00,2021-09-17,260,5,17,5,...,-0.507044,-12.682578,-21.136198,-0.656662,-0.680925,-0.028096,0.000,0.000000,8.151366e-05,5.777013e-08
47588,0,1,1,0,2021-09-17 06:00:00,2021-09-17,260,6,17,5,...,-0.804386,-13.034556,-21.479017,-0.671132,-0.689885,-0.972631,0.000,0.000000,4.476320e-04,3.172445e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017710,0,1,1,0,2023-05-31 19:00:00,2023-05-31,151,19,31,3,...,-1.406289,-1.086469,2.961163,-0.168319,-0.202967,32.079052,126.947,0.044054,1.443167e+02,5.008213e-02
2017840,0,1,1,0,2023-05-31 20:00:00,2023-05-31,151,20,31,3,...,-1.185897,-5.386113,0.446676,-0.199136,-0.242432,23.237328,27.563,0.009565,3.013972e+01,1.045937e-02
2017970,0,1,1,0,2023-05-31 21:00:00,2023-05-31,151,21,31,3,...,-0.819711,-5.548689,1.733130,-0.076652,-0.182087,12.428007,5.091,0.001767,6.247684e+00,2.168130e-03
2018100,0,1,1,0,2023-05-31 22:00:00,2023-05-31,151,22,31,3,...,-2.143498,-7.578453,0.846309,-0.046496,-0.255288,0.151417,0.028,0.000010,7.779489e-01,2.699711e-04


In [52]:
# April 2023 for the inspected rows, overlaid with the same month one year earlier.
viz_weather_and_predictions_for_month(df_insp_p, 2023, 4)

In [53]:
# May 2023 for the inspected rows, overlaid with the same month one year earlier.
viz_weather_and_predictions_for_month(df_insp_p, 2023, 5)