# 1. Setup

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.ndimage import binary_dilation

In [None]:
# Make the parent of the current working directory importable; the
# project-local `utils` package imported further down is presumably located
# there. The membership check keeps the cell idempotent: re-running it no
# longer appends a duplicate entry to sys.path each time.
PROJECT_ROOT = str(Path().resolve().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
def _format_float(value):
    """Render a float for DataFrame display.

    Non-zero values whose magnitude is below 0.01 are shown in scientific
    notation with two decimals so they do not collapse to "0.00"; everything
    else (including exact zero) is shown in fixed-point with two decimals.
    """
    if value != 0 and abs(value) < 0.01:
        return f"{value:.2e}"
    return f"{value:.2f}"


# Display configuration for every DataFrame rendered in this notebook.
pd.set_option("display.float_format", _format_float)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

In [None]:
# Column groups reused throughout the notebook. Each list extends the
# previous one, so they are built by unpacking rather than repeating names.
SEGMENT_C = ["county", "product_type", "is_business"]
# Grouping key for the moving-average features.
CATEGORICAL_C = [*SEGMENT_C, "is_consumption"]
# Join key used when merging lag / moving-average features back onto `df`.
TARGET_C = [*CATEGORICAL_C, "datetime"]

# NOTE(review): presumably a random seed; it is not used in the visible cells.
RAND = 10

In [None]:
# Input directories, relative to the notebook's working directory.
# Both are passed to `load_all_raw_data` in the processing section.
ADDITIONAL_DATA_PATH = "../additional_data/"
RAW_DATA_PATH = "../raw_data/"

In [None]:
from utils.loading import load_all_raw_data
from utils.preprocessing import process_all_dfs
from utils.merging import merge_all_dfs

In [None]:
from utils.feature_engineering import (
    get_lag,
    get_moving_average,
    add_cyclic_datetime_features,
)

# 2. Processing

In [None]:
# Load the raw tables from the two data directories and pass them straight
# through the project's preprocessing step. The result is a mapping keyed by
# table name (its keys are displayed in the next cell).
processed_dfs = process_all_dfs(
    load_all_raw_data(RAW_DATA_PATH, ADDITIONAL_DATA_PATH)
)

In [None]:
# Show which tables are available after preprocessing.
processed_dfs.keys()

dict_keys(['train', 'gas_prices', 'client', 'electricity_prices', 'forecast_weather', 'historical_weather', 'station_county_mapping', 'county_id_to_name_map', 'holidays'])

In [None]:
# Combine the processed tables into the single modelling frame `df`; the
# join logic lives in `utils.merging.merge_all_dfs`.
df = merge_all_dfs(processed_dfs)

In [None]:
# Attach one lagged-target column per lag (they appear in the preview below
# as `2d_lag_target`, `3d_lag_target`, `7d_lag_target`). The join is an inner
# join on the full series key + datetime, so rows of `df` with no matching
# lagged row are dropped.
for lag in (2, 3, 7):
    df = df.merge(
        get_lag(processed_dfs["train"][[*TARGET_C, "target"]], lag=lag),
        how="inner",
        on=TARGET_C,
    )

In [None]:
# Add cyclic encodings of the datetime components; presumably the source of
# the `*_sin` / `*_cos` columns visible in the preview below.
df = add_cyclic_datetime_features(df)

In [None]:
# Group the training rows per series (county / product_type / is_business /
# is_consumption), ordered by datetime. This grouped view does not depend on
# the window size, so it is built once here instead of re-indexing and
# re-sorting the whole training table on every loop iteration.
train_by_series = (
    processed_dfs["train"]
    .set_index("datetime")
    .sort_index()
    .groupby(CATEGORICAL_C, observed=True, as_index=False)
)

# Window sizes of 1, 3 and 7 days expressed in hours (the resulting columns
# appear in the preview below as `24h_ma_...`, `72h_ma_...`, `168h_ma_...`).
# Rows with an undefined moving average are dropped before the inner join on
# the series key + datetime, so unmatched rows of `df` are removed as well.
for window in [24, 24 * 3, 24 * 7]:
    df = df.merge(
        get_moving_average(train_by_series, window=window).dropna(),
        on=TARGET_C,
    )

# Release the sorted copy of the training table; it is only needed above.
del train_by_series

In [None]:
# Preview the first rows of the assembled feature frame.
df.head()

Unnamed: 0,county,product_type,is_business,is_consumption,datetime,target,data_block_id,date,dst,eic_count,installed_capacity,lowest_price_per_mwh,highest_price_per_mwh,euros_per_mwh,f1_temperature,f1_dewpoint,f1_snowfall_mm,f1_total_precipitation_mm,f1_cloudcover_low,f1_cloudcover_mid,f1_cloudcover_high,f1_cloudcover_total,f1_u_component,f1_v_component,f1_windspeed,f1_direct_solar_radiation,f1_surface_solar_radiation_downwards,h2_temperature,h2_dewpoint,h2_snowfall_mm,h2_rain_mm,h2_surface_pressure,h2_cloudcover_low,h2_cloudcover_mid,h2_cloudcover_high,h2_cloudcover_total,h2_windspeed,h2_u_component,h2_v_component,h2_shortwave_radiation,h2_direct_solar_radiation,h2_diffuse_radiation,holiday_type,2d_lag_target,3d_lag_target,7d_lag_target,hour_sin,hour_cos,weekday_sin,weekday_cos,day_of_month_sin,day_of_month_cos,month_sin,month_cos,day_of_year_sin,day_of_year_cos,week_of_year_sin,week_of_year_cos,quarter_sin,quarter_cos,24h_ma_2d_lag_target,72h_ma_2d_lag_target,168h_ma_2d_lag_target
0,0,1,False,False,2021-09-10,0.04,9,2021-09-10,True,108,952.89,44.96,47.72,96.48,14.29,13.54,0.0,0.0,52,6,15,56,2.03,0.53,2.13,0.0,0.0,14.3,12.03,0.0,0.0,1009.88,53,82,14,88,5.47,4.89,2.41,0,0,0,ordinary_day,0.04,0.001,0.79,0.0,1.0,-0.43,-0.9,0.88,-0.48,-1.0,-1.84e-16,-0.94,-0.35,-0.94,-0.35,-1.0,-1.84e-16,57.95,76.7,96.64
1,0,1,False,True,2021-09-10,90.14,9,2021-09-10,True,108,952.89,44.96,47.72,96.48,14.29,13.54,0.0,0.0,52,6,15,56,2.03,0.53,2.13,0.0,0.0,14.3,12.03,0.0,0.0,1009.88,53,82,14,88,5.47,4.89,2.41,0,0,0,ordinary_day,106.73,106.91,107.13,0.0,1.0,-0.43,-0.9,0.88,-0.48,-1.0,-1.84e-16,-0.94,-0.35,-0.94,-0.35,-1.0,-1.84e-16,87.98,91.72,89.49
2,0,2,False,False,2021-09-10,0.0,9,2021-09-10,True,17,166.4,44.96,47.72,96.48,14.29,13.54,0.0,0.0,52,6,15,56,2.03,0.53,2.13,0.0,0.0,14.3,12.03,0.0,0.0,1009.88,53,82,14,88,5.47,4.89,2.41,0,0,0,ordinary_day,0.0,0.0,0.0,0.0,1.0,-0.43,-0.9,0.88,-0.48,-1.0,-1.84e-16,-0.94,-0.35,-0.94,-0.35,-1.0,-1.84e-16,10.79,16.21,20.14
3,0,2,False,True,2021-09-10,17.02,9,2021-09-10,True,17,166.4,44.96,47.72,96.48,14.29,13.54,0.0,0.0,52,6,15,56,2.03,0.53,2.13,0.0,0.0,14.3,12.03,0.0,0.0,1009.88,53,82,14,88,5.47,4.89,2.41,0,0,0,ordinary_day,20.43,20.73,19.63,0.0,1.0,-0.43,-0.9,0.88,-0.48,-1.0,-1.84e-16,-0.94,-0.35,-0.94,-0.35,-1.0,-1.84e-16,16.76,17.53,16.75
4,0,3,False,False,2021-09-10,0.62,9,2021-09-10,True,687,7199.88,44.96,47.72,96.48,14.29,13.54,0.0,0.0,52,6,15,56,2.03,0.53,2.13,0.0,0.0,14.3,12.03,0.0,0.0,1009.88,53,82,14,88,5.47,4.89,2.41,0,0,0,ordinary_day,0.68,1.5,0.98,0.0,1.0,-0.43,-0.9,0.88,-0.48,-1.0,-1.84e-16,-0.94,-0.35,-0.94,-0.35,-1.0,-1.84e-16,449.11,617.1,769.3


In [None]:
# df.shape

In [None]:
# plt.figure(figsize=(18, 11))
# sns.heatmap(
#     df.drop(
#         columns=[
#             "county",
#             "product_type",
#             "datetime",
#             "data_block_id",
#             "date",
#             "holiday_type"
#         ]
#     ).corr(),
#     annot=True,
#     fmt=".1f",
#     annot_kws={"size": 7}
# )
# plt.show()