# 1. Setup

In [None]:
import json
from datetime import date, time

import colorcet as cc
import holidays
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.transforms as mt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from IPython.display import Image
from matplotlib.dates import DateFormatter, MonthLocator
from matplotlib.lines import Line2D
from matplotlib.ticker import MultipleLocator
from pandas.core.groupby.generic import DataFrameGroupBy
from prophet import Prophet
from scipy.ndimage import binary_dilation
from shapely.geometry import shape
from statsmodels.tsa.seasonal import MSTL, seasonal_decompose

In [None]:
RAND = 10  # NOTE(review): presumably a random seed; not used in this cell
RAW_DATA = "../raw_data/"

# Raw CSV tables, loaded unchanged; columns and dtypes are adjusted in the
# "Data Preparation" section.
train_df = pd.read_csv(RAW_DATA + "train.csv")
gas_prices_df = pd.read_csv(RAW_DATA + "gas_prices.csv")
client_df = pd.read_csv(RAW_DATA + "client.csv")
electricity_prices_df = pd.read_csv(RAW_DATA + "electricity_prices.csv")
forecast_weather_df = pd.read_csv(RAW_DATA + "forecast_weather.csv")
historical_weather_df = pd.read_csv(RAW_DATA + "historical_weather.csv")
station_county_mapping = pd.read_csv(
    RAW_DATA + "weather_station_to_county_mapping.csv"
)

# County lookup read from JSON as a Series, with the names lower-cased.
county_id_to_name_map = pd.read_json(
    RAW_DATA + "county_id_to_name_map.json", typ="series"
)
county_id_to_name_map = county_id_to_name_map.str.lower()

# External file with Estonian counties boundaries for visualisation
with open("../additional_data/estonia.geojson", encoding="utf-8") as f:
    estonia_geojson = json.load(f)

In [None]:
# Lookup tables that translate the coded values of each categorical column
# into a readable label.
CATEGORICAL_DICT = dict(
    county=county_id_to_name_map,
    is_business={0: "not_business", 1: "business"},
    is_consumption={0: "production", 1: "consumption"},
    product_type={0: "combined", 1: "fixed", 2: "general_service", 3: "spot"},
)

# 16 colours taken from the glasbey palette in a hand-picked order
# (the numbers are positions in cc.glasbey).
PALETTE = sns.color_palette(
    [
        cc.glasbey[i]
        for i in (0, 1, 2, 3, 8, 5, 6, 7, 4, 12, 10, 11, 9, 13, 14, 15)
    ]
).as_hex()

# Column groups, each one extending the previous by a single column.
SEGMENT_C = ["county", "product_type", "is_business"]
CATEGORICAL_C = [*SEGMENT_C, "is_consumption"]
TARGET_C = [*CATEGORICAL_C, "datetime"]

In [None]:
def _format_float(value):
    """Format a float for display.

    Non-zero values with magnitude below 0.01 use scientific notation with
    two decimals; everything else uses fixed-point with two decimals.
    """
    if value != 0 and abs(value) < 0.01:
        return f"{value:.2e}"
    return f"{value:.2f}"


pd.set_option("display.float_format", _format_float)
# None removes the limit on the number of displayed columns.
pd.set_option("display.max_columns", None)

In [None]:
# Apply seaborn's "whitegrid" style to the plots drawn after this cell.
sns.set_style(style="whitegrid")

# 2. Data Preparation

In [None]:
# train
# Keep the listed columns (in this order) and cast the target, block id and
# timestamp to compact / proper dtypes.
train_df = train_df.loc[
    :,
    [
        "county",
        "product_type",
        "is_business",
        "is_consumption",
        "datetime",
        "target",
        "data_block_id",
    ],
].astype(
    dict(
        target="float32",
        data_block_id="uint16",
        datetime="datetime64[ns]",
    )
)

In [None]:
# gas_prices
# Select the three needed columns directly instead of first dropping
# "origin_date"/"forecast_date": the result is the same, and the cell can be
# re-executed without a KeyError for columns that are already gone.
gas_prices_df = gas_prices_df[
    [
        "data_block_id",
        "lowest_price_per_mwh",
        "highest_price_per_mwh",
    ]
].astype(
    {
        "data_block_id": "uint16",
        "lowest_price_per_mwh": "float32",
        "highest_price_per_mwh": "float32",
    }
)

In [None]:
# client
# Keep the listed columns (in this order), parse the date and narrow the
# numeric dtypes.
client_df = client_df.loc[
    :,
    [
        "county",
        "product_type",
        "is_business",
        "date",
        "eic_count",
        "installed_capacity",
        "data_block_id",
    ],
].astype(
    dict(
        date="datetime64[ns]",
        eic_count="uint32",
        installed_capacity="float32",
        data_block_id="uint16",
    )
)

In [None]:
# electricity_prices
# Parse the dates, then build "electricity_datetime", the key the merge step
# matches against the train "datetime" column.
electricity_prices_df = (
    electricity_prices_df.astype(
        {
            "origin_date": "datetime64[ns]",
            "forecast_date": "datetime64[ns]",
            "euros_per_mwh": "float32",
            "data_block_id": "uint16",
        }
    )
    # The column has to be derived from an existing one: the frame has no
    # "electricity_datetime" at this point, so reading it raised KeyError.
    # NOTE(review): assumes a price published at origin_date applies to the
    # train rows two days later (same data_block_id) — confirm against the
    # raw file.
    .assign(
        electricity_datetime=lambda x: x["origin_date"] + pd.Timedelta(2, "d")
    )
    .loc[:, ["electricity_datetime", "euros_per_mwh", "data_block_id"]]
)

In [None]:
# forecast_weather
# Keep the listed columns and store them compactly: coordinates as integer
# tenths of a degree, cloud cover scaled by 100 into uint8, the remaining
# measurements as float32.
# NOTE: re-running this cell without reloading the raw CSV scales the
# coordinates and cloud cover a second time.
cloudcover_c = [
    "cloudcover_low",
    "cloudcover_mid",
    "cloudcover_high",
    "cloudcover_total",
]

# Explicit copy so the column assignments below act on an independent frame
# (avoids pandas' SettingWithCopyWarning on a column slice).
forecast_weather_df = forecast_weather_df[
    [
        "latitude",
        "longitude",
        "origin_datetime",
        "hours_ahead",
        "forecast_datetime",
        "data_block_id",
        "temperature",
        "dewpoint",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "cloudcover_total",
        "10_metre_u_wind_component",
        "10_metre_v_wind_component",
        "direct_solar_radiation",
        "surface_solar_radiation_downwards",
        "snowfall",
        "total_precipitation",
    ]
].copy()

# Scale first, round afterwards. Rounding before scaling leaves products such
# as 0.29 * 100 == 28.999999999999996, which the unsigned-integer cast below
# would truncate to 28 instead of 29.
forecast_weather_df[["latitude", "longitude"]] = (
    forecast_weather_df[["latitude", "longitude"]].mul(10).round()
)
forecast_weather_df[cloudcover_c] = (
    forecast_weather_df[cloudcover_c].mul(100).round()
)

forecast_weather_df = forecast_weather_df.astype(
    {
        "latitude": "uint16",
        "longitude": "uint16",
        "origin_datetime": "datetime64[ns]",
        "forecast_datetime": "datetime64[ns]",
        "data_block_id": "uint16",
        "temperature": "float32",
        "dewpoint": "float32",
        "cloudcover_low": "uint8",
        "cloudcover_mid": "uint8",
        "cloudcover_high": "uint8",
        "cloudcover_total": "uint8",
        "10_metre_u_wind_component": "float32",
        "10_metre_v_wind_component": "float32",
        "direct_solar_radiation": "float32",
        "surface_solar_radiation_downwards": "float32",
        "snowfall": "float32",
        "total_precipitation": "float32",
    }
)
# Lead time as a timedelta (the raw values are interpreted as hours).
forecast_weather_df["hours_ahead"] = pd.to_timedelta(
    forecast_weather_df["hours_ahead"], "h"
)

In [None]:
# historical_weather
# Keep the listed columns, store coordinates as integer tenths of a degree
# and narrow the remaining dtypes.
# NOTE: re-running this cell without reloading the raw CSV scales the
# coordinates a second time and drops two further rows.
# Explicit copy so the column assignment below acts on an independent frame
# (avoids pandas' SettingWithCopyWarning on a column slice).
historical_weather_df = historical_weather_df[
    [
        "latitude",
        "longitude",
        "datetime",
        "data_block_id",
        "temperature",
        "dewpoint",
        "rain",
        "snowfall",
        "surface_pressure",
        "cloudcover_total",
        "cloudcover_low",
        "cloudcover_mid",
        "cloudcover_high",
        "windspeed_10m",
        "winddirection_10m",
        "shortwave_radiation",
        "direct_solar_radiation",
        "diffuse_radiation",
    ]
].copy()

# Change data types to reduce memory usage
# Scale first, round afterwards: a product that lands just below a whole
# number would otherwise be truncated by the uint16 cast.
historical_weather_df[["latitude", "longitude"]] = (
    historical_weather_df[["latitude", "longitude"]].mul(10).round()
)
historical_weather_df = historical_weather_df.astype(
    {
        "latitude": "uint16",
        "longitude": "uint16",
        "datetime": "datetime64[ns]",
        "data_block_id": "uint16",
        "temperature": "float32",
        "dewpoint": "float32",
        "rain": "float32",
        "snowfall": "float32",
        "surface_pressure": "float32",
        "cloudcover_total": "uint8",
        "cloudcover_low": "uint8",
        "cloudcover_mid": "uint8",
        "cloudcover_high": "uint8",
        "windspeed_10m": "float32",
        "winddirection_10m": "uint16",
        "shortwave_radiation": "uint16",
        "direct_solar_radiation": "uint16",
        "diffuse_radiation": "uint16",
    }
)

# NOTE(review): hard-coded row labels of the raw CSV; the reason for removing
# these two rows is not recorded here — confirm, and prefer selecting them by
# a condition so the cell survives a change in the input file.
hw_to_drop = [1176339, 1176343]
historical_weather_df = historical_weather_df.drop(
    index=hw_to_drop
).reset_index(drop=True)

In [None]:
# county_id_to_name
# Lower-case the county names.
# NOTE(review): the Series is already lower-cased when it is loaded in the
# setup section, so repeating the call here has no further effect.
county_id_to_name_map = county_id_to_name_map.str.lower()

In [None]:
# station_county
# Built as a single chain on a fresh frame, so no values are assigned onto a
# column slice of the raw table.
# NOTE: the final rename means this cell cannot be re-run without reloading
# the raw CSV ("county_name" no longer exists afterwards).
station_county_mapping = (
    station_county_mapping[["latitude", "longitude", "county_name", "county"]]
    # Rows without a county get the placeholder name "unknown" and id 12.
    .fillna({"county_name": "unknown", "county": 12})
    .assign(
        # Integer tenths of a degree. Scale first and round afterwards so a
        # product that lands just below a whole number is not truncated by
        # the uint16 cast.
        latitude=lambda x: x["latitude"].mul(10).round(),
        longitude=lambda x: x["longitude"].mul(10).round(),
        county_name=lambda x: x["county_name"].str.lower(),
    )
    .astype(
        {
            "latitude": "uint16",
            "longitude": "uint16",
            "county_name": "category",
            "county": "uint8",
        }
    )
    # Second cast: presumably so the category labels are the uint8 ids and
    # not the original values.
    .astype({"county": "category"})
    .sort_values(["latitude", "longitude"], ignore_index=True)
    # After the rename, "county" holds the name and "county_index" the id.
    .rename(
        columns={
            "county_name": "county",
            "county": "county_index",
        }
    )
)

In [None]:
# Merging
# NOTE(review): `na_datetimes` is not created by the setup or preparation
# cells above, so it has to be defined before this cell is executed. The
# indexing below assumes four chronologically ordered timestamps alternating
# autumn, spring, autumn, spring — confirm against its definition.

# Drop spring NaNs and impute autumn NaNs with interpolated values.
# The interpolation runs separately for every series (one per combination of
# the CATEGORICAL_C columns), so a gap is filled from neighbouring rows of
# the same series and not from whichever rows sit next to it in the frame.
df = train_df.loc[~train_df["datetime"].isin(na_datetimes[1::2])].assign(
    target=lambda x: x.groupby(CATEGORICAL_C)["target"].transform(
        lambda s: s.interpolate()
    )
)

# Add a flag indicating Daylight Saving Time: False inside the two
# [na_datetimes[0], na_datetimes[1]) and [na_datetimes[2], na_datetimes[3])
# windows, True everywhere else.
df["dst"] = ~(
    ((df.datetime >= na_datetimes[0]) & (df.datetime < na_datetimes[1]))
    | ((df.datetime >= na_datetimes[2]) & (df.datetime < na_datetimes[3]))
)

# Client attributes: left join, so train rows without a matching client
# record are kept (with NaN in the client columns).
df = df.merge(
    right=client_df.drop(columns=["date"]),
    how="left",
    on=["county", "product_type", "is_business", "data_block_id"],
)

# Prices: inner joins (the pandas default, made explicit here), so rows
# without a matching gas or electricity price are dropped.
# NOTE(review): confirm that dropping those rows is intended, given that the
# client merge above keeps unmatched rows.
df = df.merge(
    right=gas_prices_df,
    how="inner",
    on=["data_block_id"],
)

df = df.merge(
    right=electricity_prices_df,
    how="inner",
    left_on=["datetime", "data_block_id"],
    right_on=["electricity_datetime", "data_block_id"],
).drop(columns=["electricity_datetime"])