# Create a model to predict the weather next year using trigonometric features and lags

## Init, Load

In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from src.config import DATA_RAW_DIR, OPENMETEO_WEATHER_FILENAME
from src.transformation import prepare_aggregate_openmeteo_data

MAX_LAG_DAYS = 3

In [None]:
# Load the raw weather file: semicolon-separated, with the "timestamp"
# column used as the index.
raw_weather_path = os.path.join(DATA_RAW_DIR, OPENMETEO_WEATHER_FILENAME)
df_raw = pd.read_csv(raw_weather_path, sep=";", index_col=["timestamp"])

# Quick look at the first rows.
display(df_raw.head(n=5))

In [None]:
# Weather-type columns handed to the aggregation as `mandatory_weather_columns`.
weather_type_columns = [
    "clear_sky",
    "cloudy",
    "drizzle",
    "rain",
    "solid_precipitation",
]

df_agg = prepare_aggregate_openmeteo_data(
    df_raw.reset_index(),  # "timestamp" goes back from index to regular column
    weather_column="weather_description",
    mandatory_weather_columns=weather_type_columns,
)
display(df_agg.head(n=5))

## Train, Test Split

In [None]:
# Hold out the last HOLDOUT_DAYS rows (one year, assuming one row per day) and
# leave a gap of MAX_LAG_DAYS rows between the train and the test frame.
#
# `split_point` is a row *position* derived from len(df_agg), so the frames are
# cut with positional `.iloc`. The previous label-based `.loc[:split_point]`
# is end-inclusive; on a default 0..n-1 index that shifted both boundaries by
# one row and left only 364 holdout rows.
HOLDOUT_DAYS = 365
split_point = len(df_agg) - HOLDOUT_DAYS - MAX_LAG_DAYS

Xy_train = df_agg.iloc[:split_point]
Xy_test = df_agg.iloc[split_point + MAX_LAG_DAYS :]

display(Xy_train.tail(5))
display(Xy_test.head(5))

In [None]:
# Columns the model predicts: the measurement columns first, followed by the
# weather-type columns.
measurement_targets = [
    "sunshine_duration",
    "direct_radiation",
    "cloud_cover",
    "snow_depth",
    "is_day",
]
weather_type_targets = [
    "clear_sky",
    "cloudy",
    "drizzle",
    "rain",
    "solid_precipitation",
]
target_columns = measurement_targets + weather_type_targets

## Feature Engineering

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from src.transformers import DayOfYearTransformer

### Add Features

#### Lag Features

In [None]:
from src.transformers import LagFeatureTransformer

# Lags 1..MAX_LAG_DAYS, with the transformer configured for pandas output.
lag_steps = list(range(1, MAX_LAG_DAYS + 1))
lft = LagFeatureTransformer(lags=lag_steps)
lft.set_output(transform="pandas")

# Fit on the training targets; the test targets are only transformed.
train_lags = lft.fit_transform(Xy_train[target_columns], y=None)
test_lags = lft.transform(Xy_test[target_columns])

# Features: the "date" column next to the transformer output.
X_train = pd.concat([Xy_train[["date"]], train_lags], axis=1)
X_test = pd.concat([Xy_test[["date"]], test_lags], axis=1)

# Targets: the "date" column next to the untransformed target columns.
y_train = pd.concat([Xy_train[["date"]], Xy_train[target_columns]], axis=1)
y_test = pd.concat([Xy_test[["date"]], Xy_test[target_columns]], axis=1)

display(X_train.head(n=5))
display(y_train.head(n=5))

#### Pipeline

In [None]:
# The one-hot encoded weather-type columns can be treated as numeric, because
# they hold the share of the whole day covered by a given weather type.

# Every feature column except "sol_prod" and "date" is min-max scaled.
excluded_from_scaling = ("sol_prod", "date")
num_cols = [col for col in X_test.columns if col not in excluded_from_scaling]

num_pipe = Pipeline([("scale", MinMaxScaler())])

# The "date" column is fed to two day-of-year transformers, one configured
# with "sin" and one with "cos".
doy_sin_pipe = Pipeline([("trig", DayOfYearTransformer("sin"))])
doy_cos_pipe = Pipeline([("trig", DayOfYearTransformer("cos"))])

column_steps = [
    ("num", num_pipe, num_cols),
    ("sin", doy_sin_pipe, ["date"]),
    ("cos", doy_cos_pipe, ["date"]),
]
# Columns not named in any step are dropped.
preprocessor = ColumnTransformer(column_steps, remainder="drop")
preprocessor.set_output(transform="pandas")

## Model Training

In [None]:

# Model inputs are the lag-feature frames built in the "Lag Features" cell.
# They must not be rebuilt from Xy_train / Xy_test here: dropping the target
# columns would throw the lag features away, and the columns listed in
# `num_cols` would then be missing when the preprocessor selects them.
#
# NOTE(review): assumes LagFeatureTransformer leaves NaN where the lag window
# reaches past the start of the frame - confirm. Ridge does not accept NaN,
# so such rows are removed; when there are none this is a no-op.
X_train = X_train.dropna()
X_test = X_test.dropna()

# Targets are the plain target columns (without "date"), restricted to the
# rows that survived above so that X and y stay aligned.
y_train = Xy_train.loc[X_train.index, target_columns]
y_test = Xy_test.loc[X_test.index, target_columns]

# `regressor` keeps its own name because the evaluation and save cells refer
# to it directly; `pipe` chains the preprocessing and the regressor.
regressor = Ridge()
pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", regressor),
    ]
)

In [None]:
# Fit the pipeline on the training features and targets.
pipe.fit(X_train, y_train)

## Prediction

In [None]:
# Predict the targets for the holdout features.
y_pred = pipe.predict(X_test)

## Evaluation

In [None]:
from src.model_evaluation.regressor_evaluation import evaluate_regressor
from datetime import datetime

# Score the holdout predictions; the run is tagged with the current time,
# its purpose and the special features used.
results = evaluate_regressor(
    regressor=regressor,
    y_true=y_test,
    y_pred=y_pred,
    timestamp=datetime.now(),
    model_purpose="feat-eng",
    special_features="trig-doy",
)

# Metrics to report, in print order; any that are absent from `results`
# are skipped.
reported_metrics = ("MAE", "MSE", "RMSE", "MAPE", "MedAE", "R2", "ExplainedVar")

print("Evaluation Results:")
for metric in reported_metrics:
    if metric not in results:
        continue
    print(f"  {metric}: {results[metric]:.4f}")

## Save Model And Results

In [None]:
import json
import os

from src.config import MODELS_DIR

model_name = results["model_name"]

# All artefacts of this run go to <MODELS_DIR>/<model_name>/ and share the
# model name as file prefix.
folder = os.path.join(MODELS_DIR, model_name)
filename = os.path.join(folder, model_name)
os.makedirs(folder, exist_ok=True)

# Because of issues with pickling custom transformers, the fitted model and
# the pipeline are not pickled; only their configuration and the results
# are saved.

# The encoding is given explicitly so the written files do not depend on the
# platform's default encoding.
with open(f"{filename}.model.txt", "w", encoding="utf-8") as f:
    f.write(str(regressor))

with open(f"{filename}.model_params.json", "w", encoding="utf-8") as f:
    json.dump(regressor.get_params(), f, indent=2)

with open(f"{filename}.pipeline_params.txt", "w", encoding="utf-8") as f:
    f.write(str(preprocessor.get_params()))

with open(f"{filename}.results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
