# Create a model for total solar production using trigonometric features

## Init, Load

In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from src.config import DATA_RAW_DIR, DATA_RAW_FILENAME

In [None]:
# Read the raw export; `timestamp` is loaded as the index first and moved back
# to a regular column further down.
raw_csv_path = os.path.join(DATA_RAW_DIR, DATA_RAW_FILENAME)
df_raw = pd.read_csv(
    raw_csv_path,
    sep=";",
    index_col=["timestamp"],
    date_format="%Y-%m-%d %H:%M",
)

# Rows without a `sol_prod` value get it rebuilt as `sol_prod_1 + sol_prod_2`.
sol_prod_missing = df_raw["sol_prod"].isna()
df_raw.loc[sol_prod_missing, "sol_prod"] = (
    df_raw.loc[sol_prod_missing, "sol_prod_1"]
    + df_raw.loc[sol_prod_missing, "sol_prod_2"]
)

df_raw = df_raw.reset_index()

display(df_raw.head(5))

In [None]:
# Aggregate the raw readings into one `sol_prod` total per calendar day.
df_sum = df_raw.copy().reset_index()
df_sum = df_sum.assign(date=pd.to_datetime(df_sum["timestamp"].dt.date))

df_sum = (
    df_sum.groupby(["date"])[["sol_prod"]]
    .sum()
    .reset_index()
    .sort_values(by="date", ascending=True)
)
display(df_sum.head(5))

## Train, Test Split

In [None]:
from sklearn.model_selection import train_test_split

# The date is the only input column; the daily production total is the target.
daily_dates = df_sum[["date"]]
daily_production = df_sum["sol_prod"]

# Hold out 20 % of the days; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    daily_dates,
    daily_production,
    test_size=0.2,
    random_state=42,
)

## Feature Engineering

### Transformer
The sine/cosine encoding of cyclical time features follows the [scikit-learn example on time-related feature engineering](https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html#data-exploration-on-the-bike-sharing-demand-dataset).

In [None]:
from sklearn.preprocessing import FunctionTransformer


def _sin_feature_names(transformer, input_features):
    """Return the output feature names: each input name suffixed with ``_sin``."""
    return [f"{name}_sin" for name in input_features]


def _sin_with_period(x, period):
    """Evaluate a sine wave with the given period at ``x``."""
    return np.sin(x / period * 2 * np.pi)


def sin_transformer(period):
    """Create a sine transformer.

    The wrapped functions are defined at module level and the period is
    handed over through ``kw_args`` instead of being captured in a closure.
    Functions defined inside another function cannot be pickled, so this is
    what actually keeps the returned transformer picklable.

    Args:
        period (int | float): The period of the sine wave, in the same unit
            as the values that will be transformed.

    Returns:
        FunctionTransformer: Transformer that maps each input value ``x`` to
        ``sin(2 * pi * x / period)`` and names its outputs ``<input>_sin``.
    """
    return FunctionTransformer(
        _sin_with_period,
        kw_args={"period": period},
        feature_names_out=_sin_feature_names,
    )


def _cos_feature_names(transformer, input_features):
    """Return the output feature names: each input name suffixed with ``_cos``."""
    return [f"{name}_cos" for name in input_features]


def _cos_with_period(x, period):
    """Evaluate a cosine wave with the given period at ``x``."""
    return np.cos(x / period * 2 * np.pi)


def cos_transformer(period):
    """Create a cosine transformer.

    The wrapped functions are defined at module level and the period is
    handed over through ``kw_args`` instead of being captured in a closure.
    Functions defined inside another function cannot be pickled, so this is
    what actually keeps the returned transformer picklable.

    Args:
        period (int | float): The period of the cosine wave, in the same unit
            as the values that will be transformed.

    Returns:
        FunctionTransformer: Transformer that maps each input value ``x`` to
        ``cos(2 * pi * x / period)`` and names its outputs ``<input>_cos``.
    """
    return FunctionTransformer(
        _cos_with_period,
        kw_args={"period": period},
        feature_names_out=_cos_feature_names,
    )

In [None]:
def _dayofyear_feature_names(transformer, input_features):
    """Return the output feature names: each input name suffixed with ``_dayofyear``."""
    return [f"{name}_dayofyear" for name in input_features]


def _extract_dayofyear(x):
    """Replace datetime values by their day of the year.

    Args:
        x (pd.Series | pd.DataFrame): Datetime Series, or DataFrame whose
            columns are all datetime columns.

    Returns:
        pd.Series | pd.DataFrame: Same container type as ``x`` holding the
        day-of-year numbers; a DataFrame input is copied, not modified.

    Raises:
        ValueError: If ``x`` is neither of the accepted containers or holds
            non-datetime data.
    """
    # `is_datetime64_any_dtype` also accepts timezone-aware dtypes, which
    # `np.issubdtype(dtype, np.datetime64)` rejects with a TypeError.
    is_datetime = pd.api.types.is_datetime64_any_dtype

    if isinstance(x, pd.Series) and is_datetime(x.dtype):
        return x.dt.dayofyear

    # Validate every column up front so bad input yields the documented
    # ValueError instead of an AttributeError from the `.dt` accessor.
    if isinstance(x, pd.DataFrame) and all(is_datetime(dtype) for dtype in x.dtypes):
        x = x.copy()
        for col in x.columns:
            x[col] = x[col].dt.dayofyear
        return x

    raise ValueError(
        "Input must be a pandas Series with datetime64 dtype"
        " or a DataFrame with datetime64 columns."
    )


def dayofyear_extractor():
    """Create a day of year extractor transformer.

    The wrapped functions are defined at module level because functions
    defined inside another function cannot be pickled.

    Returns:
        FunctionTransformer: Transformer that converts datetime input into
        day-of-year numbers and names its outputs ``<input>_dayofyear``.
        Transforming non-datetime input raises ``ValueError``.
    """
    return FunctionTransformer(
        _extract_dayofyear, feature_names_out=_dayofyear_feature_names
    )

### Add Features

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def _dayofyear_trig_pipeline(step_name, trig_step):
    """Chain the day-of-year extraction with one trigonometric transformer."""
    return Pipeline(
        steps=[
            ("extract_dayofyear", dayofyear_extractor()),
            (step_name, trig_step),
        ]
    )


# Both encodings use a period of 366, the largest possible day-of-year value.
dayofyear_sin_pipeline = _dayofyear_trig_pipeline(
    "transform_sin", sin_transformer(366)
)
dayofyear_cos_pipeline = _dayofyear_trig_pipeline(
    "transform_cos", cos_transformer(366)
)

# Derive three features from `date`: the plain day of year plus its sine and
# cosine encodings. Any other column is dropped.
date_feature_steps = [
    ("dayofyear", dayofyear_extractor(), ["date"]),
    ("sin", dayofyear_sin_pipeline, ["date"]),
    ("cos", dayofyear_cos_pipeline, ["date"]),
]
preprocessor = ColumnTransformer(
    transformers=date_feature_steps,
    remainder="drop",
).set_output(transform="pandas")

regressor = RidgeCV()
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", regressor),
    ]
)

## Model Training

In [None]:
# Fit the preprocessing step and the RidgeCV model on the training split.
pipe.fit(X_train, y_train)

## Prediction

In [None]:
# Predict the daily production for the held-out test dates.
y_pred = pipe.predict(X_test)

## Evaluation

In [None]:
from src.model_evaluation.regressor_evaluation import evaluate_regressor
from datetime import datetime

# Score the test-set predictions with the project's evaluation helper.
evaluation_kwargs = {
    "regressor": regressor,
    "y_true": y_test,
    "y_pred": y_pred,
    "timestamp": datetime.now(),
    "model_purpose": "feat-eng",
    "special_features": "trig-doy",
}
results = evaluate_regressor(**evaluation_kwargs)

# Print the known metrics in a fixed order, skipping any that are absent
# from the results.
print("Evaluation Results:")
for key in ("MAE", "MSE", "RMSE", "MAPE", "MedAE", "R2", "ExplainedVar"):
    if key not in results:
        continue
    print(f"  {key}: {results.get(key):.4f}")