# Create different baseline models for total solar production

## Init, Load

In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from src.config import DATA_RAW_DIR, DATA_RAW_FILENAME

In [None]:
# Read the semicolon-separated raw export; the "timestamp" column becomes the
# index and is expected in "YYYY-MM-DD HH:MM" form.
df_raw = pd.read_csv(
    os.path.join(DATA_RAW_DIR, DATA_RAW_FILENAME),
    sep=";",
    index_col=["timestamp"],
    date_format="%Y-%m-%d %H:%M",
)

# Where the total production is missing, rebuild it as the sum of the two
# partial readings of the same row. The mask is computed once, before the
# assignment, so both sides refer to the same set of rows.
total_missing = df_raw["sol_prod"].isna()
df_raw.loc[total_missing, "sol_prod"] = (
    df_raw.loc[total_missing, "sol_prod_1"]
    + df_raw.loc[total_missing, "sol_prod_2"]
)


display(df_raw.head(5))

In [None]:
import pandas as pd

def cyclic_rolling_mean(s: pd.Series, window: int, min_periods: int = 1) -> pd.Series:
    """
    Compute a trailing rolling mean on a cyclic Series s.

    The series is treated as a closed loop: the window ending at row ``j``
    covers rows ``j - window + 1 .. j``, and positions before the first row
    wrap around to the end of the series. The window may be longer than the
    series, in which case rows are visited more than once.

    Parameters
    ----------
    s : pd.Series
        Input data.
    window : int
        Window size (in rows).
    min_periods : int
        Minimum number of non-missing observations in a window required to
        produce a value; passed through to ``Series.rolling``.

    Returns
    -------
    pd.Series
        Rolling means with the same length, order and index as ``s``.

    Raises
    ------
    TypeError
        If ``s`` is not a pandas Series.
    ValueError
        If ``window`` is smaller than 1.
    """
    if not isinstance(s, pd.Series):
        raise TypeError("Input must be a pandas Series")
    if window < 1:
        raise ValueError("Window must be >= 1")

    n = len(s)
    pad = window - 1

    # Nothing to wrap: an empty series, or a window that only sees its own row.
    if n == 0 or pad == 0:
        return s.rolling(window=window, min_periods=min_periods).mean()

    # Positions of the `pad` rows that cyclically precede row 0. Modular
    # indexing keeps the padding exactly `pad` rows long even when the window
    # is longer than the series; a plain tail slice would be truncated there
    # and shift or shorten the result.
    wrap_positions = [pos % n for pos in range(-pad, 0)]
    s_ext = pd.concat([s.iloc[wrap_positions], s])

    # Trailing rolling mean over the padded series.
    roll_ext = s_ext.rolling(window=window, min_periods=min_periods).mean()

    # Drop the padding; what remains lines up one-to-one with the input rows.
    return roll_ext.iloc[pad:]

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin


class DateFeatureRegressor(BaseEstimator, RegressorMixin):
    """Baseline that predicts a daily total from the day of the year alone.

    ``fit`` sums the target per (day of year, year) and then averages those
    sums over the years, giving one value per day of the year. ``predict``
    looks that value up for every input date.
    """

    def _make_date_features(self, X):
        """Derive ``year`` and ``day_of_year`` from the first column of X.

        The first column is converted with ``pd.to_datetime``; the returned
        frame keeps the row index of X.
        """
        stamps = pd.to_datetime(X.iloc[:, 0])
        return pd.DataFrame(
            {
                "year": stamps.dt.year,
                "day_of_year": stamps.dt.dayofyear,
            }
        )

    def fit(self, X, y):
        """Learn the mean daily total of y for each day of the year.

        Parameters
        ----------
        X : DataFrame or array-like
            Dates or timestamps; only the first column is used.
        y : Series or array-like
            Target values, matched to X by position.

        Returns
        -------
        self
            The fitted estimator; the lookup table is stored in ``means_``
            (indexed by ``day_of_year``, single column ``target``).
        """
        frame = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        target = y.copy() if isinstance(y, pd.Series) else pd.Series(y)
        target.name = "target"

        # Both pieces get a fresh positional index so they pair up row by row,
        # whatever index the caller's objects carried.
        features = self._make_date_features(frame).reset_index(drop=True)
        table = pd.concat([features, target.reset_index(drop=True)], axis=1)

        # Total of the target for every individual day of every year.
        # NOTE(review): the group-wise sum skips missing values, so a day with
        # gaps (or no data at all) contributes a reduced (or zero) total -
        # confirm this is acceptable for the baseline.
        daily_totals = table.groupby(["day_of_year", "year"]).sum().reset_index()

        # Average the daily totals across years; the group-by output is
        # already ordered by day of year.
        self.means_ = (
            daily_totals[["day_of_year", "target"]].groupby("day_of_year").mean()
        )

        # Optional smoothing of means_["target"] with cyclic_rolling_mean
        # (window=2, min_periods=1) is currently disabled.

        return self

    def predict(self, X):
        """Return the learned mean daily total for each date in X.

        Parameters
        ----------
        X : DataFrame or array-like
            Dates or timestamps; only the first column is used.

        Returns
        -------
        numpy.ndarray
            One value per input row, in input order. Days of the year that
            were not present during ``fit`` yield NaN (left join).
        """
        frame = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        features = self._make_date_features(frame)

        looked_up = pd.merge(features, self.means_, on="day_of_year", how="left")
        return looked_up["target"].values


# Baseline model instance, kept in `reg` for the cells below.
reg = DateFeatureRegressor()

# Fit on the timestamp index of the raw data with "sol_prod" as the target.
reg.fit(df_raw.index, df_raw["sol_prod"])

In [None]:
# Collapse the raw readings into one production total per calendar date,
# then attach the baseline prediction for each of those dates.
df_sum = (
    df_raw.reset_index()
    .assign(date=lambda frame: frame["timestamp"].dt.date)
    .groupby(["date"])[["sol_prod"]]
    .sum()
)
df_sum["pred"] = reg.predict(df_sum.index)
display(df_sum)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Residual of the baseline: positive where the prediction exceeds the actual.
df_sum["diff"] = df_sum["pred"] - df_sum["sol_prod"]

fig, axes = plt.subplots(figsize=(12, 6), nrows=2, sharex=True)

# Top panel: actual vs predicted; bottom panel: their difference.
# Every curve shares the same thin, slightly transparent line style.
line_style = {"linewidth": 0.5, "alpha": 0.8}
curves = [
    (axes[0], "sol_prod", "Actual"),
    (axes[0], "pred", "Predicted"),
    (axes[1], "diff", "Difference"),
]
for panel, column, legend_label in curves:
    sns.lineplot(
        data=df_sum,
        x="date",
        y=column,
        label=legend_label,
        ax=panel,
        **line_style,
    )

axes[0].set_title("Actual Solar Production vs Predicted")
# Set after plotting so it replaces the column name used as default label.
axes[0].set_ylabel("Solar Production (Wh)")
axes[1].set_title("Difference between Predicted and Actual")

plt.tight_layout()
plt.show()