# EDA of power generation vs. sunshine duration and global radiation

## Load

In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from src.config import DATA_RAW_DIR, POWER_OPENMETEO_WEATHER_FILENAME

In [None]:
# Load the raw combined power/weather export; the file is semicolon-separated.
raw_csv_path = os.path.join(DATA_RAW_DIR, POWER_OPENMETEO_WEATHER_FILENAME)
df_raw = pd.read_csv(raw_csv_path, sep=";")

In [None]:
# Solar production may be spread over up to three columns; only the ones that
# actually exist in the loaded frame are used.
candidate_cols = ("sol_prod_1", "sol_prod_2", "sol_prod_3")
sol_prod_cols = [name for name in candidate_cols if name in df_raw.columns]

# Coerce each present column to numeric (unparseable values raise) and treat
# missing readings as zero production.
for name in sol_prod_cols:
    df_raw[name] = pd.to_numeric(df_raw[name], errors="raise").fillna(0)

# Row-wise total over the available production columns.
df_raw["sol_prod"] = df_raw[sol_prod_cols].sum(axis=1)

In [None]:
# Columns carried over from the raw frame into the analysis frame. The order
# here is the column order of `psr_df`.
keep_cols = [
    "installation",
    "timestamp",
    "sol_prod",
    "is_day",
    "weather_description",
    "sunshine_duration",
    "direct_radiation",
    "cloud_cover",
    "snow_depth",
]
# Columns that have to be numeric for the statistics and plots further down.
num_cols = [
    "sol_prod",
    "cloud_cover",
    "snow_depth",
    "sunshine_duration",
    "is_day",
    "direct_radiation",
]

# Work on an independent copy so the raw frame stays untouched.
psr_df = df_raw[keep_cols].copy()

# Parse timestamps as timezone-aware UTC; malformed values raise immediately.
psr_df["timestamp"] = pd.to_datetime(psr_df["timestamp"], utc=True, errors="raise")

# Fail loudly on non-numeric content instead of silently coercing it to NaN.
for name in num_cols:
    psr_df[name] = pd.to_numeric(psr_df[name], errors="raise")

## Basics

In [None]:
# Summary statistics, transposed so that each variable is shown as one row.
display(psr_df.describe().transpose())

## Grouped Series

In [None]:
# Derive calendar features from the timestamp column (parsed as UTC above).
ts = psr_df["timestamp"].dt
calendar_attrs = {
    "year": "year",
    "month": "month",
    "day": "day",
    "hour": "hour",
    "minute": "minute",
    "day_of_week": "dayofweek",
    "day_of_year": "dayofyear",
}
for feature, attr in calendar_attrs.items():
    psr_df[feature] = getattr(ts, attr)

# The ISO week number comes from the isocalendar() frame, not a plain attribute.
psr_df["week_of_year"] = ts.isocalendar().week

# From here on the frame is indexed by timestamp in chronological order.
psr_df = psr_df.set_index("timestamp").sort_index()

### By year and day

In [None]:
# Aggregate the records to one row per (installation, year, day_of_year) and
# draw one panel per aggregated variable, with one line per installation/year.

# Helper columns that are not needed once the data is grouped by day; the raw
# weather text is replaced by its one-hot encoding below.
drop_cols = [
    "month",
    "day",
    "hour",
    "minute",
    "day_of_week",
    "week_of_year",
    "weather_description",
]
group_cols = ["installation", "year", "day_of_year"]
sum_cols = ["sol_prod", "sunshine_duration", "direct_radiation"]
mean_cols = ["cloud_cover", "snow_depth", "is_day"]

# One-hot encode the weather description: the daily mean of each indicator is
# the share of that day's records carrying the description.
ohe = pd.get_dummies(psr_df["weather_description"])

agg_dict = (
    {c: "sum" for c in sum_cols}
    | {c: "mean" for c in mean_cols}
    | {c: "mean" for c in ohe.columns}
)

# `ohe` shares psr_df's index, so a column-wise concat lines the rows up 1:1.
df_by_year_day = pd.concat([psr_df, ohe], axis=1)
df_by_year_day = (
    df_by_year_day.drop(columns=drop_cols)
    .groupby(group_cols)
    .agg(agg_dict)
    .reset_index()
)

# Legend label "<installation> <year>". Vectorised string concatenation is used
# instead of a row-wise apply(): it is faster, also works on an empty frame and
# avoids same-quote nesting inside an f-string (only valid on Python >= 3.12).
df_by_year_day["installation_year"] = (
    df_by_year_day["installation"].astype(str)
    + " "
    + df_by_year_day["year"].astype(str)
)

# Every aggregated column gets its own panel.
show_columns = list(agg_dict)

# Optional per-column overrides: extra lineplot kwargs and axis settings.
line_args = {}
axis_args = {
    "sol_prod": {"ylim": (-0.1, None), "ylabel": "peak power hours [W_ph]"},
    "sunshine_duration": {"ylabel": "[s]"},
    "direct_radiation": {"ylabel": "[J/(cm^2)]"},
    "cloud_cover": {"ylabel": "[%]"},
    "snow_depth": {"ylabel": "[cm]"},
    "is_day": {"ylabel": "[part of day]"},
} | {c: {"ylabel": "share", "ylim": (0, 1)} for c in ohe.columns}

fig, axs = plt.subplots(
    nrows=len(show_columns), figsize=(16, 2 * len(show_columns) + 2)
)

# Same tick positions on every panel: every 30th day of the year.
xticks = np.arange(0, 366, 30)
xtick_labels = [str(tick) for tick in xticks]

for col, ax in zip(show_columns, axs):
    sns.lineplot(
        data=df_by_year_day,
        x="day_of_year",
        y=col,
        hue="installation_year",
        linewidth=0.5,
        ax=ax,
        **line_args.get(col, {}),
    )
    ax.margins(x=0)
    # Put the legend outside the plotting area, to the right of the panel.
    ax.legend(title="Installation Year", bbox_to_anchor=(1.01, 1), loc="upper left")
    ax.set_title(col)
    ax.set_xticks(xticks)
    ax.set_xticklabels(xtick_labels)
    ax.set(**axis_args.get(col, {}))

    ax.yaxis.grid(True, which="major", linestyle="--", color="lightgrey")


plt.suptitle(
    "Daily Total Power Production, Sunshine duration and global Radiation by Installation and Year",
    fontsize=16,
    y=1.02,
)
# Data attribution, positioned in figure coordinates (bottom right corner).
plt.text(
    x=0.99,
    y=0.05,
    s="Shown data includes data from https://open-meteo.com",
    ha="right",
    va="bottom",
    transform=plt.gcf().transFigure,
    fontsize=8,
    color="gray",
)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Daily aggregates per installation and year, min-max scaled so that
# production, sunshine duration and radiation can share one axis. One panel is
# drawn per installation/year combination.

drop_cols = ["month", "day", "hour", "minute", "day_of_week", "week_of_year"]
group_cols = ["installation", "year", "day_of_year"]

sum_cols = ["sol_prod", "sunshine_duration", "direct_radiation"]
mean_cols = ["is_day"]

agg_dict = {c: "sum" for c in sum_cols} | {c: "mean" for c in mean_cols}

df_by_year_day = (
    psr_df.drop(columns=drop_cols)
    .groupby(group_cols)
    .agg(agg_dict)
    .reset_index()
)

# Rescale the daily totals to [0, 1]. The scaler is fitted once on all rows, so
# the scaling is shared by every installation and year. `is_day` is a daily
# mean and is left unscaled.
scaler = MinMaxScaler()
df_by_year_day[sum_cols] = scaler.fit_transform(df_by_year_day[sum_cols])

# Panel label "<installation> <year>", built with vectorised string
# concatenation rather than a row-wise apply().
df_by_year_day["installation_year"] = (
    df_by_year_day["installation"].astype(str)
    + " "
    + df_by_year_day["year"].astype(str)
)

# Long format: one row per (installation_year, day_of_year, variable).
df_long = df_by_year_day.melt(
    id_vars=["installation_year", "day_of_year"],
    value_vars=[
        "sol_prod",
        "sunshine_duration",
        "is_day",
        "direct_radiation",
    ],
    var_name="variable",
    value_name="value",
)

# Optional per-panel overrides, keyed by the installation/year label of the
# panel. Both are empty by default, i.e. every panel uses the same settings.
line_args = {}
axis_args = {}

years = df_long["installation_year"].unique()
# squeeze=False always returns a 2-D array of axes, so the loop below also
# works when there is only a single installation/year panel.
fig, axes = plt.subplots(
    nrows=len(years),
    sharex=True,
    squeeze=False,
    figsize=(16, 2 * len(years) + 2),
)
axes = axes.ravel()

# Same tick positions on every panel: every 30th day of the year.
xticks = np.arange(0, 366, 30)
xtick_labels = [str(tick) for tick in xticks]

for ax, y in zip(axes, years):
    subset = df_long[df_long["installation_year"] == y]
    sns.lineplot(
        data=subset,
        x="day_of_year",
        y="value",
        hue="variable",
        ax=ax,
        linewidth=0.5,
        # Look the overrides up by this panel's own label instead of relying on
        # a loop variable left over from an earlier cell.
        **line_args.get(y, {}),
    )
    ax.set_title(f"{y}")
    ax.margins(x=0)
    # Put the legend outside the plotting area, to the right of the panel.
    ax.legend(title="", bbox_to_anchor=(1.01, 1), loc="upper left")
    ax.set_xticks(xticks)
    ax.set_xticklabels(xtick_labels)
    ax.set(**axis_args.get(y, {}))
    ax.yaxis.grid(True, which="major", linestyle="--", color="lightgrey")

plt.suptitle(
    "Normalized Daily Total Power Production, Sunshine duration"
    " and global Radiation by Installation and Year",
    fontsize=16,
    y=1.02,
)

# Data attribution, positioned in figure coordinates (bottom right corner).
plt.text(
    x=0.99,
    y=0.04,
    s="Shown data includes data from https://open-meteo.com",
    ha="right",
    va="bottom",
    transform=plt.gcf().transFigure,
    fontsize=8,
    color="gray",
)

plt.tight_layout()
plt.show()

#### Observations
* Yearly cycle is visible in sunshine duration and direct radiation.
* The normalized time series closely match each other. Quantify this with correlations.

## Correlations

In [None]:
# Pairwise correlations between all numeric columns of the analysis frame.
numeric_df = psr_df.select_dtypes(include=["number"])
corr = numeric_df.corr()

# Only label cells with |r| >= 0.3; weaker correlations get an empty
# annotation so the strong ones stand out.
mask = corr.abs() < 0.3
annot = corr.round(2).astype(str).mask(mask, "")
sns.heatmap(corr, annot=annot, fmt="", cmap="coolwarm", vmin=-1, vmax=1);

#### Observation

* There are very high correlations between `sol_prod`, `sunshine_duration`, and `direct_radiation`. Both `sunshine_duration` and `direct_radiation` are therefore well suited to predicting `sol_prod`. Since they are also highly correlated with each other, only one of them should be used; which one works better should be determined by testing.

## Autocorrelations

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Autocorrelation of the daily totals. The grouping is by (year, day_of_year)
# only, so the sums run over all installations.
acf_cols = ["sol_prod", "sunshine_duration", "direct_radiation"]
psr_by_year_day = psr_df.groupby(["year", "day_of_year"]).agg(
    {name: "sum" for name in acf_cols}
)

fig, axs = plt.subplots(nrows=3, figsize=(10, 12), sharex=True)
for col, ax in zip(acf_cols, axs):
    series = psr_by_year_day[col].dropna()
    # An autocorrelation can only be estimated for lags shorter than the
    # series, so cap the requested 400 lags for short histories.
    max_lag = min(400, len(series) - 1)
    plot_acf(series, lags=max_lag, alpha=0.05, ax=ax)
    ax.set_title(f"ACF for {col}")
    ax.xaxis.grid(True, which="major", linestyle="--", color="lightgrey")
    ax.yaxis.grid(True, which="major", linestyle="--", color="lightgrey")


plt.tight_layout()
plt.show()

### Observation
* Data is systematically autocorrelated.
* The period is about 350 days.