In [None]:
# Requirements:
# pip install wetterdienst geopy pandas

import pandas as pd
from geopy.geocoders import Nominatim
from wetterdienst.provider.dwd.observation import (
    DwdObservationRequest,
    DwdObservationMetadata
)
from wetterdienst.metadata.parameter import Parameter as DwdPara
from wetterdienst.metadata.resolution import Resolution as DwdRes
from wetterdienst.metadata.period import Period as DwdPeriod

import datetime as dt

import os
from src.config import (
    DATA_ORIG_DIR,
)

In [None]:
# Load the installation metadata table; rows are keyed by the installation name.
installation_csv_path = os.path.join(DATA_ORIG_DIR, "installation_data.csv")
installation_df = pd.read_csv(installation_csv_path, sep=";", index_col="installation")

# The candidate weather stations of an installation are stored as one
# "|"-separated string, which is split into a list of station ids here.
station_ids_raw = installation_df.loc["elegant_eagle", "closest_weather_station_ids"]
weather_station_ids = station_ids_raw.split("|")

In [None]:
# (resolution, dataset, parameter) triples that are requested from DWD.
# Alternatives kept for reference:
#   ("hourly", "sun"), ("daily", "solar"),
#   ("hourly", "solar", "radiation_global"), ("hourly", "solar", "sunshine_duration")
params = [
    ("10_minutes", "solar", "radiation_global"),
    ("10_minutes", "solar", "sunshine_duration"),
]

# Time window of the request.
start_date = dt.datetime(2020, 4, 1)
end_date = dt.datetime(2025, 9, 1)

station_ids = weather_station_ids

# Station metadata columns kept in the result, and the keys shared with the values table.
station_meta_cols = ["resolution", "dataset", "station_id", "name"]
merge_keys = ["station_id", "resolution", "dataset"]

# One frame (station metadata joined with observed values) per DWD period.
all_period_data = {}

for period in ("historical", "recent"):
    request = DwdObservationRequest(
        parameters=params,
        start_date=start_date,
        end_date=end_date,
        periods=period,
    )
    stations = request.filter_by_station_id(station_id=station_ids)

    # Convert both the station listing and the fetched values to pandas.
    stations_df = stations.df.to_pandas()
    value_df = stations.values.all().df.to_pandas()

    # Left join from the station listing, so every listed station is kept in the
    # result even if no matching value rows exist for it.
    all_period_data[period] = pd.merge(
        stations_df[station_meta_cols],
        value_df,
        on=merge_keys,
        how="left",
    )


In [None]:
# First and last timestamp of every (resolution, dataset, station, parameter)
# series, shown separately for the historical and the recent period.
series_keys = ["resolution", "dataset", "station_id", "parameter"]
date_span = {"date": ["min", "max"]}

historical_date_range = all_period_data["historical"].groupby(series_keys).agg(date_span)
recent_date_range = all_period_data["recent"].groupby(series_keys).agg(date_span)

display(historical_date_range)
display(recent_date_range)

In [None]:
# Select the rows of the recent period that have no counterpart in the
# historical period. Historical data is preferred because it has higher
# quality assurance.

# Columns that together identify a single observation.
cols = ["resolution", "dataset", "station_id", "parameter", "date"]

historical_keys = all_period_data["historical"].set_index(cols).index
recent_keys = all_period_data["recent"].set_index(cols).index

# True for every recent row whose key does not occur in the historical data.
is_new_in_recent = ~recent_keys.isin(historical_keys)
recent_data_not_in_hist = all_period_data["recent"][is_new_in_recent]

# Date range that the recent period contributes per series.
recent_only_date_range = recent_data_not_in_hist.groupby(
    ["resolution", "dataset", "station_id", "parameter"]
).agg({"date": ["min", "max"]})
display(recent_only_date_range)

In [None]:
# Historical data and recent data combined, to provide a comprehensive
# timeline for each station and parameter.
frames_to_stack = [all_period_data["historical"], recent_data_not_in_hist]
data_combined = pd.concat(frames_to_stack)

# Date range covered per series after combining both periods.
combined_date_range = data_combined.groupby(
    ["resolution", "dataset", "station_id", "parameter"]
).agg({"date": ["min", "max"]})
display(combined_date_range)

In [None]:
# Data is reduced to data from the nearest station (the first entry of
# station_ids). Missing data is filled with data from the other stations,
# visited in list order, so earlier stations take precedence.

# Columns identifying one observation independent of the station it came from.
cols = ["resolution", "dataset", "parameter", "date"]

relevant_data = data_combined[data_combined["station_id"] == station_ids[0]]

for stid in station_ids[1:]:
    candidate_rows = data_combined[data_combined["station_id"] == stid]

    # Keys that are already covered by the stations processed so far.
    covered_keys = relevant_data.set_index(cols).index
    is_gap_filler = ~candidate_rows.set_index(cols).index.isin(covered_keys)

    # Append only the observations that fill a gap.
    relevant_data = pd.concat([relevant_data, candidate_rows[is_gap_filler]])

# Date range that each station contributes to the merged timeline.
contribution_date_range = relevant_data.groupby(
    ["resolution", "dataset", "parameter", "station_id"]
).agg({"date": ["min", "max"]})
display(contribution_date_range)

In [None]:
# A full timeline for each parameter across all stations:
# everything except date, parameter and value is removed first.
stationless_data = relevant_data.drop(
    columns=["station_id", "name", "resolution", "dataset", "quality"]
)

# Wide layout: one row per timestamp, one column per parameter.
# The leftover "parameter" label on the column axis is cleared.
stationless_data = (
    stationless_data
    .pivot(index="date", columns="parameter", values="value")
    .reset_index()
    .rename_axis(None, axis="columns")
)
display(stationless_data)

stationless_data = stationless_data.sort_values("date").set_index("date")

# Complete 10-minute grid between the first and the last observation.
first_ts = stationless_data.index.min()
last_ts = stationless_data.index.max()
full_index = pd.date_range(start=first_ts, end=last_ts, freq="10min")

# Reindexing onto the full grid inserts NaN rows for missing timestamps;
# afterwards the timestamps are turned back into a regular "date" column.
stationless_data = (
    stationless_data
    .reindex(full_index)
    .rename_axis("date")
    .reset_index()
)
display(stationless_data)

In [None]:
# Key every row by its position within the year (month-day-hour-minute), so that
# the same 10-minute slot can be averaged across all years present in the data.
stationless_data["data_temp"] = stationless_data["date"].dt.strftime("%m-%d-%H-%M")

# Mean of each parameter per slot, broadcast back onto every row of that slot.
slot_means = stationless_data.groupby("data_temp")[
    ["radiation_global", "sunshine_duration"]
].transform("mean")
stationless_data["mean_rad"] = slot_means["radiation_global"]
stationless_data["mean_sun"] = slot_means["sunshine_duration"]

# Rows with at least one missing value ...
nans = stationless_data.isna().any(axis=1)
# ... and their complement: rows without any missing value.
# (The previous `notna().any(axis=1)` was True for every row, because the
# `date` column is always populated, so it did not select complete rows.)
notnans = ~nans

display(stationless_data[nans].tail())
display(stationless_data[notnans].tail())

In [None]:
# Filling the NaNs with the mean over all years may be way off,
# but it is done here for the sake of having a complete dataset.
stationless_data["radiation_global"] = stationless_data["radiation_global"].fillna(
    stationless_data["mean_rad"]
)
stationless_data["sunshine_duration"] = stationless_data["sunshine_duration"].fillna(
    stationless_data["mean_sun"]
)

# Same row selections as before the fill, to compare the effect.
display(stationless_data[nans].tail())
display(stationless_data[notnans].tail())

# The helper columns are no longer needed once the gaps are filled.
stationless_data = stationless_data.drop(columns=["data_temp", "mean_rad", "mean_sun"])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Very wide canvas (150 x 5 inches); presumably chosen so the 10-minute series
# stays distinguishable over the whole period.
fig, ax = plt.subplots(figsize=(150, 5))

# Both parameters share one axis and the same thin, semi-transparent line style.
line_style = {"linewidth": 0.5, "alpha": 0.7}
sns.lineplot(data=stationless_data, x="date", y="radiation_global", ax=ax, **line_style)
sns.lineplot(data=stationless_data, x="date", y="sunshine_duration", ax=ax, **line_style)

# ax.set_yscale("log")

In [None]:
# Restructure the 10-minute series to 15-minute steps via a 5-minute grid.

stationless_data = stationless_data.set_index("date")

step = pd.Timedelta("5min")

# Split every 10-minute interval into two 5-minute intervals: each 10-minute
# value is copied onto the following 5-minute slot and both are halved.
# ATTENTION: the upsampled grid stops at the last 10-minute timestamp, so the
# final 5-minute slot (e.g. 00:55 when the series ends at 00:50) is missing.
sd_5min = stationless_data.resample("5min").ffill().mul(5 / 10)

# All 5-minute slots, including the missing one after the last timestamp.
first_ts = stationless_data.index.min()
last_ts = stationless_data.index.max()
idx5 = pd.date_range(start=first_ts, end=last_ts + step, freq=step)

# Fill the added slot with the value of the slot before it, which was already
# halved, so both have the correct value.
# Note: this forward fill also applies to any NaN still present in the frame.
sd_5min = sd_5min.reindex(idx5).ffill()

# Sum the 5-minute intervals into 15-minute intervals.
sd_15min = sd_5min.resample("15min").sum()

In [None]:
# Show the final result: the series summed into 15-minute intervals.
display(sd_15min)