In [None]:
# Requirements:
# pip install wetterdienst geopy pandas

import pandas as pd
from geopy.geocoders import Nominatim
from wetterdienst.provider.dwd.observation import (
    DwdObservationRequest,
    DwdObservationMetadata
)
from wetterdienst.metadata.parameter import Parameter as DwdPara
from wetterdienst.metadata.resolution import Resolution as DwdRes
from wetterdienst.metadata.period import Period as DwdPeriod

import datetime as dt

import os
from src.config import (
    DATA_ORIG_DIR,
)

In [None]:
# Load the installation table (";"-separated), indexed by the "installation" column.
installation_csv_path = os.path.join(DATA_ORIG_DIR, "installation_data.csv")
installation_df = pd.read_csv(
    installation_csv_path,
    sep=";",
    index_col="installation",
)

# The station ids of one installation are stored as a single "|"-separated string.
closest_station_field = installation_df.loc[
    "elegant_eagle", "closest_weather_station_ids"
]
weather_station_ids = closest_station_field.split("|")

In [None]:
# Requested observations, given as (resolution, dataset, parameter) tuples.
params = [
    ("10_minutes", "solar", "radiation_global"),
    ("10_minutes", "solar", "sunshine_duration"),
    # Disabled alternatives, kept for reference:
    # ("hourly", "sun"),
    # ("daily", "solar"),
    # ("hourly", "solar", "radiation_global"),
    # ("hourly", "solar", "sunshine_duration"),
]

# Time window passed to every request.
start_date = dt.datetime(2020, 4, 1)
end_date = dt.datetime(2025, 9, 1)

station_ids = weather_station_ids

# Station metadata columns carried over into the value table, and the join keys.
station_meta_cols = ["resolution", "dataset", "station_id", "name"]
station_value_keys = ["station_id", "resolution", "dataset"]

# One merged (station metadata + values) frame per DWD period, keyed by period name.
all_period_data = {}

for period in ("historical", "recent"):
    req = DwdObservationRequest(
        parameters=params,
        start_date=start_date,
        end_date=end_date,
        periods=period,
    )
    stations = req.filter_by_station_id(station_id=station_ids)

    # Both results are converted to pandas before merging.
    stations_df = stations.df.to_pandas()
    value_df = stations.values.all().df.to_pandas()

    # Left join: every selected station keeps a row even if it has no values.
    station_meta = stations_df[station_meta_cols]
    all_period_data[period] = station_meta.merge(
        value_df, on=station_value_keys, how="left"
    )


In [None]:
# First and last timestamp per (resolution, dataset, station, parameter),
# shown separately for the historical and the recent period.
date_range_keys = ["resolution", "dataset", "station_id", "parameter"]
date_span_spec = {"date": ["min", "max"]}

historical_date_range = (
    all_period_data["historical"].groupby(date_range_keys).agg(date_span_spec)
)
display(historical_date_range)

recent_date_range = (
    all_period_data["recent"].groupby(date_range_keys).agg(date_span_spec)
)
display(recent_date_range)

In [None]:
# Select every "recent" row that has no counterpart in "historical".
# Historical data is preferred because it has higher quality assurance.

# A row is identified by this combination of columns.
cols = ["resolution", "dataset", "station_id", "parameter", "date"]

recent_keys = all_period_data["recent"].set_index(cols).index
historical_keys = all_period_data["historical"].set_index(cols).index

# Boolean mask over the recent rows: True where the key already exists in historical.
already_in_historical = recent_keys.isin(historical_keys)
recent_data_not_in_hist = all_period_data["recent"][~already_in_historical]

# Date span of the rows that only the recent period contributes.
recent_only_span = recent_data_not_in_hist.groupby(
    ["resolution", "dataset", "station_id", "parameter"]
).agg({"date": ["min", "max"]})
display(recent_only_span)

In [None]:
# Stack the historical rows and the recent-only rows.
# The original row labels of both frames are kept (no ignore_index).
combined_parts = [all_period_data["historical"], recent_data_not_in_hist]
data_combined = pd.concat(combined_parts)
"""Historical data and recent data combined, to provide a comprehensive timeline for each station and parameter."""

# Date span per (resolution, dataset, station, parameter) after combining.
combined_span = data_combined.groupby(
    ["resolution", "dataset", "station_id", "parameter"]
).agg({"date": ["min", "max"]})
display(combined_span)

In [None]:
# Start from the first station in station_ids only.
is_first_station = data_combined["station_id"] == station_ids[0]
relevant_data = data_combined[is_first_station]
"""Data is reduced to data from the nearest station. Missing data is filled with data from other stations."""

# A row counts as "already covered" when this key exists, regardless of station.
cols = ["resolution", "dataset", "parameter", "date"]

# Walk through the remaining stations in list order; each one contributes only
# rows whose key is not yet present, so earlier stations take precedence.
for stid in station_ids[1:]:
    rel_station_data = data_combined[data_combined["station_id"] == stid]

    candidate_keys = rel_station_data.set_index(cols).index
    covered_keys = relevant_data.set_index(cols).index
    station_data_not_in_rel = rel_station_data[~candidate_keys.isin(covered_keys)]

    relevant_data = pd.concat([relevant_data, station_data_not_in_rel])

# Date span contributed by each station, per resolution/dataset/parameter.
contribution_span = relevant_data.groupby(
    ["resolution", "dataset", "parameter", "station_id"]
).agg({"date": ["min", "max"]})
display(contribution_span)

In [None]:
# Drop the station and quality metadata; only date, parameter and value remain relevant.
metadata_cols = ["station_id", "name", "resolution", "dataset", "quality"]
stationless_data = relevant_data.drop(columns=metadata_cols)
"""A full timeline for each parameter across all stations."""

# Long -> wide: one row per date, one column per parameter.
parameter_wide = stationless_data.pivot(
    index="date", columns="parameter", values="value"
)
stationless_data = parameter_wide.reset_index()
stationless_data.columns.name = None  # remove the leftover "parameter" axis label
display(stationless_data)

# Order by time and index by date so the frame can be aligned to a regular grid.
stationless_data = stationless_data.sort_values("date").set_index("date")

# Regular 10-minute grid from the first to the last available timestamp.
first_timestamp = stationless_data.index.min()
last_timestamp = stationless_data.index.max()
full_index = pd.date_range(start=first_timestamp, end=last_timestamp, freq="10min")

# Reindex onto the grid: timestamps without data become NaN rows.
# NOTE(review): full_index has no name, so after reset_index() the timestamp
# column is called "index" rather than "date" — confirm this is intended.
stationless_data = stationless_data.reindex(full_index).reset_index()
display(stationless_data)