In [11]:
import pandas as pd
import requests
DATA_DIR_URL = "https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/monthly/kl/historical"
STATION_INFO = DATA_DIR_URL + "/KL_Monatswerte_Beschreibung_Stationen.txt"

# df = pd.read_csv(STATION_INFO, encoding='windows-1252', sep=None, skiprows=[0,1,2])
# Can't parse the headers along with the rest, add manually
station_info_df = pd.read_fwf(
    STATION_INFO,
    encoding="windows-1252",
    colspecs="infer",
    infer_nrows=10,
    skiprows=[0, 1],
    header=None,
    names=[
        "Stations_id",
        "von_datum",
        "bis_datum",
        "Stationshoehe",
        "geoBreite",
        "geoLaenge",
        "Stationsname",
        "Bundesland",
    ],
)
station_info_df


Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland
0,1,19310101,19860630,478,47.8413,8.8493,Aach,Baden-Württemberg
1,3,18510101,20110331,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen
2,44,19710301,20221231,44,52.9336,8.2370,Großenkneten,Niedersachsen
3,52,19730101,20011231,46,53.6623,10.1990,Ahrensburg-Wulfsdorf,Schleswig-Holstein
4,61,19750701,19780831,339,48.8443,12.6171,Aiterhofen,Bayern
...,...,...,...,...,...,...,...,...
1119,19172,20200901,20221231,48,54.0246,9.3880,Wacken,Schleswig-Holstein
1120,19318,19461101,19781231,292,50.7180,10.4310,Schmalkalden,Thüringen
1121,19364,19371201,19441231,720,50.6167,10.8167,Schmiedefeld/Rennsteig,Thüringen
1122,19378,19580101,19771231,505,50.8333,10.5833,Finsterbergen,Thüringen


In [12]:
station_info_df[station_info_df['Stations_id'] == 19172]

Unnamed: 0,Stations_id,von_datum,bis_datum,Stationshoehe,geoBreite,geoLaenge,Stationsname,Bundesland
1119,19172,20200901,20221231,48,54.0246,9.388,Wacken,Schleswig-Holstein


Looks like this file lies about the actual data availability though. Because the end dates here do not match the end dates of the actual available data. So it's useful only for matching station IDs to names and coordinates, but not for filtering which stations to show in the first place.

So I will have to build my own data availability DF from the URLs.

In [13]:
import requests
from bs4 import BeautifulSoup

def get_url_paths(url, ext='', params={}):
    response = requests.get(url, params=params)
    if response.ok:
        response_text = response.text
    else:
        return response.raise_for_status()
    soup = BeautifulSoup(response_text, 'html.parser')
    parent = [url + node.get('href') for node in soup.find_all('a') if node.get('href').endswith(ext)]
    return parent

url = DATA_DIR_URL
ext = 'zip'
data_urls = get_url_paths(url, ext)


data_urls[0]

'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/monthly/kl/historicalmonatswerte_KL_00001_19310101_19860630_hist.zip'

In [14]:
stn_id = 19172
f'KL_{stn_id:05}'

'KL_19172'

In [42]:
import re

def parse_data_urls(data_urls): 
    availability = {}
    for url in data_urls:
        m = re.search('KL_(\d*)_(\d*)_(\d*)_hist', url)
        stn_id, start, end = map(int, m.groups())
        if stn_id in availability:
            print(f'Warning, more than one URL for {stn_id}')
        availability[stn_id] = {'start': start, 'end': end, 'url': url}
    return availability


m = re.search('KL_(\d*)_(\d*)_(\d*)_hist', data_urls[0])
stn_id, start, end = map(int, m.groups())
start

actual_data_availability = pd.DataFrame(parse_data_urls(data_urls)).T.reset_index(names='station_id')
actual_data_availability.head()

Unnamed: 0,station_id,start,end,url
0,1,19310101,19860630,https://opendata.dwd.de/climate_environment/CD...
1,3,18510101,20110331,https://opendata.dwd.de/climate_environment/CD...
2,44,19710301,20211231,https://opendata.dwd.de/climate_environment/CD...
3,52,19730101,20011231,https://opendata.dwd.de/climate_environment/CD...
4,61,19750701,19780831,https://opendata.dwd.de/climate_environment/CD...


In [39]:
# now filter for availability
useful_stations = actual_data_availability[(actual_data_availability['end'] >= 20200000) & (actual_data_availability['start'] < 19800000)]
useful_stations

Unnamed: 0,station_id,start,end,url
2,44,19710301,20211231,https://opendata.dwd.de/climate_environment/CD...
8,73,19520701,20211231,https://opendata.dwd.de/climate_environment/CD...
9,78,19610101,20211231,https://opendata.dwd.de/climate_environment/CD...
10,91,19781101,20211231,https://opendata.dwd.de/climate_environment/CD...
20,142,19550101,20211231,https://opendata.dwd.de/climate_environment/CD...
...,...,...,...,...
981,5825,19530701,20211231,https://opendata.dwd.de/climate_environment/CD...
990,5906,18810101,20211231,https://opendata.dwd.de/climate_environment/CD...
994,5941,19370401,20211231,https://opendata.dwd.de/climate_environment/CD...
1006,6159,19590301,20211231,https://opendata.dwd.de/climate_environment/CD...


In [41]:
print(f'Of {len(data_urls)} data files, only {len(useful_stations)} are long enough to be interesting.')

Of 1103 data files, only 290 are long enough to be interesting.


Are there duplicates? Try getting the Station IDs out of the list of paths and then check for mismatch.