In [108]:
import requests
import pathlib
import xml.etree.ElementTree as ET
import json
import time
import string
import sys
import pandas as pd
from deutsche_bahn_api import ApiAuthentication
import numpy as np
import re
from datetime import datetime, timedelta


BASE_URL = "https://apis.deutschebahn.com/db-api-marketplace/apis/timetables/v1/"
STATION_URL_1 = BASE_URL + "station/135899"

ALLOWED_UIC_PREFIXES = {
    "80": "Germany", "81": "Austria", "82": "Luxembourg", "84": "The Netherlands", 
    "87": "France", "88": "Belgium", "86": "Denmark", "74": "Sweden", 
    "10": "Finland", "83": "Italy", "71": "Spain", "94": "Portugal", 
    "79": "Slovenia", "78": "Croatia", "73": "Greece", "51": "Poland", 
    "54": "Czechia", "55": "Hungary", "56": "Slovakia", "53": "Romania", 
    "52": "Bulgaria", "25": "Estonia", "26": "Lithuania", "27": "Latvia",
    "70": "United Kingdom", "60": "Ireland"
}

DB_CLIENT_ID = "15e1384706df102d7c7502c78b266e3b"
DB_API_KEY = "2e71b6410f16f919b1ed934793e3a5bf"
DB_CLIENT_ID2 = "a9f83c55d26c3ee7f48f4ce887ec2a57"
DB_API_KEY2 = "422cac21a0a83876c75efb8806589ea0"

header2 = {
    "DB-Client-Id": DB_CLIENT_ID2,
    "DB-Api-Key": DB_API_KEY2,
    "accept": "application/xml"
}

def fetch_stations():
    url = f"{BASE_URL}/station/*"
    try:
        response = requests.get(url, headers=header2)
        response.encoding = 'utf-8' # Fix encoding for special chars
        if response.status_code == 200:
            return response.text
        elif response.status_code == 404:
            return None
        else:
            print(f"  [!] API Error {response.status_code} for '{char}'")
            return None
    except Exception as e:
        print(f"  [!] Request Exception: {e}")
        return None
    
def parse_and_filter(xml_data):
    """
    Parses XML and returns a list of dictionaries for valid EU stations.
    """
    valid_stations = []
    if not xml_data:
        return valid_stations

    try:
        root = ET.fromstring(xml_data)
        for station in root.findall("station"):
            name = station.get("name")
            eva = station.get("eva")
            ds100 = station.get("ds100")
            
            if not eva or not name:
                continue

            # 1. Country Filter (First 2 digits)
            uic_prefix = eva[:2]
            country_name = ALLOWED_UIC_PREFIXES.get(uic_prefix)

            if country_name:
                # 2. Island Name Filter
                # Check if any blocklist word appears in the station name

                valid_stations.append({
                    "EVA_ID": eva,
                    "Station_Name": name,
                    "DS100": ds100,
                    "Country": country_name,
                    "UIC_Code": uic_prefix
                })

    except ET.ParseError:
        print("  [!] XML Parsing Error")
    
    return valid_stations

In [52]:
#Verify API access
api_authentication = ApiAuthentication( DB_CLIENT_ID2, DB_API_KEY2)
success:bool = api_authentication.test_credentials()

# Fetching and Parsing DB Bahn stations

In [49]:
print("--- Starting EU Station Download ---")    
all_data = []
# Use a set to track EVAs we've already seen to prevent duplicates
seen_evas = set()
xml_content = fetch_stations()

if xml_content:
    batch = parse_and_filter(xml_content)
    
    new_count = 0
    for item in batch:
        if item['EVA_ID'] not in seen_evas:
            all_data.append(item)
            seen_evas.add(item['EVA_ID'])
            new_count += 1
    
    print(f"Found {len(batch)} items ({new_count} new).")
else:
    print("No data.")

    # Rate limit protection
    time.sleep(0.3)

print("-" * 30)
print("Processing Data with Pandas...")

# Create DataFrame
df = pd.DataFrame(all_data)

# Sort by Country then Name
if not df.empty:
    df = df.sort_values(by=["Country", "Station_Name"])
    
    # Display sample
    print(f"\nTotal Stations Found: {len(df)}")
    print("\nBreakdown by Country:")
    print(df['Country'].value_counts())
    
    # Save to CSV
    filename = "eu_stations_dataset.csv"
    df.to_csv(filename, index=False, encoding='utf-8-sig') # utf-8-sig for Excel compatibility
    print(f"\n[SUCCESS] Dataset saved to '{filename}'")
else:
    print("[!] No stations found. Check your API keys or Filter logic.")

--- Starting EU Station Download ---
Found 28635 items (28635 new).
------------------------------
Processing Data with Pandas...

Total Stations Found: 28635

Breakdown by Country:
Country
Germany           8426
France            6351
Poland            3811
Czech Republic    3399
Austria           3118
Denmark            790
Belgium            596
Netherlands        445
Italy              237
Finland            215
Spain              185
Greece             179
Luxembourg         147
Slovakia           138
Hungary            128
United Kingdom      92
Sweden              68
Croatia             64
Slovenia            62
Ireland             47
Latvia              29
Lithuania           24
Portugal            23
Estonia             23
Bulgaria            19
Romania             19
Name: count, dtype: int64

[SUCCESS] Dataset saved to 'eu_stations_dataset.csv'


In [58]:
df.iloc[np.random.choice(df.shape[0],50)]

Unnamed: 0,EVA_ID,Station_Name,DS100,Country,UIC_Code
17720,8175190,"Oberland Abzw. Forsteralm, Gaflenz (A)",PAOOF,Austria,81
9550,5401020,Havrice,OTHVR,Czech Republic,54
22196,8070974,Schmiechen Albbahn,TSHA,Germany,80
4922,5100873,Cieszyn,OPCZY,Poland,51
23880,256762,Starnberg,D256762,Estonia,25
4710,8703094,Chevaigne,OFCLK,France,87
17455,5499756,Novina obratiste,PTNOE,Czech Republic,54
18853,8101352,Pfaffstätten,OAPFT,Austria,81
20953,8650611,Ringsted St. (togbus),PDRTB,Denmark,86
3716,8085001,Bruchsal Schloßgarten,RBRD,Germany,80


#  Railway station selection & filtering

Let's select all stations which are big + might have access to the airport

In [75]:
# get cities above 200k population
!pip install geonamescache

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting geonamescache
  Obtaining dependency information for geonamescache from https://files.pythonhosted.org/packages/5a/49/6a6e14f4e99a29991f3446c4204c4fb28906bc07fc480ec11b0cad03ca43/geonamescache-3.0.0-py3-none-any.whl.metadata
  Downloading geonamescache-3.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting typing-extensions (from geonamescache)
  Obtaining dependency information for typing-extensions from https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl.metadata
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Downloading geonamescache-3.0.0-py3-none-any.whl (32.0 MB)
   ---------------------------------------- 0.0/32.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/32.0 MB 660.6 kB/s eta 0:00:49
   ---------------------------------------- 0.1/32.0 MB 1.3 MB/s eta 0


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [127]:
import geonamescache

gc = geonamescache.GeonamesCache()
cities = gc.get_cities()
CITIES_300K = []

countries = gc.get_countries()
ALLOWED_COUNTRIES = {}
for code, info in countries.items():
    if info['continentcode'] == 'EU' and info["name"] in ALLOWED_UIC_PREFIXES.values():
        ALLOWED_COUNTRIES[code] = info["name"]

cities

{'3040051': {'geonameid': 3040051,
  'name': 'les Escaldes',
  'latitude': 42.50729,
  'longitude': 1.53414,
  'countrycode': 'AD',
  'population': 15853,
  'timezone': 'Europe/Andorra',
  'admin1code': '08',
  'alternatenames': ["Ehskal'des-Ehndzhordani",
   'Escaldes',
   'Escaldes-Engordany',
   'Les Escaldes',
   'esukarudesu=engorudani jiao qu',
   'lai sai si ka er de-en ge er da',
   'Эскальдес-Энджордани',
   'エスカルデス＝エンゴルダニ教区',
   '萊塞斯卡爾德-恩戈爾達',
   '萊塞斯卡爾德－恩戈爾達']},
 '3041563': {'geonameid': 3041563,
  'name': 'Andorra la Vella',
  'latitude': 42.50779,
  'longitude': 1.52109,
  'countrycode': 'AD',
  'population': 20430,
  'timezone': 'Europe/Andorra',
  'admin1code': '07',
  'alternatenames': ['ALV',
   'Ando-la-Vyey',
   'Andora',
   'Andora la Vela',
   'Andora la Velja',
   "Andora lja Vehl'ja",
   'Andoro Malnova',
   'Andorra',
   'Andorra Tuan',
   'Andorra a Vella',
   'Andorra la Biella',
   'Andorra la Vella',
   'Andorra la Vielha',
   'Andorra-a-Velha',
   "Andorra-

In [130]:
def get_cities_above_population(population_threshold   = 250000):
    result = pd.DataFrame(columns=["geonameid","name","countrycode","latitude","longitude","population"])
    for city_id, city_info in cities.items():
        if city_info['population'] >= population_threshold and city_info['countrycode'] in ALLOWED_COUNTRIES:
            #  get for each records with 1/2 alternate names in order not to miss anything
            alt_names = city_info.get('alternatenames', [])
            result = pd.concat([result, pd.DataFrame([{"geonameid": city_id, "name": city_info['name'], "countrycode": city_info['countrycode'], "latitude": city_info['latitude'], "longitude": city_info['longitude'], "population": city_info['population']}])], ignore_index=True)
            if len(alt_names) > 0:
                result = pd.concat([result, pd.DataFrame([{"geonameid": city_id, "name": alt_names[0], "countrycode": city_info['countrycode'], "latitude": city_info['latitude'], "longitude": city_info['longitude'], "population": city_info['population']}])], ignore_index=True) 
            if len(alt_names) > 1:
                result = pd.concat([result, pd.DataFrame([{"geonameid": city_id, "name": alt_names[1], "countrycode": city_info['countrycode'], "latitude": city_info['latitude'], "longitude": city_info['longitude'], "population": city_info['population']}])], ignore_index=True) 
    return result.sort_values(by="geonameid", ascending=False)

CITIES_300K = get_cities_above_population(300000)
CITIES_250K = get_cities_above_population(250000)
CITIES_200K = get_cities_above_population(200000)
len(CITIES_250K)


  result = pd.concat([result, pd.DataFrame([{"geonameid": city_id, "name": city_info['name'], "countrycode": city_info['countrycode'], "latitude": city_info['latitude'], "longitude": city_info['longitude'], "population": city_info['population']}])], ignore_index=True)
  result = pd.concat([result, pd.DataFrame([{"geonameid": city_id, "name": city_info['name'], "countrycode": city_info['countrycode'], "latitude": city_info['latitude'], "longitude": city_info['longitude'], "population": city_info['population']}])], ignore_index=True)
  result = pd.concat([result, pd.DataFrame([{"geonameid": city_id, "name": city_info['name'], "countrycode": city_info['countrycode'], "latitude": city_info['latitude'], "longitude": city_info['longitude'], "population": city_info['population']}])], ignore_index=True)


480

In [133]:
CITIES_200K[:50]

Unnamed: 0,geonameid,name,countrycode,latitude,longitude,population
174,8354626,Bezirk Hamburg-Nord,DE,53.58935,9.984,315514
173,8354626,Arrondissement de Hambourg-Nord,DE,53.58935,9.984,315514
172,8354626,Hamburg-Nord,DE,53.58935,9.984,315514
292,8285534,,ES,40.4984,-3.7314,220085
291,8285534,Fuencarral-El Pardo,ES,40.4984,-3.7314,220085
10,8063098,,AT,48.16116,16.38233,201882
9,8063098,Favoriten,AT,48.16116,16.38233,201882
567,776069,Białystok,PL,53.13333,23.16433,291855
569,776069,Balstogė,PL,53.13333,23.16433,291855
568,776069,Balstoge,PL,53.13333,23.16433,291855


In [134]:
AIRPORT_CITIES = [
    # Germany
    "Berlin", "München", "Frankfurt", "Hamburg", "Köln", "Düsseldorf", "Stuttgart", "Hannover", "Nürnberg", "Leipzig", "Dresden", "Bremen",
    # France
    "Paris", "Lyon", "Marseille", "Nice", "Toulouse", "Bordeaux", "Lille", "Strasbourg",
    # UK & Ireland
    "London", "Manchester", "Birmingham", "Edinburgh", "Glasgow", "Dublin",
    # Italy
    "Roma", "Milano", "Venezia", "Napoli", "Firenze", "Bologna", "Torino",
    # Spain & Portugal
    "Madrid", "Barcelona", "Sevilla", "Valencia", "Malaga", "Lisboa", "Porto",
    # Benelux
    "Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Bruxelles", "Brussels", "Antwerpen", "Luxembourg",
    # Central/East Europe
    "Wien", "Zürich", "Genève", "Basel", "Bern", "Praha", "Warszawa", "Krakow", "Budapest", "Bratislava", "Ljubljana", "Zagreb", "Bucuresti", "Sofia",
    # Nordics
    "København", "Stockholm", "Oslo", "Helsinki", "Malmö", "Göteborg"
]


MAIN_STATION_PATTERNS = [
    r"Hbf", r"Hauptbahnhof",                   # German/Austrian
    r"Centraal", r"Centrale", r"Central",      # Dutch/Italian/English/Nordic
    r"Termini", r"Tiburtina",                  # Roma
    r"Santa Maria Novella", r"S\.M\.N\.",      # Firenze
    r"Porta Nuova",                            # Torino
    r"Gare de Lyon", r"Gare du Nord", r"Gare de l'Est", r"Montparnasse", r"Saint-Lazare", # Paris
    r"Part-Dieu", r"Perrache",                 # Lyon
    r"St-Charles",                             # Marseille
    r"Ville",                                  # Nice Ville, Luxembourg Ville
    r"St Pancras", r"Euston", r"King's Cross", r"Paddington", r"Victoria", r"Waterloo", # London
    r"Piccadilly", r"Waverley", r"Connolly", r"Heuston", # UK/Ireland Regional
    r"Sants", r"Atocha", r"Chamartin", r"Santa Justa", r"Joaquin Sorolla", # Spain
    r"Campanha", r"Oriente", r"Santa Apolonia", # Portugal
    r"hl\.n\.", r"Glowny", r"Centralna",       # CZ/PL
    r"Keleti", r"Nyugati", r"Deli",            # Budapest
    r"Midi", r"Zuid", r"Noord", r"Brussels-South", # Brussels
    r"HB",                                     # Zurich/Swiss
]

def filter_major_hubs(df):
    """
    Filters a DataFrame of stations to return only Main Hubs in Airport Cities.
    Assumes df has columns: 'Station_Name', 'EVA_ID'
    """
    filtered_rows = []
    
    # Pre-compile regex for performance
    # Matches if any main station pattern exists in the string
    hub_regex = re.compile("|".join(MAIN_STATION_PATTERNS), re.IGNORECASE)
    
    print(f"Filtering {len(df)} stations...")

    for index, row in df.iterrows():
        raw_name = str(row.get('Station_Name', ''))
        eva_id = str(row.get('EVA_ID', ''))
        
        # 1. Normalize Name (remove brackets like '(Main)' or '(tief)')
        clean_name = raw_name.replace("(", " ").replace(")", " ").strip()
        
        # 2. Identify City
        found_city = None
        CITY_DB = set(AIRPORT_CITIES).union(set(CITIES_200K))
        for city in CITY_DB:
            # Logic: The city name must appear at the START of the station name
            # OR be a distinct word (e.g., "Flughafen Wien" -> matches Wien)
            # using regex boundary \b ensures we don't match "Bermuda" for "Bern"
            if re.search(rf"\b{city}\b", clean_name, re.IGNORECASE):
                found_city = city
                break
        
        if not found_city:
            continue
            
        # 3. Check for "Main Station" status
        is_hub = False
        
        # Check A: Explicit Keyword Match (e.g. "Hbf", "Termini")
        if hub_regex.search(clean_name):
            is_hub = True
            
        # Check B: The name IS the city name exactly (e.g. "München", "Nice", "Bratislava")
        # Some API data lists the main station just as the city name.
        if clean_name.lower() == found_city.lower():
            is_hub = True
            
        # Check C: Special handling for specific complex names
        # e.g., "Frankfurt(Main)Flughafen" -> We might want to EXCLUDE airports if we only want rail hubs,
        # OR include them if you want airport rail links.
        # Here we EXCLUDE actual airport stations to focus on CITY CENTRAL stations.
        if "Flughafen" in clean_name or "Airport" in clean_name:
            is_hub = False

        if is_hub:
            # Append normalized data
            row_data = row.to_dict()
            row_data['Assigned_City'] = found_city
            filtered_rows.append(row_data)

    # Create new DataFrame
    result_df = pd.DataFrame(filtered_rows)
    
    # Deduplicate: If multiple stations match for one city, keep them all 
    # (e.g. Paris has Nord and Est), but remove exact duplicates of EVA_ID.
    if not result_df.empty:
        result_df.drop_duplicates(subset=['EVA_ID'], inplace=True)
        result_df.sort_values(by=['Assigned_City', 'Station_Name'], inplace=True)
        
    return result_df

In [135]:
train_stations_df = filter_major_hubs(df)
train_stations_df.reset_index(drop=True, inplace=True)
train_stations_df

Filtering 28635 stations...


Unnamed: 0,EVA_ID,Station_Name,DS100,Country,UIC_Code,Assigned_City
0,8400058,Amsterdam Centraal,XNAC,Netherlands,84,Amsterdam
1,8498058,Amsterdam Centraal Eurostar,PNACE,Netherlands,84,Amsterdam
2,8400048,Amsterdam Noord Metro,PNANM,Netherlands,84,Amsterdam
3,8400061,Amsterdam Zuid,XNAZ,Netherlands,84,Amsterdam
4,8800007,Antwerpen Centraal,XBAC,Belgium,88,Antwerpen
...,...,...,...,...,...,...
106,8170077,"Hauptbahnhof (Wiedner Gürtel), Wien (A)",PAAAR,Austria,81,Wien
107,8103000,Wien Hbf,XAWIE,Austria,81,Wien
108,8100004,Wien Hbf (Autoreisezuganlage),XAWIO,Austria,81,Wien
109,8101590,Wien Hbf (Bahnsteige 1-2),OASRP,Austria,81,Wien


In [137]:
train_stations_df["Station_Name"]

0                           Amsterdam Centraal
1                  Amsterdam Centraal Eurostar
2                        Amsterdam Noord Metro
3                               Amsterdam Zuid
4                           Antwerpen Centraal
                        ...                   
106    Hauptbahnhof (Wiedner Gürtel), Wien (A)
107                                   Wien Hbf
108              Wien Hbf (Autoreisezuganlage)
109                  Wien Hbf (Bahnsteige 1-2)
110             Wien Hbf (Busbahnhof Vorplatz)
Name: Station_Name, Length: 111, dtype: object

# Parsing Timetable for given stations

In [65]:
warsaw_eva:str = df.loc[df["Station_Name"] == "Warszawa Centralna"]["EVA_ID"]
warsaw_eva

26877    5100065
Name: EVA_ID, dtype: object

In [None]:
# retrieve for single week in 
def fetch_hourly_plan(eva_no, date_str, hour_str):
    """
    Fetches the planned timetable for a specific station, date (YYMMDD), and hour (HH).
    """
    url = f"{BASE_URL}/plan/{eva_no}/{date_str}/{hour_str}"
    try:
        response = requests.get(url, headers=header2)
        if response.status_code == 200:
            return response.text
        elif response.status_code == 404:
            return None 
        elif response.status_code == 401:
            print("[!] Authentication Failed. Check API Keys.")
            return None
        else:
            print(f"  [!] Error {response.status_code} for {hour_str}:00")
            return None
    except Exception as e:
        print(f"  [!] Request Error: {e}")
        return None

In [None]:
today = datetime.now()
schedule_date = today

for h in range(7):
    fetch_hourly_plan(warsaw_eva.values[0], schedule_date.strftime("%y%m%d"), "12")

'<?xml version=\'1.0\' encoding=\'UTF-8\'?><timetable station=\'Warszawa Centralna\'><s id="-1534404592360704587-2512231249-2"><tl f="F" t="p" o="51" c="EC" n="44"/><ar pt="2512231255" pp="" fb="EC 44" hi="1" ppth="Warszawa Wschodnia"/><dp pt="2512231300" pp="" fb="EC 44" ppth="Warszawa Zachodnia|Kutno|Konin|Poznan Glowny|Zbaszynek|Swiebodzin|Rzepin|Frankfurt(Oder)|Berlin Ostbahnhof|Berlin Hbf"/></s></timetable>'