TODOs:

- [ ] Finde in Airports_Rail_Access die Angabe zur City–Airport-Zeit.
- [ ] Extrahiere daraus die Fahrzeit in Minuten.
- [ ] Behandle Airports ohne Angaben.
- [ ] Baue eine Tabelle: Airport ↔ City-Airport-Time.
- [ ] Nimm aus Alternative_Rail_Routes die Flugzeit.
- [ ] Mappe Origin/Destination-Airport auf City–Airport-Time.
- [ ] Berechne Total Air Travel Duration.
- [ ] Interpretation + Ergebnis.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# Q3: 

## How can we estimate the typical travel time between the city’s central rail station and its nearest major airport, and how should this time be incorporated into the total air travel duration?

In [2]:
# Path to the data directory
data_path = "../Data"

# List all xlsx files in the data directory.
# Names starting with "~$" are lock files that Excel creates next to an open
# workbook; they also end in ".xlsx" but are not readable workbooks, so they
# are skipped. The relative order returned by os.listdir() is kept unchanged.
xlsx_files = [
    f for f in os.listdir(data_path)
    if f.endswith(".xlsx") and not f.startswith("~$")
]

In [3]:
# Load the first sheet of every workbook into a dict keyed by file name
dataframes = {}

for file in xlsx_files:
    file_path = os.path.join(data_path, file)
    # Airports_Rail_Access.xlsx takes its column names from the second row
    # (header=1); every other workbook uses the first row (header=0).
    header_row = 1 if file == "Airports_Rail_Access.xlsx" else 0
    df = pd.read_excel(file_path, header=header_row, sheet_name=0)  # only 1 sheet per file
    dataframes[file] = df
    print(f"Loaded: {file} — Shape: {df.shape} - Missing values: {df.isnull().sum().sum()}")

# (file name, DataFrame) pairs in load order
dataframes_list = list(dataframes.items())

Loaded: Airports_Rail_Access.xlsx — Shape: (318, 25) - Missing values: 309
Loaded: Candidate_Rail_Routes.xlsx — Shape: (8612, 11) - Missing values: 3172
Loaded: Alternative_Rail_Routes_Unidirectional.xlsx — Shape: (7425, 30) - Missing values: 27010
Loaded: Alternative_Rail_Routes_Bidirectional.xlsx — Shape: (3814, 16) - Missing values: 5148
Loaded: Airports_Indicators.xlsx — Shape: (58, 10) - Missing values: 0


## Subquestion:
## How can one estimate the typical travel time between City Central Station and the airport?

### Analyse Airports_Rail_Access.xlsx

In [4]:
# Airports_Rail_Access.xlsx
# Look the workbook up by file name instead of by position: the load order
# comes from os.listdir(), which returns entries in arbitrary order, so a
# positional index into dataframes_list is not stable across machines.
filename_ARA = "Airports_Rail_Access.xlsx"
df_ARA = dataframes[filename_ARA]

# Keep the relevant columns, give them short names, and extract the
# city <-> airport travel time in minutes from the free-text column.
# Everything is built in one chain from df_ARA, so re-running this cell
# always yields the same result (no in-place mutation).
df_ARA_clean = (
    df_ARA[["Airport", "IATA Code", "Train connection to City Station"]]
    .rename(columns={
        "Airport": "airport",
        "IATA Code": "iata_code",
        "Train connection to City Station": "connection_text",
    })
    .assign(
        # First "<digits> min" occurrence in the text; rows without a match
        # (or without text) become <NA> thanks to the nullable Int64 dtype.
        city_airport_time_min=lambda d: (
            d["connection_text"]
            .str.extract(r"(\d+)\s*min", expand=False)
            .astype("Int64")
        )
    )
    .drop(columns=["connection_text"])
)

In [5]:
df_ARA_clean.head()

Unnamed: 0,airport,iata_code,city_airport_time_min
0,Frankfurt Airport,FRA,12
1,Charles de Gaulle International Airport,CDG,31
2,Amsterdam Airport Schiphol,AMS,15
3,London Heathrow Airport,LHR,21
4,Madrid–Barajas Airport,MAD,19


### Analyse Alternative_Rail_Routes_Bidirectional.xlsx

In [6]:
# Alternative_Rail_Routes_Bidirectional.xlsx
# Look the workbook up by file name instead of by position: the load order
# comes from os.listdir(), which returns entries in arbitrary order, so a
# positional index into dataframes_list is not stable across machines.
filename_ARRB = "Alternative_Rail_Routes_Bidirectional.xlsx"
df_ARRB = dataframes[filename_ARRB]

# Keep the relevant columns, give them short names, and store the air travel
# time as whole minutes. Built in one chain from df_ARRB so the cell is
# idempotent on re-run.
df_ARRB_clean = (
    df_ARRB[["Origin", "Destination", "Air Travel Time"]]
    .rename(columns={
        "Origin": "origin",
        "Destination": "destination",
        "Air Travel Time": "air_travel_time_min",
    })
    .assign(
        # NOTE(review): astype(int) raises if the column contains missing
        # values — confirm "Air Travel Time" is always populated.
        air_travel_time_min=lambda d: d["air_travel_time_min"].round().astype(int)
    )
)

In [7]:
df_ARRB_clean

Unnamed: 0,origin,destination,air_travel_time_min
0,A Coruña,AGP,100
1,A Coruña,BCN,105
2,A Coruña,GVA,135
3,A Coruña,LGW,120
4,A Coruña,MAD,76
...,...,...,...
3809,Warsaw,ZRH,118
3810,Westerland,ZRH,99
3811,Wrocław,ZRH,97
3812,XRY,Zurich,165


## Merge

In [10]:
# In ARRB the origin and destination are not always given as IATA codes,
# so we flag which cells already hold a code and prepare a mapping from
# airport name to IATA code based on ARA.
# ---------------------

# Exactly three upper-case letters = looks like an IATA code
iata_pattern = r"^[A-Z]{3}$"

# Flag IATA codes in the origin / destination columns.
# na=False makes missing or non-string cells count as "not an IATA code"
# instead of producing NaN, which keeps both flags strictly boolean so the
# element-wise AND below is well defined.
df_ARRB_clean["origin_is_iata"] = df_ARRB_clean["origin"].str.match(iata_pattern, na=False)
df_ARRB_clean["destination_is_iata"] = df_ARRB_clean["destination"].str.match(iata_pattern, na=False)

# True only when both endpoints are already given as IATA codes
df_ARRB_clean["both_is_iata"] = df_ARRB_clean["origin_is_iata"] & df_ARRB_clean["destination_is_iata"]

# Create mapping from airport name to IATA code
iata_map = df_ARA_clean[["airport", "iata_code"]].copy()

# Function to map airport names to IATA codes


In [11]:
df_ARRB_clean

Unnamed: 0,origin,destination,air_travel_time_min,origin_is_iata,destination_is_iata,both_is_iata
0,A Coruña,AGP,100,False,True,False
1,A Coruña,BCN,105,False,True,False
2,A Coruña,GVA,135,False,True,False
3,A Coruña,LGW,120,False,True,False
4,A Coruña,MAD,76,False,True,False
...,...,...,...,...,...,...
3809,Warsaw,ZRH,118,False,True,False
3810,Westerland,ZRH,99,False,True,False
3811,Wrocław,ZRH,97,False,True,False
3812,XRY,Zurich,165,True,False,False


In [None]:
# Build the airport-name -> IATA-code lookup from the cleaned airport table
iata_dict = {
    airport_name: code
    for airport_name, code in zip(df_ARA_clean["airport"], df_ARA_clean["iata_code"])
}

# --- 2. Function that maps a single cell value to an IATA code ---
def get_iata(value, mapping=None):
    """Return the IATA code for one origin/destination cell.

    Parameters
    ----------
    value
        Cell content: either an IATA code, an airport name, or a missing
        value (None / NaN / pd.NA).
    mapping : dict, optional
        Airport-name -> IATA-code lookup. Defaults to the notebook-level
        ``iata_dict`` when omitted.

    Returns
    -------
    The IATA code, or the string ``"unknown"`` when the value is missing,
    is not a known airport name, or the mapping holds no code for it.
    """
    if pd.isna(value):
        return "unknown"

    value_str = str(value).strip()

    # Case 1: the value already is an IATA code (three letters A-Z)
    if re.fullmatch(r"[A-Z]{3}", value_str):
        return value_str

    # Case 2: the value is an airport name present in the mapping.
    # A name whose code is missing in the mapping (None / NaN) is treated as
    # unknown as well, so a NaN never leaks into the result column.
    lookup = iata_dict if mapping is None else mapping
    code = lookup.get(value_str)
    if code is not None and not pd.isna(code):
        return code

    # Case 3: no match found
    return "unknown"

# --- 3. Create the new columns ---
# target column -> source column whose cells are translated via get_iata
iata_targets = {
    "origin_as_iata": "origin",
    "destination_as_iata": "destination",
}
for new_col, src_col in iata_targets.items():
    df_ARRB_clean[new_col] = df_ARRB_clean[src_col].apply(get_iata)

df_ARRB_clean

Unnamed: 0,origin,destination,air_travel_time_min,origin_is_iata,destination_is_iata,both_is_iata,origin_as_iata,destination_as_iata
0,A Coruña,AGP,100,False,True,False,unknown,AGP
1,A Coruña,BCN,105,False,True,False,unknown,BCN
2,A Coruña,GVA,135,False,True,False,unknown,GVA
3,A Coruña,LGW,120,False,True,False,unknown,LGW
4,A Coruña,MAD,76,False,True,False,unknown,MAD
...,...,...,...,...,...,...,...,...
3809,Warsaw,ZRH,118,False,True,False,unknown,ZRH
3810,Westerland,ZRH,99,False,True,False,unknown,ZRH
3811,Wrocław,ZRH,97,False,True,False,unknown,ZRH
3812,XRY,Zurich,165,True,False,False,XRY,unknown


In [None]:
# create IATA-Mapping-Dict
iata_dict = dict(zip(df_ARA_clean["airport"], df_ARA_clean["iata_code"]))

# function to map airport names to IATA codes
def 