In [15]:
import pandas as pd
import numpy as np

In [16]:
# Read the five raw CSV extracts. The target names are unpacked in the same
# order as the paths listed below.
airports, bag_level, flight_level, pnr_remark_level, pnr_flight_level = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/Airports Data.csv",
        "data/Bag+Level+Data.csv",
        "data/Flight Level Data.csv",
        "data/PNR Remark Level Data.csv",
        "data/PNR+Flight+Level+Data.csv",
    )
)

In [17]:
# Parse the local scheduled departure date into datetime64 on the flight and
# PNR-flight frames; a frame without that column is left untouched.
for frame in (flight_level, pnr_flight_level):
    if 'scheduled_departure_date_local' not in frame.columns:
        continue
    frame['scheduled_departure_date_local'] = pd.to_datetime(
        frame['scheduled_departure_date_local']
    )


In [18]:
# Departure delay in minutes: actual minus scheduled local departure time.
# Positive values mean the flight left after its scheduled time.
actual_departure = pd.to_datetime(flight_level["actual_departure_datetime_local"])
scheduled_departure = pd.to_datetime(flight_level["scheduled_departure_datetime_local"])
flight_level["delay_minutes"] = (
    (actual_departure - scheduled_departure).dt.total_seconds() / 60
)

In [19]:
# Create a unique flight key
def flight_key(df):
    """Build a per-departure identifier ``<company_id>_<flight_number>_<YYYY-MM-DD>``.

    The departure date is normalised to an ISO ``YYYY-MM-DD`` string before
    concatenation, so the key does not depend on whether the caller's
    ``scheduled_departure_date_local`` column holds raw CSV strings or
    parsed datetimes. Any time-of-day component is ignored. This keeps keys
    comparable across frames that were and were not passed through
    ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``company_id``, ``flight_number`` and
        ``scheduled_departure_date_local``.

    Returns
    -------
    pandas.Series
        String keys aligned with ``df.index``. A missing departure date is
        rendered as the literal ``"NaT"`` in the key.
    """
    departure_date = (
        pd.to_datetime(df["scheduled_departure_date_local"])
        .dt.strftime("%Y-%m-%d")
        .fillna("NaT")  # strftime yields NaN for missing dates; keep the key a string
    )
    return (
        df["company_id"].astype(str) + "_" +
        df["flight_number"].astype(str) + "_" +
        departure_date
    )

# Attach the composite key to every frame that is keyed with flight_key().
for keyed_frame in (flight_level, pnr_flight_level, bag_level):
    keyed_frame["flight_key"] = flight_key(keyed_frame)

# The remark table gets no flight_key here; only its flight number is cast
# to text.
pnr_remark_level["flight_number"] = pnr_remark_level["flight_number"].astype(str)

In [20]:
avg_seats = flight_level["total_seats"].mean()
print(f"Average total seats across all flights: {avg_seats:.2f}")

# Categorize flights by aircraft size.
# right=False makes the bins left-closed: [0, 100), [100, 200), [200, inf).
# With pd.cut's default right-closed bins, a 100-seat aircraft would be
# labelled "Small (<100)" and a 200-seat aircraft would fall outside
# "Large (200+)", contradicting the labels.
flight_level["size_category"] = pd.cut(
    flight_level["total_seats"],
    bins=[0, 100, 200, np.inf],
    labels=["Small (<100)", "Medium (100-200)", "Large (200+)"],
    right=False,
)

# Share of flights in each size bucket, as a percentage.
seat_distribution = flight_level["size_category"].value_counts(normalize=True) * 100
print("\nSeat Distribution (%):")
print(seat_distribution)

# Pearson correlation between the raw seat count and departure delay;
# skipped when delay_minutes has not been computed yet.
print("\nSeat count correlation with delay:")
if "delay_minutes" in flight_level.columns:
    print(flight_level["total_seats"].corr(flight_level["delay_minutes"]))

Average total seats across all flights: 122.98

Seat Distribution (%):
size_category
Medium (100-200)    50.475367
Small (<100)        44.412890
Large (200+)         5.111742
Name: proportion, dtype: float64

Seat count correlation with delay:
0.026573305826133622


In [21]:
# Ratio of actual ground time to the minimum turn time. Values below 1.0
# mean the aircraft had less time on the ground than the stated minimum.
turn_ratio = flight_level["actual_ground_time_minutes"].div(
    flight_level["minimum_turn_minutes"]
)
flight_level["ground_time_ratio"] = turn_ratio

corr_ground_delay = flight_level["ground_time_ratio"].corr(flight_level["delay_minutes"])
print(f"\nCorrelation between ground_time_ratio and delay: {corr_ground_delay:.3f}")

# Compare delay rates for ratio <1.0 and >1.3
def delayed_ratio(subset):
    """Return the percentage of rows in ``subset`` whose ``delay_minutes`` is above zero.

    Rows with a missing delay compare as not delayed but still count in the
    denominator.
    """
    departed_late = subset["delay_minutes"].gt(0)
    return departed_late.mean() * 100

# Share of delayed departures for tight turns (ratio < 1.0) versus
# comfortable turns (ratio > 1.3).
ground_ratio = flight_level["ground_time_ratio"]
low_ratio = delayed_ratio(flight_level.loc[ground_ratio < 1.0])
high_ratio = delayed_ratio(flight_level.loc[ground_ratio > 1.3])
print(f"Delayed flights (<1.0 ratio): {low_ratio:.2f}%")
print(f"Delayed flights (>1.3 ratio): {high_ratio:.2f}%")


Correlation between ground_time_ratio and delay: 0.046
Delayed flights (<1.0 ratio): 81.72%
Delayed flights (>1.3 ratio): 43.49%


In [22]:
# Per-flight bag totals: number of non-null bag tags, and how many rows have
# bag_type exactly equal to "Transfer".
# NOTE(review): only the exact value "Transfer" is counted; confirm that
# bag_type has no other transfer-like categories that should be included.
bag_summary = (
    bag_level
    .assign(is_transfer=bag_level["bag_type"].eq("Transfer").astype(int))
    .groupby("flight_key")
    .agg(
        total_bags=("bag_tag_unique_number", "count"),
        transfer_bags=("is_transfer", "sum"),
    )
    .reset_index()
)

bag_summary["transfer_ratio"] = bag_summary["transfer_bags"].div(bag_summary["total_bags"])

# Left join keeps every flight; flights with no bag rows get NaN bag metrics,
# which .corr() skips pairwise.
flight_bag_merge = flight_level.merge(bag_summary, how="left", on="flight_key")

for bag_metric in ("total_bags", "transfer_ratio"):
    print(f"\nCorrelation between {bag_metric} and delay:")
    print(flight_bag_merge[bag_metric].corr(flight_bag_merge["delay_minutes"]))


Correlation between total_bags and delay:
0.023013057089993686

Correlation between transfer_ratio and delay:
0.08909317173585576


In [23]:
# Cast the join column to text on both sides so the merge compares like with like.
flight_level["flight_number"] = flight_level["flight_number"].astype(str)
pnr_remark_level["flight_number"] = pnr_remark_level["flight_number"].astype(str)

# Number of non-null special-service requests per flight number.
# NOTE(review): the grouping and the merge below use flight_number alone,
# not flight_key. The same total therefore appears to be attached to every
# departure that shares a flight number, regardless of date or carrier.
# Confirm whether the remark table can be tied to a specific departure
# (for example via the PNR-flight table) before relying on this figure.
ssr_count = (
    pnr_remark_level.groupby("flight_number")["special_service_request"]
    .count()
    .rename("ssr_count")
    .reset_index()
)

# Left join keeps all flights; those with no remarks get a count of 0.
flight_ssr_merge = flight_level.merge(ssr_count, how="left", on="flight_number")
flight_ssr_merge["ssr_count"] = flight_ssr_merge["ssr_count"].fillna(0)

# Report the correlation only when the delay column is available.
if "delay_minutes" not in flight_ssr_merge.columns:
    print("⚠️ 'delay_minutes' column not found in flight_level.")
else:
    corr_ssr_delay = flight_ssr_merge["ssr_count"].corr(flight_ssr_merge["delay_minutes"])
    print(f"\nCorrelation between SSR count and delay: {corr_ssr_delay:.3f}")


Correlation between SSR count and delay: 0.065


In [25]:
# Requires pnr_flight_level["flight_key"], which an earlier cell creates.

# Turn the Y/N flags into 1/0 helper columns on the PNR frame itself.
yes_no_flags = {
    'is_child': 'is_child_numeric',
    'is_stroller_user': 'is_stroller_numeric',
}
for raw_col, numeric_col in yes_no_flags.items():
    pnr_flight_level[numeric_col] = pnr_flight_level[raw_col].eq('Y').astype(int)

# Coerce lap_child_count to a number; unparseable or missing values become 0.
lap_children = pd.to_numeric(pnr_flight_level['lap_child_count'], errors='coerce')
pnr_flight_level['lap_child_count'] = lap_children.fillna(0)

# Per-flight totals, named directly in the aggregation.
pnr_pax = (
    pnr_flight_level.groupby('flight_key')
    .agg(
        total_lap_children=('lap_child_count', 'sum'),
        total_children=('is_child_numeric', 'sum'),
        total_stroller_users=('is_stroller_numeric', 'sum'),
    )
    .reset_index()
)

# Weighted score: lap children and strollers are weighted above a plain
# child count (lap children are treated as more complex; strollers add
# boarding time).
LAP_CHILD_WEIGHT = 1.5
STROLLER_WEIGHT = 1.2
pnr_pax['child_complexity'] = (
    pnr_pax['total_children']
    + pnr_pax['total_lap_children'] * LAP_CHILD_WEIGHT
    + pnr_pax['total_stroller_users'] * STROLLER_WEIGHT
)

# Attach the score to each flight; flights with no PNR rows score 0.
# Only child_complexity is filled; the component totals stay NaN for them.
flight_pax_merge = flight_level.merge(pnr_pax, how='left', on='flight_key')
flight_pax_merge['child_complexity'] = flight_pax_merge['child_complexity'].fillna(0)

# Correlation of the score with departure delay.
print("\nCorrelation between family complexity and delay:")
correlation = flight_pax_merge['child_complexity'].corr(flight_pax_merge['delay_minutes'])
print(f"{correlation:.4f}")

# First ten flights with their component counts.
print("\nSample of child complexity scores:")
sample_cols = [
    'flight_key',
    'total_children',
    'total_lap_children',
    'total_stroller_users',
    'child_complexity',
    'delay_minutes',
]
print(flight_pax_merge[sample_cols].head(10))

# Summary statistics and a zero / non-zero split of the score.
print("\nChild Complexity Distribution:")
complexity = flight_pax_merge['child_complexity']
print(complexity.describe())
print(f"\nFlights with children: {(complexity > 0).sum():,}")
print(f"Flights without children: {(complexity == 0).sum():,}")


Correlation between family complexity and delay:
-0.0217

Sample of child complexity scores:
           flight_key  total_children  total_lap_children  \
0  OO_4792_2025-08-04               1                   0   
1   UA_920_2025-08-03               5                   1   
2  UA_1776_2025-08-10               5                   0   
3  OO_5790_2025-08-06               2                   0   
4  UA_1398_2025-08-05               3                   0   
5  OO_5470_2025-08-07               1                   1   
6   UA_374_2025-08-15               1                   1   
7  UA_1577_2025-08-15              10                   1   
8   UA_881_2025-08-03              16                   0   
9  UA_2006_2025-08-09               9                   2   

   total_stroller_users  child_complexity  delay_minutes  
0                     0               1.0            7.0  
1                     0               6.5           22.0  
2                     2               7.4          111.0 

In [27]:
# Country lookup keyed by IATA code. Duplicate codes are dropped so the
# left joins below cannot multiply flight rows.
airport_countries = (
    airports[["airport_iata_code", "iso_country_code"]]
    .drop_duplicates("airport_iata_code")
)

# Step 1: Merge arrival airport country
intl_merge = flight_level.merge(
    airport_countries,
    left_on="scheduled_arrival_station_code",
    right_on="airport_iata_code",
    how="left"
).rename(columns={"iso_country_code": "arrival_country"})

# Step 2: Merge departure airport country
# (the lookup's airport_iata_code column collides with the one added in
# step 1, so the second copy is suffixed "_dep")
intl_merge = intl_merge.merge(
    airport_countries,
    left_on="scheduled_departure_station_code",
    right_on="airport_iata_code",
    how="left",
    suffixes=("", "_dep")
).rename(columns={"iso_country_code": "departure_country"})

# Step 3: Create international flag (departure country != arrival country).
# A missing country compares as "not equal" to everything, so a plain !=
# would flag every flight with an unmatched airport as international.
# Flights are flagged only when both countries are known; unknown routes
# default to 0 (domestic).
# NOTE(review): pd.read_csv parses the literal text "NA" as missing by
# default, so an ISO country code of "NA" would arrive here as NaN. Confirm
# whether the airports file contains such rows.
both_countries_known = (
    intl_merge["departure_country"].notna() & intl_merge["arrival_country"].notna()
)
intl_merge["is_international"] = (
    both_countries_known
    & (intl_merge["departure_country"] != intl_merge["arrival_country"])
).astype(int)

# Analysis
# .get(...) keeps the report working when one of the two classes is absent.
intl_counts = intl_merge["is_international"].value_counts()
print("\nInternational vs Domestic Flight Counts:")
print(intl_counts.sort_index())
print(f"\n{intl_counts.get(1, 0):,} international flights")
print(f"{intl_counts.get(0, 0):,} domestic flights")

print("\nAverage delay: International vs Domestic")
avg_delay_intl = intl_merge.groupby("is_international")["delay_minutes"].mean()
print(avg_delay_intl)
intl_avg_delay = avg_delay_intl.get(1, float("nan"))
domestic_avg_delay = avg_delay_intl.get(0, float("nan"))
print(f"\nInternational flights average: {intl_avg_delay:.2f} minutes")
print(f"Domestic flights average: {domestic_avg_delay:.2f} minutes")
print(f"Difference: {intl_avg_delay - domestic_avg_delay:.2f} minutes")

# Show some examples
print("\nSample International Flights:")
sample_cols = ["flight_number", "scheduled_departure_station_code", "departure_country",
               "scheduled_arrival_station_code", "arrival_country", "is_international", "delay_minutes"]
print(intl_merge[intl_merge["is_international"] == 1][sample_cols].head(5))

print("\nSample Domestic Flights:")
print(intl_merge[intl_merge["is_international"] == 0][sample_cols].head(5))


International vs Domestic Flight Counts:
is_international
0    7352
1     747
Name: count, dtype: int64

747 international flights
7,352 domestic flights

Average delay: International vs Domestic
is_international
0    20.864527
1    24.338688
Name: delay_minutes, dtype: float64

International flights average: 24.34 minutes
Domestic flights average: 20.86 minutes
Difference: 3.47 minutes

Sample International Flights:
   flight_number scheduled_departure_station_code departure_country  \
1            920                              ORD                US   
8            881                              ORD                US   
9           2006                              ORD                US   
18          3568                              ORD                US   
42          1899                              ORD                US   

   scheduled_arrival_station_code arrival_country  is_international  \
1                             LHR              GB                 1   
8        

In [29]:

# Parse the scheduled local departure timestamp once, store it back on the
# frame, and derive the hour of day from it.
scheduled_departure_local = pd.to_datetime(
    flight_level["scheduled_departure_datetime_local"]
)
flight_level["scheduled_departure_datetime_local"] = scheduled_departure_local
flight_level["hour"] = scheduled_departure_local.dt.hour

def is_peak(h):
    """Return True when hour ``h`` is within 6-9 or 16-19 (both ends inclusive)."""
    if 6 <= h <= 9:
        return True
    return 16 <= h <= 19

# Flag each flight as peak / off-peak from its scheduled departure hour.
flight_level["peak_period"] = flight_level["hour"].map(is_peak)

# Mean delay and number of flights for each of the two periods.
by_period = flight_level.groupby("peak_period")
avg_delay_peak = by_period["delay_minutes"].mean()
count_flights = by_period.size()

peak_gap = avg_delay_peak[True] - avg_delay_peak[False]
print("\nAverage delay during peak vs off-peak:")
print(f"  Off-Peak: {avg_delay_peak[False]:.2f} minutes ({count_flights[False]:,} flights)")
print(f"  Peak:     {avg_delay_peak[True]:.2f} minutes ({count_flights[True]:,} flights)")
print(f"  Difference: {peak_gap:+.2f} minutes")


Average delay during peak vs off-peak:
  Off-Peak: 20.43 minutes (4,070 flights)
  Peak:     21.95 minutes (4,029 flights)
  Difference: +1.52 minutes
