Flight Difficulty Score Development

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [87]:
# Set seaborn style
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.1)
plt.rcParams['figure.figsize'] = (14, 6)

In [88]:
airports = pd.read_csv("data/Airports Data.csv")
bag_level = pd.read_csv("data/Bag+Level+Data.csv")
flight_level = pd.read_csv("data/Flight Level Data.csv")
pnr_remark_level = pd.read_csv("data/PNR Remark Level Data.csv")
pnr_flight_level = pd.read_csv("data/PNR+Flight+Level+Data.csv")

In [89]:
# Convert datetime columns
flight_level['scheduled_departure_datetime_local'] = pd.to_datetime(flight_level['scheduled_departure_datetime_local'])
flight_level['actual_departure_datetime_local'] = pd.to_datetime(flight_level['actual_departure_datetime_local'])
flight_level['scheduled_departure_date_local'] = pd.to_datetime(flight_level['scheduled_departure_date_local'])
bag_level['scheduled_departure_date_local'] = pd.to_datetime(bag_level['scheduled_departure_date_local'])
pnr_flight_level['scheduled_departure_date_local'] = pd.to_datetime(pnr_flight_level['scheduled_departure_date_local'])

In [90]:
# Remove negative or zero ground times
invalid_ground_time = (flight_level['scheduled_ground_time_minutes'] <= 0) | \
                      (flight_level['minimum_turn_minutes'] <= 0)
print(f"  Rows with invalid ground time: {invalid_ground_time.sum():,}")
flight_level = flight_level[~invalid_ground_time].copy()

  Rows with invalid ground time: 312


In [91]:
# Calculate delay
flight_level['delay_minutes'] = (
    flight_level['actual_departure_datetime_local'] - 
    flight_level['scheduled_departure_datetime_local']
).dt.total_seconds() / 60

In [92]:
# Create flight key
flight_key = ['company_id', 'flight_number', 'scheduled_departure_date_local',
              'scheduled_departure_station_code', 'scheduled_arrival_station_code']

# Start with flight level data
df_master = flight_level.copy()


In [93]:
df_master['ground_time_ratio'] = (
    df_master['scheduled_ground_time_minutes'] / df_master['minimum_turn_minutes']
)

df_master['ground_time_pressure_score'] = np.where(
    df_master['ground_time_ratio'] < 1.0, 10,      # Critical - below minimum
    np.where(df_master['ground_time_ratio'] < 1.1, 7,  # Very tight
             np.where(df_master['ground_time_ratio'] < 1.3, 4, 1))  # Tight / Adequate
)

In [94]:
bag_summary = bag_level.groupby(flight_key).agg({
    'bag_tag_unique_number': 'count',
    'bag_type': lambda x: (x == 'Transfer').sum()
}).reset_index()
bag_summary.columns = list(bag_summary.columns[:-2]) + ['total_bags', 'transfer_bags']
bag_summary['checked_bags'] = bag_summary['total_bags'] - bag_summary['transfer_bags']
bag_summary['transfer_ratio'] = bag_summary['transfer_bags'] / bag_summary['total_bags']

df_master = df_master.merge(bag_summary, on=flight_key, how='left')
df_master['total_bags'] = df_master['total_bags'].fillna(0)
df_master['transfer_ratio'] = df_master['transfer_ratio'].fillna(0)


In [95]:
pnr_pax = pnr_flight_level.groupby(flight_key).agg({
    'total_pax': 'sum',
    'lap_child_count': 'sum',
    'is_child': 'sum',
    'basic_economy_ind': 'sum',
    'is_stroller_user': 'sum'
}).reset_index()

df_master = df_master.merge(pnr_pax, on=flight_key, how='left')
df_master['total_pax'].fillna(0, inplace=True)
df_master['load_factor'] = df_master['total_pax'] / df_master['total_seats']

# Load factor score - REDUCED WEIGHT based on negative correlation with delay
# Analysis shows load factor has -0.25 correlation with delay
# Only very high loads (>95%) might cause minor issues with boarding
df_master['load_factor_score'] = np.where(
    df_master['load_factor'] > 0.95, 3,  # Reduced from 10
    np.where(df_master['load_factor'] > 0.90, 2,  # Reduced from 7
             np.where(df_master['load_factor'] > 0.85, 1, 0))  # Reduced from 4
)

# Bags per passenger
df_master['bags_per_pax'] = df_master['total_bags'] / df_master['total_pax'].replace(0, 1)
df_master['baggage_volume_score'] = np.where(
    df_master['bags_per_pax'] > 1.5, 8,
    np.where(df_master['bags_per_pax'] > 1.2, 5,
             np.where(df_master['bags_per_pax'] > 0.8, 2, 0))
)

# Transfer bag complexity
df_master['transfer_complexity_score'] = np.where(
    df_master['transfer_ratio'] > 0.6, 8,
    np.where(df_master['transfer_ratio'] > 0.4, 5,
             np.where(df_master['transfer_ratio'] > 0.2, 2, 0))
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_master['total_pax'].fillna(0, inplace=True)


In [101]:
# First, convert Y/N to 1/0, then aggregate
flight_key = ['company_id', 'flight_number', 'scheduled_departure_date_local',
              'scheduled_departure_station_code', 'scheduled_arrival_station_code']

# Create numeric versions in pnr_flight_level
pnr_flight_level['is_child_numeric'] = (pnr_flight_level['is_child'] == 'Y').astype(int)
pnr_flight_level['is_stroller_numeric'] = (pnr_flight_level['is_stroller_user'] == 'Y').astype(int)
pnr_flight_level['lap_child_count'] = pd.to_numeric(pnr_flight_level['lap_child_count'], errors='coerce').fillna(0)
pnr_flight_level['total_pax'] = pd.to_numeric(pnr_flight_level['total_pax'], errors='coerce').fillna(0)

# Aggregate to flight level
pnr_aggregated = pnr_flight_level.groupby(flight_key).agg({
    'is_child_numeric': 'sum',      # Total children on flight
    'lap_child_count': 'sum',       # Total lap children
    'is_stroller_numeric': 'sum',   # Total stroller users
    'total_pax': 'sum',             # Total passengers from all PNRs
    'record_locator': 'nunique'     # Number of unique PNRs (family groups)
}).reset_index()

# Rename for clarity
pnr_aggregated.rename(columns={
    'is_child_numeric': 'total_children',
    'lap_child_count': 'total_lap_children',
    'is_stroller_numeric': 'total_stroller_users',
    'total_pax': 'total_pax_from_pnr',
    'record_locator': 'num_pnr_groups'
}, inplace=True)

# Merge with flight_level data
df_master = flight_level.merge(pnr_aggregated, on=flight_key, how='left')

# Fill NaN values
df_master['total_children'] = df_master['total_children'].fillna(0)
df_master['total_lap_children'] = df_master['total_lap_children'].fillna(0)
df_master['total_stroller_users'] = df_master['total_stroller_users'].fillna(0)
df_master['total_pax_from_pnr'] = df_master['total_pax_from_pnr'].fillna(0)
df_master['num_pnr_groups'] = df_master['num_pnr_groups'].fillna(0)

# Create enhanced features
df_master['child_complexity'] = (
    df_master['total_children'] + 
    df_master['total_lap_children'] * 1.5 +  # Lap children are more complex
    df_master['total_stroller_users'] * 1.2   # Strollers add complexity
)

# Average PNR size (larger groups = families)
df_master['avg_pnr_size'] = np.where(
    df_master['num_pnr_groups'] > 0,
    df_master['total_pax_from_pnr'] / df_master['num_pnr_groups'],
    0
)

df_master['family_complexity_score'] = np.where(
    df_master['child_complexity'] > 15, 6,
    np.where(df_master['child_complexity'] > 10, 4,
             np.where(df_master['child_complexity'] > 5, 2, 0))
)

In [102]:
df_master.sample(10)

Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,...,minimum_turn_minutes,delay_minutes,total_children,total_lap_children,total_stroller_users,total_pax_from_pnr,num_pnr_groups,child_complexity,avg_pnr_size,family_complexity_score
5163,UA,561,2025-08-06,ORD,MCI,2025-08-06 12:45:00+00:00,2025-08-06T14:22:00Z,2025-08-06 13:11:00+00:00,2025-08-06T14:43:00Z,166,...,51,26.0,11,3,2,202,126,17.9,1.603175,6
5463,UA,2071,2025-08-11,ORD,SAN,2025-08-11 07:11:00+00:00,2025-08-11T09:34:00Z,2025-08-11 07:21:00+00:00,2025-08-11T09:25:00Z,179,...,62,10.0,14,1,4,228,126,20.3,1.809524,6
4492,OO,5121,2025-08-10,ORD,FOD,2025-08-10 07:45:00+00:00,2025-08-10T09:20:00Z,2025-08-10 07:59:00+00:00,2025-08-10T09:54:00Z,50,...,29,14.0,1,0,0,8,3,1.0,2.666667,0
1114,UA,1564,2025-08-15,ORD,MSP,2025-08-15 20:05:00+00:00,2025-08-15T21:43:00Z,2025-08-15 20:12:00+00:00,2025-08-15T22:44:00Z,166,...,51,7.0,5,2,0,171,116,8.0,1.474138,2
4402,UA,2483,2025-08-12,ORD,IAH,2025-08-12 10:35:00+00:00,2025-08-12T13:29:00Z,2025-08-12 10:35:00+00:00,2025-08-12T13:15:00Z,179,...,56,0.0,6,0,0,193,139,6.0,1.388489,2
6370,UA,1620,2025-08-11,ORD,SEA,2025-08-11 08:50:00+00:00,2025-08-11T11:37:00Z,2025-08-11 08:44:00+00:00,2025-08-11T11:27:00Z,200,...,62,-6.0,7,1,1,229,152,9.7,1.506579,2
6071,UA,622,2025-08-01,ORD,DEN,2025-08-01 13:04:00+00:00,2025-08-01T14:46:00Z,2025-08-01 13:04:00+00:00,2025-08-01T14:43:00Z,364,...,87,0.0,17,0,2,421,258,19.4,1.631783,6
7762,UA,1178,2025-08-11,ORD,ORF,2025-08-11 18:22:00+00:00,2025-08-11T21:40:00Z,2025-08-11 18:19:00+00:00,2025-08-11T21:36:00Z,126,...,43,-3.0,3,0,0,128,91,3.0,1.406593,0
2211,YX,3542,2025-08-14,ORD,YUL,2025-08-14 17:58:00+00:00,2025-08-14T21:29:00Z,2025-08-14 18:02:00+00:00,2025-08-14T21:07:00Z,76,...,42,4.0,1,0,0,74,61,1.0,1.213115,0
4436,OO,5731,2025-08-13,ORD,STL,2025-08-13 11:01:00+00:00,2025-08-13T12:27:00Z,2025-08-13 10:56:00+00:00,2025-08-13T12:06:00Z,76,...,34,-5.0,4,0,0,85,62,4.0,1.370968,0


In [71]:
flight_level.sample(10)

Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,fleet_type,carrier,scheduled_ground_time_minutes,actual_ground_time_minutes,minimum_turn_minutes,delay_minutes
717,UA,422,2025-08-10,ORD,ATH,2025-08-10 16:15:00+00:00,2025-08-11T10:30:00Z,2025-08-10 16:20:00+00:00,2025-08-11T10:41:00Z,318,B787-10,Mainline,185,225,155,5.0
3897,UA,2369,2025-08-01,ORD,AUS,2025-08-01 08:50:00+00:00,2025-08-01T11:44:00Z,2025-08-01 08:45:00+00:00,2025-08-01T11:28:00Z,179,B737-MAX9,Mainline,75,64,56,-5.0
2495,UA,1010,2025-08-06,ORD,DCA,2025-08-06 08:00:00+00:00,2025-08-06T10:56:00Z,2025-08-06 07:50:00+00:00,2025-08-06T10:40:00Z,150,A320-200,Mainline,92,98,52,-10.0
7793,YX,3421,2025-08-01,ORD,ROC,2025-08-01 07:15:00+00:00,2025-08-01T10:17:00Z,2025-08-01 08:33:00+00:00,2025-08-01T11:05:00Z,76,ERJ-175,Express,47,146,34,78.0
1916,UA,576,2025-08-13,ORD,LAX,2025-08-13 15:35:00+00:00,2025-08-13T18:06:00Z,2025-08-13 15:42:00+00:00,2025-08-13T17:44:00Z,200,A321-2NX,Mainline,72,106,62,7.0
1085,OO,5474,2025-08-09,ORD,MBS,2025-08-09 18:35:00+00:00,2025-08-09T21:02:00Z,2025-08-09 18:42:00+00:00,2025-08-09T20:52:00Z,50,CRJ-550,Express,86,30,29,7.0
8085,UA,2113,2025-08-05,ORD,FSD,2025-08-05 20:08:00+00:00,2025-08-05T22:00:00Z,2025-08-05 21:07:00+00:00,2025-08-05T22:34:00Z,179,B737-900,Mainline,13,54,56,59.0
7026,UA,2113,2025-08-08,ORD,FSD,2025-08-08 20:30:00+00:00,2025-08-08T22:22:00Z,2025-08-08 21:27:00+00:00,2025-08-08T23:01:00Z,179,B737-900,Mainline,444,397,56,57.0
1179,UA,2232,2025-08-10,ORD,MSP,2025-08-10 07:15:00+00:00,2025-08-10T08:54:00Z,2025-08-10 07:19:00+00:00,2025-08-10T08:59:00Z,179,B737-MAX9,Mainline,75,54,62,4.0
6326,UA,1269,2025-08-06,ORD,CLE,2025-08-06 18:00:00+00:00,2025-08-06T20:28:00Z,2025-08-06 20:48:00+00:00,2025-08-07T00:02:00Z,179,B737-900,Mainline,50,194,64,168.0


In [99]:
# Debug: Check if the conversion is working
print("Before conversion:")
print(pnr_flight_level[['is_child']].head(20))

# Try the conversion
pnr_flight_level['is_child_numeric'] = (pnr_flight_level['is_child'] == 'Y').astype(int)

print("\nAfter conversion:")
print(pnr_flight_level[['is_child', 'is_child_numeric']].head(20))

print("\nSum of is_child_numeric:")
print(pnr_flight_level['is_child_numeric'].sum())

print("\nValue counts of is_child_numeric:")
print(pnr_flight_level['is_child_numeric'].value_counts())

Before conversion:
   is_child
0         N
1         N
2         Y
3         N
4         Y
5         N
6         N
7         N
8         N
9         N
10        N
11        N
12        N
13        N
14        N
15        Y
16        N
17        N
18        N
19        N

After conversion:
   is_child  is_child_numeric
0         N                 0
1         N                 0
2         Y                 1
3         N                 0
4         Y                 1
5         N                 0
6         N                 0
7         N                 0
8         N                 0
9         N                 0
10        N                 0
11        N                 0
12        N                 0
13        N                 0
14        N                 0
15        Y                 1
16        N                 0
17        N                 0
18        N                 0
19        N                 0

Sum of is_child_numeric:
41739

Value counts of is_child_numeric:
is_child_nume

In [96]:
pnr_flight_level.head()

Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,record_locator,pnr_creation_date,total_pax,is_child,basic_economy_ind,is_stroller_user,lap_child_count
0,UA,2494,2025-08-04,ORD,MCI,PNR_520583,2025-07-07,1,N,0,N,0
1,UA,2483,2025-08-06,ORD,IAH,PNR_296107,2025-03-28,1,N,0,N,0
2,UA,1620,2025-08-01,ORD,SEA,PNR_296108,2025-06-30,4,Y,0,N,0
3,UA,1620,2025-08-01,ORD,SEA,PNR_296108,2025-06-30,4,N,0,N,1
4,UA,1620,2025-08-01,ORD,SEA,PNR_296108,2025-06-30,4,Y,0,Y,0
