# Flight Delay Prediction - XGBoost Model Training

This notebook trains an XGBoost regressor to predict flight delays and exports the model for production use.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn grid theme and default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Confirmation that the import cell ran to completion
print("✓ All libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read the flight-delay CSV into a DataFrame
df = pd.read_csv('synth_data/data/flight_delays.csv')

# Report the table dimensions, then a separator line
n_rows, n_cols = df.shape
print(f"Dataset Shape: {n_rows} rows, {n_cols} columns")
print("\n" + "=" * 80)

# Last expression of the cell, so the notebook renders the first rows
df.head()

In [None]:
# Print a header and separator, then the per-column dtype / non-null report
print("Column Names and Types:", "=" * 80, sep="\n")
df.info()

In [None]:
# Print a header and separator, then show the statistical summary
print("Dataset Statistics:", "=" * 80, sep="\n")

# Last expression of the cell, so the notebook renders the summary table
df.describe()

In [None]:
# Count missing values per column and show only the columns that have any
print("Missing Values:", "=" * 80, sep="\n")

missing = df.isna().sum()
has_gaps = missing > 0
missing.loc[has_gaps]

## 3. Data Preprocessing

In [None]:
# Work on a copy so the raw DataFrame loaded earlier stays untouched
df_processed = df.copy()

# Identify the target variable. Preference order:
#   1. the first column whose name contains both 'delay' and 'minute'
#      (e.g. delay_minutes),
#   2. otherwise the first column whose name contains 'delay'.
delay_cols = [col for col in df.columns if 'delay' in col.lower()]
target_col = next(
    (col for col in delay_cols if 'minute' in col.lower()),
    delay_cols[0] if delay_cols else None,
)

if target_col is None:
    print("ERROR: Could not find target column with 'delay' in name.")
    print("Available columns:", df.columns.tolist())
    print("\nPlease specify the target column name manually.")
    # Every later cell needs X and y; stop here with a clear error instead of
    # letting a later cell fail with a NameError.
    raise ValueError("No column containing 'delay' found; cannot build X and y.")

print(f"✓ Target Variable Identified: {target_col}")

# Rows without a target value cannot be used for supervised training, and the
# imputation step only fills feature columns, so NaN targets would otherwise
# reach model.fit(). Drop them here, before X and y are separated.
n_missing_target = int(df_processed[target_col].isna().sum())
if n_missing_target:
    df_processed = df_processed.dropna(subset=[target_col])
    print(f"  ✓ Dropped {n_missing_target} rows with missing {target_col}")

# Separate features and target
y = df_processed[target_col]
X = df_processed.drop(columns=[target_col])

print(f"✓ Features: {X.shape[1]} columns")
print(f"✓ Target: {target_col} (mean: {y.mean():.2f}, std: {y.std():.2f})")

In [None]:
# Handle missing values in the feature matrix X.
# Defines numerical_cols and categorical_cols, which later cells reuse.
print("Handling missing values...")

# Numerical columns: impute with the column median
numerical_cols = X.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if X[col].isnull().any():
        # Assign the result back instead of calling fillna(inplace=True) on
        # X[col]: the chained in-place form updates a temporary object under
        # pandas Copy-on-Write and leaves X unchanged. Warnings are silenced
        # globally in this notebook, so that failure would go unnoticed.
        X[col] = X[col].fillna(X[col].median())
        print(f"  ✓ Filled {col} with median")

# Categorical (object-dtype) columns: impute with the most frequent value
categorical_cols = X.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if X[col].isnull().any():
        modes = X[col].mode()
        if modes.empty:
            # A column with no non-missing values has no mode; indexing
            # mode()[0] would raise, so leave the column as-is.
            print(f"  ! Skipped {col}: no non-missing values to compute a mode")
            continue
        X[col] = X[col].fillna(modes.iloc[0])
        print(f"  ✓ Filled {col} with mode")

print(f"\n✓ Missing values handled!")

In [None]:
# Turn each categorical column into integer codes, keeping one fitted encoder
# per column so the same mapping can be reused later.
print(f"Encoding {len(categorical_cols)} categorical columns...")

label_encoders = {}
for cat_name in categorical_cols:
    encoder = LabelEncoder()
    # Values are cast to str before fitting, so every entry is encoded as text
    as_text = X[cat_name].astype(str)
    X[cat_name] = encoder.fit_transform(as_text)
    label_encoders[cat_name] = encoder
    print(f"  ✓ Encoded {cat_name}")

feature_list = X.columns.tolist()
print(f"\n✓ Final feature set: {X.shape[1]} columns")
print("\nFeature columns:", feature_list)

## 4. Train-Test Split and Feature Scaling

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"Training set: {n_train} samples ({n_train/len(X)*100:.1f}%)")
print(f"Testing set: {n_test} samples ({n_test/len(X)*100:.1f}%)")

# Standardize features: the scaler is fitted on the training rows only and
# then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✓ Data split and scaled successfully!")

## 5. Model Training (XGBoost)

In [None]:
print("Training XGBoost Model...")

# Fit on the scaled training matrix so the model matches the scaler that is
# saved alongside it.
model = XGBRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train_scaled, y_train)

# Predict on the held-out (scaled) test rows
y_pred = model.predict(X_test_scaled)

# Evaluation metrics on the test set; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n✓ XGBoost Results:")
for metric_label, metric_value in (("R² Score", r2), ("MAE", mae), ("RMSE", rmse)):
    print(f"  {metric_label}: {metric_value:.4f}")

## 6. Visualization

In [None]:
# Scatter of actual vs. predicted values with a dashed y = x reference line
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5, s=10)

# The reference diagonal spans the range of the actual values
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'r--', lw=2)

ax.set_xlabel('Actual Delay (minutes)')
ax.set_ylabel('Predicted Delay (minutes)')
ax.set_title(
    f'Actual vs Predicted - XGBoost (R²: {r2:.4f})',
    fontsize=14,
    fontweight='bold',
)
ax.grid(True, alpha=0.3)

# Write the figure to disk before displaying it
fig.savefig('xgboost_actual_vs_predicted.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved to: xgboost_actual_vs_predicted.png")

## 7. Save Model, Scaler, and Encoders

In [None]:
# Save the XGBoost model and artifacts
model_filename = 'xgboost_flight_delay_model.pkl'

with open(model_filename, 'wb') as f:
    pickle.dump({
        'model': model,
        'scaler': scaler,
        'label_encoders': label_encoders,
        'feature_names': X.columns.tolist(),
        'model_name': 'XGBoost',
        'r2_score': r2,
        'rmse': rmse
    }, f)

print(f"✓ Model and Scaler saved to: {model_filename}")
print("\n" + "="*80)
print("TRAINING COMPLETE!")
print("="*80)
print(f"\nSaved artifacts include:")
print("- XGBRegressor Model")
print("- StandardScaler")
print("- LabelEncoders")
print("- Feature Names")