In [None]:
import os
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Confirm the imports succeeded and record which XGBoost build is in use
print('âœ… Libraries loaded', f'XGBoost version: {xgb.__version__}', sep='\n')

## 1. Data Loading and Feature Engineering

In [None]:
# Read the CSV at data_path (a path relative to the working directory) into a frame
data_path = 'data/all_domestic_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Report the row count, then show the first rows as the cell's output
print(f'ðŸ“Š Data loaded: {df.shape[0]:,}')
df.head(5)

In [None]:
# Feature Engineering
def create_features(df):
    """Derive calendar and time-of-day features from the raw flight columns.

    The input frame is copied first, so the caller's object is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fl_date`` (anything ``pd.to_datetime`` accepts) and
        ``crs_dep_time`` (numeric, interpreted as HHMM; missing values allowed).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``fl_date`` converted to datetime and these columns
        added: ``hour``, ``month``, ``day_of_week`` (Monday=0),
        ``day_of_month``, ``is_weekend``, ``time_period``, ``is_peak_season``.
    """
    df = df.copy()
    df['fl_date'] = pd.to_datetime(df['fl_date'])

    # Time features.
    # Hour = the leading two digits of HHMM, i.e. integer division by 100.
    # Done as one vectorised operation instead of a per-row Python lambda;
    # missing departure times fall back to hour 0. Values outside 0-2400 are
    # not validated here.
    df['hour'] = df['crs_dep_time'].fillna(0).astype('int64') // 100
    df['month'] = df['fl_date'].dt.month
    df['day_of_week'] = df['fl_date'].dt.dayofweek
    df['day_of_month'] = df['fl_date'].dt.day
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Time slot (category). Bins are right-closed, so:
    # night = 0-6, morning = 7-12, afternoon = 13-18, evening = 19-24.
    df['time_period'] = pd.cut(
        df['hour'],
        bins=[-1, 6, 12, 18, 24],
        labels=['night', 'morning', 'afternoon', 'evening'],
    )

    # Peak-season flag: June, July, August and December
    df['is_peak_season'] = df['month'].isin([6, 7, 8, 12]).astype(int)

    return df

# Run the feature pipeline and rebind df to the enriched copy it returns
df = df.pipe(create_features)
print('âœ… Feature engineering complete')

## 2. Add Historical Delay Statistics by Airline/Airport (Key)

In [None]:
# NOTE(review): every statistic below is computed over the full frame, before the
# train/val/test split performed later in the notebook. Held-out rows therefore
# contribute their own dep_delay to these features (target leakage) — consider
# fitting these lookups on the training rows only.

# For each grouping key, build a {key value -> mean dep_delay} lookup and attach it
# to every row as a new feature column.
delay_stats = {}
for key_col, feature_col in [
    ('op_unique_carrier', 'carrier_avg_delay'),  # average delay by airline
    ('origin', 'origin_avg_delay'),              # average delay by origin
    ('hour', 'hour_avg_delay'),                  # average delay by departure hour
    ('day_of_week', 'dow_avg_delay'),            # average delay by day of week
]:
    lookup = df.groupby(key_col)['dep_delay'].mean().to_dict()
    df[feature_col] = df[key_col].map(lookup)
    delay_stats[key_col] = lookup

# Keep the individual lookups under their own names; they are saved with the model later
carrier_delay = delay_stats['op_unique_carrier']
origin_delay = delay_stats['origin']
hour_delay = delay_stats['hour']
dow_delay = delay_stats['day_of_week']

print('âœ… Added historical-statistics features')
print('\nAverage delay by airline (Top 5):')
per_airline = df.groupby('airline_name')['carrier_avg_delay'].first()
print(per_airline.sort_values(ascending=False).head(5))

## 3. Data Preparation

In [None]:
# Column the model predicts
target = 'dep_delay'

# Model inputs; this order is also the column order of the feature matrix
feature_columns = (
    ['op_unique_carrier', 'origin', 'dest']
    + ['hour', 'month', 'day_of_week', 'day_of_month', 'is_weekend']
    + ['time_period', 'is_peak_season']
    # Statistical features (key)
    + ['carrier_avg_delay', 'origin_avg_delay', 'hour_avg_delay', 'dow_avg_delay']
)

# Keep only the modelling columns and discard any row with a missing value in them
model_columns = feature_columns + [target]
df_clean = df[model_columns].dropna(axis=0, how='any')
print(f'âœ… Training data: {len(df_clean):,}')

In [None]:
# Encode categorical variables as integer codes.
#
# Each encoder is fitted on the raw values held in `df`, restricted to the rows
# that survived into `df_clean`. Reading from `df` keeps this cell idempotent:
# fitting on `df_clean[col]` would, on a second run, fit on already-encoded
# integers, and the encoders saved with the model would no longer map the
# original labels.
# Assumes `df` has a unique index and `df_clean` keeps its row labels (both hold
# for a frame loaded with the default index and filtered with dropna).
label_encoders = {}
categorical_cols = ['op_unique_carrier', 'origin', 'dest', 'time_period']

encoded_columns = {}
for col in categorical_cols:
    le = LabelEncoder()
    encoded_columns[col] = le.fit_transform(df.loc[df_clean.index, col])
    label_encoders[col] = le

# assign() returns a new frame rather than writing into one derived from a slice
# of `df`, which avoids pandas' SettingWithCopyWarning.
df_clean = df_clean.assign(**encoded_columns)

print('âœ… Label encoding complete')

In [None]:
# Train/Val/Test split (roughly 80/10/10)
X = df_clean[feature_columns].to_numpy()
y = df_clean[target].to_numpy()

# First hold out 10% of all rows as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# ... then take 11.1% of the remaining 90% (about 10% of all rows) for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.111, random_state=42
)

split_sizes = {'Train': len(X_train), 'Val': len(X_val), 'Test': len(X_test)}
print(' | '.join(f'{name}: {count:,}' for name, count in split_sizes.items()))

## 4. Train XGBoost Model

In [None]:
# XGBoost model
model = xgb.XGBRegressor(
    n_estimators=500,        # upper bound on boosting rounds; early stopping may use fewer
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    # Stop once the validation metric has not improved for 50 consecutive rounds.
    # Without this setting, passing eval_set to fit() only logs the metric and all
    # n_estimators rounds are trained. It is set on the estimator (supported from
    # XGBoost 1.6) because the fit() keyword of the same name is deprecated.
    early_stopping_rounds=50,
)

print('ðŸš€ Starting XGBoost training...')

# Train with early stopping: the validation split in eval_set is the data that
# early_stopping_rounds monitors; verbose=50 logs the metric every 50 rounds.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=50
)

print('\nâœ… Training complete!')

## 5. Performance Evaluation

In [None]:
# Prediction
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)


def _report_split(header, y_true, y_pred):
    """Print MAE, RMSE and R-squared for one split and return them as (mae, rmse, r2)."""
    print(header)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f' MAE: {mae:.2f} min')
    print(f' RMSE: {rmse:.2f} min')
    print(f' RÂ²: {r2:.4f}')
    return mae, rmse, r2


# Evaluation: same three metrics for each split; only the test metrics are kept
print('ðŸ“Š Performance evaluation:\n')
_report_split('Train:', y_train, y_pred_train)
_report_split('\nValidation:', y_val, y_pred_val)
test_mae, test_rmse, test_r2 = _report_split('\nTest:', y_test, y_pred_test)

# Put the test MAE next to the target's mean and spread on the test split
print(f'\nâœ… Predicted with average error of {test_mae:.1f} min')
print(f' (Reference: mean={y_test.mean():.1f} min, std={y_test.std():.1f} min)')

## 6. Feature Importance

In [None]:
# Pair each feature name with the fitted model's importance score, highest first
importance_df = (
    pd.DataFrame({'feature': feature_columns, 'importance': model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of (at most) the 15 highest-ranked features
top_features = importance_df.iloc[:15]
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Feature Importance')
ax.invert_yaxis()  # put the most important feature at the top
fig.tight_layout()
plt.show()

print('\nðŸ“Š Most important feature:')
print(importance_df.head(10))

## 7. Prediction Analysis

In [None]:
# Two side-by-side panels: actual vs predicted (left) and the error histogram (right)
fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(12, 5))

# Left: points on the dashed diagonal are exact predictions
ax_scatter.scatter(y_test, y_pred_test, alpha=0.3, s=10)
delay_lo, delay_hi = y_test.min(), y_test.max()
ax_scatter.plot([delay_lo, delay_hi], [delay_lo, delay_hi], 'r--', lw=2)
ax_scatter.set_xlabel('Actual Delay (min)')
ax_scatter.set_ylabel('Predicted Delay (min)')
ax_scatter.set_title('Actual vs Predicted')
ax_scatter.grid(True, alpha=0.3)

# Right: error = predicted - actual, so positive values are over-predictions
errors = y_pred_test - y_test
ax_hist.hist(errors, bins=50, edgecolor='black')
ax_hist.set_xlabel('Error (min)')
ax_hist.set_ylabel('Count')
ax_hist.set_title('Error Distribution')
ax_hist.axvline(0, color='red', linestyle='--', lw=2)
ax_hist.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print(f'Mean error: {errors.mean():.2f} min')
print(f'Error std dev: {errors.std():.2f} min')

## 8. Save Model

In [None]:
# Model package: everything needed to reproduce the feature pipeline at inference
# time — the fitted model, the per-column label encoders, the feature order, the
# delay lookup tables used to build the *_avg_delay features, and the test metrics.
model_package = {
    'model': model,
    'label_encoders': label_encoders,
    'feature_columns': feature_columns,
    'statistics': {
        'carrier_delay': carrier_delay,
        'origin_delay': origin_delay,
        'hour_delay': hour_delay,
        'dow_delay': dow_delay
    },
    'test_metrics': {
        'mae': test_mae,
        'rmse': test_rmse,
        'r2': test_r2
    }
}

# Save
output_path = 'models/xgboost_predictor.pkl'

# Create the target directory first; open(..., 'wb') raises FileNotFoundError
# when the parent directory does not exist (e.g. on a fresh checkout).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# NOTE: the file is a pickle; only load it back from a trusted location.
with open(output_path, 'wb') as f:
    pickle.dump(model_package, f)

print(f'âœ… Model saved: {output_path}')
print(f'\nFinal performance: MAE {test_mae:.2f} min, RÂ² {test_r2:.4f}')

## ✅ Complete!

Why XGBoost can outperform deep learning:
1. **Use historical statistics**: Average delays by airline/airport -> strong predictive features
2. **Interpretable**: Feature importance shows which factors matter
3. **Fast training**: Trains quickly even without a GPU
4. **Proven performance**: Validated in Kaggle and industry