In [1]:
!pip install xgboost
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 330.3 kB/s eta 0:06:19
   ---------------------------------------- 0.1/124.9 MB 365.7 kB/s eta 0:05:42
   ---------------------------------------- 0.1/124.9 MB 774.0 kB/s eta 0:02:42
   ---------------------------------------- 0.2/124.9 MB 845.5 kB/s eta 0:02:28
   ---------------------------------------- 0.3/124.9 MB 1.2 MB/s eta 0:01:41
   ---------------------------------------- 0.4/124.9 MB 1.3 MB/s eta 0:01:35
   ---------------------------------------- 0.7/124.9 MB 1.8 MB/s eta 0:01:09
   ---------------------------------------- 0.9/124.9 MB 2.1 MB/s eta 0:00:59
   ----

In [8]:
# Read the aviation KPI export, then preview its first rows and column labels
data = pd.read_csv('Aviation_KPIs_Dataset.xlsx - Sheet1.csv')
for preview in (data.head(), data.columns):
    print(preview)

  Flight Number Scheduled Departure Time Actual Departure Time  \
0         FL885      2024-05-20 11:51:21   2024-07-10 02:38:54   
1         FL930      2024-01-23 06:56:23   2024-07-07 02:53:44   
2         FL478      2024-05-30 09:18:39   2024-12-05 01:00:54   
3         FL637      2024-08-15 05:21:47   2024-09-21 13:23:42   
4         FL318      2024-07-25 15:29:58   2024-03-21 15:05:54   

   Delay (Minutes)  Aircraft Utilization (Hours/Day)  \
0               50                             12.36   
1               27                             14.53   
2              108                             10.73   
3               64                             15.10   
4               30                             13.46   

   Turnaround Time (Minutes)  Load Factor (%)  Fleet Availability (%)  \
0                        115            79.18                   96.24   
1                         83            98.59                   80.49   
2                         87            67.44  

In [9]:
# Drop the identifier and timestamp columns before the modelling steps.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the columns
# have already been removed.
data = data.drop(
    columns=['Flight Number', 'Scheduled Departure Time', 'Actual Departure Time'],
    errors='ignore',
)

In [None]:
# Fill missing values with each column's median.
# numeric_only=True restricts the medians to numeric columns, so a remaining
# text column does not make median() raise; such columns are left unfilled.
# NOTE(review): the medians are computed on the full dataset before the
# train/test split, so test rows influence the imputation — confirm this is
# acceptable for the analysis.
data = data.fillna(data.median(numeric_only=True))

In [None]:
# Separate the prediction target from the explanatory columns
target = data['Profit (USD)']
features = data.drop('Profit (USD)', axis='columns')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=43,
)

In [None]:
# Train Random Forest model.
# A fixed random_state makes the forest's randomised training repeatable, so
# the metrics reported for this model do not change between runs.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)

In [None]:
# Configure the XGBoost regressor; the hyperparameters are collected in one
# mapping and unpacked into the constructor.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'eval_metric': 'rmse',
    'random_state': 42,
}
xgb_regressor = XGBRegressor(**xgb_params)

In [None]:
# Fit the booster on the training split; the held-out split is supplied as
# eval_set so it is scored with the configured eval_metric during training.
xgb_regressor.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
)

In [None]:
# Score the held-out rows with both fitted models
rf_predictions, xgb_predictions = (
    fitted.predict(X_test) for fitted in (random_forest, xgb_regressor)
)


In [None]:
# Models to evaluate, keyed by a human-readable label
models = {
    "Random Forest": random_forest,
    "XGBoost": xgb_regressor,
}

In [None]:
# Largest train-minus-test R2 gap still reported as balanced. It equals the gap
# implied by the 0.90 / 0.75 overfitting thresholds used below.
MAX_R2_GAP = 0.15


def _regression_metrics(y_true, y_pred):
    """Return (MAE, RMSE, R2) for one set of predictions against the true values."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2


def _fit_verdict(train_r2, test_r2):
    """Return the diagnostic sentence ending for a model given its train/test R2.

    The first two rules are the original thresholds. The third rule catches
    models whose training score exceeds the test score by more than MAX_R2_GAP
    but which fall outside those thresholds (for example train 0.85 / test 0.30),
    which would otherwise be reported as well-balanced.
    """
    if train_r2 > 0.90 and test_r2 < 0.75:
        return "might be overfitting. Consider tuning hyperparameters."
    if train_r2 < 0.60 and test_r2 < 0.60:
        return "might be underfitting. Consider increasing model complexity."
    if train_r2 - test_r2 > MAX_R2_GAP:
        return "might be overfitting. Consider tuning hyperparameters."
    return "appears well-balanced."


# Report train and test metrics for every model, followed by a fit diagnosis
for model_name, model in models.items():
    train_mae, train_rmse, train_r2 = _regression_metrics(y_train, model.predict(X_train))
    test_mae, test_rmse, test_r2 = _regression_metrics(y_test, model.predict(X_test))

    print(f"\n{model_name} Performance:")
    print(f"Training -> MAE: {train_mae:.2f}, RMSE: {train_rmse:.2f}, R2: {train_r2:.4f}")
    print(f"Testing  -> MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, R2: {test_r2:.4f}")

    print(f" {model_name} {_fit_verdict(train_r2, test_r2)}")

In [None]:
# Collect the XGBoost test-set predictions in a DataFrame.
# The model is trained on 'Profit (USD)', so the column is labelled as profit
# rather than revenue.
# NOTE(review): this cell only builds the frame; no file is written here.
predicted_profits = pd.DataFrame({'Predicted Profit (USD)': xgb_predictions})