<a href="https://colab.research.google.com/github/Rutviag/AI-ML-IISc-project---Group-11/blob/Work_Rutvi/Model_performance_Comparision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Step 2: Install necessary libraries in Colab
!pip install xgboost catboost joblib

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [5]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import xgboost as xgb
import catboost as catb

In [6]:
# Step 3: Upload dataset
from google.colab import files
uploaded = files.upload()
file_path = list(uploaded.keys())[0]  # Automatically gets the uploaded file name
data = pd.read_csv(file_path)

Saving Data 1 - New York 2023.csv to Data 1 - New York 2023 (1).csv


In [7]:
# Step 4: Explore the dataset
print("Dataset Preview:")
print(data.head())  # Display the first few rows
print("\nDataset Info:")
print(data.info())  # Check data types and missing values
print("\nSummary Statistics:")
print(data.describe())  # Summary statistics for numerical columns

Dataset Preview:
   YEAR OP_UNIQUE_CARRIER TAIL_NUM  OP_CARRIER_FL_NUM  ORIGIN_AIRPORT_ID  \
0  2023                9E   N131EV               4642              14492   
1  2023                9E   N131EV               4647              11057   
2  2023                9E   N131EV               4658              12953   
3  2023                9E   N131EV               4660              12478   
4  2023                9E   N131EV               4670              12953   

     ORIGIN_CITY_NAME  DEST_AIRPORT_ID   DEST_CITY_NAME  CRS_DEP_TIME  \
0  Raleigh/Durham, NC            12478     New York, NY          2020   
1       Charlotte, NC            12953     New York, NY          1356   
2        New York, NY            11193   Cincinnati, OH           835   
3        New York, NY            13487  Minneapolis, MN           800   
4        New York, NY            13342    Milwaukee, WI          1900   

   DEP_TIME  DEP_DELAY  DEP_DELAY_NEW  AIR_TIME  FLIGHTS  DISTANCE  \
0    2032.0      

In [8]:
# Step 5: Data Cleaning and Feature Engineering
# Drop rows with missing target values
data = data.dropna(subset=['DEP_DELAY_NEW', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY'])

# Extract useful features from CRS_DEP_TIME (e.g., hour of day)
data['CRS_DEP_HOUR'] = data['CRS_DEP_TIME'] // 60  # Extract hour
data['CRS_DEP_MINUTES'] = data['CRS_DEP_TIME'] % 60  # Extract minutes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CRS_DEP_HOUR'] = data['CRS_DEP_TIME'] // 60  # Extract hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['CRS_DEP_MINUTES'] = data['CRS_DEP_TIME'] % 60  # Extract minutes


In [9]:
# Step 6: Define Features (X) and Target (y)
X = data[['OP_UNIQUE_CARRIER', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME',
          'CRS_DEP_HOUR', 'CRS_DEP_MINUTES', 'DISTANCE', 'CARRIER_DELAY',
          'WEATHER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY']]
y = data['DEP_DELAY_NEW']

# Encode target variable for Naive Bayes (if needed)
le = LabelEncoder()
y_nb = le.fit_transform(y.astype(int))

In [10]:
# Step 7: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Step 8: Build the preprocessing pipeline
categorical_features = ['OP_UNIQUE_CARRIER', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME']
numerical_features = ['CRS_DEP_HOUR', 'CRS_DEP_MINUTES', 'DISTANCE',
                      'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
                      'LATE_AIRCRAFT_DELAY']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [12]:
# Step 9: Train and Evaluate Models
def train_and_evaluate_model(model, model_name):
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{model_name} Results:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")

    return pipeline, mae, mse, r2

# Random Forest
rf_pipeline, rf_mae, rf_mse, rf_r2 = train_and_evaluate_model(
    RandomForestRegressor(n_estimators=100, random_state=42), "Random Forest")

# Decision Tree
dt_pipeline, dt_mae, dt_mse, dt_r2 = train_and_evaluate_model(
    DecisionTreeRegressor(random_state=42), "Decision Tree")

# XGBoost
xg_pipeline, xg_mae, xg_mse, xg_r2 = train_and_evaluate_model(
    xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100,
                     learning_rate=0.1, max_depth=6, subsample=0.8,
                     colsample_bytree=0.8, random_state=42), "XGBoost")

# CatBoost
catb_pipeline, ct_mae, ct_mse, ct_r2 = train_and_evaluate_model(
    catb.CatBoostRegressor(iterations=200, learning_rate=0.05, depth=10,
                           verbose=0), "CatBoost")


Random Forest Results:
Mean Absolute Error (MAE): 9.84
Mean Squared Error (MSE): 470.90
R2 Score: 0.95

Decision Tree Results:
Mean Absolute Error (MAE): 12.14
Mean Squared Error (MSE): 703.78
R2 Score: 0.93

XGBoost Results:
Mean Absolute Error (MAE): 10.49
Mean Squared Error (MSE): 580.78
R2 Score: 0.94

CatBoost Results:
Mean Absolute Error (MAE): 10.86
Mean Squared Error (MSE): 493.23
R2 Score: 0.95


In [13]:
# Step 10: Compare Model Performance
model_comparison = pd.DataFrame({
    'Model': ['Random Forest', 'Decision Tree', 'XGBoost', 'CatBoost'],
    'MAE': [rf_mae, dt_mae, xg_mae, ct_mae],
    'MSE': [rf_mse, dt_mse, xg_mse, ct_mse],
    'R2 Score': [rf_r2, dt_r2, xg_r2, ct_r2]
})

print("\nModel Performance Comparison:")
print(model_comparison)


Model Performance Comparison:
           Model        MAE         MSE  R2 Score
0  Random Forest   9.841096  470.898972  0.951306
1  Decision Tree  12.135398  703.775310  0.927225
2        XGBoost  10.488643  580.779739  0.939944
3       CatBoost  10.862682  493.228329  0.948997


In [14]:
# Step 11: Save the Best Model
if rf_mae < dt_mae and rf_mae < xg_mae and rf_mae < ct_mae:
    best_pipeline = rf_pipeline
    best_model_name = "Random Forest"
elif dt_mae < rf_mae and dt_mae < xg_mae and dt_mae < ct_mae:
    best_pipeline = dt_pipeline
    best_model_name = "Decision Tree"
elif xg_mae < rf_mae and xg_mae < dt_mae and xg_mae < ct_mae:
    best_pipeline = xg_pipeline
    best_model_name = "XGBoost"
else:
    best_pipeline = catb_pipeline
    best_model_name = "CatBoost"

joblib.dump(best_pipeline, '/content/best_flight_delay_model.pkl')
print(f"The best model is {best_model_name}. Saved as '/content/best_flight_delay_model.pkl'.")

The best model is Random Forest. Saved as '/content/best_flight_delay_model.pkl'.


In [15]:
# Step 12: Predict delays for new flight data
new_flight = pd.DataFrame({
    'OP_UNIQUE_CARRIER': ['AA'],
    'ORIGIN_CITY_NAME': ['New York, NY'],
    'DEST_CITY_NAME': ['Los Angeles, CA'],
    'CRS_DEP_HOUR': [10],
    'CRS_DEP_MINUTES': [30],
    'DISTANCE': [2475],
    'CARRIER_DELAY': [0],
    'WEATHER_DELAY': [0],
    'NAS_DELAY': [0],
    'LATE_AIRCRAFT_DELAY': [0]
})

predicted_delay = best_pipeline.predict(new_flight)
print(f"Predicted Delay for the flight: {predicted_delay[0]:.2f} minutes")


Predicted Delay for the flight: 43.35 minutes


In [16]:
# Step 13: Download the Model
from google.colab import files
files.download('/content/best_flight_delay_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>