In [21]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import joblib
import xgboost as xgb

# Step 2: Load the dataset
# Replace the file path with your dataset location
file_path = '/Users/ptkumar/Downloads/Data 1 - New York 2023.csv'  # Adjust path if necessary
data = pd.read_csv(file_path)

# Step 3: Explore the dataset
print("Dataset Preview:")
print(data.head())  # Display the first few rows
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" after the report).
data.info()  # Check data types and missing values
print("\nSummary Statistics:")
print(data.describe())  # Summary statistics for numerical columns

# Step 4: Data Cleaning and Feature Engineering
# Rows without a target value cannot be used for training, so drop them, then
# derive the scheduled hour and minutes from CRS_DEP_TIME (integer division
# and remainder by 100 -- presumably the column is encoded as HHMM).
data = data.dropna(subset=['DEP_DELAY_NEW']).assign(
    CRS_DEP_HOUR=lambda frame: frame['CRS_DEP_TIME'] // 100,
    CRS_DEP_MINUTES=lambda frame: frame['CRS_DEP_TIME'] % 100,
)

# Step 5: Define Features (X) and Target (y)
X = data[[
    'OP_UNIQUE_CARRIER',
    'ORIGIN_CITY_NAME',
    'DEST_CITY_NAME',
    'CRS_DEP_HOUR',
    'CRS_DEP_MINUTES',
    'DISTANCE',
]]
y = data['DEP_DELAY_NEW']

# Integer-coded copy of the target intended for Naive Bayes: the delays are
# truncated to int and then mapped to class labels by the encoder.
le = LabelEncoder()
y_nb = le.fit_transform(y.astype(int))

# Step 6: Split the data into training and testing sets
# Report the size of the full feature matrix and target before splitting.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 7: Build the preprocessing pipeline
numerical_features = ['CRS_DEP_HOUR', 'CRS_DEP_MINUTES', 'DISTANCE']
categorical_features = ['OP_UNIQUE_CARRIER', 'ORIGIN_CITY_NAME', 'DEST_CITY_NAME']

# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at prediction time.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Step 8: Train and Evaluate Models
def train_and_evaluate_model(model, model_name):
    """Fit a preprocessing + model pipeline and report its test-set metrics.

    Relies on the module-level ``preprocessor``, ``X_train``, ``X_test``,
    ``y_train`` and ``y_test`` defined earlier in the script.

    Args:
        model: Estimator used as the final pipeline step; it is fitted by
            this call.
        model_name: Label used in the printed report.

    Returns:
        Tuple ``(pipeline, mae, mse, r2)``: the fitted pipeline followed by
        its mean absolute error, mean squared error and R2 score computed on
        the test split.
    """
    # Imported locally so the function works without editing the file's
    # import section; scikit-learn is already a dependency of this script.
    from sklearn.base import clone

    # Pipeline does not copy its steps. Without clone(), every pipeline
    # returned by this function would embed the very same preprocessor
    # object, so refitting one pipeline would silently change the others.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{model_name} Results:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")

    return pipeline, mae, mse, r2

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pipeline, rf_mae, rf_mse, rf_r2 = train_and_evaluate_model(rf_model, "Random Forest")

# Decision Tree
dt_model = DecisionTreeRegressor(random_state=42)
dt_pipeline, dt_mae, dt_mse, dt_r2 = train_and_evaluate_model(dt_model, "Decision Tree")

# XGBoost
xg_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xg_pipeline, xg_mae, xg_mse, xg_r2 = train_and_evaluate_model(xg_model, "XGBoost")


# Step 9: Compare Model Performance
# One row per trained model, holding the metrics returned by
# train_and_evaluate_model for that model.
model_comparison = pd.DataFrame(
    [
        ('Random Forest', rf_mae, rf_mse, rf_r2),
        ('Decision Tree', dt_mae, dt_mse, dt_r2),
        ('XGBoost', xg_mae, xg_mse, xg_r2),
    ],
    columns=['Model', 'MAE', 'MSE', 'R2 Score'],
)

print("\nModel Performance Comparison:")
print(model_comparison)

# Step 10: Save the Best Model
# Pick the pipeline with the lowest MAE. min() returns the first entry among
# equal minima, so a tie between Random Forest and Decision Tree can no longer
# fall through to a worse-scoring XGBoost (as the previous strict "<" chain
# did). XGBoost is listed first so that ties involving it still resolve to
# XGBoost, matching the earlier fallback behaviour.
candidates = [
    ("XGBoost", xg_pipeline, xg_mae),
    ("Random Forest", rf_pipeline, rf_mae),
    ("Decision Tree", dt_pipeline, dt_mae),
]
best_model_name, best_pipeline, best_mae = min(candidates, key=lambda entry: entry[2])

print(f"The best model is {best_model_name} with an MAE of {best_mae:.4f}")

# Persist the whole fitted pipeline (preprocessing + model) so it can be
# reloaded later and applied to raw feature columns.
joblib.dump(best_pipeline, '/Users/ptkumar/Downloads/best_flight_delay_model.pkl')
print("Best model saved as 'best_flight_delay_model.pkl'.")

# Step 11: Predict delays for new flight data
# A single flight described with the same feature columns used for training.
new_flight = pd.DataFrame([
    {
        'OP_UNIQUE_CARRIER': 'AA',            # Airline code (e.g., 'AA' for American Airlines)
        'ORIGIN_CITY_NAME': 'New York, NY',   # Origin city
        'DEST_CITY_NAME': 'Los Angeles, CA',  # Destination city
        'CRS_DEP_HOUR': 10,                   # Scheduled departure hour (e.g., 10 AM)
        'CRS_DEP_MINUTES': 30,                # Scheduled departure minutes (e.g., 10:30 AM)
        'DISTANCE': 2475,                     # Flight distance in miles
    },
])

# Predict the delay
predicted_delay = best_pipeline.predict(new_flight)
print(f"Predicted Delay for the flight: {predicted_delay[0]:.2f} minutes")

# Step 12: Predict for multiple flights
# One record per flight; keys are the training feature columns.
new_flights = pd.DataFrame([
    {
        'OP_UNIQUE_CARRIER': 'AA',
        'ORIGIN_CITY_NAME': 'New York, NY',
        'DEST_CITY_NAME': 'Los Angeles, CA',
        'CRS_DEP_HOUR': 10,
        'CRS_DEP_MINUTES': 30,
        'DISTANCE': 2475,
    },
    {
        'OP_UNIQUE_CARRIER': 'UA',
        'ORIGIN_CITY_NAME': 'Chicago, IL',
        'DEST_CITY_NAME': 'Houston, TX',
        'CRS_DEP_HOUR': 14,
        'CRS_DEP_MINUTES': 45,
        'DISTANCE': 925,
    },
    {
        'OP_UNIQUE_CARRIER': 'DL',
        'ORIGIN_CITY_NAME': 'Atlanta, GA',
        'DEST_CITY_NAME': 'San Francisco, CA',
        'CRS_DEP_HOUR': 16,
        'CRS_DEP_MINUTES': 15,
        'DISTANCE': 2130,
    },
])

# Predict delays for multiple flights and attach them as a new column
predicted_delays = best_pipeline.predict(new_flights)
new_flights['Predicted Delay (minutes)'] = predicted_delays

# Display the results
print("\nPredicted Delays for Multiple Flights:")
print(new_flights)


Random Forest Results:
Mean Absolute Error (MAE): 25.53
Mean Squared Error (MSE): 3538.65
R2 Score: -0.12

Decision Tree Results:
Mean Absolute Error (MAE): 25.79
Mean Squared Error (MSE): 3735.60
R2 Score: -0.18

XGBoost Results:
Mean Absolute Error (MAE): 24.72
Mean Squared Error (MSE): 3247.02
R2 Score: -0.02

Model Performance Comparison:
           Model        MAE          MSE  R2 Score
0  Random Forest  25.530772  3538.647411 -0.115404
1  Decision Tree  25.786504  3735.601501 -0.177486
2        XGBoost  24.721433  3247.017252 -0.023481
The best model is XGBoost with an MAE of 24.7214
Best model saved as 'best_flight_delay_model.pkl'.
Predicted Delay for the flight: 9.16 minutes

Predicted Delays for Multiple Flights:
  OP_UNIQUE_CARRIER ORIGIN_CITY_NAME     DEST_CITY_NAME  CRS_DEP_HOUR  \
0                AA     New York, NY    Los Angeles, CA            10   
1                UA      Chicago, IL        Houston, TX            14   
2                DL      Atlanta, GA  San Fran