In [112]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import sklearn
import os
from sklearn.model_selection import train_test_split

In [113]:
def convert_to_categorical(time_str):
    """Translate a time string such as '08:55' into a part-of-day label.

    Only the text before the first ':' is looked at; it is parsed with
    ``int``. Hours 0 through 20 fall into seven consecutive three-hour
    buckets; any other integer (21 and above, or a negative value) is
    labelled 'Night'.

    Parameters:
        time_str (str): Time text whose leading field is the hour.

    Returns:
        str: One of 'Late Night', 'Early Morning', 'Morning', 'Late Morning',
        'Noon', 'Afternoon', 'Evening' or 'Night'.
    """
    # One label per three-hour bucket starting at hours 0, 3, 6, ..., 18.
    bucket_labels = (
        'Late Night',
        'Early Morning',
        'Morning',
        'Late Morning',
        'Noon',
        'Afternoon',
        'Evening',
    )

    hour_text, _, _ = time_str.partition(':')
    hour = int(hour_text)

    # Everything outside the seven tabulated buckets is 'Night'.
    if hour < 0 or hour >= 21:
        return 'Night'
    return bucket_labels[hour // 3]

In [114]:
data = pd.read_csv("INITIAL_PROCESSED_DATA.csv")

In [115]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),Arrival Delay (Minutes),FLIGHT_STATUS,month,day,season,WeekDay
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,160,LATE,1,1,winter,Friday
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,16,LATE,1,1,winter,Friday
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,4,ONTIME,1,1,winter,Friday
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,22,LATE,1,1,winter,Friday
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,73,LATE,1,1,winter,Friday


In [116]:
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)

In [117]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),Arrival Delay (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,SCHED_ARRV_TIME_CAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,160,LATE,1,1,winter,Friday,Late Night
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,16,LATE,1,1,winter,Friday,Morning
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,4,ONTIME,1,1,winter,Friday,Late Morning
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,22,LATE,1,1,winter,Friday,Late Morning
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,73,LATE,1,1,winter,Friday,Late Morning


# Processing For Prediction Starts Here -- TARGET CONVERTED TO CATEGORICAL

In [118]:
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time','Arrival Delay (Minutes)','day'])

In [119]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,season,WeekDay,SCHED_ARRV_TIME_CAT
0,B6,JFK,76,LATE,1,winter,Friday,Late Night
1,B6,JFK,75,LATE,1,winter,Friday,Morning
2,MQ,ORD,100,ONTIME,1,winter,Friday,Late Morning
3,9E,DTW,84,LATE,1,winter,Friday,Late Morning
4,B6,JFK,71,LATE,1,winter,Friday,Late Morning


In [120]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns and pass the rest through.

    Wraps ``OneHotEncoder(sparse_output=False, drop='first')``. On transform
    the selected columns are removed and their dummy columns are appended
    after the remaining columns, keeping the input's index.
    """

    def __init__(self, columns=None):
        # Labels of the columns to encode; the wrapped encoder is built in fit().
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        """Fit a fresh OneHotEncoder on ``X[self.columns]``; ``y`` is ignored."""
        one_hot = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder = one_hot
        one_hot.fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a new DataFrame with the selected columns replaced by dummies."""
        dummy_values = self.encoder.transform(X[self.columns])
        dummy_names = self.encoder.get_feature_names_out(self.columns)
        dummies = pd.DataFrame(dummy_values, columns=dummy_names, index=X.index)

        # Work on a copy so the caller's frame is left untouched.
        passthrough = X.copy().drop(columns=self.columns)

        # Sharing X's index lets the column-wise concat align row for row.
        return pd.concat([passthrough, dummies], axis=1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded form; ``y`` is ignored."""
        return self.fit(X).transform(X)

In [121]:
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'FLIGHT_STATUS', 'month', 'season', 'WeekDay', 'SCHED_ARRV_TIME_CAT'],
      dtype='object')

In [122]:
df['FLIGHT_STATUS'].value_counts()

FLIGHT_STATUS
EARLY     52240
LATE      34106
ONTIME    27325
Name: count, dtype: int64

In [123]:
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT', 'month', 'WeekDay'])

In [124]:
encoded_data = encoder.fit_transform(df.drop(columns=['FLIGHT_STATUS']))

In [125]:
# Hold out 20% of the encoded rows as a test set. The split is stratified on
# FLIGHT_STATUS (the target) and uses a fixed random_state so it is repeatable.
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['FLIGHT_STATUS'], 
    test_size=0.2, 
    random_state=947,
    stratify=df['FLIGHT_STATUS']
)

In [126]:
from sklearn.preprocessing import StandardScaler

In [127]:
# Standardise every feature column (the one-hot dummy columns included).
scaler = StandardScaler()


# The scaler is fitted on the training split only and then re-used on the test
# split; index and column labels are restored because the scaler returns arrays.
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [128]:
enc_trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,Carrier Code_OO,...,month_9,month_10,month_11,month_12,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
109767,0.420191,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,2.914831,-0.226431,-0.280112,...,3.302855,-0.312682,-0.30596,-0.300294,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
21666,-1.129327,-0.237358,-0.448859,-0.351464,2.899624,-0.127516,-0.15405,-0.343073,-0.226431,-0.280112,...,-0.302768,-0.312682,-0.30596,-0.300294,-0.417338,-0.370459,2.449914,-0.420982,-0.409240,-0.411655
107873,2.104451,-0.237358,2.227871,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,-0.280112,...,-0.302768,-0.312682,-0.30596,-0.300294,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
20188,0.049654,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,2.914831,-0.226431,-0.280112,...,-0.302768,-0.312682,-0.30596,-0.300294,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
96511,-0.624050,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,4.416353,-0.280112,...,3.302855,-0.312682,-0.30596,-0.300294,-0.417338,-0.370459,-0.408178,-0.420982,2.443552,-0.411655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56030,1.969710,-0.237358,2.227871,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,-0.280112,...,-0.302768,-0.312682,-0.30596,-0.300294,-0.417338,2.699351,-0.408178,-0.420982,-0.409240,-0.411655
22547,1.228636,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,-0.280112,...,-0.302768,-0.312682,-0.30596,-0.300294,-0.417338,-0.370459,2.449914,-0.420982,-0.409240,-0.411655
63492,0.790729,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,-0.280112,...,-0.302768,-0.312682,-0.30596,-0.300294,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
51736,1.262321,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,-0.280112,...,-0.302768,3.198133,-0.30596,-0.300294,2.396141,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655


In [129]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def logistic_regression_classification(trainX, trainY, testX, testY, penalty='l2', C=1.0, max_iter=1000, solver='lbfgs'):
    """
    Fit a regularized Logistic Regression classifier and score it on a test split.

    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        penalty (str, optional): Regularization type ('l1' for Lasso, 'l2' for Ridge). The chosen
            solver must support it. Default is 'l2'.
        C (float, optional): Inverse of regularization strength; smaller values specify stronger regularization. Default is 1.0.
        max_iter (int, optional): Maximum number of iterations for optimization algorithm. Default is 1000.
        solver (str, optional): Optimization algorithm to use ('lbfgs', 'sag', 'saga', etc.). Default is 'lbfgs'.

    Returns:
        dict: ``{'accuracy': float, 'classification_report': dict}`` where the report is the
        ``output_dict=True`` form of ``classification_report``, keyed by class name.
    """
    # Solver progress is printed only for long runs (max_iter above 300).
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter, solver=solver, verbose=1 if max_iter > 300 else 0)
    model.fit(trainX, trainY)

    # Predict on the testing set
    testY_pred = model.predict(testX)

    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)

    # classification_report pairs target_names with labels positionally. Taking the names
    # from trainY.unique() (order of first appearance) could attach the wrong name to a row,
    # so both the labels and their display names come from the fitted model's classes_.
    class_labels = model.classes_
    report = classification_report(
        testY,
        testY_pred,
        labels=class_labels,
        target_names=[str(label) for label in class_labels],
        output_dict=True,
    )

    results = {
        'accuracy': accuracy,
        'classification_report': report
    }

    return results


In [130]:
report = logistic_regression_classification(enc_trainX, trainY, enc_testX, testY, max_iter=1500, solver='saga')
print(report)

Epoch 1, change: 1.00000000
Epoch 2, change: 0.43340147
Epoch 3, change: 0.24116213
Epoch 4, change: 0.17371450
Epoch 5, change: 0.13009000
Epoch 6, change: 0.10819865
Epoch 7, change: 0.09233679
Epoch 8, change: 0.07905501
Epoch 9, change: 0.06894849
Epoch 10, change: 0.06122933
Epoch 11, change: 0.05496680
Epoch 12, change: 0.04956847
Epoch 13, change: 0.04506682
Epoch 14, change: 0.04124992
Epoch 15, change: 0.03785129
Epoch 16, change: 0.03500555
Epoch 17, change: 0.03242198
Epoch 18, change: 0.03012954
Epoch 19, change: 0.02818909
Epoch 20, change: 0.02634122
Epoch 21, change: 0.02469495
Epoch 22, change: 0.02318396
Epoch 23, change: 0.02187475
Epoch 24, change: 0.02062627
Epoch 25, change: 0.01940327
Epoch 26, change: 0.01843742
Epoch 27, change: 0.01745039
Epoch 28, change: 0.01649759
Epoch 29, change: 0.01567439
Epoch 30, change: 0.01493711
Epoch 31, change: 0.01421210
Epoch 32, change: 0.01348884
Epoch 33, change: 0.01284545
Epoch 34, change: 0.01224508
Epoch 35, change: 0.011

In [82]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def decision_tree_classification(trainX, trainY, testX, testY, criterion='gini', max_depth=None):
    """
    Fit a Decision Tree classifier and score it on a test split.

    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        criterion (str, optional): Criterion used to measure the quality of a split ('gini' or 'entropy'). Default is 'gini'.
        max_depth (int, optional): Maximum depth of the tree. If None, the tree is fully grown. Default is None.

    Returns:
        dict: ``{'accuracy': float, 'classification_report': dict}`` where the report is the
        ``output_dict=True`` form of ``classification_report``, keyed by class name.
    """
    # Initialize and train the Decision Tree model
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
    model.fit(trainX, trainY)

    # Predict on the testing set
    testY_pred = model.predict(testX)

    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)

    # classification_report pairs target_names with labels positionally. Taking the names
    # from trainY.unique() (order of first appearance) could attach the wrong name to a row,
    # so both the labels and their display names come from the fitted model's classes_.
    class_labels = model.classes_
    report = classification_report(
        testY,
        testY_pred,
        labels=class_labels,
        target_names=[str(label) for label in class_labels],
        output_dict=True,
    )

    results = {
        'accuracy': accuracy,
        'classification_report': report
    }

    return results

In [83]:
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate(model, trainX, trainY, testX, testY):
    """Fit ``model`` on integer-encoded labels and score it on the test split.

    The targets are passed through a LabelEncoder fitted on ``trainY``, so the
    per-class entries of the returned report are keyed by the encoded integers
    (as strings), not by the original label text.

    Returns:
        dict: ``{'accuracy': float, 'classification_report': dict}``.
    """
    # Fit the label mapping on the training targets, then encode both splits.
    encoder = LabelEncoder().fit(trainY)
    y_train = encoder.transform(trainY)
    y_test = encoder.transform(testY)

    model.fit(trainX, y_train)
    predictions = model.predict(testX)

    return {
        'accuracy': accuracy_score(y_test, predictions),
        'classification_report': classification_report(y_test, predictions, output_dict=True),
    }

# Update other classification functions similarly...


# Random Forest
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(trainX, trainY, testX, testY, n_estimators=100, criterion='gini', max_depth=None):
    """Build a RandomForestClassifier from the given settings and score it via fit_and_evaluate."""
    forest_settings = {
        'n_estimators': n_estimators,
        'criterion': criterion,
        'max_depth': max_depth,
    }
    return fit_and_evaluate(RandomForestClassifier(**forest_settings), trainX, trainY, testX, testY)

# Support Vector Machines (SVM)
from sklearn.svm import SVC

def svm_classification(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    """Build an SVC with the given kernel and C and score it via fit_and_evaluate."""
    return fit_and_evaluate(
        SVC(kernel=kernel, C=C),
        trainX,
        trainY,
        testX,
        testY,
    )

# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(trainX, trainY, testX, testY, n_neighbors=5):
    """Build a KNeighborsClassifier with ``n_neighbors`` and score it via fit_and_evaluate."""
    return fit_and_evaluate(
        KNeighborsClassifier(n_neighbors=n_neighbors),
        trainX,
        trainY,
        testX,
        testY,
    )

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

def gbm_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    """Build a GradientBoostingClassifier from the given settings and score it via fit_and_evaluate."""
    boosting_settings = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
    }
    return fit_and_evaluate(GradientBoostingClassifier(**boosting_settings), trainX, trainY, testX, testY)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classification(trainX, trainY, testX, testY):
    """Score a default GaussianNB classifier via fit_and_evaluate."""
    return fit_and_evaluate(GaussianNB(), trainX, trainY, testX, testY)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classification(trainX, trainY, testX, testY, n_estimators=50, learning_rate=1.0):
    """Build an AdaBoostClassifier from the given settings and score it via fit_and_evaluate."""
    return fit_and_evaluate(
        AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate),
        trainX,
        trainY,
        testX,
        testY,
    )

# XGBoost
from xgboost import XGBClassifier

def xgboost_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    """Build an XGBClassifier from the given settings and score it via fit_and_evaluate."""
    xgb_settings = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
    }
    return fit_and_evaluate(XGBClassifier(**xgb_settings), trainX, trainY, testX, testY)


In [84]:
# Collect one result dict per baseline model, all trained on the scaled split.
# The position in `res` is the "Experiment" index shown by plot_results:
#   0 decision tree, 1 random forest, 2 KNN, 3 gradient boosting,
#   4 naive Bayes, 5 AdaBoost, 6 XGBoost.
# The decision-tree helper builds its own report; the others go through
# fit_and_evaluate, whose per-class keys are the encoded integers. Both
# expose the 'accuracy' and 'weighted avg' entries that plot_results reads.
res = []

res.append(decision_tree_classification(enc_trainX, trainY, enc_testX, testY))

res.append(random_forest_classification(enc_trainX, trainY, enc_testX, testY))

# The SVM baseline is not part of the comparison (its call is commented out).
#svm_classification(enc_trainX, trainY, enc_testX, testY)

res.append(knn_classification(enc_trainX, trainY, enc_testX, testY))

res.append(gbm_classification(enc_trainX, trainY, enc_testX, testY))

res.append(naive_bayes_classification(enc_trainX, trainY, enc_testX, testY))

res.append(adaboost_classification(enc_trainX, trainY, enc_testX, testY))

res.append(xgboost_classification(enc_trainX, trainY, enc_testX, testY))

def plot_results(results_list, labels=None):
    """Plot accuracy and weighted precision/recall/F1 for a list of experiments.

    Parameters:
        results_list (list of dict): One entry per experiment, each holding an
            'accuracy' value and a 'classification_report' dict that contains a
            'weighted avg' entry with 'precision', 'recall' and 'f1-score'.
        labels (sequence of str, optional): Tick label for each experiment, in the
            same order as ``results_list``. When omitted the x axis shows the
            experiment's position (0, 1, 2, ...), as before.

    Raises:
        ValueError: If ``labels`` is given but its length differs from
            ``results_list``.
    """
    # Validate before any figure is created so a bad call leaves no empty plots.
    if labels is not None and len(labels) != len(results_list):
        raise ValueError(
            f"labels has {len(labels)} entries but results_list has {len(results_list)}"
        )
    tick_labels = list(labels) if labels is not None else None

    accuracy_scores = [result['accuracy'] for result in results_list]
    weighted_rows = [result['classification_report']['weighted avg'] for result in results_list]
    precision_scores = [row['precision'] for row in weighted_rows]
    recall_scores = [row['recall'] for row in weighted_rows]
    f1_scores = [row['f1-score'] for row in weighted_rows]

    positions = range(len(results_list))

    # Plot accuracy scores
    plt.figure(figsize=(8, 4))
    plt.bar(positions, accuracy_scores, color='skyblue')
    plt.xlabel('Experiment')
    plt.ylabel('Accuracy')
    plt.title('Accuracy Scores')
    plt.xticks(positions, tick_labels)
    plt.show()

    # Plot precision, recall, and f1-scores
    plt.figure(figsize=(10, 6))
    sns.lineplot(x=positions, y=precision_scores, label='Precision')
    sns.lineplot(x=positions, y=recall_scores, label='Recall')
    sns.lineplot(x=positions, y=f1_scores, label='F1-score')
    plt.xlabel('Experiment')
    plt.ylabel('Score')
    plt.title('Precision, Recall, and F1-score')
    plt.legend()
    plt.xticks(positions, tick_labels)
    plt.show()


plot_results(res)

# CROSS-VALIDATION FOR HYPERPARAMETER TUNING

In [85]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report

def model_with_cross_validation(model, params, trainX, trainY, testX, testY, cv=5):
    """
    Tune a classifier with grid search under cross-validation, then score the best model.

    Parameters:
        model: Classifier object (e.g., RandomForestClassifier, SVC, etc.).
        params: Dictionary containing hyperparameters to tune.
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        cv (int or cross-validation generator, optional): An integer is used as the number of
            folds of a shuffled KFold (random_state=42); any other object is handed to
            GridSearchCV unchanged as the splitting strategy. Default is 5.

    Returns:
        dict: Dictionary with 'accuracy', 'classification_report' (per-class entries keyed by
        the label-encoded integers) and 'best_model' (the refitted best estimator).
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)

    # Only an integer can be used as n_splits; a ready-made splitter must be passed through
    # as-is (previously it was fed to KFold(n_splits=...) and raised).
    if isinstance(cv, numbers.Integral):
        splitter = KFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        splitter = cv

    # Perform hyperparameter tuning using GridSearchCV with verbose output
    grid_search = GridSearchCV(model, params, cv=splitter, scoring='accuracy', n_jobs=-1, verbose=2)
    grid_search.fit(trainX, trainY_encoded)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Make predictions on the testing set
    testY_pred = best_model.predict(testX)

    # Calculate accuracy score
    accuracy = accuracy_score(testY_encoded, testY_pred)

    # Generate classification report
    report = classification_report(testY_encoded, testY_pred, output_dict=True)

    results = {
        'accuracy': accuracy,
        'classification_report': report,
        'best_model': best_model
    }

    return results

# Usage remains the same for each classification method...

# The grid searches below are expensive (the decision-tree grid has 108 candidates and the
# XGBoost grid 540, each fitted on 3 folds), so they are off by default. Previously the
# calls were commented out while the prints were left in, which raised NameError on
# `results` in a fresh kernel. Set this to True to run the searches and print their results.
RUN_GRID_SEARCH = False

# Random-forest grid: defined for reference only; no search is run with it in this cell
# and `model` is reassigned immediately below.
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_depth': [None, 3, 6, 9],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [3, 6, 9]
}

# Decision-tree grid search.
model = DecisionTreeClassifier()
params = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}
if RUN_GRID_SEARCH:
    results = model_with_cross_validation(model, params, enc_trainX, trainY, enc_testX, testY, cv=3)
    print(results['accuracy'])
    print(results['classification_report'])
    print(results['best_model'])

# XGBoost grid search.
model = XGBClassifier()
xgb_param_grid = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25],
    "max_depth": [5, 10, 15],
    "min_child_weight": [1, 3, 5],
    "gamma": [0.0, 0.1, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.5, 0.7]
}
if RUN_GRID_SEARCH:
    results = model_with_cross_validation(model, xgb_param_grid, enc_trainX, trainY, enc_testX, testY, cv=3)
    print(results['accuracy'])
    print(results['classification_report'])
    print(results['best_model'])

import sys
if RUN_GRID_SEARCH:
    print(results['best_model'])

In [86]:
random_forest_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.44099406201891356,
 'classification_report': {'0': {'precision': 0.5352547606793618,
   'recall': 0.5972434915773354,
   'f1-score': 0.5645526101510902,
   'support': 10448.0},
  '1': {'precision': 0.38706697459584294,
   'recall': 0.3685136323658751,
   'f1-score': 0.37756251407974767,
   'support': 6822.0},
  '2': {'precision': 0.2776080314273243,
   'recall': 0.23275388838060385,
   'f1-score': 0.2532099134069872,
   'support': 5465.0},
  'accuracy': 0.44099406201891356,
  'macro avg': {'precision': 0.39997658890084303,
   'recall': 0.39950367077460475,
   'f1-score': 0.39844167921260837,
   'support': 22735.0},
  'weighted avg': {'precision': 0.42885597237832157,
   'recall': 0.44099406201891356,
   'f1-score': 0.4336032249254372,
   'support': 22735.0}}}

In [87]:
knn_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.45779634924125795,
 'classification_report': {'0': {'precision': 0.51586859688196,
   'recall': 0.7094180704441041,
   'f1-score': 0.5973565441650548,
   'support': 10448.0},
  '1': {'precision': 0.39048917675132094,
   'recall': 0.3358252711814717,
   'f1-score': 0.36110016549767515,
   'support': 6822.0},
  '2': {'precision': 0.282,
   'recall': 0.12900274473924978,
   'f1-score': 0.17702448210922786,
   'support': 5465.0},
  'accuracy': 0.45779634924125795,
  'macro avg': {'precision': 0.3961192578777603,
   'recall': 0.39141536212160855,
   'f1-score': 0.3784937305906526,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4220295695632385,
   'recall': 0.45779634924125795,
   'f1-score': 0.4254253484578211,
   'support': 22735.0}}}

In [88]:
gbm_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4977787552232241,
 'classification_report': {'0': {'precision': 0.5055917480998914,
   'recall': 0.8913667687595712,
   'f1-score': 0.6452126922543994,
   'support': 10448.0},
  '1': {'precision': 0.46900072939460247,
   'recall': 0.2827616534740545,
   'f1-score': 0.35281207133058984,
   'support': 6822.0},
  '2': {'precision': 0.3712871287128713,
   'recall': 0.013723696248856358,
   'f1-score': 0.026469031233456855,
   'support': 5465.0},
  'accuracy': 0.4977787552232241,
  'macro avg': {'precision': 0.44862653540245506,
   'recall': 0.39595070616082734,
   'f1-score': 0.34149793160614866,
   'support': 22735.0},
  'weighted avg': {'precision': 0.46232811605425495,
   'recall': 0.4977787552232241,
   'f1-score': 0.4087406824271868,
   'support': 22735.0}}}

In [89]:
naive_bayes_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.3304596437211348,
 'classification_report': {'0': {'precision': 0.5705229793977813,
   'recall': 0.10336906584992343,
   'f1-score': 0.1750263349809578,
   'support': 10448.0},
  '1': {'precision': 0.30868663483335745,
   'recall': 0.9381413075344474,
   'f1-score': 0.46452549446561425,
   'support': 6822.0},
  '2': {'precision': 0.30275229357798167,
   'recall': 0.006038426349496798,
   'f1-score': 0.011840688912809472,
   'support': 5465.0},
  'accuracy': 0.3304596437211348,
  'macro avg': {'precision': 0.3939873026030401,
   'recall': 0.3491829332446225,
   'f1-score': 0.21713083945312717,
   'support': 22735.0},
  'weighted avg': {'precision': 0.42758854611765357,
   'recall': 0.3304596437211348,
   'f1-score': 0.2226688997595765,
   'support': 22735.0}}}

In [90]:
adaboost_classification(enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.487222344402903,
 'classification_report': {'0': {'precision': 0.5024043838067547,
   'recall': 0.8599732006125574,
   'f1-score': 0.63426514188903,
   'support': 10448.0},
  '1': {'precision': 0.4315326633165829,
   'recall': 0.3021108179419525,
   'f1-score': 0.355406104500776,
   'support': 6822.0},
  '2': {'precision': 0.41333333333333333,
   'recall': 0.005672461116193962,
   'f1-score': 0.011191335740072202,
   'support': 5465.0},
  'accuracy': 0.487222344402903,
  'macro avg': {'precision': 0.4490901268188903,
   'recall': 0.3892521598902347,
   'f1-score': 0.3336208607099594,
   'support': 22735.0},
  'weighted avg': {'precision': 0.45972744657248155,
   'recall': 0.487222344402903,
   'f1-score': 0.40081562776249724,
   'support': 22735.0}}}

# FINAL MODEL FOR THE FIRST STEP

In [91]:
import shap

In [92]:
# Decision tree using the entropy criterion and log2 feature sub-sampling,
# evaluated on the held-out split with integer-encoded labels.
model = DecisionTreeClassifier(criterion='entropy', max_features='log2')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.45511326149109305, 'classification_report': {'0': {'precision': 0.5226749795888073, 'recall': 0.6740045941807045, 'f1-score': 0.5887713724342628, 'support': 10448.0}, '1': {'precision': 0.39916387959866223, 'recall': 0.3498973907944884, 'f1-score': 0.37291048273707234, 'support': 6822.0}, '2': {'precision': 0.2797074954296161, 'recall': 0.16797804208600184, 'f1-score': 0.20990053732708358, 'support': 5465.0}, 'accuracy': 0.45511326149109305, 'macro avg': {'precision': 0.40051545153902857, 'recall': 0.39729334235373154, 'f1-score': 0.39052746416613954, 'support': 22735.0}, 'weighted avg': {'precision': 0.42720939678419984, 'recall': 0.45511326149109305, 'f1-score': 0.4329265471263689, 'support': 22735.0}}}


In [93]:
# Random forest with 300 trees (other settings left at their defaults).
# Note: this overwrites the `model` and `res` globals from the previous cell.
model = RandomForestClassifier(n_estimators=300)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.4468000879700902, 'classification_report': {'0': {'precision': 0.5390214999153546, 'recall': 0.6094946401225115, 'f1-score': 0.5720959482526278, 'support': 10448.0}, '1': {'precision': 0.3922849587805257, 'recall': 0.36968630900029315, 'f1-score': 0.3806505169421176, 'support': 6822.0}, '2': {'precision': 0.28227960819234194, 'recall': 0.23202195791399818, 'f1-score': 0.25469518931405044, 'support': 5465.0}, 'accuracy': 0.4468000879700902, 'macro avg': {'precision': 0.4045286889627408, 'recall': 0.403734302345601, 'f1-score': 0.40248055150293194, 'support': 22735.0}, 'weighted avg': {'precision': 0.43327568412964684, 'recall': 0.4468000879700902, 'f1-score': 0.4383534419847754, 'support': 22735.0}}}


In [106]:
# XGBoost with hand-picked hyperparameters. learning_rate=0.01, max_depth=17 and
# min_child_weight=7 lie outside the values listed in xgb_param_grid, so they
# were presumably tuned separately -- TODO confirm where they came from.
# NOTE(review): enable_categorical=True is set although the features passed in
# are the one-hot encoded, scaled numeric frame; confirm it is still needed.
model = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=7,n_estimators=200,
    n_jobs=20,objective='multi:softprob')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5101385528920167, 'classification_report': {'0': {'precision': 0.5336762184668136, 'recall': 0.8342266462480857, 'f1-score': 0.6509335324869305, 'support': 10448.0}, '1': {'precision': 0.46713260294673215, 'recall': 0.3625036646144826, 'f1-score': 0.4082205348299769, 'support': 6822.0}, '2': {'precision': 0.36880072137060416, 'recall': 0.07483989021043001, 'f1-score': 0.12442957103742014, 'support': 5465.0}, 'accuracy': 0.5101385528920167, 'macro avg': {'precision': 0.4565365142613833, 'recall': 0.42385673369099935, 'f1-score': 0.3945278794514426, 'support': 22735.0}, 'weighted avg': {'precision': 0.4740762564387168, 'recall': 0.5101385528920167, 'f1-score': 0.4515435074446032, 'support': 22735.0}}}


In [95]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate_voting(voting_classifier, trainX, trainY, testX, testY):
    """Fit a voting ensemble on integer-encoded labels and score it on the test split.

    The targets are encoded with a LabelEncoder fitted on ``trainY``; the
    encoder's classes are passed as ``target_names`` so the report's per-class
    entries are keyed by the original label text.

    Returns:
        dict: 'accuracy', 'classification_report' and the fitted
        'voting_classifier' (the same object that was passed in).
    """
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(trainY)
    y_test = encoder.transform(testY)

    voting_classifier.fit(trainX, y_train)
    predicted = voting_classifier.predict(testX)

    return {
        'accuracy': accuracy_score(y_test, predicted),
        'classification_report': classification_report(
            y_test,
            predicted,
            target_names=encoder.classes_,
            output_dict=True,
        ),
        'voting_classifier': voting_classifier,
    }

In [133]:
from sklearn.ensemble import VotingClassifier

# Base learners for the soft-voting ensemble.
lr = LogisticRegression(penalty='l2', C=50, max_iter=1500, solver='saga')
ada = AdaBoostClassifier(n_estimators=120, learning_rate=1.0)
gbm = GradientBoostingClassifier(n_estimators=150, subsample=0.7, learning_rate=0.08)
xgb = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17,
    min_child_weight=7,n_estimators=200,
    n_jobs=20,objective='multi:softprob')

# Stand-alone score of the XGBoost member, for comparison with the ensemble. This used to
# evaluate the `model` global left over from an earlier cell, so the result depended on
# execution order; `xgb` is named explicitly instead.
res = fit_and_evaluate(xgb, enc_trainX, trainY, enc_testX, testY)

# Estimator names now match the estimators (they were 'rf' and 'dt' for the gradient
# boosting and AdaBoost models). Weights follow the same order as the estimators list.
votingCLF = VotingClassifier(
    estimators=[('gbm', gbm), ('ada', ada), ('xgb', xgb), ('lr', lr)],
    voting='soft',
    weights=[3, 5, 8, 4],
)
fit_and_evaluate_voting(votingCLF, enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.509434792170662,
 'classification_report': {'EARLY': {'precision': 0.5237350932036586,
   'recall': 0.865907350689127,
   'f1-score': 0.6526946107784432,
   'support': 10448.0},
  'LATE': {'precision': 0.471107544141252,
   'recall': 0.3441805922017004,
   'f1-score': 0.39776384889039473,
   'support': 6822.0},
  'ONTIME': {'precision': 0.3920335429769392,
   'recall': 0.03421774931381519,
   'f1-score': 0.06294177044766072,
   'support': 5465.0},
  'accuracy': 0.509434792170662,
  'macro avg': {'precision': 0.4622920601072833,
   'recall': 0.41476856406821416,
   'f1-score': 0.3711334100388329,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4762851652646765,
   'recall': 0.509434792170662,
   'f1-score': 0.43443479419572956,
   'support': 22735.0}},
 'voting_classifier': VotingClassifier(estimators=[('rf',
                               GradientBoostingClassifier(learning_rate=0.08,
                                                          n_estimators=150,
 

# Processing For Prediction Starts Here -- TARGET IS NUMERIC

In [110]:
data = pd.read_csv("INITIAL_PROCESSED_DATA_NUMERIC.csv")

In [81]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),Arrival Delay (Minutes),month,day,season
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,160,1,1,winter
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,16,1,1,winter
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,4,1,1,winter
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,22,1,1,winter
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,73,1,1,winter


In [82]:
# Bucket the scheduled arrival time into a part-of-day label (adds a column to `data` in place).
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)
# Drop the date, flight number, tail number and raw arrival-time columns. Unlike the
# categorical-target section, 'day' and 'Arrival Delay (Minutes)' are kept here;
# the delay column is used as the target further down.
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time'])

In [83]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),Arrival Delay (Minutes),month,day,season,SCHED_ARRV_TIME_CAT
0,B6,JFK,76,160,1,1,winter,Late Night
1,B6,JFK,75,16,1,1,winter,Morning
2,MQ,ORD,100,4,1,1,winter,Late Morning
3,9E,DTW,84,22,1,1,winter,Late Morning
4,B6,JFK,71,73,1,1,winter,Late Morning


In [84]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns and return a DataFrame.

    The selected columns are replaced by their dummy columns (dense output,
    first category of each column dropped). All other columns pass through
    unchanged and the row index of the input is preserved.

    Parameters
    ----------
    columns : list of str, str or None, default=None
        Columns to encode. A single column may be given as a plain string.
        When None, every non-numeric column of the frame passed to ``fit``
        is encoded.

    Attributes
    ----------
    encoder : OneHotEncoder or None
        The wrapped scikit-learn encoder; None until ``fit`` has been called.
    columns_ : list of str
        The columns actually encoded, resolved during ``fit``.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def _resolve_columns(self, X):
        """Return the list of column names to encode for the frame ``X``."""
        if self.columns is None:
            # No explicit selection: fall back to every non-numeric column.
            return list(X.select_dtypes(exclude='number').columns)
        if isinstance(self.columns, str):
            # A bare string would otherwise select a Series, which the
            # underlying encoder cannot fit.
            return [self.columns]
        return list(self.columns)

    def fit(self, X, y=None):
        """Learn the categories of the selected columns of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Raises ValueError when there is no column to encode.
        """
        columns = self._resolve_columns(X)
        if not columns:
            raise ValueError("MultiColumnOneHotEncoder found no columns to encode.")
        self.columns_ = columns
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.columns_])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns replaced by dummies.

        Raises NotFittedError (a ValueError and AttributeError subclass) when
        called before ``fit``.
        """
        if self.encoder is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This MultiColumnOneHotEncoder instance is not fitted yet; "
                "call 'fit' before 'transform'."
            )
        # Instances fitted before `columns_` existed fall back to the constructor value.
        columns = getattr(self, 'columns_', None) or self._resolve_columns(X)

        encoded_df = pd.DataFrame(
            self.encoder.transform(X[columns]),
            columns=self.encoder.get_feature_names_out(columns),
            index=X.index,  # keep rows aligned with the input frame
        )

        # Swap the raw categorical columns for their encoded counterparts.
        return pd.concat([X.drop(columns=columns), encoded_df], axis=1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded version; ``y`` is ignored."""
        self.fit(X)
        return self.transform(X)

In [85]:
# List the remaining columns to decide which ones need one-hot encoding.
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'month', 'day', 'season',
       'SCHED_ARRV_TIME_CAT'],
      dtype='object')

In [86]:
# Encode the four string-valued feature columns; the remaining features are left as they are.
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT'])

In [87]:
# One-hot encode the features (everything except the target column).
# NOTE(review): the encoder is fitted on the full frame before the split, so its
# category vocabulary also comes from rows that end up in the test set.
encoded_data = encoder.fit_transform(df.drop(columns=['Arrival Delay (Minutes)']))

# Hold out 20% of the rows for testing; the target is the raw delay in minutes.
# The fixed random_state keeps the split reproducible.
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['Arrival Delay (Minutes)'], 
    test_size=0.2, 
    random_state=947,
)

In [88]:
# Inspect the encoded training features.
trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),month,day,Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,...,season_spring,season_summer,season_winter,SCHED_ARRV_TIME_CAT_Early Morning,SCHED_ARRV_TIME_CAT_Evening,SCHED_ARRV_TIME_CAT_Late Morning,SCHED_ARRV_TIME_CAT_Late Night,SCHED_ARRV_TIME_CAT_Morning,SCHED_ARRV_TIME_CAT_Night,SCHED_ARRV_TIME_CAT_Noon
42773,78,5,29,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
66060,72,7,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
23502,67,10,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24250,105,11,9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
86407,97,12,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32830,105,11,11,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70429,116,10,28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
34109,105,1,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
105293,72,5,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [89]:
# Standardise the features. The scaler is fitted on the training split only and then
# applied unchanged to the test split; results are wrapped back into DataFrames so the
# original row index and column labels are kept.
# NOTE(review): StandardScaler is not imported in this part of the notebook — presumably
# an earlier cell imports it; confirm it survives Restart & Run All.
scaler = StandardScaler()
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [90]:
# Inspect the training target (arrival delay in minutes).
trainY

42773     -23
66060     241
23502      26
24250      40
86407     -17
         ... 
32830      53
70429      -4
34109       0
105293     42
88954     -24
Name: Arrival Delay (Minutes), Length: 90940, dtype: int64

In [91]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def fit_and_evaluate_regression(model, trainX, trainY, testX, testY):
    """Fit ``model`` on the training split and score it on the test split.

    The model is fitted in place. Returns a dict with two entries computed on
    the test split: 'mean_squared_error' and 'r2_score'.
    """
    model.fit(trainX, trainY)
    predictions = model.predict(testX)
    return {
        'mean_squared_error': mean_squared_error(testY, predictions),
        'r2_score': r2_score(testY, predictions),
    }

# Random Forest for Regression
from sklearn.ensemble import RandomForestRegressor

def random_forest_regression(trainX, trainY, testX, testY, n_estimators=300, criterion='friedman_mse', max_depth=30, random_state=None):
    """Fit a RandomForestRegressor and score it on the test split.

    Parameters
    ----------
    trainX, trainY : training features and target.
    testX, testY : held-out features and target used for scoring.
    n_estimators, criterion, max_depth : forwarded to RandomForestRegressor.
    random_state : int or None, default=None
        Seed forwarded to the forest. None keeps the previous unseeded
        behaviour; pass an int to make the run reproducible.

    Returns
    -------
    tuple
        ``(results, model)``: the metrics dict produced by
        ``fit_and_evaluate_regression`` and the fitted estimator.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        random_state=random_state,
    )
    return fit_and_evaluate_regression(model, trainX, trainY, testX, testY), model

# Support Vector Machines (SVM) for Regression
from sklearn.svm import SVR

def svm_regression(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    """Fit an SVR with the given kernel and C, then score it on the test split.

    Returns ``(results, model)``: the metrics dict produced by
    ``fit_and_evaluate_regression`` and the fitted estimator.
    """
    regressor = SVR(kernel=kernel, C=C)
    metrics = fit_and_evaluate_regression(regressor, trainX, trainY, testX, testY)
    return metrics, regressor

# K-Nearest Neighbors (KNN) for Regression
from sklearn.neighbors import KNeighborsRegressor

def knn_regression(trainX, trainY, testX, testY, n_neighbors=20):
    """Fit a KNeighborsRegressor and score it on the test split.

    Returns ``(results, model)``: the metrics dict produced by
    ``fit_and_evaluate_regression`` and the fitted estimator.
    """
    regressor = KNeighborsRegressor(n_neighbors=n_neighbors)
    metrics = fit_and_evaluate_regression(regressor, trainX, trainY, testX, testY)
    return metrics, regressor

# Gradient Boosting Machines (GBM) for Regression
from sklearn.ensemble import GradientBoostingRegressor

def gbm_regression(trainX, trainY, testX, testY, n_estimators=200, learning_rate=0.5, max_depth=20, random_state=None):
    """Fit a GradientBoostingRegressor and score it on the test split.

    Parameters
    ----------
    trainX, trainY : training features and target.
    testX, testY : held-out features and target used for scoring.
    n_estimators, learning_rate, max_depth : forwarded to GradientBoostingRegressor.
    random_state : int or None, default=None
        Seed forwarded to the estimator. None keeps the previous unseeded
        behaviour; pass an int to make the run reproducible.

    Returns
    -------
    tuple
        ``(results, model)``: the metrics dict produced by
        ``fit_and_evaluate_regression`` and the fitted estimator.
    """
    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
    )
    return fit_and_evaluate_regression(model, trainX, trainY, testX, testY), model

# Naive Bayes is not applicable for regression tasks
# AdaBoost has a regression variant (sklearn.ensemble.AdaBoostRegressor); it is simply not evaluated here

# XGBoost for Regression
from xgboost import XGBRegressor

def xgboost_regression(trainX, trainY, testX, testY, n_estimators=200, learning_rate=0.6, max_depth=30, random_state=None):
    """Fit an XGBRegressor and score it on the test split.

    Parameters
    ----------
    trainX, trainY : training features and target.
    testX, testY : held-out features and target used for scoring.
    n_estimators, learning_rate, max_depth : forwarded to XGBRegressor.
    random_state : int or None, default=None
        Seed forwarded to the estimator. None keeps the library default, as
        before; pass an int to pin the seed explicitly.

    Returns
    -------
    tuple
        ``(results, model)``: the metrics dict produced by
        ``fit_and_evaluate_regression`` and the fitted estimator.
    """
    model = XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
    )
    return fit_and_evaluate_regression(model, trainX, trainY, testX, testY), model

from sklearn.tree import DecisionTreeRegressor

def decision_tree_regression(trainX, trainY, testX, testY, criterion='friedman_mse', max_depth=30, random_state=None):
    """Fit a DecisionTreeRegressor and score it on the test split.

    Parameters
    ----------
    trainX, trainY : training features and target.
    testX, testY : held-out features and target used for scoring.
    criterion, max_depth : forwarded to DecisionTreeRegressor.
    random_state : int or None, default=None
        Seed forwarded to the tree. None keeps the previous unseeded
        behaviour; pass an int to make the run reproducible.

    Returns
    -------
    tuple
        ``(results, model)``: the metrics dict produced by
        ``fit_and_evaluate_regression`` and the fitted estimator.
    """
    model = DecisionTreeRegressor(
        criterion=criterion,
        max_depth=max_depth,
        random_state=random_state,
    )
    return fit_and_evaluate_regression(model, trainX, trainY, testX, testY), model

In [92]:
from sklearn.metrics import accuracy_score, classification_report
def evaluate_preds(testY, testY_pred):
    """Score class labels ``testY_pred`` against the true labels ``testY``.

    Returns a dict with the overall 'accuracy' and the per-class
    'classification_report' (as a nested dict).
    """
    return {
        'accuracy': accuracy_score(testY, testY_pred),
        'classification_report': classification_report(testY, testY_pred, output_dict=True),
    }

In [93]:
def create_final_preds(predicted_values, threshold=5):
    """Convert numeric arrival delays into EARLY / ONTIME / LATE labels.

    Parameters
    ----------
    predicted_values : iterable of numbers
        Delays in minutes (negative means the flight arrived early).
    threshold : number, default=5
        Half-width of the on-time window: a delay ``d`` is 'ONTIME' when
        ``abs(d) <= threshold``, 'EARLY' when ``d < -threshold`` and 'LATE'
        when ``d > threshold``.

    Returns
    -------
    list of str
        One label per input value, in input order.

    Raises
    ------
    ValueError
        If ``threshold`` is negative, or if a value cannot be compared
        (e.g. NaN). Skipping such a value would make the result shorter than
        the input and silently misalign it with the ground-truth labels.
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")

    categorical_values = []
    for position, val in enumerate(predicted_values):
        if val > threshold:
            categorical_values.append('LATE')
        elif val < -threshold:
            categorical_values.append('EARLY')
        elif abs(val) <= threshold:
            categorical_values.append('ONTIME')
        else:
            # Only reachable for values that fail every comparison, i.e. NaN.
            raise ValueError(
                f"Cannot categorise value {val!r} at position {position}"
            )

    return categorical_values

In [94]:
# Train the random-forest regressor; `res` holds its MSE / R² on the test split.
res, model = random_forest_regression(enc_trainX, trainY, enc_testX, testY)

# Predict the delay for the held-out rows with the fitted model.
predicte_delay = model.predict(enc_testX)

# Bucket both the predicted and the true delays into EARLY / ONTIME / LATE.
preds = create_final_preds(predicte_delay)
acutal = create_final_preds(testY.values)

# Score the bucketed predictions as a classification problem (true labels first).
e = evaluate_preds(acutal, preds)
print(e)

{'accuracy': 0.3981086430613591, 'classification_report': {'EARLY': {'precision': 0.6249761313729234, 'recall': 0.31302601377199696, 'f1-score': 0.417128656088702, 'support': 10456.0}, 'LATE': {'precision': 0.38463776549316264, 'recall': 0.5752828546562228, 'f1-score': 0.4610287707061901, 'support': 6894.0}, 'ONTIME': {'precision': 0.2521218867399471, 'recall': 0.33649025069637883, 'f1-score': 0.2882596245625199, 'support': 5385.0}, 'accuracy': 0.3981086430613591, 'macro avg': {'precision': 0.4205785945353444, 'recall': 0.40826637304153285, 'f1-score': 0.388805683785804, 'support': 22735.0}, 'weighted avg': {'precision': 0.46378357356673705, 'recall': 0.3981086430613591, 'f1-score': 0.39991676496947925, 'support': 22735.0}}}


In [95]:
# Train the decision-tree regressor; `res` holds its MSE / R² on the test split.
res, model = decision_tree_regression(enc_trainX, trainY, enc_testX, testY)

# Predict the delay for the held-out rows with the fitted model.
predicte_delay = model.predict(enc_testX)

# Bucket both the predicted and the true delays into EARLY / ONTIME / LATE.
preds = create_final_preds(predicte_delay)
acutal = create_final_preds(testY.values)

# Score the bucketed predictions as a classification problem (true labels first).
e = evaluate_preds(acutal, preds)
print(e)

{'accuracy': 0.4138552892016714, 'classification_report': {'EARLY': {'precision': 0.5358921161825726, 'recall': 0.49407039020657995, 'f1-score': 0.5141321656050956, 'support': 10456.0}, 'LATE': {'precision': 0.3687516706762898, 'recall': 0.400203075137801, 'f1-score': 0.38383416805787424, 'support': 6894.0}, 'ONTIME': {'precision': 0.26438624621414575, 'recall': 0.2755803156917363, 'f1-score': 0.26986724859065286, 'support': 5385.0}, 'accuracy': 0.4138552892016714, 'macro avg': {'precision': 0.3896766776910027, 'recall': 0.38995126034537236, 'f1-score': 0.3892778607512075, 'support': 22735.0}, 'weighted avg': {'precision': 0.42090089818827775, 'recall': 0.4138552892016714, 'f1-score': 0.4167650675970323, 'support': 22735.0}}}


In [96]:
# Train the XGBoost regressor; `res` holds its MSE / R² on the test split.
res, model = xgboost_regression(enc_trainX, trainY, enc_testX, testY)

# Predict the delay for the held-out rows with the fitted model.
predicte_delay = model.predict(enc_testX)

# Bucket both the predicted and the true delays into EARLY / ONTIME / LATE.
preds = create_final_preds(predicte_delay)
acutal = create_final_preds(testY.values)

# Score the bucketed predictions as a classification problem (true labels first).
e = evaluate_preds(acutal, preds)
print(e)

{'accuracy': 0.4149549153287882, 'classification_report': {'EARLY': {'precision': 0.5568195144834056, 'recall': 0.45409334353481257, 'f1-score': 0.5002370542063952, 'support': 10456.0}, 'LATE': {'precision': 0.3686845698680018, 'recall': 0.4699738903394256, 'f1-score': 0.4132126004336182, 'support': 6894.0}, 'ONTIME': {'precision': 0.26678966789667896, 'recall': 0.26852367688022283, 'f1-score': 0.2676538639518741, 'support': 5385.0}, 'accuracy': 0.4149549153287882, 'macro avg': {'precision': 0.3974312507493621, 'recall': 0.39753030358482033, 'f1-score': 0.3937011728639625, 'support': 22735.0}, 'weighted avg': {'precision': 0.4310744943801236, 'recall': 0.4149549153287882, 'f1-score': 0.41875884598866386, 'support': 22735.0}}}


In [97]:
# Train the gradient-boosting regressor; `res` holds its MSE / R² on the test split.
res, model = gbm_regression(enc_trainX, trainY, enc_testX, testY)

# Predict the delay for the held-out rows with the fitted model.
predicte_delay = model.predict(enc_testX)

# Bucket both the predicted and the true delays into EARLY / ONTIME / LATE.
preds = create_final_preds(predicte_delay)
acutal = create_final_preds(testY.values)

# Score the bucketed predictions as a classification problem (true labels first).
e = evaluate_preds(acutal, preds)
print(e)

{'accuracy': 0.4130195733450627, 'classification_report': {'EARLY': {'precision': 0.5470008952551477, 'recall': 0.46748278500382556, 'f1-score': 0.5041254125412541, 'support': 10456.0}, 'LATE': {'precision': 0.3660683662970161, 'recall': 0.4644618508848274, 'f1-score': 0.4094367367815357, 'support': 6894.0}, 'ONTIME': {'precision': 0.2573238321456849, 'recall': 0.2414113277623027, 'f1-score': 0.24911372999904188, 'support': 5385.0}, 'accuracy': 0.4130195733450627, 'macro avg': {'precision': 0.3901310312326163, 'recall': 0.3911186545503185, 'f1-score': 0.38755862644061057, 'support': 22735.0}, 'weighted avg': {'precision': 0.42352344465115316, 'recall': 0.4130195733450627, 'f1-score': 0.4150107593115505, 'support': 22735.0}}}


In [98]:
from sklearn.ensemble import VotingRegressor

# Define the regressors to combine.
# NOTE: despite its name, `rf_classifier` is a RandomForestRegressor.
rf_classifier = RandomForestRegressor(n_estimators=300, max_features='log2', max_depth=30, criterion='friedman_mse')
DT = DecisionTreeRegressor(max_features='log2', criterion='friedman_mse')
xgb = XGBRegressor(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.5, max_depth=20, 
    min_child_weight=5,n_estimators=300,
    n_jobs=5)

In [99]:
from sklearn.base import BaseEstimator, RegressorMixin
class HybridRegressor(BaseEstimator, RegressorMixin):
    """Three-stage additive ensemble where each stage fits the previous residuals.

    ``regressor1`` is fitted on the target, ``regressor2`` on the training
    residuals left by ``regressor1``, and ``regressor3`` on the residuals left
    after the first two stages. The prediction is the sum of the three stage
    predictions.

    Parameters
    ----------
    regressor1, regressor2, regressor3 : estimators exposing ``fit`` and ``predict``
        The stages, in fitting order. They are fitted in place (not cloned),
        so the objects passed in are the fitted stages afterwards.
    """

    def __init__(self, regressor1, regressor2, regressor3):
        self.regressor1 = regressor1
        self.regressor2 = regressor2
        self.regressor3 = regressor3

    def fit(self, X, y):
        """Fit the three stages sequentially on ``X`` / ``y`` and return ``self``."""
        # Stage 1 learns the raw target.
        self.regressor1.fit(X, y)
        # Stage 2 learns what stage 1 still gets wrong on the training data.
        y_pred1 = self.regressor1.predict(X)
        self.regressor2.fit(X, y - y_pred1)
        # Stage 3 learns what is left after stages 1 and 2.
        y_pred2 = self.regressor2.predict(X)
        self.regressor3.fit(X, y - y_pred1 - y_pred2)
        return self

    def predict(self, X):
        """Return the sum of the three stage predictions for ``X``."""
        stage1 = self.regressor1.predict(X)
        stage2 = self.regressor2.predict(X)
        stage3 = self.regressor3.predict(X)
        # Sum out of place: an in-place `+=` would modify the array returned by
        # the first stage and force its dtype onto the total (precision loss
        # for float32 output, an error for integer output).
        return stage1 + stage2 + stage3

    def evaluate(self, X, y):
        """Return the mean squared error of the ensemble's predictions on ``X`` / ``y``."""
        return mean_squared_error(y, self.predict(X))


In [100]:
# Stage order: random forest on the target, then XGBoost and the decision tree on the residuals.
hybrid = HybridRegressor(rf_classifier, xgb, DT)

In [101]:
# Fit the three-stage hybrid on the scaled training data.
model = hybrid
model.fit(enc_trainX, trainY)

# Predict the delay for the held-out rows with the fitted ensemble.
predicte_delay = model.predict(enc_testX)

# Bucket both the predicted and the true delays into EARLY / ONTIME / LATE.
preds = create_final_preds(predicte_delay)
acutal = create_final_preds(testY.values)

# Score the bucketed predictions as a classification problem (true labels first).
e = evaluate_preds(acutal, preds)

In [102]:
# Show the accuracy / classification report stored in `e`.
print(e)

{'accuracy': 0.40972069496371233, 'classification_report': {'EARLY': {'precision': 0.5356179775280899, 'recall': 0.45591048201989287, 'f1-score': 0.4925604463732176, 'support': 10456.0}, 'LATE': {'precision': 0.36340587365307414, 'recall': 0.49898462431099505, 'f1-score': 0.42053789731051344, 'support': 6894.0}, 'ONTIME': {'precision': 0.2536049439230945, 'recall': 0.20575673166202413, 'f1-score': 0.22718884560180438, 'support': 5385.0}, 'accuracy': 0.40972069496371233, 'macro avg': {'precision': 0.3842095983680862, 'recall': 0.38688394599763737, 'f1-score': 0.3800957297618451, 'support': 22735.0}, 'weighted avg': {'precision': 0.416600144667863, 'recall': 0.40972069496371233, 'f1-score': 0.4078650637740382, 'support': 22735.0}}}
