In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import sklearn
import os
from sklearn.model_selection import train_test_split

In [2]:
def convert_to_categorical(time_str):
    hour = int(time_str.split(':')[0])
    
    if 0 <= hour < 3:
        return 'Late Night'
    elif 3 <= hour < 6:
        return 'Early Morning'
    elif 6 <= hour < 9:
        return 'Morning'
    elif 9 <= hour < 12:
        return 'Late Morning'
    elif 12 <= hour < 15:
        return 'Noon'
    elif 15 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# 1 HOP

In [3]:
data = pd.read_csv("1_HOP.csv")

In [4]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,LATE
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,ONTIME,1,1,winter,LATE
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,LATE,1,1,winter,ONTIME
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,LATE,1,1,winter,LATE


In [5]:
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)

In [6]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME,Late Night
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,LATE,Morning
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,ONTIME,1,1,winter,LATE,Late Morning
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,LATE,1,1,winter,ONTIME,Late Morning
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,LATE,1,1,winter,LATE,Late Morning


In [7]:
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time'])

In [8]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,JFK,76,LATE,1,1,winter,ONTIME,Late Night
1,B6,JFK,75,LATE,1,1,winter,LATE,Morning
2,MQ,ORD,100,ONTIME,1,1,winter,LATE,Late Morning
3,9E,DTW,84,LATE,1,1,winter,ONTIME,Late Morning
4,B6,JFK,71,LATE,1,1,winter,LATE,Late Morning


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        X_encoded = X.copy()
        encoded_data = self.encoder.transform(X[self.columns])
        encoded_df = pd.DataFrame(encoded_data, columns=self.encoder.get_feature_names_out(self.columns), index=X.index)
        
        # Drop the original columns
        X_encoded = X_encoded.drop(columns=self.columns)
        
        # Concatenate the encoded DataFrame with the original DataFrame, preserving the index
        X_encoded = pd.concat([X_encoded, encoded_df], axis=1)
        return X_encoded

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [10]:
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'FLIGHT_STATUS', 'month', 'day', 'season', 'PREV_STAT',
       'SCHED_ARRV_TIME_CAT'],
      dtype='object')

In [11]:
df['FLIGHT_STATUS'].value_counts()

FLIGHT_STATUS
EARLY     52242
LATE      34107
ONTIME    27326
Name: count, dtype: int64

In [12]:
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT', 'month','PREV_STAT'])

In [13]:
encoded_data = encoder.fit_transform(df.drop(columns=['FLIGHT_STATUS']))

In [14]:
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['FLIGHT_STATUS'], 
    test_size=0.2, 
    random_state=947,
    stratify=df['FLIGHT_STATUS']
)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()


enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [17]:
enc_trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),day,Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME
97066,0.083886,-0.537515,4.176871,-0.452203,-0.350921,-0.345383,-0.127692,-0.152656,-0.341501,-0.225969,...,-0.306747,-0.301439,-0.306619,-0.308504,-0.302913,3.196261,-0.305695,-0.300091,-0.654071,-0.562754
60085,0.117531,-1.678626,-0.239414,-0.452203,-0.350921,-0.345383,-0.127692,-0.152656,-0.341501,-0.225969,...,-0.306747,-0.301439,-0.306619,-0.308504,-0.302913,-0.312866,-0.305695,-0.300091,-0.654071,-0.562754
4849,-1.160986,-0.651626,-0.239414,-0.452203,-0.350921,-0.345383,-0.127692,-0.152656,-0.341501,-0.225969,...,-0.306747,3.317421,-0.306619,-0.308504,-0.302913,-0.312866,-0.305695,-0.300091,1.528886,-0.562754
52868,1.968016,-0.081070,-0.239414,2.211397,-0.350921,-0.345383,-0.127692,-0.152656,-0.341501,-0.225969,...,-0.306747,-0.301439,-0.306619,-0.308504,-0.302913,-0.312866,-0.305695,-0.300091,-0.654071,1.776974
10072,0.891370,1.402375,-0.239414,-0.452203,-0.350921,2.895340,-0.127692,-0.152656,-0.341501,-0.225969,...,-0.306747,-0.301439,-0.306619,-0.308504,-0.302913,-0.312866,3.271235,-0.300091,-0.654071,1.776974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129,-0.689953,-0.423403,-0.239414,-0.452203,-0.350921,-0.345383,-0.127692,-0.152656,-0.341501,4.425378,...,-0.306747,-0.301439,-0.306619,-0.308504,-0.302913,-0.312866,-0.305695,-0.300091,-0.654071,-0.562754
22544,0.050241,0.375375,-0.239414,-0.452203,-0.350921,-0.345383,-0.127692,-0.152656,2.928250,-0.225969,...,-0.306747,-0.301439,-0.306619,3.241454,-0.302913,-0.312866,-0.305695,-0.300091,-0.654071,-0.562754
37772,-0.790889,1.744709,-0.239414,-0.452203,-0.350921,2.895340,-0.127692,-0.152656,-0.341501,-0.225969,...,-0.306747,-0.301439,3.261380,-0.308504,-0.302913,-0.312866,-0.305695,-0.300091,-0.654071,1.776974
113539,0.857725,1.402375,-0.239414,-0.452203,2.849640,-0.345383,-0.127692,-0.152656,-0.341501,-0.225969,...,-0.306747,-0.301439,-0.306619,-0.308504,-0.302913,-0.312866,-0.305695,3.332319,-0.654071,-0.562754


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def logistic_regression_classification(trainX, trainY, testX, testY, penalty='l2', C=1.0, max_iter=1000, solver='lbfgs'):
    """
    Perform classification using Regularized Logistic Regression for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        penalty (str, optional): Regularization type ('l1' for Lasso, 'l2' for Ridge). Default is 'l2'.
        C (float, optional): Inverse of regularization strength; smaller values specify stronger regularization. Default is 1.0.
        max_iter (int, optional): Maximum number of iterations for optimization algorithm. Default is 1000.
        solver (str, optional): Optimization algorithm to use ('lbfgs', 'sag', 'saga', etc.). Default is 'lbfgs'.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Logistic Regression model
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter, solver=solver, verbose=1 if max_iter > 300 else 0)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results


In [19]:
report = logistic_regression_classification(enc_trainX, trainY, enc_testX, testY, max_iter=1500, solver='saga')
print(report)

Epoch 1, change: 1.00000000
Epoch 2, change: 0.25144802
Epoch 3, change: 0.19113114
Epoch 4, change: 0.15780325
Epoch 5, change: 0.14343723
Epoch 6, change: 0.13748756
Epoch 7, change: 0.13117865
Epoch 8, change: 0.11334531
Epoch 9, change: 0.09931395
Epoch 10, change: 0.08810882
Epoch 11, change: 0.07926263
Epoch 12, change: 0.07168346
Epoch 13, change: 0.06545179
Epoch 14, change: 0.06014111
Epoch 15, change: 0.05557221
Epoch 16, change: 0.05149549
Epoch 17, change: 0.04797028
Epoch 18, change: 0.04492433
Epoch 19, change: 0.04219557
Epoch 20, change: 0.03971930
Epoch 21, change: 0.03745472
Epoch 22, change: 0.03550737
Epoch 23, change: 0.03364748
Epoch 24, change: 0.03197090
Epoch 25, change: 0.03050380
Epoch 26, change: 0.02904795
Epoch 27, change: 0.02784410
Epoch 28, change: 0.02661052
Epoch 29, change: 0.02551979
Epoch 30, change: 0.02449783
Epoch 31, change: 0.02355395
Epoch 32, change: 0.02267661
Epoch 33, change: 0.02183294
Epoch 34, change: 0.02105511
Epoch 35, change: 0.020

Epoch 414, change: 0.00037002
Epoch 415, change: 0.00036745
Epoch 416, change: 0.00036481
Epoch 417, change: 0.00036222
Epoch 418, change: 0.00035975
Epoch 419, change: 0.00035743
Epoch 420, change: 0.00035485
Epoch 421, change: 0.00035253
Epoch 422, change: 0.00034988
Epoch 423, change: 0.00034765
Epoch 424, change: 0.00034489
Epoch 425, change: 0.00034284
Epoch 426, change: 0.00034062
Epoch 427, change: 0.00033800
Epoch 428, change: 0.00033600
Epoch 429, change: 0.00033325
Epoch 430, change: 0.00033109
Epoch 431, change: 0.00032894
Epoch 432, change: 0.00032651
Epoch 433, change: 0.00032443
Epoch 434, change: 0.00032198
Epoch 435, change: 0.00031962
Epoch 436, change: 0.00031770
Epoch 437, change: 0.00031540
Epoch 438, change: 0.00031349
Epoch 439, change: 0.00031126
Epoch 440, change: 0.00030926
Epoch 441, change: 0.00030690
Epoch 442, change: 0.00030485
Epoch 443, change: 0.00030264
Epoch 444, change: 0.00030070
Epoch 445, change: 0.00029860
Epoch 446, change: 0.00029682
Epoch 447,

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def decision_tree_classification(trainX, trainY, testX, testY, criterion='gini', max_depth=None):
    """
    Perform classification using Decision Trees for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        criterion (str, optional): Criterion used to measure the quality of a split ('gini' or 'entropy'). Default is 'gini'.
        max_depth (int, optional): Maximum depth of the tree. If None, the tree is fully grown. Default is None.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Decision Tree model
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results

In [21]:
decision_tree_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.41706619749285245,
 'classification_report': {'ONTIME': {'precision': 0.5287541449549976,
   'recall': 0.534118097425591,
   'f1-score': 0.5314225861740621,
   'support': 10449.0},
  'EARLY': {'precision': 0.3604974396488661,
   'recall': 0.36123735522650635,
   'f1-score': 0.36086701816051553,
   'support': 6821.0},
  'LATE': {'precision': 0.2688493919550982,
   'recall': 0.2629460201280878,
   'f1-score': 0.2658649398704903,
   'support': 5465.0},
  'accuracy': 0.41706619749285245,
  'macro avg': {'precision': 0.38603365885298735,
   'recall': 0.38610049092672843,
   'f1-score': 0.3860515147350226,
   'support': 22735.0},
  'weighted avg': {'precision': 0.41579797772220356,
   'recall': 0.41706619749285245,
   'f1-score': 0.4164178768505776,
   'support': 22735.0}}}

In [22]:
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate(model, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    model.fit(trainX, trainY_encoded)
    testY_pred = model.predict(testX)
    accuracy = accuracy_score(testY_encoded, testY_pred)
    report = classification_report(testY_encoded, testY_pred, output_dict=True)
    results = {'accuracy': accuracy, 'classification_report': report}
    return results

# Update other classification functions similarly...


# Random Forest
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(trainX, trainY, testX, testY, n_estimators=100, criterion='gini', max_depth=None):
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Support Vector Machines (SVM)
from sklearn.svm import SVC

def svm_classification(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    model = SVC(kernel=kernel, C=C)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(trainX, trainY, testX, testY, n_neighbors=5):
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

def gbm_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classification(trainX, trainY, testX, testY):
    model = GaussianNB()
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classification(trainX, trainY, testX, testY, n_estimators=50, learning_rate=1.0):
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# XGBoost
from xgboost import XGBClassifier

def xgboost_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)


In [42]:
random_forest_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4398504508467121,
 'classification_report': {'0': {'precision': 0.5436850057772642,
   'recall': 0.5854148722365776,
   'f1-score': 0.563778801843318,
   'support': 10449.0},
  '1': {'precision': 0.3870185016681832,
   'recall': 0.3741386893417388,
   'f1-score': 0.3804696235557212,
   'support': 6821.0},
  '2': {'precision': 0.2721881390593047,
   'recall': 0.2435498627630375,
   'f1-score': 0.2570738773539353,
   'support': 5465.0},
  'accuracy': 0.4398504508467121,
  'macro avg': {'precision': 0.4009638821682507,
   'recall': 0.4010344747804513,
   'f1-score': 0.40044076758432484,
   'support': 22735.0},
  'weighted avg': {'precision': 0.43141966154406913,
   'recall': 0.4398504508467121,
   'f1-score': 0.4350568173509417,
   'support': 22735.0}}}

In [43]:
knn_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4620628986144711,
 'classification_report': {'0': {'precision': 0.5161491544219573,
   'recall': 0.7126997798832424,
   'f1-score': 0.5987056317080034,
   'support': 10449.0},
  '1': {'precision': 0.40034071550255534,
   'recall': 0.3445242633045008,
   'f1-score': 0.37034118666771726,
   'support': 6821.0},
  '2': {'precision': 0.2905211325400082,
   'recall': 0.12955169258920401,
   'f1-score': 0.1791951404707669,
   'support': 5465.0},
  'accuracy': 0.4620628986144711,
  'macro avg': {'precision': 0.402337000821507,
   'recall': 0.3955919119256491,
   'f1-score': 0.38274731961549585,
   'support': 22735.0},
  'weighted avg': {'precision': 0.42716800194981774,
   'recall': 0.4620628986144711,
   'f1-score': 0.42935006917308854,
   'support': 22735.0}}}

In [44]:
gbm_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.49298438530899497,
 'classification_report': {'0': {'precision': 0.5038180429802552,
   'recall': 0.8840080390467987,
   'f1-score': 0.6418371955668276,
   'support': 10449.0},
  '1': {'precision': 0.4556012581659811,
   'recall': 0.27605922885207446,
   'f1-score': 0.3438013511046193,
   'support': 6821.0},
  '2': {'precision': 0.3283582089552239,
   'recall': 0.016102470265324794,
   'f1-score': 0.030699459270887842,
   'support': 5465.0},
  'accuracy': 0.49298438530899497,
  'macro avg': {'precision': 0.42925917003382014,
   'recall': 0.39205657938806593,
   'f1-score': 0.3387793353141116,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4471752155263313,
   'recall': 0.49298438530899497,
   'f1-score': 0.40551565503751014,
   'support': 22735.0}}}

In [45]:
naive_bayes_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.2538376951836376,
 'classification_report': {'0': {'precision': 0.579295154185022,
   'recall': 0.025169872715092353,
   'f1-score': 0.048243602678161975,
   'support': 10449.0},
  '1': {'precision': 0.3818544366899302,
   'recall': 0.056150124615159065,
   'f1-score': 0.09790388548057259,
   'support': 6821.0},
  '2': {'precision': 0.24085910329918225,
   'recall': 0.9377859103385179,
   'f1-score': 0.3832778671054108,
   'support': 5465.0},
  'accuracy': 0.2538376951836376,
  'macro avg': {'precision': 0.40066956472471144,
   'recall': 0.33970196922292306,
   'f1-score': 0.17647511842138178,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4387059238298368,
   'recall': 0.2538376951836376,
   'f1-score': 0.14367773701245523,
   'support': 22735.0}}}

In [46]:
adaboost_classification(enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.4873982845832417,
 'classification_report': {'0': {'precision': 0.5043590753135722,
   'recall': 0.8581682457651449,
   'f1-score': 0.6353266260450616,
   'support': 10449.0},
  '1': {'precision': 0.42781077206635265,
   'recall': 0.30626007916727754,
   'f1-score': 0.35697197539302805,
   'support': 6821.0},
  '2': {'precision': 0.3424657534246575,
   'recall': 0.004574565416285453,
   'f1-score': 0.00902853015529072,
   'support': 5465.0},
  'accuracy': 0.4873982845832417,
  'macro avg': {'precision': 0.42487853360152744,
   'recall': 0.389667630116236,
   'f1-score': 0.3337757105311268,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4424772639842472,
   'recall': 0.4873982845832417,
   'f1-score': 0.4012656554651135,
   'support': 22735.0}}}

In [47]:
xgboost_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4913129535957774,
 'classification_report': {'0': {'precision': 0.500080416018871,
   'recall': 0.8927170064120968,
   'f1-score': 0.6410555975534328,
   'support': 10449.0},
  '1': {'precision': 0.4587203302373581,
   'recall': 0.2606655915554904,
   'f1-score': 0.33242965317378703,
   'support': 6821.0},
  '2': {'precision': 0.3106796116504854,
   'recall': 0.01171088746569076,
   'f1-score': 0.0225709751366602,
   'support': 5465.0},
  'accuracy': 0.4913129535957774,
  'macro avg': {'precision': 0.42316011930223824,
   'recall': 0.388364495144426,
   'f1-score': 0.33201874195462666,
   'support': 22735.0},
  'weighted avg': {'precision': 0.44214364271828044,
   'recall': 0.4913129535957774,
   'f1-score': 0.39979076236006456,
   'support': 22735.0}}}

In [110]:
model = DecisionTreeClassifier(criterion='entropy', max_features='log2')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.4229162084891137, 'classification_report': {'0': {'precision': 0.5311355311355311, 'recall': 0.5412001148435257, 'f1-score': 0.5361205915813424, 'support': 10449.0}, '1': {'precision': 0.3682807641048423, 'recall': 0.36460929482480575, 'f1-score': 0.36643583321054957, 'support': 6821.0}, '2': {'precision': 0.2761012183692596, 'recall': 0.26953339432753887, 'f1-score': 0.2727777777777778, 'support': 5465.0}, 'accuracy': 0.4229162084891137, 'macro avg': {'precision': 0.39183917120321105, 'recall': 0.3917809346652901, 'f1-score': 0.39177806752322325, 'support': 22735.0}, 'weighted avg': {'precision': 0.4209708121918758, 'recall': 0.4229162084891137, 'f1-score': 0.4219095419097498, 'support': 22735.0}}}


In [109]:
model = RandomForestClassifier(n_estimators=100, criterion='entropy')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.439190675170442, 'classification_report': {'0': {'precision': 0.5407237826890904, 'recall': 0.5877117427505024, 'f1-score': 0.5632394753737503, 'support': 10449.0}, '1': {'precision': 0.38629379225568533, 'recall': 0.3685676587010702, 'f1-score': 0.3772225973441368, 'support': 6821.0}, '2': {'precision': 0.2731006160164271, 'recall': 0.2433668801463861, 'f1-score': 0.25737784228350263, 'support': 5465.0}, 'accuracy': 0.439190675170442, 'macro avg': {'precision': 0.40003939698706764, 'recall': 0.3998820938659862, 'f1-score': 0.39927997166713, 'support': 22735.0}, 'weighted avg': {'precision': 0.43006059506593836, 'recall': 0.439190675170442, 'f1-score': 0.4339078303384217, 'support': 22735.0}}}


In [100]:
model = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,
    n_jobs=20,objective='multi:softprob')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5092588519903233, 'classification_report': {'0': {'precision': 0.5277646223064172, 'recall': 0.8531916929849747, 'f1-score': 0.6521341574924107, 'support': 10449.0}, '1': {'precision': 0.4764922322158626, 'recall': 0.34173874798416654, 'f1-score': 0.39801929480064885, 'support': 6821.0}, '2': {'precision': 0.3491062039957939, 'recall': 0.060750228728270815, 'f1-score': 0.10349127182044887, 'support': 5465.0}, 'accuracy': 0.5092588519903233, 'macro avg': {'precision': 0.4511210195060246, 'recall': 0.4185602232324707, 'f1-score': 0.38454824137116944, 'support': 22735.0}, 'weighted avg': {'precision': 0.46943617590768266, 'recall': 0.5092588519903233, 'f1-score': 0.4440122815910349, 'support': 22735.0}}}


In [103]:
model = KNeighborsClassifier(n_neighbors=70)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.48972949197272925, 'classification_report': {'0': {'precision': 0.5093936065668536, 'recall': 0.8433342903627141, 'f1-score': 0.6351448753063283, 'support': 10449.0}, '1': {'precision': 0.44491080797481636, 'recall': 0.3108048673215071, 'f1-score': 0.3659589159330226, 'support': 6821.0}, '2': {'precision': 0.30104321907600595, 'recall': 0.03696248856358646, 'f1-score': 0.06584093872229466, 'support': 5465.0}, 'accuracy': 0.48972949197272925, 'macro avg': {'precision': 0.41844921120589196, 'recall': 0.39703388208260254, 'f1-score': 0.35564824332054856, 'support': 22735.0}, 'weighted avg': {'precision': 0.4399644428618275, 'recall': 0.48972949197272925, 'f1-score': 0.4175348712457582, 'support': 22735.0}}}


In [130]:
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)



{'accuracy': 0.4939520563008577, 'classification_report': {'0': {'precision': 0.5113453121400598, 'recall': 0.8497463872140875, 'f1-score': 0.63847840937691, 'support': 10449.0}, '1': {'precision': 0.44086021505376344, 'recall': 0.33660753555197187, 'f1-score': 0.38174411838058026, 'support': 6821.0}, '2': {'precision': 0.3374233128834356, 'recall': 0.010064043915827997, 'f1-score': 0.01954513148542999, 'support': 5465.0}, 'accuracy': 0.4939520563008577, 'macro avg': {'precision': 0.42987628002575295, 'recall': 0.3988059888939624, 'f1-score': 0.3465892197476401, 'support': 22735.0}, 'weighted avg': {'precision': 0.44839116333147927, 'recall': 0.4939520563008577, 'f1-score': 0.41267436439943467, 'support': 22735.0}}}


In [23]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate_voting(voting_classifier, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    # Fit the VotingClassifier
    voting_classifier.fit(trainX, trainY_encoded)
    
    # Make predictions
    testY_pred = voting_classifier.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY_encoded, testY_pred)
    
    # Generate classification report
    report = classification_report(testY_encoded, testY_pred, target_names=label_encoder.classes_, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report,
        'voting_classifier': voting_classifier
    }
    
    return results

In [28]:
scaler = StandardScaler()
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [35]:
from sklearn.ensemble import VotingClassifier

# Define multiple classifiers
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
knn = KNeighborsClassifier(n_neighbors=70)
xgb = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,objective='multi:softprob')

votingCLF = VotingClassifier(estimators=[('knn', knn), ('ada', ada), ('xgb', xgb)], voting='soft', weights=[5,7,10])
fit_and_evaluate_voting(votingCLF, enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.5031009456784693,
 'classification_report': {'EARLY': {'precision': 0.5173605113145621,
   'recall': 0.8598908986505885,
   'f1-score': 0.6460310612597067,
   'support': 10449.0},
  'LATE': {'precision': 0.46913061738765227,
   'recall': 0.3275179592435127,
   'f1-score': 0.3857377190710524,
   'support': 6821.0},
  'ONTIME': {'precision': 0.3613861386138614,
   'recall': 0.040073193046660564,
   'f1-score': 0.07214626914841048,
   'support': 5465.0},
  'accuracy': 0.5031009456784693,
  'macro avg': {'precision': 0.4492924224386919,
   'recall': 0.40916068364692054,
   'f1-score': 0.36797168315972323,
   'support': 22735.0},
  'weighted avg': {'precision': 0.46539763234888004,
   'recall': 0.5031009456784693,
   'f1-score': 0.42998789979249563,
   'support': 22735.0}},
 'voting_classifier': VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=70)),
                              ('ada',
                               AdaBoostClassifier(learning_rate=0.8,


# 3 HOP

In [36]:
data = pd.read_csv("3_HOP.csv")

In [37]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME
1,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME
2,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME
3,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,LATE
4,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,ONTIME


In [38]:
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)

In [39]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME,Late Night
1,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME,Late Night
2,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME,Late Night
3,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,LATE,Morning
4,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,ONTIME,Morning


In [40]:
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time'])

In [41]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,JFK,76,LATE,1,1,winter,ONTIME,Late Night
1,B6,JFK,76,LATE,1,1,winter,ONTIME,Late Night
2,B6,JFK,76,LATE,1,1,winter,ONTIME,Late Night
3,B6,JFK,75,LATE,1,1,winter,LATE,Morning
4,B6,JFK,75,LATE,1,1,winter,ONTIME,Morning


In [42]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        X_encoded = X.copy()
        encoded_data = self.encoder.transform(X[self.columns])
        encoded_df = pd.DataFrame(encoded_data, columns=self.encoder.get_feature_names_out(self.columns), index=X.index)
        
        # Drop the original columns
        X_encoded = X_encoded.drop(columns=self.columns)
        
        # Concatenate the encoded DataFrame with the original DataFrame, preserving the index
        X_encoded = pd.concat([X_encoded, encoded_df], axis=1)
        return X_encoded

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [43]:
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'FLIGHT_STATUS', 'month', 'day', 'season', 'PREV_STAT',
       'SCHED_ARRV_TIME_CAT'],
      dtype='object')

In [44]:
df['FLIGHT_STATUS'].value_counts()

FLIGHT_STATUS
EARLY     156726
LATE      102321
ONTIME     81978
Name: count, dtype: int64

In [45]:
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT', 'month','PREV_STAT'])

In [46]:
encoded_data = encoder.fit_transform(df.drop(columns=['FLIGHT_STATUS']))

In [47]:
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['FLIGHT_STATUS'], 
    test_size=0.2, 
    random_state=947,
    stratify=df['FLIGHT_STATUS']
)

In [48]:
from sklearn.preprocessing import StandardScaler

In [49]:
scaler = StandardScaler()


enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [50]:
enc_trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),day,Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME
57395,0.051813,-1.563628,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,2.909811,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,-0.312534,-0.306275,-0.300961,-0.654688,-0.562495
207852,-0.823517,1.629038,-0.239978,-0.452409,2.856186,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,3.309484,-0.312534,-0.306275,-0.300961,-0.654688,1.777795
103525,-0.116520,1.058919,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,2.909811,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,-0.312534,-0.306275,-0.300961,-0.654688,1.777795
176924,-1.025516,1.743062,-0.239978,2.210391,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,-0.312534,-0.306275,3.322687,1.527445,-0.562495
33003,-0.453185,1.515014,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,-0.312534,-0.306275,3.322687,-0.654688,-0.562495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180738,0.489477,-1.221557,4.167049,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,-0.312534,-0.306275,-0.300961,1.527445,-0.562495
72137,-1.025516,1.629038,-0.239978,-0.452409,-0.350117,2.897628,-0.12563,-0.152694,-0.343665,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,3.199656,-0.306275,-0.300961,-0.654688,1.777795
256919,-0.789850,1.172943,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,-0.312534,3.265037,-0.300961,-0.654688,-0.562495
215427,-0.924517,-1.335581,-0.239978,2.210391,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,...,-0.307305,-0.299736,-0.306912,-0.309073,-0.302162,-0.312534,-0.306275,3.322687,-0.654688,-0.562495


In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def logistic_regression_classification(trainX, trainY, testX, testY, penalty='l2', C=1.0, max_iter=1000, solver='lbfgs'):
    """
    Perform classification using Regularized Logistic Regression for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        penalty (str, optional): Regularization type ('l1' for Lasso, 'l2' for Ridge). Default is 'l2'.
        C (float, optional): Inverse of regularization strength; smaller values specify stronger regularization. Default is 1.0.
        max_iter (int, optional): Maximum number of iterations for optimization algorithm. Default is 1000.
        solver (str, optional): Optimization algorithm to use ('lbfgs', 'sag', 'saga', etc.). Default is 'lbfgs'.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Logistic Regression model
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter, solver=solver, verbose=1 if max_iter > 300 else 0)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results


In [52]:
report = logistic_regression_classification(enc_trainX, trainY, enc_testX, testY, max_iter=1500, solver='saga')
print(report)

0014725
Epoch 551, change: 0.00014641
Epoch 552, change: 0.00014537
Epoch 553, change: 0.00014449
Epoch 554, change: 0.00014360
Epoch 555, change: 0.00014261
Epoch 556, change: 0.00014154
Epoch 557, change: 0.00014070
Epoch 558, change: 0.00013971
Epoch 559, change: 0.00013891
Epoch 560, change: 0.00013792
Epoch 561, change: 0.00013695
Epoch 562, change: 0.00013617
Epoch 563, change: 0.00013532
Epoch 564, change: 0.00013424
Epoch 565, change: 0.00013348
Epoch 566, change: 0.00013261
Epoch 567, change: 0.00013166
Epoch 568, change: 0.00013088
Epoch 569, change: 0.00012998
Epoch 570, change: 0.00012920
Epoch 571, change: 0.00012826
Epoch 572, change: 0.00012749
Epoch 573, change: 0.00012651
Epoch 574, change: 0.00012587
Epoch 575, change: 0.00012493
Epoch 576, change: 0.00012410
Epoch 577, change: 0.00012334
Epoch 578, change: 0.00012256
Epoch 579, change: 0.00012165
Epoch 580, change: 0.00012090
Epoch 581, change: 0.00012022
Epoch 582, change: 0.00011931
Epoch 583, change: 0.00011860
Ep

Epoch 354, change: 0.00057047
Epoch 355, change: 0.00056620
Epoch 356, change: 0.00056220
Epoch 357, change: 0.00055818
Epoch 358, change: 0.00055422
Epoch 359, change: 0.00055030
Epoch 360, change: 0.00054623
Epoch 361, change: 0.00054253
Epoch 362, change: 0.00053845
Epoch 363, change: 0.00053481
Epoch 364, change: 0.00053077
Epoch 365, change: 0.00052720
Epoch 366, change: 0.00052344
Epoch 367, change: 0.00051958
Epoch 368, change: 0.00051614
Epoch 369, change: 0.00051234
Epoch 370, change: 0.00050872
Epoch 371, change: 0.00050518
Epoch 372, change: 0.00050161
Epoch 373, change: 0.00049787
Epoch 374, change: 0.00049448
Epoch 375, change: 0.00049092
Epoch 376, change: 0.00048757
Epoch 377, change: 0.00048408
Epoch 378, change: 0.00048057
Epoch 379, change: 0.00047737
Epoch 380, change: 0.00047391
Epoch 381, change: 0.00047052
Epoch 382, change: 0.00046740
Epoch 383, change: 0.00046382
Epoch 384, change: 0.00046059
Epoch 385, change: 0.00045764
Epoch 386, change: 0.00045423
Epoch 387,

In [53]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def decision_tree_classification(trainX, trainY, testX, testY, criterion='gini', max_depth=None):
    """
    Perform classification using Decision Trees for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        criterion (str, optional): Criterion used to measure the quality of a split ('gini' or 'entropy'). Default is 'gini'.
        max_depth (int, optional): Maximum depth of the tree. If None, the tree is fully grown. Default is None.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Decision Tree model
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results

In [54]:
decision_tree_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.7412359797668793,
 'classification_report': {'LATE': {'precision': 0.785244772755728,
   'recall': 0.800350933163184,
   'f1-score': 0.7927258938587205,
   'support': 31345.0},
  'ONTIME': {'precision': 0.7183043178912166,
   'recall': 0.7137412040656763,
   'f1-score': 0.716015490955439,
   'support': 20464.0},
  'EARLY': {'precision': 0.6822206870564592,
   'recall': 0.6625396438155647,
   'f1-score': 0.6722361459203564,
   'support': 16396.0},
  'accuracy': 0.7412359797668793,
  'macro avg': {'precision': 0.7285899259011347,
   'recall': 0.7255439270148084,
   'f1-score': 0.7269925102448386,
   'support': 68205.0},
  'weighted avg': {'precision': 0.7403939205092274,
   'recall': 0.7412359797668793,
   'f1-score': 0.7407450773172621,
   'support': 68205.0}}}

In [55]:
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate(model, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    model.fit(trainX, trainY_encoded)
    testY_pred = model.predict(testX)
    accuracy = accuracy_score(testY_encoded, testY_pred)
    report = classification_report(testY_encoded, testY_pred, output_dict=True)
    results = {'accuracy': accuracy, 'classification_report': report}
    return results

# Update other classification functions similarly...


# Random Forest
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(trainX, trainY, testX, testY, n_estimators=100, criterion='gini', max_depth=None):
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Support Vector Machines (SVM)
from sklearn.svm import SVC

def svm_classification(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    model = SVC(kernel=kernel, C=C)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(trainX, trainY, testX, testY, n_neighbors=5):
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

def gbm_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classification(trainX, trainY, testX, testY):
    model = GaussianNB()
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classification(trainX, trainY, testX, testY, n_estimators=50, learning_rate=1.0):
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# XGBoost
from xgboost import XGBClassifier

def xgboost_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)


In [56]:
random_forest_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.6984238692177993,
 'classification_report': {'0': {'precision': 0.754432206846779,
   'recall': 0.7670441856755463,
   'f1-score': 0.7606859240041763,
   'support': 31345.0},
  '1': {'precision': 0.6699128902809044,
   'recall': 0.6689308053166536,
   'f1-score': 0.6694214876033058,
   'support': 20464.0},
  '2': {'precision': 0.622814740284241,
   'recall': 0.6040497682361552,
   'f1-score': 0.6132887485293207,
   'support': 16396.0},
  'accuracy': 0.6984238692177993,
  'macro avg': {'precision': 0.6823866124706415,
   'recall': 0.6800082530761183,
   'f1-score': 0.6811320533789343,
   'support': 68205.0},
  'weighted avg': {'precision': 0.69743340505859,
   'recall': 0.6984238692177993,
   'f1-score': 0.6978700085200747,
   'support': 68205.0}}}

In [57]:
knn_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.5305915988563888,
 'classification_report': {'0': {'precision': 0.5778555618469141,
   'recall': 0.7162864890732175,
   'f1-score': 0.6396672317269477,
   'support': 31345.0},
  '1': {'precision': 0.4865331311123895,
   'recall': 0.454603205629398,
   'f1-score': 0.470026525198939,
   'support': 20464.0},
  '2': {'precision': 0.43343108504398825,
   'recall': 0.2704318126372286,
   'f1-score': 0.33305791331780965,
   'support': 16396.0},
  'accuracy': 0.5305915988563888,
  'macro avg': {'precision': 0.4992732593344306,
   'recall': 0.48044050244661474,
   'f1-score': 0.4809172234145655,
   'support': 68205.0},
  'weighted avg': {'precision': 0.515736861689857,
   'recall': 0.5305915988563888,
   'f1-score': 0.5150620883646517,
   'support': 68205.0}}}

In [58]:
gbm_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4949783740195,
 'classification_report': {'0': {'precision': 0.5037164612037709,
   'recall': 0.8864252671877493,
   'f1-score': 0.6423906132593492,
   'support': 31345.0},
  '1': {'precision': 0.46042525669011236,
   'recall': 0.2782935887412041,
   'f1-score': 0.346907075198733,
   'support': 20464.0},
  '2': {'precision': 0.41420118343195267,
   'recall': 0.017077335935594046,
   'f1-score': 0.03280224929709466,
   'support': 16396.0},
  'accuracy': 0.4949783740195,
  'macro avg': {'precision': 0.45944763377527864,
   'recall': 0.39393206395484914,
   'f1-score': 0.3406999792517256,
   'support': 68205.0},
  'weighted avg': {'precision': 0.4692086728669299,
   'recall': 0.4949783740195,
   'f1-score': 0.4071939863493342,
   'support': 68205.0}}}

In [59]:
naive_bayes_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.31131148742760795,
 'classification_report': {'0': {'precision': 0.5722300140252454,
   'recall': 0.03904929015791992,
   'f1-score': 0.07310954485724525,
   'support': 31345.0},
  '1': {'precision': 0.35908942024238455,
   'recall': 0.5357212666145426,
   'f1-score': 0.4299721535866965,
   'support': 20464.0},
  '2': {'precision': 0.2545587573165241,
   'recall': 0.5517199316906563,
   'f1-score': 0.3483786490025418,
   'support': 16396.0},
  'accuracy': 0.31131148742760795,
  'macro avg': {'precision': 0.39529273052805136,
   'recall': 0.3754968294877063,
   'f1-score': 0.2838201158154945,
   'support': 68205.0},
  'weighted avg': {'precision': 0.43191409823947224,
   'recall': 0.31131148742760795,
   'f1-score': 0.24635415531990593,
   'support': 68205.0}}}

In [60]:
adaboost_classification(enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.48596143977714246,
 'classification_report': {'0': {'precision': 0.5021168659193912,
   'recall': 0.8588929653852289,
   'f1-score': 0.6337421435465267,
   'support': 31345.0},
  '1': {'precision': 0.42723102585487904,
   'recall': 0.3003811571540266,
   'f1-score': 0.3527487662114082,
   'support': 20464.0},
  '2': {'precision': 0.38,
   'recall': 0.004635276896804099,
   'f1-score': 0.0091588334538443,
   'support': 16396.0},
  'accuracy': 0.48596143977714246,
  'macro avg': {'precision': 0.43644929725809006,
   'recall': 0.3879697998120199,
   'f1-score': 0.3318832477372597,
   'support': 68205.0},
  'weighted avg': {'precision': 0.4502923374435534,
   'recall': 0.48596143977714246,
   'f1-score': 0.3992884168979601,
   'support': 68205.0}}}

In [61]:
xgboost_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.49316032548933364,
 'classification_report': {'0': {'precision': 0.4995575221238938,
   'recall': 0.9004625937151061,
   'f1-score': 0.6426091411008026,
   'support': 31345.0},
  '1': {'precision': 0.4648337028824834,
   'recall': 0.256108287724785,
   'f1-score': 0.33025615173760986,
   'support': 20464.0},
  '2': {'precision': 0.3953488372093023,
   'recall': 0.010368382532324957,
   'f1-score': 0.020206822774277905,
   'support': 16396.0},
  'accuracy': 0.49316032548933364,
  'macro avg': {'precision': 0.4532466874052265,
   'recall': 0.38897975465740536,
   'f1-score': 0.33102403853756346,
   'support': 68205.0},
  'weighted avg': {'precision': 0.4640880721595823,
   'recall': 0.49316032548933364,
   'f1-score': 0.3992706763898566,
   'support': 68205.0}}}

In [62]:
model = DecisionTreeClassifier(criterion='entropy', max_features='log2')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.6972949197272927, 'classification_report': {'0': {'precision': 0.7488874365916659, 'recall': 0.7677141489870792, 'f1-score': 0.7581839377422099, 'support': 31345.0}, '1': {'precision': 0.671620561114381, 'recall': 0.6691262705238468, 'f1-score': 0.6703710956623911, 'support': 20464.0}, '2': {'precision': 0.6249681203774547, 'recall': 0.5978287387167602, 'f1-score': 0.6110972568578553, 'support': 16396.0}, 'accuracy': 0.6972949197272927, 'macro avg': {'precision': 0.6818253726945005, 'recall': 0.678223052742562, 'f1-score': 0.6798840967541522, 'support': 68205.0}, 'weighted avg': {'precision': 0.6959152139039544, 'recall': 0.6972949197272927, 'f1-score': 0.6964782677751798, 'support': 68205.0}}}


In [63]:
model = RandomForestClassifier(n_estimators=100, criterion='entropy')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.6997727439337292, 'classification_report': {'0': {'precision': 0.7554182479691371, 'recall': 0.7683841122986123, 'f1-score': 0.761846017587145, 'support': 31345.0}, '1': {'precision': 0.6718413320274241, 'recall': 0.6703967943706021, 'f1-score': 0.6711182858820076, 'support': 20464.0}, '2': {'precision': 0.6240724437177714, 'recall': 0.6052695779458405, 'f1-score': 0.6145272153074494, 'support': 16396.0}, 'accuracy': 0.6997727439337292, 'macro avg': {'precision': 0.6837773412381108, 'recall': 0.6813501615383516, 'f1-score': 0.682497172925534, 'support': 68205.0}, 'weighted avg': {'precision': 0.6987675066109288, 'recall': 0.6997727439337292, 'f1-score': 0.6992099735465055, 'support': 68205.0}}}


In [64]:
model = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,
    n_jobs=20,objective='multi:softprob')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5609266182831171, 'classification_report': {'0': {'precision': 0.5558276298095257, 'recall': 0.8834901898229383, 'f1-score': 0.6823625073920757, 'support': 31345.0}, '1': {'precision': 0.5634381099818268, 'recall': 0.4090598123534011, 'f1-score': 0.4739956399875428, 'support': 20464.0}, '2': {'precision': 0.6224113475177305, 'recall': 0.1338131251524762, 'f1-score': 0.2202700667637167, 'support': 16396.0}, 'accuracy': 0.5609266182831171, 'macro avg': {'precision': 0.5805590291030277, 'recall': 0.47545437577627186, 'f1-score': 0.45887607138111175, 'support': 68205.0}, 'weighted avg': {'precision': 0.5741173080118525, 'recall': 0.5609266182831171, 'f1-score': 0.5087610524971422, 'support': 68205.0}}}


In [65]:
model = KNeighborsClassifier(n_neighbors=70)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5060919287442269, 'classification_report': {'0': {'precision': 0.5307745432475985, 'recall': 0.8091242622427819, 'f1-score': 0.6410373066424022, 'support': 31345.0}, '1': {'precision': 0.4631933177092723, 'recall': 0.3766614542611415, 'f1-score': 0.4154696132596685, 'support': 20464.0}, '2': {'precision': 0.3829674689235652, 'recall': 0.08831422298121493, 'f1-score': 0.14352976160975367, 'support': 16396.0}, 'accuracy': 0.5060919287442269, 'macro avg': {'precision': 0.4589784432934787, 'recall': 0.4246999798283794, 'f1-score': 0.40001222717060814, 'support': 68205.0}, 'weighted avg': {'precision': 0.4749659223248926, 'recall': 0.5060919287442269, 'f1-score': 0.45376143264871305, 'support': 68205.0}}}


In [66]:
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)



{'accuracy': 0.49157686386628546, 'classification_report': {'0': {'precision': 0.5096482354513008, 'recall': 0.8518743021215505, 'f1-score': 0.6377510807518689, 'support': 31345.0}, '1': {'precision': 0.43336579032781564, 'recall': 0.32623143080531664, 'f1-score': 0.37224344141180404, 'support': 20464.0}, '2': {'precision': 0.36855036855036855, 'recall': 0.009148572822639669, 'f1-score': 0.017853954650955187, 'support': 16396.0}, 'accuracy': 0.49157686386628546, 'macro avg': {'precision': 0.43718813144316165, 'recall': 0.39575143524983564, 'f1-score': 0.34261615893820946, 'support': 68205.0}, 'weighted avg': {'precision': 0.45284177576777784, 'recall': 0.49157686386628546, 'f1-score': 0.40907016863390583, 'support': 68205.0}}}


In [67]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate_voting(voting_classifier, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    # Fit the VotingClassifier
    voting_classifier.fit(trainX, trainY_encoded)
    
    # Make predictions
    testY_pred = voting_classifier.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY_encoded, testY_pred)
    
    # Generate classification report
    report = classification_report(testY_encoded, testY_pred, target_names=label_encoder.classes_, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report,
        'voting_classifier': voting_classifier
    }
    
    return results

In [68]:
scaler = StandardScaler()
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [70]:
from sklearn.ensemble import VotingClassifier

# Define multiple classifiers
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
knn = KNeighborsClassifier(n_neighbors=70)
dt = DecisionTreeClassifier(criterion='entropy', max_features='log2')
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')
xgb = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,objective='multi:softprob')

votingCLF = VotingClassifier(estimators=[('knn', knn), ('ada', ada), ('xgb', xgb), ('dt', dt),('rf', rf)], voting='soft', weights=[5,7,10,6,5])
fit_and_evaluate_voting(votingCLF, enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.6996994355252547,
 'classification_report': {'EARLY': {'precision': 0.7521871316001737,
   'recall': 0.7735204976870315,
   'f1-score': 0.7627046666352097,
   'support': 31345.0},
  'LATE': {'precision': 0.674373985339696,
   'recall': 0.6698592650508209,
   'f1-score': 0.672109043661592,
   'support': 20464.0},
  'ONTIME': {'precision': 0.6244566607005881,
   'recall': 0.5958160526957794,
   'f1-score': 0.6098002496878902,
   'support': 16396.0},
  'accuracy': 0.6996994355252547,
  'macro avg': {'precision': 0.6836725925468192,
   'recall': 0.6797319384778774,
   'f1-score': 0.681537986661564,
   'support': 68205.0},
  'weighted avg': {'precision': 0.6981348330011851,
   'recall': 0.6996994355252547,
   'f1-score': 0.6987655177634208,
   'support': 68205.0}},
 'voting_classifier': VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=70)),
                              ('ada',
                               AdaBoostClassifier(learning_rate=0.8,
         