In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import sklearn
import os
from sklearn.model_selection import train_test_split

In [20]:
def convert_to_categorical(time_str):
    hour = int(time_str.split(':')[0])
    
    if 0 <= hour < 3:
        return 'Late Night'
    elif 3 <= hour < 6:
        return 'Early Morning'
    elif 6 <= hour < 9:
        return 'Morning'
    elif 9 <= hour < 12:
        return 'Late Morning'
    elif 12 <= hour < 15:
        return 'Noon'
    elif 15 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# 1 HOP

In [21]:
data = pd.read_csv("1_HOP.csv")

In [22]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,LATE
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,ONTIME,1,1,winter,Friday,LATE
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,LATE,1,1,winter,Friday,ONTIME
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,LATE,1,1,winter,Friday,LATE


In [23]:
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)

In [24]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME,Late Night
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,LATE,Morning
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,ONTIME,1,1,winter,Friday,LATE,Late Morning
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,LATE,1,1,winter,Friday,ONTIME,Late Morning
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,LATE,1,1,winter,Friday,LATE,Late Morning


In [25]:
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time'])

In [26]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,JFK,76,LATE,1,1,winter,Friday,ONTIME,Late Night
1,B6,JFK,75,LATE,1,1,winter,Friday,LATE,Morning
2,MQ,ORD,100,ONTIME,1,1,winter,Friday,LATE,Late Morning
3,9E,DTW,84,LATE,1,1,winter,Friday,ONTIME,Late Morning
4,B6,JFK,71,LATE,1,1,winter,Friday,LATE,Late Morning


In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        X_encoded = X.copy()
        encoded_data = self.encoder.transform(X[self.columns])
        encoded_df = pd.DataFrame(encoded_data, columns=self.encoder.get_feature_names_out(self.columns), index=X.index)
        
        # Drop the original columns
        X_encoded = X_encoded.drop(columns=self.columns)
        
        # Concatenate the encoded DataFrame with the original DataFrame, preserving the index
        X_encoded = pd.concat([X_encoded, encoded_df], axis=1)
        return X_encoded

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [28]:
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'FLIGHT_STATUS', 'month', 'day', 'season', 'WeekDay', 'PREV_STAT',
       'SCHED_ARRV_TIME_CAT'],
      dtype='object')

In [29]:
df['FLIGHT_STATUS'].value_counts()

FLIGHT_STATUS
EARLY     52240
LATE      34106
ONTIME    27325
Name: count, dtype: int64

In [30]:
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT', 'month','PREV_STAT', 'WeekDay'])

In [31]:
encoded_data = encoder.fit_transform(df.drop(columns=['FLIGHT_STATUS']))

In [32]:
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['FLIGHT_STATUS'], 
    test_size=0.2, 
    random_state=947,
    stratify=df['FLIGHT_STATUS']
)

In [33]:
from sklearn.preprocessing import StandardScaler

In [34]:
scaler = StandardScaler()


enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [35]:
enc_trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),day,Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,...,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME,PREV_STAT_nan,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
109767,0.420191,0.718113,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,2.914831,-0.226431,...,-0.300294,1.525959,-0.562737,-0.005744,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
21666,-1.129327,-1.678529,-0.237358,-0.448859,-0.351464,2.899624,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.300294,-0.655326,-0.562737,-0.005744,-0.417338,-0.370459,2.449914,-0.420982,-0.409240,-0.411655
107873,2.104451,-1.336151,-0.237358,2.227871,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.300294,-0.655326,1.777030,-0.005744,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
20188,0.049654,-1.107900,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,2.914831,-0.226431,...,-0.300294,1.525959,-0.562737,-0.005744,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
96511,-0.624050,1.288742,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,4.416353,...,-0.300294,-0.655326,-0.562737,-0.005744,-0.417338,-0.370459,-0.408178,-0.420982,2.443552,-0.411655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56030,1.969710,1.516994,-0.237358,2.227871,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.300294,-0.655326,-0.562737,-0.005744,-0.417338,2.699351,-0.408178,-0.420982,-0.409240,-0.411655
22547,1.228636,0.375736,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.300294,-0.655326,-0.562737,-0.005744,-0.417338,-0.370459,2.449914,-0.420982,-0.409240,-0.411655
63492,0.790729,-1.336151,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.300294,-0.655326,-0.562737,-0.005744,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
51736,1.262321,1.745245,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.300294,-0.655326,-0.562737,-0.005744,2.396141,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655


In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def logistic_regression_classification(trainX, trainY, testX, testY, penalty='l2', C=1.0, max_iter=1000, solver='lbfgs'):
    """
    Perform classification using Regularized Logistic Regression for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        penalty (str, optional): Regularization type ('l1' for Lasso, 'l2' for Ridge). Default is 'l2'.
        C (float, optional): Inverse of regularization strength; smaller values specify stronger regularization. Default is 1.0.
        max_iter (int, optional): Maximum number of iterations for optimization algorithm. Default is 1000.
        solver (str, optional): Optimization algorithm to use ('lbfgs', 'sag', 'saga', etc.). Default is 'lbfgs'.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Logistic Regression model
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter, solver=solver, verbose=1 if max_iter > 300 else 0)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results


In [37]:
report = logistic_regression_classification(enc_trainX, trainY, enc_testX, testY, max_iter=1500, solver='saga')
print(report)

Epoch 1, change: 1.00000000
Epoch 2, change: 0.18844127
Epoch 3, change: 0.17750727
Epoch 4, change: 0.14631653
Epoch 5, change: 0.13879186
Epoch 6, change: 0.12224287
Epoch 7, change: 0.10173988
Epoch 8, change: 0.08808443
Epoch 9, change: 0.07750939
Epoch 10, change: 0.06898753
Epoch 11, change: 0.06202657
Epoch 12, change: 0.05593147
Epoch 13, change: 0.05096095
Epoch 14, change: 0.04671184
Epoch 15, change: 0.04333052
Epoch 16, change: 0.04023046
Epoch 17, change: 0.03731627
Epoch 18, change: 0.03471526
Epoch 19, change: 0.03254948
Epoch 20, change: 0.03069005
Epoch 21, change: 0.02895656
Epoch 22, change: 0.02731584
Epoch 23, change: 0.02589158
Epoch 24, change: 0.02455312
Epoch 25, change: 0.02336274
Epoch 26, change: 0.02221894
Epoch 27, change: 0.02129097
Epoch 28, change: 0.02017418
Epoch 29, change: 0.01935158
Epoch 30, change: 0.01857211
Epoch 31, change: 0.01778390
Epoch 32, change: 0.01705854
Epoch 33, change: 0.01643653
Epoch 34, change: 0.01569263
Epoch 35, change: 0.015

{'accuracy': 0.5126896855069276, 'classification_report': {'EARLY': {'precision': 0.541311837742963, 'recall': 0.7969946401225115, 'f1-score': 0.6447292013472185, 'support': 10448.0}, 'LATE': {'precision': 0.4703304728866168, 'recall': 0.37760187628261505, 'f1-score': 0.4188958451906659, 'support': 6822.0}, 'ONTIME': {'precision': 0.4016, 'recall': 0.13778591033851784, 'f1-score': 0.20517711171662126, 'support': 5465.0}, 'accuracy': 0.5126896855069276, 'macro avg': {'precision': 0.4710807702098599, 'recall': 0.43746080891454814, 'f1-score': 0.4229340527515018, 'support': 22735.0}, 'weighted avg': {'precision': 0.48642905505920286, 'recall': 0.5126896855069276, 'f1-score': 0.4713055230744577, 'support': 22735.0}}}


In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def decision_tree_classification(trainX, trainY, testX, testY, criterion='gini', max_depth=None):
    """
    Perform classification using Decision Trees for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        criterion (str, optional): Criterion used to measure the quality of a split ('gini' or 'entropy'). Default is 'gini'.
        max_depth (int, optional): Maximum depth of the tree. If None, the tree is fully grown. Default is None.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Decision Tree model
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results

In [39]:
decision_tree_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.45462942599516165,
 'classification_report': {'EARLY': {'precision': 0.5582942097026604,
   'recall': 0.5463246554364471,
   'f1-score': 0.5522445820433437,
   'support': 10448.0},
  'LATE': {'precision': 0.4096543677993292,
   'recall': 0.41175608326004104,
   'f1-score': 0.41070253673514145,
   'support': 6822.0},
  'ONTIME': {'precision': 0.32171913689423415,
   'recall': 0.3328453796889296,
   'f1-score': 0.32718769673531795,
   'support': 5465.0},
  'accuracy': 0.45462942599516165,
  'macro avg': {'precision': 0.4298892381320745,
   'recall': 0.43030870612847255,
   'f1-score': 0.430044938504601,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4568249431813244,
   'recall': 0.45462942599516165,
   'f1-score': 0.45567384479676715,
   'support': 22735.0}}}

In [40]:
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate(model, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    model.fit(trainX, trainY_encoded)
    testY_pred = model.predict(testX)
    accuracy = accuracy_score(testY_encoded, testY_pred)
    report = classification_report(testY_encoded, testY_pred, output_dict=True)
    results = {'accuracy': accuracy, 'classification_report': report}
    return results

# Update other classification functions similarly...


# Random Forest
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(trainX, trainY, testX, testY, n_estimators=100, criterion='gini', max_depth=None):
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Support Vector Machines (SVM)
from sklearn.svm import SVC

def svm_classification(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    model = SVC(kernel=kernel, C=C)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(trainX, trainY, testX, testY, n_neighbors=5):
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

def gbm_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classification(trainX, trainY, testX, testY):
    model = GaussianNB()
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classification(trainX, trainY, testX, testY, n_estimators=50, learning_rate=1.0):
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# XGBoost
from xgboost import XGBClassifier

def xgboost_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)


In [41]:
random_forest_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.5037607213547394,
 'classification_report': {'0': {'precision': 0.5727316758153052,
   'recall': 0.6790773353751914,
   'f1-score': 0.6213872832369942,
   'support': 10448.0},
  '1': {'precision': 0.45663913595933925,
   'recall': 0.42143066549399005,
   'f1-score': 0.4383290135691416,
   'support': 6822.0},
  '2': {'precision': 0.3660824487780795,
   'recall': 0.2713632204940531,
   'f1-score': 0.31168558217738546,
   'support': 5465.0},
  'accuracy': 0.5037607213547394,
  'macro avg': {'precision': 0.46515108685090806,
   'recall': 0.45729040712107816,
   'f1-score': 0.4571339596611737,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4882222703762976,
   'recall': 0.5037607213547394,
   'f1-score': 0.492012165050724,
   'support': 22735.0}}}

In [42]:
knn_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.47583021772597317,
 'classification_report': {'0': {'precision': 0.5206277412280702,
   'recall': 0.727124808575804,
   'f1-score': 0.6067891373801917,
   'support': 10448.0},
  '1': {'precision': 0.41910265125764784,
   'recall': 0.36147757255936674,
   'f1-score': 0.3881630725641429,
   'support': 6822.0},
  '2': {'precision': 0.33421868083222667,
   'recall': 0.13815187557182068,
   'f1-score': 0.19549456240290006,
   'support': 5465.0},
  'accuracy': 0.47583021772597317,
  'macro avg': {'precision': 0.4246496911059816,
   'recall': 0.40891808556899717,
   'f1-score': 0.3968155907824116,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4453548281494906,
   'recall': 0.47583021772597317,
   'f1-score': 0.4423206145552089,
   'support': 22735.0}}}

In [43]:
gbm_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.5248735429953816,
 'classification_report': {'0': {'precision': 0.5343753816094762,
   'recall': 0.8376722817764165,
   'f1-score': 0.6525013047043913,
   'support': 10448.0},
  '1': {'precision': 0.5057026476578411,
   'recall': 0.36396951040750514,
   'f1-score': 0.4232867371292192,
   'support': 6822.0},
  '2': {'precision': 0.4823773324118867,
   'recall': 0.12772186642268984,
   'f1-score': 0.2019675925925926,
   'support': 5465.0},
  'accuracy': 0.5248735429953816,
  'macro avg': {'precision': 0.5074851205597347,
   'recall': 0.4431212195355371,
   'f1-score': 0.42591854480873437,
   'support': 22735.0},
  'weighted avg': {'precision': 0.5132724684850918,
   'recall': 0.5248735429953816,
   'f1-score': 0.47542329649287585,
   'support': 22735.0}}}

In [44]:
naive_bayes_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.30613591378931165,
 'classification_report': {'0': {'precision': 0.572139303482587,
   'recall': 0.022013782542113322,
   'f1-score': 0.0423963133640553,
   'support': 10448.0},
  '1': {'precision': 0.3013594149580511,
   'recall': 0.9846086191732629,
   'f1-score': 0.46147504379787707,
   'support': 6822.0},
  '2': {'precision': 0.29545454545454547,
   'recall': 0.0023787740164684353,
   'f1-score': 0.00471954982755491,
   'support': 5465.0},
  'accuracy': 0.30613591378931165,
  'macro avg': {'precision': 0.3896510879650612,
   'recall': 0.3363337252439482,
   'f1-score': 0.1695303023298291,
   'support': 22735.0},
  'weighted avg': {'precision': 0.42437846767270654,
   'recall': 0.30613591378931165,
   'f1-score': 0.15909090699909192,
   'support': 22735.0}}}

In [45]:
adaboost_classification(enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.5073674950516824,
 'classification_report': {'0': {'precision': 0.5288780426756774,
   'recall': 0.80895865237366,
   'f1-score': 0.6396004389118014,
   'support': 10448.0},
  '1': {'precision': 0.4731141199226306,
   'recall': 0.35854588097332163,
   'f1-score': 0.4079386257505003,
   'support': 6822.0},
  '2': {'precision': 0.40214646464646464,
   'recall': 0.11655992680695335,
   'f1-score': 0.1807348560079444,
   'support': 5465.0},
  'accuracy': 0.5073674950516824,
  'macro avg': {'precision': 0.4680462090815909,
   'recall': 0.42802148671797835,
   'f1-score': 0.4094246402234154,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4816816690248776,
   'recall': 0.5073674950516824,
   'f1-score': 0.4597852948627153,
   'support': 22735.0}}}

In [46]:
xgboost_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.5223663954255553,
 'classification_report': {'0': {'precision': 0.5316662630176798,
   'recall': 0.840447932618683,
   'f1-score': 0.6513128615932354,
   'support': 10448.0},
  '1': {'precision': 0.5004093327875563,
   'recall': 0.35839929639401935,
   'f1-score': 0.41766313631704816,
   'support': 6822.0},
  '2': {'precision': 0.4876219054763691,
   'recall': 0.11893870082342177,
   'f1-score': 0.19123271550456017,
   'support': 5465.0},
  'accuracy': 0.5223663954255553,
  'macro avg': {'precision': 0.5065658337605351,
   'recall': 0.43926197661204136,
   'f1-score': 0.4200695711382812,
   'support': 22735.0},
  'weighted avg': {'precision': 0.5116998151622513,
   'recall': 0.5223663954255553,
   'f1-score': 0.4706092581532195,
   'support': 22735.0}}}

In [47]:
model = DecisionTreeClassifier(criterion='entropy', max_features='log2')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.44499670112161865, 'classification_report': {'0': {'precision': 0.5512213961571883, 'recall': 0.5464203675344563, 'f1-score': 0.5488103821196828, 'support': 10448.0}, '1': {'precision': 0.3966749307277235, 'recall': 0.39871005570214013, 'f1-score': 0.39768988961181373, 'support': 6822.0}, '2': {'precision': 0.30574171345770695, 'recall': 0.3088746569075938, 'f1-score': 0.3073002002548698, 'support': 5465.0}, 'accuracy': 0.44499670112161865, 'macro avg': {'precision': 0.41787934678087296, 'recall': 0.41800169338139676, 'f1-score': 0.41793349066212215, 'support': 22735.0}, 'weighted avg': {'precision': 0.44583927814036517, 'recall': 0.44499670112161865, 'f1-score': 0.4454104637656082, 'support': 22735.0}}}


In [48]:
model = RandomForestClassifier(n_estimators=100, criterion='entropy')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5014295139652518, 'classification_report': {'0': {'precision': 0.5726704037695994, 'recall': 0.6746745788667687, 'f1-score': 0.6195016917871424, 'support': 10448.0}, '1': {'precision': 0.45374449339207046, 'recall': 0.42274992670771033, 'f1-score': 0.4376991956290788, 'support': 6822.0}, '2': {'precision': 0.36044226044226046, 'recall': 0.2684354986276304, 'f1-score': 0.30770844257996854, 'support': 5465.0}, 'accuracy': 0.5014295139652518, 'macro avg': {'precision': 0.4622857192013101, 'recall': 0.4552866680673698, 'f1-score': 0.4549697766653966, 'support': 22735.0}, 'weighted avg': {'precision': 0.4859697499811934, 'recall': 0.5014295139652518, 'f1-score': 0.4900006257784547, 'support': 22735.0}}}


In [49]:
model = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,
    n_jobs=20,objective='multi:softprob')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5510886298658456, 'classification_report': {'0': {'precision': 0.5520098948670378, 'recall': 0.8543261868300153, 'f1-score': 0.6706739800135246, 'support': 10448.0}, '1': {'precision': 0.5367053729580791, 'recall': 0.3997361477572559, 'f1-score': 0.45820381416449635, 'support': 6822.0}, '2': {'precision': 0.5902964959568733, 'recall': 0.16029277218664226, 'f1-score': 0.25212260756943444, 'support': 5465.0}, 'accuracy': 0.5510886298658456, 'macro avg': {'precision': 0.55967058792733, 'recall': 0.4714517022579712, 'f1-score': 0.46033346724915186, 'support': 22735.0}, 'weighted avg': {'precision': 0.5566207955265071, 'recall': 0.5510886298658456, 'f1-score': 0.5063082565990086, 'support': 22735.0}}}


In [50]:
model = KNeighborsClassifier(n_neighbors=70)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5058720035188036, 'classification_report': {'0': {'precision': 0.5178467280998483, 'recall': 0.8498277182235835, 'f1-score': 0.6435456983402189, 'support': 10448.0}, '1': {'precision': 0.4728890728476821, 'recall': 0.33494576370565815, 'f1-score': 0.39214003775527717, 'support': 6822.0}, '2': {'precision': 0.44517833553500663, 'recall': 0.06166514181152791, 'f1-score': 0.10832529733204757, 'support': 5465.0}, 'accuracy': 0.5058720035188036, 'macro avg': {'precision': 0.47863804549417904, 'recall': 0.4154795412469232, 'f1-score': 0.38133701114251456, 'support': 22735.0}, 'weighted avg': {'precision': 0.4868885627381973, 'recall': 0.5058720035188036, 'f1-score': 0.43945205822497235, 'support': 22735.0}}}


In [51]:
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)



{'accuracy': 0.5092588519903233, 'classification_report': {'0': {'precision': 0.53113851161044, 'recall': 0.8122128637059725, 'f1-score': 0.6422705771050142, 'support': 10448.0}, '1': {'precision': 0.4752918287937743, 'recall': 0.35810612723541485, 'f1-score': 0.4084601237251296, 'support': 6822.0}, '2': {'precision': 0.4011124845488257, 'recall': 0.11875571820677036, 'f1-score': 0.1832556826203586, 'support': 5465.0}, 'accuracy': 0.5092588519903233, 'macro avg': {'precision': 0.4691809416510133, 'recall': 0.4296915697160526, 'f1-score': 0.41132879448350085, 'support': 22735.0}, 'weighted avg': {'precision': 0.4831253905166632, 'recall': 0.5092588519903233, 'f1-score': 0.4617748079686071, 'support': 22735.0}}}


In [52]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate_voting(voting_classifier, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    # Fit the VotingClassifier
    voting_classifier.fit(trainX, trainY_encoded)
    
    # Make predictions
    testY_pred = voting_classifier.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY_encoded, testY_pred)
    
    # Generate classification report
    report = classification_report(testY_encoded, testY_pred, target_names=label_encoder.classes_, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report,
        'voting_classifier': voting_classifier
    }
    
    return results

In [53]:
scaler = StandardScaler()
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [54]:
from sklearn.ensemble import VotingClassifier

# Define multiple classifiers
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
knn = KNeighborsClassifier(n_neighbors=70)
xgb = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,objective='multi:softprob')

votingCLF = VotingClassifier(estimators=[('knn', knn), ('ada', ada), ('xgb', xgb)], voting='soft', weights=[5,7,10])
fit_and_evaluate_voting(votingCLF, enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.5442269628326369,
 'classification_report': {'EARLY': {'precision': 0.5418824790141097,
   'recall': 0.8711715160796325,
   'f1-score': 0.6681592952835382,
   'support': 10448.0},
  'LATE': {'precision': 0.5349170591849273,
   'recall': 0.38287892113749633,
   'f1-score': 0.4463049978641606,
   'support': 6822.0},
  'ONTIME': {'precision': 0.6246445497630332,
   'recall': 0.12058554437328453,
   'f1-score': 0.20214723926380368,
   'support': 5465.0},
  'accuracy': 0.5442269628326369,
  'macro avg': {'precision': 0.56714802932069,
   'recall': 0.45821199386347117,
   'f1-score': 0.4388705108038342,
   'support': 22735.0},
  'weighted avg': {'precision': 0.5596865970069922,
   'recall': 0.5442269628326369,
   'f1-score': 0.48956919617894873,
   'support': 22735.0}},
 'voting_classifier': VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=70)),
                              ('ada',
                               AdaBoostClassifier(learning_rate=0.8,
     

# 3 HOP

In [112]:
data = pd.read_csv("3_HOP.csv", parse_dates=["Date (MM/DD/YYYY)"])

In [113]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME
1,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME
2,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME
3,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,LATE
4,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,ONTIME


In [114]:
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)

In [115]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME,Late Night
1,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME,Late Night
2,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,ONTIME,Late Night
3,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,LATE,Morning
4,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,ONTIME,Morning


In [116]:
data['WeekDay'] = data['Date (MM/DD/YYYY)'].dt.day_name()

In [117]:
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time'])

In [118]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,PREV_STAT,SCHED_ARRV_TIME_CAT,WeekDay
0,B6,JFK,76,LATE,1,1,winter,ONTIME,Late Night,Friday
1,B6,JFK,76,LATE,1,1,winter,ONTIME,Late Night,Friday
2,B6,JFK,76,LATE,1,1,winter,ONTIME,Late Night,Friday
3,B6,JFK,75,LATE,1,1,winter,LATE,Morning,Friday
4,B6,JFK,75,LATE,1,1,winter,ONTIME,Morning,Friday


In [119]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        X_encoded = X.copy()
        encoded_data = self.encoder.transform(X[self.columns])
        encoded_df = pd.DataFrame(encoded_data, columns=self.encoder.get_feature_names_out(self.columns), index=X.index)
        
        # Drop the original columns
        X_encoded = X_encoded.drop(columns=self.columns)
        
        # Concatenate the encoded DataFrame with the original DataFrame, preserving the index
        X_encoded = pd.concat([X_encoded, encoded_df], axis=1)
        return X_encoded

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [120]:
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'FLIGHT_STATUS', 'month', 'day', 'season', 'PREV_STAT',
       'SCHED_ARRV_TIME_CAT', 'WeekDay'],
      dtype='object')

In [128]:
df['FLIGHT_STATUS'].value_counts()

FLIGHT_STATUS
EARLY     156726
LATE      102321
ONTIME     81978
Name: count, dtype: int64

In [129]:
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT', 'month','PREV_STAT', "WeekDay"])

In [130]:
encoded_data = encoder.fit_transform(df.drop(columns=['FLIGHT_STATUS', 'day']))

In [131]:
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['FLIGHT_STATUS'], 
    test_size=0.2, 
    random_state=947,
    stratify=df['FLIGHT_STATUS'],
    shuffle=True
)

In [132]:
from sklearn.preprocessing import StandardScaler

In [133]:
scaler = StandardScaler()


enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [135]:
enc_trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,Carrier Code_OO,...,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
57395,0.051813,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,2.909811,-0.225996,-0.278779,...,-0.306275,-0.300961,-0.654688,-0.562495,-0.415927,-0.371895,-0.408277,2.380066,-0.409065,-0.413773
207852,-0.823517,-0.239978,-0.452409,2.856186,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,-0.278779,...,-0.306275,-0.300961,-0.654688,1.777795,-0.415927,-0.371895,2.449317,-0.420156,-0.409065,-0.413773
103525,-0.116520,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,2.909811,-0.225996,-0.278779,...,-0.306275,-0.300961,-0.654688,1.777795,-0.415927,2.688933,-0.408277,-0.420156,-0.409065,-0.413773
176924,-1.025516,-0.239978,2.210391,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,-0.278779,...,-0.306275,3.322687,1.527445,-0.562495,-0.415927,-0.371895,2.449317,-0.420156,-0.409065,-0.413773
33003,-0.453185,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,-0.278779,...,-0.306275,3.322687,-0.654688,-0.562495,-0.415927,-0.371895,-0.408277,-0.420156,-0.409065,2.416785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180738,0.489477,4.167049,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,-0.278779,...,-0.306275,-0.300961,1.527445,-0.562495,2.404266,-0.371895,-0.408277,-0.420156,-0.409065,-0.413773
72137,-1.025516,-0.239978,-0.452409,-0.350117,2.897628,-0.12563,-0.152694,-0.343665,-0.225996,-0.278779,...,-0.306275,-0.300961,-0.654688,1.777795,-0.415927,-0.371895,-0.408277,-0.420156,2.444599,-0.413773
256919,-0.789850,-0.239978,-0.452409,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,-0.278779,...,3.265037,-0.300961,-0.654688,-0.562495,-0.415927,-0.371895,-0.408277,-0.420156,2.444599,-0.413773
215427,-0.924517,-0.239978,2.210391,-0.350117,-0.345110,-0.12563,-0.152694,-0.343665,-0.225996,-0.278779,...,-0.306275,3.322687,-0.654688,-0.562495,-0.415927,-0.371895,-0.408277,-0.420156,2.444599,-0.413773


In [136]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def logistic_regression_classification(trainX, trainY, testX, testY, penalty='l2', C=1.0, max_iter=1000, solver='lbfgs'):
    """
    Perform classification using Regularized Logistic Regression for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        penalty (str, optional): Regularization type ('l1' for Lasso, 'l2' for Ridge). Default is 'l2'.
        C (float, optional): Inverse of regularization strength; smaller values specify stronger regularization. Default is 1.0.
        max_iter (int, optional): Maximum number of iterations for optimization algorithm. Default is 1000.
        solver (str, optional): Optimization algorithm to use ('lbfgs', 'sag', 'saga', etc.). Default is 'lbfgs'.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Logistic Regression model
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter, solver=solver, verbose=1 if max_iter > 300 else 0)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results


In [None]:
report = logistic_regression_classification(enc_trainX, trainY, enc_testX, testY, max_iter=1500, solver='saga')
print(report)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def decision_tree_classification(trainX, trainY, testX, testY, criterion='gini', max_depth=None):
    """
    Perform classification using Decision Trees for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        criterion (str, optional): Criterion used to measure the quality of a split ('gini' or 'entropy'). Default is 'gini'.
        max_depth (int, optional): Maximum depth of the tree. If None, the tree is fully grown. Default is None.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Decision Tree model
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results

In [None]:
decision_tree_classification(enc_trainX, trainY, enc_testX, testY)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate(model, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    model.fit(trainX, trainY_encoded)
    testY_pred = model.predict(testX)
    accuracy = accuracy_score(testY_encoded, testY_pred)
    report = classification_report(testY_encoded, testY_pred, output_dict=True)
    results = {'accuracy': accuracy, 'classification_report': report}
    return results

# Update other classification functions similarly...


# Random Forest
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(trainX, trainY, testX, testY, n_estimators=100, criterion='gini', max_depth=None):
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Support Vector Machines (SVM)
from sklearn.svm import SVC

def svm_classification(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    model = SVC(kernel=kernel, C=C)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(trainX, trainY, testX, testY, n_neighbors=5):
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

def gbm_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classification(trainX, trainY, testX, testY):
    model = GaussianNB()
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classification(trainX, trainY, testX, testY, n_estimators=50, learning_rate=1.0):
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# XGBoost
from xgboost import XGBClassifier

def xgboost_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)


In [None]:
random_forest_classification(enc_trainX, trainY, enc_testX, testY)

In [None]:
knn_classification(enc_trainX, trainY, enc_testX, testY)

In [None]:
gbm_classification(enc_trainX, trainY, enc_testX, testY)

In [None]:
naive_bayes_classification(enc_trainX, trainY, enc_testX, testY)

In [None]:
adaboost_classification(enc_trainX, trainY, enc_testX, testY)

In [None]:
xgboost_classification(enc_trainX, trainY, enc_testX, testY)

In [None]:
model = DecisionTreeClassifier(criterion='entropy', max_features='log2')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

In [None]:
model = RandomForestClassifier(n_estimators=100, criterion='entropy')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

In [None]:
model = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,
    n_jobs=20,objective='multi:softprob')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

In [None]:
model = KNeighborsClassifier(n_neighbors=70)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

In [None]:
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate_voting(voting_classifier, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    # Fit the VotingClassifier
    voting_classifier.fit(trainX, trainY_encoded)
    
    # Make predictions
    testY_pred = voting_classifier.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY_encoded, testY_pred)
    
    # Generate classification report
    report = classification_report(testY_encoded, testY_pred, target_names=label_encoder.classes_, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report,
        'voting_classifier': voting_classifier
    }
    
    return results

In [None]:
scaler = StandardScaler()
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [None]:
from sklearn.ensemble import VotingClassifier

# Define multiple classifiers
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
knn = KNeighborsClassifier(n_neighbors=70)
dt = DecisionTreeClassifier(criterion='entropy', max_features='log2')
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')
xgb = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,objective='multi:softprob')

votingCLF = VotingClassifier(estimators=[('knn', knn), ('ada', ada), ('xgb', xgb), ('dt', dt),('rf', rf)], voting='soft', weights=[5,7,10,6,5])
fit_and_evaluate_voting(votingCLF, enc_trainX, trainY, enc_testX, testY)