In [None]:
#imports
import matplotlib.pyplot as plt
 
import pandas as pd
import itertools
from itertools import combinations
import numpy as np
 
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_val_predict, RandomizedSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report, make_scorer
)
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

import folium 

from scipy.stats import randint

 
# Read the two CSV files from the working directory into pandas DataFrames:
# 'Airlines.csv' is bound to `df`, 'AllAirports.csv' to `allAirports`.
df, allAirports = (
    pd.read_csv(csv_name) for csv_name in ('Airlines.csv', 'AllAirports.csv')
)

In [None]:
# Select features and target variable (id is not needed)
features = ['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek', 'Time', 'Length']
target = 'Delay'
X_raw = df[features]
y = df[target]

# Separate the columns that get scaled from the ones that get ordinal-encoded
continuous_cols = ['DayOfWeek', 'Time', 'Length']
categorical_cols = [col for col in features if col not in continuous_cols]

# Decide the train/test partition BEFORE fitting any transformer so that no
# statistic of the test rows leaks into the preprocessing. Splitting row
# positions with the same test_size/random_state selects the same rows (in the
# same order) as splitting the frames directly.
train_pos, test_pos = train_test_split(
    np.arange(len(X_raw)), test_size=0.2, random_state=42
)  # 80% training and 20% testing

# Preprocessing of the data: fit on the training rows only.
# Categories that occur only in the test rows are encoded as -1 instead of
# raising at transform time.
encoder = preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(X_raw.iloc[train_pos][categorical_cols])

# Min-max scaling learned from the training rows; test values outside the
# training range can therefore fall slightly outside [0, 1].
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_raw.iloc[train_pos][continuous_cols])

# Transform every row with the train-fitted transformers. The explicit index
# keeps the pieces aligned with `df` even if its index is not 0..n-1.
encoded = pd.DataFrame(
    encoder.transform(X_raw[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_raw.index,
)
scaled = pd.DataFrame(
    min_max_scaler.transform(X_raw[continuous_cols]),
    columns=continuous_cols,
    index=X_raw.index,
)

# Same column layout as before: scaled columns first, encoded columns after
X = scaled.join(encoded)

# print data frames
print(X)
print(y)

# train and test data split
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

In [None]:
# Create a Random Forest classifier within a pipeline.
# The single step is named 'classifier', so its hyper-parameters are addressed
# as 'classifier__<name>' in the search space below.
clf = Pipeline([
    ('classifier', RandomForestClassifier(random_state=42))
])

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Define the parameter grid for randomized search
param_grid = {
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth' : [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 5, 10],
    'classifier__criterion' : ['gini', 'entropy']
}

# Setup the randomized search (15 sampled candidates, 5-fold CV each)
random_search = RandomizedSearchCV(estimator = clf , param_distributions = param_grid,
                                   n_iter = 15, cv = kfold, random_state=42, n_jobs = -1)

# Fit the training data
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_

print(best_params)

# Nested cross-validation: the whole search is cloned and re-run inside every
# outer fold, so this score also accounts for the tuning step. This is by far
# the most expensive statement of the cell.
score = cross_val_score(random_search, X_train, y_train, cv = kfold, scoring = make_scorer(accuracy_score), n_jobs = -1)
print("Cross Validated score (mean): " + str(score.mean()))

# Get the best estimator. RandomizedSearchCV uses refit=True by default, so
# this pipeline is already fitted on the full training set; fitting it again
# would only repeat the same work.
best_estimator = random_search.best_estimator_

# Make predictions on the testing set (20%)
y_pred = best_estimator.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
precision = precision_score(y_test, y_pred)
print("Precision: ", precision)
recall = recall_score(y_test, y_pred)
print("Recall: ", recall)
f1 = f1_score(y_test, y_pred)
print("F1: ", f1)

# Get the best tree (best performing estimator) from the Random Forest.
# A fitted forest has no `cv_results_`; each member tree is scored directly.
# The training data is used for the ranking so that the test set plays no part
# in selecting which tree is shown.
forest = best_estimator.named_steps['classifier']
X_train_values = X_train.to_numpy()  # member trees were fitted on plain arrays
tree_scores = [
    # member trees predict encoded class positions; map them back to the labels
    accuracy_score(y_train, forest.classes_.take(tree.predict(X_train_values).astype(int)))
    for tree in forest.estimators_
]
best_tree_index = int(np.argmax(tree_scores))  # Find index of the best tree
best_tree = forest.estimators_[best_tree_index]  # Return best tree

# Plot the best performing tree. `plot_tree` is the sklearn.tree function, not
# a method of the tree; the depth is capped because a full tree is unreadable.
plt.figure(figsize=(12, 8))
plot_tree(
    best_tree,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in forest.classes_],
    filled=True,
    max_depth=3,
)
plt.show()

# Create confusion matrix for the tuned forest: `y_pred` still holds the
# forest's test predictions, so the matrix matches the metrics printed above.
cm = confusion_matrix(y_test, y_pred)

# Display the confusion matrix
print(cm)



