In [None]:
import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix
import hashlib
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn import metrics
from sklearn.metrics import classification_report
import seaborn as sns
import math
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import SGDClassifier

In [None]:
def load_flight_data(csv_path="/Users/Victor/Documents/UNI/TFG/ML/database.csv"):
    """Load the flight database CSV into a DataFrame.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. The file is read with ``;`` as the field
        delimiter and ISO-8859-1 as the text encoding. The default is the
        original author's local path; pass your own path instead of editing
        the function body.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(csv_path, delimiter=';', encoding="ISO-8859-1")
# Load the raw flight records once; the preprocessing cell below reads `data_set`.
data_set = load_flight_data()

# Uncomment for a quick summary of the loaded frame:
# data_set.info()
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a fixed set of columns out of a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to select in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned here; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` via the ``.values`` accessor."""
        selected_columns = X[self.attribute_names]
        return selected_columns.values
    
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    """``LabelBinarizer`` wrapper whose ``fit``/``transform`` accept an optional ``y``.

    The encoder is fitted once in ``fit`` and reused by ``transform``, so data
    transformed later (e.g. a test set) is encoded with the same classes and
    column layout as the data seen during fitting.

    Parameters
    ----------
    sparse_output : bool, default=False
        Forwarded unchanged to ``LabelBinarizer``.
    """

    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Learn the label classes from ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        # Trailing underscore: attribute that only exists after fitting.
        self.encoder_ = LabelBinarizer(sparse_output=self.sparse_output)
        self.encoder_.fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize ``X``; ``y`` is ignored.

        Uses the encoder fitted in ``fit``. If ``fit`` was never called, the
        previous behaviour is kept for backward compatibility: a fresh encoder
        is fitted on ``X`` and applied to it.
        """
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Legacy path: callers that used transform() on its own keep working.
            return LabelBinarizer(sparse_output=self.sparse_output).fit_transform(X)
        return encoder.transform(X)
    
def _display_scores(scores, metric):
    """Print the mean, standard deviation and maximum of ``scores``.

    Parameters
    ----------
    scores : array-like exposing ``mean()``, ``std()`` and ``max()``
        Per-fold scores (the callers in this notebook pass NumPy arrays).
    metric : str
        Metric name inserted into the printed labels.
    """
    print("Mean " + metric + ":", scores.mean())
    print("Standard deviation:", scores.std())
    # The space after the colon is intentional: it keeps the printed text
    # identical to what the four original functions produced.
    print("Best " + metric + ": ", scores.max())


def display_scores_f1(scores):
    """Print summary statistics of per-fold f1 scores."""
    _display_scores(scores, "f1")


def display_scores_recall(scores):
    """Print summary statistics of per-fold recall scores."""
    _display_scores(scores, "recall")


def display_scores_precision(scores):
    """Print summary statistics of per-fold precision scores."""
    _display_scores(scores, "precision")


def display_scores_accuracy(scores):
    """Print summary statistics of per-fold accuracy scores."""
    _display_scores(scores, "accuracy")

In [None]:
# Scaler instance used by the modelling cells further down
scaler = MinMaxScaler()

# One binarizer, re-fitted for each categorical column in turn
lb = LabelBinarizer()

# Numerical attributes routed through the selection + imputation pipeline
num_attribs = [
    'Initial_time', 'ARP_distance', 'ARP_azimuth', 'Heading', 'Altitude',
    'IAS', 'GS', 'Barometric_pressure', 'Wind_direction', 'Wind_variability',
    'Wind_speed', 'Visibility', 'European_airlines', 'American_airlines',
    'Latam_airlines', 'Other_airlines', 'Day_of_the_week', 'Mix_index',
]

num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
])

kling = data_set.copy()

# Numerical block: select the columns, fill gaps with the column median
num_values = num_pipeline.fit_transform(data_set)
data_set_num_pipeline = pd.DataFrame(num_values, index=data_set.index, columns=num_attribs)

# Categorical blocks: one indicator column per class; `classes_` is read
# right after each fit because `lb` is re-fitted for the next column
wtc_values = lb.fit_transform(data_set['WTC'])
data_set_wtc_pipeline = pd.DataFrame(wtc_values, index=data_set.index, columns=lb.classes_)

rwy_values = lb.fit_transform(data_set['RWY'])
data_set_rwy_pipeline = pd.DataFrame(rwy_values, index=data_set.index, columns=lb.classes_)

# Raw target first, then the numerical and the two categorical blocks
data_set_labels = data_set["Time_to_land"].to_frame()
frames = [data_set_labels, data_set_num_pipeline, data_set_wtc_pipeline, data_set_rwy_pipeline]
data_set_pipeline = pd.concat(frames, axis=1)
data_set_prepared = data_set_pipeline.copy()

# Disabled 5-category variant of the target, kept as an inert string literal

'''
data_set_prepared.loc[data_set_prepared["Time_to_land"] <= 3447, 'landing_cat'] = 'Very delayed'
data_set_prepared.loc[data_set_prepared["Time_to_land"] <= 1538, 'landing_cat'] = 'Delayed'
data_set_prepared.loc[data_set_prepared["Time_to_land"] <= 1076, 'landing_cat'] = 'Planned'
data_set_prepared.loc[data_set_prepared["Time_to_land"] <= 658, 'landing_cat'] = 'Advanced'
data_set_prepared.loc[data_set_prepared["Time_to_land"] <= 565, 'landing_cat'] = 'Very advanced'
data_set_prepared.drop("Time_to_land", axis = 1, inplace = True)
data_set_prepared["landing_cat"].hist(bins=50)'''

# Active variant: Time_to_land -> 3 categories.
# Bounds are applied from the largest down, so each smaller bound overwrites
# the label written by the previous one. Rows that satisfy none of the
# comparisons (value above 3447, or missing) are left without a label (NaN).
for upper_bound, category in ((3447, 'Delayed'), (1076, 'Planned'), (658, 'Advanced')):
    within_bound = data_set_prepared["Time_to_land"] <= upper_bound
    data_set_prepared.loc[within_bound, 'landing_cat'] = category

# The raw target is replaced by its categorical version
data_set_prepared = data_set_prepared.drop(columns="Time_to_land")
data_set_prepared["landing_cat"].hist(bins=50)

data_set_prepared.info()
#data_set_prepared["landing_cat"].value_counts()
data_set_prepared

## Case 1: StratifiedShuffleSplit

In [None]:
# StratifiedShuffleSplit to generate the training set and the test set,
# stratified on the `landing_cat` target

data_set_prepared_1 = data_set_prepared.copy()
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=3)

# n_splits=1, so the splitter yields exactly one (train, test) pair
train_index, test_index = next(
    sss.split(data_set_prepared_1, data_set_prepared_1["landing_cat"])
)

# split() returns positional indices, so rows are taken with .iloc; .loc would
# only be equivalent while the frame keeps a default 0..n-1 index
strat_train_set = data_set_prepared_1.iloc[train_index]
strat_test_set = data_set_prepared_1.iloc[test_index]
data_set_train_1 = strat_train_set.copy()
data_set_test_1 = strat_test_set.copy()

# Features are every column before the last one (`landing_cat`, appended last
# by the preprocessing cell). The target is selected by name instead of the
# hard-coded position 27, so it stays correct if the number of one-hot
# columns changes.
X_train_1 = data_set_train_1[list(data_set_train_1.columns)[0:-1]]
Y_train_1 = data_set_train_1["landing_cat"]
X_test_1 = data_set_test_1[list(data_set_test_1.columns)[0:-1]]
Y_test_1 = data_set_test_1["landing_cat"]


### Case 1.1: Cross-validation without hyperparameter tuning

In [None]:
#Train and test set

# The shared `scaler` is fitted on the training features only and then applied
# unchanged to the test features.
# NOTE(review): scaling happens before cross-validation, so every validation
# fold has already contributed to the scaler's min/max; consider a Pipeline
# inside cross_validate if a strictly leak-free estimate is needed.
X_train_1_1 = scaler.fit_transform(X_train_1.copy())
Y_train_1_1 = Y_train_1.copy()
X_test_1_1 = scaler.transform(X_test_1.copy())
Y_test_1_1 = Y_test_1.copy()

#Defining the classifier

NB_clf = ComplementNB()

#CV for the model

scores_varias = cross_validate(
    NB_clf, X_train_1_1, Y_train_1_1,
    scoring=('accuracy', 'f1_weighted', 'recall_weighted', 'precision_weighted'),
    cv=10, return_train_score=False,
)

#Performance measures

f1 = scores_varias['test_f1_weighted']
accuracy = scores_varias['test_accuracy']
recall = scores_varias['test_recall_weighted']
precision = scores_varias['test_precision_weighted']

display_scores_f1(f1)
display_scores_accuracy(accuracy)
display_scores_recall(recall)
display_scores_precision(precision)

#Prediction (out-of-fold predictions on the training set)

Y_pred_1_1 = cross_val_predict(NB_clf, X_train_1_1, Y_train_1_1, cv = 10)

#Confusion matrix

# The class order is passed explicitly. Without `labels`, confusion_matrix uses
# the sorted class names (Advanced, Delayed, Planned), which does not match the
# tick labels below and silently mislabels two rows/columns of the plot.
labels = ["Advanced","Planned","Delayed"]
cm = confusion_matrix(Y_train_1_1, Y_pred_1_1, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick per class before naming the ticks, instead of relying on the
# automatic tick positions and a leading '' placeholder label.
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

print("Confusion matrix:", str(cm))

## Case 2: Train test split

In [None]:
# Plain (non-stratified) random split: 80 % training, 20 % test
data_set_prepared_2 = data_set_prepared.copy()

# Target column, and the feature frame without it
y_2 = data_set_prepared_2["landing_cat"]
data_set_temp_2 = data_set_prepared_2.drop(columns="landing_cat")
X_2 = data_set_temp_2

# train_test_split returns (X_train, X_test, y_train, y_test) in that order
_set = train_test_split(X_2, y_2, train_size=0.80, test_size=0.20, random_state=42)
X_train_2, X_test_2, Y_train_2, Y_test_2 = _set

### Case 2.1: Cross-validation without hyperparameter tuning

In [None]:
#Train and test set

# This re-fits the shared `scaler` on this split's training features and then
# applies it unchanged to the test features.
# NOTE(review): scaling happens before cross-validation, so every validation
# fold has already contributed to the scaler's min/max; consider a Pipeline
# inside cross_validate if a strictly leak-free estimate is needed.
X_train_2_1 = scaler.fit_transform(X_train_2.copy())
Y_train_2_1 = Y_train_2.copy()
X_test_2_1 = scaler.transform(X_test_2.copy())
Y_test_2_1 = Y_test_2.copy()

#Defining the classifier

NB_clf = ComplementNB()

#CV for the model

scores_varias = cross_validate(
    NB_clf, X_train_2_1, Y_train_2_1,
    scoring=('accuracy', 'f1_weighted', 'recall_weighted', 'precision_weighted'),
    cv=10, return_train_score=False,
)

#Performance measures

f1 = scores_varias['test_f1_weighted']
accuracy = scores_varias['test_accuracy']
recall = scores_varias['test_recall_weighted']
precision = scores_varias['test_precision_weighted']

display_scores_f1(f1)
display_scores_accuracy(accuracy)
display_scores_recall(recall)
display_scores_precision(precision)

#Prediction (out-of-fold predictions on the training set)

Y_pred_2_1 = cross_val_predict(NB_clf, X_train_2_1, Y_train_2_1, cv = 10)

#Confusion matrix

# The class order is passed explicitly. Without `labels`, confusion_matrix uses
# the sorted class names (Advanced, Delayed, Planned), which does not match the
# tick labels below and silently mislabels two rows/columns of the plot.
labels = ["Advanced","Planned","Delayed"]
cm = confusion_matrix(Y_train_2_1, Y_pred_2_1, labels=labels)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick per class before naming the ticks, instead of relying on the
# automatic tick positions and a leading '' placeholder label.
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

print("Confusion matrix:", str(cm))