In [None]:
import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import hashlib
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
import seaborn as sns
import math
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
def load_flight_data(csv_path="/Users/Victor/Documents/UNI/TFG/ML/database.csv",
                     delimiter=';', encoding="ISO-8859-1"):
    """Read the flight database CSV into a DataFrame.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the CSV file. Defaults to the path originally hard-coded
        in this notebook; pass your own path instead of editing the function.
    delimiter : str, optional
        Field separator handed to ``pd.read_csv`` (default ``';'``).
    encoding : str, optional
        Text encoding handed to ``pd.read_csv`` (default ``"ISO-8859-1"``).

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the file.
    """
    return pd.read_csv(csv_path, delimiter=delimiter, encoding=encoding)
# Load the flight records once at module level; the preparation code below
# reads from `data_set` (and copies it) rather than re-reading the CSV.
data_set = load_flight_data()

# data_set.info()
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer step that picks columns out of a DataFrame.

    ``transform`` indexes the input with ``attribute_names`` and returns the
    ``.values`` of the selection, so downstream steps receive the underlying
    array rather than a DataFrame.
    """

    def __init__(self, attribute_names):
        # Column label(s) used to index the frame in ``transform``.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values
    
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    """Pipeline-compatible wrapper around ``LabelBinarizer``.

    ``fit`` learns the label set once and ``transform`` reuses it, so data
    transformed later (e.g. a test set) is encoded with the same classes and
    column order as the data seen during ``fit``. Previously ``fit`` was a
    no-op and every ``transform`` call re-fitted a fresh binarizer on its own
    input, which made train/test encodings inconsistent.

    Parameters
    ----------
    sparse_output : bool, default False
        Forwarded to ``LabelBinarizer(sparse_output=...)``.
    """

    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """Fit the underlying ``LabelBinarizer`` on ``X`` and return ``self``.

        ``y`` is accepted only for signature compatibility and is ignored.
        """
        self.encoder_ = LabelBinarizer(sparse_output=self.sparse_output).fit(X)
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` using the classes learned in ``fit``.

        If ``fit`` was never called, fall back to the legacy behaviour of
        fitting a throw-away binarizer on ``X`` itself, so existing callers
        that invoke ``transform`` directly keep working.
        """
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Legacy path: no learned state, encode X against its own labels.
            return LabelBinarizer(sparse_output=self.sparse_output).fit_transform(X)
        return encoder.transform(X)
    
def _display_scores(metric_name, scores):
    """Print mean, standard deviation and best value of ``scores``.

    Shared implementation behind the ``display_scores_*`` helpers.

    Parameters
    ----------
    metric_name : str
        Name inserted into the "Mean ..." and "Best ..." lines.
    scores : array-like
        Score values. Plain lists/tuples are converted to a float ndarray;
        any other object is used as-is and must provide ``mean()``,
        ``std()`` and ``max()`` (as a NumPy array does).

    Raises
    ------
    ValueError
        If ``scores`` contains no values.
    """
    if isinstance(scores, (list, tuple)):
        # Plain sequences have no .mean()/.std()/.max(); wrap them.
        scores = np.asarray(scores, dtype=float)
    if np.size(scores) == 0:
        raise ValueError("scores must contain at least one value")
    print("Mean {}:".format(metric_name), scores.mean())
    print("Standard deviation:", scores.std())
    # The trailing space inside the label is kept so the printed text is
    # identical to what the original helpers emitted.
    print("Best {}: ".format(metric_name), scores.max())


def display_scores_f1(scores):
    """Print summary statistics for a collection of f1 scores."""
    _display_scores("f1", scores)


def display_scores_recall(scores):
    """Print summary statistics for a collection of recall scores."""
    _display_scores("recall", scores)


def display_scores_precision(scores):
    """Print summary statistics for a collection of precision scores."""
    _display_scores("precision", scores)


def display_scores_accuracy(scores):
    """Print summary statistics for a collection of accuracy scores."""
    _display_scores("accuracy", scores)

In [None]:
# Scaler instance; it is only created here and not applied in this cell.
scaler = StandardScaler()

# A single binarizer object, re-fitted for each categorical column below.
lb = LabelBinarizer()

# Columns handled by the numerical pipeline.
num_attribs = [
    'Initial_time', 'ARP_distance', 'ARP_azimuth', 'Heading', 'Altitude',
    'IAS', 'GS', 'Barometric_pressure', 'Wind_direction', 'Wind_variability',
    'Wind_speed', 'Visibility', 'European_airlines', 'American_airlines',
    'Latam_airlines', 'Other_airlines', 'Day_of_the_week', 'Mix_index',
]

# Numerical branch: select the columns, then fill gaps with the column median.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
])

# Copy of the raw data; not used anywhere else in this cell.
kling = data_set.copy()

# Numerical features, re-wrapped as a DataFrame aligned with the raw index.
data_set_num_pipeline = pd.DataFrame(
    num_pipeline.fit_transform(data_set),
    index=data_set.index,
    columns=num_attribs,
)

# Binarized 'WTC' column. Arguments are evaluated left to right, so
# lb.classes_ is read after the fit and names the columns of this encoding.
data_set_wtc_pipeline = pd.DataFrame(
    lb.fit_transform(data_set['WTC']),
    index=data_set.index,
    columns=lb.classes_,
)

# Binarized 'RWY' column; after this call `lb` stays fitted on 'RWY'.
data_set_rwy_pipeline = pd.DataFrame(
    lb.fit_transform(data_set['RWY']),
    index=data_set.index,
    columns=lb.classes_,
)

# Target column kept as a one-column frame so it can be concatenated first.
data_set_labels = data_set["Time_to_land"].to_frame()

# Column-wise assembly: target, numerical features, WTC columns, RWY columns.
frames = [
    data_set_labels,
    data_set_num_pipeline,
    data_set_wtc_pipeline,
    data_set_rwy_pipeline,
]
data_set_pipeline = pd.concat(frames, axis="columns")

# Work on a copy so the assembled frame stays available unchanged.
data_set_prepared = data_set_pipeline.copy()

# Convert the continuous "Time_to_land" target into landing categories.
#
# Each scheme lists increasing upper edges and one label per edge: a value v
# receives the label of the first edge with v <= edge. Values above the last
# edge, and missing values, receive no category (NaN).
LANDING_SCHEMES = {
    3: {
        "edges": [658, 1076, 3447],
        "labels": ["Advanced", "Planned", "Delayed"],
    },
    5: {
        "edges": [565, 658, 1076, 1538, 3447],
        "labels": ["Very advanced", "Advanced", "Planned", "Delayed", "Very delayed"],
    },
}
N_LANDING_CATEGORIES = 3  # switch to 5 for the finer-grained scheme

scheme = LANDING_SCHEMES[N_LANDING_CATEGORIES]
landing_cat = pd.cut(
    data_set_prepared["Time_to_land"],
    bins=[-np.inf] + scheme["edges"],
    labels=scheme["labels"],
    right=True,            # intervals are (lower, upper], i.e. "<= edge"
    include_lowest=True,
).astype(object)           # plain string labels, as the .loc assignments produced

# Surface rows that fall outside every interval instead of leaving silent NaNs:
# they would otherwise only show up later as an error in the stratified split.
n_unlabelled = int(landing_cat.isna().sum())
if n_unlabelled:
    print("WARNING: {} rows have no landing category "
          "(Time_to_land missing or above {})".format(n_unlabelled, scheme["edges"][-1]))

# Replace the raw target with its category; 'landing_cat' ends up as the last column.
data_set_prepared = data_set_prepared.drop(columns="Time_to_land")
data_set_prepared["landing_cat"] = landing_cat

data_set_prepared["landing_cat"].hist(bins=50)

data_set_prepared.info()
#data_set_prepared["landing_cat"].value_counts()
data_set_prepared

# Ensemble

In [None]:
# Stratified 80/20 train/test split on 'landing_cat' (fixed seed for repeatability).
data_set_prepared_1_1 = data_set_prepared.copy()
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(
    sss.split(data_set_prepared_1_1, data_set_prepared_1_1["landing_cat"])
)

# split() yields integer *positions*, so select with .iloc. The previous .loc
# only gave the same rows because the frame's index happens to be 0..n-1.
strat_train_set = data_set_prepared_1_1.iloc[train_index]
strat_test_set = data_set_prepared_1_1.iloc[test_index]

# Every row must land in exactly one of the two sets.
assert len(strat_train_set) + len(strat_test_set) == len(data_set_prepared_1_1)

data_set_train_1_1 = strat_train_set.copy()
data_set_test_1_1 = strat_test_set.copy()

In [None]:
# Train the ensemble candidates on the stratified training set, print each
# model's test accuracy, then plot the confusion matrix of the last model
# fitted in the loop (the soft-voting classifier).
data_set_train_1_1_1_1 = data_set_train_1_1.copy()
data_set_test_1_1_1_1 = data_set_test_1_1.copy()

RANDOM_STATE = 42          # seed for the stochastic models so scores are repeatable
TARGET_COLUMN = "landing_cat"

rnd_clf = RandomForestClassifier(random_state=RANDOM_STATE)
log_clf = LogisticRegression()
#svm_clf = SVC(class_weight = 'balanced', kernel = 'rbf' ,gamma = 'scale', probability = 'True')
kne_clf = KNeighborsClassifier(leaf_size=1, n_neighbors=10, weights='distance', p=1)
mlp_clf = MLPClassifier()
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),  # base estimator, passed positionally as before
    n_estimators=1000,
    max_samples=10000,
    max_features=20,
    bootstrap=True,
    bootstrap_features=True,
    oob_score=True,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
voting_clf = VotingClassifier(
    estimators=[('rf', rnd_clf), ('bag', bag_clf)],
    voting='soft',
)

# Currently left out of the voting ensemble:
#('svc', svm_clf), ('lr', log_clf),('kn',kne_clf),('mlp', mlp_clf),

# Select the target by name rather than by a hard-coded column position, so
# features and target stay consistent if the number of encoded columns changes.
X_train_1_1_1_1 = data_set_train_1_1_1_1.drop(columns=TARGET_COLUMN)
Y_train_1_1_1_1 = data_set_train_1_1_1_1[TARGET_COLUMN]
X_test_1_1_1_1 = data_set_test_1_1_1_1.drop(columns=TARGET_COLUMN)
Y_test_1_1_1_1 = data_set_test_1_1_1_1[TARGET_COLUMN]

for clf in (rnd_clf, bag_clf, voting_clf):
    clf.fit(X_train_1_1_1_1, Y_train_1_1_1_1)
    Y_pred_1_1_1_1 = clf.predict(X_test_1_1_1_1)
    print(clf.__class__.__name__, metrics.accuracy_score(Y_test_1_1_1_1, Y_pred_1_1_1_1))

# Candidates currently left out of the loop:
#svm_clf, log_clf,kne_clf, mlp_clf,

# After the loop Y_pred_1_1_1_1 holds the predictions of voting_clf, so the
# confusion matrix below describes the voting classifier.
labels = ["Advanced", "Planned", "Delayed"]
# `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
cm = metrics.confusion_matrix(Y_test_1_1_1_1, Y_pred_1_1_1_1, labels=labels)

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cm)
ax.set_title('Confusion matrix of the classifier')
fig.colorbar(cax)
# Pin one tick per class before labelling, instead of padding the label list
# with '' and relying on where the automatic locator happens to put ticks.
tick_positions = np.arange(len(labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

print("Confusion matrix:", str(cm))