In [2]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.pipeline import Pipeline

import xgboost as xgb

import pandas as pd

In [22]:
k_folds = 10
rs = 0

In [3]:
flights_weather_df = pd.read_csv('../data/intermediate/modelling/flights_weather_modelling.csv')
flights_weather_df = flights_weather_df.astype(float)

In [4]:
X = flights_weather_df.drop(columns=['INDEX', 'IS_WEATHER_DELAY'])
y = flights_weather_df["IS_WEATHER_DELAY"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rs)

Decision Trees

In [9]:
params = {
    'classifier__max_depth': [2, 4, 6, 8, 10, 20, 30, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# insure no information leaking between train and validation sets by using pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(-1, 1))),  # normalization
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

dt_grid_search = GridSearchCV(pipeline, param_grid=params, cv=k_folds)
dt_grid_search.fit(X_train, y_train)

dt_best = dt_grid_search.best_estimator_
print("Pre-pruned Decision Tree Best Params: " + str(dt_grid_search.best_params_))
    
# evaluate the model
df_y_pred = dt_best.predict(X_test)
accuracy = accuracy_score(y_test, df_y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))

Pre-pruned Decision Tree Best Params: {'classifier__max_depth': 6, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5}
Accuracy: 62.39%


In [8]:
cm = confusion_matrix(y_test, df_y_pred)

df_cm = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
print(df_cm)

          Predicted 0  Predicted 1
Actual 0         1002          475
Actual 1          569          730


Random Forests

In [11]:
params = {
    'classifier__n_estimators': [100, 200],  # Number of trees in the forest
    'classifier__max_depth': [None, 5, 10],  # Maximum depth of each tree
    'classifier__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'classifier__min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}


pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(-1, 1))), # normalization
    ('classifier', RandomForestClassifier())
])

rf_grid_search = GridSearchCV(pipeline, param_grid=params, cv=k_folds)
rf_grid_search.fit(X_train, y_train)

rf_best = rf_grid_search.best_estimator_
print("Random Forest Best Params:", rf_grid_search.best_params_)

# evaluate the model
rf_y_pred = rf_best.predict(X_test)
accuracy = accuracy_score(y_test, rf_y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))

Random Forest Best Params: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Accuracy: 64.84%


In [12]:
cm = confusion_matrix(y_test, rf_y_pred)

rf_cm = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
print(rf_cm)

          Predicted 0  Predicted 1
Actual 0         1041          408
Actual 1          568          759


Adaboost

In [27]:
params = {
    'classifier__n_estimators': [100, 300],  # Number of estimators (trees)
    'classifier__learning_rate': [0.1, 1.0],  # Learning rate
    'classifier__base_estimator__max_depth': [2, 5],  # Maximum depth of each base estimator (decision tree)
}

pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(-1, 1))), # normalization
    ('classifier', AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
])

adaboost_grid_search = GridSearchCV(pipeline, param_grid=params, cv=k_folds)
adaboost_grid_search.fit(X_train, y_train)

adaboost_best = adaboost_grid_search.best_estimator_
print("AdaBoost Best Params:", adaboost_grid_search.best_params_)

# evaluate the model
adaboost_y_pred = adaboost_best.predict(X_test)
accuracy = accuracy_score(y_test, adaboost_y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))

AdaBoost Best Params: {'classifier__base_estimator__max_depth': 2, 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100}
Accuracy: 63.90%


In [28]:
cm = confusion_matrix(y_test, adaboost_y_pred)

adaboost_cm = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
print(adaboost_cm)

          Predicted 0  Predicted 1
Actual 0         1030          419
Actual 1          583          744


XGBoost

In [None]:
params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.1, 0.2, 0.3],
    'classifier__max_depth': [3, 4, 5]
}

pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(-1, 1))), # normalization
    ('classifier', xgb.XGBClassifier(random_state=rs))
])

xgboost_grid_search = GridSearchCV(pipeline, params, cv=k_folds, scoring='roc_auc')
xgboost_grid_search.fit(X_train, y_train)

xgboost_best = xgboost_grid_search.best_estimator_
xgboost_best_params = xgboost_grid_search.best_params_
print("XGBoost Best Params:", xgboost_grid_search.best_params_)

# evaluate the model
xgboost_y_pred = xgboost_best.predict(X_test)
accuracy = accuracy_score(y_test, xgboost_y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))

In [None]:
cm = confusion_matrix(y_test, xgboost_y_pred)

xgboost_cm = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
print(xgboost_cm)

Feature Importance Plot

In [None]:
# normalize the training and testing data
scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

xgboost_model = xgb.XGBClassifier(**xgboost_best_params, random_state=rs)
xgboost_model.fit(X_train_normalized, y_train)

# evaluate the model
xgboost_best_y_pred = xgboost_model.predict(X_test_normalized)
accuracy = accuracy_score(y_test, xgboost_best_y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

xgb.plot_importance(xgboost_model)