In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

#matplotlib inline

In [None]:
# Read the raw flight-delay CSV into a DataFrame.
# The path is relative to the notebook's working directory.
flights = pd.read_csv('../input/feb-2020-us-flight-delay/feb-20-us-flight-delay.csv')

In [None]:
flights.columns

### Data Format
* **MONTH** - Month
* **DAY_OF_MONTH** - Day of Month
* **DAY_OF_WEEK** - Day of Week
* **OP_UNIQUE_CARRIER** - Unique Carrier Code
* **ORIGIN** - Origin airport location
* **DEST** - Destination airport location
* **DEP_TIME** - Actual Departure Time (local time: hhmm)
* **DEP_DEL15** - Departure Delay Indicator, 15 Minutes or More (1=Yes, 0=No) [TARGET VARIABLE]
* **DISTANCE** - Distance between airports (miles)

In [None]:
flights.head()

In [None]:
flights['Unnamed: 9'].unique()

* 'Unnamed: 9' is an extra, empty column, so we will get rid of it.

In [None]:
# Remove the empty trailing column by name and rebind the resulting frame.
flights = flights.drop(columns='Unnamed: 9')

In [None]:
flights.columns

# Data preprocessing and visualization:

In [None]:
flights.shape

In [None]:
# Rename the DEP_DEL15 target column to is_delay
flights = flights.rename(columns={'DEP_DEL15': 'is_delay'})

In [None]:
# Look for null values
flights.isnull().sum()

In [None]:
# Percentage of rows with a missing value in each affected column.
# The counts are taken from the frame itself rather than typed in by hand, so
# the figures stay correct if the input file changes.
n_rows = flights.shape[0]
missing = flights[['is_delay', 'DEP_TIME']].isnull().sum().to_dict()
print(f'\'is_delay\' missing values are {100*missing["is_delay"]/n_rows}%')
print(f'\'DEP_TIME\' missing values are {100*missing["DEP_TIME"]/n_rows}%')

* The missing 'is_delay' values represent only 0.86%, so we can safely remove them.
* The same thing applies for 'DEP_TIME', the missing is ~0.86%.

**Note:** As we are about to start preprocessing and cleaning, we will work on a new copy of the data so that the original is left unchanged.

In [None]:
data = flights.copy()

### - Remove the missing values

In [None]:
data = data.dropna()

In [None]:
data.isnull().sum()

## Visualization

### - Delay vs No Delay

In [None]:
sns.countplot(x=data['is_delay'])

In [None]:
data.groupby('is_delay').size()/len(data)

**We see that the data is highly imbalanced; 85.6% is 'no delay' vs 14.4% 'delay' flights.**

### Now there are some questions we need to ask:

* What day of the week has the most delays?
* Which origin and destination airports have the most delays?
* Is flight distance a factor in the delays?
* Which carrier has the most delays?

In [None]:
sns.countplot(x='DAY_OF_WEEK', hue="is_delay", data=data)

In [None]:
# Count the distinct airport codes on each side of a flight.
n_origins = data['ORIGIN'].nunique()
n_dests = data['DEST'].nunique()
print(f'Number of Origin airports is {n_origins}')
print(f'Number of Dest airports is {n_dests}')

- We are going to build a summary dataframe for each of 'ORIGIN' and 'DEST', because a plot would be unreadable with so many airports.

In [None]:
# Per-origin summary: number of flights, number delayed, and the delayed
# percentage, ordered from the highest delay percentage to the lowest.
origins = (
    data.groupby('ORIGIN')['is_delay']
    .agg(flights='count', delayed='sum')
    .assign(delayed_perc=lambda stats: 100 * stats['delayed'] / stats['flights'])
    .reset_index()
    .sort_values(by='delayed_perc', ascending=False, ignore_index=True)
)

In [None]:
origins.head(10)

In [None]:
# Per-destination summary: number of flights, number delayed, and the delayed
# percentage, ordered from the highest delay percentage to the lowest.
dests = (
    data.groupby('DEST')['is_delay']
    .agg(flights='count', delayed='sum')
    .assign(delayed_perc=lambda stats: 100 * stats['delayed'] / stats['flights'])
    .reset_index()
    .sort_values(by='delayed_perc', ascending=False, ignore_index=True)
)

In [None]:
dests.head(10)

- The highest share of delayed flights occurs at HGR airport.

- There are 350 airports for both 'DEST' and 'ORIGIN'; one-hot encoding them would create 700 new features, so we will keep only the top 10 airports (by number of flights) and set the rest to 'others'.

In [None]:
# Re-order both summaries by traffic volume, busiest airport first.
origins = origins.sort_values(by='flights', ascending=False, ignore_index=True)
dests = dests.sort_values(by='flights', ascending=False, ignore_index=True)

In [None]:
origins.head(10)

In [None]:
dests.head(10)

- Top airports for 'dest' and 'origin' are:  ['ATL', 'ORD', 'DFW', 'DEN', 'CLT', 'LAX', 'PHX', 'IAH', 'LAS', 'SFO']

In [None]:
airports = ['ATL', 'ORD', 'DFW', 'DEN', 'CLT', 'LAX', 'PHX', 'IAH', 'LAS', 'SFO']

# Collapse every airport outside the list above into a single 'others' bucket.
# The assignment goes through one .loc[row_mask, column] call on `data` itself:
# the chained form data[col].loc[mask] = ... writes to an intermediate Series,
# raises SettingWithCopyWarning, and has no effect on `data` under pandas
# Copy-on-Write.
for col in ('ORIGIN', 'DEST'):
    data.loc[~data[col].isin(airports), col] = 'others'

In [None]:
# Show the airport codes that remain after bucketing.
origin_values = data['ORIGIN'].unique()
dest_values = data['DEST'].unique()
print(f'values in \'ORIGIN\': {origin_values}')
print(f'values in \'DEST\': {dest_values}')

In [None]:
sns.lmplot( x="is_delay", y="DISTANCE", data=data, fit_reg=False, hue='is_delay', legend=False)

- Delay happens in both short and long distances.

In [None]:
sns.countplot(x='OP_UNIQUE_CARRIER', hue='is_delay', data=data)

## Check the features
- Drop unnecessary ones if they exist.
- Encoding
- Scale if needed.

In [None]:
# Check column types
data.dtypes

In [None]:
data.describe()

- Distance range is high ---> needs to be scaled.
- DEP_TIME is stored as hhmm ---> keep only the hour of the day.

In [None]:
# Standardise DISTANCE to zero mean and unit variance.
data['DISTANCE'] = (data['DISTANCE'] - data['DISTANCE'].mean()) / data['DISTANCE'].std()
# Keep only the hour from the hhmm departure time. The modulo folds a 2400
# reading into hour 0 so the feature stays in 0-23.
# NOTE(review): assumes midnight may be encoded as 2400 in this file - confirm.
data['DEP_TIME'] = (data['DEP_TIME'] // 100) % 24

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale only the continuous features to [0, 1]. DAY_OF_WEEK and DAY_OF_MONTH
# are converted to categories and one-hot encoded further down, so scaling them
# had no effect on the model and only turned their category labels into
# fractions.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features - consider fitting on
# the training split only.
to_scale = ['DISTANCE', 'DEP_TIME']
scaler = MinMaxScaler()
data[to_scale] = scaler.fit_transform(data[to_scale])

In [None]:
data.describe()

In [None]:
# MONTH is the same for every sample (Feb), so it is dropped.
data = data.drop(columns=['MONTH'])

# Features handled as categorical; the target 'is_delay' is deliberately not
# part of this list.
categorical_columns = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',
                       'ORIGIN', 'DEST']

# Cast all of them to the categorical dtype in a single call.
data = data.astype({name: 'category' for name in categorical_columns})

In [None]:
data.dtypes

In [None]:
# One-hot encode the categorical features (first level of each dropped), then
# swap the original categorical columns for their indicator columns.
data_dummies = pd.get_dummies(data[categorical_columns], drop_first=True)
data = pd.concat([data.drop(columns=categorical_columns), data_dummies], axis=1)

In [None]:
data.columns

# Build the Baseline Model

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Extract the target column: pop() returns it and removes it from the feature
# frame in one step.
target = data.pop('is_delay')

### Split into train and test sets

In [None]:
# Split the dataset in the ratio train 80% and test 20%.
# stratify=target keeps the delay / no-delay proportions the same in both
# parts, which matters because the classes are highly imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.20, random_state=95, stratify=target
)

## 1 - Logistic Regression

In [None]:
# Baseline 1: logistic regression with library defaults, fitted on the
# training split and used to predict the test split.
logReg = LogisticRegression().fit(x_train, y_train)
y_pred_logReg = logReg.predict(x_test)

## 2- Random Forest

In [None]:
# Baseline 2: random forest with default hyperparameters.
# The seed is fixed so the baseline scores are reproducible across
# Restart & Run All.
randForest = RandomForestClassifier(random_state=95)
randForest.fit(x_train, y_train)
y_pred_randForest = randForest.predict(x_test)

## 3- XGBoost

In [None]:
# Baseline 3: gradient-boosted trees, fitted on the training split and used to
# predict the test split.
# NOTE(review): `use_label_encoder` may be deprecated or ignored depending on
# the installed XGBoost version - confirm before upgrading.
xgb = XGBClassifier(use_label_encoder=False).fit(x_train, y_train)
y_pred_xgb = xgb.predict(x_test)

In [None]:
# Report classification metrics and plot the confusion matrix
def evaluate_model(labels, preds):
    """Print classification metrics and draw the confusion matrix.

    Parameters
    ----------
    labels : array-like
        Ground-truth binary class labels.
    preds : array-like
        Predicted class labels, in the same order and of the same length as
        ``labels``.

    Prints accuracy, ROC AUC, precision, recall and F1 (binary averaging) and
    draws a heatmap of the confusion matrix. Returns nothing.
    """
    # Work on plain arrays so the two inputs are always compared position by
    # position, whatever pandas index either of them carries.
    labels = np.asarray(labels)
    preds = np.asarray(preds)

    accuracy = (preds == labels).sum() / preds.shape[0]
    print(f'Accuracy: {accuracy}')

    # AUC is computed from the values in `preds` as given, not from
    # predicted probabilities.
    auc = roc_auc_score(labels, preds)
    print(f'AUC     : {auc}')

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1_score: {f1}')

    confusion_matrix = pd.crosstab(index=labels, columns=np.round(preds),
                                   rownames=['True'], colnames=['predictions']).astype(int)
    plt.figure(figsize=(5, 5))
    # The cells are integer counts, so annotate them as integers.
    sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap="YlGnBu").set_title('Confusion Matrix')

In [None]:
# evaluate_model expects (labels, preds): true labels first, predictions second.
evaluate_model(y_test, y_pred_logReg)

In [None]:
# evaluate_model expects (labels, preds): true labels first, predictions second.
evaluate_model(y_test, y_pred_randForest)

In [None]:
# evaluate_model expects (labels, preds): true labels first, predictions second.
evaluate_model(y_test, y_pred_xgb)

### Since the data is imbalanced, we will look at the F1-score.
* The F1-score of the RandomForest model is the highest, so we will go with this model.

# Model Tuning

### We will use the **Grid Search** algorithm to tune the hyperparameters

### Most important hyperparameters of Random Forest:

* n_estimators = number of trees, larger --> more complex.
* max_features =  number of maximum features provided to each tree, the default value is the best 'square root of the number of features'.
* max_depth = max number of levels in each decision tree, if it's too large --> overfitting.
* min_samples_split = min number of data points placed in a node before the node is split, larger values prevent overfitting.
* max_leaf_nodes = number of leaf nodes, very small --> underfitting, and very large --> overfitting.
* min_samples_leaf = min number of data points allowed in a leaf node, very large --> underfitting, and very small --> overfitting.
* bootstrap = method for sampling data points (with or without replacement)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

# Rank candidates by F1, since the classes are imbalanced.
f1_scorer = make_scorer(f1_score, greater_is_better=True)

# 2 x 2 x 3 x 2 = 24 hyperparameter combinations.
param_grid = [
{'n_estimators': [10, 25],
 'max_depth': [10, 50],
 'min_samples_split': [5, 10, 15],
 'bootstrap': [True, False]}
]

# A seeded forest makes the search reproducible; n_jobs=-1 spreads the
# 24 x 10 cross-validation fits over all available cores.
grid_search_forest = GridSearchCV(
    RandomForestClassifier(random_state=95),
    param_grid,
    cv=10,
    scoring=f1_scorer,
    n_jobs=-1,
)
grid_search_forest.fit(x_train, y_train)

In [None]:
# Print the mean cross-validated score next to each parameter combination.
cvres = grid_search_forest.cv_results_

for position, params in enumerate(cvres["params"]):
    mean_score = cvres["mean_test_score"][position]
    print(mean_score, params)

In [None]:
grid_search_forest.best_estimator_

In [None]:
grid_search_forest.best_score_

In [None]:
# Predict the test split with the best estimator found by the grid search.
grid_best = grid_search_forest.best_estimator_.predict(x_test)
# evaluate_model expects (labels, preds): true labels first, predictions second.
evaluate_model(y_test, grid_best)

## Final Notes:
- The final F1-score increased slightly; increasing the number of trees (n_estimators) may improve it further.
- Adding more features, such as weather, would likely improve performance, but they are not available right now.