# Import Libraries

In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

# Show data information

In [83]:
dec_2021_flights = pd.read_csv('december_2021_flights.csv')
print(dec_2021_flights.head())
print(dec_2021_flights.describe())
print(dec_2021_flights.info())
print(dec_2021_flights.nunique())

   YEAR  MONTH  DAY_OF_MONTH  DAY_OF_WEEK                FL_DATE  \
0  2021     12             7            2  12/7/2021 12:00:00 AM   
1  2021     12             7            2  12/7/2021 12:00:00 AM   
2  2021     12             7            2  12/7/2021 12:00:00 AM   
3  2021     12             7            2  12/7/2021 12:00:00 AM   
4  2021     12             7            2  12/7/2021 12:00:00 AM   

  OP_UNIQUE_CARRIER ORIGIN DEST  CRS_DEP_TIME  DEP_TIME  DEP_DELAY_NEW  \
0                WN    DEN  MDW           650     650.0            0.0   
1                WN    DEN  MDW          1520    1613.0           53.0   
2                WN    DEN  MEM          1330    1429.0           59.0   
3                WN    DEN  MIA          1800    1901.0           61.0   
4                WN    DEN  MIA          1045    1120.0           35.0   

   DEP_DEL15  CRS_ARR_TIME  ARR_TIME  ARR_DELAY_NEW  ARR_DEL15  \
0        0.0          1010     956.0            0.0        0.0   
1        1.0  

# Clean and alter data

In [84]:
dec_2021_flights = dec_2021_flights.drop([
    'YEAR',
    'MONTH',
    'FL_DATE',
    'DEP_TIME',
    "ARR_TIME",
    'DISTANCE', 
    'ARR_DELAY_NEW', 
    'CRS_ELAPSED_TIME',
    'CRS_ARR_TIME',
    'ARR_DEL15',
    'DISTANCE_GROUP',
    'DEP_DEL15'
], axis=1)

dec_2021_flights['DAY_OF_WEEK'] = dec_2021_flights['DAY_OF_WEEK'].map({
    1:"Sunday",
    2:"Monday",
    3:"Tuesday",
    4:"Wednesday",
    5:"Thursday",
    6:"Friday",
    7:"Saturday"
})

dec_2021_flights = dec_2021_flights.rename(columns={
    "OP_UNIQUE_CARRIER":"AIRLINE",
    "CRS_DEP_TIME":"DEPARTURE",
})

dec_2021_flights['DELAYED'] = dec_2021_flights['DEP_DELAY_NEW'] > 30

dec_2021_flights = dec_2021_flights.drop([
    'DEP_DELAY_NEW'
], axis=1)

dec_2021_flights = dec_2021_flights[
    (dec_2021_flights["ORIGIN"] == "ORD") |
    (dec_2021_flights["ORIGIN"] == "MKE") |
    (dec_2021_flights["ORIGIN"] == "MDW")
]

dec_2021_flights = dec_2021_flights[
    (dec_2021_flights["DEST"] == "TPA") |
    (dec_2021_flights["DEST"] == "MIA") |
    (dec_2021_flights["DEST"] == "MCO") |
    (dec_2021_flights["DEST"] == "SRQ")
]


print(dec_2021_flights.info())




<class 'pandas.core.frame.DataFrame'>
Int64Index: 1989 entries, 930 to 580177
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DAY_OF_MONTH  1989 non-null   int64 
 1   DAY_OF_WEEK   1989 non-null   object
 2   AIRLINE       1989 non-null   object
 3   ORIGIN        1989 non-null   object
 4   DEST          1989 non-null   object
 5   DEPARTURE     1989 non-null   int64 
 6   DELAYED       1989 non-null   bool  
dtypes: bool(1), int64(2), object(4)
memory usage: 110.7+ KB
None


# Train / Test split

In [85]:
delays = dec_2021_flights['DELAYED'].to_numpy().astype(int)

features = dec_2021_flights.drop(["DELAYED"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, delays, test_size=0.3, random_state=42, stratify=delays)

# Preprocess the Data

In [86]:
# Convert the categorical features in the train and test sets independently
# this improves computation time. There are also some models that require strictly numeric format
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# we need to reindex the columns in case features appeared in the test set that were not in the train set
# set the fill value to 0
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0,1))
rescaled_X_train = scaler.fit_transform(X_train)
rescaled_X_test = scaler.transform(X_test)

# Train Logistic Regression Model

In [87]:
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression(max_iter=100, tol=0.1)

# fit the logistic Regression model
logreg.fit(rescaled_X_train, y_train)


LogisticRegression(tol=0.1)

# Model Accuracy

In [88]:
# predict using logistic regression
y_pred_test_logreg = logreg.predict(rescaled_X_test)

print("Accuracy of logistic regression classifier (Test): ",accuracy_score(y_test, y_pred_test_logreg))


Accuracy of logistic regression classifier (Test):  0.8659966499162479


# Confusion Matrix

In [89]:
# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred_test_logreg))


[[517   0]
 [ 80   0]]


# Consider Class Imbalance of Delayed Column
- Upsample dataframe where delayed is true
- re-create or logistic regression model
- Accuracy

In [90]:
# Sepearate records where the flights were delayed from flights that were on time
delayed_true = dec_2021_flights[dec_2021_flights['DELAYED'] == True]
delayed_false = dec_2021_flights[dec_2021_flights['DELAYED'] == False]

# Get size of delayed_false
num_samples = delayed_false.shape[0]
print(num_samples)

delayed_true_upsampled = resample(
    delayed_true,
    replace=True,             # sample with replacement
    n_samples=num_samples,    # to match majority class
    random_state=123          # reproducible results
) 

dec_2021_flights_upsampled = pd.concat([delayed_false, delayed_true_upsampled])

1723


# Train / Test Split (Upsampled)

In [91]:
delays = dec_2021_flights_upsampled['DELAYED'].to_numpy().astype(int)

features = dec_2021_flights_upsampled.drop(["DELAYED"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, delays, test_size=0.3, random_state=42, stratify=delays)

# Preprocess the Data (Upsampled)

In [92]:
# Convert the categorical features in the train and test sets independently
# this improves computation time. There are also some models that require strictly numeric format
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# we need to reindex the columns in case features appeared in the test set that were not in the train set
# set the fill value to 0
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0,1))
rescaled_X_train = scaler.fit_transform(X_train)
rescaled_X_test = scaler.transform(X_test)

# Train Logistic Regression Model (Upsampled)

In [93]:
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression(max_iter=100, tol=0.1)

# fit the logistic Regression model
logreg.fit(rescaled_X_train, y_train)


LogisticRegression(tol=0.1)

# Model Accuracy (Upsampled)

In [94]:
# predict using logistic regression
y_pred_test_logreg = logreg.predict(rescaled_X_test)

print("UPSAMPLED Accuracy of logistic regression classifier (Test) : ",accuracy_score(y_test, y_pred_test_logreg))

UPSAMPLED Accuracy of logistic regression classifier (Test) :  0.6121856866537717


In [95]:
# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred_test_logreg))

[[316 201]
 [200 317]]
