# Predictions

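This notebook trains and evaluates models that predict flight arrival delays using only information known before departure: regression models for the size of the delay in minutes, and classification models for whether a flight arrives on time (within a 15-minute threshold).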

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, roc_auc_score

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier

# from helper_functions import 

### Loading the Data

In [2]:
# Read the three input tables; every path is relative to the working directory.
airlines, airports, flights = (
    pd.read_csv(csv_path)
    for csv_path in ("airlines.csv", "airports.csv", "filtered_flights.csv")
)

### Problem Setup and Feature Selection

In [3]:
# We split the prediction problem into two different ones, namely a regression problem
# and a classification problem. The regression problem looks at how much a given flight
# will be delayed, and the classification problem looks at whether the flight will be
# delayed or not (with a 15 minute threshold).

In [4]:
# Cancelled flights cannot be included in the regression because the target variable
# is undefined, but they still contain valuable information and will instead be handled in
# a separate cancellation classification model.

# Therefore, the regression analysis looks at the delay given that the plane has taken off.

In [5]:
# Only variables that are known before the plane takes off are kept, i.e. everything
# that is planned. Using e.g. the actual time of takeoff to predict a delay would not
# make sense, because the goal is to simulate a prediction made before departure.
# The columns below are therefore removed.
remove_cols = [
    "DEPARTURE_TIME", "DEPARTURE_DELAY", "TAXI_OUT", "WHEELS_OFF", "ELAPSED_TIME",
    "AIR_TIME", "WHEELS_ON", "TAXI_IN", "ARRIVAL_TIME", "AIR_SYSTEM_DELAY",
    "SECURITY_DELAY", "AIRLINE_DELAY", "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY",
]
# FLIGHT_NUMBER and TAIL_NUMBER are removed as well: they have too many unique
# values to encode them meaningfully.
remove_cols.extend(["FLIGHT_NUMBER", "TAIL_NUMBER"])

# `flights` is overwritten in place, so a second run of this cell would find the
# columns already gone and raise a KeyError. The drop is skipped only when *none* of
# the columns is left (the cell has already run). If just some of them are missing,
# `drop` still raises, so a typo or an unexpected schema is not silently ignored.
if any(col in flights.columns for col in remove_cols):
    flights = flights.drop(columns=remove_cols)

### Feature Engineering

In [6]:
# Convert SCHEDULED_DEPARTURE and SCHEDULED_ARRIVAL to hour of day by dividing by 100
# and flooring (presumably the raw values are HHMM-encoded, e.g. 1430 -> 14; verify
# against the data dictionary).
#
# The columns are overwritten in place, so an unguarded second run of this cell would
# divide the hours by 100 again and silently turn every value into 0. An hour of day
# never exceeds 24, so a column whose maximum is above 24 has not been converted yet;
# a column that is already converted is left untouched.
for time_col in ("SCHEDULED_DEPARTURE", "SCHEDULED_ARRIVAL"):
    if flights[time_col].max() > 24:
        flights[time_col] = np.floor(flights[time_col] / 100).astype(int)

In [7]:
# Regression target: the arrival delay in minutes.
target_reg = "ARRIVAL_DELAY"
# Classification target: 1 if the flight arrived on time, 0 otherwise.
target_cla = "arr_on_time"

# A flight counts as "on time" when its arrival delay is at most this many minutes.
on_time_threshold = 15

# Build the on-time indicator. A missing arrival delay compares as False, so such
# rows end up as 0 (not on time).
flights[target_cla] = flights[target_reg].le(on_time_threshold).astype(int)

In [8]:
# Separate the two targets from the features, then split all three with one call so
# that features and both targets share the same 70/30 row partition.
y_reg = flights[target_reg]
y_cla = flights[target_cla]
X_full = flights.drop(columns=[target_reg, target_cla])

splits = train_test_split(X_full, y_reg, y_cla, test_size=0.3, random_state=42)
X_train, X_test, y_reg_train, y_reg_test, y_cla_train, y_cla_test = splits

In [9]:
# Preprocessing: the three categorical columns are one-hot encoded; every other
# feature column is treated as numeric and passed through unchanged.
categorical_variables = ["AIRLINE", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT"]
numeric_variables = list(X_train.columns[~X_train.columns.isin(categorical_variables)])

# handle_unknown="ignore" keeps transform from failing on categories that were not
# seen during fit; sparse_output=False makes the encoder return a dense array.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_variables),
        ("cat", one_hot, categorical_variables),
    ]
)

### Models

In [10]:
# Boolean masks selecting the rows whose regression target is present; only these
# rows are used to fit and to score the regression models.
mask_reg_train = ~y_reg_train.isna()
mask_reg_test = ~y_reg_test.isna()

In [16]:
# Hyperparameters shared by the regressor and the classifier of each model family.
# NOTE(review): n_estimators=1 means a single tree / a single boosting stage;
# presumably a quick-run setting - confirm before reporting results.
forest_params = dict(n_estimators=1, max_depth=10, random_state=42)
boosting_params = dict(n_estimators=1, learning_rate=0.05, random_state=42)

# Regression models, keyed by the display name used in the results.
reg_models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(**forest_params),
    "Gradient Boosting": GradientBoostingRegressor(**boosting_params),
}

# Classification models, keyed by the display name used in the results.
# Logistic Regression is left out for now: LogisticRegression(max_iter=1000,
# penalty=None) did not converge.
cla_models = {
    "Random Forest": RandomForestClassifier(**forest_params),
    "Gradient Boosting": GradientBoostingClassifier(**boosting_params),
}

In [12]:
# Fit every regression model and store its test-set RMSE under the model's name.
results_reg = {}

# The masked subsets are the same for every model, so they are built once up front.
X_reg_train, y_reg_fit = X_train[mask_reg_train], y_reg_train[mask_reg_train]
X_reg_test, y_reg_eval = X_test[mask_reg_test], y_reg_test[mask_reg_test]

for name, model in tqdm(reg_models.items(), desc="Training models"):
    # The same `preprocessor` object is used in every pipeline; Pipeline does not
    # clone its steps, so it is simply refit on each iteration.
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipeline.fit(X_reg_train, y_reg_fit)

    y_pred = pipeline.predict(X_reg_test)
    rmse = np.sqrt(mean_squared_error(y_reg_eval, y_pred))
    results_reg[name] = rmse

Training models: 100%|██████████| 3/3 [00:15<00:00,  5.23s/it]


In [17]:
# Fit every classification model on the full training split and store its test-set
# ROC AUC under the model's name.
results_cla = {}

for name, model in tqdm(cla_models.items(), desc="Training models"):
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    # `fit` returns the fitted pipeline, so scoring can be chained directly.
    # Column 1 of predict_proba is the probability of label 1 (flight on time).
    y_pred = pipeline.fit(X_train, y_cla_train).predict_proba(X_test)[:, 1]

    roc_auc = roc_auc_score(y_cla_test, y_pred)
    results_cla[name] = roc_auc

Training models: 100%|██████████| 2/2 [00:06<00:00,  3.22s/it]


In [18]:
# Test-set RMSE of each regression model, keyed by model name (lower is better).
results_reg

{'Linear Regression': np.float64(36.65143366698413),
 'Random Forest': np.float64(36.75753559663987),
 'Gradient Boosting': np.float64(37.08785044543497)}

In [19]:
# Test-set ROC AUC of each classification model, keyed by model name (higher is better).
results_cla

{'Random Forest': 0.6150765134682357, 'Gradient Boosting': 0.621444877847806}