# Scope of this workbook

Here, we want to actually model the whole thing for different datasets. 

In [139]:
# Imports
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


#Settings
pd.set_option("display.max_rows", 70)
pd.set_option("display.max_columns", 101)

# full_df_unedited

In [140]:
full_df_unedited = pd.read_pickle('../data/processed/full_df_unedited.pkl')

In [141]:
full_df_unedited.head()

Unnamed: 0,dep_ap_sched,arr_ap_sched,dep_sched_date,dep_sched_time,arr_sched_date,arr_sched_time,m_offblockdt,m_onblockdt,ac_registration_x,dep_delay,Ac Type Code,trans_time,sched_trans_time,Crew Group,TLC_trans,crew_type_change,Sched Groundtime,Act Groundtime,mingt,arr_leg_outbound,catering_duration,pax_boarding_duration,block_time,flt_event_number
4,East Melissaberg,East Carmen,2019-06-01,2019-06-01 02:30:00,2019-06-01,2019-06-01 04:15:00,2019-06-01 03:02:00,2019-06-01 04:43:00,ECLGNX,32.0,DH4,0,0,Start,"['Renee Fisher_nan_nan_nan_ca', 'Rebecca Castr...",[],35.0,21.0,35.0,Keithberg,26.0,26.0,101.0,1
7,East Latashaview,East Carmen,2019-06-01,2019-06-01 03:15:00,2019-06-01,2019-06-01 07:05:00,2019-06-01 03:30:00,2019-06-01 07:07:00,ECLBIX,15.0,320,0,0,Start,"['Nicholas Evans_nan_nan_nan_ca', 'Jessica Her...",[],45.0,61.0,40.0,Juliemouth,27.0,15.0,217.0,1
9,New Jessica,East Carmen,2019-06-01,2019-06-01 03:25:00,2019-06-01,2019-06-01 06:45:00,2019-06-01 03:50:00,2019-06-01 07:01:00,ECLBAX,25.0,320,0,0,Start,"['Sean Weeks_nan_nan_nan_ca', 'Tony Lloyd_nan_...",[],95.0,94.0,45.0,South Nathaniel,27.0,25.0,191.0,1
13,East Allisontown,East Carmen,2019-06-01,2019-06-01 04:00:00,2019-06-01,2019-06-01 04:35:00,2019-06-01 04:04:00,2019-06-01 04:36:00,ECLWFX,4.0,E95,0,0,Start,"['Frederick Ramirez_nan_nan_nan_cp', 'Ariel Wi...",[],55.0,120.0,40.0,Yoderburgh,18.0,11.0,32.0,1
15,Port Courtneytown,East Carmen,2019-06-01,2019-06-01 04:00:00,2019-06-01,2019-06-01 04:35:00,2019-06-01 04:14:00,2019-06-01 04:57:00,ECLGBX,14.0,DH4,0,0,Start,"['Heather Ryan_nan_nan_nan_ca', 'Jeff Hays_nan...",[],35.0,23.0,30.0,West Ana,15.0,12.0,43.0,1


In [142]:
# For the moment, let's in this step drop all variables we will not use. ToDo: Check which step is the best step to do that
columns_to_drop = [
    'TLC_trans',
    'catering_duration',
    'dep_sched_date', # the date itself has no real information. ToDo: Maybe we should extract the day of week here
    'dep_sched_time', # the time has a value but is dropped for the moment due to formatting. 
    'arr_sched_date',
    'arr_sched_time',
    'm_offblockdt',
    'm_onblockdt'
]

full_df_unedited = full_df_unedited.drop(columns_to_drop, axis = 1)

full_df_unedited = full_df_unedited[full_df_unedited['Act Groundtime']<180]

In [143]:
# Creating dummy variables for all categorical variables

# Note: Onehotencoder is the better solution, however for simplicity let's use pandas for the moment
#full_df_unedited_encoded = OneHotEncoder().fit_transform(full_df_unedited)
#full_df_unedited

# Get object columns
full_df_unedited_objectcolumns = full_df_unedited.select_dtypes(include = 'object')
varlist = full_df_unedited_objectcolumns.columns.values.tolist()

#get dummies
full_df_unedited_encoded = pd.get_dummies(full_df_unedited, columns = varlist, drop_first = True)

In [144]:
full_df_unedited_encoded.shape

(3837, 304)

In [145]:
# We split the forecast in two different forecasts - one for ground time and one for block time

X_train_block, X_test_block, y_train_block, y_test_block = train_test_split(full_df_unedited_encoded.drop(['block_time'], axis = 1), full_df_unedited_encoded['block_time'], test_size=0.33, random_state=42)
X_train_ground, X_test_ground, y_train_ground, y_test_ground = train_test_split(full_df_unedited_encoded.drop(['Act Groundtime'], axis = 1), full_df_unedited_encoded['Act Groundtime'], test_size=0.33, random_state=42)

In [146]:
# Gradientboosting as base classifier

# Blocktime
base_model_block = GradientBoostingRegressor()

base_model_block.fit(X_train_block, y_train_block)
base_model_block.predict(X_test_block)


# Groundtime

base_model_ground = GradientBoostingRegressor()

base_model_ground.fit(X_train_ground, y_train_ground)
base_model_ground.predict(X_test_ground)


array([  14.98141688,   90.31005136,   61.83242638, ...,   62.03673736,
       -208.98147572,   78.65529136])

In [147]:
# Blocktime
r2_block = base_model_block.score(X_test_block, y_test_block)
rmse_block = np.sqrt(mean_squared_error(y_test_block, base_model_block.predict(X_test_block)))
print("The r^2 for Block time is " + str(round(r2_block,4)))
print("The RMSE for Block time is " + str(round(rmse_block,2)) + " minutes.")
print("\n")

# Groundtime
r2_ground = base_model_ground.score(X_test_ground, y_test_ground)
rmse_ground = np.sqrt(mean_squared_error(y_test_ground, base_model_ground.predict(X_test_ground)))
print("The r^2 for Ground time is " + str(round(r2_ground,4)))
print("The RMSE for Ground time is " + str(round(rmse_ground,2)) + " minutes.")

The r^2 for Block time is 0.7946
The RMSE for Block time is 18.24 minutes.


The r^2 for Ground time is 0.8261
The RMSE for Ground time is 23.73 minutes.


# full_df_unedited with delay prediction

In [148]:
full_df = pd.read_pickle('../data/processed/full_df_unedited_wdelay.pkl')
full_df.head()

Unnamed: 0,dep_ap_sched,arr_ap_sched,dep_sched_date,dep_sched_time,arr_sched_date,arr_sched_time,m_offblockdt,m_onblockdt,ac_registration_x,dep_delay,Ac Type Code,trans_time,sched_trans_time,Crew Group,TLC_trans,crew_type_change,Sched Groundtime,Act Groundtime,mingt,arr_leg_outbound,catering_duration,pax_boarding_duration,block_time,flt_event_number,block_delay,ground_delay
4,East Melissaberg,East Carmen,2019-06-01,2019-06-01 02:30:00,2019-06-01,2019-06-01 04:15:00,2019-06-01 03:02:00,2019-06-01 04:43:00,ECLGNX,32.0,DH4,0,0,Start,"['Renee Fisher_nan_nan_nan_ca', 'Rebecca Castr...",[],35.0,21.0,35.0,Keithberg,26.0,26.0,101.0,1,-4.0,32.0
7,East Latashaview,East Carmen,2019-06-01,2019-06-01 03:15:00,2019-06-01,2019-06-01 07:05:00,2019-06-01 03:30:00,2019-06-01 07:07:00,ECLBIX,15.0,320,0,0,Start,"['Nicholas Evans_nan_nan_nan_ca', 'Jessica Her...",[],45.0,61.0,40.0,Juliemouth,27.0,15.0,217.0,1,-13.0,15.0
9,New Jessica,East Carmen,2019-06-01,2019-06-01 03:25:00,2019-06-01,2019-06-01 06:45:00,2019-06-01 03:50:00,2019-06-01 07:01:00,ECLBAX,25.0,320,0,0,Start,"['Sean Weeks_nan_nan_nan_ca', 'Tony Lloyd_nan_...",[],95.0,94.0,45.0,South Nathaniel,27.0,25.0,191.0,1,-9.0,25.0
13,East Allisontown,East Carmen,2019-06-01,2019-06-01 04:00:00,2019-06-01,2019-06-01 04:35:00,2019-06-01 04:04:00,2019-06-01 04:36:00,ECLWFX,4.0,E95,0,0,Start,"['Frederick Ramirez_nan_nan_nan_cp', 'Ariel Wi...",[],55.0,120.0,40.0,Yoderburgh,18.0,11.0,32.0,1,-3.0,4.0
15,Port Courtneytown,East Carmen,2019-06-01,2019-06-01 04:00:00,2019-06-01,2019-06-01 04:35:00,2019-06-01 04:14:00,2019-06-01 04:57:00,ECLGBX,14.0,DH4,0,0,Start,"['Heather Ryan_nan_nan_nan_ca', 'Jeff Hays_nan...",[],35.0,23.0,30.0,West Ana,15.0,12.0,43.0,1,8.0,14.0


We repeat the basic experiment, but this time we do delay prediction instead of ground/block time prediction

In [149]:
# For the moment, let's in this step drop all variables we will not use. ToDo: Check which step is the best step to do that
columns_to_drop_delaypred = [
    'TLC_trans',
    'catering_duration',
    'dep_sched_date', # the date itself has no real information. ToDo: Maybe we should extract the day of week here
    'dep_sched_time', # the time has a value but is dropped for the moment due to formatting. 
    'arr_sched_date',
    'arr_sched_time',
    'm_offblockdt',
    'm_onblockdt',
    # to ensure proper comparison in between ground/block time and ground/block delay prediction, we drop the targets of ground/block time prediction. 
    'Act Groundtime',
    #'block_time'
]

full_df = full_df[full_df['Act Groundtime']<180]


full_df = full_df.drop(columns_to_drop_delaypred, axis = 1)

In [150]:
# Creating dummy variables for all categorical variables
# Note: Onehotencoder is the better solution, however for simplicity let's use pandas for the moment

# Get object columns
full_df_objectcolumns = full_df.select_dtypes(include = 'object')
varlist = full_df_objectcolumns.columns.values.tolist()

# get dummies
full_df_encoded = pd.get_dummies(full_df, columns = varlist, drop_first = True)

In [151]:
# We split the forecast in two different forecasts - one for ground delay and one for block delay

X_train_blockdelay, X_test_blockdelay, y_train_blockdelay, y_test_blockdelay = train_test_split(
    full_df_encoded.drop(['block_delay'], axis = 1), full_df_encoded['block_delay'], test_size=0.33, random_state=42)
    
X_train_grounddelay, X_test_grounddelay, y_train_grounddelay, y_test_grounddelay = train_test_split(
    full_df_encoded.drop(['ground_delay'], axis = 1), full_df_encoded['ground_delay'], test_size=0.33, random_state=42)


In [152]:
# Gradientboosting as base classifier

# Blocktime
base_model_blockdelay = GradientBoostingRegressor()

base_model_blockdelay.fit(X_train_blockdelay, y_train_blockdelay)
base_model_blockdelay.predict(X_test_blockdelay)


# Groundtime

base_model_grounddelay = GradientBoostingRegressor()

base_model_grounddelay.fit(X_train_grounddelay, y_train_grounddelay)
base_model_grounddelay.predict(X_test_grounddelay)


array([1.15816732e+02, 1.44612806e-02, 6.59940396e+01, ...,
       3.39997679e+01, 1.99985403e+01, 1.09975256e+01])

In [153]:
# Blocktime
r2_blockdelay = base_model_blockdelay.score(X_test_blockdelay, y_test_blockdelay)
rmse_blockdelay = np.sqrt(mean_squared_error(y_test_blockdelay, base_model_blockdelay.predict(X_test_blockdelay)))
print("The r^2 for Block delay is " + str(round(r2_blockdelay,4)))
print("The RMSE for Block delay is " + str(round(rmse_blockdelay,2)) + " minutes.")
print("\n")

# Groundtime
r2_grounddelay = base_model_grounddelay.score(X_test_grounddelay, y_test_grounddelay)
rmse_grounddelay = np.sqrt(mean_squared_error(y_test_grounddelay, base_model_grounddelay.predict(X_test_grounddelay)))
print("The r^2 for Ground delay is " + str(round(r2_grounddelay,4)))
print("The RMSE for Ground delay is " + str(round(rmse_grounddelay,2)) + " minutes.")

The r^2 for Block delay is 0.337
The RMSE for Block delay is 5.52 minutes.


The r^2 for Ground delay is 0.9344
The RMSE for Ground delay is 7.78 minutes.


# Processed Dataset

In [154]:
full_df = pd.read_pickle('../data/finalized/full_df.pkl')

#columns_to_drop_full_df = [
#    #'TLC_trans',
#    'catering_duration',
#    'dep_sched_date', # the date itself has no real information. ToDo: Maybe we should extract the day of week here
#    'dep_sched_time', # the time has a value but is dropped for the moment due to formatting. 
#    'arr_sched_date',
#    'arr_sched_time',
#    'm_offblockdt',
#    'm_onblockdt',
#    # to ensure proper comparison in between ground/block time and ground/block delay prediction, we drop the targets of ground/block time prediction. 
#    'Act Groundtime',
#    #'block_time'
#]

#full_df = full_df.drop(columns_to_drop_full_df, axis = 1)
#full_df = full_df.select_dtypes(exclude=['datetime64'])

full_df = full_df.dropna(how = 'any')
full_df

Unnamed: 0,dep_ap_sched,arr_ap_sched,ac_registration_x,Ac Type Code,trans_time,sched_trans_time,Sched Groundtime,Act Groundtime,mingt,dep_leg_inbound,arr_leg_outbound,sched_turnaround,catering_duration,block_time,flt_event_number,block_delay,ground_delay,rows_to_drop_grounddelay
0,New Jessica,East Carmen,ECLBAX,320,0,0,95.0,94.0,45.0,New Jessica,South Nathaniel,95.0,27.0,191.0,1,-9.0,25.0,0
2,South Nathaniel,East Carmen,ECLBAX,320,68,75,80.0,86.0,45.0,South Nathaniel,Joneshaven,80.0,25.0,131.0,3,-14.0,51.0,0
6,Joneshaven,East Carmen,ECLBAX,320,39,45,50.0,65.0,45.0,Joneshaven,Marioborough,50.0,20.0,78.0,2,-7.0,15.0,0
8,Marioborough,East Carmen,ECLBAX,320,53,45,45.0,46.0,45.0,Marioborough,Desireeton,45.0,17.0,73.0,4,-7.0,57.0,0
12,Lake Lawrencechester,East Carmen,ECLBAX,320,54,70,95.0,122.0,45.0,Lake Lawrencechester,Desireeton,95.0,20.0,81.0,2,-4.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12162,Port Amberfort,East Carmen,ECLXEX,320,46,45,55.0,91.0,40.0,Port Amberfort,Susanmouth,55.0,18.0,119.0,2,-16.0,4.0,0
12164,Susanmouth,East Carmen,ECLXEX,320,31,45,50.0,93.0,40.0,Susanmouth,Susanmouth,50.0,19.0,83.0,4,-12.0,3.0,0
12168,Davidtown,East Carmen,ECLXEX,320,43,50,50.0,70.0,40.0,Davidtown,Susanmouth,50.0,25.0,104.0,2,-11.0,13.0,0
12174,West Danielport,East Carmen,ECLXEX,320,53,45,45.0,61.0,40.0,West Danielport,South Cory,45.0,18.0,82.0,2,-3.0,3.0,0


In [155]:
# Creating dummy variables for all categorical variables
# Note: Onehotencoder is the better solution, however for simplicity let's use pandas for the moment

# Get object columns
full_df_objectcolumns = full_df.select_dtypes(include = 'object')
varlist = full_df_objectcolumns.columns.values.tolist()

# get dummies
full_df_encoded = pd.get_dummies(full_df, columns = varlist, drop_first = True)

In [156]:
# We split the forecast in two different forecasts - one for ground delay and one for block delay

X_train_blockdelay, X_test_blockdelay, y_train_blockdelay, y_test_blockdelay = train_test_split(
    full_df_encoded.drop(['block_delay'], axis = 1), full_df_encoded['block_delay'], test_size=0.33, random_state=42)

# Filtering out rows which are skewing ground delay prediction
full_df_encoded_grounddelay = full_df_encoded[full_df_encoded['rows_to_drop_grounddelay']<1]

X_train_grounddelay, X_test_grounddelay, y_train_grounddelay, y_test_grounddelay = train_test_split(
    full_df_encoded_grounddelay.drop(['ground_delay'], axis = 1), full_df_encoded_grounddelay['ground_delay'], test_size=0.33, random_state=42)


In [157]:
# Gradientboosting as base classifier

# Blocktime
base_model_blockdelay = GradientBoostingRegressor()

base_model_blockdelay.fit(X_train_blockdelay, y_train_blockdelay)
base_model_blockdelay.predict(X_test_blockdelay)


# Groundtime

base_model_grounddelay = GradientBoostingRegressor()

base_model_grounddelay.fit(X_train_grounddelay, y_train_grounddelay)
base_model_grounddelay.predict(X_test_grounddelay)


array([12.1289374 , 40.48937364,  9.6042367 , ...,  4.33293307,
        8.50829022, 17.45423131])

In [158]:
# Blocktime
r2_blockdelay = base_model_blockdelay.score(X_test_blockdelay, y_test_blockdelay)
rmse_blockdelay = np.sqrt(mean_squared_error(y_test_blockdelay, base_model_blockdelay.predict(X_test_blockdelay)))
print("The r^2 for Block delay is " + str(round(r2_blockdelay,4)))
print("The RMSE for Block delay is " + str(round(rmse_blockdelay,2)) + " minutes.")
print("\n")

# Groundtime
r2_grounddelay = base_model_grounddelay.score(X_test_grounddelay, y_test_grounddelay)
rmse_grounddelay = np.sqrt(mean_squared_error(y_test_grounddelay, base_model_grounddelay.predict(X_test_grounddelay)))
print("The r^2 for Ground delay is " + str(round(r2_grounddelay,4)))
print("The RMSE for Ground delay is " + str(round(rmse_grounddelay,2)) + " minutes.")

The r^2 for Block delay is 0.45
The RMSE for Block delay is 5.43 minutes.


The r^2 for Ground delay is 0.6771
The RMSE for Ground delay is 13.98 minutes.
