# Scope of this workbook

Here, we fit baseline models that forecast block time and ground time, repeating the same steps for different datasets.

In [30]:
# Imports
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Display settings: widen the pandas output limits so that wide and long
# frames are shown in full instead of being truncated.
pd.options.display.max_rows = 70
pd.options.display.max_columns = 101

# full_df_unedited

In [22]:
# Load the pickled frame for this section.
# NOTE(review): pickle files can execute arbitrary code when loaded - only read
# files that were produced by this project's own pipeline.
FULL_DF_UNEDITED_PATH = '../data/processed/full_df_unedited.pkl'
full_df_unedited = pd.read_pickle(FULL_DF_UNEDITED_PATH)

In [23]:
# Show the dtype of every column in the frame.
full_df_unedited.dtypes

fn_number                        object
dep_ap_sched                     object
arr_ap_sched                     object
dep_sched_date           datetime64[ns]
dep_sched_time           datetime64[ns]
arr_sched_date           datetime64[ns]
arr_sched_time           datetime64[ns]
m_offblockdt             datetime64[ns]
m_onblockdt              datetime64[ns]
ac_registration_x                object
dep_delay                       float64
Ac Type Code                     object
trans_time                        int64
sched_trans_time                  int64
Crew Group                       object
TLC_trans                        object
crew_type_change                 object
Sched Groundtime                float64
Act Groundtime                  float64
block_time                      float64
leg                              object
route                            object
mingt                           float64
catering_duration               float64
pax_boarding_duration           float64


In [24]:
# Preview the first rows of the frame.
full_df_unedited.head()

Unnamed: 0,fn_number,dep_ap_sched,arr_ap_sched,dep_sched_date,dep_sched_time,arr_sched_date,arr_sched_time,m_offblockdt,m_onblockdt,ac_registration_x,dep_delay,Ac Type Code,trans_time,sched_trans_time,Crew Group,TLC_trans,crew_type_change,Sched Groundtime,Act Groundtime,block_time,leg,route,mingt,catering_duration,pax_boarding_duration
4,EC3114,East Melissaberg,East Carmen,2019-06-01,2022-04-30 02:30:00,2019-06-01,2022-04-30 04:15:00,2019-06-01 03:02:00,2019-06-01 04:43:00,ECLGNX,32.0,DH4,0,0,Start,"['Renee Fisher_nan_nan_nan_ca', 'Rebecca Castr...",[],35.0,21.0,101.0,East Melissaberg-East Carmen,East Carmen-East Melissaberg,35.0,26.0,26.0
7,EC3316,East Latashaview,East Carmen,2019-06-01,2022-04-30 03:15:00,2019-06-01,2022-04-30 07:05:00,2019-06-01 03:30:00,2019-06-01 07:07:00,ECLBIX,15.0,320,0,0,Start,"['Nicholas Evans_nan_nan_nan_ca', 'Jessica Her...",[],45.0,61.0,217.0,East Latashaview-East Carmen,East Carmen-East Latashaview,40.0,27.0,15.0
9,EC3292,New Jessica,East Carmen,2019-06-01,2022-04-30 03:25:00,2019-06-01,2022-04-30 06:45:00,2019-06-01 03:50:00,2019-06-01 07:01:00,ECLBAX,25.0,320,0,0,Start,"['Sean Weeks_nan_nan_nan_ca', 'Tony Lloyd_nan_...",[],95.0,94.0,191.0,New Jessica-East Carmen,East Carmen-New Jessica,45.0,27.0,25.0
13,EC3420,East Allisontown,East Carmen,2019-06-01,2022-04-30 04:00:00,2019-06-01,2022-04-30 04:35:00,2019-06-01 04:04:00,2019-06-01 04:36:00,ECLWFX,4.0,E95,0,0,Start,"['Frederick Ramirez_nan_nan_nan_cp', 'Ariel Wi...",[],55.0,120.0,32.0,East Allisontown-East Carmen,East Allisontown-East Carmen,40.0,18.0,11.0
15,EC3400,Port Courtneytown,East Carmen,2019-06-01,2022-04-30 04:00:00,2019-06-01,2022-04-30 04:35:00,2019-06-01 04:14:00,2019-06-01 04:57:00,ECLGBX,14.0,DH4,0,0,Start,"['Heather Ryan_nan_nan_nan_ca', 'Jeff Hays_nan...",[],35.0,23.0,43.0,Port Courtneytown-East Carmen,East Carmen-Port Courtneytown,30.0,15.0,12.0


In [25]:
# For the moment, drop all variables we will not use in this step.
# ToDo: Check which step is the best step to do that
columns_to_drop = [
    'fn_number',
    'TLC_trans',
    'catering_duration',
    'dep_sched_date', # the date itself has no real information. ToDo: Maybe we should extract the day of week here
    'dep_sched_time', # the time has a value but is dropped for the moment due to formatting.
    'arr_sched_date',
    'arr_sched_time',
    'm_offblockdt',
    'm_onblockdt'
]

# This cell rebinds its own input, so on a second run the columns are already
# gone. errors='ignore' makes that re-run a no-op instead of a KeyError; the
# result of the first run is unchanged.
full_df_unedited = full_df_unedited.drop(columns=columns_to_drop, errors='ignore')

In [26]:
# One-hot encode every categorical (object-dtype) column.
#
# TODO: sklearn's OneHotEncoder is the better solution; pandas.get_dummies is
# used for the moment because it is simpler.

# Collect the names of all object-dtype columns
full_df_unedited_objectcolumns = full_df_unedited.select_dtypes(include=['object'])
varlist = list(full_df_unedited_objectcolumns.columns)

# Replace each of those columns by its dummy columns; drop_first=True leaves
# out the first level of every variable.
full_df_unedited_encoded = pd.get_dummies(
    data=full_df_unedited,
    columns=varlist,
    drop_first=True,
)

In [27]:
full_df_unedited_encoded.shape

(3867, 407)

In [28]:
# The forecast is split into two separate forecasts: one for block time and
# one for ground time. Both splits hold out the same share of rows and use the
# same seed.
# NOTE(review): each feature matrix still contains the other forecast's target
# column ('Act Groundtime' resp. 'block_time') - confirm that this is intended.
TEST_SIZE = 0.33
SPLIT_SEED = 42

# Block time: everything except 'block_time' is a feature
block_features = full_df_unedited_encoded.drop(columns=['block_time'])
block_target = full_df_unedited_encoded['block_time']
X_train_block, X_test_block, y_train_block, y_test_block = train_test_split(
    block_features,
    block_target,
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
)

# Ground time: everything except 'Act Groundtime' is a feature
ground_features = full_df_unedited_encoded.drop(columns=['Act Groundtime'])
ground_target = full_df_unedited_encoded['Act Groundtime']
X_train_ground, X_test_ground, y_train_ground, y_test_ground = train_test_split(
    ground_features,
    ground_target,
    test_size=TEST_SIZE,
    random_state=SPLIT_SEED,
)

In [37]:
# Gradient boosting as the baseline regressor for both targets.
# random_state is fixed on both models so that re-running the notebook on a
# fresh kernel reproduces the same fitted trees and therefore the same scores.

# Blocktime
base_model_block = GradientBoostingRegressor(random_state=42)
base_model_block.fit(X_train_block, y_train_block)

# Groundtime
base_model_ground = GradientBoostingRegressor(random_state=42)
base_model_ground.fit(X_train_ground, y_train_ground)

# R^2 of the ground-time model on its held-out test set (shown as cell output)
base_model_ground.score(X_test_ground, y_test_ground)

0.8270232545043312

In [40]:
def _report_fit(label, model, X_test, y_test):
    """Print and return the test-set R^2 and RMSE of a fitted regressor.

    Parameters
    ----------
    label : str
        Name inserted into the printed sentences (e.g. "Block").
    model : fitted estimator
        Must provide ``predict`` and ``score``.
    X_test, y_test
        Held-out features and targets the model is evaluated on.

    Returns
    -------
    tuple
        ``(r2, rmse)`` - the unrounded values; only the printed text is rounded.
    """
    r2 = model.score(X_test, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print("The r^2 for " + label + " time is " + str(round(r2, 4)))
    print("The RMSE for " + label + " time is " + str(round(rmse, 2)) + " minutes.")
    return r2, rmse


# Blocktime
r2_block, rmse_block = _report_fit("Block", base_model_block, X_test_block, y_test_block)
print("\n")

# Groundtime
r2_ground, rmse_ground = _report_fit("Ground", base_model_ground, X_test_ground, y_test_ground)

The r^2 for Block time is 0.779
The RMSE for Block time is 18.79 minutes.


The r^2 for Ground time is 0.827
The RMSE for Ground time is 23.86 minutes.
