# Scope of this workbook

Here, we want to actually model the whole thing for different datasets. 

In [43]:
# Imports
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

import datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


#Settings
pd.set_option("display.max_rows", 70)
pd.set_option("display.max_columns", 101)

# full_df_unedited

full_df_unedited = pd.read_pickle('../data/processed/full_df_unedited.pkl')

full_df_unedited.head()

# For the moment, let's in this step drop all variables we will not use. ToDo: Check which step is the best step to do that
columns_to_drop = [
    'TLC_trans',
    'catering_duration',
    'dep_sched_date', # the date itself has no real information. ToDo: Maybe we should extract the day of week here
    'dep_sched_time', # the time has a value but is dropped for the moment due to formatting. 
    'arr_sched_date',
    'arr_sched_time',
    'm_offblockdt',
    'm_onblockdt'
]

full_df_unedited = full_df_unedited.drop(columns_to_drop, axis = 1)

full_df_unedited = full_df_unedited[full_df_unedited['Act Groundtime']<180]

# Creating dummy variables for all categorical variables

# Note: Onehotencoder is the better solution, however for simplicity let's use pandas for the moment
#full_df_unedited = OneHotEncoder().fit_transform(full_df_unedited)
#full_df_unedited

# Get object columns
full_df_unedited_objectcolumns = full_df_unedited.select_dtypes(include = 'object')
varlist = full_df_unedited_objectcolumns.columns.values.tolist()

#get dummies
full_df_unedited_encoded = pd.get_dummies(full_df_unedited, columns = varlist, drop_first = True)

full_df_unedited_encoded.shape

# We split the forecast in two different forecasts - one for ground time and one for block time

X_train_block, X_test_block, y_train_block, y_test_block = train_test_split(full_df_unedited_encoded.drop(['actual_block_time'], axis = 1), full_df_unedited_encoded['actual_block_time'], test_size=0.33, random_state=42)
X_train_ground, X_test_ground, y_train_ground, y_test_ground = train_test_split(full_df_unedited_encoded.drop(['Act Groundtime'], axis = 1), full_df_unedited_encoded['Act Groundtime'], test_size=0.33, random_state=42)

# Gradientboosting as base classifier

# Blocktime
base_model_block = GradientBoostingRegressor()

base_model_block.fit(X_train_block, y_train_block)
base_model_block.predict(X_test_block)


# Groundtime

base_model_ground = GradientBoostingRegressor()

base_model_ground.fit(X_train_ground, y_train_ground)
base_model_ground.predict(X_test_ground)


# Blocktime
r2_block = base_model_block.score(X_test_block, y_test_block)
rmse_block = np.sqrt(mean_squared_error(y_test_block, base_model_block.predict(X_test_block)))
print("The r^2 for Block time is " + str(round(r2_block,4)))
print("The RMSE for Block time is " + str(round(rmse_block,2)) + " minutes.")
print("\n")

# Groundtime
r2_ground = base_model_ground.score(X_test_ground, y_test_ground)
rmse_ground = np.sqrt(mean_squared_error(y_test_ground, base_model_ground.predict(X_test_ground)))
print("The r^2 for Ground time is " + str(round(r2_ground,4)))
print("The RMSE for Ground time is " + str(round(rmse_ground,2)) + " minutes.")

# full_df_unedited with delay prediction

In [44]:
full_df_wdelay = pd.read_pickle('../data/processed/full_df_unedited_wdelay.pkl')
full_df_wdelay.head()

Unnamed: 0,fn_number_x,dep_ap_sched,arr_ap_sched,dep_sched_date,dep_sched_time,arr_sched_date,arr_sched_time,m_offblockdt,m_onblockdt,ac_registration_x,dep_delay,Ac Type Code,trans_time,sched_trans_time,Crew Group,TLC_trans,crew_type_change,Sched Groundtime,Act Groundtime,mingt,arr_leg_outbound,catering_duration,pax_boarding_duration,actual_block_time,flt_event_number,block_delay,ground_delay
0,EC3292,New Jessica,East Carmen,2019-06-01,2019-06-01 03:25:00,2019-06-01,2019-06-01 06:45:00,2019-06-01 03:50:00,2019-06-01 07:01:00,ECLBAX,25.0,320,0,0,Start,"['Sean Weeks_nan_nan_nan_ca', 'Tony Lloyd_nan_...",[],95.0,94.0,45.0,South Nathaniel,27.0,25.0,191.0,1,-9.0,25.0
6,EC3008,Joneshaven,East Carmen,2019-06-02,2019-06-02 08:55:00,2019-06-02,2019-06-02 10:20:00,2019-06-02 09:10:00,2019-06-02 10:28:00,ECLBAX,15.0,320,39,45,A,"['Kyle Melendez_66_39_45_cp', 'Marissa Edwards...",[],50.0,65.0,45.0,Marioborough,20.0,16.0,78.0,2,-7.0,15.0
12,EC2578,Lake Lawrencechester,East Carmen,2019-06-03,2019-06-03 07:50:00,2019-06-03,2019-06-03 09:15:00,2019-06-03 07:50:00,2019-06-03 09:11:00,ECLBAX,0.0,320,54,70,A,"['Mrs. Cheryl Williams_60_54_70_ca', 'Brian Wh...",[],95.0,122.0,45.0,Desireeton,20.0,26.0,81.0,2,-4.0,0.0
16,EC3316,East Latashaview,East Carmen,2019-06-04,2019-06-04 03:15:00,2019-06-04,2019-06-04 07:05:00,2019-06-04 03:29:00,2019-06-04 07:10:00,ECLBAX,14.0,320,0,0,Start,"['Richard Anderson_nan_nan_nan_ca', 'Vicki Gri...",[],80.0,89.0,45.0,East Latashaview,21.0,27.0,221.0,1,-9.0,14.0
20,EC2868,Kennethfort,East Carmen,2019-06-05,2019-06-05 08:00:00,2019-06-05,2019-06-05 10:00:00,2019-06-05 08:15:00,2019-06-05 10:25:00,ECLBAX,15.0,320,60,45,A,"['Christina Miller_41_60_45_ca', 'Jeffrey Vazq...",[],50.0,49.0,45.0,Desireeton,28.0,7.0,130.0,2,10.0,15.0


We repeat the basic experiment, but this time we do delay prediction instead of ground/block time prediction

In [45]:
# For the moment, let's in this step drop all variables we will not use. ToDo: Check which step is the best step to do that
columns_to_drop_delaypred = [
    'TLC_trans',
    'catering_duration',
    'dep_sched_date', # the date itself has no real information. ToDo: Maybe we should extract the day of week here
    'dep_sched_time', # the time has a value but is dropped for the moment due to formatting. 
    'arr_sched_date',
    'arr_sched_time',
    'm_offblockdt',
    'm_onblockdt',
    # to ensure proper comparison in between ground/block time and ground/block delay prediction, we drop the targets of ground/block time prediction. 
    'Act Groundtime',
    #'block_time'
]

full_df_wdelay = full_df_wdelay[full_df_wdelay['Act Groundtime']<180]


full_df_wdelay = full_df_wdelay.drop(columns_to_drop_delaypred, axis = 1)

In [46]:
# Creating dummy variables for all categorical variables
# Note: Onehotencoder is the better solution, however for simplicity let's use pandas for the moment

# Get object columns
full_df_wdelay_objectcolumns = full_df_wdelay.select_dtypes(include = 'object')
varlist = full_df_wdelay_objectcolumns.columns.values.tolist()

# get dummies
full_df_wdelay_encoded = pd.get_dummies(full_df_wdelay, columns = varlist, drop_first = True)

In [47]:
# We split the forecast in two different forecasts - one for ground delay and one for block delay

X_train_blockdelay, X_test_blockdelay, y_train_blockdelay, y_test_blockdelay = train_test_split(
    full_df_wdelay_encoded.drop(['block_delay'], axis = 1), full_df_wdelay_encoded['block_delay'], test_size=0.33, random_state=42)
    
X_train_grounddelay, X_test_grounddelay, y_train_grounddelay, y_test_grounddelay = train_test_split(
    full_df_wdelay_encoded.drop(['ground_delay'], axis = 1), full_df_wdelay_encoded['ground_delay'], test_size=0.33, random_state=42)


In [48]:
# Gradientboosting as base classifier

# Blocktime
base_model_blockdelay = GradientBoostingRegressor()

base_model_blockdelay.fit(X_train_blockdelay, y_train_blockdelay)
base_model_blockdelay.predict(X_test_blockdelay)


# Groundtime

base_model_grounddelay = GradientBoostingRegressor()

base_model_grounddelay.fit(X_train_grounddelay, y_train_grounddelay)
base_model_grounddelay.predict(X_test_grounddelay)


array([1.67405860e-02, 1.90020403e+01, 1.67405860e-02, ...,
       8.55846934e+01, 4.00269356e+00, 3.78949876e+01])

In [49]:
# Blocktime
r2_blockdelay = base_model_blockdelay.score(X_test_blockdelay, y_test_blockdelay)
rmse_blockdelay = np.sqrt(mean_squared_error(y_test_blockdelay, base_model_blockdelay.predict(X_test_blockdelay)))
print("The r^2 for Block delay is " + str(round(r2_blockdelay,4)))
print("The RMSE for Block delay is " + str(round(rmse_blockdelay,2)) + " minutes.")
print("\n")

# Groundtime
r2_grounddelay = base_model_grounddelay.score(X_test_grounddelay, y_test_grounddelay)
rmse_grounddelay = np.sqrt(mean_squared_error(y_test_grounddelay, base_model_grounddelay.predict(X_test_grounddelay)))
print("The r^2 for Ground delay is " + str(round(r2_grounddelay,4)))
print("The RMSE for Ground delay is " + str(round(rmse_grounddelay,2)) + " minutes.")

The r^2 for Block delay is 0.3686
The RMSE for Block delay is 5.63 minutes.


The r^2 for Ground delay is 0.9982
The RMSE for Ground delay is 1.19 minutes.


# Processed Dataset

In [50]:
full_df = pd.read_pickle('../data/finalized/full_df.pkl')




#columns_to_drop_full_df = [
#    #'TLC_trans',
#    'catering_duration',
#    'dep_sched_date', # the date itself has no real information. ToDo: Maybe we should extract the day of week here
#    'dep_sched_time', # the time has a value but is dropped for the moment due to formatting. 
#    'arr_sched_date',
#    'arr_sched_time',
#    'm_offblockdt',
#    'm_onblockdt',
#    # to ensure proper comparison in between ground/block time and ground/block delay prediction, we drop the targets of ground/block time prediction. 
#    'Act Groundtime',
#    #'block_time'
#]

#full_df = full_df.drop(columns_to_drop_full_df, axis = 1)
#full_df = full_df.select_dtypes(exclude=['datetime64'])


#*******************************temp************************************
#test ob es was bringt, de facto keine ground columns mehr zu haben
full_df = full_df.drop(['catering_duration','arr_leg_outbound','mingt','sched_turnaround'], axis = 1) # maybe improve and impute sched_turnaround

#drop trans_time and block time as well
full_df = full_df.drop(['trans_time', 'actual_block_time', 'fn_number_x'], axis = 1)
#***********************************************************************

full_df['Route'] = full_df['dep_ap_sched'] + "_" + full_df['arr_ap_sched']
full_df = full_df.drop(['dep_ap_sched','arr_ap_sched'], axis = 1)

print(full_df.shape)

full_df = full_df.dropna(how = 'any')
'*************************'
#full_df = full_df[full_df['Sched Groundtime']<270]


full_df.shape
#full_df[full_df['leg_no'] == 271997824]

(11628, 16)


(11469, 16)

In [51]:
#*********************************************************Temp************************************************************
#das hier am besten zu 4

full_df = full_df.drop(['flt_event_number','sched_trans_time'], axis = 1)


# Outlier Deletion nach Hinnerk
# Function to identify outliers for all continous variables
def find_outliers(df, col):
    q25 = df[col].quantile(0.25)
    q75 = df[col].quantile(0.75)
    iqr = q75 - q25
    ll = q25 - 1.5 * iqr
    ul = q75 + 1.5 * iqr

    ls = df.index[(df[col] < ll) | (df[col] > ul)]

    return ls

outlier_idx = []
#full_df_columns = full_df[['sched_trans_time', 'Sched Groundtime', 'Act Groundtime', 'block_delay', 'ground_delay']].columns
full_df_columns = full_df[['Sched Groundtime', 'Act Groundtime', 'block_delay', 'ground_delay']].columns
for col in full_df_columns:
    if full_df[col].dtype in ['int64', 'float64']:
        outlier_idx.extend(find_outliers(full_df, col))

full_df = full_df.drop(outlier_idx)

print(full_df.shape)
full_df=full_df[full_df['Sched Groundtime']>0]
print(full_df.shape)
full_df=full_df[full_df['Act Groundtime']>0]
print(full_df.shape)

full_df.shape
full_df.to_pickle('full_df_comparison.pkl')



#columns=['day_of_week','hour_of_day_dep', 'hour_of_day_arr']
#for i in columns:
#    full_df[i] = full_df[i].apply(str)

full_df.dtypes

full_df.drop(['Sched Groundtime', 'Act Groundtime', 'Crewchange', 'routing'], axis = 1, inplace = True)


(8374, 14)
(8363, 14)
(7914, 14)


In [52]:
# Temp, file from Hinnerk*******************************************************************************************+
#full_df = pd.read_pickle('../data/finalized/final.pkl')

#full_df['flt_ac_type'].astype('object')
#full_df = full_df['hour_of_day_dep'].astype('float')

## Train/Test Split encoded

In [53]:
# Creating dummy variables for all categorical variables
# Note: Onehotencoder is the better solution, however for simplicity let's use pandas for the moment

# Get object columns
full_df_objectcolumns = full_df.select_dtypes(include = ['object', 'category'])
varlist = full_df_objectcolumns.columns.values.tolist()

# get dummies
full_df_encoded = pd.get_dummies(full_df, columns = varlist, drop_first = True)

#filter = ['ac']
#print(str([i for i in full_df_encoded.columns if 'Route' not in i ])+"\n")

full_df_encoded.shape

(7914, 316)

In [54]:
# We split the forecast in two different forecasts - one for ground delay and one for block delay

# dropping the ground delay filter row
full_df_encoded_blockdelay = full_df_encoded.drop(['rows_to_drop_grounddelay'], axis = 1)
X_train_blockdelay_encoded, X_test_blockdelay_encoded, y_train_blockdelay_encoded, y_test_blockdelay_encoded = train_test_split(
    full_df_encoded_blockdelay.drop(['block_delay'], axis = 1), full_df_encoded_blockdelay['block_delay'], test_size=0.33, random_state=42)

# Filtering out rows which are skewing ground delay prediction
full_df_encoded_grounddelay = full_df_encoded[full_df_encoded['rows_to_drop_grounddelay'] == 0]

X_train_grounddelay_encoded, X_test_grounddelay_encoded, y_train_grounddelay_encoded, y_test_grounddelay_encoded = train_test_split(
    full_df_encoded_grounddelay.drop(['ground_delay'], axis = 1), full_df_encoded_grounddelay['ground_delay'], test_size=0.33, random_state=42)

'''#Hinnerk****************************************************************************************************

# dropping the ground delay filter row
full_df_encoded_blockdelay = full_df_encoded#.drop(['rows_to_drop_grounddelay'], axis = 1)
X_train_blockdelay_encoded, X_test_blockdelay_encoded, y_train_blockdelay_encoded, y_test_blockdelay_encoded = train_test_split(
    full_df_encoded_blockdelay.drop(['block_delay'], axis = 1), full_df_encoded_blockdelay['block_delay'], test_size=0.33, random_state=42)

# Filtering out rows which are skewing ground delay prediction
full_df_encoded_grounddelay = full_df_encoded#[full_df_encoded['rows_to_drop_grounddelay']<1]

X_train_grounddelay_encoded, X_test_grounddelay_encoded, y_train_grounddelay_encoded, y_test_grounddelay_encoded = train_test_split(
    full_df_encoded_grounddelay.drop(['ground_delay'], axis = 1), full_df_encoded_grounddelay['ground_delay'], test_size=0.33, random_state=42)
'''

"#Hinnerk****************************************************************************************************\n\n# dropping the ground delay filter row\nfull_df_encoded_blockdelay = full_df_encoded#.drop(['rows_to_drop_grounddelay'], axis = 1)\nX_train_blockdelay_encoded, X_test_blockdelay_encoded, y_train_blockdelay_encoded, y_test_blockdelay_encoded = train_test_split(\n    full_df_encoded_blockdelay.drop(['block_delay'], axis = 1), full_df_encoded_blockdelay['block_delay'], test_size=0.33, random_state=42)\n\n# Filtering out rows which are skewing ground delay prediction\nfull_df_encoded_grounddelay = full_df_encoded#[full_df_encoded['rows_to_drop_grounddelay']<1]\n\nX_train_grounddelay_encoded, X_test_grounddelay_encoded, y_train_grounddelay_encoded, y_test_grounddelay_encoded = train_test_split(\n    full_df_encoded_grounddelay.drop(['ground_delay'], axis = 1), full_df_encoded_grounddelay['ground_delay'], test_size=0.33, random_state=42)\n"

## Linear Regression


In [55]:
# Blocktime
lr_block = LinearRegression()

lr_block.fit(X_train_blockdelay_encoded, y_train_blockdelay_encoded)
lr_block.predict(X_test_blockdelay_encoded)

# Groundtime
lr_ground = LinearRegression()

lr_ground.fit(X_train_grounddelay_encoded, y_train_grounddelay_encoded)
lr_ground.predict(X_test_grounddelay_encoded)

array([41.18899582, 12.17864948, 27.78310294, ..., 31.51163435,
       -5.51072097,  7.63249541])

In [56]:
# Blocktime
r2_blockdelay = lr_block.score(X_test_blockdelay_encoded, y_test_blockdelay_encoded)
rmse_blockdelay = np.sqrt(mean_squared_error(y_test_blockdelay_encoded, lr_block.predict(X_test_blockdelay_encoded)))
print("The r^2 for Block delay is " + str(round(r2_blockdelay,4)))
print("The RMSE for Block delay is " + str(round(rmse_blockdelay,2)) + " minutes.")
print("\n")

# Groundtime
r2_grounddelay = lr_ground.score(X_test_grounddelay_encoded, y_test_grounddelay_encoded)
rmse_grounddelay = np.sqrt(mean_squared_error(y_test_grounddelay_encoded, lr_ground.predict(X_test_grounddelay_encoded)))
print("The r^2 for Ground delay is " + str(round(r2_grounddelay,4)))
print("The RMSE for Ground delay is " + str(round(rmse_grounddelay,2)) + " minutes.")

The r^2 for Block delay is -3153678589.7033
The RMSE for Block delay is 1059188.18 minutes.


The r^2 for Ground delay is -86510273.2216
The RMSE for Ground delay is 164240.1 minutes.


## Setup of Modelling

In [57]:
# Imports 
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [58]:
%%time
# Initialize estimators

#reg1 = LinearRegression()
reg2 = Ridge()
#reg3 = Lasso()
reg4 = GradientBoostingRegressor()
#reg5 = SVR()

# Initialize hyperparameters for each dictionary
#param1 = {}

param2 = {}
param2['regressor__alpha'] = [0,1,2,3]#[x for x in np.linspace(0,2,num = 4)]
param2['regressor'] = [reg2]

param4 = {}
param4['regressor__n_estimators'] = [150]# [10,50,100,150,250,300] #[10,50,100,150,250]#[10,50,100,150,200,250]
param4['regressor__max_depth'] = [11]#[8,10,12,15,18,20]#,18]#,22] #[2,3,6,8,10,12,18] #[2,3,4,6,8,10,12,15,18]
param4['regressor__max_leaf_nodes'] = [30,40]#[6,8,12,15,19,25]#,18] #[6,8,12,None] #[6,8,12,15,19,25,None]
param4['regressor'] = [reg4]


# Create Pipeline
pipeline = Pipeline([('regressor', reg2)])
params = [param2, param4]

# Train grid search model
gs_block = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error', verbose=1).fit(X_train_blockdelay_encoded,y_train_blockdelay_encoded)
gs_ground = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error', verbose=1).fit(X_train_grounddelay_encoded,y_train_grounddelay_encoded)
#gs_block = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, verbose=1).fit(X_train_blockdelay_encoded,y_train_blockdelay_encoded)
#gs_ground = GridSearchCV(pipeline, params, cv=5, n_jobs=-1, verbose=1).fit(X_train_grounddelay_encoded,y_train_grounddelay_encoded)
print(gs_block.best_estimator_, gs_block.best_params_)
print(gs_ground.best_estimator_, gs_ground.best_params_)

#Evaluation
# Blocktime
r2_blockdelay = gs_block.best_estimator_.score(X_test_blockdelay_encoded, y_test_blockdelay_encoded)
rmse_blockdelay = np.sqrt(mean_squared_error(y_test_blockdelay_encoded, gs_block.best_estimator_.predict(X_test_blockdelay_encoded)))
print("The r^2 for Block delay is " + str(round(r2_blockdelay,4)))
print("The RMSE for Block delay is " + str(round(rmse_blockdelay,2)) + " minutes.")
print("\n")

# Groundtime
r2_grounddelay = gs_ground.best_estimator_.score(X_test_grounddelay_encoded, y_test_grounddelay_encoded)
rmse_grounddelay = np.sqrt(mean_squared_error(y_test_grounddelay_encoded, gs_ground.best_estimator_.predict(X_test_grounddelay_encoded)))
print("The r^2 for Ground delay is " + str(round(r2_grounddelay,4)))
print("The RMSE for Ground delay is " + str(round(rmse_grounddelay,2)) + " minutes.")



Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Pipeline(steps=[('regressor', Ridge(alpha=2))]) {'regressor': Ridge(alpha=2), 'regressor__alpha': 2}
Pipeline(steps=[('regressor',
                 GradientBoostingRegressor(max_depth=11, max_leaf_nodes=30,
                                           n_estimators=150))]) {'regressor': GradientBoostingRegressor(max_depth=11, max_leaf_nodes=30, n_estimators=150), 'regressor__max_depth': 11, 'regressor__max_leaf_nodes': 30, 'regressor__n_estimators': 150}
The r^2 for Block delay is 0.8843
The RMSE for Block delay is 6.42 minutes.


The r^2 for Ground delay is 0.9171
The RMSE for Ground delay is 5.08 minutes.
CPU times: total: 14.7 s
Wall time: 1min 47s


In [174]:
# filtering out all columns does not help apparently. 

In [186]:
from sklearn.metrics import r2_score, mean_squared_error
# Create dataframe to save model evaluation parameters
eval = pd.DataFrame(columns= ['Group', 'Model', 'Parameters', 'R^2 test', 'RMSE test', 'R^2 train', 'RMSE train'])

# Create ridge regression models with best alpha values
rid_block = Ridge(alpha=2, fit_intercept=True)
rid_ground = Ridge(alpha=2, fit_intercept=True)

# Fit models to training data
rid_block.fit(X_train_blockdelay_encoded, y_train_blockdelay_encoded)
rid_ground.fit(X_train_grounddelay_encoded, y_train_grounddelay_encoded)

# Predict values for train and test data
rid_pred_block_train = rid_block.predict(X_train_blockdelay_encoded)
rid_pred_ground_train = rid_ground.predict(X_train_grounddelay_encoded)

rid_pred_block_test = rid_block.predict(X_test_blockdelay_encoded)
rid_pred_ground_test = rid_ground.predict(X_test_grounddelay_encoded)

# Save r^2 and RMSE for both models in dataframe for later comparison
eval = eval.append({
    'Group': 'Block',
    'Model': 'Ridge',
    'R^2 test': r2_score(y_test_blockdelay_encoded, rid_pred_block_test),
    'RMSE test': mean_squared_error(y_test_blockdelay_encoded, rid_pred_block_test, squared=False),
    'R^2 train': r2_score(y_train_blockdelay_encoded, rid_pred_block_train),
    'RMSE train': mean_squared_error(y_train_blockdelay_encoded, rid_pred_block_train, squared=False)
    }, ignore_index=True)
 
eval = eval.append({
    'Group': 'Ground',
    'Model': 'Ridge',
    'R^2 test': r2_score(y_test_grounddelay_encoded, rid_pred_ground_test),
    'RMSE test': mean_squared_error(y_test_grounddelay_encoded, rid_pred_ground_test, squared=False),
    'R^2 train': r2_score(y_train_grounddelay_encoded, rid_pred_ground_train),
    'RMSE train': mean_squared_error(y_train_grounddelay_encoded, rid_pred_ground_train, squared=False),
    }, ignore_index=True)

eval.round(decimals=3)

  eval = eval.append({
  eval = eval.append({


Unnamed: 0,Group,Model,Parameters,R^2 test,RMSE test,R^2 train,RMSE train
0,Block,Ridge,,0.299786,5.728762,0.390707,5.363538
1,Ground,Ridge,,0.615664,10.99461,0.651971,10.435815


## Gradient Boosted Tree

In [367]:
# Gradientboosting as base classifier

# Blocktime
base_model_blockdelay = GradientBoostingRegressor()

base_model_blockdelay.fit(X_train_blockdelay_encoded, y_train_blockdelay_encoded)
base_model_blockdelay.predict(X_test_blockdelay_encoded)


# Groundtime

base_model_grounddelay = GradientBoostingRegressor()

base_model_grounddelay.fit(X_train_grounddelay_encoded, y_train_grounddelay_encoded)
base_model_grounddelay.predict(X_test_grounddelay_encoded)


array([34.98205655, 25.11056469, 29.39842072, ..., 21.16724337,
       39.10357401, 25.3647793 ])

In [None]:
# Blocktime
r2_blockdelay = base_model_blockdelay.score(X_test_blockdelay_encoded, y_test_blockdelay_encoded)
rmse_blockdelay = np.sqrt(mean_squared_error(y_test_blockdelay_encoded, base_model_blockdelay.predict(X_test_blockdelay_encoded)))
print("The r^2 for Block delay is " + str(round(r2_blockdelay,4)))
print("The RMSE for Block delay is " + str(round(rmse_blockdelay,2)) + " minutes.")
print("\n")

# Groundtime
r2_grounddelay = base_model_grounddelay.score(X_test_grounddelay_encoded, y_test_grounddelay_encoded)
rmse_grounddelay = np.sqrt(mean_squared_error(y_test_grounddelay_encoded, base_model_grounddelay.predict(X_test_grounddelay_encoded)))
print("The r^2 for Ground delay is " + str(round(r2_grounddelay,4)))
print("The RMSE for Ground delay is " + str(round(rmse_grounddelay,2)) + " minutes.")

The r^2 for Block delay is 0.4503
The RMSE for Block delay is 5.43 minutes.


The r^2 for Ground delay is 0.6768
The RMSE for Ground delay is 13.98 minutes.
