In [8]:
import h2o
from h2o.automl import H2OAutoML

import pandas as pd
import numpy as np

import matplotlib as plt
%matplotlib inline

#Import the Estimators
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators import H2OXGBoostEstimator

#Import h2o grid search 
import h2o.grid 
from h2o.grid.grid_search import H2OGridSearch

import glob

In [9]:
# Attach to the local H2O cluster (or start one), requesting at most
# 40 GB of memory and 11 worker threads.
h2o.init(nthreads=11, max_mem_size="40g")

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,16 hours 43 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,3 months and 14 days
H2O_cluster_name:,H2O_from_python_mark_keph72
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,37.32 Gb
H2O_cluster_total_cores:,128
H2O_cluster_allowed_cores:,11


In [10]:
# Parse the Stage 3 training CSV into an H2OFrame held on the cluster.
data = h2o.import_file(path="data/STAGE_3_TRAIN.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [11]:
# Response column and the bookkeeping columns that must not be used as predictors.
y = "tow"
ignore = ["flight_id", "dataset"]
## add adep and ades?

# Fail fast (as list.remove(y) would) if the target is missing from the frame.
if y not in data.columns:
    raise ValueError("response column '" + y + "' not found in data.columns")

# Build the predictor list by filtering data.columns in frame order.
# A set difference would yield an order that depends on string hash
# randomization, so the printed feature list would change between kernels.
excluded = set(ignore) | {y}
x = [col for col in data.columns if col not in excluded]
print(x)

# Treat the aircraft type as a categorical (enum) column rather than a string.
data["aircraft_type"] = data["aircraft_type"].asfactor()

['stage_two_500', 'alt_per_s', 'stage_one', 'stage_two_700', 'stage_two_400', 'time_to_cruise', 'stage_two_1000', 'percent_error', 'aircraft_type', 'stage_two_300', 'stage_two_200', 'stage_two_800', 'stage_two_900', 'stage_two_100', 'stage_two_600', 'first_cruise_alt']


In [12]:
# Seeded random split: the first piece (ratio 0.05) is kept as the hold-out
# `test` frame, the remainder becomes `train`.
holdout_and_train = data.split_frame(ratios=[0.05], seed=26)
test, train = holdout_and_train
print(len(train))

350583


In [14]:
# AutoML run: up to 50 models, fixed seed, RMSE as the stopping metric.
# No validation or leaderboard frame is supplied, only the training frame.
automl_settings = {
    "max_models": 50,
    "seed": 75,
    "verbosity": "info",
    "stopping_metric": "RMSE",
}
aml = H2OAutoML(**automl_settings)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |
18:57:00.337: Project: AutoML_4_20241024_185700
18:57:00.338: 5-fold cross-validation will be used.
18:57:00.338: Setting stopping tolerance adaptively based on the training frame: 0.0016889024802299066
18:57:00.338: Build control seed: 75
18:57:00.339: training frame: Frame key: AutoML_4_20241024_185700_training_py_8_sid_bf4d    cols: 19    rows: 350583  chunks: 512    size: 39171491  checksum: 626891403074473263
18:57:00.339: validation frame: NULL
18:57:00.339: leaderboard frame: NULL
18:57:00.339: blending frame: NULL
18:57:00.340: response column: tow
18:57:00.340: fold column: null
18:57:00.340: weights column: null
18:57:00.341: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (7g, 10w)]}, {

key,value
Stacking strategy,cross_validation
Number of base models (used / total),27/50
# GBM base models (used / total),11/15
# XGBoost base models (used / total),13/18
# DRF base models (used / total),2/2
# DeepLearning base models (used / total),1/14
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,1258412.1,2247.3005,1260697.8,1257412.0,1259567.1,1254993.6,1259390.4
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,1238.4512,3.3380616,1238.242,1233.3314,1240.2178,1242.3358,1238.129
mean_residual_deviance,3644617.8,37412.32,3628619.0,3608847.0,3639754.8,3707833.5,3638034.8
mse,3644617.8,37412.32,3628619.0,3608847.0,3639754.8,3707833.5,3638034.8
null_deviance,198833500000000.0,1460740200000.0,198948490000000.0,200418910000000.0,199595870000000.0,196507940000000.0,198696300000000.0
r2,0.9987146,2.03e-05,0.9987185,0.9987377,0.99872,0.9986818,0.998715
residual_deviance,255544480000.0,2186724900.0,254950400000.0,252976560000.0,255459840000.0,259025550000.0,255309990000.0
rmse,1909.0682,9.774491,1904.8934,1899.6965,1907.8141,1925.5736,1907.3633
rmsle,0.0228743,4.75e-05,0.0228575,0.0229489,0.0228182,0.0228693,0.0228776


In [15]:
# Best model on the AutoML leaderboard.
leader_model = aml.leader

# RMSE as reported by the leader with default arguments
# (presumably the training metric -- confirm against the H2O docs).
leader_rmse = leader_model.rmse()
print(round(leader_rmse, 0))

# Validation RMSE is left disabled: no validation frame was passed to AutoML.
#print(round(leader_model.rmse(valid=True), 0))

# RMSE on the hold-out `test` frame produced by the split cell.
holdout_performance = leader_model.model_performance(test)
print(round(holdout_performance.rmse(), 0))

1431.0
2440.0


In [16]:
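# Numbers noted from an earlier run (presumably the same two RMSE values
# printed by the cell above, in the same order -- confirm):
# 1531.0
# 2398.0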

In [22]:
## Save ensemble
from datetime import datetime

# Write the leader into a directory stamped with the current time (month_day_year_hour_minute).
# force=True overwrites an existing model file of the same name.
now = datetime.now()
timestamp = now.strftime("%m_%d_%y_%H_%M")
model_dir = "data/models/stage_three_" + timestamp + "_added"

# Last expression: the path returned by save_model is shown as the cell output.
h2o.save_model(model=leader_model, path=model_dir, force=True)


'/home/mark/prc_challenge/data/models/stage_three_10_27_24_17_53_added/StackedEnsemble_AllModels_1_AutoML_4_20241024_185700'

## RUN STAGE THREE SUBMISSION

In [18]:
# Parse the Stage 3 submission features into an H2OFrame.
# NOTE: this rebinds `data`, which previously held the training frame.
data = h2o.import_file(path="data/STAGE_3_TEST.csv")
data

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


flight_id,tow,dataset,first_cruise_alt,time_to_cruise,alt_per_s,stage_one,stage_two_100,stage_two_200,stage_two_300,stage_two_400,stage_two_500,stage_two_600,stage_two_700,stage_two_800,stage_two_900,stage_two_1000,aircraft_type,percent_error
248754000.0,,submission,36000.0,1350.0,26.7,69933.1,66052.3,65215.9,65550.8,66918.0,67455.6,67808.1,68133.6,67882.2,68597.9,69366.9,B738,4.97785
248754000.0,,submission,36000.0,2070.0,17.4,214816.0,210956.0,204897.0,214594.0,226919.0,223442.0,223662.0,227598.0,222790.0,225984.0,224391.0,A333,5.94711
248754000.0,,submission,,,,222549.0,255834.0,272244.0,238921.0,256087.0,252095.0,263190.0,257855.0,257676.0,270981.0,274112.0,B77W,11.4072
248764000.0,,submission,35000.0,1260.0,27.8,64158.6,,,,,,,,,,,B738,4.97785
248764000.0,,submission,31000.0,1530.0,20.3,50340.6,,,,,,,,,,,BCS3,3.9136
248758000.0,,submission,39000.0,1470.0,26.5,65195.3,62720.2,65163.7,64612.2,66064.8,65112.5,67573.4,64976.5,65861.4,65940.0,64604.6,B38M,4.35642
248764000.0,,submission,37000.0,1530.0,24.2,63873.0,59431.6,58813.4,59139.5,60372.6,60774.5,60452.0,61539.4,62588.2,61772.7,60045.3,A320,6.07218
248755000.0,,submission,31000.0,930.0,33.3,58110.7,64634.7,64159.6,64267.8,59497.0,58922.1,58212.6,57819.5,58202.6,58121.8,,B738,4.97785
248754000.0,,submission,32000.0,1050.0,30.5,56218.5,57613.1,58116.2,57592.0,60816.7,61158.3,60158.4,59105.6,58081.1,58329.0,57861.5,A320,6.07218
248754000.0,,submission,36000.0,1470.0,24.5,45610.8,,,,,,,,,,,E190,3.11874


In [19]:
# Same categorical conversion of aircraft_type that was applied to the training frame.
aircraft_type_factor = data["aircraft_type"].asfactor()
data["aircraft_type"] = aircraft_type_factor

In [20]:
# Read the final submission set into pandas for inspection.
# NOTE(review): absolute, machine-specific path -- will not resolve on other hosts.
submission_set = pd.read_csv(
    "/mnt/SMB_share/mark/flight_competition/final_submission_set.csv"
)
submission_set

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248753821,2022-01-01,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,Istanbul Sabiha Gokcen,TR,LFLL,Lyon,FR,2022-01-01T09:44:00Z,2022-01-01T12:48:33Z,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,170,15,1122,
1,248753822,2022-01-01,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,Brussels,BE,KJFK,New York JFK,US,2022-01-01T09:45:00Z,2022-01-01T17:49:51Z,A333,H,bdeeef3a675587d530de70a25d7118d2,470,15,3205,
2,248754498,2022-01-01,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,Miami,US,EGLL,London Heathrow,GB,2022-01-01T01:52:00Z,2022-01-01T09:55:16Z,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,473,10,3965,
3,248763650,2022-01-01,35f7721f68bf85128195547ae38b0f04,EBBR,Brussels,BE,LEAL,Alicante,ES,2022-01-01T12:02:00Z,2022-01-01T14:13:56Z,B738,M,f53c55b5cf0cbb3be755bf50df6fa52d,123,9,802,
4,248763651,2022-01-01,eb56918bee9bc5204624186b9bcc4391,LSZH,Zurich,CH,LFPG,Paris Charles de Gaulle,FR,2022-01-01T12:03:00Z,2022-01-01T13:09:44Z,BCS3,M,2d5def0a5a844b343ba1b7cc9cb28fa9,56,11,292,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158144,258068876,2022-12-31,c9fca302ca2e28acab0eb0bb1b46f11b,LTFM,iGA Istanbul,TR,LSZH,Zurich,CH,2022-12-31T09:25:00Z,2022-12-31T12:24:24Z,A321,M,6351ec1b849adacc0cbb3b1313d8d39b,154,25,988,
158145,258064675,2022-12-31,00f96ad0e382476649574ba044c764fc,EHAM,Amsterdam,NL,EDDF,Frankfurt,DE,2022-12-31T10:04:21Z,2022-12-31T10:55:35Z,A320,M,f502877cab405652cf0dd70c2213e730,42,9,240,
158146,258065436,2022-12-31,87c552b7f6d9bbd16a66e95df761c7f2,LEBL,Barcelona,ES,KJFK,New York JFK,US,2022-12-31T09:34:00Z,2022-12-31T17:51:22Z,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,483,14,3426,
158147,258058138,2022-12-31,2cd57e434494606c965bac87c024bda2,LIPE,Bologna,IT,LOWW,Vienna,AT,2022-12-31T09:37:00Z,2022-12-31T10:47:00Z,E195,M,5d407cb11cc29578cc3e292e743f5393,55,15,335,


In [23]:
# Imported here so this cell does not depend on the model-saving cell having run
# (the saved model can be loaded without retraining).
from datetime import datetime

# Locate the saved leader model inside its timestamped directory.
# NOTE(review): absolute, machine-specific path.
model_glob = "/home/mark/prc_challenge/data/models/stage_three_10_27_24_17_53_added/*"
model_files = glob.glob(model_glob)
if not model_files:
    # Explicit error instead of an opaque IndexError from indexing an empty list.
    raise FileNotFoundError("No saved model found matching " + model_glob)
this_model = model_files[0]
stage_three_model = h2o.load_model(this_model)
now = datetime.now()

# Score the submission frame with the loaded model.
predictions = stage_three_model.predict(data)
print("data loaded")

# Keep only the identifier and attach the predictions as `tow`.
# NOTE: this rebinds `data`; re-running this cell without re-running the import
# cell would score a frame that no longer has the feature columns.
data = data[["flight_id"]]
data["tow"] = predictions

# Pull the frame from the cluster once and reuse it for both CSV files.
submission_df = data.as_data_frame()
submission_df.to_csv("data/STAGE_3_SUBMISSION_PREDICTIONS.csv", index=False)
submission_df[["flight_id", "tow"]].to_csv("data/submission_predictions_"+now.strftime("%m_%d_%y_%H_%M")+".csv", index=False)
data

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
data loaded






flight_id,tow
248754000.0,69946.1
248754000.0,217670.0
248754000.0,225983.0
248764000.0,64393.3
248764000.0,50188.8
248758000.0,65032.8
248764000.0,63407.8
248755000.0,57563.9
248754000.0,53713.4
248754000.0,45520.8


In [None]:
## Add next closest airport?