In [5]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

from scipy import stats

In [9]:
df = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv')

df.head()

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0


In [14]:
from datetime import datetime
# load data
def parse(x):
	return datetime.strptime(x, '%Y %m %d %H')
df = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pollution.csv',  parse_dates = [['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse)
df.drop('No', axis=1, inplace=True)
# manually specify column names
df.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
df.index.name = 'date'
# mark all NA values with 0
df['pollution'].fillna(0, inplace=True)
# drop the first 24 hours
df = dataset[24:]
# summarize first 5 rows
print(df.head(5))

                     pollution  dew  temp   press wnd_dir  wnd_spd  snow  rain
date                                                                          
2010-01-03 00:00:00       90.0   -7  -6.0  1027.0      SE    58.56     4     0
2010-01-03 01:00:00       63.0   -8  -6.0  1026.0      SE    61.69     5     0
2010-01-03 02:00:00       65.0   -8  -7.0  1026.0      SE    65.71     6     0
2010-01-03 03:00:00       55.0   -8  -7.0  1025.0      SE    68.84     7     0
2010-01-03 04:00:00       65.0   -8  -7.0  1024.0      SE    72.86     8     0


In [15]:
df2 = df.copy()
num_lags = 3 # number of lags and window lenghts for mean aggregation
delay = 1 # predict target one step ahead
for column in df2:
    for lag in range(1,num_lags+1):
        df2[column + '_lag' + str(lag)] = df2[column].shift(lag*-1-(delay-1))
        if column != 'wnd_dir':
            df2[column + '_avg_window_length' + str(lag+1)] = df2[column].shift(-1-(delay-1)).rolling(window=lag+1,center=False).mean().shift(1-(lag+1))

df2.dropna(inplace=True) 

mask = (df2.columns.str.contains('pollution') | df2.columns.str.contains('lag') | df2.columns.str.contains('window'))
df_processed = df2[df2.columns[mask]]

# the columns in the processed dataframe
df_processed.columns

Index(['pollution', 'pollution_lag1', 'pollution_avg_window_length2',
       'pollution_lag2', 'pollution_avg_window_length3', 'pollution_lag3',
       'pollution_avg_window_length4', 'dew_lag1', 'dew_avg_window_length2',
       'dew_lag2', 'dew_avg_window_length3', 'dew_lag3',
       'dew_avg_window_length4', 'temp_lag1', 'temp_avg_window_length2',
       'temp_lag2', 'temp_avg_window_length3', 'temp_lag3',
       'temp_avg_window_length4', 'press_lag1', 'press_avg_window_length2',
       'press_lag2', 'press_avg_window_length3', 'press_lag3',
       'press_avg_window_length4', 'wnd_dir_lag1', 'wnd_dir_lag2',
       'wnd_dir_lag3', 'wnd_spd_lag1', 'wnd_spd_avg_window_length2',
       'wnd_spd_lag2', 'wnd_spd_avg_window_length3', 'wnd_spd_lag3',
       'wnd_spd_avg_window_length4', 'snow_lag1', 'snow_avg_window_length2',
       'snow_lag2', 'snow_avg_window_length3', 'snow_lag3',
       'snow_avg_window_length4', 'rain_lag1', 'rain_avg_window_length2',
       'rain_lag2', 'rain_avg_win

In [16]:
mask = df_processed.columns.str.contains('pollution')
df_processed[df_processed.columns[mask]].head(10)

Unnamed: 0_level_0,pollution,pollution_lag1,pollution_avg_window_length2,pollution_lag2,pollution_avg_window_length3,pollution_lag3,pollution_avg_window_length4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-03 00:00:00,90.0,63.0,64.0,65.0,61.0,55.0,62.0
2010-01-03 01:00:00,63.0,65.0,60.0,55.0,61.666667,65.0,67.0
2010-01-03 02:00:00,65.0,55.0,60.0,65.0,67.666667,83.0,73.5
2010-01-03 03:00:00,55.0,65.0,74.0,83.0,79.666667,91.0,81.25
2010-01-03 04:00:00,65.0,83.0,87.0,91.0,86.666667,86.0,85.5
2010-01-03 05:00:00,83.0,91.0,88.5,86.0,86.333333,82.0,86.25
2010-01-03 06:00:00,91.0,86.0,84.0,82.0,84.666667,86.0,83.0
2010-01-03 07:00:00,86.0,82.0,84.0,86.0,82.0,78.0,86.0
2010-01-03 08:00:00,82.0,86.0,82.0,78.0,87.333333,98.0,92.25
2010-01-03 09:00:00,86.0,78.0,88.0,98.0,94.333333,107.0,93.25


In [17]:
df_processed.reset_index(drop=True,inplace=True)
df_train = df_processed.loc[:int(df_processed.shape[0]*0.8),:]
df_test = df_processed.loc[int(df_processed.shape[0]*0.8):,:]

In [18]:
h2o.init(nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.311-b11, mixed mode)
  Starting server from c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\HP\AppData\Local\Temp\tmp4pnfjcck
  JVM stdout: C:\Users\HP\AppData\Local\Temp\tmp4pnfjcck\h2o_HP_started_from_python.out
  JVM stderr: C:\Users\HP\AppData\Local\Temp\tmp4pnfjcck\h2o_HP_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,2 months and 8 days
H2O_cluster_name:,H2O_from_python_HP_e40ez8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.481 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [19]:
hf_train = h2o.H2OFrame(df_train)
hf_test = h2o.H2OFrame(df_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [20]:
y = 'pollution'
X = hf_train.columns
X.remove(y)

In [21]:
aml = H2OAutoML(max_runtime_secs = 600,
                seed = 42)
aml.train(x = X, 
          y = y,
          training_frame = hf_train,
          leaderboard_frame = hf_test)

AutoML progress: |
12:21:17.930: AutoML: XGBoost is not available; skipping it.
12:21:17.961: Step 'best_of_family_xgboost' not defined in provider 'StackedEnsemble': skipping it.
12:21:17.961: Step 'all_xgboost' not defined in provider 'StackedEnsemble': skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_1_AutoML_1_20211215_122117

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 511.13053399652415
RMSE: 22.608196168569577
MAE: 12.220461595950464
RMSLE: NaN
R^2: 0.9372172678244101
Mean Residual Deviance: 511.13053399652415
Null degrees of freedom: 10058
Residual degrees of freedom: 10056
Null deviance: 81895219.0243069
Residual deviance: 5141462.041471036
AIC: 91288.4163435137

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 721.731163628250



In [22]:
lb = aml.leaderboard

lb

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_1_AutoML_1_20211215_122117,543.663,23.3166,543.663,12.3287,0.436116
StackedEnsemble_BestOfFamily_2_AutoML_1_20211215_122117,545.586,23.3578,545.586,12.3971,0.439299
StackedEnsemble_BestOfFamily_3_AutoML_1_20211215_122117,545.904,23.3646,545.904,12.3943,
GBM_1_AutoML_1_20211215_122117,547.139,23.391,547.139,12.5195,
StackedEnsemble_AllModels_2_AutoML_1_20211215_122117,547.185,23.392,547.185,12.3188,0.438279
StackedEnsemble_AllModels_3_AutoML_1_20211215_122117,547.661,23.4022,547.661,12.2907,0.437811
StackedEnsemble_AllModels_4_AutoML_1_20211215_122117,547.688,23.4027,547.688,12.2909,0.437829
StackedEnsemble_AllModels_1_AutoML_1_20211215_122117,551.176,23.4771,551.176,12.3664,0.438569
GBM_grid_1_AutoML_1_20211215_122117_model_6,562.328,23.7135,562.328,12.8119,0.450721
GBM_3_AutoML_1_20211215_122117,574.144,23.9613,574.144,12.6829,0.442784




In [23]:
leader_model = aml.leader

hf_test_predict = leader_model.predict(hf_test)

hf_test_predict.head(5)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict
61.8643
63.727
74.4689
91.8868
115.659




In [24]:
df_results = pd.DataFrame()
df_results['ground_truth'] = df_test['pollution'].reset_index(drop=True)
df_results['predictions'] = h2o.as_list(hf_test_predict,use_pandas=True)
df_results.head()

Unnamed: 0,ground_truth,predictions
0,53.0,61.86427
1,65.0,63.726969
2,70.0,74.468893
3,79.0,91.886763
4,92.0,115.659135
