### Import libraries

In [None]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import StandardScaler
import joblib  # For saving the model
import matplotlib.pyplot as plt

# Start H2O Cluster
h2o.connect(ip="10.1.234.150", port=54321)
h2o.init()

### Load dataset

In [5]:
file_path = "./data/1991-2005/monthly/solar_dataset.csv"
data = pd.read_csv(file_path)

### Data Preprocessing

In [6]:
# # Convert Year and Month into a single feature 'YearMonth'
# data['YearMonth'] = data['Year'].astype(str) + data['Month'].astype(str).str.zfill(2)
# data['YearMonth'] = pd.to_datetime(data['YearMonth'], format='%Y%m')
# 
# # Drop original Year and Month columns
# data = data.drop(['Year', 'Month'], axis=1)

# Feature Scaling for coordinate and solar irradiation inputs
features = ['Azimuth (deg)', 'Longitude', 'Elevation', 'Latitude', 'Year', 'Month']
target = 'Merged Glo (Wh/m^2)'

# Scaling using StandardScaler for the coordinate features
scaler = StandardScaler()
data[['Azimuth (deg)', 'Longitude', 'Elevation', 'Latitude']] = scaler.fit_transform(
    data[['Azimuth (deg)', 'Longitude', 'Elevation', 'Latitude']])

# Convert pandas DataFrame to H2OFrame
h2o_data = h2o.H2OFrame(data)

# Specify input features and target for H2O
X = features
y = target


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


### Train Test Split

In [7]:
# Train-Test Split (80%-20%)
train, test = h2o_data.split_frame(ratios=[0.8], seed=42)

### AutoML with H2O

In [8]:
# H2O AutoML - Set max runtime in seconds or specify max_models
aml = H2OAutoML(max_runtime_secs=600, seed=42, verbosity="info", nfolds=5)
aml.train(x=X, y=y, training_frame=train)

AutoML progress: |Failed polling AutoML progress log: Server error java.lang.IllegalArgumentException:
  Error: No enum constant water.logging.LoggingLevel.info
  Request: GET /99/AutoML/AutoML_1_20241007_170129@@Merged_Glo__Wh_m_2_
    params: {'verbosity': 'info'}
Failed polling AutoML progress log: Server error java.lang.IllegalArgumentException:
  Error: No enum constant water.logging.LoggingLevel.info
  Request: GET /99/AutoML/AutoML_1_20241007_170129@@Merged_Glo__Wh_m_2_
    params: {'verbosity': 'info'}
Failed polling AutoML progress log: Server error java.lang.IllegalArgumentException:
  Error: No enum constant water.logging.LoggingLevel.info
  Request: GET /99/AutoML/AutoML_1_20241007_170129@@Merged_Glo__Wh_m_2_
    params: {'verbosity': 'info'}
Failed polling AutoML progress log: Server error java.lang.IllegalArgumentException:
  Error: No enum constant water.logging.LoggingLevel.info
  Request: GET /99/AutoML/AutoML_1_20241007_170129@@Merged_Glo__Wh_m_2_
    params: {'verbos

key,value
Stacking strategy,cross_validation
Number of base models (used / total),5/5
# GBM base models (used / total),1/1
# DRF base models (used / total),2/2
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),1/1
Metalearner algorithm,GBM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,6896.6055,82.87906,6903.5522,6917.7124,6967.2324,6939.7983,6754.732
mean_residual_deviance,133008312.0,3209001.5,130941864.0,132002848.0,134608384.0,137760832.0,129727616.0
mse,133008312.0,3209001.5,130941864.0,132002848.0,134608384.0,137760832.0,129727616.0
r2,0.9626176,0.0007573,0.963238,0.9631031,0.9625849,0.9613406,0.9628215
residual_deviance,133008312.0,3209001.5,130941864.0,132002848.0,134608384.0,137760832.0,129727616.0
rmse,11532.255,138.71924,11442.983,11489.249,11602.085,11737.156,11389.804
rmsle,,0.0,,,,,


### Prediction and Evaluation

In [9]:
# Leaderboard of top models
lb = aml.leaderboard
print(lb.head())

# Predict on test set
preds = aml.leader.predict(test)

# Evaluate performance
perf = aml.leader.model_performance(test)
print(perf)

model_id                                                    rmse          mse      mae    rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_5_AutoML_1_20241007_170129  11532.7  1.33004e+08  6896.7       nan               1.33004e+08
StackedEnsemble_AllModels_3_AutoML_1_20241007_170129     12744.9  1.62433e+08  7418.34      nan               1.62433e+08
StackedEnsemble_AllModels_4_AutoML_1_20241007_170129     12745.4  1.62446e+08  7418.17      nan               1.62446e+08
StackedEnsemble_BestOfFamily_4_AutoML_1_20241007_170129  12800.4  1.63851e+08  7333.92      nan               1.63851e+08
GBM_grid_1_AutoML_1_20241007_170129_model_11             12848.3  1.65078e+08  7327.33      nan               1.65078e+08
GBM_grid_1_AutoML_1_20241007_170129_model_9              14487.9  2.099e+08    8211.98      nan               2.099e+08
GBM_grid_1_AutoML_1_20241007_170129_model_5              15016.7  2.25501e+08  9013.27      nan               2.25501e+08
StackedEnsemble_BestOfFami

### Retrain Best Model on the Entire Dataset 

In [10]:
# Combine train and test data
full_data = train.rbind(test)

# Retrain the best model (leader) using the entire dataset
aml.leader.train(x=X, y=y, training_frame=full_data)

H2OResponseError: ModelBuilderErrorV3  (water.exceptions.H2OModelBuilderIllegalArgumentException):
    timestamp = 1728310292420
    error_url = '/99/ModelBuilders/stackedensemble'
    msg = 'Illegal argument(s) for StackedEnsemble model: StackedEnsemble_model_python_1728309504984_24.  Details: ERRR on field: _train: Training data must have at least 2 features (incl. response).'
    dev_msg = 'Illegal argument(s) for StackedEnsemble model: StackedEnsemble_model_python_1728309504984_24.  Details: ERRR on field: _train: Training data must have at least 2 features (incl. response).'
    http_status = 412
    values = {'messages': [{'_log_level': 5, '_field_name': '_keep_cross_validation_models', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_keep_cross_validation_predictions', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_keep_cross_validation_fold_assignment', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_fold_assignment', '_message': 'Only for cross-validation.'}, {'_log_level': 5, '_field_name': '_tweedie_power', '_message': 'Only for Tweedie Distribution.'}, {'_log_level': 2, '_field_name': '_train', '_message': 'Dropping unused columns: [Month, Elevation, Latitude, Azimuth (deg), Year, Longitude]'}, {'_log_level': 5, '_field_name': '_tweedie_power', '_message': 'Tweedie power is only used for Tweedie distribution.'}, {'_log_level': 5, '_field_name': '_quantile_alpha', '_message': 'Quantile (alpha) is only used for Quantile regression.'}, {'_log_level': 5, '_field_name': '_max_after_balance_size', '_message': 'Balance classes is false, hide max_after_balance_size'}, {'_log_level': 1, '_field_name': '_train', '_message': 'Training data must have at least 2 features (incl. response).'}, {'_log_level': 5, '_field_name': '_balance_classes', '_message': 'Balance classes is only applicable to classification problems.'}, {'_log_level': 5, '_field_name': '_class_sampling_factors', '_message': 'Class sampling factors is only applicable to classification problems.'}, {'_log_level': 5, '_field_name': '_max_after_balance_size', '_message': 'Max after balance size is only applicable to classification problems.'}, {'_log_level': 5, '_field_name': '_max_confusion_matrix_size', '_message': 'Max confusion matrix size is only applicable to classification problems.'}, {'_log_level': 5, '_field_name': '_max_confusion_matrix_size', '_message': 'Only for multi-class classification problems.'}, {'_log_level': 5, '_field_name': '_max_after_balance_size', '_message': 'Only used with balanced classes'}, {'_log_level': 5, '_field_name': '_class_sampling_factors', '_message': 'Class sampling factors is only applicable if balancing classes.'}], 'algo': 'StackedEnsemble', 'parameters': {'_train': {'name': 'py_8_sid_9546', 'type': 'Key'}, '_valid': None, '_nfolds': 0, '_keep_cross_validation_models': True, '_keep_cross_validation_predictions': False, '_keep_cross_validation_predictions_precision': -1, '_keep_cross_validation_fold_assignment': False, '_parallelize_cross_validation': True, '_auto_rebalance': True, '_preprocessors': None, '_seed': -1, '_fold_assignment': 'AUTO', '_categorical_encoding': 'AUTO', '_max_categorical_levels': 10, '_distribution': 'AUTO', '_tweedie_power': 1.5, '_quantile_alpha': 0.5, '_huber_alpha': 0.9, '_ignored_columns': ['Merged Dir (Wh/m^2)', 'Albedo (unitless)', 'Ozone Flg', 'Zenith (deg)', 'Precip Wat Flg', 'AOD Flg', 'Albedo Flg', 'Ozone (cm)', 'Merged Dif (Wh/m^2)', 'Precip Wat (cm)', 'AOD (unitless)'], '_ignore_const_cols': True, '_weights_column': None, '_offset_column': None, '_fold_column': None, '_treatment_column': None, '_check_constant_response': True, '_is_cv_model': False, '_cv_fold': -1, '_score_each_iteration': False, '_max_runtime_secs': 0.0, '_main_model_time_budget_factor': 0.0, '_stopping_rounds': 0, '_stopping_metric': 'AUTO', '_stopping_tolerance': 0.001, '_response_column': 'Merged Glo (Wh/m^2)', '_balance_classes': False, '_max_after_balance_size': 5.0, '_class_sampling_factors': None, '_max_confusion_matrix_size': 20, '_checkpoint': None, '_pretrained_autoencoder': None, '_custom_metric_func': None, '_custom_distribution_func': None, '_export_checkpoints_dir': None, '_gainslift_bins': -1, '_auc_type': 'AUTO', '_auuc_type': 'AUTO', '_auuc_nbins': -1, '_base_models': [], '_keep_levelone_frame': False, '_keep_base_model_predictions': False, '_metalearner_nfolds': 0, '_metalearner_fold_assignment': None, '_metalearner_fold_column': None, '_blending': None, '_metalearner_transform': 'NONE', '_metalearner_algorithm': 'AUTO', '_metalearner_params': '', '_metalearner_parameters': None, '_score_training_samples': 10000}, 'error_count': 2}
    exception_msg = 'Illegal argument(s) for StackedEnsemble model: StackedEnsemble_model_python_1728309504984_24.  Details: ERRR on field: _train: Training data must have at least 2 features (incl. response).'
    stacktrace = ['water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for StackedEnsemble model: StackedEnsemble_model_python_1728309504984_24.  Details: ERRR on field: _train: Training data must have at least 2 features (incl. response).\n', '    water.exceptions.H2OModelBuilderIllegalArgumentException.makeFromBuilder(H2OModelBuilderIllegalArgumentException.java:19)', '    hex.ModelBuilder.trainModelOnH2ONode(ModelBuilder.java:346)', '    water.api.ModelBuilderHandler.handle(ModelBuilderHandler.java:51)', '    water.api.ModelBuilderHandler.handle(ModelBuilderHandler.java:16)', '    water.api.RequestServer.serve(RequestServer.java:472)', '    water.api.RequestServer.doGeneric(RequestServer.java:303)', '    water.api.RequestServer.doPost(RequestServer.java:227)', '    javax.servlet.http.HttpServlet.service(HttpServlet.java:707)', '    javax.servlet.http.HttpServlet.service(HttpServlet.java:790)', '    org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799)', '    org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:554)', '    org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:233)', '    org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1440)', '    org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:188)', '    org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:505)', '    org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:186)', '    org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1355)', '    org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)', '    org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)', '    org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)', '    water.webserver.jetty9.Jetty9ServerAdapter$LoginHandler.handle(Jetty9ServerAdapter.java:130)', '    org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:146)', '    org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)', '    org.eclipse.jetty.server.Server.handle(Server.java:516)', '    org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:487)', '    org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:732)', '    org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:479)', '    org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)', '    org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)', '    org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)', '    org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)', '    org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338)', '    org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315)', '    org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173)', '    org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)', '    org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:409)', '    org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883)', '    org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034)', '    java.base/java.lang.Thread.run(Thread.java:834)']
    parameters = {'__meta': {'schema_version': 99, 'schema_name': 'StackedEnsembleParametersV99', 'schema_type': 'StackedEnsembleParameters'}, 'model_id': None, 'training_frame': {'__meta': {'schema_version': 3, 'schema_name': 'FrameKeyV3', 'schema_type': 'Key<Frame>'}, 'name': 'py_8_sid_9546', 'type': 'Key<Frame>', 'URL': '/3/Frames/py_8_sid_9546'}, 'validation_frame': None, 'nfolds': 0, 'keep_cross_validation_models': True, 'keep_cross_validation_predictions': False, 'keep_cross_validation_fold_assignment': False, 'parallelize_cross_validation': True, 'distribution': 'AUTO', 'tweedie_power': 1.5, 'quantile_alpha': 0.5, 'huber_alpha': 0.9, 'response_column': {'__meta': {'schema_version': 3, 'schema_name': 'ColSpecifierV3', 'schema_type': 'VecSpecifier'}, 'column_name': 'Merged Glo (Wh/m^2)', 'is_member_of_frames': None}, 'weights_column': None, 'offset_column': None, 'fold_column': None, 'fold_assignment': 'AUTO', 'categorical_encoding': 'AUTO', 'max_categorical_levels': 10, 'ignored_columns': ['Merged Dir (Wh/m^2)', 'Albedo (unitless)', 'Ozone Flg', 'Zenith (deg)', 'Precip Wat Flg', 'AOD Flg', 'Albedo Flg', 'Ozone (cm)', 'Merged Dif (Wh/m^2)', 'Precip Wat (cm)', 'AOD (unitless)'], 'ignore_const_cols': True, 'score_each_iteration': False, 'checkpoint': None, 'stopping_rounds': 0, 'max_runtime_secs': 0.0, 'stopping_metric': 'AUTO', 'stopping_tolerance': 0.001, 'gainslift_bins': -1, 'custom_metric_func': None, 'custom_distribution_func': None, 'export_checkpoints_dir': None, 'auc_type': 'AUTO', 'base_models': [], 'metalearner_algorithm': 'AUTO', 'metalearner_nfolds': 0, 'metalearner_fold_assignment': None, 'metalearner_fold_column': None, 'metalearner_transform': 'NONE', 'keep_levelone_frame': False, 'metalearner_params': '', 'blending_frame': None, 'seed': -1, 'score_training_samples': 10000}
    messages = [{'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'keep_cross_validation_models', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'keep_cross_validation_predictions', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'keep_cross_validation_fold_assignment', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'fold_assignment', 'message': 'Only for cross-validation.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'tweedie_power', 'message': 'Only for Tweedie Distribution.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'WARN', 'field_name': 'train', 'message': 'Dropping unused columns: [Month, Elevation, Latitude, Azimuth (deg), Year, Longitude]'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'tweedie_power', 'message': 'Tweedie power is only used for Tweedie distribution.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'quantile_alpha', 'message': 'Quantile (alpha) is only used for Quantile regression.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_after_balance_size', 'message': 'Balance classes is false, hide max_after_balance_size'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'ERRR', 'field_name': 'train', 'message': 'Training data must have at least 2 features (incl. response).'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'balance_classes', 'message': 'Balance classes is only applicable to classification problems.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'class_sampling_factors', 'message': 'Class sampling factors is only applicable to classification problems.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_after_balance_size', 'message': 'Max after balance size is only applicable to classification problems.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_confusion_matrix_size', 'message': 'Max confusion matrix size is only applicable to classification problems.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_confusion_matrix_size', 'message': 'Only for multi-class classification problems.'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'max_after_balance_size', 'message': 'Only used with balanced classes'}, {'__meta': {'schema_version': 3, 'schema_name': 'ValidationMessageV3', 'schema_type': 'ValidationMessage'}, 'message_type': 'TRACE', 'field_name': 'class_sampling_factors', 'message': 'Class sampling factors is only applicable if balancing classes.'}]
    error_count = 2


### Retrain Best Model on the Entire Dataset 

In [None]:
model_filename = 'best_solar_model.pkl'
joblib.dump(tpot.fitted_pipeline_, model_filename)
print(f"Best model saved to {model_filename}")

### Plotting Actual vs Predicted

In [None]:
# Convert H2O predictions to pandas for visualization
test_df = test.as_data_frame()
preds_df = preds.as_data_frame()

# Plot Actual vs Predicted Solar Irradiation
plt.figure(figsize=(10, 6))
plt.plot(test_df[target].values, label='Actual')
plt.plot(preds_df['predict'].values, label='Predicted', linestyle='--')
plt.legend()
plt.xlabel('Samples')
plt.ylabel('Merged Glo (Wh/m^2)')
plt.title('Actual vs Predicted Solar Irradiation (Test Data)')
plt.show()


In [None]:
# plot the predicted values 
plt.figure(figsize=(10, 6))
plt.plot(preds[:50], label='Predicted')
plt.legend()
plt.xlabel('Samples')
plt.ylabel('Merged Glo (Wh/m^2)')
plt.title('Predicted Solar Irradiation')
plt.show()

In [None]:
#print maximum and minimum predicted values
print("Maximum predicted value:", max(preds))
print("Minimum predicted value:", min(preds))