In [1]:
# Import the three libraries this notebook relies on and print the version
# string each of them reports.
import pyspark
import h2o
import pysparkling

for label, module in (
    ("Spark:", pyspark),
    ("PySparkling:", pysparkling),
    ("H2O:", h2o),
):
    print(label, module.__version__)

Spark: 3.5.1
PySparkling: 3.46.0.6-1-3.5
H2O: 3.46.0.6


In [1]:
# Create the Spark session and attach an H2O (Sparkling Water) context to it.
from pyspark.sql import SparkSession
from ai.h2o.sparkling import H2OContext, H2OConf

import os, sys
# Point both PySpark interpreter variables at the interpreter running this kernel.
# This is done before the session is built below.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

spark = (
    SparkSession.builder
    .appName("H2O_Spark_35")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.ext.h2o.client.language", "python")
    # If a ClassNotFoundException appears, uncomment:
    # .config("spark.jars.packages", "ai.h2o:sparkling-water-package_2.12:3.46.0.6-1-3.5")
    .getOrCreate()
)

# KEY POINT: create H2OConf (with no arguments!) and pass it to getOrCreate.
conf = H2OConf().set("spark.ext.h2o.client.language", "python")

hc = H2OContext.getOrCreate(conf)   # do NOT pass `spark` here

import h2o
# Print the version reported by the connected H2O cluster.
print("H2O:", h2o.cluster().version)


Connecting to H2O server at http://host.docker.internal:54323 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,11 secs
H2O_cluster_timezone:,Europe/Belgrade
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 year and 4 days
H2O_cluster_name:,sparkling-water-Paweł_local-1762457130651
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8



Sparkling Water Context:
 * Sparkling Water Version: 3.46.0.6-1-3.5
 * H2O name: sparkling-water-Paweł_local-1762457130651
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (0,192.168.50.103,54321)
  ------------------------

  Open H2O Flow in browser: http://host.docker.internal:54323 (CMD + click in Mac OSX)

    
H2O: 3.46.0.6


In [3]:
# === Load the breast-cancer dataset from scikit-learn ===
import pandas as pd
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

# Feature names contain spaces; replace them with underscores for the column names.
feature_columns = [name.replace(" ", "_") for name in data.feature_names]
pdf = pd.DataFrame(data.data, columns=feature_columns)

# Numeric target: 0=malignant, 1=benign -> store explicit string class labels.
class_labels = {0: "malignant", 1: "benign"}
pdf["target"] = pd.Series(data.target).map(class_labels)

# === pandas -> Spark ===
sdf = spark.createDataFrame(pdf)

record_count = sdf.count()
print("Rekordy:", record_count)


Rekordy: 569


In [4]:
# Print the column names and types of the Spark DataFrame.
sdf.printSchema()

root
 |-- mean_radius: double (nullable = true)
 |-- mean_texture: double (nullable = true)
 |-- mean_perimeter: double (nullable = true)
 |-- mean_area: double (nullable = true)
 |-- mean_smoothness: double (nullable = true)
 |-- mean_compactness: double (nullable = true)
 |-- mean_concavity: double (nullable = true)
 |-- mean_concave_points: double (nullable = true)
 |-- mean_symmetry: double (nullable = true)
 |-- mean_fractal_dimension: double (nullable = true)
 |-- radius_error: double (nullable = true)
 |-- texture_error: double (nullable = true)
 |-- perimeter_error: double (nullable = true)
 |-- area_error: double (nullable = true)
 |-- smoothness_error: double (nullable = true)
 |-- compactness_error: double (nullable = true)
 |-- concavity_error: double (nullable = true)
 |-- concave_points_error: double (nullable = true)
 |-- symmetry_error: double (nullable = true)
 |-- fractal_dimension_error: double (nullable = true)
 |-- worst_radius: double (nullable = true)
 |-- worst_

In [5]:
# Count the rows in each target class and display the result.
class_counts = sdf.groupBy("target").count()
class_counts.show()

+---------+-----+
|   target|count|
+---------+-----+
|malignant|  212|
|   benign|  357|
+---------+-----+



In [6]:
# === Spark -> H2O (the split and the AutoML runs happen in the following cells) ===
# Convert the Spark DataFrame to an H2OFrame; the second argument appears to be
# the name given to the H2O frame — confirm against the PySparkling docs.
hf = hc.asH2OFrame(sdf, "breast_cancer_hf")
response = "target"
# Cast the response column to a factor (categorical) column.
hf[response] = hf[response].asfactor()

In [7]:
# train/valid/test with target ratios 70/15/15: the two ratios describe the first
# two frames and the remaining rows form the third. A fixed seed is passed.
# NOTE: the resulting sizes are only approximately 70/15/15 — in the recorded run
# 391 of the 569 rows ended up in `train` and 89 in `test`.
train, valid, test = hf.split_frame(ratios=[0.70, 0.15], seed=42)

In [8]:
from h2o.automl import H2OAutoML

# Predictors: every column of the H2O frame except the response column.
x = [column for column in hf.columns if column != response]

In [9]:
# Settings for the first AutoML run (no algorithms excluded).
automl_settings = dict(
    max_runtime_secs=600,        # raise this to give AutoML more time
    seed=42,
    sort_metric="AUC",
    nfolds=5,
    balance_classes=True,
    keep_cross_validation_predictions=True,
    keep_cross_validation_models=False,
)
aml = H2OAutoML(**automl_settings)

# Train on `train`; `valid` is passed as the frame used for the leaderboard.
aml.train(x=x, y=response, training_frame=train, leaderboard_frame=valid)

AutoML progress: |
20:27:44.903: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,30,Input,5.0,,,,,,,,,
,2,100,RectifierDropout,20.0,0.0,0.0,0.0028005,0.0021818,0.0,0.0041935,0.1224917,0.4969464,0.0181305
,3,100,RectifierDropout,20.0,0.0,0.0,0.0029125,0.0016495,0.0,-0.0025061,0.1013309,0.9880808,0.0217466
,4,100,RectifierDropout,20.0,0.0,0.0,0.0081655,0.0407208,0.0,-0.0018229,0.1016104,0.9952035,0.0071641
,5,2,Softmax,,0.0,0.0,0.001413,0.0016394,0.0,-0.0216159,0.5435266,-3e-07,0.00374

Unnamed: 0,benign,malignant,Error,Rate
benign,237.0,3.0,0.0125,(3.0/240.0)
malignant,3.0,240.0,0.0123,(3.0/243.0)
Total,240.0,243.0,0.0124,(6.0/483.0)

metric,threshold,value,idx
max f1,0.8571585,0.9876543,37.0
max f2,0.8571585,0.9876543,37.0
max f0point5,0.9948627,0.9914894,30.0
max accuracy,0.8571585,0.9875776,37.0
max precision,1.0,1.0,0.0
max recall,0.0241461,1.0,56.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.8571585,0.9751543,37.0
max min_per_class_accuracy,0.8571585,0.9875,37.0
max mean_per_class_accuracy,0.8571585,0.9875772,37.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0517598,1.0,1.9876543,1.9876543,1.0,1.0,1.0,1.0,0.1028807,0.1028807,98.7654321,98.7654321,0.1028807
2,0.1035197,1.0,1.9876543,1.9876543,1.0,1.0,1.0,1.0,0.1028807,0.2057613,98.7654321,98.7654321,0.2057613
3,0.1511387,1.0,1.9876543,1.9876543,1.0,1.0,1.0,1.0,0.0946502,0.3004115,98.7654321,98.7654321,0.3004115
4,0.2028986,1.0,1.9876543,1.9876543,1.0,1.0,1.0,1.0,0.1028807,0.4032922,98.7654321,98.7654321,0.4032922
5,0.300207,1.0,1.9876543,1.9876543,1.0,1.0,1.0,1.0,0.1934156,0.5967078,98.7654321,98.7654321,0.5967078
6,0.3995859,0.9999994,1.9876543,1.9876543,1.0,0.9999999,1.0,1.0,0.1975309,0.7942387,98.7654321,98.7654321,0.7942387
7,0.5031056,0.8571585,1.8683951,1.9631154,0.94,0.9887249,0.9876543,0.99768,0.1934156,0.9876543,86.8395062,96.3115379,0.9751543
8,0.6004141,0.0001458,0.1268716,1.6655172,0.0638298,0.1037171,0.837931,0.8527964,0.0123457,1.0,-87.3128448,66.5517241,0.8041667
9,0.699793,1.8e-06,0.0,1.4289941,0.0,3.11e-05,0.7189349,0.7316936,0.0,1.0,-100.0,42.8994083,0.6041667
10,0.7991718,1e-07,0.0,1.2512953,0.0,5e-07,0.6295337,0.6407059,0.0,1.0,-100.0,25.1295337,0.4041667

Unnamed: 0,benign,malignant,Error,Rate
benign,232.0,8.0,0.0333,(8.0/240.0)
malignant,2.0,149.0,0.0132,(2.0/151.0)
Total,234.0,157.0,0.0256,(10.0/391.0)

metric,threshold,value,idx
max f1,0.1004335,0.9675325,50.0
max f2,0.1004335,0.978975,50.0
max f0point5,0.9890549,0.9774965,33.0
max accuracy,0.5979155,0.9744246,46.0
max precision,1.0,1.0,0.0
max recall,7.42e-05,1.0,78.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.1004335,0.9469376,50.0
max min_per_class_accuracy,0.5979155,0.9735099,46.0
max mean_per_class_accuracy,0.1004335,0.9767108,50.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.2352941,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.6092715,0.6092715,158.9403974,158.9403974,0.6092715
2,0.3017903,0.999992,2.589404,2.589404,1.0,0.999999,1.0,0.9999998,0.1721854,0.781457,158.9403974,158.9403974,0.781457
3,0.4015345,0.1004335,2.0582442,2.4574598,0.7948718,0.8593341,0.9490446,0.9650573,0.205298,0.986755,105.8244184,145.7459822,0.9534216
4,0.5012788,8.71e-06,0.1327899,1.994898,0.0512821,0.0091952,0.7704082,0.7748603,0.013245,1.0,-86.7210053,99.4897959,0.8125
5,0.6061381,6e-08,0.0,1.649789,0.0,1.4e-06,0.6371308,0.640813,0.0,1.0,-100.0,64.978903,0.6416667
6,1.0,0.0,0.0,1.0,0.0,0.0,0.3861893,0.3884212,0.0,1.0,-100.0,0.0,0.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9846803,0.0106495,0.9746835,0.9871795,1.0,0.974359,0.9871795
aic,,0.0,,,,,
auc,0.997423,0.0021340,0.9946808,0.9979410,1.0,0.995882,0.9986111
err,0.0153197,0.0106495,0.0253165,0.0128205,0.0,0.0256410,0.0128205
err_count,1.2,0.83666,2.0,1.0,0.0,2.0,1.0
f0point5,0.9806905,0.0128158,0.9868421,0.9748428,1.0,0.9677419,0.9740260
f1,0.9806435,0.0134953,0.9677419,0.984127,1.0,0.9677419,0.9836066
f2,0.9808152,0.0214997,0.9493671,0.9935898,1.0,0.9677419,0.9933775
lift_top_group,2.5979793,0.1693603,2.46875,2.516129,2.8888888,2.516129,2.6
loglikelihood,,0.0,,,,,

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
,2025-11-06 20:37:07,0.000 sec,,0.0,0,0.0,,,,,,,
,2025-11-06 20:37:07,1 min 15.383 sec,19000 obs/sec,0.7867495,1,380.0,0.1898743,0.1154022,0.8557854,0.9954561,0.9955318,1.9876543,0.0351967
,2025-11-06 20:37:08,1 min 15.595 sec,17747 obs/sec,8.010352,10,3869.0,0.1264348,0.054789,0.9360545,0.9989883,0.9990419,1.9876543,0.0124224

variable,relative_importance,scaled_importance,percentage
radius_error,1.0,1.0,0.0371642
concave_points_error,0.9977105,0.9977105,0.0370791
fractal_dimension_error,0.9792872,0.9792872,0.0363945
worst_smoothness,0.9775466,0.9775466,0.0363298
mean_perimeter,0.9761767,0.9761767,0.0362789
mean_area,0.9684341,0.9684341,0.0359911
mean_compactness,0.9608803,0.9608803,0.0357104
symmetry_error,0.9606385,0.9606385,0.0357014
mean_concavity,0.9311875,0.9311875,0.0346069
worst_concave_points,0.9264062,0.9264062,0.0344292


In [18]:
# Fetch the AutoML leaderboard and display its first rows.
lb = aml.leaderboard
lb.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
DeepLearning_grid_3_AutoML_1_20251106_202744_model_7,0.98539,0.284253,0.984492,0.030303,0.213946,0.0457731
DeepLearning_grid_1_AutoML_1_20251106_202744_model_4,0.983225,0.65517,0.980432,0.0481602,0.218918,0.0479253
DeepLearning_grid_2_AutoML_1_20251106_202744_model_4,0.982684,0.487211,0.978978,0.0606061,0.234661,0.0550659
DeepLearning_grid_2_AutoML_1_20251106_202744_model_19,0.982143,0.427724,0.978516,0.0606061,0.233003,0.0542902
DeepLearning_grid_1_AutoML_1_20251106_202744_model_46,0.982143,0.390101,0.978978,0.0543831,0.229549,0.0526926
DeepLearning_grid_1_AutoML_1_20251106_202744_model_3,0.981602,0.518414,0.980142,0.0392316,0.207537,0.0430715
DeepLearning_grid_2_AutoML_1_20251106_202744_model_3,0.981602,0.387062,0.980696,0.0454545,0.223225,0.0498296
DeepLearning_grid_3_AutoML_1_20251106_202744_model_3,0.981061,0.335981,0.980361,0.0454545,0.228856,0.0523752
DeepLearning_grid_3_AutoML_1_20251106_202744_model_4,0.980519,0.437478,0.975459,0.0570887,0.229369,0.0526102
DeepLearning_grid_1_AutoML_1_20251106_202744_model_9,0.980519,0.457216,0.977168,0.0419372,0.231944,0.053798


In [19]:
# Convert the leaderboard to pandas and keep only the rows whose model id does
# not contain "Deep", i.e. hide the DeepLearning entries.
leaderboard_pd = lb.as_data_frame()
is_deep = leaderboard_pd["model_id"].str.contains("Deep")
lb_pd = leaderboard_pd[~is_deep]
lb_pd




Unnamed: 0,model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
10,StackedEnsemble_AllModels_3_AutoML_1_20251106_...,0.980519,0.173599,0.976511,0.063312,0.214761,0.046122
13,StackedEnsemble_AllModels_4_AutoML_1_20251106_...,0.979978,0.169734,0.977459,0.060606,0.209964,0.044085
14,StackedEnsemble_BestOfFamily_4_AutoML_1_202511...,0.979978,0.298617,0.977233,0.04816,0.211552,0.044754
15,StackedEnsemble_BestOfFamily_2_AutoML_1_202511...,0.979978,0.203917,0.975631,0.045455,0.204905,0.041986
25,GLM_1_AutoML_1_20251106_202744,0.978355,0.17724,0.9781,0.045455,0.206122,0.042486
29,StackedEnsemble_AllModels_1_AutoML_1_20251106_...,0.977814,0.204458,0.974168,0.045455,0.203135,0.041264
30,StackedEnsemble_BestOfFamily_5_AutoML_1_202511...,0.977814,0.205809,0.975258,0.045455,0.202698,0.041087
33,StackedEnsemble_BestOfFamily_7_AutoML_1_202511...,0.977273,0.224716,0.974965,0.045455,0.205964,0.042421
38,StackedEnsemble_Best1000_1_AutoML_1_20251106_2...,0.976732,0.181055,0.975149,0.057089,0.214667,0.046082
41,StackedEnsemble_BestOfFamily_6_AutoML_1_202511...,0.976461,0.194215,0.977271,0.045455,0.221033,0.048855


In [13]:
# Take the AutoML leader model and report its algorithm and model id.
leader = aml.leader
leader_description = (leader.algo, leader.model_id)
print("Leader:", *leader_description)

Leader: deeplearning DeepLearning_grid_3_AutoML_1_20251106_202744_model_7


In [20]:
# === Evaluation on the test frame ===
perf = leader.model_performance(test_data=test)

# Each metric is computed immediately before it is printed, one after another.
test_metrics = (
    ("AUC(test)   :", perf.auc),
    ("AUCPR(test) :", perf.aucpr),
    ("LogLoss(test):", perf.logloss),
)
for label, metric in test_metrics:
    print(label, metric())

AUC(test)   : 1.0
AUCPR(test) : 1.0
LogLoss(test): 0.05159368900882523


In [21]:
# Find the threshold that maximises F1 on `perf`, then print it together with
# the confusion matrix at the max-F1 threshold.
# NOTE(review): `perf` was computed on the test frame, so this threshold is tuned
# on test data and the matrix below is optimistic — consider choosing the
# threshold on the validation frame instead.
best_thr_f1 = perf.find_threshold_by_max_metric("f1")
print("Best threshold (F1):", best_thr_f1)
print(perf.confusion_matrix(metrics="f1"))

Best threshold (F1): 0.9999276132390292
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9999276132390292
           benign    malignant    Error    Rate
---------  --------  -----------  -------  ----------
benign     61        0            0        (0.0/61.0)
malignant  0         28           0        (0.0/28.0)
Total      61        28           0        (0.0/89.0)


In [22]:
# === Predictions + conversion back to Spark ===
preds_h2o = leader.predict(test)         # columns: predict + one probability column per class label (benign, malignant)
preds_spark = hc.asSparkFrame(preds_h2o)
preds_spark.show(5)

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
+---------+--------------------+------------------+
|  predict|              benign|         malignant|
+---------+--------------------+------------------+
|malignant|7.238676097078231E-5|0.9999276132390292|
|malignant|9.981415322312201E-8|0.9999999001858467|
|malignant|1.574546262827487...|0.9999998425453737|
|malignant|4.211755889030553...|0.9999999999999958|
|malignant|2.828283736897204E-7|0.9999997171716263|
+---------+--------------------+------------------+
only showing top 5 rows



In [23]:
# === Save the model (H2O binary + MOJO) ===
import os
import h2o

# Output directory. The default is the original author's local Windows path;
# set the H2O_MODEL_DIR environment variable to save somewhere else (e.g. when
# running on another machine or operating system).
DEFAULT_MODEL_DIR = r"C:\REPO\studia\PRZETWARZAZBIOROW20252026\P2 Pysparkling\models\h2o_breast_cancer"
outdir = os.environ.get("H2O_MODEL_DIR") or DEFAULT_MODEL_DIR
os.makedirs(outdir, exist_ok=True)

# force=True overwrites an existing binary model with the same id.
bin_path  = h2o.save_model(model=leader, path=outdir, force=True)
# get_genmodel_jar=True also fetches h2o-genmodel.jar alongside the MOJO zip.
mojo_path = leader.download_mojo(path=outdir, get_genmodel_jar=True)
print("Saved BIN :", bin_path)
print("Saved MOJO:", mojo_path)

Saved BIN : C:\REPO\studia\PRZETWARZAZBIOROW20252026\P2 Pysparkling\models\h2o_breast_cancer\DeepLearning_grid_3_AutoML_1_20251106_202744_model_7
Saved MOJO: C:\REPO\studia\PRZETWARZAZBIOROW20252026\P2 Pysparkling\models\h2o_breast_cancer\DeepLearning_grid_3_AutoML_1_20251106_202744_model_7.zip


In [24]:
# Second AutoML run: same settings as the first run, but with DeepLearning and
# XGBoost excluded. This rebinds `aml`, so the first run's object is replaced.
excluded_algos = ["DeepLearning", "XGBoost"]
aml = H2OAutoML(
    seed=42,
    max_runtime_secs=600,
    nfolds=5,
    sort_metric="AUC",
    balance_classes=True,
    keep_cross_validation_models=False,
    keep_cross_validation_predictions=True,
    exclude_algos=excluded_algos,
)
aml.train(x=x, y=response, training_frame=train, leaderboard_frame=valid)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/3
# GBM base models (used / total),1/1
# GLM base models (used / total),1/1
# DRF base models (used / total),1/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,
Custom metalearner hyperparameters,

Unnamed: 0,benign,malignant,Error,Rate
benign,240.0,0.0,0.0,(0.0/240.0)
malignant,1.0,150.0,0.0066,(1.0/151.0)
Total,241.0,150.0,0.0026,(1.0/391.0)

metric,threshold,value,idx
max f1,0.692686,0.9966777,110.0
max f2,0.3704245,0.997358,113.0
max f0point5,0.692686,0.9986684,110.0
max accuracy,0.692686,0.9974425,110.0
max precision,1.0,1.0,0.0
max recall,0.3704245,1.0,113.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.692686,0.9946133,110.0
max min_per_class_accuracy,0.692686,0.9933775,110.0
max mean_per_class_accuracy,0.692686,0.9966887,110.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0102302,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.0264901,0.0264901,158.9403974,158.9403974,0.0264901
2,0.0204604,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.0264901,0.0529801,158.9403974,158.9403974,0.0529801
3,0.0306905,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.0264901,0.0794702,158.9403974,158.9403974,0.0794702
4,0.0409207,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.0264901,0.1059603,158.9403974,158.9403974,0.1059603
5,0.0511509,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.0264901,0.1324503,158.9403974,158.9403974,0.1324503
6,0.1023018,0.9999997,2.589404,2.589404,1.0,0.9999999,1.0,0.9999999,0.1324503,0.2649007,158.9403974,158.9403974,0.2649007
7,0.1508951,0.9999964,2.589404,2.589404,1.0,0.9999984,1.0,0.9999995,0.1258278,0.3907285,158.9403974,158.9403974,0.3907285
8,0.202046,0.999936,2.589404,2.589404,1.0,0.9999812,1.0,0.9999948,0.1324503,0.5231788,158.9403974,158.9403974,0.5231788
9,0.3017903,0.9983077,2.589404,2.589404,1.0,0.9995506,1.0,0.999848,0.2582781,0.781457,158.9403974,158.9403974,0.781457
10,0.4015345,0.2052434,2.1910341,2.4904459,0.8461538,0.8206821,0.9617834,0.9553418,0.218543,1.0,119.1034131,149.044586,0.975

Unnamed: 0,benign,malignant,Error,Rate
benign,237.0,3.0,0.0125,(3.0/240.0)
malignant,5.0,146.0,0.0331,(5.0/151.0)
Total,242.0,149.0,0.0205,(8.0/391.0)

metric,threshold,value,idx
max f1,0.5949943,0.9733333,119.0
max f2,0.4534478,0.969657,124.0
max f0point5,0.6431413,0.9781122,115.0
max accuracy,0.5949943,0.9795396,119.0
max precision,1.0,1.0,0.0
max recall,0.0326891,1.0,156.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.5949943,0.956793,119.0
max min_per_class_accuracy,0.4534478,0.9708333,124.0
max mean_per_class_accuracy,0.5949943,0.9771937,119.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0255754,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.0662252,0.0662252,158.9403974,158.9403974,0.0662252
2,0.0358056,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.0264901,0.0927152,158.9403974,158.9403974,0.0927152
3,0.0409207,1.0,2.589404,2.589404,1.0,1.0,1.0,1.0,0.013245,0.1059603,158.9403974,158.9403974,0.1059603
4,0.0511509,0.9999999,2.589404,2.589404,1.0,0.9999999,1.0,1.0,0.0264901,0.1324503,158.9403974,158.9403974,0.1324503
5,0.1023018,0.999999,2.589404,2.589404,1.0,0.9999995,1.0,0.9999998,0.1324503,0.2649007,158.9403974,158.9403974,0.2649007
6,0.1508951,0.9999854,2.589404,2.589404,1.0,0.9999937,1.0,0.9999978,0.1258278,0.3907285,158.9403974,158.9403974,0.3907285
7,0.202046,0.9998457,2.589404,2.589404,1.0,0.9999308,1.0,0.9999808,0.1324503,0.5231788,158.9403974,158.9403974,0.5231788
8,0.3017903,0.9853038,2.589404,2.589404,1.0,0.9969558,1.0,0.998981,0.2582781,0.781457,158.9403974,158.9403974,0.781457
9,0.4015345,0.3574306,1.9254542,2.4244738,0.7435897,0.7492536,0.9363057,0.9369468,0.192053,0.9735099,92.5454237,142.4473784,0.9318433
10,0.5012788,0.0121699,0.2655799,1.994898,0.1025641,0.1000211,0.7704082,0.7704157,0.0264901,1.0,-73.4420105,99.4897959,0.8125

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9876179,0.0171810,0.972973,1.0,1.0,1.0,0.9651163
aic,19.588245,6.1683445,24.799656,13.474743,19.840965,13.302881,26.522978
auc,0.9972004,0.0038462,0.9925595,1.0,1.0,1.0,0.9934426
err,0.0123821,0.0171810,0.0270270,0.0,0.0,0.0,0.0348837
err_count,1.0,1.4142135,2.0,0.0,0.0,0.0,3.0
f0point5,0.9920587,0.0118600,0.9868421,1.0,1.0,1.0,0.9734513
f1,0.9807824,0.0285843,0.9677419,1.0,1.0,1.0,0.9361702
f2,0.9702013,0.0441551,0.9493671,1.0,1.0,1.0,0.9016393
lift_top_group,2.6287038,0.4838150,2.3125,2.5294118,2.642857,2.21875,3.44
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Fetch the leaderboard of the current `aml` run and display its first rows.
lb = aml.leaderboard
lb.head()

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_2_AutoML_2_20251106_204540,0.979978,0.203917,0.975631,0.0454545,0.204905,0.0419862
GBM_grid_1_AutoML_2_20251106_204540_model_228,0.979437,0.219746,0.974835,0.0454545,0.198551,0.0394227
GBM_grid_1_AutoML_2_20251106_204540_model_120,0.978896,0.282075,0.97252,0.0543831,0.216169,0.0467292
GBM_grid_1_AutoML_2_20251106_204540_model_161,0.978355,0.291956,0.972674,0.0543831,0.204215,0.0417038
GLM_1_AutoML_2_20251106_204540,0.978355,0.17724,0.9781,0.0454545,0.206122,0.0424861
GBM_grid_1_AutoML_2_20251106_204540_model_68,0.978355,0.239388,0.972756,0.0543831,0.213768,0.0456969
StackedEnsemble_AllModels_1_AutoML_2_20251106_204540,0.977814,0.204458,0.974168,0.0454545,0.203135,0.0412638
GBM_grid_1_AutoML_2_20251106_204540_model_237,0.977273,0.232757,0.971302,0.0543831,0.220613,0.0486701
StackedEnsemble_AllModels_3_AutoML_2_20251106_204540,0.976732,0.224431,0.973168,0.0454545,0.250459,0.0627298
GBM_grid_1_AutoML_2_20251106_204540_model_210,0.976732,0.24415,0.970276,0.0606061,0.214569,0.0460398


In [26]:
# Take the leader of the current `aml` run and report its algorithm and model id.
leader = aml.leader
leader_description = (leader.algo, leader.model_id)
print("Leader:", *leader_description)

Leader: stackedensemble StackedEnsemble_BestOfFamily_2_AutoML_2_20251106_204540


In [27]:
# === Evaluation on the test frame ===
perf = leader.model_performance(test_data=test)

# Each metric is computed immediately before it is printed, one after another.
test_metrics = (
    ("AUC(test)   :", perf.auc),
    ("AUCPR(test) :", perf.aucpr),
    ("LogLoss(test):", perf.logloss),
)
for label, metric in test_metrics:
    print(label, metric())

AUC(test)   : 1.0
AUCPR(test) : 1.0
LogLoss(test): 0.030195947664521938


In [28]:
# Find the threshold that maximises F1 on `perf`, then print it together with
# the confusion matrix at the max-F1 threshold.
# NOTE(review): `perf` was computed on the test frame, so this threshold is tuned
# on test data and the matrix below is optimistic — consider choosing the
# threshold on the validation frame instead.
best_thr_f1 = perf.find_threshold_by_max_metric("f1")
print("Best threshold (F1):", best_thr_f1)
print(perf.confusion_matrix(metrics="f1"))

Best threshold (F1): 0.9072210702480294
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9072210702480294
           benign    malignant    Error    Rate
---------  --------  -----------  -------  ----------
benign     61        0            0        (0.0/61.0)
malignant  0         28           0        (0.0/28.0)
Total      61        28           0        (0.0/89.0)


In [29]:
# === Predictions + conversion back to Spark ===
preds_h2o = leader.predict(test)         # columns: predict + one probability column per class label (benign, malignant)
preds_spark = hc.asSparkFrame(preds_h2o)
preds_spark.show(5)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
+---------+--------------------+------------------+
|  predict|              benign|         malignant|
+---------+--------------------+------------------+
|malignant| 0.09277892975197055|0.9072210702480294|
|malignant|0.006204363178459...|0.9937956368215404|
|malignant|1.466680307093959...|0.9998533319692906|
|malignant| 3.12627168597146E-9|0.9999999968737283|
|malignant|0.005996768160698607|0.9940032318393014|
+---------+--------------------+------------------+
only showing top 5 rows



In [31]:
# === Save the model (H2O binary + MOJO) ===
import os
import h2o

# Output directory. The default is the original author's local Windows path;
# when the H2O_MODEL_DIR environment variable is set, this run's models go into
# a "nodeeplarning_xgboost" sub-directory of it instead (e.g. when running on
# another machine or operating system).
DEFAULT_MODEL_DIR = r"C:\REPO\studia\PRZETWARZAZBIOROW20252026\P2 Pysparkling\models\h2o_breast_cancer\nodeeplarning_xgboost"
model_base_dir = os.environ.get("H2O_MODEL_DIR")
if model_base_dir:
    outdir = os.path.join(model_base_dir, "nodeeplarning_xgboost")
else:
    outdir = DEFAULT_MODEL_DIR
os.makedirs(outdir, exist_ok=True)

# force=True overwrites an existing binary model with the same id.
bin_path  = h2o.save_model(model=leader, path=outdir, force=True)
# get_genmodel_jar=True also fetches h2o-genmodel.jar alongside the MOJO zip.
mojo_path = leader.download_mojo(path=outdir, get_genmodel_jar=True)
print("Saved BIN :", bin_path)
print("Saved MOJO:", mojo_path)


Saved BIN : C:\REPO\studia\PRZETWARZAZBIOROW20252026\P2 Pysparkling\models\h2o_breast_cancer\nodeeplarning_xgboost\StackedEnsemble_BestOfFamily_2_AutoML_2_20251106_204540
Saved MOJO: C:\REPO\studia\PRZETWARZAZBIOROW20252026\P2 Pysparkling\models\h2o_breast_cancer\nodeeplarning_xgboost\StackedEnsemble_BestOfFamily_2_AutoML_2_20251106_204540.zip
