## Penguins - Classification Data

In [None]:
pip install h2o

In [9]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns

# Initialize H2O
h2o.init()

data = sns.load_dataset("penguins")

X = data.drop('sex', axis=1)
y = data['sex']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=197)
train_data = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test_data = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))

# Define the target and features
target = "sex"  # This should match the column name of your target in the DataFrame
features = [col for col in train_data.columns if col != target]

# Run AutoML
aml = H2OAutoML(max_runtime_secs=600)  # 10 minutes
aml.train(x=features, y=target, training_frame = train_data) # Weights can be specified for each row and added as a column and added in parameter- weights_column.

# View leaderboard
print(aml.leaderboard)

m = aml.leader
print(m)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,47 mins 45 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,2 months and 21 days
H2O_cluster_name:,H2O_from_python_unknownUser_8bu0qd
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.151 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                   mean_per_class_error    logloss      rmse        mse
GBM_3_AutoML_2_20250123_123249                                         0.326041   0.349147  0.314877  0.0991477
XGBoost_grid_1_AutoML_2_20250123_123249_model_7                        0.331375   0.364206  0.320149  0.102496
XGBoost_grid_1_AutoML_2_20250123_123249_model_6                        0.331375   0.365592  0.319667  0.102187
XGBoost_grid_1_AutoML_2_20250123_123249_model_16                       0.331912   0.382829  0.324202  0.105107
GBM_grid_1_AutoML_2_20250123_123249_model_12                           0.333504   0.364645  0.325899  0.10621
XGBoost_grid_1_AutoML_2_20250123_123249_model_1 

In [None]:
pip install autogluon

In [3]:
from autogluon.tabular import TabularPredictor
import seaborn as sns
import pandas as pd

data = sns.load_dataset("penguins")

data.dropna(inplace=True)

# Train AutoML
predictor = TabularPredictor(label='sex')
predictor.fit(data)

# Make predictions
predictions = predictor.predict(data)
print(predictions)

No path specified. Models will be saved in: "AutogluonModels/ag-20250123_130135"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       11.22 GB / 12.67 GB (88.5%)
Disk Space Avail:   74.80 GB / 107.72 GB (69.4%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong acc

0        Male
1      Female
2      Female
4      Female
5        Male
        ...  
338    Female
340    Female
341      Male
342    Female
343      Male
Name: sex, Length: 333, dtype: object


In [None]:
pip install tpot

In [7]:
import pandas as pd
from tpot import TPOTClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split

tips = sns.load_dataset("penguins")
data = tips.dropna()

data = pd.get_dummies(data, columns= ['species', 'island'], drop_first = True)

X = data.drop('sex', axis=1)
y = data['sex']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=197)

# Run TPOT
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)

# Export best pipeline
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9059399021663171

Generation 2 - Current best internal CV score: 0.9059399021663171

Generation 3 - Current best internal CV score: 0.917120894479385

Generation 4 - Current best internal CV score: 0.9171907756813417

Generation 5 - Current best internal CV score: 0.9171907756813417

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.1, min_samples_leaf=1, min_samples_split=8, n_estimators=100)


## Regression Data - Medical Insurance

In [8]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns

# Initialize H2O
h2o.init()

data = pd.read_csv("/content/insurance.csv")

X = data.drop('charges', axis=1)
y = data['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=197)
train_data = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test_data = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))

# Define the target and features
target = "charges"  # This should match the column name of your target in the DataFrame
features = [col for col in train_data.columns if col != target]

# Run AutoML
aml = H2OAutoML(max_runtime_secs=600)  # 10 minutes
aml.train(x=features, y=target, training_frame = train_data) # Weights can be specified for each row and added as a column and added in parameter- weights_column.

# View leaderboard
print(aml.leaderboard)

m = aml.leader
print(m)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.25" 2024-10-15; OpenJDK Runtime Environment (build 11.0.25+9-post-Ubuntu-1ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.25+9-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.11/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp54m_ys2r
  JVM stdout: /tmp/tmp54m_ys2r/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp54m_ys2r/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,2 months and 21 days
H2O_cluster_name:,H2O_from_python_unknownUser_343ij2
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
model_id                                                      auc    logloss     aucpr    mean_per_class_error      rmse       mse
StackedEnsemble_AllModels_5_AutoML_1_20250123_135207     0.695724   0.632735  0.725576                0.452902  0.471203  0.222033
StackedEnsemble_BestOfFamily_6_AutoML_1_20250123_135207  0.647425   0.661896  0.638636                0.478427  0.484201  0.234451
StackedEnsemble_AllModels_3_AutoML_1_20250123_135207     0.645009   0.66164   0.644194                0.482325  0.484182  0.234432
XGBoost_grid_1_AutoML_1_20250123_135207_model_12         0.643776   0.700939  0.639647                0.47458   0.49667   0.246681
StackedEnsemble_BestOfFamily_4_AutoML_1_20250123_135207  0.6

In [9]:
from autogluon.tabular import TabularPredictor
import seaborn as sns
import pandas as pd

data = pd.read_csv("/content/insurance.csv")

data.dropna(inplace=True)

# Train AutoML
predictor = TabularPredictor(label='charges')
predictor.fit(data)

# Make predictions
predictions = predictor.predict(data)
print(predictions)

No path specified. Models will be saved in: "AutogluonModels/ag-20250123_140212"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       10.44 GB / 12.67 GB (82.4%)
Disk Space Avail:   74.57 GB / 107.72 GB (69.2%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong acc

0       19334.916016
1        3858.539795
2        5589.521484
3        5547.783691
4        4844.771973
            ...     
1333    12254.125977
1334     3529.936035
1335     2676.797363
1336     2927.105957
1337    30839.687500
Name: charges, Length: 1338, dtype: float32
