In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tpot import TPOTClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import sklearn.metrics
import os

# Any results you write to the current directory are saved as output.
import timeit 

pd.options.display.max_columns = 500
pd.options.display.width = 500

In [3]:
digits = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,
                                                    train_size=0.75, test_size=0.25, random_state=50)

In [3]:
tpot = TPOTClassifier(verbosity=3, 
                      scoring="accuracy", 
                      random_state=50,  
                      n_jobs=-1, 
                      generations=20, 
                      periodic_checkpoint_folder="intermediate_algos",
                      population_size=60,
                      early_stop=10)
times = []
scores = []
winning_pipes = []

for x in range(1):
    start_time = timeit.default_timer()
    tpot.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start_time
    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))
    tpot.export('tpot_mnist_pipeline1.py')
times = [time/60 for time in times]

print('Times:', times)
print('Scores:', scores)   
print('Winning pipelines:', winning_pipes)

32 operators have been imported by TPOT.


Optimization Progress:   0%|          | 0/1260 [00:00<?, ?pipeline/s]

_pre_test decorator: _random_mutation_operator: num_test=0 Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty..

Generation 1 - Current Pareto front scores:

-1	0.9671135430916552	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=False, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.8500000000000001, ExtraTreesClassifier__min_samples_leaf=1, ExtraTreesClassifier__min_samples_split=4, ExtraTreesClassifier__n_estimators=100)
Saving periodic pipeline from pareto front to intermediate_algos\pipeline_gen_1_idx_0_2021.12.15_09-55-46.py
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=False.

Generation 2 - Current Pareto front scores:

-1	0.9671135430916552	ExtraTreesClassifier(input_matrix, ExtraTreesClassifier__bootstrap=False, ExtraTreesClassifier__criterion=entropy, ExtraTre

_pre_test decorator: _random_mutation_operator: num_test=0 Expected n_neighbors <= n_samples,  but n_samples = 50, n_neighbors = 55.
_pre_test decorator: _random_mutation_operator: num_test=1 Expected n_neighbors <= n_samples,  but n_samples = 50, n_neighbors = 75.
_pre_test decorator: _random_mutation_operator: num_test=0 Expected n_neighbors <= n_samples,  but n_samples = 50, n_neighbors = 51.
_pre_test decorator: _random_mutation_operator: num_test=1 Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supported when dual=False, Parameters: penalty='l2', loss='hinge', dual=False.

Generation 6 - Current Pareto front scores:

-1	0.969466484268126	RandomForestClassifier(input_matrix, RandomForestClassifier__bootstrap=False, RandomForestClassifier__criterion=entropy, RandomForestClassifier__max_features=0.15000000000000002, RandomForestClassifier__min_samples_leaf=2, RandomForestClassifier__min_samples_split=6, RandomForestClassifier__n_estimators=100)

_pre_test decorator: _random_mutation_operator: num_test=0 Solver lbfgs supports only dual=False, got dual=True.
_pre_test decorator: _random_mutation_operator: num_test=1 Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty..
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='hinge' is not supported, Parameters: penalty='l1', loss='hinge', dual=True.
_pre_test decorator: _random_mutation_operator: num_test=0 Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..

Generation 10 - Current Pareto front scores:

-1	0.9765253077975377	GradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.5, GradientBoostingClassifier__max_depth=8, GradientBoostingClassifier__max_features=0.7500000000000001, GradientBoostingClassif

_pre_test decorator: _random_mutation_operator: num_test=0 Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..

Generation 16 - Current Pareto front scores:

-1	0.9765253077975377	GradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.5, GradientBoostingClassifier__max_depth=8, GradientBoostingClassifier__max_features=0.7500000000000001, GradientBoostingClassifier__min_samples_leaf=8, GradientBoostingClassifier__min_samples_split=18, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.7000000000000001)

-3	0.9788508891928865	ExtraTreesClassifier(GradientBoostingClassifier(MultinomialNB(input_matrix, MultinomialNB__alpha=0.1, MultinomialNB__fit_prior=False), GradientBoostingClassifier__learning_rate=1.0, GradientBoostingClassifier__max_depth=7, GradientBoostingClassifi

_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required by StandardScaler..
_pre_test decorator: _random_mutation_operator: num_test=0 Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty..

Generation 20 - Current Pareto front scores:

-1	0.9765253077975377	GradientBoostingClassifier(input_matrix, GradientBoostingClassifier__learning_rate=0.5, GradientBoostingClassifier__max_depth=8, GradientBoostingClassifier__max_features=0.7500000000000001, GradientBoostingClassifier__min_samples_leaf=8, GradientBoostingClassifier__min_samples_split=18, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.7000000000000001)

-2	0.9788782489740082	GradientBoostingClassifier(SGDClassifier(input_matrix, SGDClassifier__alpha=0.0, SGDClassifier__eta0=0.01, SGDClassifier__fit_intercept=False, SGDClassifier__l1_ratio=0.75, SGDClassifier__learning_rate=constant, SGDClassifier__lo

In [4]:
print('Times:', times)
print('Scores:', scores)   
print('Winning pipelines:', winning_pipes)

Times: [16.918544558333334]
Scores: [0.9790209790209791]
Winning pipelines: [Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=SGDClassifier(alpha=0.0, eta0=0.01,
                                                           fit_intercept=False,
                                                           l1_ratio=0.75,
                                                           learning_rate='constant',
                                                           loss='log',
                                                           penalty='elasticnet',
                                                           power_t=0.0,
                                                           random_state=50))),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(learning_rate=0.5, max_depth=8,
                                            max_features=0.7500000000000001,
                                            min_samples_leaf=3

In [5]:
import h2o
print(h2o.__version__)
from h2o.automl import H2OAutoML

h2o.init(max_mem_size='2G')

3.34.0.3
Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.311-b11, mixed mode)
  Starting server from c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\HP\AppData\Local\Temp\tmp4rry8t0y
  JVM stdout: C:\Users\HP\AppData\Local\Temp\tmp4rry8t0y\h2o_HP_started_from_python.out
  JVM stderr: C:\Users\HP\AppData\Local\Temp\tmp4rry8t0y\h2o_HP_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.3
H2O_cluster_version_age:,2 months and 7 days
H2O_cluster_name:,H2O_from_python_HP_0mf834
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.778 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [6]:
%%time
train = h2o.import_file(r"C:\Users\HP\Desktop\Oreilly\data.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Wall time: 716 ms


In [7]:
train.head()

id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,C33
842302.0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
842517.0,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
84300900.0,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
84348300.0,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
84358400.0,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,
843786.0,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,
844359.0,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,
84458200.0,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,0.5835,1.377,3.856,50.96,0.008805,0.03029,0.02488,0.01448,0.01486,0.005412,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,
844981.0,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,
84501000.0,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,




In [9]:
x = train.columns[2:]
y = 'diagnosis'
# For binary classification, response should be a factor
train[y] = train[y].asfactor()

In [10]:
aml = H2OAutoML(max_models=30, seed=45, max_runtime_secs=28800)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |
10:13:16.618: AutoML: XGBoost is not available; skipping it.
10:13:16.649: Step 'best_of_family_xgboost' not defined in provider 'StackedEnsemble': skipping it.
10:13:16.649: Step 'all_xgboost' not defined in provider 'StackedEnsemble': skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  DeepLearning_grid_2_AutoML_1_20211215_101316_model_3


Status of Neuron Layers: predicting diagnosis, 2-class classification, bernoulli distribution, CrossEntropy loss, 13.402 weights/biases, 168,3 KB, 1.615.960 training samples, mini-batch size 1


Unnamed: 0,Unnamed: 1,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
0,,1,30,Input,20.0,,,,,,,,,
1,,2,100,RectifierDropout,40.0,0.0,0.0,0.025184,0.11386,0.0,-0.001596,0.211028,0.188828,0.2187
2,,3,100,RectifierDropout,40.0,0.0,0.0,0.061669,0.195709,0.0,-0.02675,0.143085,0.734084,0.314972
3,,4,2,Softmax,,0.0,0.0,0.022006,0.111479,0.0,0.003793,0.545358,0.195055,0.386103




ModelMetricsBinomial: deeplearning
** Reported on train data. **

MSE: 0.00494764712098518
RMSE: 0.07033951322681427
LogLoss: 0.015590989014707317
Mean Per-Class Error: 0.0014005602240896309
AUC: 0.9999603615030919
AUCPR: 0.9999330913443776
Gini: 0.9999207230061837

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.02180629401409174: 


Unnamed: 0,Unnamed: 1,B,M,Error,Rate
0,B,356.0,1.0,0.0028,(1.0/357.0)
1,M,0.0,212.0,0.0,(0.0/212.0)
2,Total,356.0,213.0,0.0018,(1.0/569.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.02180629,0.997647,21.0
1,max f2,0.02180629,0.999057,21.0
2,max f0point5,0.9419725,0.997137,17.0
3,max accuracy,0.02180629,0.998243,21.0
4,max precision,1.0,1.0,0.0
5,max recall,0.02180629,1.0,21.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.02180629,0.996252,21.0
8,max min_per_class_accuracy,0.02180629,0.997199,21.0
9,max mean_per_class_accuracy,0.02180629,0.998599,21.0



Gains/Lift Table: Avg response rate: 37,26 %, avg score: 36,87 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.288225,1.0,2.683962,2.683962,1.0,1.0,1.0,1.0,0.773585,0.773585,168.396226,168.396226,0.773585
1,2,0.300527,1.0,2.683962,2.683962,1.0,1.0,1.0,1.0,0.033019,0.806604,168.396226,168.396226,0.806604
2,3,0.400703,0.0001072414,1.930569,2.495614,0.719298,0.6809459,0.929825,0.920236,0.193396,1.0,93.056935,149.561404,0.955182
3,4,0.500879,4.77752e-08,0.0,1.996491,0.0,1.200455e-05,0.74386,0.736192,0.0,1.0,-100.0,99.649123,0.795518
4,5,0.599297,3.143182e-11,0.0,1.668622,0.0,5.966873e-09,0.621701,0.615292,0.0,1.0,-100.0,66.86217,0.638655
5,6,0.699473,6.524262e-14,0.0,1.429648,0.0,5.452312e-12,0.532663,0.527172,0.0,1.0,-100.0,42.964824,0.478992
6,7,0.799649,1.72698e-16,0.0,1.250549,0.0,1.565417e-14,0.465934,0.461131,0.0,1.0,-100.0,25.054945,0.319328
7,8,0.899824,5.2413439999999996e-20,0.0,1.111328,0.0,2.041642e-17,0.414062,0.409794,0.0,1.0,-100.0,11.132812,0.159664
8,9,1.0,2.622848e-40,0.0,1.0,0.0,5.736249e-21,0.372583,0.368743,0.0,1.0,-100.0,0.0,0.0




ModelMetricsBinomial: deeplearning
** Reported on cross-validation data. **

MSE: 0.01682091078871903
RMSE: 0.12969545400174606
LogLoss: 0.09260203554029675
Mean Per-Class Error: 0.01879525395063686
AUC: 0.9962607684583268
AUCPR: 0.9951964370103076
Gini: 0.9925215369166536

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6303770894367192: 


Unnamed: 0,Unnamed: 1,B,M,Error,Rate
0,B,352.0,5.0,0.014,(5.0/357.0)
1,M,5.0,207.0,0.0236,(5.0/212.0)
2,Total,357.0,212.0,0.0176,(10.0/569.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.6303771,0.976415,41.0
1,max f2,0.6303771,0.976415,41.0
2,max f0point5,0.979202,0.986056,27.0
3,max accuracy,0.6303771,0.982425,41.0
4,max precision,1.0,1.0,0.0
5,max recall,6.829745e-07,1.0,175.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.6303771,0.962409,41.0
8,max min_per_class_accuracy,0.6303771,0.976415,41.0
9,max mean_per_class_accuracy,0.6303771,0.981205,41.0



Gains/Lift Table: Avg response rate: 37,26 %, avg score: 37,29 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.265378,1.0,2.683962,2.683962,1.0,1.0,1.0,1.0,0.712264,0.712264,168.396226,168.396226,0.712264
1,2,0.300527,0.9999998,2.683962,2.683962,1.0,0.9999999,1.0,1.0,0.09434,0.806604,168.396226,168.396226,0.806604
2,3,0.400703,0.009552546,1.742221,2.448527,0.649123,0.7215549,0.912281,0.930389,0.174528,0.981132,74.222112,144.852698,0.92511
3,4,0.500879,2.116e-05,0.141261,1.987074,0.052632,0.001345271,0.740351,0.74458,0.014151,0.995283,-85.873883,98.707382,0.788
4,5,0.599297,7.86e-07,0.0,1.660751,0.0,5.486786e-06,0.618768,0.622304,0.0,0.995283,-100.0,66.075084,0.631137
5,6,0.704745,7e-08,0.044733,1.418953,0.016667,2.773333e-07,0.528678,0.529191,0.004717,1.0,-95.52673,41.895262,0.470588
6,7,0.813708,1e-08,0.0,1.228942,0.0,2.483871e-08,0.457883,0.458327,0.0,1.0,-100.0,22.894168,0.296919
7,8,1.0,0.0,0.0,1.0,0.0,0.0,0.372583,0.372945,0.0,1.0,-100.0,0.0,0.0




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.985965,0.013303,0.982456,0.991228,0.991228,0.964912,1.0
1,auc,0.995648,0.003912,0.994257,0.998961,0.990313,0.994709,1.0
2,err,0.014035,0.013303,0.017544,0.008772,0.008772,0.035088,0.0
3,err_count,1.6,1.516575,2.0,1.0,1.0,4.0,0.0
4,f0point5,0.988856,0.009412,0.989583,0.979381,0.995935,0.979381,1.0
5,f1,0.980254,0.019227,0.974359,0.987013,0.989899,0.95,1.0
6,f2,0.972125,0.031881,0.959596,0.994764,0.983936,0.92233,1.0
7,lift_top_group,2.706952,0.268772,2.85,3.0,2.28,2.714286,2.690476
8,logloss,0.09244,0.066132,0.141865,0.043986,0.144036,0.131958,0.000356
9,max_per_class_error,0.035679,0.038008,0.05,0.013158,0.02,0.095238,0.0



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2021-12-15 10:21:16,0.000 sec,,0.0,0,0.0,,,,,,,
1,,2021-12-15 10:21:16,3 min 33.026 sec,24212 obs/sec,10.0,1,5690.0,0.169593,0.122778,0.876963,0.993724,0.992375,2.683962,0.026362
2,,2021-12-15 10:21:21,3 min 38.063 sec,29229 obs/sec,270.0,27,153630.0,0.09656,0.057054,0.960114,0.998322,0.997555,2.683962,0.008787
3,,2021-12-15 10:21:26,3 min 43.110 sec,29883 obs/sec,540.0,54,307260.0,0.087472,0.044674,0.967269,0.999141,0.998667,2.683962,0.00703
4,,2021-12-15 10:21:31,3 min 48.110 sec,30942 obs/sec,830.0,83,472270.0,0.081681,0.03154,0.97146,0.999551,0.999275,2.683962,0.00703
5,,2021-12-15 10:21:36,3 min 53.247 sec,31550 obs/sec,1130.0,113,642970.0,0.078874,0.028774,0.973387,0.999736,0.999569,2.683962,0.005272
6,,2021-12-15 10:21:42,3 min 58.347 sec,32178 obs/sec,1440.0,144,819360.0,0.076191,0.025621,0.975167,0.999802,0.999675,2.683962,0.005272
7,,2021-12-15 10:21:47,4 min 3.399 sec,33019 obs/sec,1770.0,177,1007130.0,0.074273,0.026357,0.976402,0.999921,0.999867,2.683962,0.005272
8,,2021-12-15 10:21:52,4 min 8.541 sec,33541 obs/sec,2100.0,210,1194900.0,0.072076,0.019868,0.977777,0.999947,0.999911,2.683962,0.003515
9,,2021-12-15 10:21:57,4 min 13.638 sec,33831 obs/sec,2420.0,242,1376980.0,0.072156,0.025632,0.977728,0.99996,0.999933,2.683962,0.001757



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,radius_worst,1.0,1.0,0.059246
1,concave points_mean,0.949279,0.949279,0.056241
2,area_worst,0.909152,0.909152,0.053864
3,perimeter_worst,0.862774,0.862774,0.051116
4,concavity_worst,0.721778,0.721778,0.042763
5,texture_worst,0.707516,0.707516,0.041918
6,area_se,0.690148,0.690148,0.040889
7,texture_mean,0.620443,0.620443,0.036759
8,concavity_mean,0.60468,0.60468,0.035825
9,radius_se,0.570768,0.570768,0.033816



See the whole table with table.as_data_frame()




In [11]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
DeepLearning_grid_2_AutoML_1_20211215_101316_model_3,0.996261,0.092602,0.995196,0.0187953,0.129695,0.0168209
DeepLearning_grid_1_AutoML_1_20211215_101316_model_4,0.996155,0.0851442,0.99498,0.0201958,0.132138,0.0174606
DeepLearning_grid_3_AutoML_1_20211215_101316_model_4,0.996115,0.0810954,0.995421,0.0164368,0.117412,0.0137855
StackedEnsemble_BestOfFamily_4_AutoML_1_20211215_101316,0.996109,0.0621652,0.995178,0.0150362,0.123571,0.0152697
DeepLearning_grid_3_AutoML_1_20211215_101316_model_3,0.996063,0.101468,0.994982,0.0206384,0.149978,0.0224933
StackedEnsemble_BestOfFamily_7_AutoML_1_20211215_101316,0.995917,0.0642502,0.994904,0.0187953,0.130045,0.0169117
DeepLearning_grid_2_AutoML_1_20211215_101316_model_2,0.995864,0.0849824,0.994328,0.0291871,0.153942,0.023698
DeepLearning_grid_1_AutoML_1_20211215_101316_model_3,0.995759,0.0829518,0.994638,0.0178373,0.129486,0.0167667
StackedEnsemble_AllModels_6_AutoML_1_20211215_101316,0.995699,0.0640589,0.994968,0.0164368,0.124858,0.0155894
GBM_grid_1_AutoML_1_20211215_101316_model_1,0.995494,0.0779294,0.99363,0.0229969,0.145313,0.0211158




In [12]:
aml.leader

Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  DeepLearning_grid_2_AutoML_1_20211215_101316_model_3


Status of Neuron Layers: predicting diagnosis, 2-class classification, bernoulli distribution, CrossEntropy loss, 13.402 weights/biases, 168,3 KB, 1.615.960 training samples, mini-batch size 1


Unnamed: 0,Unnamed: 1,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
0,,1,30,Input,20.0,,,,,,,,,
1,,2,100,RectifierDropout,40.0,0.0,0.0,0.025184,0.11386,0.0,-0.001596,0.211028,0.188828,0.2187
2,,3,100,RectifierDropout,40.0,0.0,0.0,0.061669,0.195709,0.0,-0.02675,0.143085,0.734084,0.314972
3,,4,2,Softmax,,0.0,0.0,0.022006,0.111479,0.0,0.003793,0.545358,0.195055,0.386103




ModelMetricsBinomial: deeplearning
** Reported on train data. **

MSE: 0.00494764712098518
RMSE: 0.07033951322681427
LogLoss: 0.015590989014707317
Mean Per-Class Error: 0.0014005602240896309
AUC: 0.9999603615030919
AUCPR: 0.9999330913443776
Gini: 0.9999207230061837

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.02180629401409174: 


Unnamed: 0,Unnamed: 1,B,M,Error,Rate
0,B,356.0,1.0,0.0028,(1.0/357.0)
1,M,0.0,212.0,0.0,(0.0/212.0)
2,Total,356.0,213.0,0.0018,(1.0/569.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.02180629,0.997647,21.0
1,max f2,0.02180629,0.999057,21.0
2,max f0point5,0.9419725,0.997137,17.0
3,max accuracy,0.02180629,0.998243,21.0
4,max precision,1.0,1.0,0.0
5,max recall,0.02180629,1.0,21.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.02180629,0.996252,21.0
8,max min_per_class_accuracy,0.02180629,0.997199,21.0
9,max mean_per_class_accuracy,0.02180629,0.998599,21.0



Gains/Lift Table: Avg response rate: 37,26 %, avg score: 36,87 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.288225,1.0,2.683962,2.683962,1.0,1.0,1.0,1.0,0.773585,0.773585,168.396226,168.396226,0.773585
1,2,0.300527,1.0,2.683962,2.683962,1.0,1.0,1.0,1.0,0.033019,0.806604,168.396226,168.396226,0.806604
2,3,0.400703,0.0001072414,1.930569,2.495614,0.719298,0.6809459,0.929825,0.920236,0.193396,1.0,93.056935,149.561404,0.955182
3,4,0.500879,4.77752e-08,0.0,1.996491,0.0,1.200455e-05,0.74386,0.736192,0.0,1.0,-100.0,99.649123,0.795518
4,5,0.599297,3.143182e-11,0.0,1.668622,0.0,5.966873e-09,0.621701,0.615292,0.0,1.0,-100.0,66.86217,0.638655
5,6,0.699473,6.524262e-14,0.0,1.429648,0.0,5.452312e-12,0.532663,0.527172,0.0,1.0,-100.0,42.964824,0.478992
6,7,0.799649,1.72698e-16,0.0,1.250549,0.0,1.565417e-14,0.465934,0.461131,0.0,1.0,-100.0,25.054945,0.319328
7,8,0.899824,5.2413439999999996e-20,0.0,1.111328,0.0,2.041642e-17,0.414062,0.409794,0.0,1.0,-100.0,11.132812,0.159664
8,9,1.0,2.622848e-40,0.0,1.0,0.0,5.736249e-21,0.372583,0.368743,0.0,1.0,-100.0,0.0,0.0




ModelMetricsBinomial: deeplearning
** Reported on cross-validation data. **

MSE: 0.01682091078871903
RMSE: 0.12969545400174606
LogLoss: 0.09260203554029675
Mean Per-Class Error: 0.01879525395063686
AUC: 0.9962607684583268
AUCPR: 0.9951964370103076
Gini: 0.9925215369166536

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6303770894367192: 


Unnamed: 0,Unnamed: 1,B,M,Error,Rate
0,B,352.0,5.0,0.014,(5.0/357.0)
1,M,5.0,207.0,0.0236,(5.0/212.0)
2,Total,357.0,212.0,0.0176,(10.0/569.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.6303771,0.976415,41.0
1,max f2,0.6303771,0.976415,41.0
2,max f0point5,0.979202,0.986056,27.0
3,max accuracy,0.6303771,0.982425,41.0
4,max precision,1.0,1.0,0.0
5,max recall,6.829745e-07,1.0,175.0
6,max specificity,1.0,1.0,0.0
7,max absolute_mcc,0.6303771,0.962409,41.0
8,max min_per_class_accuracy,0.6303771,0.976415,41.0
9,max mean_per_class_accuracy,0.6303771,0.981205,41.0



Gains/Lift Table: Avg response rate: 37,26 %, avg score: 37,29 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.265378,1.0,2.683962,2.683962,1.0,1.0,1.0,1.0,0.712264,0.712264,168.396226,168.396226,0.712264
1,2,0.300527,0.9999998,2.683962,2.683962,1.0,0.9999999,1.0,1.0,0.09434,0.806604,168.396226,168.396226,0.806604
2,3,0.400703,0.009552546,1.742221,2.448527,0.649123,0.7215549,0.912281,0.930389,0.174528,0.981132,74.222112,144.852698,0.92511
3,4,0.500879,2.116e-05,0.141261,1.987074,0.052632,0.001345271,0.740351,0.74458,0.014151,0.995283,-85.873883,98.707382,0.788
4,5,0.599297,7.86e-07,0.0,1.660751,0.0,5.486786e-06,0.618768,0.622304,0.0,0.995283,-100.0,66.075084,0.631137
5,6,0.704745,7e-08,0.044733,1.418953,0.016667,2.773333e-07,0.528678,0.529191,0.004717,1.0,-95.52673,41.895262,0.470588
6,7,0.813708,1e-08,0.0,1.228942,0.0,2.483871e-08,0.457883,0.458327,0.0,1.0,-100.0,22.894168,0.296919
7,8,1.0,0.0,0.0,1.0,0.0,0.0,0.372583,0.372945,0.0,1.0,-100.0,0.0,0.0




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.985965,0.013303,0.982456,0.991228,0.991228,0.964912,1.0
1,auc,0.995648,0.003912,0.994257,0.998961,0.990313,0.994709,1.0
2,err,0.014035,0.013303,0.017544,0.008772,0.008772,0.035088,0.0
3,err_count,1.6,1.516575,2.0,1.0,1.0,4.0,0.0
4,f0point5,0.988856,0.009412,0.989583,0.979381,0.995935,0.979381,1.0
5,f1,0.980254,0.019227,0.974359,0.987013,0.989899,0.95,1.0
6,f2,0.972125,0.031881,0.959596,0.994764,0.983936,0.92233,1.0
7,lift_top_group,2.706952,0.268772,2.85,3.0,2.28,2.714286,2.690476
8,logloss,0.09244,0.066132,0.141865,0.043986,0.144036,0.131958,0.000356
9,max_per_class_error,0.035679,0.038008,0.05,0.013158,0.02,0.095238,0.0



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2021-12-15 10:21:16,0.000 sec,,0.0,0,0.0,,,,,,,
1,,2021-12-15 10:21:16,3 min 33.026 sec,24212 obs/sec,10.0,1,5690.0,0.169593,0.122778,0.876963,0.993724,0.992375,2.683962,0.026362
2,,2021-12-15 10:21:21,3 min 38.063 sec,29229 obs/sec,270.0,27,153630.0,0.09656,0.057054,0.960114,0.998322,0.997555,2.683962,0.008787
3,,2021-12-15 10:21:26,3 min 43.110 sec,29883 obs/sec,540.0,54,307260.0,0.087472,0.044674,0.967269,0.999141,0.998667,2.683962,0.00703
4,,2021-12-15 10:21:31,3 min 48.110 sec,30942 obs/sec,830.0,83,472270.0,0.081681,0.03154,0.97146,0.999551,0.999275,2.683962,0.00703
5,,2021-12-15 10:21:36,3 min 53.247 sec,31550 obs/sec,1130.0,113,642970.0,0.078874,0.028774,0.973387,0.999736,0.999569,2.683962,0.005272
6,,2021-12-15 10:21:42,3 min 58.347 sec,32178 obs/sec,1440.0,144,819360.0,0.076191,0.025621,0.975167,0.999802,0.999675,2.683962,0.005272
7,,2021-12-15 10:21:47,4 min 3.399 sec,33019 obs/sec,1770.0,177,1007130.0,0.074273,0.026357,0.976402,0.999921,0.999867,2.683962,0.005272
8,,2021-12-15 10:21:52,4 min 8.541 sec,33541 obs/sec,2100.0,210,1194900.0,0.072076,0.019868,0.977777,0.999947,0.999911,2.683962,0.003515
9,,2021-12-15 10:21:57,4 min 13.638 sec,33831 obs/sec,2420.0,242,1376980.0,0.072156,0.025632,0.977728,0.99996,0.999933,2.683962,0.001757



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,radius_worst,1.0,1.0,0.059246
1,concave points_mean,0.949279,0.949279,0.056241
2,area_worst,0.909152,0.909152,0.053864
3,perimeter_worst,0.862774,0.862774,0.051116
4,concavity_worst,0.721778,0.721778,0.042763
5,texture_worst,0.707516,0.707516,0.041918
6,area_se,0.690148,0.690148,0.040889
7,texture_mean,0.620443,0.620443,0.036759
8,concavity_mean,0.60468,0.60468,0.035825
9,radius_se,0.570768,0.570768,0.033816



See the whole table with table.as_data_frame()




In [15]:
test = h2o.import_file(r"C:\Users\HP\Desktop\Oreilly\data.csv")
preds = aml.predict(test)
print(preds)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%


predict,B,M
M,5.12593e-83,1
M,1.53778e-33,1
M,3.15456e-56,1
M,2.03713e-24,1
M,5.68461e-33,1
M,1.88194e-12,1
M,1.95587e-36,1
M,4.04017e-14,1
M,7.120459999999999e-27,1
M,8.71689e-18,1





In [23]:
import autokeras as ak

# Importing data
df = pd.read_csv(r'C:\Users\HP\Desktop\Oreilly/data.csv')

#Define the dependent variable that needs to be predicted (labels)
y = df["diagnosis"].values

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
Y = labelencoder.fit_transform(y) # M=1 and B=0
#################################################################
#Define x and normalize values

#Define the independent variables. Let's also drop Gender, so we can normalize other data
X = df.drop(labels = ["diagnosis", "id"], axis=1) 


from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

#Split data into train and test to verify accuracy after fitting the model. 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [24]:
# Initialize the ImageClassifier.
clf = ak.StructuredDataClassifier(max_trials=1)

INFO:tensorflow:Reloading Oracle from existing project .\structured_data_classifier\oracle.json


In [25]:
clf.fit(X_train, y_train, verbose=1, epochs=1)

Trial 1 Complete [00h 00m 01s]
val_accuracy: 0.8809523582458496

Best val_accuracy So Far: 0.8809523582458496
Total elapsed time: 00h 00m 01s
INFO:tensorflow:Oracle triggered exit
INFO:tensorflow:Assets written to: .\structured_data_classifier\best_model\assets


<tensorflow.python.keras.callbacks.History at 0x178ce1e9188>

In [27]:
# Evaluate on the testing data.
print("Accuracy: {accuracy}".format(accuracy=clf.evaluate(X_test, y_test)))

Accuracy: [0.5418186187744141, 0.7762237787246704]
