In [1]:
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

# From OpenML: https://www.openml.org/search?type=data&status=active&id=43672
dataset_name = "Heart-Disease-Dataset-(Comprehensive)"


def get_data_and_scoring_function(dataset_name):
    X, y = sklearn.datasets.fetch_openml(
        dataset_name, as_frame=True, return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X,
        y,
        random_state=0,
        stratify=y,
    )

    def scoring_function(estimator):
        predictions = estimator.predict_proba(X_test)[:, 1]
        return sklearn.metrics.roc_auc_score(y_test, predictions)

    def train_scoring_function(estimator):
        predictions = estimator.predict_proba(X_train)[:, 1]
        return sklearn.metrics.roc_auc_score(y_train, predictions)

    def get_test_data():
        return X_test, y_test

    return (
        X,
        y,
        X_train,
        y_train,
        get_test_data,
        scoring_function,
        train_scoring_function,
    )


X, y, X_train, y_train, get_test_data, scoring_function, train_scoring_function = (
    get_data_and_scoring_function(dataset_name)
)

X_test, y_test = get_test_data()

print(f"Done Processing and downloading {dataset_name}")

Done Processing and downloading Heart-Disease-Dataset-(Comprehensive)


  warn("Multiple active versions of the dataset matching the name"


In [2]:
# bug from this https://stackoverflow.com/questions/76404811/attributeerror-dataframe-object-has-no-attribute-iteritems
import pandas as pd
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.metrics import log_loss

# Edit the settings to try in both AutoSklearn1 and AutoSklearn2
# Possibilities https://automl.github.io/auto-sklearn/master/api.html

settings = {
    "time_left_for_this_task": 5 * 60,
    "seed": 42,
    "metric": log_loss,
    "n_jobs": 16,
}

pd.DataFrame.iteritems = pd.DataFrame.items

# Create and train AutoSklearn2.0
askl2 = AutoSklearn2Classifier(
    **settings,
    delete_tmp_folder_after_terminate=False,
    tmp_folder="models/autosklearn"
)
askl2.fit(X_train, y_train, dataset_name="heart_disease_comprehensive")



AutoSklearn2Classifier(delete_tmp_folder_after_terminate=False, metric=log_loss,
                       n_jobs=16, per_run_time_limit=960, seed=42,
                       time_left_for_this_task=600,
                       tmp_folder='/home/skhani/Documents/Recent/PV/AutoML/exam/Heart_Disease/models/autosklearn')

In [3]:
from sklearn.metrics import classification_report

y_pred = askl2.predict(X_test)

print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

         0.0     0.9197    0.9000    0.9097       140
         1.0     0.9130    0.9304    0.9216       158

    accuracy                         0.9161       298
   macro avg     0.9164    0.9152    0.9157       298
weighted avg     0.9162    0.9161    0.9160       298



In [5]:
print(
    f"Auto-sklearn 2.0 | train = {train_scoring_function(askl2)} | test = {scoring_function(askl2)}"
)
print(f"Selected `resampling-strategy` = {askl2.resampling_strategy}")
print(
    f"Selected `resampling-strategy-arguments` = {askl2.resampling_strategy_arguments}"
)

print(askl2.sprint_statistics())

askl2.leaderboard(sort_by="rank", ensemble_only=True)

Auto-sklearn 2.0 | train = 1.0 | test = 0.9705696202531645
Selected `resampling-strategy` = cv-iterative-fit
Selected `resampling-strategy-arguments` = {'folds': 10}
auto-sklearn results:
  Dataset name: heart_disease_comprehensive
  Metric: log_loss
  Best validation score: 0.245661
  Number of target algorithm runs: 527
  Number of successful target algorithm runs: 405
  Number of crashed target algorithm runs: 120
  Number of target algorithms that exceeded the time limit: 2
  Number of target algorithms that exceeded the memory limit: 0



Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
128,4,0.04,gradient_boosting,0.311609,10.231822
333,1,0.24,extra_trees,0.245661,15.202969
439,3,0.32,extra_trees,0.251838,15.641046
461,2,0.4,extra_trees,0.251779,13.076456
