In [1]:
## mount Google Drive so the thesis files can be accessed from this runtime
from google.colab import drive

drive.mount("/content/drive")

## once mounted, the Drive contents are available under "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

## put the jenga directory stored on Drive on the import path (needed by the `from jenga...` imports)
import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf


In [2]:
## install third-party dependencies: OpenML client, PyOD, MXNet and AutoGluon
## NOTE(review): versions are unpinned; the recorded output of this notebook shows
## mxnet-mkl 1.6.0 and AutoGluon 0.0.12 -- consider pinning for reproducibility
!pip install openml
!pip install pyod

!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [3]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, AutoGluonOutlierDetection
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, AutoGluonImputation
from jenga.cleaning.clean import Clean

In [4]:
## global seed; run_experiment reads it and passes it to Dataset and PipelinePerformancePrediction
seed = 10

In [5]:
def run_experiment(dataset_name, learner, param_grid, corruptions, fraction, cleaners, num_repetitions, categorical_precision_threshold=0.7, numerical_std_error_threshold=2.0):
    """Run one corrupt-then-clean experiment on a dataset and return its summary.

    Steps, in order: load the dataset, split it into train/test, fit the
    pipeline performance predictor (PPP) on the training split, generate a
    corrupted copy of the test split, run the cleaners on the corrupted data
    and collect the PPP scores.

    Note: the notebook-level global ``seed`` is read here and passed to both
    ``Dataset`` and ``PipelinePerformancePrediction``; it must be defined
    before this function is called.

    Args:
        dataset_name: name handed to ``Dataset``; also stored in the summary.
        learner: estimator handed to ``PipelinePerformancePrediction``; also
            stored in the summary under ``'model'``.
        param_grid: parameter grid handed to ``PipelinePerformancePrediction``
            together with ``learner``.
        corruptions: corruption classes forwarded to ``ppp.get_corrupted``.
        fraction: corruption fraction forwarded to ``ppp.get_corrupted``.
        cleaners: cleaner candidates forwarded to ``Clean``.
        num_repetitions: repetition count forwarded to ``ppp.get_corrupted``.
        categorical_precision_threshold: forwarded to ``Clean``.
        numerical_std_error_threshold: forwarded to ``Clean``.

    Returns:
        dict with the keys ``'dataset'``, ``'model'``, ``'corruptions'``,
        ``'cleaners'`` and ``'result'``. ``'result'`` is itself a dict with the
        keys ``'ppp_score_model'``, ``'ppp_score_corrupted'``,
        ``'ppp_score_cleaned'`` and ``'ppp_scores_cleaners'``.
    """

    ## dataset
    dataset = Dataset(seed, dataset_name)

    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()

    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(seed, df_train, lab_train, df_test, lab_test, categorical_columns, numerical_columns, learner, param_grid)
    ppp_model = ppp.fit_ppp(df_train)

    ## generate corrupted data (the perturbation objects and perturbed column names are not needed here)
    df_corrupted, _, _, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions, fraction, num_repetitions)

    ## cleaning (the cleaned frame itself is not used, only the scores and the per-cleaner summary)
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
    _, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_corrupted)

    ## results -- the score on the uncorrupted test split is computed after cleaning, as before
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': best_cleaning_score,
        'ppp_scores_cleaners': cleaner_scores_ppp
    }

    ## summary
    summary = {
        'dataset': dataset_name,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }

    return summary

In [6]:
## dataset names to run the experiments on; each one is handed to run_experiment as dataset_name
datasets = ['parkinsons', 'heart-statlog', 'credit-g']

In [7]:
## model parameters
## `models` is a dict where key = learner (estimator instance) & value = its param_grid
## the active grid has 3 x 3 x 4 = 36 candidate combinations
## NOTE(review): the 'learner__' prefix presumably addresses a pipeline step named 'learner'
## inside PipelinePerformancePrediction -- confirm against the jenga code
models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }#, 
          # RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
          #                           'learner__max_depth': [5, 10, 15]
          #                          }
         }

## add further learners and their corresponding param_grids to the dict above

In [8]:
## each inner list is one corruption setting handed to run_experiment; only MissingValues is active
## (currently disabled alternatives: [Scaling], [GaussianNoise])
corruptions = [[MissingValues]]

In [9]:
## corruption fractions to sweep; each value is forwarded through run_experiment to ppp.get_corrupted
fractions = [
    0.15,
    # further candidates, currently disabled: 0.25, 0.5, 0.75, 0.9
]

In [10]:
## candidate cleaners as (outlier detection, imputation) class pairs; the list is handed to Clean via run_experiment
## NOTE(review): list order may decide which cleaner wins when scores tie -- confirm in Clean before reordering
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    # disabled: (PyODKNNOutlierDetection, AutoGluonImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    # disabled: (PyODIsolationForestOutlierDetection, AutoGluonImputation),
    (AutoGluonOutlierDetection, AutoGluonImputation)
]

In [11]:
%%time
## run run_experiment for every dataset x learner x corruption x fraction combination
## and collect the returned summaries in ind_results
## NOTE(review): ind_results is re-created inside the outer loop, so raising range(1)
## would keep only the last iteration's summaries -- confirm whether that is intended
for _ in range(1):
  print("\n\n..................................ITERATION..................................\n")
  ind_results = []
  
  for dataset in datasets:
    for learner, param_grid in models.items():
      for corruption in corruptions:
        for fraction in fractions:
          ## the trailing 100 is num_repetitions, which run_experiment forwards to ppp.get_corrupted
          ind_results.append(run_experiment(dataset, learner, param_grid, corruption, fraction, cleaners, 100))



..................................ITERATION..................................



Data pickle file already exists and is up to date.


Dataset: parkinsons
Found 0 categorical and 22 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.8s finished



Generating corrupted training data on 39 rows... 

	perturbation: MissingValues: {'column': 'V2', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V16', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V1', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V15', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'V9', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V6', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V17', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'V11', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V8', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	p

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_142858/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_142858/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train Data Columns: 22
Preprocessing data ...
NumExpr defaulting to 2 threads.
Feature Generator processed 124 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.1s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...


	perturbation: MissingValues: {'column': 'V4', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'V14', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V10', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}


	-14.0151	 = Validation root_mean_squared_error score
	0.64s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: ExtraTreesRegressorMSE ...
	-10.3166	 = Validation root_mean_squared_error score
	0.42s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: KNeighborsRegressorUnif ...
	-16.4813	 = Validation root_mean_squared_error score
	0.02s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: KNeighborsRegressorDist ...
	-12.9161	 = Validation root_mean_squared_error score
	0.01s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: LightGBMRegressor ...
	-14.9676	 = Validation root_mean_squared_error score
	0.25s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-11.6816	 = Validation root_mean_squared_error score
	0.69s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetRegressor ...
	-14.8948	 = Validation root_mean_squared_error score
	1.77s	 = Training runtime
	0.02s	 = Validatio

[1000]	train_set's rmse: 3.69631e-07	valid_set's rmse: 5.66817e-06


	-0.0	 = Validation root_mean_squared_error score
	0.73s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0	 = Validation root_mean_squared_error score
	0.55s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 8.72s ...
Evaluation: root_mean_squared_error on test data: 8.215519244107794e-05
Evaluations on test data:
{
    "root_mean_squared_error": 8.215519244107794e-05,
    "mean_absolute_error": 5.290717706159108e-06,
    "explained_variance_score": 0.9462871304479227,
    "r2_score": 0.9420358282123962,
    "pearson_correlation": 0.9771238944448154,
    "mean_squared_error": 9.592204364609817e-11,
    "median_absolute_error": 3.7589890570965282e-06
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_142931/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_142931/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train D

[1000]	train_set's rmse: 0.000150375	valid_set's rmse: 0.00186043
[2000]	train_set's rmse: 3.0847e-06	valid_set's rmse: 0.00182536
[3000]	train_set's rmse: 7.2029e-08	valid_set's rmse: 0.00182506
[4000]	train_set's rmse: 1.59206e-09	valid_set's rmse: 0.00182505
[5000]	train_set's rmse: 3.11113e-11	valid_set's rmse: 0.00182505
[6000]	train_set's rmse: 6.72496e-13	valid_set's rmse: 0.00182505
[7000]	train_set's rmse: 1.47231e-14	valid_set's rmse: 0.00182505
[8000]	train_set's rmse: 3.40554e-16	valid_set's rmse: 0.00182505


	-0.0018	 = Validation root_mean_squared_error score
	5.05s	 = Training runtime
	0.03s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0015	 = Validation root_mean_squared_error score
	0.52s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 12.81s ...
Evaluation: root_mean_squared_error on test data: 0.00784456007582137
Evaluations on test data:
{
    "root_mean_squared_error": 0.00784456007582137,
    "mean_absolute_error": 0.0013168298330135466,
    "explained_variance_score": 0.9592890994366936,
    "r2_score": 0.9592850791267176,
    "pearson_correlation": 0.9841263855993286,
    "mean_squared_error": 4.936854840467849e-06,
    "median_absolute_error": 0.0009691834800157634
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_143026/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_143026/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train

[1000]	train_set's rmse: 0.0038014	valid_set's rmse: 1.60855
[2000]	train_set's rmse: 0.000115494	valid_set's rmse: 1.60798
[3000]	train_set's rmse: 3.44221e-06	valid_set's rmse: 1.60795
[4000]	train_set's rmse: 1.0543e-07	valid_set's rmse: 1.60795
[5000]	train_set's rmse: 3.12303e-09	valid_set's rmse: 1.60795
[6000]	train_set's rmse: 5.83722e-11	valid_set's rmse: 1.60795
[7000]	train_set's rmse: 1.09177e-12	valid_set's rmse: 1.60795


	-1.6079	 = Validation root_mean_squared_error score
	4.45s	 = Training runtime
	0.03s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-1.1216	 = Validation root_mean_squared_error score
	0.54s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 12.24s ...
Evaluation: root_mean_squared_error on test data: 7.5914236749196125
Evaluations on test data:
{
    "root_mean_squared_error": 7.5914236749196125,
    "mean_absolute_error": 0.8901491779181221,
    "explained_variance_score": 0.9264673392587038,
    "r2_score": 0.9261071104230534,
    "pearson_correlation": 0.9646903930648902,
    "mean_squared_error": 1.40236265292343,
    "median_absolute_error": 0.6344048129504127
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_143059/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_143059/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train Data Columns

[1000]	train_set's rmse: 8.41138e-05	valid_set's rmse: 0.0357267
[2000]	train_set's rmse: 2.09409e-06	valid_set's rmse: 0.035714
[3000]	train_set's rmse: 4.38001e-08	valid_set's rmse: 0.0357138
[4000]	train_set's rmse: 7.39231e-10	valid_set's rmse: 0.0357138
[5000]	train_set's rmse: 1.19256e-11	valid_set's rmse: 0.0357138
[6000]	train_set's rmse: 1.95393e-13	valid_set's rmse: 0.0357138
[7000]	train_set's rmse: 3.44937e-15	valid_set's rmse: 0.0357138


	-0.0357	 = Validation root_mean_squared_error score
	4.13s	 = Training runtime
	0.03s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0269	 = Validation root_mean_squared_error score
	0.63s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 14.13s ...
Evaluation: root_mean_squared_error on test data: 0.11176054577359662
Evaluations on test data:
{
    "root_mean_squared_error": 0.11176054577359662,
    "mean_absolute_error": 0.019821204777027278,
    "explained_variance_score": 0.7904081648672915,
    "r2_score": 0.7801324888419047,
    "pearson_correlation": 0.8902457598194501,
    "mean_squared_error": 0.0008373023911838104,
    "median_absolute_error": 0.009893295177909156
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_143121/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_143121/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train D

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'V1': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2eb6fac8>, 'V2': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e27fcda90>, 'V3': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2efc9198>, 'V4': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2bb27160>, 'V5': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2eb6f940>, 'V6': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1e393588>, 'V7': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2fbfa390>, 'V8': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2804ec88>, 'V9': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2803d7f0>, 'V10': <autogluon.task.tabular_prediction.predictor.TabularP

Evaluation: root_mean_squared_error on test data: 32.55445390777406


Column V22: Num NaNs: Before: 0, Now: 0


Evaluation: root_mean_squared_error on test data: 44.18315405652611
Evaluation: root_mean_squared_error on test data: 2.320513825741169
Evaluation: root_mean_squared_error on test data: 0.0008557906633898262
Evaluation: root_mean_squared_error on test data: 6.9377028623696326e-06
Evaluation: root_mean_squared_error on test data: 0.0015768325242974941
Evaluation: root_mean_squared_error on test data: 0.0004847344107617968
Evaluation: root_mean_squared_error on test data: 0.0011834034023344346
Evaluation: root_mean_squared_error on test data: 0.0021378747377851443
Evaluation: root_mean_squared_error on test data: 0.03137631609124936
Evaluation: root_mean_squared_error on test data: 0.0006800093086876745
Evaluation: root_mean_squared_error on test data: 0.0033649955764906113
Evaluation: root_mean_squared_error on test data: 0.0032364916124538153
Evaluation: root_mean_squared_error on test data: 0.0019056747107709265
Evaluation: root_mean_squared_error on test data: 0.012469434657610046
Ev

Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9805194805194805, 'classification_report': {'1': {'precision': 0.7142857142857143, 'recall': 0.9090909090909091, 'f1-score': 0.8, 'support': 11}, '2': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.8717948717948718, 'macro avg': {'precision': 0.8371428571428572, 'recall': 0.8831168831168831, 'f1-score': 0.8528301886792452, 'support': 39}, 'weighted avg': {'precision': 0.8906959706959707, 'recall': 0.8717948717948718, 'f1-score': 0.8758587324625058, 'support': 39}}}

Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9837662337662337, 'classification_report': {'1': {'precision': 0.7692307692307693, 'recall': 0.9090909090909091, 'f1-score': 0.8333333333333333, 'support': 11}, '2': {'precision': 0.9615384615384616, 'recal

Saved dataset 53: heart-statlog to file /root/.openml/cache/org/openml/www/datasets/53/dataset.pkl.py3
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Dataset: heart-statlog
Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.9s finished



Generating corrupted training data on 54 rows... 

	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'thal', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: 

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_143208/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_143208/
AutoGluon Version:  0.0.12
Train Data Rows:    172
Train Data Columns: 13
Preprocessing data ...
Feature Generator processed 172 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-8.7894	 = Validation root_mean_squared_error score
	0.63s	 = Training runti

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'age': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2a802390>, 'sex': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2804e358>, 'chest': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e27db0ac8>, 'resting_blood_pressure': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1e3af940>, 'serum_cholestoral': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e9275c9b0>, 'fasting_blood_sugar': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2803d748>, 'resting_electrocardiographic_results': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6ea11b0c88>, 'maximum_heart_rate_achieved': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e280382e8>, 'exercise_induced_angina': <autogluon.t

Evaluation: root_mean_squared_error on test data: 7.030646161416363


Column thal: Num NaNs: Before: 0, Now: 0


Evaluation: root_mean_squared_error on test data: 0.3909377167893328
Evaluation: root_mean_squared_error on test data: 0.8978297064831683
Evaluation: root_mean_squared_error on test data: 14.066473581081684
Evaluation: root_mean_squared_error on test data: 42.17541950917734
Evaluation: root_mean_squared_error on test data: 0.15145197547026173
Evaluation: root_mean_squared_error on test data: 0.9849172201835534
Evaluation: root_mean_squared_error on test data: 17.213761562191237
Evaluation: root_mean_squared_error on test data: 0.43610610493280294
Evaluation: root_mean_squared_error on test data: 1.0002617066315307
Evaluation: root_mean_squared_error on test data: 0.53322660638329
Evaluation: root_mean_squared_error on test data: 0.8881863780209592
Evaluation: root_mean_squared_error on test data: 1.840421623558836


Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9326923076923077, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}}

Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9326923076923077, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'p

Saved dataset 31: credit-g to file /root/.openml/cache/org/openml/www/datasets/31/dataset.pkl.py3
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Dataset: credit-g
Found 13 categorical and 7 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.5s finished



Generating corrupted training data on 200 rows... 

	perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'other_payment_plans', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'duration', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'housing', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'savings_status', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: Mi

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_143330/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_143330/
AutoGluon Version:  0.0.12
Train Data Rows:    640
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4


	perturbation: MissingValues: {'column': 'job', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'installment_commitment', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'residence_since', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'other_parties', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.15

Feature Generator processed 640 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.15s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini ...
	0.4219	 = Validation accuracy score
	0.95s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestClassifierEntr ...
	0.4375	 = Validation accuracy score
	0.94s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini ...
	0.3672	 = Validation accuracy sc

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1e3f6e80>, 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e27d1c240>, 'purpose': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e27aed400>, 'savings_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e292f8390>, 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e28030588>, 'personal_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e27db1860>, 'other_parties': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e29f84940>, 'property_magnitude': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2bb095f8>, 'other_payment_plans': <autogluon.task.tabular_prediction.predict

Evaluation: accuracy on test data: 0.445
Evaluation: root_mean_squared_error on test data: 7.699521681198704
Evaluation: accuracy on test data: 0.69
Evaluation: accuracy on test data: 0.215
Evaluation: root_mean_squared_error on test data: 1947.809160752353
Evaluation: accuracy on test data: 0.625
Evaluation: accuracy on test data: 0.505
Evaluation: root_mean_squared_error on test data: 0.9479871991045707
Evaluation: accuracy on test data: 0.495
Evaluation: accuracy on test data: 0.925
Evaluation: root_mean_squared_error on test data: 0.9328901324953283
Evaluation: accuracy on test data: 0.46
Evaluation: root_mean_squared_error on test data: 7.848479289418133
Evaluation: accuracy on test data: 0.81
Evaluation: accuracy on test data: 0.8
Evaluation: root_mean_squared_error on test data: 0.3278367802613017
Evaluation: accuracy on test data: 0.56
Evaluation: root_mean_squared_error on test data: 0.288412531613649
Evaluation: accuracy on test data: 0.645
Evaluation: accuracy on test data: 

Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.809724170172978, 'classification_report': {'bad': {'precision': 0.6666666666666666, 'recall': 0.5161290322580645, 'f1-score': 0.5818181818181819, 'support': 62}, 'good': {'precision': 0.8026315789473685, 'recall': 0.8840579710144928, 'f1-score': 0.8413793103448277, 'support': 138}, 'accuracy': 0.77, 'macro avg': {'precision': 0.7346491228070176, 'recall': 0.7000935016362786, 'f1-score': 0.7115987460815048, 'support': 200}, 'weighted avg': {'precision': 0.7604824561403509, 'recall': 0.77, 'f1-score': 0.7609153605015675, 'support': 200}}}
Column credit_history: Num NaNs: Before: 0, Now: 31
Column purpose: Num NaNs: Before: 0, Now: 18
Column personal_status: Num NaNs: Before: 0, Now: 2
Column other_parties: Num NaNs: Before: 0, Now: 15
Column property_magnitude: Num NaNs: Before: 0, Now: 1
Column other_payment_plans: Num NaNs: Before: 0, Now: 38
Column housing: Num NaNs: Befor

Evaluation: accuracy on test data: 0.445
Evaluation: root_mean_squared_error on test data: 7.699521681198704
Evaluation: accuracy on test data: 0.69
Evaluation: accuracy on test data: 0.215
Evaluation: root_mean_squared_error on test data: 1947.809160752353
Evaluation: accuracy on test data: 0.625
Evaluation: accuracy on test data: 0.505
Evaluation: root_mean_squared_error on test data: 0.9479871991045707
Evaluation: accuracy on test data: 0.495
Evaluation: accuracy on test data: 0.925
Evaluation: root_mean_squared_error on test data: 0.9328901324953283
Evaluation: accuracy on test data: 0.46
Evaluation: root_mean_squared_error on test data: 7.848479289418133
Evaluation: accuracy on test data: 0.81
Evaluation: accuracy on test data: 0.8
Evaluation: root_mean_squared_error on test data: 0.3278367802613017
Evaluation: accuracy on test data: 0.56
Evaluation: root_mean_squared_error on test data: 0.288412531613649
Evaluation: accuracy on test data: 0.645
Evaluation: accuracy on test data: 


Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.809724170172978, 'classification_report': {'bad': {'precision': 0.6666666666666666, 'recall': 0.5161290322580645, 'f1-score': 0.5818181818181819, 'support': 62}, 'good': {'precision': 0.8026315789473685, 'recall': 0.8840579710144928, 'f1-score': 0.8413793103448277, 'support': 138}, 'accuracy': 0.77, 'macro avg': {'precision': 0.7346491228070176, 'recall': 0.7000935016362786, 'f1-score': 0.7115987460815048, 'support': 200}, 'weighted avg': {'precision': 0.7604824561403509, 'recall': 0.77, 'f1-score': 0.7609153605015675, 'support': 200}}} 

Cleaning improved the overall score 



CPU times: user 14min 11s, sys: 49.1 s, total: 15min
Wall time: 12min 6s


In [12]:
## display the list of summary dicts collected by the experiment loop
ind_results

[{'cleaners': [{'Imputation method': MeanModeImputation,
    'Outlier detection method': NoOutlierDetection,
    'PPP score with cleaning': {'classification_report': {'1': {'f1-score': 0.8333333333333333,
       'precision': 0.7692307692307693,
       'recall': 0.9090909090909091,
       'support': 11},
      '2': {'f1-score': 0.9259259259259259,
       'precision': 0.9615384615384616,
       'recall': 0.8928571428571429,
       'support': 28},
      'accuracy': 0.8974358974358975,
      'macro avg': {'f1-score': 0.8796296296296295,
       'precision': 0.8653846153846154,
       'recall': 0.900974025974026,
       'support': 39},
      'weighted avg': {'f1-score': 0.8998100664767332,
       'precision': 0.9072978303747535,
       'recall': 0.8974358974358975,
       'support': 39}},
     'roc_auc_acore': 0.9837662337662337}},
   {'Imputation method': MeanModeImputation,
    'Outlier detection method': PyODKNNOutlierDetection,
    'PPP score with cleaning': {'classification_report': {'1

In [15]:
# Cleaning strategies to compare, each given as an
# (outlier detection method, imputation method) pair.
cleaners = [
    # Baseline: no outlier detection, mean/mode imputation.
    (NoOutlierDetection, MeanModeImputation),
    # Each PyOD detector combined with each of the two imputers
    # (detector varies slowest, so the order is KNN first, then IsolationForest).
    *[
        (detector, imputer)
        for detector in (PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection)
        for imputer in (MeanModeImputation, AutoGluonImputation)
    ],
    # AutoGluon used for both outlier detection and imputation.
    (AutoGluonOutlierDetection, AutoGluonImputation),
]

In [16]:
%%time
# Sweep the whole experiment grid; the outer loop currently performs a single pass.
for _ in range(1):
    print("\n\n..................................ITERATION..................................\n")
    # Re-created at the start of every pass, so it only ever holds the
    # results of the most recent pass.
    ind_results = []

    # One experiment per (dataset, model, corruption, fraction) combination.
    # Results are appended one by one so that partial results remain
    # available if the (long-running) sweep is interrupted.
    for dataset in datasets:
        for learner, param_grid in models.items():
            for corruption in corruptions:
                for fraction in fractions:
                    ind_results.append(
                        run_experiment(
                            dataset,
                            learner,
                            param_grid,
                            corruption,
                            fraction,
                            cleaners,
                            100,  # num_repetitions
                        )
                    )



..................................ITERATION..................................



Data pickle file already exists and is up to date.


Dataset: parkinsons
Found 0 categorical and 22 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.8s finished



Generating corrupted training data on 39 rows... 

	perturbation: MissingValues: {'column': 'V2', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V16', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V1', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V15', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'V9', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V6', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V17', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'V11', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V8', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	p

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153236/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153236/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 124 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...


	perturbation: MissingValues: {'column': 'V11', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V17', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V5', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V8', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V20', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'V22', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'V21', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'V19', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V16', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'V4', 'fr

	-14.0151	 = Validation root_mean_squared_error score
	0.64s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: ExtraTreesRegressorMSE ...
	-10.3166	 = Validation root_mean_squared_error score
	0.52s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: KNeighborsRegressorUnif ...
	-16.4813	 = Validation root_mean_squared_error score
	0.02s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: KNeighborsRegressorDist ...
	-12.9161	 = Validation root_mean_squared_error score
	0.02s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: LightGBMRegressor ...
	-14.9676	 = Validation root_mean_squared_error score
	0.37s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-11.6816	 = Validation root_mean_squared_error score
	0.72s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetRegressor ...
	-15.1367	 = Validation root_mean_squared_error score
	2.03s	 = Training runtime
	0.02s	 = Validatio

[1000]	train_set's rmse: 3.69631e-07	valid_set's rmse: 5.66817e-06


	-0.0	 = Validation root_mean_squared_error score
	0.76s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0	 = Validation root_mean_squared_error score
	0.51s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 7.64s ...
Evaluation: root_mean_squared_error on test data: 8.81141354172665e-05
Evaluations on test data:
{
    "root_mean_squared_error": 8.81141354172665e-05,
    "mean_absolute_error": 5.163190318270694e-06,
    "explained_variance_score": 0.9506247110639006,
    "r2_score": 0.9478451317179262,
    "pearson_correlation": 0.9778761102609147,
    "mean_squared_error": 8.630851433608311e-11,
    "median_absolute_error": 2.6727533731970218e-06
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153309/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153309/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train Dat

[1000]	train_set's rmse: 0.000150375	valid_set's rmse: 0.00186043
[2000]	train_set's rmse: 3.0847e-06	valid_set's rmse: 0.00182536
[3000]	train_set's rmse: 7.2029e-08	valid_set's rmse: 0.00182506
[4000]	train_set's rmse: 1.59206e-09	valid_set's rmse: 0.00182505
[5000]	train_set's rmse: 3.11113e-11	valid_set's rmse: 0.00182505
[6000]	train_set's rmse: 6.72496e-13	valid_set's rmse: 0.00182505
[7000]	train_set's rmse: 1.47231e-14	valid_set's rmse: 0.00182505
[8000]	train_set's rmse: 3.40554e-16	valid_set's rmse: 0.00182505


	-0.0018	 = Validation root_mean_squared_error score
	4.99s	 = Training runtime
	0.03s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0015	 = Validation root_mean_squared_error score
	0.59s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 12.44s ...
Evaluation: root_mean_squared_error on test data: 0.00784456007582137
Evaluations on test data:
{
    "root_mean_squared_error": 0.00784456007582137,
    "mean_absolute_error": 0.0013168298330135466,
    "explained_variance_score": 0.9592890994366936,
    "r2_score": 0.9592850791267176,
    "pearson_correlation": 0.9841263855993286,
    "mean_squared_error": 4.936854840467849e-06,
    "median_absolute_error": 0.0009691834800157634
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153405/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153405/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train

[1000]	train_set's rmse: 0.0038014	valid_set's rmse: 1.60855
[2000]	train_set's rmse: 0.000115494	valid_set's rmse: 1.60798
[3000]	train_set's rmse: 3.44221e-06	valid_set's rmse: 1.60795
[4000]	train_set's rmse: 1.0543e-07	valid_set's rmse: 1.60795
[5000]	train_set's rmse: 3.12303e-09	valid_set's rmse: 1.60795
[6000]	train_set's rmse: 5.83722e-11	valid_set's rmse: 1.60795
[7000]	train_set's rmse: 1.09177e-12	valid_set's rmse: 1.60795


	-1.6079	 = Validation root_mean_squared_error score
	4.44s	 = Training runtime
	0.04s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-1.2747	 = Validation root_mean_squared_error score
	0.51s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 11.49s ...
Evaluation: root_mean_squared_error on test data: 8.796061551840996
Evaluations on test data:
{
    "root_mean_squared_error": 8.796061551840996,
    "mean_absolute_error": 0.8937658317682541,
    "explained_variance_score": 0.9226979792228029,
    "r2_score": 0.9222720244354334,
    "pearson_correlation": 0.9677367119133484,
    "mean_squared_error": 1.475146128986951,
    "median_absolute_error": 0.6357931583598919
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153437/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153437/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train Data Columns:

[1000]	train_set's rmse: 8.41138e-05	valid_set's rmse: 0.0357267
[2000]	train_set's rmse: 2.09409e-06	valid_set's rmse: 0.035714
[3000]	train_set's rmse: 4.38001e-08	valid_set's rmse: 0.0357138
[4000]	train_set's rmse: 7.39231e-10	valid_set's rmse: 0.0357138
[5000]	train_set's rmse: 1.19256e-11	valid_set's rmse: 0.0357138
[6000]	train_set's rmse: 1.95393e-13	valid_set's rmse: 0.0357138
[7000]	train_set's rmse: 3.44937e-15	valid_set's rmse: 0.0357138


	-0.0357	 = Validation root_mean_squared_error score
	4.16s	 = Training runtime
	0.03s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0265	 = Validation root_mean_squared_error score
	0.52s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 13.57s ...
Evaluation: root_mean_squared_error on test data: 0.10302361448190134
Evaluations on test data:
{
    "root_mean_squared_error": 0.10302361448190134,
    "mean_absolute_error": 0.020537386756723904,
    "explained_variance_score": 0.777003121440553,
    "r2_score": 0.7577186399283027,
    "pearson_correlation": 0.884463144400313,
    "mean_squared_error": 0.000922659109837423,
    "median_absolute_error": 0.014393217910503464
}
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153459/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153459/
AutoGluon Version:  0.0.12
Train Data Rows:    124
Train Data

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'V1': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e2f00d860>, 'V2': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e29a25f98>, 'V3': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e292ec080>, 'V4': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1e3ec470>, 'V5': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e29f845c0>, 'V6': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1e3ece80>, 'V7': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e28030c18>, 'V8': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e29a259e8>, 'V9': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e29a27828>, 'V10': <autogluon.task.tabular_prediction.predictor.TabularP

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153525/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153525/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-21.4135	 = Validation root_mean_squared_error score
	0.73s	 = Training runt

[1000]	train_set's rmse: 1.20743e-05	valid_set's rmse: 0.0014217


	-0.0014	 = Validation root_mean_squared_error score
	1.39s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0009	 = Validation root_mean_squared_error score
	0.53s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 8.37s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153555/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153555/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.09s ...
AutoGluon will gauge predictive perform

[1000]	train_set's rmse: 0.000129447	valid_set's rmse: 0.0030499


	-0.003	 = Validation root_mean_squared_error score
	1.19s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0018	 = Validation root_mean_squared_error score
	0.54s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 10.74s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153639/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153639/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.09s ...
AutoGluon will gauge predictive perform

[1000]	train_set's rmse: 0.00139469	valid_set's rmse: 0.0302898


	-0.0301	 = Validation root_mean_squared_error score
	0.52s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-0.0214	 = Validation root_mean_squared_error score
	4.13s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetRegressor ...
	-0.012	 = Validation root_mean_squared_error score
	4.34s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMRegressorCustom ...
	-0.028	 = Validation root_mean_squared_error score
	0.49s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.012	 = Validation root_mean_squared_error score
	0.56s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 12.71s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153739/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153739/
AutoGluon Version:  0.0.12
Train Data Rows:    156


[1000]	train_set's rmse: 0.00455953	valid_set's rmse: 0.0585091
[2000]	train_set's rmse: 0.00165458	valid_set's rmse: 0.0582908


	-0.0583	 = Validation root_mean_squared_error score
	0.72s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-0.0522	 = Validation root_mean_squared_error score
	1.31s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetRegressor ...
	-0.0472	 = Validation root_mean_squared_error score
	1.89s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMRegressorCustom ...


[1000]	train_set's rmse: 0.00027099	valid_set's rmse: 0.0578018


	-0.0578	 = Validation root_mean_squared_error score
	1.06s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0461	 = Validation root_mean_squared_error score
	0.51s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 8.46s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153756/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153756/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.09s ...
AutoGluon will gauge predictive perform

[1000]	train_set's rmse: 4.10915e-05	valid_set's rmse: 0.0639365


	-0.0639	 = Validation root_mean_squared_error score
	1.12s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0594	 = Validation root_mean_squared_error score
	0.52s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 7.93s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153821/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153821/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.09s ...
AutoGluon will gauge predictive perform

Cleaner: {'outlier_detection': PyODKNNOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9837662337662337, 'classification_report': {'1': {'precision': 0.8461538461538461, 'recall': 1.0, 'f1-score': 0.9166666666666666, 'support': 11}, '2': {'precision': 1.0, 'recall': 0.9285714285714286, 'f1-score': 0.962962962962963, 'support': 28}, 'accuracy': 0.9487179487179487, 'macro avg': {'precision': 0.9230769230769231, 'recall': 0.9642857142857143, 'f1-score': 0.9398148148148149, 'support': 39}, 'weighted avg': {'precision': 0.9566074950690335, 'recall': 0.9487179487179487, 'f1-score': 0.9499050332383666, 'support': 39}}}
Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.6168831168831168, 'classification_report': {'1': {'precision': 0.5, 'recall': 0.18181818181818182, 'f1-score': 0.26666666666666666, 'support': 11}, '2': {'precision': 0.7428571428571429, 'recall': 0.9285714285714286, 'f1-score': 0.8

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153855/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153855/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-21.4645	 = Validation root_mean_squared_error score
	0.73s	 = Training runt

[1000]	train_set's rmse: 1.20743e-05	valid_set's rmse: 0.0014217


	-0.0014	 = Validation root_mean_squared_error score
	1.42s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0009	 = Validation root_mean_squared_error score
	0.56s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 7.34s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_153923/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_153923/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.09s ...
AutoGluon will gauge predictive perform

[1000]	train_set's rmse: 0.000129447	valid_set's rmse: 0.0030499


	-0.003	 = Validation root_mean_squared_error score
	1.16s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0018	 = Validation root_mean_squared_error score
	0.54s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 10.53s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154009/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154009/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.09s ...
AutoGluon will gauge predictive perform

[1000]	train_set's rmse: 0.00139469	valid_set's rmse: 0.0302898


	-0.0301	 = Validation root_mean_squared_error score
	0.53s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-0.0214	 = Validation root_mean_squared_error score
	4.21s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetRegressor ...
	-0.0267	 = Validation root_mean_squared_error score
	2.63s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMRegressorCustom ...
	-0.028	 = Validation root_mean_squared_error score
	0.47s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0167	 = Validation root_mean_squared_error score
	0.56s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 11.09s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154100/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154100/
AutoGluon Version:  0.0.12
Train Data Rows:    15

[1000]	train_set's rmse: 0.00455953	valid_set's rmse: 0.0585091
[2000]	train_set's rmse: 0.00165458	valid_set's rmse: 0.0582908


	-0.0583	 = Validation root_mean_squared_error score
	0.71s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-0.0522	 = Validation root_mean_squared_error score
	1.24s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetRegressor ...
	-0.0505	 = Validation root_mean_squared_error score
	2.2s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: LightGBMRegressorCustom ...


[1000]	train_set's rmse: 0.00027099	valid_set's rmse: 0.0578018


	-0.0578	 = Validation root_mean_squared_error score
	1.02s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0477	 = Validation root_mean_squared_error score
	0.63s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 8.73s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154117/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154117/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive perform

[1000]	train_set's rmse: 4.10915e-05	valid_set's rmse: 0.0639365


	-0.0639	 = Validation root_mean_squared_error score
	1.13s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: weighted_ensemble_k0_l1 ...
	-0.0609	 = Validation root_mean_squared_error score
	0.63s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 7.89s ...
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154141/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154141/
AutoGluon Version:  0.0.12
Train Data Rows:    156
Train Data Columns: 22
Preprocessing data ...
Feature Generator processed 156 data points with 21 features
Original Features (raw dtypes):
	float64 features: 21
Original Features (inferred dtypes):
	float features: 21
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 21
Final Features:
	float features: 21
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive perform

Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.8603896103896104, 'classification_report': {'1': {'precision': 0.5454545454545454, 'recall': 0.5454545454545454, 'f1-score': 0.5454545454545454, 'support': 11}, '2': {'precision': 0.8214285714285714, 'recall': 0.8214285714285714, 'f1-score': 0.8214285714285714, 'support': 28}, 'accuracy': 0.7435897435897436, 'macro avg': {'precision': 0.6834415584415584, 'recall': 0.6834415584415584, 'f1-score': 0.6834415584415584, 'support': 39}, 'weighted avg': {'precision': 0.7435897435897436, 'recall': 0.7435897435897436, 'f1-score': 0.7435897435897436, 'support': 39}}}
Column V1: Num NaNs: Before: 0, Now: 0
Column V2: Num NaNs: Before: 0, Now: 5
Column V3: Num NaNs: Before: 0, Now: 31
Column V7: Num NaNs: Before: 0, Now: 0
Column V9: Num NaNs: Before: 0, Now: 0
Column V10: Num NaNs: Before: 7, Now: 7
Column V12: Num NaNs: Before: 0, Now: 0
Column V16: Num NaNs: Before: 0, Now

Evaluation: root_mean_squared_error on test data: 32.582979721559234


Column V22: Num NaNs: Before: 0, Now: 0


Evaluation: root_mean_squared_error on test data: 42.920917535635965
Evaluation: root_mean_squared_error on test data: 3.8343712883644647
Evaluation: root_mean_squared_error on test data: 0.0009743500170331338
Evaluation: root_mean_squared_error on test data: 1.0583684554313212e-05
Evaluation: root_mean_squared_error on test data: 0.0006626751210266909
Evaluation: root_mean_squared_error on test data: 0.0004127786324120747
Evaluation: root_mean_squared_error on test data: 0.002870797056880129
Evaluation: root_mean_squared_error on test data: 0.0024840956553001774
Evaluation: root_mean_squared_error on test data: 0.03287678014066008
Evaluation: root_mean_squared_error on test data: 0.0006894614313835895
Evaluation: root_mean_squared_error on test data: 0.003540398800804717
Evaluation: root_mean_squared_error on test data: 0.003770270255570455
Evaluation: root_mean_squared_error on test data: 0.0019172218210108537
Evaluation: root_mean_squared_error on test data: 0.029112753851502523
Eva

Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9935064935064934, 'classification_report': {'1': {'precision': 0.9090909090909091, 'recall': 0.9090909090909091, 'f1-score': 0.9090909090909091, 'support': 11}, '2': {'precision': 0.9642857142857143, 'recall': 0.9642857142857143, 'f1-score': 0.9642857142857143, 'support': 28}, 'accuracy': 0.9487179487179487, 'macro avg': {'precision': 0.9366883116883117, 'recall': 0.9366883116883117, 'f1-score': 0.9366883116883117, 'support': 39}, 'weighted avg': {'precision': 0.9487179487179487, 'recall': 0.9487179487179487, 'f1-score': 0.9487179487179487, 'support': 39}}}

Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9967532467532467, 'classification_report': {'1': {'precision': 0.7333333333333333, 'recall': 1.0, 'f1-score': 0.846153846153846, 'support': 11}, '2': {'precision': 1.0, 'recall'

Data pickle file already exists and is up to date.


Dataset: heart-statlog
Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.5s finished



Generating corrupted training data on 54 rows... 

	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'thal', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: 

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154213/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154213/
AutoGluon Version:  0.0.12
Train Data Rows:    172
Train Data Columns: 13
Preprocessing data ...
Feature Generator processed 172 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...


	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'number_of_major_vessels', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'thal', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'fasting_blood_sugar', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'exercise_induced_angina', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'chest', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'resting_blood_pressure', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues:

	-8.7894	 = Validation root_mean_squared_error score
	0.63s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: ExtraTreesRegressorMSE ...
	-8.768	 = Validation root_mean_squared_error score
	0.43s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: KNeighborsRegressorUnif ...
	-8.582	 = Validation root_mean_squared_error score
	0.02s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: KNeighborsRegressorDist ...
	-9.017	 = Validation root_mean_squared_error score
	0.01s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: LightGBMRegressor ...
	-7.8839	 = Validation root_mean_squared_error score
	0.25s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: CatboostRegressor ...
	-8.4829	 = Validation root_mean_squared_error score
	0.12s	 = Training runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetRegressor ...
	-8.2195	 = Validation root_mean_squared_error score
	1.42s	 = Training runtime
	0.01s	 = Validation runtime


Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'age': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1dc110f0>, 'sex': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e27ac8fd0>, 'chest': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c2895c0>, 'resting_blood_pressure': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c24a0b8>, 'serum_cholestoral': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c24a898>, 'fasting_blood_sugar': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e299d7c88>, 'resting_electrocardiographic_results': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c24c470>, 'maximum_heart_rate_achieved': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c289e80>, 'exercise_induced_angina': <autogluon.t

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154318/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154318/
AutoGluon Version:  0.0.12
Train Data Rows:    216
Train Data Columns: 13
Preprocessing data ...
Feature Generator processed 216 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-8.9114	 = Validation root_mean_squared_error score
	0.63s	 = Training runti

Cleaner: {'outlier_detection': PyODKNNOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.918956043956044, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}}
Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9162087912087913, 'classification_report': {'absent': {'precision': 0.8275862068965517, 'recall': 0.9230769230769231, 'f1-score': 0.8727272727272727, 'support': 26}, 'present': {'precision': 0.92, 'recall'

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154432/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154432/
AutoGluon Version:  0.0.12
Train Data Rows:    216
Train Data Columns: 13
Preprocessing data ...
Feature Generator processed 216 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-8.8129	 = Validation root_mean_squared_error score
	0.63s	 = Training runti

Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9189560439560439, 'classification_report': {'absent': {'precision': 0.8214285714285714, 'recall': 0.8846153846153846, 'f1-score': 0.8518518518518519, 'support': 26}, 'present': {'precision': 0.8846153846153846, 'recall': 0.8214285714285714, 'f1-score': 0.8518518518518519, 'support': 28}, 'accuracy': 0.8518518518518519, 'macro avg': {'precision': 0.853021978021978, 'recall': 0.853021978021978, 'f1-score': 0.8518518518518519, 'support': 54}, 'weighted avg': {'precision': 0.8541921041921041, 'recall': 0.8518518518518519, 'f1-score': 0.8518518518518519, 'support': 54}}}
Column age: Num NaNs: Before: 0, Now: 0
Column sex: Num NaNs: Before: 0, Now: 0
Column chest: Num NaNs: Before: 0, Now: 0
Column resting_blood_pressure: Num NaNs: Before: 0, Now: 2
Column serum_cholestoral: Num NaNs: Before: 0, Now: 0
Column fasting_blood_sugar: Num NaNs: Before: 0, Now: 11
Column rest

Evaluation: root_mean_squared_error on test data: 6.875529714326623
Evaluation: root_mean_squared_error on test data: 0.3909944070766402
Evaluation: root_mean_squared_error on test data: 0.9307673580074393
Evaluation: root_mean_squared_error on test data: 12.91146577768603
Evaluation: root_mean_squared_error on test data: 41.60527344839843
Evaluation: root_mean_squared_error on test data: 0.15146036676448116
Evaluation: root_mean_squared_error on test data: 0.9855364943949211
Evaluation: root_mean_squared_error on test data: 17.117386760602276
Evaluation: root_mean_squared_error on test data: 0.4377148874886628
Evaluation: root_mean_squared_error on test data: 0.999597006737082
Evaluation: root_mean_squared_error on test data: 0.5340294819052861
Evaluation: root_mean_squared_error on test data: 0.8336935661257499
Evaluation: root_mean_squared_error on test data: 1.8446964875399603


Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9340659340659341, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}}
Column age: Num NaNs: Before: 0, Now: 0
Column sex: Num NaNs: Before: 0, Now: 0
Column chest: Num NaNs: Before: 0, Now: 0
Column resting_blood_pressure: Num NaNs: Before: 0, Now: 2
Column serum_cholestoral: Num NaNs: Before: 0, Now: 0
Column fasting_blood_sugar: Num NaNs: Before: 0, Now: 11
Column resting_electrocardiograph

Evaluation: root_mean_squared_error on test data: 6.875529714326623
Evaluation: root_mean_squared_error on test data: 0.3909944070766402
Evaluation: root_mean_squared_error on test data: 0.9307673580074393
Evaluation: root_mean_squared_error on test data: 12.91146577768603
Evaluation: root_mean_squared_error on test data: 41.60527344839843
Evaluation: root_mean_squared_error on test data: 0.15146036676448116
Evaluation: root_mean_squared_error on test data: 0.9855364943949211
Evaluation: root_mean_squared_error on test data: 17.117386760602276
Evaluation: root_mean_squared_error on test data: 0.4377148874886628
Evaluation: root_mean_squared_error on test data: 0.999597006737082
Evaluation: root_mean_squared_error on test data: 0.5340294819052861
Evaluation: root_mean_squared_error on test data: 0.8336935661257499
Evaluation: root_mean_squared_error on test data: 1.8446964875399603



Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9340659340659341, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}} 

Cleaning didnt't improve the overall score 





Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Dataset: credit-g
Found 13 categorical and 7 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.3s finished



Generating corrupted training data on 200 rows... 

	perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'other_payment_plans', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'duration', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'housing', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'savings_status', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: Mi

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_154552/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_154552/
AutoGluon Version:  0.0.12
Train Data Rows:    640
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4


	perturbation: MissingValues: {'column': 'other_parties', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'savings_status', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'housing', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'checking_status', 'fraction': 0.15, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'foreign_worker', 'fraction': 0.15, 

Feature Generator processed 640 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.12s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini ...
	0.4219	 = Validation accuracy score
	0.85s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestClassifierEntr ...
	0.4375	 = Validation accuracy score
	1.05s	 = Training runtime
	0.22s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini ...
	0.3672	 = Validation accuracy sc

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e278db240>, 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c73d400>, 'purpose': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1cc41470>, 'savings_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c257898>, 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e278b89e8>, 'personal_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1b1aa128>, 'other_parties': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c269198>, 'property_magnitude': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6e1c747ef0>, 'other_payment_plans': <autogluon.task.tabular_prediction.predict

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_155252/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_155252/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 800 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.13s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

Cleaner: {'outlier_detection': PyODKNNOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.802127162225339, 'classification_report': {'bad': {'precision': 0.6538461538461539, 'recall': 0.5483870967741935, 'f1-score': 0.5964912280701755, 'support': 62}, 'good': {'precision': 0.8108108108108109, 'recall': 0.8695652173913043, 'f1-score': 0.8391608391608392, 'support': 138}, 'accuracy': 0.77, 'macro avg': {'precision': 0.7323284823284824, 'recall': 0.7089761570827489, 'f1-score': 0.7178260336155073, 'support': 200}, 'weighted avg': {'precision': 0.7621517671517672, 'recall': 0.77, 'f1-score': 0.7639332597227334, 'support': 200}}}
Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7640252454417954, 'classification_report': {'bad': {'precision': 0.5714285714285714, 'recall': 0.45161290322580644, 'f1-score': 0.5045045045045045, 'support': 62}, 'good': {'precision': 0.7748344370860927, 'recall': 0.8478

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200820_160055/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200820_160055/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 800 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.13s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.7616877045348293, 'classification_report': {'bad': {'precision': 0.5652173913043478, 'recall': 0.41935483870967744, 'f1-score': 0.4814814814814815, 'support': 62}, 'good': {'precision': 0.7662337662337663, 'recall': 0.855072463768116, 'f1-score': 0.8082191780821918, 'support': 138}, 'accuracy': 0.72, 'macro avg': {'precision': 0.665725578769057, 'recall': 0.6372136512388967, 'f1-score': 0.6448503297818367, 'support': 200}, 'weighted avg': {'precision': 0.7039186900056466, 'recall': 0.72, 'f1-score': 0.7069304921359716, 'support': 200}}}
Column credit_history: Num NaNs: Before: 0, Now: 31
Column personal_status: Num NaNs: Before: 0, Now: 2
Column other_parties: Num NaNs: Before: 0, Now: 15
Column property_magnitude: Num NaNs: Before: 0, Now: 1
Column other_payment_plans: Num NaNs: Before: 0, Now: 38
Column housing: Num NaNs: Before: 0, Now: 39
Column own_telephone:

Evaluation: accuracy on test data: 0.455
Evaluation: root_mean_squared_error on test data: 8.26470822869023
Evaluation: accuracy on test data: 0.69
Evaluation: accuracy on test data: 0.24
Evaluation: root_mean_squared_error on test data: 2374.5349678574485
Evaluation: accuracy on test data: 0.625
Evaluation: accuracy on test data: 0.505
Evaluation: root_mean_squared_error on test data: 0.9565251865323527
Evaluation: accuracy on test data: 0.5
Evaluation: accuracy on test data: 0.925
Evaluation: root_mean_squared_error on test data: 0.9163862628788637
Evaluation: accuracy on test data: 0.465
Evaluation: root_mean_squared_error on test data: 7.7945020257945385
Evaluation: accuracy on test data: 0.81
Evaluation: accuracy on test data: 0.8
Evaluation: root_mean_squared_error on test data: 0.329055102478832
Evaluation: accuracy on test data: 0.56
Evaluation: root_mean_squared_error on test data: 0.2836734110988546
Evaluation: accuracy on test data: 0.645
Evaluation: accuracy on test data: 0

Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.8142823749415615, 'classification_report': {'bad': {'precision': 0.673469387755102, 'recall': 0.532258064516129, 'f1-score': 0.5945945945945945, 'support': 62}, 'good': {'precision': 0.8079470198675497, 'recall': 0.8840579710144928, 'f1-score': 0.8442906574394464, 'support': 138}, 'accuracy': 0.775, 'macro avg': {'precision': 0.7407082038113258, 'recall': 0.7081580177653108, 'f1-score': 0.7194426260170205, 'support': 200}, 'weighted avg': {'precision': 0.7662589539126909, 'recall': 0.775, 'f1-score': 0.7668848779575425, 'support': 200}}}
Column credit_history: Num NaNs: Before: 0, Now: 31
Column personal_status: Num NaNs: Before: 0, Now: 2
Column other_parties: Num NaNs: Before: 0, Now: 15
Column property_magnitude: Num NaNs: Before: 0, Now: 1
Column other_payment_plans: Num NaNs: Before: 0, Now: 38
Column housing: Num NaNs: Before: 0, Now: 39
Column own_telephone: Num NaNs

Evaluation: accuracy on test data: 0.455
Evaluation: root_mean_squared_error on test data: 8.26470822869023
Evaluation: accuracy on test data: 0.69
Evaluation: accuracy on test data: 0.24
Evaluation: root_mean_squared_error on test data: 2374.5349678574485
Evaluation: accuracy on test data: 0.625
Evaluation: accuracy on test data: 0.505
Evaluation: root_mean_squared_error on test data: 0.9565251865323527
Evaluation: accuracy on test data: 0.5
Evaluation: accuracy on test data: 0.925
Evaluation: root_mean_squared_error on test data: 0.9163862628788637
Evaluation: accuracy on test data: 0.465
Evaluation: root_mean_squared_error on test data: 7.7945020257945385
Evaluation: accuracy on test data: 0.81
Evaluation: accuracy on test data: 0.8
Evaluation: root_mean_squared_error on test data: 0.329055102478832
Evaluation: accuracy on test data: 0.56
Evaluation: root_mean_squared_error on test data: 0.2836734110988546
Evaluation: accuracy on test data: 0.645
Evaluation: accuracy on test data: 0


Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.8142823749415615, 'classification_report': {'bad': {'precision': 0.673469387755102, 'recall': 0.532258064516129, 'f1-score': 0.5945945945945945, 'support': 62}, 'good': {'precision': 0.8079470198675497, 'recall': 0.8840579710144928, 'f1-score': 0.8442906574394464, 'support': 138}, 'accuracy': 0.775, 'macro avg': {'precision': 0.7407082038113258, 'recall': 0.7081580177653108, 'f1-score': 0.7194426260170205, 'support': 200}, 'weighted avg': {'precision': 0.7662589539126909, 'recall': 0.775, 'f1-score': 0.7668848779575425, 'support': 200}}} 

Cleaning improved the overall score 



CPU times: user 46min 8s, sys: 2min 26s, total: 48min 35s
Wall time: 37min 4s


In [17]:
# Inspect the collected experiment results by leaving the name as the cell's
# last expression, so the notebook renders its repr.
# The rendered value is a list of dicts; each has a 'cleaners' list whose entries
# record the 'Imputation method', the 'Outlier detection method' and the
# 'PPP score with cleaning' (classification report plus ROC AUC).
# NOTE(review): `ind_results` is assigned in an earlier cell not shown here;
# presumably it is returned by the experiment run above -- confirm.
ind_results

[{'cleaners': [{'Imputation method': MeanModeImputation,
    'Outlier detection method': NoOutlierDetection,
    'PPP score with cleaning': {'classification_report': {'1': {'f1-score': 0.846153846153846,
       'precision': 0.7333333333333333,
       'recall': 1.0,
       'support': 11},
      '2': {'f1-score': 0.923076923076923,
       'precision': 1.0,
       'recall': 0.8571428571428571,
       'support': 28},
      'accuracy': 0.8974358974358975,
      'macro avg': {'f1-score': 0.8846153846153846,
       'precision': 0.8666666666666667,
       'recall': 0.9285714285714286,
       'support': 39},
      'weighted avg': {'f1-score': 0.9013806706114397,
       'precision': 0.9247863247863247,
       'recall': 0.8974358974358975,
       'support': 39}},
     'roc_auc_acore': 0.9967532467532467}},
   {'Imputation method': MeanModeImputation,
    'Outlier detection method': PyODKNNOutlierDetection,
    'PPP score with cleaning': {'classification_report': {'1': {'f1-score': 0.7272727272727

In [21]:
# Sanity check: number of top-level entries collected in `ind_results`.
# NOTE(review): presumably one entry per dataset that was run -- confirm
# against the cell that builds `ind_results`.
len(ind_results)

3