In [1]:
# mount drive for access to the files
from google.colab import drive

drive.mount("/content/drive")

# all the drive the files are present in "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf


In [2]:
!pip install openml
!pip install pyod

!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [2]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, AutoGluonImputation
from jenga.cleaning.clean import Clean

In [3]:
seed = 10

In [11]:
def run_experiment(dataset_name, learner, param_grid, corruptions, fraction, cleaners, num_repetitions):
    
    ## dataset
    dataset = Dataset(seed, dataset_name)
    
    all_data = dataset.all_data
    attribute_names = dataset.attribute_names
    attribute_types = dataset.attribute_types
    
    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")
    
    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()
    
    
    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(seed, df_train, lab_train, df_test, lab_test, categorical_columns, numerical_columns, learner, param_grid)
    ppp_model = ppp.fit_ppp(df_train)
    
    ## generate corrpted data
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions, fraction, num_repetitions)
    
    ## cleaning
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)
    df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_corrupted)
    
    ## results
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': best_cleaning_score,
        'ppp_scores_cleaners': cleaner_scores_ppp
    }
#     print('\n'.join([f'{key}:{val}' for key, val in result.items()]))
    
    ## summary
    summary = {
        'dataset': dataset_name,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }
#     print('\n\n\n\n'.join([f'{key}:{val}' for key, val in summary.items()]))
    
    return summary #summary_col_corrupt, result

In [12]:
datasets = [
    # 'parkinsons',
    'heart-statlog',
    'credit-g'
]

In [13]:
## model parameters
## models is a dict where key = leaner & value = param_grid
models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }, 
          RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
                                    'learner__max_depth': [5, 10, 15]
                                   }
         }

## make dict of multiple leraners and corresponding param_grids

In [14]:
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]

In [15]:
fractions = np.random.uniform(0, 1, 1) # np.random.uniform(0, 1, 3)

In [16]:
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, AutoGluonImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODKNN, AutoGluonImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation),
    (PyODIsolationForest, AutoGluonImputation)
]

In [17]:
%%time
for _ in range(2):
  print("\n\n..................................ITERATION..................................\n")
  ind_results = []

  for dataset in datasets:
      for learner, param_grid in models.items():
          for fraction in fractions:
              ind_results.append(run_experiment(dataset, learner, param_grid, [MissingValues], fraction, [(PyODKNN, AutoGluonImputation)], 5))



..................................ITERATION..................................



Data pickle file already exists and is up to date.


Dataset: heart-statlog
Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.8s finished



Generating corrupted training data on 54 rows... 

	perturbation: MissingValues: {'column': 'age', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9368131868131868, 'classification_report': {'absent': {'precision': 0.78125, 'recall': 0.9615384615384616, 'f1-score': 0.8620689655172413, 'support': 26}, 'present': {'precision': 0.9545454545454546, 'recall': 0.75, 'f1-s

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200724_114423/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200724_114423/
AutoGluon Version:  0.0.12
Train Data Rows:    216
Train Data Columns: 13
Preprocessing data ...
NumExpr defaulting to 2 threads.
Feature Generator processed 216 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-9.0528	 = Validation root_mean_squared_err

Cleaner: {'outlier_detection': PyODKNN, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9326923076923077, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}}


No output_directory specified. Models will be saved in: AutogluonModels/ag-20200724_114530/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200724_114530/
AutoGluon Version:  0.0.12
Train Data Rows:    216
Train Data Columns: 13
Preprocessing data ...
Feature Generator processed 216 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-8.9114	 = Validation root_mean_squared_error score
	0.76s	 = Training runti


Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': PyODKNN, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9326923076923077, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}} 

Cleaning didnt't improve the overall score 





Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Dataset: heart-statlog
Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.8s finished



Generating corrupted training data on 54 rows... 

	perturbation: MissingValues: {'column': 'age', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'age', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9313186813186812, 'classification_report': {'absent': {'precision': 0.8333333333333334, 'recall': 0.9615384615384616, 'f1-score': 0.8928571428571429, 'support': 26}, 'present': {'precision': 0.9583333333333334, 'recall': 

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200724_114656/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200724_114656/
AutoGluon Version:  0.0.12
Train Data Rows:    216
Train Data Columns: 13
Preprocessing data ...
Feature Generator processed 216 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.08s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-8.9885	 = Validation root_mean_squared_error score
	0.76s	 = Training runti

Cleaner: {'outlier_detection': PyODKNN, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9381868131868132, 'classification_report': {'absent': {'precision': 0.8275862068965517, 'recall': 0.9230769230769231, 'f1-score': 0.8727272727272727, 'support': 26}, 'present': {'precision': 0.92, 'recall': 0.8214285714285714, 'f1-score': 0.8679245283018867, 'support': 28}, 'accuracy': 0.8703703703703703, 'macro avg': {'precision': 0.8737931034482759, 'recall': 0.8722527472527473, 'f1-score': 0.8703259005145797, 'support': 54}, 'weighted avg': {'precision': 0.8755044699872286, 'recall': 0.8703703703703703, 'f1-score': 0.8702369608029985, 'support': 54}}}


No output_directory specified. Models will be saved in: AutogluonModels/ag-20200724_114804/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200724_114804/
AutoGluon Version:  0.0.12
Train Data Rows:    216
Train Data Columns: 13
Preprocessing data ...
Feature Generator processed 216 data points with 12 features
Original Features (raw dtypes):
	float64 features: 12
Original Features (inferred dtypes):
	float features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 12
Final Features:
	float features: 12
	Data preprocessing and feature engineering runtime = 0.07s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorMSE ...
	-8.9114	 = Validation root_mean_squared_error score
	0.76s	 = Training runti


Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': PyODKNN, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.9381868131868132, 'classification_report': {'absent': {'precision': 0.8275862068965517, 'recall': 0.9230769230769231, 'f1-score': 0.8727272727272727, 'support': 26}, 'present': {'precision': 0.92, 'recall': 0.8214285714285714, 'f1-score': 0.8679245283018867, 'support': 28}, 'accuracy': 0.8703703703703703, 'macro avg': {'precision': 0.8737931034482759, 'recall': 0.8722527472527473, 'f1-score': 0.8703259005145797, 'support': 54}, 'weighted avg': {'precision': 0.8755044699872286, 'recall': 0.8703703703703703, 'f1-score': 0.8702369608029985, 'support': 54}}} 

Cleaning improved the overall score 





Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Dataset: credit-g
Found 13 categorical and 7 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.0s finished



Generating corrupted training data on 200 rows... 

	perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'other_payment_plans', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: MissingValues: {'column': 'duration', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.3532754431740348, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.8063347358578775, 'classification_report': {'bad': {'precision': 0.6415094339622641, 'recall': 0.5483870967741935, 'f1-score': 0.591304347826087, 'support': 62}, 'good': {'precision': 0.8095238095238095, 'recall': 0.862318840

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200724_114921/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200724_114921/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 800 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.15s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

KeyboardInterrupt: ignored

In [18]:
ind_results

[{'cleaners': [{'Imputation method': AutoGluonImputation,
    'Outlier detection method': PyODKNN,
    'PPP score with cleaning': {'classification_report': {'absent': {'f1-score': 0.9090909090909091,
       'precision': 0.8620689655172413,
       'recall': 0.9615384615384616,
       'support': 26},
      'accuracy': 0.9074074074074074,
      'macro avg': {'f1-score': 0.9073756432246998,
       'precision': 0.9110344827586206,
       'recall': 0.9093406593406593,
       'support': 54},
      'present': {'f1-score': 0.9056603773584904,
       'precision': 0.96,
       'recall': 0.8571428571428571,
       'support': 28},
      'weighted avg': {'f1-score': 0.9073121148592846,
       'precision': 0.9128480204342273,
       'recall': 0.9074074074074074,
       'support': 54}},
     'roc_auc_acore': 0.9326923076923077}}],
  'corruptions': defaultdict(list,
              {('serum_cholestoral',): [MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.3532754431740348, 'na_value': nan, 'm

In [None]:
%%time
for _ in range(1):
  print("\n\n..................................ITERATION..................................\n")
  results = []

  for dataset in datasets:
      for learner, param_grid in models.items():
          for fraction in fractions:
              results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 5))



..................................ITERATION..................................

Dataset: heart-statlog
Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.2s finished



Generating corrupted training data on 54 rows... 

Can't apply the SwappedValues corruption because there are no categorical columns. 


	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercis

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.4s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercise_induced_angina', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.4s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   19.9s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.9s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercise_induced_angina', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.4s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercise_induced_angina', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.5s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   20.5s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

In [None]:
results

[{'cleaners': [{'Imputation method': MeanModeImputation,
    'Outlier detection method': NoOutlierDetection,
    'PPP score with cleaning': {'classification_report': {'absent': {'f1-score': 0.8620689655172413,
       'precision': 0.78125,
       'recall': 0.9615384615384616,
       'support': 26},
      'accuracy': 0.8518518518518519,
      'macro avg': {'f1-score': 0.8510344827586207,
       'precision': 0.8678977272727273,
       'recall': 0.8557692307692308,
       'support': 54},
      'present': {'f1-score': 0.84,
       'precision': 0.9545454545454546,
       'recall': 0.75,
       'support': 28},
      'weighted avg': {'f1-score': 0.850625798212005,
       'precision': 0.8711069023569024,
       'recall': 0.8518518518518519,
       'support': 54}},
     'roc_auc_acore': 0.8846153846153846}},
   {'Imputation method': NoImputation,
    'Outlier detection method': PyODKNN,
    'PPP score with cleaning': {'classification_report': {'absent': {'f1-score': 0.8571428571428571,
       'p