In [1]:
# mount drive for access to the files
from google.colab import drive

drive.mount("/content/drive")

# all the drive the files are present in "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf


In [2]:
!pip install openml
!pip install pyod



In [3]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation
from jenga.cleaning.clean import Clean

In [4]:
seed = 10

In [5]:
def run_experiment(dataset_name, learner, param_grid, corruptions, fraction, cleaners, num_repetitions):
    
    ## dataset
    dataset = Dataset(seed, dataset_name)
    
    all_data = dataset.all_data
    attribute_names = dataset.attribute_names
    attribute_types = dataset.attribute_types
    
    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")
    
    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()
    
    
    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(seed, df_train, lab_train, df_test, lab_test, categorical_columns, numerical_columns, learner, param_grid)
    ppp_model = ppp.fit_ppp(df_train)
    
    ## generate corrpted data
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions, fraction, num_repetitions)
    
    ## cleaning
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)
    df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_corrupted)
    
    ## results
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': best_cleaning_score,
        'ppp_scores_cleaners': cleaner_scores_ppp
    }
#     print('\n'.join([f'{key}:{val}' for key, val in result.items()]))
    
    ## summary
    summary = {
        'dataset': dataset_name,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }
#     print('\n\n\n\n'.join([f'{key}:{val}' for key, val in summary.items()]))
    
    return summary #summary_col_corrupt, result

In [6]:
datasets = [
    # 'parkinsons',
    'heart-statlog',
    'credit-g'
]

In [7]:
## model parameters
## models is a dict where key = leaner & value = param_grid
models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }, 
          RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
                                    'learner__max_depth': [5, 10, 15]
                                   }
         }

## make dict of multiple leraners and corresponding param_grids

In [8]:
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]

In [9]:
fractions = np.random.uniform(0, 1, 1) # np.random.uniform(0, 1, 3)

In [10]:
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation)
]

In [11]:
%%time
for _ in range(2):
  print("\n\n..................................ITERATION..................................\n")
  ind_results = []

  for dataset in datasets:
      for learner, param_grid in models.items():
          for fraction in fractions:
              ind_results.append(run_experiment(dataset, learner, param_grid, [MissingValues], fraction, [(PyODKNN, MeanModeImputation)], 5))



..................................ITERATION..................................

Dataset: heart-statlog
Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.5s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9024725274725275, 'classification_report': {'absent': {'precision': 0.7878787878787878, 'recall': 1.0, 'f1-score': 0.8813559322033898, 'support': 26}, 'present': {'precision': 1.0, 'recall': 0.75, 'f1-

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.3s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9381868131868132, 'classification_report': {'absent': {'precision': 0.8125, 'recall': 1.0, 'f1-score': 0.896551724137931, 'support': 26}, 'present': {'precision': 1.0, 'recall': 0.7857142857142857, 'f1

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.5s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'other_payment_plans', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'duration', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.808555399719495, 'classification_report': {'bad': {'precision': 0.6470588235294118, 'recall': 0.532258064516129, 'f1-score': 0.5840707964601771, 'support': 62}, 'good': {'precision': 0.8053691275167785, 'r

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   20.2s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'other_payment_plans', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'duration', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7847124824684432, 'classification_report': {'bad': {'precision': 0.7333333333333333, 'recall': 0.3548387096774194, 'f1-score': 0.47826086956521735, 'support': 62}, 'good': {'precision': 0.7647058823529411,

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.1s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9024725274725275, 'classification_report': {'absent': {'precision': 0.7878787878787878, 'recall': 1.0, 'f1-score': 0.8813559322033898, 'support': 26}, 'present': {'precision': 1.0, 'recall': 0.75, 'f1-

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.6s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9381868131868132, 'classification_report': {'absent': {'precision': 0.8125, 'recall': 1.0, 'f1-score': 0.896551724137931, 'support': 26}, 'present': {'precision': 1.0, 'recall': 0.7857142857142857, 'f1

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.3s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'other_payment_plans', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'duration', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.808555399719495, 'classification_report': {'bad': {'precision': 0.6470588235294118, 'recall': 0.532258064516129, 'f1-score': 0.5840707964601771, 'support': 62}, 'good': {'precision': 0.8053691275167785, 'r

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   19.9s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'other_payment_plans', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}
	... perturbation: MissingValues: {'column': 'duration', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: MissingValues: {'column': 'credit_history', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7847124824684432, 'classification_report': {'bad': {'precision': 0.7333333333333333, 'recall': 0.3548387096774194, 'f1-score': 0.47826086956521735, 'support': 62}, 'good': {'precision': 0.7647058823529411,

In [12]:
ind_results

[{'cleaners': [{'Imputation method': MeanModeImputation,
    'Outlier detection method': PyODKNN,
    'PPP score with cleaning': {'classification_report': {'absent': {'f1-score': 0.9090909090909091,
       'precision': 0.8620689655172413,
       'recall': 0.9615384615384616,
       'support': 26},
      'accuracy': 0.9074074074074074,
      'macro avg': {'f1-score': 0.9073756432246998,
       'precision': 0.9110344827586206,
       'recall': 0.9093406593406593,
       'support': 54},
      'present': {'f1-score': 0.9056603773584904,
       'precision': 0.96,
       'recall': 0.8571428571428571,
       'support': 28},
      'weighted avg': {'f1-score': 0.9073121148592846,
       'precision': 0.9128480204342273,
       'recall': 0.9074074074074074,
       'support': 54}},
     'roc_auc_acore': 0.9217032967032968}}],
  'corruptions': defaultdict(list,
              {('serum_cholestoral',): [MissingValues: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141, 'na_value': nan, 'mi

In [13]:
%%time
for _ in range(2):
  print("\n\n..................................ITERATION..................................\n")
  results = []

  for dataset in datasets:
      for learner, param_grid in models.items():
          for fraction in fractions:
              results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 5))



..................................ITERATION..................................

Dataset: heart-statlog
Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.2s finished



Generating corrupted training data on 54 rows... 

Can't apply the SwappedValues corruption because there are no categorical columns. 


	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercis

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.4s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercise_induced_angina', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.4s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   19.9s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.9s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercise_induced_angina', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.4s finished



Generating corrupted training data on 54 rows... 

	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'serum_cholestoral', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'age', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'exercise_induced_angina', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.5s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   20.5s finished



Generating corrupted training data on 200 rows... 

	... perturbation: MissingValues: {'column': 'credit_amount', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MCAR'}
	... perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'residence_since', 'fraction': 0.4148793458140141}
	... perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.4148793458140141, 'na_value': nan, 'missingness': 'MNAR'}
	... perturbation: Scaling: {'column': 'duration', 'fraction': 0.4148793458140141}
	... perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.4148793458140141}

In [14]:
results

[{'cleaners': [{'Imputation method': MeanModeImputation,
    'Outlier detection method': NoOutlierDetection,
    'PPP score with cleaning': {'classification_report': {'absent': {'f1-score': 0.8620689655172413,
       'precision': 0.78125,
       'recall': 0.9615384615384616,
       'support': 26},
      'accuracy': 0.8518518518518519,
      'macro avg': {'f1-score': 0.8510344827586207,
       'precision': 0.8678977272727273,
       'recall': 0.8557692307692308,
       'support': 54},
      'present': {'f1-score': 0.84,
       'precision': 0.9545454545454546,
       'recall': 0.75,
       'support': 28},
      'weighted avg': {'f1-score': 0.850625798212005,
       'precision': 0.8711069023569024,
       'recall': 0.8518518518518519,
       'support': 54}},
     'roc_auc_acore': 0.8846153846153846}},
   {'Imputation method': NoImputation,
    'Outlier detection method': PyODKNN,
    'PPP score with cleaning': {'classification_report': {'absent': {'f1-score': 0.8571428571428571,
       'p