In [1]:
# Mount Google Drive so the notebook can read the files stored there.
# This cell relies on the google.colab package.
from google.colab import drive

drive.mount("/content/drive")

# After mounting, the Drive files are available under "/content/drive/My Drive";
# list the contents of the thesis folder.
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

import sys

# Make the jenga checkout stored on Drive importable.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
JENGA_PATH = '/content/drive/My Drive/Beuth Uni/Master Thesis/jenga'
if JENGA_PATH not in sys.path:
    sys.path.append(JENGA_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf


In [2]:
# Install the third-party packages this notebook needs in the Colab runtime.
# presumably required by jenga (dataset loading, PyOD outlier detectors,
# Datawig imputation) -- verify against jenga's requirements.
# NOTE(review): versions are not pinned, so a re-run may install different
# releases than the ones that produced the recorded outputs.
!pip install openml
!pip install pyod
!pip install datawig ##



In [2]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [3]:
# Random seed shared by all experiments: run_experiment reads this module-level
# name and passes it to Dataset and PipelinePerformancePrediction.
seed = 10

In [4]:
def run_experiment(dataset_name, learner, param_grid, corruptions, fraction, cleaners, random_seed=None):
    """Run one corrupt-then-clean experiment and return its summary.

    The function loads the dataset, fits the pipeline performance predictor
    (PPP) on the training split, corrupts the test split, applies the given
    cleaners to the corrupted data and collects the resulting PPP scores.

    Parameters
    ----------
    dataset_name
        Name handed to ``Dataset``.
    learner
        Estimator handed to ``PipelinePerformancePrediction``.
    param_grid
        Hyper-parameter grid handed to ``PipelinePerformancePrediction``
        together with ``learner``.
    corruptions
        Corruptions forwarded to ``ppp.get_corrupted``.
    fraction
        Fraction forwarded to ``ppp.get_corrupted``.
    cleaners
        Cleaners forwarded to ``Clean``.
    random_seed
        Seed handed to ``Dataset`` and ``PipelinePerformancePrediction``.
        When ``None`` (the default) the module-level ``seed`` is used, which
        is what the function always did before this parameter existed.

    Returns
    -------
    dict
        Keys: ``'dataset'``, ``'model'``, ``'corruptions'``, ``'cleaners'``
        and ``'result'``. ``'result'`` is itself a dict with the keys
        ``'ppp_score_model'``, ``'ppp_score_corrupted'``,
        ``'ppp_score_cleaned'`` and ``'ppp_scores_cleaners'``.
    """
    ## fall back to the notebook-wide seed when no explicit seed is given
    if random_seed is None:
        random_seed = seed

    ## dataset
    dataset = Dataset(random_seed, dataset_name)

    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()

    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(
        random_seed, df_train, lab_train, df_test, lab_test,
        categorical_columns, numerical_columns, learner, param_grid
    )
    ppp_model = ppp.fit_ppp(df_train)

    ## generate corrupted data; only the corrupted frame and the per-column
    ## corruption summary are needed here
    df_corrupted, _perturbations, _cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions, fraction)

    ## cleaning; the cleaned frame itself is not part of the summary
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)
    _df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_corrupted)

    ## results: PPP score on the untouched test data next to the scores
    ## obtained for the corrupted and the cleaned data
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': best_cleaning_score,
        'ppp_scores_cleaners': cleaner_scores_ppp
    }

    ## summary
    return {
        'dataset': dataset_name,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }

In [5]:
# Dataset names to run; each one is passed to run_experiment as `dataset_name`.
datasets = ['parkinsons', 'heart-statlog', 'credit-g']

In [6]:
## Learners and their hyper-parameter search grids.
## `models` maps each learner instance (key) to its param_grid (value), so that
## several learners can be run with their corresponding grids.
models = dict([
    (
        SGDClassifier(loss='log'),
        {
            'learner__max_iter': [500, 1000, 5000],
            'learner__penalty': ['l2', 'l1', 'elasticnet'],
            'learner__alpha': [0.0001, 0.001, 0.01, 0.1],
        },
    ),
    (
        RandomForestClassifier(),
        {
            'learner__n_estimators': [100, 200, 500],
            'learner__max_depth': [5, 10, 15],
        },
    ),
])

In [7]:
# Corruption classes used for the full run; the list is passed to
# run_experiment, which forwards it to ppp.get_corrupted.
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]

In [8]:
# Values passed one at a time as `fraction` to run_experiment, which forwards
# them to ppp.get_corrupted.
fractions = [0.25, 0.5, 0.75]

In [9]:
# Candidate cleaners as (outlier detection, imputation) pairs: every
# combination of the detectors and imputers below, except the
# (NoOutlierDetection, NoImputation) pair.
cleaners = [
    (outlier_detection, imputation)
    for outlier_detection in (NoOutlierDetection, PyODKNN, PyODIsolationForest)
    for imputation in (NoImputation, MeanModeImputation)
    if not (outlier_detection is NoOutlierDetection and imputation is NoImputation)
]

In [10]:
# Reduced run: a single corruption type (MissingValues) and a single cleaner
# (PyODKNN + MeanModeImputation) for every dataset / learner / fraction.
ind_results = []

for dataset in datasets:
    for learner, param_grid in models.items():
        for fraction in fractions:
            ind_results.append(
                run_experiment(
                    dataset_name=dataset,
                    learner=learner,
                    param_grid=param_grid,
                    corruptions=[MissingValues],
                    fraction=fraction,
                    cleaners=[(PyODKNN, MeanModeImputation)],
                )
            )

Dataset: parkinsons
Found 0 categorical and 22 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.3s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.8181818181818181, 'classification_report': {'1': {'precision': 0.6666666666666666, 'recall': 0.5454545454545454, 'f1-score': 0.6, 'support': 11}, '2': {'precision': 0.8333333333333334, 'recall': 0.8928571428571429, 'f1-score': 0.8620689655172413, 'support': 28}, 'accuracy': 0.7948717948717948, 'macro avg': {'precision': 0.75, 'recall': 0.7191558441558441, 'f1-score': 0.7310344827586206, 'support': 39}, 'weighted avg': {'precision': 0.7863247863247863, 'recall': 0.7948717948717948, 'f1-score': 0.7881520778072502, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.724025974025974, 'classification_report': {'1': {'precision': 0.5, 'recall': 0.6363636363636364, 'f1-score': 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9902597402597402, 'classification_report': {'1': {'precision': 0.8333333333333334, 'recall': 0.9090909090909091, 'f1-score': 0.8695652173913043, 'support': 11}, '2': {'precision': 0.9629629629629629, 'recall': 0.9285714285714286, 'f1-score': 0.9454545454545454, 'support': 28}, 'accuracy': 0.9230769230769231, 'macro avg': {'precision': 0.8981481481481481, 'recall': 0.9188311688311688, 'f1-score': 0.9075098814229249, 'support': 39}, 'weighted avg': {'precision': 0.9264007597340932, 'recall': 0.9230769230769231, 'f1-score': 0.9240498631802979, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9123376623376623, 'classification_report': {'1': {'precision': 0.7272727272727273

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9285714285714285, 'classification_report': {'1': {'precision': 0.8571428571428571, 'recall': 0.5454545454545454, 'f1-score': 0.6666666666666665, 'support': 11}, '2': {'precision': 0.84375, 'recall': 0.9642857142857143, 'f1-score': 0.8999999999999999, 'support': 28}, 'accuracy': 0.8461538461538461, 'macro avg': {'precision': 0.8504464285714286, 'recall': 0.7548701298701299, 'f1-score': 0.7833333333333332, 'support': 39}, 'weighted avg': {'precision': 0.8475274725274726, 'recall': 0.8461538461538461, 'f1-score': 0.8341880341880341, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.8896103896103896, 'classification_report': {'1': {'precision': 0.7, 'recall': 0.63636363636

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9935064935064934, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.8181818181818182, 'f1-score': 0.9, 'support': 11}, '2': {'precision': 0.9333333333333333, 'recall': 1.0, 'f1-score': 0.9655172413793104, 'support': 28}, 'accuracy': 0.9487179487179487, 'macro avg': {'precision': 0.9666666666666667, 'recall': 0.9090909090909092, 'f1-score': 0.9327586206896552, 'support': 39}, 'weighted avg': {'precision': 0.9521367521367521, 'recall': 0.9487179487179487, 'f1-score': 0.9470380194518125, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9967532467532467, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.8181818181818182, 'f1-score': 0.9, 'support':

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.1s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9967532467532467, 'classification_report': {'1': {'precision': 0.9090909090909091, 'recall': 0.9090909090909091, 'f1-score': 0.9090909090909091, 'support': 11}, '2': {'precision': 0.9642857142857143, 'recall': 0.9642857142857143, 'f1-score': 0.9642857142857143, 'support': 28}, 'accuracy': 0.9487179487179487, 'macro avg': {'precision': 0.9366883116883117, 'recall': 0.9366883116883117, 'f1-score': 0.9366883116883117, 'support': 39}, 'weighted avg': {'precision': 0.9487179487179487, 'recall': 0.9487179487179487, 'f1-score': 0.9487179487179487, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9967532467532467, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.7

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.8s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.987012987012987, 'classification_report': {'1': {'precision': 0.9090909090909091, 'recall': 0.9090909090909091, 'f1-score': 0.9090909090909091, 'support': 11}, '2': {'precision': 0.9642857142857143, 'recall': 0.9642857142857143, 'f1-score': 0.9642857142857143, 'support': 28}, 'accuracy': 0.9487179487179487, 'macro avg': {'precision': 0.9366883116883117, 'recall': 0.9366883116883117, 'f1-score': 0.9366883116883117, 'support': 39}, 'weighted avg': {'precision': 0.9487179487179487, 'recall': 0.9487179487179487, 'f1-score': 0.9487179487179487, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 1.0, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.7272727272727273

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9326923076923077, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9162087912087913, 'classification_report': {'absent': {'precision': 0.862068965517

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9326923076923077, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9162087912087913, 'classification_report': {'absent': {'precision': 0.8620689655172

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9326923076923077, 'classification_report': {'absent': {'precision': 0.8620689655172413, 'recall': 0.9615384615384616, 'f1-score': 0.9090909090909091, 'support': 26}, 'present': {'precision': 0.96, 'recall': 0.8571428571428571, 'f1-score': 0.9056603773584904, 'support': 28}, 'accuracy': 0.9074074074074074, 'macro avg': {'precision': 0.9110344827586206, 'recall': 0.9093406593406593, 'f1-score': 0.9073756432246998, 'support': 54}, 'weighted avg': {'precision': 0.9128480204342273, 'recall': 0.9074074074074074, 'f1-score': 0.9073121148592846, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9162087912087913, 'classification_report': {'absent': {'precision': 0.862068965517

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9340659340659341, 'classification_report': {'absent': {'precision': 0.8214285714285714, 'recall': 0.8846153846153846, 'f1-score': 0.8518518518518519, 'support': 26}, 'present': {'precision': 0.8846153846153846, 'recall': 0.8214285714285714, 'f1-score': 0.8518518518518519, 'support': 28}, 'accuracy': 0.8518518518518519, 'macro avg': {'precision': 0.853021978021978, 'recall': 0.853021978021978, 'f1-score': 0.8518518518518519, 'support': 54}, 'weighted avg': {'precision': 0.8541921041921041, 'recall': 0.8518518518518519, 'f1-score': 0.8518518518518519, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.907967032967033, 'classification_report': {'absent': {'precision': 0.8

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.6s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9203296703296704, 'classification_report': {'absent': {'precision': 0.8275862068965517, 'recall': 0.9230769230769231, 'f1-score': 0.8727272727272727, 'support': 26}, 'present': {'precision': 0.92, 'recall': 0.8214285714285714, 'f1-score': 0.8679245283018867, 'support': 28}, 'accuracy': 0.8703703703703703, 'macro avg': {'precision': 0.8737931034482759, 'recall': 0.8722527472527473, 'f1-score': 0.8703259005145797, 'support': 54}, 'weighted avg': {'precision': 0.8755044699872286, 'recall': 0.8703703703703703, 'f1-score': 0.8702369608029985, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.8914835164835165, 'classification_report': {'absent': {'precision': 0.7741935483870

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.1s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9271978021978021, 'classification_report': {'absent': {'precision': 0.8214285714285714, 'recall': 0.8846153846153846, 'f1-score': 0.8518518518518519, 'support': 26}, 'present': {'precision': 0.8846153846153846, 'recall': 0.8214285714285714, 'f1-score': 0.8518518518518519, 'support': 28}, 'accuracy': 0.8518518518518519, 'macro avg': {'precision': 0.853021978021978, 'recall': 0.853021978021978, 'f1-score': 0.8518518518518519, 'support': 54}, 'weighted avg': {'precision': 0.8541921041921041, 'recall': 0.8518518518518519, 'f1-score': 0.8518518518518519, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9148351648351648, 'classification_report': {'absent': {'precision': 0.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    6.9s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.80890603085554, 'classification_report': {'bad': {'precision': 0.6415094339622641, 'recall': 0.5483870967741935, 'f1-score': 0.591304347826087, 'support': 62}, 'good': {'precision': 0.8095238095238095, 'recall': 0.8623188405797102, 'f1-score': 0.8350877192982457, 'support': 138}, 'accuracy': 0.765, 'macro avg': {'precision': 0.7255166217430369, 'recall': 0.7053529686769519, 'f1-score': 0.7131960335621663, 'support': 200}, 'weighted avg': {'precision': 0.7574393530997305, 'recall': 0.765, 'f1-score': 0.7595148741418766, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.8011921458625526, 'classification_report': {'bad': {'precision': 0.660377358490566, 'reca

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.0s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.8103085553997196, 'classification_report': {'bad': {'precision': 0.6470588235294118, 'recall': 0.532258064516129, 'f1-score': 0.5840707964601771, 'support': 62}, 'good': {'precision': 0.8053691275167785, 'recall': 0.8695652173913043, 'f1-score': 0.8362369337979093, 'support': 138}, 'accuracy': 0.765, 'macro avg': {'precision': 0.7262139755230952, 'recall': 0.7009116409537166, 'f1-score': 0.7101538651290432, 'support': 200}, 'weighted avg': {'precision': 0.7562929332806948, 'recall': 0.765, 'f1-score': 0.7580654312232123, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7999064983637215, 'classification_report': {'bad': {'precision': 0.6481481481481481, 're

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    6.9s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.8143992519869098, 'classification_report': {'bad': {'precision': 0.6511627906976745, 'recall': 0.45161290322580644, 'f1-score': 0.5333333333333333, 'support': 62}, 'good': {'precision': 0.7834394904458599, 'recall': 0.8913043478260869, 'f1-score': 0.8338983050847457, 'support': 138}, 'accuracy': 0.755, 'macro avg': {'precision': 0.7173011405717672, 'recall': 0.6714586255259467, 'f1-score': 0.6836158192090396, 'support': 200}, 'weighted avg': {'precision': 0.7424337135239224, 'recall': 0.755, 'f1-score': 0.7407231638418079, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7962833099579243, 'classification_report': {'bad': {'precision': 0.6274509803921569, 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.1s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7732585320243104, 'classification_report': {'bad': {'precision': 0.6842105263157895, 'recall': 0.41935483870967744, 'f1-score': 0.5200000000000001, 'support': 62}, 'good': {'precision': 0.7777777777777778, 'recall': 0.9130434782608695, 'f1-score': 0.84, 'support': 138}, 'accuracy': 0.76, 'macro avg': {'precision': 0.7309941520467836, 'recall': 0.6661991584852734, 'f1-score': 0.68, 'support': 200}, 'weighted avg': {'precision': 0.7487719298245614, 'recall': 0.76, 'f1-score': 0.7408000000000001, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7765310892940627, 'classification_report': {'bad': {'precision': 0.696969696969697, 'recall': 0.3709677419354839, '

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.0s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.793419822346891, 'classification_report': {'bad': {'precision': 0.6944444444444444, 'recall': 0.4032258064516129, 'f1-score': 0.5102040816326531, 'support': 62}, 'good': {'precision': 0.774390243902439, 'recall': 0.9202898550724637, 'f1-score': 0.8410596026490066, 'support': 138}, 'accuracy': 0.76, 'macro avg': {'precision': 0.7344173441734417, 'recall': 0.6617578307620383, 'f1-score': 0.6756318421408298, 'support': 200}, 'weighted avg': {'precision': 0.7496070460704607, 'recall': 0.76, 'f1-score': 0.738494391133937, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7736091631603553, 'classification_report': {'bad': {'precision': 0.7, 'recall': 0.3387096774

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.1s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7826086956521741, 'classification_report': {'bad': {'precision': 0.7, 'recall': 0.45161290322580644, 'f1-score': 0.5490196078431372, 'support': 62}, 'good': {'precision': 0.7875, 'recall': 0.9130434782608695, 'f1-score': 0.8456375838926175, 'support': 138}, 'accuracy': 0.77, 'macro avg': {'precision': 0.7437499999999999, 'recall': 0.682328190743338, 'f1-score': 0.6973285958678773, 'support': 200}, 'weighted avg': {'precision': 0.7603749999999999, 'recall': 0.77, 'f1-score': 0.7536860113172785, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': PyODKNN, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7731416549789621, 'classification_report': {'bad': {'precision': 0.696969696969697, 'recall': 0.3709677419354839, '

In [11]:
# Display the stored summary dict of the second experiment (index 1).
ind_results[1]

{'cleaners': [{'Imputation method': MeanModeImputation,
   'Outlier detection method': PyODKNN,
   'PPP score with cleaning': {'classification_report': {'1': {'f1-score': 0.7272727272727273,
      'precision': 0.7272727272727273,
      'recall': 0.7272727272727273,
      'support': 11},
     '2': {'f1-score': 0.8928571428571429,
      'precision': 0.8928571428571429,
      'recall': 0.8928571428571429,
      'support': 28},
     'accuracy': 0.8461538461538461,
     'macro avg': {'f1-score': 0.8100649350649352,
      'precision': 0.8100649350649352,
      'recall': 0.8100649350649352,
      'support': 39},
     'weighted avg': {'f1-score': 0.8461538461538461,
      'precision': 0.8461538461538461,
      'recall': 0.8461538461538461,
      'support': 39}},
    'roc_auc_acore': 0.9123376623376623}}],
 'corruptions': defaultdict(list,
             {('V2',): [MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}]}),
 'dataset': 'parkinsons',
 'model': SGDC

In [12]:
# Full run: all corruption types and all candidate cleaners for every
# dataset / learner / fraction.
results = []

for dataset in datasets:
    for learner, param_grid in models.items():
        for fraction in fractions:
            results.append(
                run_experiment(
                    dataset_name=dataset,
                    learner=learner,
                    param_grid=param_grid,
                    corruptions=corruptions,
                    fraction=fraction,
                    cleaners=cleaners,
                )
            )

Dataset: parkinsons
Found 0 categorical and 22 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.9s finished
2020-07-22 13:43:28,390 [INFO]  NumExpr defaulting to 2 threads.


Generating corrupted training data on 39 rows... 

Can't apply the SwappedValues corruption because there are no categorical columns. 


Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'V14', 'fraction': 0.25}
GaussianNoise: {'column': 'V16', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7597402597402597, 'classification_report': {'1': {'precision': 0.4782608695652174, 'recall': 1.0, 'f1-score': 0.6470588235294118, 'support': 11}, '2': {'precision': 1.0, 'recall': 0.5714285714285714, 'f1-score': 0.7272727272727273, 'support': 28}, 'accuracy': 0.6923076923076923, 'macro avg': {'precision': 0.7391304347826086, 'recall': 0.7857142857142857, 'f1-score': 0.6871657754010696, 'support': 39}, 'weighted avg': {'precision': 0.8528428093645485, 'recall': 0.6923076923076923, 'f1-score': 0.704648292883587, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detec

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'V14', 'fraction': 0.5}
GaussianNoise: {'column': 'V16', 'fraction': 0.5}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9805194805194806, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.6363636363636364, 'f1-score': 0.7777777777777778, 'support': 11}, '2': {'precision': 0.875, 'recall': 1.0, 'f1-score': 0.9333333333333333, 'support': 28}, 'accuracy': 0.8974358974358975, 'macro avg': {'precision': 0.9375, 'recall': 0.8181818181818181, 'f1-score': 0.8555555555555556, 'support': 39}, 'weighted avg': {'precision': 0.9102564102564102, 'recall': 0.8974358974358975, 'f1-score': 0.8894586894586894, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9805194805194806, 'classificati

  _warn_prf(average, modifier, msg_start, len(result))


PPP score with cleaning: Cleaner: {'outlier_detection': PyODIsolationForest, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.8798701298701298, 'classification_report': {'1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11}, '2': {'precision': 0.717948717948718, 'recall': 1.0, 'f1-score': 0.835820895522388, 'support': 28}, 'accuracy': 0.717948717948718, 'macro avg': {'precision': 0.358974358974359, 'recall': 0.5, 'f1-score': 0.417910447761194, 'support': 39}, 'weighted avg': {'precision': 0.5154503616042078, 'recall': 0.717948717948718, 'f1-score': 0.6000765403750479, 'support': 39}}} 

Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9805194805194806, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.6363636363636364, 'f1-score': 0.7777777777777778, 'support': 11}, '2': {'precision': 0.875, 'recall': 1.0, 'f1-score': 0.9333333333333333, 'support': 28}, 'acc

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'V14', 'fraction': 0.75}
GaussianNoise: {'column': 'V16', 'fraction': 0.75}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.4837662337662338, 'classification_report': {'1': {'precision': 0.30303030303030304, 'recall': 0.9090909090909091, 'f1-score': 0.45454545454545453, 'support': 11}, '2': {'precision': 0.8333333333333334, 'recall': 0.17857142857142858, 'f1-score': 0.29411764705882354, 'support': 28}, 'accuracy': 0.38461538461538464, 'macro avg': {'precision': 0.5681818181818182, 'recall': 0.5438311688311688, 'f1-score': 0.37433155080213903, 'support': 39}, 'weighted avg': {'precision': 0.6837606837606838, 'recall': 0.38461538461538464, 'f1-score': 0.3393665158371041, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanMod

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'V14', 'fraction': 0.25}
GaussianNoise: {'column': 'V16', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9935064935064934, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.8181818181818182, 'f1-score': 0.9, 'support': 11}, '2': {'precision': 0.9333333333333333, 'recall': 1.0, 'f1-score': 0.9655172413793104, 'support': 28}, 'accuracy': 0.9487179487179487, 'macro avg': {'precision': 0.9666666666666667, 'recall': 0.9090909090909092, 'f1-score': 0.9327586206896552, 'support': 39}, 'weighted avg': {'precision': 0.9521367521367521, 'recall': 0.9487179487179487, 'f1-score': 0.9470380194518125, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.9935064935064934, 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.0s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'V14', 'fraction': 0.5}
GaussianNoise: {'column': 'V16', 'fraction': 0.5}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9935064935064934, 'classification_report': {'1': {'precision': 0.9090909090909091, 'recall': 0.9090909090909091, 'f1-score': 0.9090909090909091, 'support': 11}, '2': {'precision': 0.9642857142857143, 'recall': 0.9642857142857143, 'f1-score': 0.9642857142857143, 'support': 28}, 'accuracy': 0.9487179487179487, 'macro avg': {'precision': 0.9366883116883117, 'recall': 0.9366883116883117, 'f1-score': 0.9366883116883117, 'support': 39}, 'weighted avg': {'precision': 0.9487179487179487, 'recall': 0.9487179487179487, 'f1-score': 0.9487179487179487, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputatio

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.1s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V2', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'V14', 'fraction': 0.75}
GaussianNoise: {'column': 'V16', 'fraction': 0.75}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9967532467532467, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.9090909090909091, 'f1-score': 0.9523809523809523, 'support': 11}, '2': {'precision': 0.9655172413793104, 'recall': 1.0, 'f1-score': 0.9824561403508771, 'support': 28}, 'accuracy': 0.9743589743589743, 'macro avg': {'precision': 0.9827586206896552, 'recall': 0.9545454545454546, 'f1-score': 0.9674185463659147, 'support': 39}, 'weighted avg': {'precision': 0.9752431476569408, 'recall': 0.9743589743589743, 'f1-score': 0.9739733950260265, 'support': 39}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 1.0, 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.25}
GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.8846153846153846, 'classification_report': {'absent': {'precision': 0.8333333333333334, 'recall': 0.7692307692307693, 'f1-score': 0.8, 'support': 26}, 'present': {'precision': 0.8, 'recall': 0.8571428571428571, 'f1-score': 0.8275862068965518, 'support': 28}, 'accuracy': 0.8148148148148148, 'macro avg': {'precision': 0.8166666666666667, 'recall': 0.8131868131868132, 'f1-score': 0.8137931034482759, 'support': 54}, 'weighted avg': {'precision': 0.8160493827160494, 'recall': 0.8148148148148148, 'f1-score': 0.8143039591315455, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlier

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.5}
GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.5}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.8145604395604397, 'classification_report': {'absent': {'precision': 0.8181818181818182, 'recall': 0.6923076923076923, 'f1-score': 0.7500000000000001, 'support': 26}, 'present': {'precision': 0.75, 'recall': 0.8571428571428571, 'f1-score': 0.7999999999999999, 'support': 28}, 'accuracy': 0.7777777777777778, 'macro avg': {'precision': 0.7840909090909092, 'recall': 0.7747252747252746, 'f1-score': 0.775, 'support': 54}, 'weighted avg': {'precision': 0.7828282828282829, 'recall': 0.7777777777777778, 'f1-score': 0.775925925925926, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierD

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.8s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.75}
GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.75}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.8118131868131868, 'classification_report': {'absent': {'precision': 0.8, 'recall': 0.6153846153846154, 'f1-score': 0.6956521739130435, 'support': 26}, 'present': {'precision': 0.7058823529411765, 'recall': 0.8571428571428571, 'f1-score': 0.7741935483870968, 'support': 28}, 'accuracy': 0.7407407407407407, 'macro avg': {'precision': 0.7529411764705882, 'recall': 0.7362637362637363, 'f1-score': 0.7349228611500701, 'support': 54}, 'weighted avg': {'precision': 0.7511982570806099, 'recall': 0.7407407407407407, 'f1-score': 0.7363773310477377, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detect

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    8.9s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.6s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.25}
GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9217032967032966, 'classification_report': {'absent': {'precision': 0.8, 'recall': 0.9230769230769231, 'f1-score': 0.8571428571428571, 'support': 26}, 'present': {'precision': 0.9166666666666666, 'recall': 0.7857142857142857, 'f1-score': 0.8461538461538461, 'support': 28}, 'accuracy': 0.8518518518518519, 'macro avg': {'precision': 0.8583333333333334, 'recall': 0.8543956043956045, 'f1-score': 0.8516483516483516, 'support': 54}, 'weighted avg': {'precision': 0.8604938271604938, 'recall': 0.8518518518518519, 'f1-score': 0.8514448514448515, 'support': 54}}}
PPP score with cleaning: Cleaner: {'outlier_detect

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.1s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.5}
GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.5}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9271978021978021, 'classification_report': {'absent': {'precision': 0.8333333333333334, 'recall': 0.9615384615384616, 'f1-score': 0.8928571428571429, 'support': 26}, 'present': {'precision': 0.9583333333333334, 'recall': 0.8214285714285714, 'f1-score': 0.8846153846153847, 'support': 28}, 'accuracy': 0.8888888888888888, 'macro avg': {'precision': 0.8958333333333334, 'recall': 0.8914835164835164, 'f1-score': 0.8887362637362638, 'support': 54}, 'weighted avg': {'precision': 0.8981481481481481, 'recall': 0.8888888888888888, 'f1-score': 0.8885836385836385, 'support': 54}}}
PPP score with cleaning: Cleaner: {'ou

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  45 | elapsed:    9.0s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    9.7s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'age', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'resting_electrocardiographic_results', 'fraction': 0.75}
GaussianNoise: {'column': 'maximum_heart_rate_achieved', 'fraction': 0.75}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9381868131868132, 'classification_report': {'absent': {'precision': 0.8571428571428571, 'recall': 0.9230769230769231, 'f1-score': 0.888888888888889, 'support': 26}, 'present': {'precision': 0.9230769230769231, 'recall': 0.8571428571428571, 'f1-score': 0.888888888888889, 'support': 28}, 'accuracy': 0.8888888888888888, 'macro avg': {'precision': 0.8901098901098901, 'recall': 0.8901098901098901, 'f1-score': 0.888888888888889, 'support': 54}, 'weighted avg': {'precision': 0.8913308913308913, 'recall': 0.8888888888888888, 'f1-score': 0.8888888888888888, 'support': 54}}}
PPP score with cleaning: Cleaner: {'ou

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.5s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.25}
GaussianNoise: {'column': 'residence_since', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.793361383824217, 'classification_report': {'bad': {'precision': 0.6153846153846154, 'recall': 0.3870967741935484, 'f1-score': 0.47524752475247517, 'support': 62}, 'good': {'precision': 0.7639751552795031, 'recall': 0.8913043478260869, 'f1-score': 0.822742474916388, 'support': 138}, 'accuracy': 0.735, 'macro avg': {'precision': 0.6896798853320593, 'recall': 0.6392005610098177, 'f1-score': 0.6489949998344315, 'support': 200}, 'weighted avg': {'precision': 0.7179120879120879, 'recall': 0.735, 'f1-score': 0.715019040365575, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation'

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.1s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.5}
GaussianNoise: {'column': 'residence_since', 'fraction': 0.5}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7755960729312764, 'classification_report': {'bad': {'precision': 0.5384615384615384, 'recall': 0.3387096774193548, 'f1-score': 0.41584158415841577, 'support': 62}, 'good': {'precision': 0.7453416149068323, 'recall': 0.8695652173913043, 'f1-score': 0.8026755852842808, 'support': 138}, 'accuracy': 0.705, 'macro avg': {'precision': 0.6419015766841853, 'recall': 0.6041374474053296, 'f1-score': 0.6092585847213483, 'support': 200}, 'weighted avg': {'precision': 0.6812087912087911, 'recall': 0.705, 'f1-score': 0.6827570449352627, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation'

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    6.9s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.75}
GaussianNoise: {'column': 'residence_since', 'fraction': 0.75}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7052360916316036, 'classification_report': {'bad': {'precision': 0.4411764705882353, 'recall': 0.24193548387096775, 'f1-score': 0.3125, 'support': 62}, 'good': {'precision': 0.7168674698795181, 'recall': 0.8623188405797102, 'f1-score': 0.7828947368421053, 'support': 138}, 'accuracy': 0.67, 'macro avg': {'precision': 0.5790219702338767, 'recall': 0.552127162225339, 'f1-score': 0.5476973684210527, 'support': 200}, 'weighted avg': {'precision': 0.6314032600992205, 'recall': 0.67, 'f1-score': 0.6370723684210526, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeIm

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.5s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.25}
GaussianNoise: {'column': 'residence_since', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.774310425432445, 'classification_report': {'bad': {'precision': 0.6666666666666666, 'recall': 0.3870967741935484, 'f1-score': 0.4897959183673469, 'support': 62}, 'good': {'precision': 0.7682926829268293, 'recall': 0.9130434782608695, 'f1-score': 0.8344370860927153, 'support': 138}, 'accuracy': 0.75, 'macro avg': {'precision': 0.717479674796748, 'recall': 0.6500701262272089, 'f1-score': 0.662116502230031, 'support': 200}, 'weighted avg': {'precision': 0.7367886178861789, 'recall': 0.75, 'f1-score': 0.7275983240978511, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': M

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.3s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.5}
GaussianNoise: {'column': 'residence_since', 'fraction': 0.5}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7705703599812997, 'classification_report': {'bad': {'precision': 0.6388888888888888, 'recall': 0.3709677419354839, 'f1-score': 0.4693877551020407, 'support': 62}, 'good': {'precision': 0.7621951219512195, 'recall': 0.9057971014492754, 'f1-score': 0.8278145695364238, 'support': 138}, 'accuracy': 0.74, 'macro avg': {'precision': 0.7005420054200542, 'recall': 0.6383824216923797, 'f1-score': 0.6486011623192323, 'support': 200}, 'weighted avg': {'precision': 0.7239701897018971, 'recall': 0.74, 'f1-score': 0.716702257061765, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': Me

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.0s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'credit_amount', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.75}
GaussianNoise: {'column': 'residence_since', 'fraction': 0.75}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7793361383824217, 'classification_report': {'bad': {'precision': 0.6666666666666666, 'recall': 0.3870967741935484, 'f1-score': 0.4897959183673469, 'support': 62}, 'good': {'precision': 0.7682926829268293, 'recall': 0.9130434782608695, 'f1-score': 0.8344370860927153, 'support': 138}, 'accuracy': 0.75, 'macro avg': {'precision': 0.717479674796748, 'recall': 0.6500701262272089, 'f1-score': 0.662116502230031, 'support': 200}, 'weighted avg': {'precision': 0.7367886178861789, 'recall': 0.75, 'f1-score': 0.7275983240978511, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': 

In [13]:
# Echo `results` as the cell's last expression so the notebook renders its repr.
# In the saved output this is a list of dicts; each visible entry has a
# 'cleaners' list whose items record 'Imputation method',
# 'Outlier detection method' and 'PPP score with cleaning'.
# NOTE(review): `results` is presumably filled by the experiment cell(s) above,
# which are not visible here -- confirm they run first on a fresh kernel.
results

[{'cleaners': [{'Imputation method': MeanModeImputation,
    'Outlier detection method': NoOutlierDetection,
    'PPP score with cleaning': {'classification_report': {'1': {'f1-score': 0.6470588235294118,
       'precision': 0.4782608695652174,
       'recall': 1.0,
       'support': 11},
      '2': {'f1-score': 0.7272727272727273,
       'precision': 1.0,
       'recall': 0.5714285714285714,
       'support': 28},
      'accuracy': 0.6923076923076923,
      'macro avg': {'f1-score': 0.6871657754010696,
       'precision': 0.7391304347826086,
       'recall': 0.7857142857142857,
       'support': 39},
      'weighted avg': {'f1-score': 0.704648292883587,
       'precision': 0.8528428093645485,
       'recall': 0.6923076923076923,
       'support': 39}},
     'roc_auc_acore': 0.7500000000000001}},
   {'Imputation method': NoImputation,
    'Outlier detection method': PyODKNN,
    'PPP score with cleaning': {'classification_report': {'1': {'f1-score': 0.28571428571428564,
       'precisi