In [1]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [2]:
seed = 10

In [3]:
def run_experiment(dataset_name, learner, param_grid, corruptions, cleaners):
    
    ## dataset
    dataset = Dataset(seed, dataset_name)
    
    all_data = dataset.all_data
    attribute_names = dataset.attribute_names
    attribute_types = dataset.attribute_types
    
    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")
    
    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()
    
    
    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(seed, df_train, lab_train, df_test, lab_test, categorical_columns, numerical_columns, learner, param_grid, corruptions)
    ppp_model = ppp.fit_ppp(df_train)
    
    ## generate corrpted data
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions)
    
    ## cleaning
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)
    df_cleaned, corrupted_score_ppp, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_corrupted)
    
    ## results
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': np.array(cleaner_scores_ppp).max(),
        'ppp_scores_cleaners': cleaner_scores_ppp
    }
#     print('\n'.join([f'{key}:{val}' for key, val in result.items()]))
    
    ## summary
    summary = {
        'dataset': dataset_name,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }
#     print('\n\n\n\n'.join([f'{key}:{val}' for key, val in summary.items()]))
    
    return summary #summary_col_corrupt, result

In [4]:
datasets = [
    'parkinsons',
    'heart-statlog',
    'credit-g'
]

In [5]:
## model parameters
## models is a dict where key = leaner & value = param_grid
models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }, 
          RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
                                    'learner__max_depth': [5, 10, 15]
                                   }
         }

## make dict of multiple leraners and corresponding param_grids

In [6]:
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]

In [7]:
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation)
]

In [8]:
# results = []
# for _ in range(5):
#     results.append(run_experiment(random.choice(datasets), learner, param_grid, corruptions, cleaners))

In [9]:
results = []

for dataset in datasets:
    for learner, param_grid in models.items():
        results.append(run_experiment(dataset, learner, param_grid, corruptions, cleaners))

Dataset 'parkinsons', target: 'Class'
**Author**:   
**Source**: UCI
**Please cite**: 'Exploiting Nonlinear Recurrence and Fractal Scaling Properties for Voice Disorder Detection', Little MA, McSharry PE, Roberts SJ, Costello DAE, Moroz IM. BioMedical Engineering OnLine 2007, 6:23 (26 June 2007) 

* Abstract: 

Oxford Parkinson's Disease Detection Dataset

* Source:

The dataset was created by Max Little of the University of Oxford, in collaboration with the National Centre for Voice and Speech, Denver, Colorado, who recorded the sp

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,V1,False
1,V2,False
2,V3,False
3,V4,False
4,V5,False
5,V6,False
6,V7,False
7,V8,False
8,V9,False
9,V10,False


Found 0 categorical and 22 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    7.1s finished


Generating corrupted training data on 39 rows... 

Can't apply the SwappedValues corruption because there are no categorical columns. 


Applying perturbations... 

MissingValues: {'column': 'V14', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'V19', 'fraction': 0.5}
GaussianNoise: {'column': 'V7', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: 0.7045454545454545
Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x000001200AEB7160>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001207C2B6AC8>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200AF73588>: 0.6948051948051948 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000001200AFE14E0>
Imputation method: <jenga.cleaning.imputation.NoImputation object at 0x000001200AFE1A90>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200AFE18D

Unnamed: 0,attribute_names,categorical_indicator
0,V1,False
1,V2,False
2,V3,False
3,V4,False
4,V5,False
5,V6,False
6,V7,False
7,V8,False
8,V9,False
9,V10,False


Found 0 categorical and 22 numeric features 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   12.5s finished


Generating corrupted training data on 39 rows... 

Applying perturbations... 

MissingValues: {'column': 'V14', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'V19', 'fraction': 0.5}
GaussianNoise: {'column': 'V7', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: 0.9967532467532467
Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x000001200AFE1F28>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001200AFE1198>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200AECF048>: 0.9967532467532467 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000001200AECF6D8>
Imputation method: <jenga.cleaning.imputation.NoImputation object at 0x000001200AECFB70>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200AF9DE10>: 0.9967532467532467 

Outlier detection method: <jenga.cleaning.outlier_detection.P

Unnamed: 0,attribute_names,categorical_indicator
0,age,False
1,sex,False
2,chest,False
3,resting_blood_pressure,False
4,serum_cholestoral,False
5,fasting_blood_sugar,False
6,resting_electrocardiographic_results,False
7,maximum_heart_rate_achieved,False
8,exercise_induced_angina,False
9,oldpeak,False


Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 173 out of 180 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.7s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'resting_electrocardiographic_results', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'oldpeak', 'fraction': 0.5}
GaussianNoise: {'column': 'resting_blood_pressure', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: 0.9175824175824175
Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x000001200AF91C18>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001200AF91278>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200AF91E80>: 0.9175824175824175 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000001200AF86D68>
Imputation method: <jenga.cleaning.imputation.NoImputation object at 0x000001200AFE1A58>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200AEB7F98>: 0.853021978021978 

Outli

Unnamed: 0,attribute_names,categorical_indicator
0,age,False
1,sex,False
2,chest,False
3,resting_blood_pressure,False
4,serum_cholestoral,False
5,fasting_blood_sugar,False
6,resting_electrocardiographic_results,False
7,maximum_heart_rate_achieved,False
8,exercise_induced_angina,False
9,oldpeak,False


Found 0 categorical and 13 numeric features 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    8.9s finished


Generating corrupted training data on 54 rows... 

Applying perturbations... 

MissingValues: {'column': 'resting_electrocardiographic_results', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'oldpeak', 'fraction': 0.5}
GaussianNoise: {'column': 'resting_blood_pressure', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: 0.8942307692307693
Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x000001200AF7D6D8>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001200AF7DBE0>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200B07ACC0>: 0.8969780219780219 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000001200B07AE80>
Imputation method: <jenga.cleaning.imputation.NoImputation object at 0x000001200B07A978>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200B07A9E8>: 0.9024725274725275 

Outl

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Found 13 categorical and 7 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.2s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'age', 'fraction': 0.5}
GaussianNoise: {'column': 'credit_amount', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: 0.7196119682094437
Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x0000012003D91320>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x0000012003D91278>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000012003D91400>: 0.7196119682094437 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x0000012003D91588>
Imputation method: <jenga.cleaning.imputation.NoImputation object at 0x0000012003D912E8>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000012003D91240>: 0.8000233754090696 

Outlier detection method: <jenga.cleani

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Found 13 categorical and 7 numeric features 

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   11.2s finished


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'age', 'fraction': 0.5}
GaussianNoise: {'column': 'credit_amount', 'fraction': 0.25}

Applying cleaners... 

PPP score no cleaning: 0.7879850397381954
Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x0000012003DF1358>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001200C16F9B0>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200B07AA20>: 0.7807386629266012 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000001200B07AC18>
Imputation method: <jenga.cleaning.imputation.NoImputation object at 0x000001200B07A860>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001200B07AE48>: 0.7916082281439926 

Outlier detection method: <jenga.cleani

In [10]:
results

[{'dataset': 'parkinsons',
  'model': SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
                l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
                n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
                random_state=None, shuffle=True, tol=0.001,
                validation_fraction=0.1, verbose=0, warm_start=False),
  'corruptions': defaultdict(list,
              {('V14',): [<jenga.corruptions.generic.MissingValues at 0x1200af73630>],
               ('V19',): [<jenga.corruptions.numerical.Scaling at 0x1200aeb7208>],
               ('V7',): [<jenga.corruptions.numerical.GaussianNoise at 0x1200af739b0>]}),
  'cleaners': [{'Outlier detection method': <jenga.cleaning.outlier_detection.NoOutlierDetection at 0x1200aeb7160>,
    'Imputation method': <jenga.cleaning.imputation.MeanModeImputation at 0x1207c2b6ac8>,
    'PPP score with cleaning