In [1]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.clean import Clean

In [2]:
# Seed passed to `Dataset` and `PipelinePerformancePrediction` throughout the notebook.
seed = 10
# Also seed Python's global RNG: the experiment loop picks its dataset with
# `random.choice`, which would otherwise differ between kernel restarts.
random.seed(seed)

In [3]:
## model parameters
# Linear classifier trained with SGD; it is used as the 'learner' step of the
# prediction pipeline built in `run_experiment`.
learner = SGDClassifier(max_iter=1000)

# Hyperparameter grid handed to `PipelinePerformancePrediction`. Every key carries
# the `learner__` prefix so that it addresses the pipeline step named 'learner'.
# NOTE(review): loss='log' was renamed to 'log_loss' in newer scikit-learn
# releases -- confirm against the installed version before upgrading.
param_grid = dict(
    learner__loss=['log'],
    learner__penalty=['l2', 'l1', 'elasticnet'],
    learner__alpha=[0.0001, 0.001, 0.01, 0.1],
)

In [4]:
# Corruption classes forwarded to `PipelinePerformancePrediction` and
# `ppp.get_corrupted`: two from jenga's generic module, two from its numerical module.
corruptions = [
    MissingValues,
    SwappedValues,
    Scaling,
    GaussianNoise,
]

In [None]:
# Load 'credit-g' and pull out its column groups and train/test split.
dataset = Dataset(seed, 'credit-g')
categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns
df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()

# NOTE(review): unlike `run_experiment`, this cell passes no pipeline to
# PipelinePerformancePrediction and never calls `fit_ppp` before corrupting the
# test data -- confirm that this is intended.
ppp = PipelinePerformancePrediction(
    seed,
    df_train, lab_train,
    df_test, lab_test,
    categorical_columns, numerical_columns,
    learner, param_grid, corruptions,
)

# Corrupt the test data; the four return values are inspected in the next cells.
(
    df_corrupted,
    perturbations,
    cols_perturbed,
    summary_col_corrupt,
) = ppp.get_corrupted(df_test, corruptions)

In [None]:
# Show the corrupted test data returned by `ppp.get_corrupted`.
df_corrupted

In [None]:
# Show the second value returned by `ppp.get_corrupted`
# (presumably the perturbations that were applied -- confirm against jenga).
perturbations

In [None]:
# Show the third value returned by `ppp.get_corrupted`
# (presumably the columns each perturbation touched -- confirm against jenga).
cols_perturbed

In [None]:
# Show the corruption summary returned by `ppp.get_corrupted`; `run_experiment`
# stores the same kind of value under the 'corruptions' key of its result.
summary_col_corrupt

In [6]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation

# Every (outlier detection, imputation) combination: the outlier detector varies
# slowest, the imputation fastest, giving 3 x 3 = 9 pairs.
# NOTE(review): this list is passed to `run_experiment`, which does not read it.
cleaners = [
    (outlier_detection, imputation)
    for outlier_detection in (NoOutlierDetection, PyODKNN, PyODIsolationForest)
    for imputation in (NoImputation, MeanModeImputation, DatawigImputation)
]

In [7]:
def _build_pipeline(categorical_columns, numerical_columns, learner):
    """Build the prediction pipeline: column-wise preprocessing followed by `learner`.

    Numerical columns are constant-imputed with 0 and then standardized.
    Categorical columns are constant-imputed with '__NA__' and one-hot encoded,
    with categories unseen during fitting ignored. The classifier is registered
    under the step name 'learner', which is the prefix grid-search keys must use.
    """
    # preprocessing pipeline for numerical columns
    transformer_numeric = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('standard_scale', StandardScaler())
    ])

    # preprocessing pipeline for categorical columns
    transformer_categorical = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
        ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    # route each column group through its own preprocessing pipeline
    feature_transform = ColumnTransformer(transformers=[
        ('categorical_features', transformer_categorical, categorical_columns),
        ('numerical_features', transformer_numeric, numerical_columns)
    ])

    # prediction pipeline: append the classifier to the preprocessing step
    return Pipeline([
        ('features', feature_transform),
        ('learner', learner)
    ])


def run_experiment(dataset_name, learner, param_grid, corruptions, cleaners, random_state=None):
    """Train on one dataset, corrupt its test data, clean it, and report the scores.

    Args:
        dataset_name: Name handed to `Dataset` (e.g. 'credit-g').
        learner: Estimator used as the final 'learner' step of the pipeline.
            It is fitted in place by `pipeline.fit`.
        param_grid: Hyperparameter grid forwarded to `PipelinePerformancePrediction`.
        corruptions: Corruption classes forwarded to `PipelinePerformancePrediction`
            and to `ppp.get_corrupted`.
        cleaners: Currently not read by this function; kept so existing calls
            keep working.
        random_state: Optional seed for this run. When omitted, the notebook-level
            `seed` is used, which matches the previous behavior.

    Returns:
        dict with the keys
            'dataset'     -- the `Dataset` object,
            'model'       -- the `learner` that was passed in,
            'corruptions' -- the corruption summary from `ppp.get_corrupted`,
            'result'      -- dict of scores: pipeline scores on the original,
                             corrupted and cleaned test data, the matching PPP
                             scores, and the per-cleaner PPP scores.
    """
    # Fall back to the global seed so existing five-argument calls are unchanged.
    run_seed = seed if random_state is None else random_state

    ## dataset
    dataset = Dataset(run_seed, dataset_name)

    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features")

    ## prediction pipeline (preprocessing + learner)
    pipeline = _build_pipeline(categorical_columns, numerical_columns, learner)

    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()

    ## baseline: fit on the training data, score on the untouched test data
    pipeline.fit(df_train, lab_train)
    model_score = pipeline.score(df_test, lab_test)
    print(f"Model score: {model_score}")

    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(
        run_seed, df_train, lab_train, df_test, lab_test,
        categorical_columns, numerical_columns,
        learner, param_grid, corruptions, pipeline
    )
    ppp_model = ppp.fit_ppp(df_train)

    ## generate corrupted data (only the corrupted data and the summary are used here)
    df_corrupted, _perturbations, _cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions)

    ## cleaning
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, ppp, ppp_model)
    df_cleaned, corrupted_score_ppp, cleaner_scores_ppp = clean(df_train, df_corrupted)

    ## results
    result = {
        'test_score_model': model_score,  # original test data
        'test_score_corrupted': pipeline.score(df_corrupted, lab_test),
        'test_score_cleaned': pipeline.score(df_cleaned, lab_test),
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        # best PPP score over all cleaners that were tried
        'ppp_score_cleaned': np.array(cleaner_scores_ppp).max(),
        'ppp_scores_cleaners': cleaner_scores_ppp
    }

    ## summary
    summary = {
        'dataset': dataset,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'result': result
    }
    print('\n\n\n\n'.join([f'{key}:{val}' for key, val in summary.items()]))

    return summary

In [8]:
# Dataset names the experiment loop samples from; uncomment an entry to include it.
datasets = [
    'credit-g',
    # 'heart-statlog',
    # 'parkinsons',
]

In [10]:
# Run the experiment five times on a randomly chosen dataset and keep every summary.
# NOTE(review): the recorded output shows identical scores in all five repetitions,
# presumably because every run uses the same `seed` -- confirm whether the
# repetitions are meant to differ.
results = []
for _ in range(5):
    chosen_dataset = random.choice(datasets)
    run_summary = run_experiment(chosen_dataset, learner, param_grid, corruptions, cleaners)
    results.append(run_summary)

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    

**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)



**German Credit data**  

This dataset classifies people described by a set of attributes as good or bad credit risks.



This dataset comes with a cost matrix: 

``` 

      Good  Bad (predicted)  

Good   0    1   (actual)  

Bad    5    0  

```



It is worse to class a customer as go

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Found 13 categorical and 7 numeric features
Model score: 0.75
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    6.0s finished


Generating corrupted training data on 200 rows...
Applying perturbations...
MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'} on column ['other_parties']
SwappedValues: {'column_a': 'housing', 'column_b': 'checking_status', 'fraction': 0.5} on column ['housing', 'checking_status']
Scaling: {'column': 'installment_commitment', 'fraction': 0.75} on column ['installment_commitment']
GaussianNoise: {'column': 'age', 'fraction': 0.5} on column ['age']
PPP score no cleaning: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91C7A390>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91998048>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91C66588>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D822B1080>: 0.6639784946236559
PPP score with cleaning: <je

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Found 13 categorical and 7 numeric features
Model score: 0.75
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.6s finished


Generating corrupted training data on 200 rows...
Applying perturbations...
MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'} on column ['other_parties']
SwappedValues: {'column_a': 'housing', 'column_b': 'checking_status', 'fraction': 0.5} on column ['housing', 'checking_status']
Scaling: {'column': 'installment_commitment', 'fraction': 0.75} on column ['installment_commitment']
GaussianNoise: {'column': 'age', 'fraction': 0.5} on column ['age']
PPP score no cleaning: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E1DAC8>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E1DA20>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E01470>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E01F60>: 0.6639784946236559
PPP score with cleaning: <je

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Found 13 categorical and 7 numeric features
Model score: 0.75
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.3s finished


Generating corrupted training data on 200 rows...
Applying perturbations...
MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'} on column ['other_parties']
SwappedValues: {'column_a': 'housing', 'column_b': 'checking_status', 'fraction': 0.5} on column ['housing', 'checking_status']
Scaling: {'column': 'installment_commitment', 'fraction': 0.75} on column ['installment_commitment']
GaussianNoise: {'column': 'age', 'fraction': 0.5} on column ['age']
PPP score no cleaning: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E01F28>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E01AC8>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E010F0>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91C81390>: 0.6639784946236559
PPP score with cleaning: <je

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Found 13 categorical and 7 numeric features
Model score: 0.75
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    1.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.4s finished


Generating corrupted training data on 200 rows...
Applying perturbations...
MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'} on column ['other_parties']
SwappedValues: {'column_a': 'housing', 'column_b': 'checking_status', 'fraction': 0.5} on column ['housing', 'checking_status']
Scaling: {'column': 'installment_commitment', 'fraction': 0.75} on column ['installment_commitment']
GaussianNoise: {'column': 'age', 'fraction': 0.5} on column ['age']
PPP score no cleaning: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E1DF98>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91C73278>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E01898>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91E01080>: 0.6639784946236559
PPP score with cleaning: <je

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Found 13 categorical and 7 numeric features
Model score: 0.75
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.4s finished


Generating corrupted training data on 200 rows...
Applying perturbations...
MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'} on column ['other_parties']
SwappedValues: {'column_a': 'housing', 'column_b': 'checking_status', 'fraction': 0.5} on column ['housing', 'checking_status']
Scaling: {'column': 'installment_commitment', 'fraction': 0.75} on column ['installment_commitment']
GaussianNoise: {'column': 'age', 'fraction': 0.5} on column ['age']
PPP score no cleaning: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91C73400>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D91C61400>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D92DF8EF0>: 0.6639784946236559
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x0000018D92DF8EB8>: 0.6639784946236559
PPP score with cleaning: <je