## Dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/home/rupali/Documents/Master Thesis/jenga')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [33]:
dataset = Dataset("thoracic_surgery") ## hill-valley ## thoracic_surgery ## cleve ## acute-inflammations

all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

  and should_run_async(code)


Dataset: thoracic_surgery
Found 13 categorical and 3 numeric features 



### Get training and test sets

In [34]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

  and should_run_async(code)


In [4]:
########################################

In [35]:
## use categorical columns as strings
def cat_cols_to_str(df):
    for col in df.columns:
        if pd.api.types.is_categorical_dtype(df[col]):
            df[col] = df[col].astype(str)

    return df

In [36]:
### if we don't convert the categorical columns to str, the swapping corruption doesn't let us assign new values to the column: "Cannot setitem on a Categorical with a new category, set the categories first"
train_data = cat_cols_to_str(train_data)
test_data = cat_cols_to_str(test_data)

In [None]:
########################################

## Workaround to get the data when OpenML decided not to work :D

In [2]:
train_dat = pd.read_csv("/home/rupali/Downloads/Hill_Valley_without_noise_Training.data", sep=",")
train_data = train_dat.loc[:, train_dat.columns != 'class']
train_labels = train_dat["class"]

In [3]:
test_dat = pd.read_csv("/home/rupali/Downloads/Hill_Valley_without_noise_Testing.data", sep=",")
test_data = test_dat.loc[:, test_dat.columns != 'class']
test_labels = test_dat["class"]

In [4]:
categorical_columns = []
numerical_columns = []

for col in train_data.columns:
    if pd.api.types.is_categorical_dtype(train_data[col]):
        categorical_columns.append(col)
    elif pd.api.types.is_numeric_dtype(train_data[col]):
        numerical_columns.append(col)

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Found 0 categorical and 100 numeric features 



## Defined Model

In [44]:
from sklearn.linear_model import SGDClassifier

learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

  and should_run_async(code)


## Corruptions using PPP

In [45]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, CategoricalShift, GaussianNoise, SwappedValues]
fraction = 0.5
num_repetitions = 5

In [46]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'PRE25', 'fraction': 0.5, 'sampling': 'MNAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'PRE5', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: CategoricalShift: {'column': 'PRE17', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: GaussianNoise: {'column': 'PRE4', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: SwappedValues: {'column': 'DGN', 'fraction': 0.5, 'sampling': 'MNAR', 'swap_with': None}

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'PRE9', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'PRE4', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: CategoricalShift: {'column': 'DGN', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: GaussianNoise: {'column': 'PRE5', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation

In [47]:
train_data["PRE17"].unique()

  and should_run_async(code)


array(['F', 'T'], dtype=object)

In [48]:
df_corrupted["PRE17"].unique()

array(['F', 'T'], dtype=object)

## Autogluon model and Corruptions

In [4]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise, CategoricalShift]
fraction = 0.5
num_repetitions = 5

In [5]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns)
ppp_model = ppp.fit_ppp(train_data)
train_data = train_data.loc[:, train_data.columns != 'class']

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Level 25:autogluon.core.utils.utils:No path specified. Models will be saved in: "AutogluonModels/ag-20210329_235658/"
INFO:autogluon.tabular.learner.default_learner:Beginning AutoGluon training ...
INFO:autogluon.tabular.learner.default_learner:AutoGluon will save models to "AutogluonModels/ag-20210329_235658/"
INFO:autogluon.tabular.learner.default_learner:AutoGluon Version:  0.1.0
INFO:autogluon.tabular.learner.default_learner:Train Data Rows:    329
INFO:autogluon.tabular.learner.default_learner:Train Data Columns: 16
INFO:autogluon.tabular.learner.default_learner:Preprocessing data ...
Level 25:autogluon.core.utils.utils:AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
INFO:autogluon.core.utils.utils:	2 unique label values:  ['1', '2']
Level 25:autogluon.core.utils.utils:	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'mu

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: NeuralNetFastAI ...


█

INFO:autogluon.tabular.trainer.abstract_trainer:	0.8485	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	6.96s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.12s	 = Validation runtime
INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: LightGBMLarge ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8485	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	25.84s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.04s	 = Validation runtime


█

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: WeightedEnsemble_L2 ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.52s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.0s	 = Validation runtime
INFO:autogluon.tabular.learner.default_learner:AutoGluon training complete, total runtime = 77.99s ...
INFO:root:TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210329_235658/")



Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'V13', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V3', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: GaussianNoise: {'column': 'V3', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: CategoricalShift: {'column': 'V10', 'fraction': 0.5, 'sampling': 'MAR'}

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'V6', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V2', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: GaussianNoise: {'column': 'V2', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: CategoricalShift: {'column': 'V15', 'fraction': 0.5, 'sampling': 'MNAR'}

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'V4', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V2', '

## Cleaning

### PPP Cleaning

In [49]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, PyODPCAOutlierDetection, PyODCBLOFOutlierDetection, PyODSOSOutlierDetection, SklearnOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation, SklearnImputation

cleaners = []
for od in [NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, PyODPCAOutlierDetection, PyODCBLOFOutlierDetection, PyODSOSOutlierDetection, SklearnOutlierDetection]:
    for imp in [MeanModeImputation, SklearnImputation]:
        cleaners.append((od, imp))

In [50]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.85
numerical_std_error_threshold=0.9

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)


Applying cleaners... 

PPP score no cleaning: {'roc_auc_score': 0.6031746031746031, 'classification_report': {'F': {'precision': 0.8920863309352518, 'recall': 0.9841269841269841, 'f1-score': 0.9358490566037736, 'support': 126}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}, 'accuracy': 0.8794326241134752, 'macro avg': {'precision': 0.4460431654676259, 'recall': 0.49206349206349204, 'f1-score': 0.4679245283018868, 'support': 141}, 'weighted avg': {'precision': 0.7971835297719272, 'recall': 0.8794326241134752, 'f1-score': 0.8362906463267764, 'support': 141}}}
PPP scores with cleaning: 
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier for col: DGN reached 0.7184825352764284
Classifier for col: PRE6 reached 0.8288225769141799
Classifier for col: PRE7 reached 0.912589636826278
Classifier for col: PRE8 reached 0.8631737219523479
Classifier for col: PRE9 reached 0.9201653944020356
Classifier for col: PRE10 reached 0.8744795281054822
Classifier for col: PRE11 reached 0.8441186675919501
Classifier for col: PRE14 reached 0.5018794818413139
Classifier for col: PRE17 reached 0.9239532731899144




Classifier for col: PRE19 reached nan
Classifier for col: PRE25 reached 0.9810027758501041
Classifier for col: PRE30 reached 0.809969928290539




Classifier for col: PRE32 reached nan
Regressor for col: PRE4/lower reached 0.3191963198298544
Regressor for col: PRE4/median reached 0.6919160740239226
Regressor for col: PRE4/upper reached 0.32858594470844493
Regressor for col: PRE5/lower reached -0.04631760104137783
Regressor for col: PRE5/median reached 0.11889565101037064
Regressor for col: PRE5/upper reached -0.5157967308036887
Regressor for col: AGE/lower reached -0.8459488869564165
Regressor for col: AGE/median reached 0.15698995672571048
Regressor for col: AGE/upper reached -0.9508139639911878
Imputed 70 values in column PRE10
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    F
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.7398345



Classifier for col: DGN reached 0.6956974323386538
Classifier for col: PRE6 reached 0.8327261161230627
Classifier for col: PRE7 reached 0.9315579458709229
Classifier for col: PRE8 reached 0.8554533888503354
Classifier for col: PRE9 reached 0.9201364792967845
Classifier for col: PRE10 reached 0.878209576682859
Classifier for col: PRE11 reached 0.8326972010178118
Classifier for col: PRE14 reached 0.5362017117742308
Classifier for col: PRE17 reached 0.8934478371501272
Classifier for col: PRE19 reached 0.9810027758501041
Classifier for col: PRE25 reached 0.9542852185981957
Classifier for col: PRE30 reached 0.8403018736988203
Classifier for col: PRE32 reached 0.9847906546379829
Regressor for col: PRE4/lower reached 0.22996164720922563
Regressor for col: PRE4/median reached 0.7370594091354824
Regressor for col: PRE4/upper reached 0.3850345780240668
Regressor for col: PRE5/lower reached -0.050406930059022304
Regressor for col: PRE5/median reached 0.019286099159723624
Regressor for col: PRE5/u

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: PyODIsolationForestOutlierDetection, Outlier Detection Score: {'Precision



Classifier for col: DGN reached 0.6920541290770298
Classifier for col: PRE6 reached 0.8554244737450845
Classifier for col: PRE7 reached 0.9163486005089059
Classifier for col: PRE8 reached 0.8289671524404348
Classifier for col: PRE9 reached 0.9087439278278973
Classifier for col: PRE10 reached 0.8744795281054822
Classifier for col: PRE11 reached 0.8516944251677077
Classifier for col: PRE14 reached 0.4753643303261624
Classifier for col: PRE17 reached 0.9012260004626417
Classifier for col: PRE19 reached 0.9847906546379829
Classifier for col: PRE25 reached 0.9733981031690955
Classifier for col: PRE30 reached 0.809883182974786




Classifier for col: PRE32 reached nan
Regressor for col: PRE4/lower reached 0.2683269238123703
Regressor for col: PRE4/median reached 0.6918532085769566
Regressor for col: PRE4/upper reached 0.32699952490432177
Regressor for col: PRE5/lower reached -0.054291562699303575
Regressor for col: PRE5/median reached 0.08886075407350147
Regressor for col: PRE5/upper reached -0.09822752736073076
Regressor for col: AGE/lower reached -0.9242400776621128
Regressor for col: AGE/median reached 0.0851537864896586
Regressor for col: AGE/upper reached -0.8739458257293028
Imputed 70 values in column PRE10
Imputed 50 values in column PRE4
Imputed 117 values in column PRE5
Imputed 35 values in column AGE
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    F
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Outlier detection method: PyODIsolationForestOutlierDetection, Outlier Detection Score: {'Precision': 0.7248772504091654, 'Recall': 0.7333333333333332, 'F1-score': 0.7136248667192064, 'Accuracy': 0.8567375886524822}
Imputation method: SklearnImputation, Imputation Score: {'Precision': 0.8371921779464473, 'Recall': 0.804084661043424, 'F1-score': 0.7589462138057824, 'Accuracy': 0.8037825059101656, 'Mean Squared Error': 109.73080419016327}
Cleaner: (PyODIsolationForestOutlierDetection, SklearnImputation): {'roc_auc_score': 0.4978835978835979, 'classification_report': {'F': {'precision': 0.8928571428571429, 'recall': 0.9920634920634921, 'f1-score': 0.9398496240601505, 'support': 126}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 15}, 'accuracy': 0.8865248226950354, 'macro avg': {'precision': 0.44642857142857145, 'recall': 0.49603174603174605, 'f1-score': 0.46992481203007525, 'support': 141}, 'weighted avg': {'precision': 0.7978723404255319, 'recall': 0.88652482269503

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: PyODPCAOutlierDetection, Outlier Detection Score: {'Precision': 0.7380506



Classifier for col: DGN reached 0.7110513532269258
Classifier for col: PRE6 reached 0.8516365949572057
Classifier for col: PRE7 reached 0.9163486005089059
Classifier for col: PRE8 reached 0.8365718251214435
Classifier for col: PRE9 reached 0.9163775156141568
Classifier for col: PRE10 reached 0.8820552856812398
Classifier for col: PRE11 reached 0.8479643765903309
Classifier for col: PRE14 reached 0.4903712699514226
Classifier for col: PRE17 reached 0.9201364792967845




Classifier for col: PRE19 reached nan
Classifier for col: PRE25 reached 0.9771859819569744
Classifier for col: PRE30 reached 0.8251214434420542
Classifier for col: PRE32 reached 0.9809449456396021
Regressor for col: PRE4/lower reached 0.1269102695601202
Regressor for col: PRE4/median reached 0.6823772698554273
Regressor for col: PRE4/upper reached 0.41863068950706855
Regressor for col: PRE5/lower reached -0.10251113234132803
Regressor for col: PRE5/median reached -0.010037076676702839
Regressor for col: PRE5/upper reached -0.5607995797659778
Regressor for col: AGE/lower reached -0.7822557962154016
Regressor for col: AGE/median reached 0.0870629729776567
Regressor for col: AGE/upper reached -1.0106774983787314
Imputed 70 values in column PRE10
Imputed 28 values in column PRE4
Imputed 109 values in column PRE5
Imputed 37 values in column AGE
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: PyODCBLOFOutlierDetection, Outlier Detection Score: {'Precision': 0.73123



Classifier for col: DGN reached 0.760467268100856
Classifier for col: PRE6 reached 0.8250925283368031
Classifier for col: PRE7 reached 0.8898912792042564
Classifier for col: PRE8 reached 0.8403018736988203
Classifier for col: PRE9 reached 0.9201653944020356
Classifier for col: PRE10 reached 0.8745373583159843
Classifier for col: PRE11 reached 0.8783541522091141
Classifier for col: PRE14 reached 0.4600971547536433
Classifier for col: PRE17 reached 0.9353747397640527




Classifier for col: PRE19 reached nan
Classifier for col: PRE25 reached 0.965706685172334
Classifier for col: PRE30 reached 0.8213335646541753
Classifier for col: PRE32 reached 0.9885785334258617
Regressor for col: PRE4/lower reached 0.25377592197169263
Regressor for col: PRE4/median reached 0.6791924026090712
Regressor for col: PRE4/upper reached 0.3150131374783168
Regressor for col: PRE5/lower reached -0.05829528350884483
Regressor for col: PRE5/median reached 0.05588491436605614
Regressor for col: PRE5/upper reached -0.10305300312380095
Regressor for col: AGE/lower reached -0.7744582247989046
Regressor for col: AGE/median reached 0.0358996983850306
Regressor for col: AGE/upper reached -1.1258164300303741
Imputed 70 values in column PRE10
Imputed 49 values in column PRE4
Imputed 113 values in column PRE5
Imputed 40 values in column AGE
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: PyODSOSOutlierDetection, Outlier Detection Score: {'Precision': 0.3964539



Classifier for col: DGN reached 0.7528336803145963
Classifier for col: PRE6 reached 0.8553377284293315
Classifier for col: PRE7 reached 0.912531806615776
Classifier for col: PRE8 reached 0.8631448068470969
Classifier for col: PRE9 reached 0.9163486005089059
Classifier for col: PRE10 reached 0.8934478371501272
Classifier for col: PRE11 reached 0.844176497802452
Classifier for col: PRE14 reached 0.45998149433263935
Classifier for col: PRE17 reached 0.9391626185519315




Classifier for col: PRE19 reached nan
Classifier for col: PRE25 reached 0.9886074485311127
Classifier for col: PRE30 reached 0.8326682859125607
Classifier for col: PRE32 reached 0.9923953273189914
Regressor for col: PRE4/lower reached 0.39267386699481077
Regressor for col: PRE4/median reached 0.6737426810020513
Regressor for col: PRE4/upper reached 0.3750861054465849
Regressor for col: PRE5/lower reached -0.05372395716849565
Regressor for col: PRE5/median reached -0.3451100300291481
Regressor for col: PRE5/upper reached -0.6404296774363347
Regressor for col: AGE/lower reached -0.8633851147777268
Regressor for col: AGE/median reached 0.10036824828212515
Regressor for col: AGE/upper reached -1.028681675332699
Imputed 70 values in column PRE10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    F
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: PyODSOSOutlierDetection, Outlier Detection Score: {'Precision': 0.3964539



Classifier for col: DGN reached 0.7261739532731899
Classifier for col: PRE6 reached 0.8669326856349757
Classifier for col: PRE7 reached 0.9163486005089059
Classifier for col: PRE8 reached 0.8288225769141799
Classifier for col: PRE9 reached 0.912589636826278
Classifier for col: PRE10 reached 0.8706916493176036
Classifier for col: PRE11 reached 0.8326972010178118
Classifier for col: PRE14 reached 0.4448588942863752
Classifier for col: PRE17 reached 0.9201653944020356




Classifier for col: PRE19 reached nan
Classifier for col: PRE25 reached 0.9847906546379829
Classifier for col: PRE30 reached 0.8213335646541753




Classifier for col: PRE32 reached nan
Regressor for col: PRE4/lower reached 0.20032068032278655
Regressor for col: PRE4/median reached 0.6949225390705449
Regressor for col: PRE4/upper reached 0.3109703666682844
Regressor for col: PRE5/lower reached -0.048989116960934886
Regressor for col: PRE5/median reached -0.036993050360661583
Regressor for col: PRE5/upper reached -1.5283754224164219
Regressor for col: AGE/lower reached -1.1585820095977852
Regressor for col: AGE/median reached 0.15052872422495278
Regressor for col: AGE/upper reached -0.8578910662638739
Column DGN contained 0 nans before, now 0
Column PRE6 contained 0 nans before, now 0
Column PRE7 contained 0 nans before, now 0
Column PRE8 contained 0 nans before, now 0
Column PRE9 contained 0 nans before, now 0
Column PRE10 contained 70 nans before, now 70
Column PRE11 contained 0 nans before, now 0
Column PRE14 contained 0 nans before, now 0
Column PRE17 contained 0 nans before, now 0
Column PRE19 contained 0 nans before, now 0
Co

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: SklearnOutlierDetection, Outlier Detection Score: {'Precision': 0.7296005

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    T
51     T
246    F
242    T
79     T
      ..
53     T
164    T
412    F
245    T
46     T
Name: PRE10, Length: 141, dtype: object
178    OC12
51     OC11
246    OC12
242    OC14
79     OC12
       ... 
53     OC12
164    OC14
412    OC11
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    OC11
51     OC13
246    OC12
242    OC14
79     OC12
       ... 
53     OC11
164    OC14
412    OC13
245    OC12
46     OC12
Name: PRE14, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object
178    F
51     F
246    F
242    F
79     F
      ..
53     F
164    F
412    F
245    F
46     F
Name: PRE17, Length: 141, dtype: object

Outlier detection method: SklearnOutlierDetection, Outlier Detection Score: {'Precision': 0.7345792

## Model Evaluation

### With learner and param_grid

In [256]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# preprocessing pipeline for numerical columns
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns)
])

## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

In [257]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
model = grid_search.fit(train_data, train_labels)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [258]:
y_pred = model.predict(test_data)

  and should_run_async(code)


In [259]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(model.predict_proba(test_data))[1])

  and should_run_async(code)


0.5968965517241378

In [260]:
model.predict_proba(test_data)

  and should_run_async(code)


array([[2.48923873e-01, 7.51076127e-01],
       [1.19466022e-01, 8.80533978e-01],
       [4.76675761e-01, 5.23324239e-01],
       [7.52237373e-01, 2.47762627e-01],
       [3.09806104e-02, 9.69019390e-01],
       [3.95795689e-01, 6.04204311e-01],
       [6.30358337e-02, 9.36964166e-01],
       [6.53316975e-01, 3.46683025e-01],
       [1.11722907e-01, 8.88277093e-01],
       [2.10795831e-01, 7.89204169e-01],
       [2.76059381e-01, 7.23940619e-01],
       [5.10868216e-02, 9.48913178e-01],
       [1.53031464e-02, 9.84696854e-01],
       [6.78376864e-01, 3.21623136e-01],
       [1.18168338e-01, 8.81831662e-01],
       [7.34872186e-01, 2.65127814e-01],
       [9.17923570e-05, 9.99908208e-01],
       [2.21709604e-01, 7.78290396e-01],
       [2.36037544e-01, 7.63962456e-01],
       [7.19554811e-03, 9.92804452e-01],
       [2.84619412e-01, 7.15380588e-01],
       [5.03055384e-02, 9.49694462e-01],
       [1.32424108e-01, 8.67575892e-01],
       [9.91516577e-03, 9.90084834e-01],
       [1.004364

In [261]:
from sklearn.metrics import classification_report

classification_report(test_labels, y_pred, output_dict=True)

  and should_run_async(code)


{'1': {'precision': 0.35294117647058826,
  'recall': 0.24,
  'f1-score': 0.28571428571428564,
  'support': 25},
 '2': {'precision': 0.8467741935483871,
  'recall': 0.9051724137931034,
  'f1-score': 0.875,
  'support': 116},
 'accuracy': 0.7872340425531915,
 'macro avg': {'precision': 0.5998576850094877,
  'recall': 0.5725862068965517,
  'f1-score': 0.5803571428571428,
  'support': 141},
 'weighted avg': {'precision': 0.7592151479672171,
  'recall': 0.7872340425531915,
  'f1-score': 0.770516717325228,
  'support': 141}}

### With Autogluon

In [262]:
from autogluon.tabular import TabularPredictor

## training
train_data["class"] = train_labels

label = "class"

## folder to save trained models
#save_path = '/home/rupali/Documents/Master Thesis/jenga/autogluon_models/'

predictor = TabularPredictor(label=label).fit(train_data)

  and should_run_async(code)
Level 25:autogluon.core.utils.utils:No path specified. Models will be saved in: "AutogluonModels/ag-20210326_001410/"
INFO:autogluon.tabular.learner.default_learner:Beginning AutoGluon training ...
INFO:autogluon.tabular.learner.default_learner:AutoGluon will save models to "AutogluonModels/ag-20210326_001410/"
INFO:autogluon.tabular.learner.default_learner:AutoGluon Version:  0.1.0
INFO:autogluon.tabular.learner.default_learner:Train Data Rows:    329
INFO:autogluon.tabular.learner.default_learner:Train Data Columns: 16
INFO:autogluon.tabular.learner.default_learner:Preprocessing data ...
Level 25:autogluon.core.utils.utils:AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
INFO:autogluon.core.utils.utils:	2 unique label values:  ['2', '1']
Level 25:autogluon.core.utils.utils:	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_t

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: NeuralNetFastAI ...


Epoch 21: early stopping
█

INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	6.1s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.11s	 = Validation runtime
INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: LightGBMLarge ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.7s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.02s	 = Validation runtime


█

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: WeightedEnsemble_L2 ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.39s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.0s	 = Validation runtime
INFO:autogluon.tabular.learner.default_learner:AutoGluon training complete, total runtime = 13.5s ...
INFO:root:TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210326_001410/")


In [263]:
## test on original data
y_pred_test = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_test, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8226950354609929,
    "accuracy_score": 0.8226950354609929,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8226950354609928
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "1": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 25
    },
    "2": {
        "precision": 0.8226950354609929,
        "recall": 1.0,
        "f1-score": 0.9027237354085603,
        "support": 116
    }

In [264]:
## test on corrupted data
y_pred_corrupted = predictor.predict(df_corrupted)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_corrupted, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8226950354609929,
    "accuracy_score": 0.8226950354609929,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8226950354609928
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "1": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 25
    },
    "2": {
        "precision": 0.8226950354609929,
        "recall": 1.0,
        "f1-score": 0.9027237354085603,
        "support": 116
    }

In [265]:
## test on cleaned data
y_pred_cleaned = predictor.predict(df_cleaned)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8226950354609929,
    "accuracy_score": 0.8226950354609929,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8226950354609928
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "1": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 25
    },
    "2": {
        "precision": 0.8226950354609929,
        "recall": 1.0,
        "f1-score": 0.9027237354085603,
        "support": 116
    }

In [266]:
## We can evaluate the performance of each individual trained model on our (labeled) test data
test_data["class"] = test_labels
predictor.leaderboard(test_data, silent=True)

  and should_run_async(code)


█

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.822695,0.863636,0.015985,0.019807,0.565477,0.015985,0.019807,0.565477,1,True,8
1,LightGBMLarge,0.822695,0.863636,0.019482,0.017002,0.703127,0.019482,0.017002,0.703127,1,True,12
2,LightGBM,0.822695,0.863636,0.020127,0.017831,0.847258,0.020127,0.017831,0.847258,1,True,7
3,CatBoost,0.822695,0.863636,0.021599,0.019487,0.448039,0.021599,0.019487,0.448039,1,True,9
4,WeightedEnsemble_L2,0.822695,0.863636,0.025242,0.023716,0.837128,0.003644,0.004229,0.389089,2,True,13
5,NeuralNetFastAI,0.822695,0.863636,0.164247,0.105555,6.09991,0.164247,0.105555,6.09991,1,True,11
6,KNeighborsDist,0.815603,0.80303,0.007434,0.009465,0.004376,0.007434,0.009465,0.004376,1,True,6
7,KNeighborsUnif,0.815603,0.863636,0.007927,0.006374,0.005564,0.007927,0.006374,0.005564,1,True,5
8,RandomForestGini,0.808511,0.848485,0.105643,0.096689,0.70665,0.105643,0.096689,0.70665,1,True,1
9,XGBoost,0.794326,0.863636,0.02487,0.009048,0.252273,0.02487,0.009048,0.252273,1,True,10


In [267]:
perf

  and should_run_async(code)


OrderedDict([('accuracy', 0.8226950354609929),
             ('accuracy_score', 0.8226950354609929),
             ('balanced_accuracy_score', 0.5),
             ('matthews_corrcoef', 0.0),
             ('f1_score', 0.8226950354609928),
             ('confusion_matrix',
                 1    2
              1  0   25
              2  0  116),
             ('classification_report',
              {'1': {'precision': 0.0,
                'recall': 0.0,
                'f1-score': 0.0,
                'support': 25},
               '2': {'precision': 0.8226950354609929,
                'recall': 1.0,
                'f1-score': 0.9027237354085603,
                'support': 116},
               'accuracy': 0.8226950354609929,
               'macro avg': {'precision': 0.41134751773049644,
                'recall': 0.5,
                'f1-score': 0.45136186770428016,
                'support': 141},
               'weighted avg': {'precision': 0.6768271213721644,
                'recall': 0.8

In [268]:
list(perf.items())[6][1]

  and should_run_async(code)


{'1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25},
 '2': {'precision': 0.8226950354609929,
  'recall': 1.0,
  'f1-score': 0.9027237354085603,
  'support': 116},
 'accuracy': 0.8226950354609929,
 'macro avg': {'precision': 0.41134751773049644,
  'recall': 0.5,
  'f1-score': 0.45136186770428016,
  'support': 141},
 'weighted avg': {'precision': 0.6768271213721644,
  'recall': 0.8226950354609929,
  'f1-score': 0.7426663355134255,
  'support': 141}}

In [269]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(predictor.predict_proba(test_data)).to_numpy()[1])

0.5432758620689655

In [270]:
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=False)
perf

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929


0.8226950354609929

## Undefined Model: Default setting: whole process

In [10]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise, CategoricalShift]
fraction = 0.5
num_repetitions = 5

  and should_run_async(code)


In [11]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

No path specified. Models will be saved in: "AutogluonModels/ag-20210324_221101/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210324_221101/"
AutoGluon Version:  0.1.0
Train Data Rows:    606
Train Data Columns: 100
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3047.73 MB
	Train Data (Original)  Memory Usage: 0.48 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of

█

	0.6148	 = Validation accuracy score
	7.82s	 = Training runtime
	0.2s	 = Validation runtime
Fitting model: LightGBMLarge ...
	0.6393	 = Validation accuracy score
	5.52s	 = Training runtime
	0.01s	 = Validation runtime


█

Fitting model: WeightedEnsemble_L2 ...
	0.6885	 = Validation accuracy score
	0.5s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 30.87s ...
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210324_221101/")



Generating corrupted training data on 606 rows... 

	perturbation: MissingValues: {'column': 'X98', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'X6', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: GaussianNoise: {'column': 'X66', 'fraction': 0.5, 'sampling': 'MAR'}


IndexError: Cannot choose from an empty sequence

In [12]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, PyODPCAOutlierDetection, PyODCBLOFOutlierDetection, PyODSOSOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    (PyODPCAOutlierDetection, MeanModeImputation),
    (PyODCBLOFOutlierDetection, MeanModeImputation),
    (PyODSOSOutlierDetection, MeanModeImputation)
]

  and should_run_async(code)


In [13]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)

Evaluation: accuracy on test data: 0.5693069306930693
Evaluations on test data:
{
    "accuracy": 0.5693069306930693,
    "accuracy_score": 0.5693069306930693,
    "balanced_accuracy_score": 0.5685268951986484,
    "matthews_corrcoef": 0.13727616802960593,
    "f1_score": 0.5693069306930693
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5598591549295775,
        "recall": 0.5389830508474577,
        "f1-score": 0.5492227979274611,
        "support": 295
    },
    "1": {
        "precision": 0.577639751552795,
        "recall": 0.5980707395498392,
        "f1-score": 0.5876777251184835,
        "support": 311
    },
    "accuracy": 0.5693069306930693,
    "macro avg": {
        "precision": 0.5687494532411863,
        "recall": 0.5685268951986484,
        "f1-score": 0.5684502615229723,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5689841805893475,
        "recall": 0.5693069306930693,
        "f1-score": 0.56895791732747


Applying cleaners... 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Evaluation: accuracy on test data: 0.5693069306930693
Evaluations on test data:
{
    "accuracy": 0.5693069306930693,
    "accuracy_score": 0.5693069306930693,
    "balanced_accuracy_score": 0.5685268951986484,
    "matthews_corrcoef": 0.13727616802960593,
    "f1_score": 0.5693069306930693
}


PPP score no cleaning: {'roc_auc_score': 0.5804458008610823, 'classification_report': {'0': {'precision': 0.5598591549295775, 'recall': 0.5389830508474577, 'f1-score': 0.5492227979274611, 'support': 295}, '1': {'precision': 0.577639751552795, 'recall': 0.5980707395498392, 'f1-score': 0.5876777251184835, 'support': 311}, 'accuracy': 0.5693069306930693, 'macro avg': {'precision': 0.5687494532411863, 'recall': 0.5685268951986484, 'f1-score': 0.5684502615229723, 'support': 606}, 'weighted avg': {'precision': 0.5689841805893475, 'recall': 0.5693069306930693, 'f1-score': 0.5689579173274742, 'support': 606}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.25, 'Recall': 0.5, 'F1-score': 0.3333333333333333, 'Accuracy': 0.5}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 82632108746417.0}


Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5598591549295775,
        "recall": 0.5389830508474577,
        "f1-score": 0.5492227979274611,
        "support": 295
    },
    "1": {
        "precision": 0.577639751552795,
        "recall": 0.5980707395498392,
        "f1-score": 0.5876777251184835,
        "support": 311
    },
    "accuracy": 0.5693069306930693,
    "macro avg": {
        "precision": 0.5687494532411863,
        "recall": 0.5685268951986484,
        "f1-score": 0.5684502615229723,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5689841805893475,
        "recall": 0.5693069306930693,
        "f1-score": 0.5689579173274742,
        "support": 606
    }
}


Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5804458008610823, 'classification_report': {'0': {'precision': 0.5598591549295775, 'recall': 0.5389830508474577, 'f1-score': 0.5492227979274611, 'support': 295}, '1': {'precision': 0.577639751552795, 'recall': 0.5980707395498392, 'f1-score': 0.5876777251184835, 'support': 311}, 'accuracy': 0.5693069306930693, 'macro avg': {'precision': 0.5687494532411863, 'recall': 0.5685268951986484, 'f1-score': 0.5684502615229723, 'support': 606}, 'weighted avg': {'precision': 0.5689841805893475, 'recall': 0.5693069306930693, 'f1-score': 0.5689579173274742, 'support': 606}}}


Evaluation: accuracy on test data: 0.5610561056105611
Evaluations on test data:
{
    "accuracy": 0.5610561056105611,
    "accuracy_score": 0.5610561056105611,
    "balanced_accuracy_score": 0.5613602921140116,
    "matthews_corrcoef": 0.12270186173615062,
    "f1_score": 0.5610561056105611
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5469255663430421,
        "recall": 0.5728813559322034,
        "f1-score": 0.5596026490066225,
        "support": 295
    },
    "1": {
        "precision": 0.5757575757575758,
        "recall": 0.5498392282958199,
        "f1-score": 0.5625,
        "support": 311
    },
    "accuracy": 0.5610561056105611,
    "macro avg": {
        "precision": 0.5613415710503089,
        "recall": 0.5613602921140116,
        "f1-score": 0.5610513245033113,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5617221916366394,
        "recall": 0.5610561056105611,
        "f1-score": 0.5610895733613096,
       


Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.8408569300146812, 'Recall': 0.8278327832783279, 'F1-score': 0.8262407435124338, 'Accuracy': 0.8278327832783279}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 384872283.31639314}
Cleaner: (PyODKNNOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.573137500681236, 'classification_report': {'0': {'precision': 0.5469255663430421, 'recall': 0.5728813559322034, 'f1-score': 0.5596026490066225, 'support': 295}, '1': {'precision': 0.5757575757575758, 'recall': 0.5498392282958199, 'f1-score': 0.5625, 'support': 311}, 'accuracy': 0.5610561056105611, 'macro avg': {'precision': 0.5613415710503089, 'recall': 0.5613602921140116, 'f1-score': 0.5610513245033113, 'support': 606}, 'weighted avg': {'precision': 0.5617221916366394, 'recall': 0.5610561056105611, 'f1-score': 0.5610895733613096, 'support': 606}

Evaluation: accuracy on test data: 0.5511551155115512
Evaluations on test data:
{
    "accuracy": 0.5511551155115512,
    "accuracy_score": 0.5511551155115512,
    "balanced_accuracy_score": 0.5538939451741239,
    "matthews_corrcoef": 0.11007944236596345,
    "f1_score": 0.5511551155115512
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5315068493150685,
        "recall": 0.6576271186440678,
        "f1-score": 0.5878787878787878,
        "support": 295
    },
    "1": {
        "precision": 0.5809128630705395,
        "recall": 0.45016077170418006,
        "f1-score": 0.5072463768115942,
        "support": 311
    },
    "accuracy": 0.5511551155115512,
    "macro avg": {
        "precision": 0.5562098561928039,
        "recall": 0.5538939451741239,
        "f1-score": 0.5475625823451911,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5568620807968366,
        "recall": 0.5511551155115512,
        "f1-score": 0.546498128073


Outlier detection method: PyODIsolationForestOutlierDetection, Outlier Detection Score: {'Precision': 0.7322447494913131, 'Recall': 0.7084708470847084, 'F1-score': 0.7039280901824226, 'Accuracy': 0.7084708470847084}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 656300362.0999852}
Cleaner: (PyODIsolationForestOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5698893672679711, 'classification_report': {'0': {'precision': 0.5315068493150685, 'recall': 0.6576271186440678, 'f1-score': 0.5878787878787878, 'support': 295}, '1': {'precision': 0.5809128630705395, 'recall': 0.45016077170418006, 'f1-score': 0.5072463768115942, 'support': 311}, 'accuracy': 0.5511551155115512, 'macro avg': {'precision': 0.5562098561928039, 'recall': 0.5538939451741239, 'f1-score': 0.5475625823451911, 'support': 606}, 'weighted avg': {'precision': 0.5568620807968366, 'recall': 0.5511551155115512, 'f1-score'

Evaluation: accuracy on test data: 0.5412541254125413
Evaluations on test data:
{
    "accuracy": 0.5412541254125413,
    "accuracy_score": 0.5412541254125413,
    "balanced_accuracy_score": 0.5438116518611369,
    "matthews_corrcoef": 0.08924300449788912,
    "f1_score": 0.5412541254125413
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5235457063711911,
        "recall": 0.6406779661016949,
        "f1-score": 0.5762195121951219,
        "support": 295
    },
    "1": {
        "precision": 0.5673469387755102,
        "recall": 0.44694533762057875,
        "f1-score": 0.5,
        "support": 311
    },
    "accuracy": 0.5412541254125413,
    "macro avg": {
        "precision": 0.5454463225733507,
        "recall": 0.5438116518611369,
        "f1-score": 0.538109756097561,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5460245566644968,
        "recall": 0.5412541254125413,
        "f1-score": 0.5371035579167673,
        "s


Outlier detection method: PyODPCAOutlierDetection, Outlier Detection Score: {'Precision': 0.70036142691356, 'Recall': 0.6903190319031903, 'F1-score': 0.6869518678156635, 'Accuracy': 0.6903190319031903}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 402189918.1321284}
Cleaner: (PyODPCAOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5566134394244917, 'classification_report': {'0': {'precision': 0.5235457063711911, 'recall': 0.6406779661016949, 'f1-score': 0.5762195121951219, 'support': 295}, '1': {'precision': 0.5673469387755102, 'recall': 0.44694533762057875, 'f1-score': 0.5, 'support': 311}, 'accuracy': 0.5412541254125413, 'macro avg': {'precision': 0.5454463225733507, 'recall': 0.5438116518611369, 'f1-score': 0.538109756097561, 'support': 606}, 'weighted avg': {'precision': 0.5460245566644968, 'recall': 0.5412541254125413, 'f1-score': 0.5371035579167673, 'support': 606}}}






Evaluation: accuracy on test data: 0.5561056105610561
Evaluations on test data:
{
    "accuracy": 0.5561056105610561,
    "accuracy_score": 0.5561056105610561,
    "balanced_accuracy_score": 0.5587170962995258,
    "matthews_corrcoef": 0.11968413084772356,
    "f1_score": 0.5561056105610561
}


Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5359116022099447,
        "recall": 0.6576271186440678,
        "f1-score": 0.5905631659056316,
        "support": 295
    },
    "1": {
        "precision": 0.5860655737704918,
        "recall": 0.45980707395498394,
        "f1-score": 0.5153153153153154,
        "support": 311
    },
    "accuracy": 0.5561056105610561,
    "macro avg": {
        "precision": 0.5609885879902183,
        "recall": 0.5587170962995258,
        "f1-score": 0.5529392406104735,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.561650686624681,
        "recall": 0.5561056105610561,
        "f1-score": 0.5519458696455848,
        "support": 606
    }
}



Outlier detection method: PyODCBLOFOutlierDetection, Outlier Detection Score: {'Precision': 0.8187954544245285, 'Recall': 0.7926292629262927, 'F1-score': 0.7882947627871975, 'Accuracy': 0.7926292629262927}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 389087104.71174496}
Cleaner: (PyODCBLOFOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5764292332007194, 'classification_report': {'0': {'precision': 0.5359116022099447, 'recall': 0.6576271186440678, 'f1-score': 0.5905631659056316, 'support': 295}, '1': {'precision': 0.5860655737704918, 'recall': 0.45980707395498394, 'f1-score': 0.5153153153153154, 'support': 311}, 'accuracy': 0.5561056105610561, 'macro avg': {'precision': 0.5609885879902183, 'recall': 0.5587170962995258, 'f1-score': 0.5529392406104735, 'support': 606}, 'weighted avg': {'precision': 0.561650686624681, 'recall': 0.5561056105610561, 'f1-score': 0.5519458696455848

Evaluation: accuracy on test data: 0.5478547854785478
Evaluations on test data:
{
    "accuracy": 0.5478547854785478,
    "accuracy_score": 0.5478547854785478,
    "balanced_accuracy_score": 0.5459698076189439,
    "matthews_corrcoef": 0.09289220351441713,
    "f1_score": 0.5478547854785478
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5405405405405406,
        "recall": 0.4745762711864407,
        "f1-score": 0.5054151624548736,
        "support": 295
    },
    "1": {
        "precision": 0.553314121037464,
        "recall": 0.617363344051447,
        "f1-score": 0.5835866261398177,
        "support": 311
    },
    "accuracy": 0.5478547854785478,
    "macro avg": {
        "precision": 0.5469273307890024,
        "recall": 0.5459698076189439,
        "f1-score": 0.5445008942973457,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5470959589143743,
        "recall": 0.5478547854785478,
        "f1-score": 0.545532860814638


Outlier detection method: PyODSOSOutlierDetection, Outlier Detection Score: {'Precision': 0.6360021977299181, 'Recall': 0.6259625962596259, 'F1-score': 0.6050356549950111, 'Accuracy': 0.6259625962596259}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 53433142764388.34}
Cleaner: (PyODSOSOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5539484440568968, 'classification_report': {'0': {'precision': 0.5405405405405406, 'recall': 0.4745762711864407, 'f1-score': 0.5054151624548736, 'support': 295}, '1': {'precision': 0.553314121037464, 'recall': 0.617363344051447, 'f1-score': 0.5835866261398177, 'support': 311}, 'accuracy': 0.5478547854785478, 'macro avg': {'precision': 0.5469273307890024, 'recall': 0.5459698076189439, 'f1-score': 0.5445008942973457, 'support': 606}, 'weighted avg': {'precision': 0.5470959589143743, 'recall': 0.5478547854785478, 'f1-score': 0.5455328608146387, 'supp

## Try Cleaners

In [24]:
from pyod.models.knn import KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

  and should_run_async(code)


In [25]:
## featurizers
def build_featurizers(columns):
    categorical_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
        ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    numeric_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value=0)),
    ])

    cat_cols = [c for c in categorical_columns if c in columns]
    num_cols = [c for c in numerical_columns if c in columns]
    
    feature_transformation = ColumnTransformer(transformers=[
        ('categorical_features', categorical_preprocessing, cat_cols),
        ('scaled_numeric', numeric_preprocessing, num_cols)
    ])

    return feature_transformation

### Pyod Single Column - features

In [26]:
df_outliers = df_corrupted.copy(deep=True)
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
364,3,2320.00,46.773306,3,1,1,1,2,2,1,1,1,1,2,1,56.0
357,6,2000.00,,1,1,1,1,1,1,2,1,1,1,2,1,70.0
333,7,2200.00,,1,1,1,1,1,1,1,1,1,1,1,1,71.0
111,3,4.00,,2,1,1,1,2,1,2,2,1,1,2,1,58.0
453,3,2840.00,,2,1,2,1,2,2,1,1,1,1,2,1,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,3,4.36,,2,1,1,1,1,1,1,1,1,1,2,1,47.0
207,6,2600.00,,2,1,1,1,2,1,1,1,1,1,2,1,66.0
446,7,5.20,,1,1,1,1,1,1,2,1,1,1,1,1,49.0
268,3,2520.00,-13.627679,3,1,1,2,2,2,2,1,1,1,2,1,74.0


In [27]:
predictors = {}

for col in categorical_columns + numerical_columns:
    predictors[col] = Pipeline(
        [('features', build_featurizers([col])),
         ('outlier_detector', KNN())
        ])
    
len(predictors)

  and should_run_async(code)


16

In [28]:
for col in categorical_columns + numerical_columns:
    predictors[col].fit(df_outliers)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [29]:
for col in categorical_columns + numerical_columns:
    df_outliers[col + "_outlier"] = predictors[col].predict(df_corrupted)

  and should_run_async(code)


In [30]:
df_outliers

  and should_run_async(code)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91_outlier,X92_outlier,X93_outlier,X94_outlier,X95_outlier,X96_outlier,X97_outlier,X98_outlier,X99_outlier,X100_outlier
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,0,0,0,0,0,0,0,0,0,0
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,0,0,0,0,0,0,0,0,0,0
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,0,0,0,0,0,0,0,0,0,0
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,0,0,0,0,0,0,0,0,0,0
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,0,0,0,0,0,0,0,0,0,0
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,0,0,1,0,0,0,0,0,0,1
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,0,0,0,0,0,0,0,0,0,0
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,0,0,0,0,0,0,0,0,0,0


### Pyod Multiple Columns - features

In [29]:
df_outliers = df_corrupted.copy(deep=True)

  and should_run_async(code)


In [30]:
predictors = Pipeline(
    [('features', build_featurizers(numerical_columns + categorical_columns)),
     ('outlier_detector', KNN())
    ])
predictors.fit(df_outliers)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('categorical_features',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value='__NA__',
                                                                                 strategy='constant')),
                                                                  ('one_hot_encode',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['V1', 'V4', 'V5', 'V6', 'V7',
                                                   'V8', 'V9', 'V10', 'V11',
                                                   'V12', 'V13', 'V14',
                                                   'V15']),
                                                 ('scaled_numeric',
                                                  Pipel

In [31]:
outliers = predictors.predict(df_outliers)
outliers

  and should_run_async(code)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

### Categorical from train, numerical from pyod

In [18]:
## single column based

In [35]:
def cat_out_detect(df_train, df_corrupted):
    df_outliers = df_corrupted[categorical_columns].copy()
    
    for col in df_train.columns:
        if col in categorical_columns:
            vals_train_unique = df_train[col].unique()
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            
            for i in df_corrupted[col].index:
                if df_corrupted.loc[i, col] in vals_train_unique:
                    df_outliers.loc[i, col + "_outlier"] = 0
                else:
                    df_outliers.loc[i, col + "_outlier"] = 1
                    
    return df_outliers

  and should_run_async(code)


In [36]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    df_outliers = df_corrupted[numerical_columns].copy()
    
    for col in df_train.columns:
        if col in numerical_columns:
            ## find indices of records with NaNs in col in df_corrupted
            nan_idx = df_corrupted[df_corrupted[col].isnull()].index
            non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
            
            ## pd series -> np column, needs to be 2D array
            ## taking only the non-NaN records in the corrupted data
            col_tr_arr = np.array(df_train[col]).reshape(-1,1)
            col_corr_arr = np.array(df_corrupted.loc[non_nan_idx][col]).reshape(-1,1)
            
            ## fit the dataset to the model
            model = pyod_model
            model.fit(col_tr_arr)
            
            ## predict raw anomaly score
            scores_pred = model.decision_function(col_corr_arr) * -1
            
            ## prediction of a datapoint category outlier or inlier
            y_pred = model.predict(col_corr_arr)
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            df_outliers.loc[non_nan_idx, col + "_outlier"] = y_pred ## 0: inlier, 1: outlier
            df_outliers.loc[nan_idx, col + "_outlier"] = 1
            
    return df_outliers

In [37]:
from pyod.models.knn import KNN

pyod_model = KNN()
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

Unnamed: 0,Age,Trestbps,Cholesterol,Max_heart_rate,Oldpeak,Age_outlier,Trestbps_outlier,Cholesterol_outlier,Max_heart_rate_outlier,Oldpeak_outlier,...,Number_of_vessels_colored,Thal,Sex_outlier,Chest_pain_type_outlier,Fasting_blood_sugar_&lt;_120_outlier,Resting_ecg_outlier,Exercise_induced_angina_outlier,Slope_outlier,Number_of_vessels_colored_outlier,Thal_outlier
125,54.0,140.0,239.000000,160.0,1.2,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
112,55.0,180.0,327.000000,117.0,3.4,0,1,0,0,0,...,0,2,0,0,0,0,0,0,0,0
103,57.0,140.0,192.000000,148.0,,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
158,59.0,135.0,234.000000,161.0,0.5,0,0,0,0,0,...,0,3,0,1,0,0,0,0,0,0
170,43.0,120.0,177.000000,120.0,,0,0,0,0,1,...,0,3,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,670.0,120.0,540.706938,71.0,1.0,1,0,1,1,0,...,0,2,0,1,0,0,0,0,0,0
213,52.0,128.0,205.000000,184.0,0,0,0,0,0,0,...,0,2,0,1,0,0,0,0,0,0
198,420.0,102.0,417.727651,122.0,,1,0,1,0,1,...,0,2,0,1,0,0,0,0,0,0
199,690.0,140.0,254.000000,146.0,,1,0,0,0,1,...,3,3,0,1,0,0,0,0,0,0


In [38]:
from pyod.models.pca import PCA

pyod_model = PCA() # n_components = min(n_samples, n_features) default  # n_selected_components = None
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,Age,Trestbps,Cholesterol,Max_heart_rate,Oldpeak,Age_outlier,Trestbps_outlier,Cholesterol_outlier,Max_heart_rate_outlier,Oldpeak_outlier,...,Number_of_vessels_colored,Thal,Sex_outlier,Chest_pain_type_outlier,Fasting_blood_sugar_&lt;_120_outlier,Resting_ecg_outlier,Exercise_induced_angina_outlier,Slope_outlier,Number_of_vessels_colored_outlier,Thal_outlier
125,54.0,140.0,239.000000,160.0,1.2,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
112,55.0,180.0,327.000000,117.0,3.4,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
103,57.0,140.0,192.000000,148.0,,0,0,1,0,1,...,0,1,0,1,0,0,0,0,0,0
158,59.0,135.0,234.000000,161.0,0.5,0,0,0,0,0,...,0,3,0,1,0,0,0,0,0,0
170,43.0,120.0,177.000000,120.0,,0,0,1,0,1,...,0,3,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,670.0,120.0,540.706938,71.0,1.0,1,0,1,1,0,...,0,2,0,1,0,0,0,0,0,0
213,52.0,128.0,205.000000,184.0,0,0,0,0,1,0,...,0,2,0,1,0,0,0,0,0,0
198,420.0,102.0,417.727651,122.0,,1,1,1,0,1,...,0,2,0,1,0,0,0,0,0,0
199,690.0,140.0,254.000000,146.0,,1,0,0,0,1,...,3,3,0,1,0,0,0,0,0,0


In [39]:
from pyod.models.cblof import CBLOF

pyod_model = CBLOF() # n_clusters = 8 default
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,Age,Trestbps,Cholesterol,Max_heart_rate,Oldpeak,Age_outlier,Trestbps_outlier,Cholesterol_outlier,Max_heart_rate_outlier,Oldpeak_outlier,...,Number_of_vessels_colored,Thal,Sex_outlier,Chest_pain_type_outlier,Fasting_blood_sugar_&lt;_120_outlier,Resting_ecg_outlier,Exercise_induced_angina_outlier,Slope_outlier,Number_of_vessels_colored_outlier,Thal_outlier
125,54.0,140.0,239.000000,160.0,1.2,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
112,55.0,180.0,327.000000,117.0,3.4,0,1,1,0,1,...,0,2,0,0,0,0,0,0,0,0
103,57.0,140.0,192.000000,148.0,,0,0,0,0,1,...,0,1,0,1,0,0,0,0,0,0
158,59.0,135.0,234.000000,161.0,0.5,0,0,0,0,0,...,0,3,0,1,0,0,0,0,0,0
170,43.0,120.0,177.000000,120.0,,0,0,0,1,1,...,0,3,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,670.0,120.0,540.706938,71.0,1.0,1,0,1,1,0,...,0,2,0,1,0,0,0,0,0,0
213,52.0,128.0,205.000000,184.0,0,0,0,0,0,0,...,0,2,0,1,0,0,0,0,0,0
198,420.0,102.0,417.727651,122.0,,1,0,1,0,1,...,0,2,0,1,0,0,0,0,0,0
199,690.0,140.0,254.000000,146.0,,1,0,0,0,1,...,3,3,0,1,0,0,0,0,0,0


In [40]:
from pyod.models.sos import SOS

pyod_model = SOS()
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,Age,Trestbps,Cholesterol,Max_heart_rate,Oldpeak,Age_outlier,Trestbps_outlier,Cholesterol_outlier,Max_heart_rate_outlier,Oldpeak_outlier,...,Number_of_vessels_colored,Thal,Sex_outlier,Chest_pain_type_outlier,Fasting_blood_sugar_&lt;_120_outlier,Resting_ecg_outlier,Exercise_induced_angina_outlier,Slope_outlier,Number_of_vessels_colored_outlier,Thal_outlier
125,54.0,140.0,239.000000,160.0,1.2,0,0,1,0,0,...,0,2,0,0,0,0,0,0,0,0
112,55.0,180.0,327.000000,117.0,3.4,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
103,57.0,140.0,192.000000,148.0,,0,0,1,0,1,...,0,1,0,1,0,0,0,0,0,0
158,59.0,135.0,234.000000,161.0,0.5,0,0,0,0,0,...,0,3,0,1,0,0,0,0,0,0
170,43.0,120.0,177.000000,120.0,,0,0,0,0,1,...,0,3,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,670.0,120.0,540.706938,71.0,1.0,0,0,1,0,0,...,0,2,0,1,0,0,0,0,0,0
213,52.0,128.0,205.000000,184.0,0,0,0,0,0,0,...,0,2,0,1,0,0,0,0,0,0
198,420.0,102.0,417.727651,122.0,,0,0,0,0,1,...,0,2,0,1,0,0,0,0,0,0
199,690.0,140.0,254.000000,146.0,,0,0,0,0,1,...,3,3,0,1,0,0,0,0,0,0


### Sklearn

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve

  and should_run_async(code)


In [42]:
df_outliers = df_corrupted.copy(deep=True)

In [43]:
categorical_precision_threshold = 0.85
numeric_error_percentile = 0.9

predictors = {}
predictable_cols = []

In [44]:
categorical_preprocessing = Pipeline([
    ('mark-missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessing = Pipeline([
    ('mark_missing', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler())
])

In [207]:
## cat to string

#### Categoric

In [10]:
col = "V14"

In [11]:
len(train_data[col].unique())

2

In [12]:
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', categorical_preprocessing, list(set(categorical_columns) - {col})),
    ('numeric_features', numeric_preprocessing, numerical_columns)
])

In [13]:
param_grid = {
    'learner__n_estimators': [10, 50, 100, 200],
}

In [14]:
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', GradientBoostingClassifier())
])

In [15]:
search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
predictors[col] = search.fit(train_data, train_data[col])

print(f'Classifier for col: {col} reached {search.best_score_}')

Classifier for col: V14 reached 0.8206762749445676


In [20]:
train_data

  and should_run_async(code)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,class
454,3,3.24,2.76,1,1,1,1,1,1,1,1,1,1,2,1,70.0,2
459,3,4.28,3.16,1,1,1,1,2,1,2,1,1,1,1,1,66.0,2
283,2,2.32,1.68,2,1,2,1,2,1,2,1,1,1,2,1,64.0,2
276,3,3.04,2.04,2,1,1,1,2,1,2,1,1,1,2,1,67.0,2
86,3,4.90,4.19,1,1,1,2,2,1,2,1,1,1,1,1,52.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,3,4.52,3.32,1,1,1,1,2,1,2,1,1,1,2,1,58.0,2
270,3,2.52,1.72,2,1,1,1,2,2,2,1,1,1,2,1,71.0,1
53,4,3.76,2.52,2,1,1,1,2,1,2,1,1,1,2,1,75.0,2
152,3,2.72,2.04,2,2,1,1,1,1,2,1,1,1,1,1,76.0,1


In [18]:
test_data

  and should_run_async(code)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
22,3,2.36,1.68,1,1,1,1,1,1,2,1,1,1,2,1,62.0
134,3,4.84,3.48,2,1,1,1,2,1,2,1,1,1,2,1,56.0
2,3,2.76,2.08,2,1,1,1,2,1,1,1,1,1,2,1,59.0
95,3,4.56,3.60,2,1,1,1,2,1,1,1,1,1,2,1,54.0
42,3,3.20,2.82,2,1,1,1,2,1,2,1,1,1,2,1,68.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,3,3.68,3.04,1,1,1,1,1,1,1,1,1,1,1,1,54.0
428,3,3.12,2.12,2,1,1,1,2,2,2,1,1,1,2,1,62.0
215,2,2.66,8.56,2,1,2,1,2,1,2,1,1,1,2,1,61.0
298,3,4.32,2.72,3,1,2,1,2,2,1,1,1,1,2,1,77.0


In [16]:
## precision-recall curves for finding the likelihood thresholds for minimal precision
predictors[col].thresholds = {}
probas = predictors[col].predict_proba(test_data)

for label_idx, label in enumerate(predictors[col].classes_):
    prec, rec, threshold = precision_recall_curve(test_data[col]==label, probas[:,label_idx], pos_label=True)
    prec = prec.tolist(); rec = rec.tolist(); threshold = threshold.tolist()
    threshold_for_min_prec = np.array([elem >= categorical_precision_threshold for elem in prec]).nonzero()[0][0] - 1
    predictors[col].thresholds[label] = threshold_for_min_prec

  and should_run_async(code)


ValueError: X has 16 features, but ColumnTransformer is expecting 17 features as input.

In [17]:
##############################

  and should_run_async(code)


In [51]:
threshold

  and should_run_async(code)


[0.03012201661014794,
 0.03050123498776688,
 0.03158356066656846,
 0.031709162347652906,
 0.03210770047538078,
 0.033849608590779554,
 0.0356259486438884,
 0.03633772606087792,
 0.03823946823752761,
 0.04076569144894624,
 0.04292713185877126,
 0.04527985187419503,
 0.04955516247940076,
 0.05016664736014147,
 0.051514178704560855,
 0.053510230793460525,
 0.05703361522060735,
 0.06073320794387913,
 0.06158133634222757,
 0.06327249391345484,
 0.06843614549737498,
 0.069921596411387,
 0.07614065602044606,
 0.07759958999829031,
 0.07961783870537059,
 0.08251649411468776,
 0.09333101998710487,
 0.09934027766552114,
 0.1009942342553614,
 0.13300991291919195,
 0.17415787192476936,
 0.19452130019967168,
 0.19508875784534918,
 0.2514445943694819,
 0.33511692579128577,
 0.3552663731858969,
 0.35689031640496754]

In [52]:
np.array([elem >= categorical_precision_threshold for elem in prec]).nonzero()[0][0] - 1

36

In [53]:
threshold[36]

0.35689031640496754

In [54]:
len(threshold)

37

In [None]:
##############################

In [55]:
predictors[col].thresholds

{'1': -1, '2': 36}

In [24]:
## outlier detection

In [56]:
y_pred = predictors[col].predict(df_corrupted)
y_proba = predictors[col].predict_proba(df_corrupted)

for label_idx, label in enumerate(predictors[col].classes_):
    precision_pred = predictors[col].thresholds[label] <= y_proba[:,label_idx]
    outliers = precision_pred & (df_corrupted[col] != y_pred)

  and should_run_async(code)


In [57]:
outliers

  and should_run_async(code)


399    False
120    False
114    False
170    False
163    False
       ...  
270    False
257    False
339    False
421    False
71     False
Name: V11, Length: 141, dtype: bool

In [58]:
## find indices of records with NaNs in col in df_corrupted
nan_idx = df_corrupted[df_corrupted[col].isnull()].index
non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
        
## add a respective outlier col for each col
df_outliers[col + "_outlier"] = ''
df_outliers.loc[non_nan_idx, col + "_outlier"] = outliers.astype('int') ## 0: inlier, 1: outlier
df_outliers.loc[nan_idx, col + "_outlier"] = 1

In [59]:
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V11_outlier
399,3,2.861579,1.90,2,1,1,1,2,1,,1,1,1,2,1,65.0,0
120,2,3.840000,2.56,2,1,1,1,2,1,,2,1,1,1,1,59000.0,0
114,3,4.520000,3.32,1,1,1,1,2,1,2,2,1,1,2,1,58000.0,0
170,3,4.040000,1.88,2,1,1,1,2,1,2,1,1,1,2,1,66000.0,0
163,2,1.420914,1.96,2,1,1,1,2,1,2,2,1,1,1,1,73.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,3,4.670157,1.72,2,1,1,1,2,2,,1,1,1,2,1,71000.0,0
257,3,3.588486,2.32,1,1,1,1,1,1,,1,1,1,2,1,54.0,0
339,3,2.909306,2.00,2,1,1,1,2,1,,1,1,1,2,1,59000.0,0
421,2,3.760000,2.96,2,1,1,1,2,1,4,2,1,1,1,1,64000.0,0


In [60]:
for i in df_outliers.index:
    if df_outliers.loc[i, col + "_outlier"] == 1:
        df_outliers.loc[i, col] = np.nan

  and should_run_async(code)


In [61]:
print(f'Column {col} contained {len(nan_idx)} nans before, now {df_outliers[col].isnull().sum()}')

Column V11 contained 0 nans before, now 0


In [None]:
## imputation

In [226]:
prior_missing = df_outliers[col].isnull().sum()

if prior_missing > 0:
    df_outliers.loc[df_outliers[col].isnull(), col] = predictors[col].predict(df_outliers[df_outliers[col].isnull()])
    
print(f'Imputed {prior_missing} values in column {col}')

Imputed 0 values in column V8


  and should_run_async(code)


#### Numeric

In [62]:
col = "V16"

In [63]:
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', categorical_preprocessing, categorical_columns),
    ('numeric_features', numeric_preprocessing, list(set(numerical_columns) - {col}))
])

In [64]:
param_grid = {
    'learner__n_estimators': [10, 50, 100],
}

In [65]:
predictors[col] = {}

for perc_name, percentile, in zip(['lower', 'median', 'upper'], [1.0 - numeric_error_percentile, 0.5, numeric_error_percentile]):
    pipeline = Pipeline([
        ('features', feature_transform),
        ('learner', GradientBoostingRegressor(loss='quantile', alpha=percentile))
    ])
    
    search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
    predictors[col][perc_name] = search.fit(train_data, train_data[col])
    print(f'Regressor for col: {col}/{perc_name} reached {search.best_score_}')

Regressor for col: V16/lower reached -0.8029324297508953
Regressor for col: V16/median reached 0.0352035499816713
Regressor for col: V16/upper reached -1.0026775445793135


In [35]:
## outlier detection

  and should_run_async(code)


In [66]:
lower_percentile = predictors[col]['lower'].predict(df_corrupted)
upper_percentile = predictors[col]['upper'].predict(df_corrupted)

outliers = (df_corrupted[col] < lower_percentile) | (df_corrupted[col] > upper_percentile)

  and should_run_async(code)


In [67]:
outliers

  and should_run_async(code)


399    False
120     True
114     True
170     True
163    False
       ...  
270     True
257    False
339     True
421     True
71     False
Name: V16, Length: 141, dtype: bool

In [234]:
num_nans = df_outliers[col].isnull().sum()
df_outliers.loc[outliers, col] = np.nan

In [68]:
## find indices of records with NaNs in col in df_corrupted
nan_idx = df_corrupted[df_corrupted[col].isnull()].index
non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
        
## add a respective outlier col for each col
df_outliers[col + "_outlier"] = ''
df_outliers.loc[non_nan_idx, col + "_outlier"] = outliers.astype('int') ## 0: inlier, 1: outlier
df_outliers.loc[nan_idx, col + "_outlier"] = 1

In [69]:
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V11_outlier,V16_outlier
399,3,2.861579,1.90,2,1,1,1,2,1,,1,1,1,2,1,65.0,0,0
120,2,3.840000,2.56,2,1,1,1,2,1,,2,1,1,1,1,59000.0,0,1
114,3,4.520000,3.32,1,1,1,1,2,1,2,2,1,1,2,1,58000.0,0,1
170,3,4.040000,1.88,2,1,1,1,2,1,2,1,1,1,2,1,66000.0,0,1
163,2,1.420914,1.96,2,1,1,1,2,1,2,2,1,1,1,1,73.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,3,4.670157,1.72,2,1,1,1,2,2,,1,1,1,2,1,71000.0,0,1
257,3,3.588486,2.32,1,1,1,1,1,1,,1,1,1,2,1,54.0,0,0
339,3,2.909306,2.00,2,1,1,1,2,1,,1,1,1,2,1,59000.0,0,1
421,2,3.760000,2.96,2,1,1,1,2,1,4,2,1,1,1,1,64000.0,0,1


In [70]:
for i in df_outliers.index:
    if df_outliers.loc[i, col + "_outlier"] == 1:
        df_outliers.loc[i, col] = np.nan

  and should_run_async(code)


In [72]:
print(f'Column {col} contained {len(nan_idx)} nans before, now {df_outliers[col].isnull().sum()}')

Column V16 contained 0 nans before, now 86


In [None]:
## imputation

In [236]:
prior_missing = df_outliers[col].isnull().sum()

if prior_missing > 0:
    df_outliers.loc[df_outliers[col].isnull(), col] = predictors[col]['median'].predict(df_outliers[df_outliers[col].isnull()])
    
print(f'Imputed {prior_missing} values in column {col}')

Imputed 86 values in column V3


#### Together

In [45]:
for col in categorical_columns + numerical_columns:
    if col in categorical_columns:
        if len(train_data[col].unique()) > 1:
            feature_transform = ColumnTransformer(transformers=[
                ('categorical_features', categorical_preprocessing, list(set(categorical_columns) - {col})),
                ('numeric_features', numeric_preprocessing, numerical_columns)
            ])

            param_grid = {
                'learner__n_estimators': [10, 50, 100, 200],
            }

            pipeline = Pipeline([
                ('features', feature_transform),
                ('learner', GradientBoostingClassifier())
            ])

            search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
            predictors[col] = search.fit(train_data, train_data[col])

            print(f'Classifier for col: {col} reached {search.best_score_}')

            ## precision-recall curves for finding the likelihood thresholds for minimal precision
            predictors[col].thresholds = {}
            probas = predictors[col].predict_proba(test_data)

            for label_idx, label in enumerate(predictors[col].classes_):
                prec, rec, threshold = precision_recall_curve(test_data[col]==label, probas[:,label_idx], pos_label=True)
                prec = prec.tolist(); rec = rec.tolist(); threshold = threshold.tolist()
                threshold_for_min_prec = np.array([elem >= categorical_precision_threshold for elem in prec]).nonzero()[0][0] - 1
                predictors[col].thresholds[label] = threshold_for_min_prec

    elif col in numerical_columns:
        feature_transform = ColumnTransformer(transformers=[
            ('categorical_features', categorical_preprocessing, categorical_columns),
            ('numeric_features', numeric_preprocessing, list(set(numerical_columns) - {col}))
        ])
        
        param_grid = {
            'learner__n_estimators': [10, 50, 100],
        }
        
        predictors[col] = {}
        
        for perc_name, percentile, in zip(['lower', 'median', 'upper'], [1.0 - numeric_error_percentile, 0.5, numeric_error_percentile]):
            pipeline = Pipeline([
                ('features', feature_transform),
                ('learner', GradientBoostingRegressor(loss='quantile', alpha=percentile))
            ])
            
            search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
            predictors[col][perc_name] = search.fit(train_data, train_data[col])
            
            print(f'Regressor for col: {col}/{perc_name} reached {search.best_score_}')

Classifier for col: Sex reached 0.7169811320754716
Classifier for col: Chest_pain_type reached 0.4858490566037736
Classifier for col: Fasting_blood_sugar_&lt;_120 reached 0.8160377358490567




Classifier for col: Resting_ecg reached 0.5660377358490566
Classifier for col: Exercise_induced_angina reached 0.7405660377358491
Classifier for col: Slope reached 0.660377358490566
Classifier for col: Number_of_vessels_colored reached 0.5141509433962264




Classifier for col: Thal reached 0.6367924528301887
Regressor for col: Age/lower reached -0.6083177902671769
Regressor for col: Age/median reached 0.2747091034857364
Regressor for col: Age/upper reached -0.5668140669452472
Regressor for col: Trestbps/lower reached -0.6900162986058573
Regressor for col: Trestbps/median reached -0.050561889355387524
Regressor for col: Trestbps/upper reached -1.1863798651826882
Regressor for col: Cholesterol/lower reached -0.8236757598246106
Regressor for col: Cholesterol/median reached -0.0009319844269178867
Regressor for col: Cholesterol/upper reached -1.0681978688202145
Regressor for col: Max_heart_rate/lower reached -0.7252545384383909
Regressor for col: Max_heart_rate/median reached 0.1483918415431673
Regressor for col: Max_heart_rate/upper reached -0.701418134370998
Regressor for col: Oldpeak/lower reached -0.3241422613571008
Regressor for col: Oldpeak/median reached 0.2146490625612419
Regressor for col: Oldpeak/upper reached -0.35170496336284607


In [46]:
predictors.keys()

  and should_run_async(code)


dict_keys(['Sex', 'Chest_pain_type', 'Fasting_blood_sugar_&lt;_120', 'Resting_ecg', 'Exercise_induced_angina', 'Slope', 'Number_of_vessels_colored', 'Thal', 'Age', 'Trestbps', 'Cholesterol', 'Max_heart_rate', 'Oldpeak'])

In [12]:
## outlier detection and removal

In [47]:
for col in categorical_columns + numerical_columns:
    if col in categorical_columns:
        if col in predictors.keys():
            y_pred = predictors[col].predict(df_corrupted)
            y_proba = predictors[col].predict_proba(df_corrupted)

            for label_idx, label in enumerate(predictors[col].classes_):
                precision_pred = predictors[col].thresholds[label] <= y_proba[:,label_idx]
                outliers = precision_pred & (df_corrupted[col] != y_pred)
            
    elif col in numerical_columns:
        lower_percentile = predictors[col]['lower'].predict(df_corrupted)
        upper_percentile = predictors[col]['upper'].predict(df_corrupted)
        outliers = (df_corrupted[col] < lower_percentile) | (df_corrupted[col] > upper_percentile)
        
    ## find indices of records with NaNs in col in df_corrupted
    nan_idx = df_corrupted[df_corrupted[col].isnull()].index
    non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
    
    ## add a respective outlier col for each col
    df_outliers[col + "_outlier"] = ''
    df_outliers.loc[non_nan_idx, col + "_outlier"] = outliers.astype('int') ## 0: inlier, 1: outlier
    df_outliers.loc[nan_idx, col + "_outlier"] = 1
    
    for i in df_outliers.index:
        if df_outliers.loc[i, col + "_outlier"] == 1:
            df_outliers.loc[i, col] = np.nan
    
    print(f'Column {col} contained {len(nan_idx)} nans before, now {df_outliers[col].isnull().sum()}')

Column Sex contained 0 nans before, now 0
Column Chest_pain_type contained 67 nans before, now 67
Column Fasting_blood_sugar_&lt;_120 contained 0 nans before, now 0
Column Resting_ecg contained 0 nans before, now 0
Column Exercise_induced_angina contained 0 nans before, now 0
Column Slope contained 0 nans before, now 0
Column Number_of_vessels_colored contained 0 nans before, now 0
Column Thal contained 0 nans before, now 0
Column Age contained 0 nans before, now 56
Column Trestbps contained 0 nans before, now 25
Column Cholesterol contained 0 nans before, now 41
Column Max_heart_rate contained 0 nans before, now 24


TypeError: '<' not supported between instances of 'str' and 'float'

In [18]:
## imputation

  and should_run_async(code)


In [14]:
df_cleaned = df_outliers[df_corrupted.columns].copy(deep=True)

  and should_run_async(code)


In [15]:
for col in categorical_columns + numerical_columns:
    prior_missing = df_cleaned[col].isnull().sum()
    
    if prior_missing > 0:
        if col in categorical_columns:
            df_cleaned.loc[df_cleaned[col].isnull(), col] = predictors[col].predict(df_cleaned[df_cleaned[col].isnull()])
        elif col in numerical_columns:
            df_cleaned.loc[df_cleaned[col].isnull(), col] = predictors[col]['median'].predict(df_cleaned[df_cleaned[col].isnull()])
        
        print(f'Imputed {prior_missing} values in column {col}')

Imputed 59 values in column V1
Imputed 70 values in column V14
Imputed 3 values in column V15
Imputed 65 values in column V2
Imputed 91 values in column V3
Imputed 97 values in column V16


In [None]:
## Swapping between a categoric and numeric variable messes up the categories of the categoric variable, and there
## are issues while comparing the values inside the same column
## Here, I am finding all the numeric values in the categoric columns, that came after using the SwappedValues corruption
## and repacing them with NaNs, they will later be imputed.

In [28]:
test_data

  and should_run_async(code)


Unnamed: 0,Age,Sex,Chest_pain_type,Trestbps,Cholesterol,Fasting_blood_sugar_&lt;_120,Resting_ecg,Max_heart_rate,Exercise_induced_angina,Oldpeak,Slope,Number_of_vessels_colored,Thal
125,54.0,1,2,140.0,239.0,0,2,160.0,0,1.2,2,0,2
112,55.0,0,2,180.0,327.0,0,0,117.0,1,3.4,1,0,2
103,57.0,1,2,140.0,192.0,0,2,148.0,0,0.4,1,0,1
158,59.0,1,2,135.0,234.0,0,2,161.0,0,0.5,1,0,3
170,43.0,1,2,120.0,177.0,0,1,120.0,1,2.5,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,67.0,1,2,120.0,237.0,0,2,71.0,0,1.0,1,0,2
213,52.0,1,0,128.0,205.0,1,2,184.0,0,0.0,2,0,2
198,42.0,0,2,102.0,265.0,0,1,122.0,0,0.6,1,0,2
199,69.0,1,3,140.0,254.0,0,1,146.0,0,2.0,1,3,3


In [26]:
df_corrupted

  and should_run_async(code)


Unnamed: 0,Age,Sex,Chest_pain_type,Trestbps,Cholesterol,Fasting_blood_sugar_&lt;_120,Resting_ecg,Max_heart_rate,Exercise_induced_angina,Oldpeak,Slope,Number_of_vessels_colored,Thal
125,54.0,1,2,140.0,239.000000,0,2,160.0,0,1.2,2,0,2
112,55.0,0,2,180.0,327.000000,0,0,117.0,1,3.4,1,0,2
103,57.0,1,0.4,140.0,192.000000,0,2,148.0,1,,1,0,1
158,59.0,1,,135.0,234.000000,0,2,161.0,1,0.5,1,0,3
170,43.0,1,2.5,120.0,177.000000,0,1,120.0,0,,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,670.0,1,,120.0,540.706938,0,2,71.0,1,1.0,1,0,2
213,52.0,1,0.0,128.0,205.000000,1,2,184.0,1,0,2,0,2
198,420.0,0,0.6,102.0,417.727651,0,1,122.0,0,,1,0,2
199,690.0,1,2.0,140.0,254.000000,0,1,146.0,1,,1,3,3


In [29]:
df_corrupted.index[np.where(df_corrupted.applymap(np.isreal)["Chest_pain_type"] == True)]

  and should_run_async(code)


Int64Index([103, 158, 170,  80, 259, 245, 223,  35, 114, 101, 272, 142, 220,
             89, 298,  16,  77, 253,  53, 151,  81, 269,  40, 232, 255,  91,
            115, 233, 100,  52,  24, 243,  34, 192, 191, 217, 228, 207, 238,
            157, 205,  79, 275, 285,  43, 180,  88, 169,  39, 133,  72, 161,
             11, 241, 288, 248, 117,  82, 165,   7, 252,   8,   3,  26, 213,
            198, 199],
           dtype='int64')

In [31]:
df_corrupted.loc[df_corrupted.index[np.where(df_corrupted.applymap(np.isreal)["Chest_pain_type"] == True)]]

Unnamed: 0,Age,Sex,Chest_pain_type,Trestbps,Cholesterol,Fasting_blood_sugar_&lt;_120,Resting_ecg,Max_heart_rate,Exercise_induced_angina,Oldpeak,Slope,Number_of_vessels_colored,Thal
103,57.0,1,0.4,140.0,192.000000,0,2,148.0,1,,1,0,1
158,59.0,1,,135.0,234.000000,0,2,161.0,1,0.5,1,0,3
170,43.0,1,2.5,120.0,177.000000,0,1,120.0,0,,1,0,3
80,70.0,1,,130.0,120.181188,0,1,109.0,0,2.4,1,3,2
259,58.0,1,,112.0,230.000000,0,1,165.0,0,2.5,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,410.0,0,,112.0,348.451097,0,1,172.0,1,0.0,2,0,2
26,670.0,1,,120.0,540.706938,0,2,71.0,1,1.0,1,0,2
213,52.0,1,0.0,128.0,205.000000,1,2,184.0,1,0,2,0,2
198,420.0,0,0.6,102.0,417.727651,0,1,122.0,0,,1,0,2


In [32]:
for col in categorical_columns:
    idx_to_nan = df_corrupted.index[np.where(df_corrupted.applymap(np.isreal)[col] == True)]
    if len(idx_to_nan) != 0:
        df_corrupted.loc[idx_to_nan, col] = np.nan

  and should_run_async(code)


In [33]:
df_corrupted["Chest_pain_type"]

  and should_run_async(code)


125      2
112      2
103    NaN
158    NaN
170    NaN
      ... 
26     NaN
213    NaN
198    NaN
199    NaN
183      3
Name: Chest_pain_type, Length: 91, dtype: object

In [50]:
numerical_columns

  and should_run_async(code)


['Age', 'Trestbps', 'Cholesterol', 'Max_heart_rate', 'Oldpeak']

In [51]:
df_corrupted[['Age', 'Trestbps', 'Cholesterol', 'Max_heart_rate', 'Oldpeak']]

Unnamed: 0,Age,Trestbps,Cholesterol,Max_heart_rate,Oldpeak
125,54.0,140.0,239.000000,160.0,1.2
112,55.0,180.0,327.000000,117.0,3.4
103,57.0,140.0,192.000000,148.0,
158,59.0,135.0,234.000000,161.0,0.5
170,43.0,120.0,177.000000,120.0,
...,...,...,...,...,...
26,670.0,120.0,540.706938,71.0,1.0
213,52.0,128.0,205.000000,184.0,0
198,420.0,102.0,417.727651,122.0,
199,690.0,140.0,254.000000,146.0,


In [53]:
df_corrupted["Age"].isalpha()

AttributeError: 'Series' object has no attribute 'isalpha'