## Dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/home/rupali/Documents/Master Thesis/jenga')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [2]:
dataset = Dataset("hill-valley")

all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: hill-valley
Found 0 categorical and 100 numeric features 



### Get training and test sets

In [3]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

In [4]:
## use categorical columns as strings
def cat_cols_to_str(df):
    for col in df.columns:
        if pd.api.types.is_categorical_dtype(df[col]):
            df[col] = df[col].astype(str)

    return df

In [5]:
### if we don't convert the categorical columns to str, the swapping corruption doesn't let us assign new values to the column: "Cannot setitem on a Categorical with a new category, set the categories first"
train_data = cat_cols_to_str(train_data)
test_data = cat_cols_to_str(test_data)

## Defined Model

In [4]:
from sklearn.linear_model import SGDClassifier

learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

## Corruptions using PPP

In [5]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise]
fraction = 0.5
num_repetitions = 5

In [6]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Generating corrupted training data on 364 rows... 

	perturbation: MissingValues: {'column': 'V58', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V92', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: GaussianNoise: {'column': 'V18', 'fraction': 0.5, 'sampling': 'MCAR'}

Generating corrupted training data on 364 rows... 

	perturbation: MissingValues: {'column': 'V82', 'fraction': 0.5, 'sampling': 'MCAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V68', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: GaussianNoise: {'column': 'V18', 'fraction': 0.5, 'sampling': 'MAR'}

Generating corrupted training data on 364 rows... 

	perturbation: MissingValues: {'column': 'V92', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V75', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: GaussianNoise: {'column': 'V57', 'fraction': 0.5, 'samp

## Cleaning

### PPP Cleaning

In [21]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation)
]

  and should_run_async(code)


In [22]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)


Applying cleaners... 

PPP score no cleaning: {'roc_auc_score': 0.5515873015873016, 'classification_report': {'F': {'precision': 0.8540145985401459, 'recall': 0.975, 'f1-score': 0.9105058365758756, 'support': 120}, 'T': {'precision': 0.25, 'recall': 0.047619047619047616, 'f1-score': 0.08, 'support': 21}, 'accuracy': 0.8368794326241135, 'macro avg': {'precision': 0.552007299270073, 'recall': 0.5113095238095238, 'f1-score': 0.49525291828793777, 'support': 141}, 'weighted avg': {'precision': 0.7640549774809753, 'recall': 0.8368794326241135, 'f1-score': 0.7868134779369155, 'support': 141}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.25177304964539005, 'Recall': 0.5, 'F1-score': 0.33490566037735847, 'Accuracy': 0.5035460992907801}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 0.5081521739130435, 'Recall': 0.5100247524752475, 'F1-score': 0.4816176470588236, 'Accuracy': 0.5035460992907801, 'Mean 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5488095238095239, 'classification_report': {'F': {'precision': 0.8478260869565217, 'recall': 0.975, 'f1-score': 0.9069767441860466, 'support': 120}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 21}, 'accuracy': 0.8297872340425532, 'macro avg': {'precision': 0.42391304347826086, 'recall': 0.4875, 'f1-score': 0.4534883720930233, 'support': 141}, 'weighted avg': {'precision': 0.7215541165587419, 'recall': 0.8297872340425532, 'f1-score': 0.7718951014349332, 'support': 141}}}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.7051983379502988, 'Recall': 0.7437122736418511, 'F1-score': 0.696433411067735, 'Accuracy': 0.7446808510638298}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 0.5081521739130435, 'Recall': 0.5100247524752475, 'F1-score': 0.4816176470588236, 'Accuracy': 0.5035460992907801, 'Mean Squared Error': 97.54260649809426}
Cleaner: (PyODKNNOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5476190476190477, 'classification_report': {'F': {'precision': 0.8560606060606061, 'recall': 0.9416666666666667, 'f1-score': 0.8968253968253967, 'support': 120}, 'T': {'precision': 0.2222222222222222, 'recall': 0.09523809523809523, 'f1-score': 0.13333333333333333, 'support': 21}, 'accuracy': 0.8156028368794326, 'macro avg': {'precision': 0.5391414141414141, 'recall': 0.5184523809523809, 'f1-score': 0.515079365079365, 'support': 141}, 'weighted avg': {'precision': 0.7616591446378681, 'recal

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Outlier detection method: PyODIsolationForestOutlierDetection, Outlier Detection Score: {'Precision': 0.6495845912401763, 'Recall': 0.6736921529175051, 'F1-score': 0.6237232659432439, 'Accuracy': 0.6737588652482269}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 0.5081521739130435, 'Recall': 0.5100247524752475, 'F1-score': 0.4816176470588236, 'Accuracy': 0.5035460992907801, 'Mean Squared Error': 96.91559935735567}
Cleaner: (PyODIsolationForestOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5517857142857143, 'classification_report': {'F': {'precision': 0.8560606060606061, 'recall': 0.9416666666666667, 'f1-score': 0.8968253968253967, 'support': 120}, 'T': {'precision': 0.2222222222222222, 'recall': 0.09523809523809523, 'f1-score': 0.13333333333333333, 'support': 21}, 'accuracy': 0.8156028368794326, 'macro avg': {'precision': 0.5391414141414141, 'recall': 0.5184523809523809, 'f1-score': 0.515079365079365, 'support': 141}, 'weighted avg': {'precision': 0

## Model Evaluation

### With learner and param_grid

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# preprocessing pipeline for numerical columns
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns)
])

## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

  and should_run_async(code)


In [19]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
model = grid_search.fit(train_data, train_labels)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [20]:
y_pred = model.predict(test_data)

  and should_run_async(code)


In [21]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(model.predict_proba(test_data))[1])

  and should_run_async(code)


0.48281130634071817

In [22]:
model.predict_proba(test_data)

  and should_run_async(code)


array([[9.999474e-01, 5.256469e-05],
       [9.997258e-01, 2.741634e-04],
       [9.999984e-01, 1.568016e-06],
       [9.999999e-01, 6.675940e-08],
       ...,
       [9.999999e-01, 6.306299e-08],
       [9.986168e-01, 1.383178e-03],
       [2.497819e-01, 7.502181e-01],
       [9.999855e-01, 1.451340e-05]])

In [23]:
from sklearn.metrics import classification_report

classification_report(test_labels, y_pred, output_dict=True)

  and should_run_async(code)


{'F': {'precision': 0.8428571428571429,
  'recall': 0.9915966386554622,
  'f1-score': 0.9111969111969112,
  'support': 119},
 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22},
 'accuracy': 0.8368794326241135,
 'macro avg': {'precision': 0.42142857142857143,
  'recall': 0.4957983193277311,
  'f1-score': 0.4555984555984556,
  'support': 141},
 'weighted avg': {'precision': 0.7113475177304964,
  'recall': 0.8368794326241135,
  'f1-score': 0.7690243434924285,
  'support': 141}}

### With Autogluon

In [14]:
from autogluon.tabular import TabularPredictor

## training
train_data["class"] = train_labels

label = "class"

## folder to save trained models
#save_path = '/home/rupali/Documents/Master Thesis/jenga/autogluon_models/'

predictor = TabularPredictor(label=label).fit(train_data)

  and should_run_async(code)
Level 25:autogluon.core.utils.utils:No path specified. Models will be saved in: "AutogluonModels/ag-20210323_212707/"
INFO:autogluon.tabular.learner.default_learner:Beginning AutoGluon training ...
INFO:autogluon.tabular.learner.default_learner:AutoGluon will save models to "AutogluonModels/ag-20210323_212707/"
INFO:autogluon.tabular.learner.default_learner:AutoGluon Version:  0.1.0
INFO:autogluon.tabular.learner.default_learner:Train Data Rows:    329
INFO:autogluon.tabular.learner.default_learner:Train Data Columns: 16
INFO:autogluon.tabular.learner.default_learner:Preprocessing data ...
Level 25:autogluon.core.utils.utils:AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
INFO:autogluon.core.utils.utils:	2 unique label values:  ['F', 'T']
Level 25:autogluon.core.utils.utils:	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_t

ERROR:autogluon.tabular.trainer.abstract_trainer:		libopenblas.so.0: cannot open shared object file: No such file or directory
ERROR:autogluon.tabular.trainer.abstract_trainer:Detailed Traceback:
Traceback (most recent call last):
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/tabular/trainer/abstract_trainer.py", line 911, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, **model_fit_kwargs)
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/tabular/trainer/abstract_trainer.py", line 883, in _train_single
    model.fit(X=X, y=y, X_val=X_val, y_val=y_val, **model_fit_kwargs)
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 405, in fit
    self._fit(**kwargs)
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/tabular/models/tabular_nn/tabular_nn_model.py", line 177, in _fit
    try_import_mxnet()
  File "/home/rupali/.local/lib/python3.8/site-package

█

INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	5.44s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.1s	 = Validation runtime
INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: LightGBMLarge ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.47s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.02s	 = Validation runtime


█

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: WeightedEnsemble_L2 ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.41s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.0s	 = Validation runtime
INFO:autogluon.tabular.learner.default_learner:AutoGluon training complete, total runtime = 11.56s ...
INFO:root:TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210323_212707/")


In [15]:
## test on original data
y_pred_test = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_test, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8368794326241135,
    "accuracy_score": 0.8368794326241135,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8368794326241135
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8368794326241135,
        "recall": 1.0,
        "f1-score": 0.9111969111969112,
        "support": 118
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 23
    }

In [16]:
## test on corrupted data
y_pred_corrupted = predictor.predict(df_corrupted)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_corrupted, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8368794326241135,
    "accuracy_score": 0.8368794326241135,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8368794326241135
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8368794326241135,
        "recall": 1.0,
        "f1-score": 0.9111969111969112,
        "support": 118
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 23
    }

In [17]:
## test on cleaned data
y_pred_cleaned = predictor.predict(df_cleaned)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8368794326241135,
    "accuracy_score": 0.8368794326241135,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8368794326241135
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8368794326241135,
        "recall": 1.0,
        "f1-score": 0.9111969111969112,
        "support": 118
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 23
    }

In [18]:
## We can evaluate the performance of each individual trained model on our (labeled) test data
test_data["class"] = test_labels
predictor.leaderboard(test_data, silent=True)

  and should_run_async(code)


█

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.836879,0.863636,0.019441,0.018222,0.346764,0.019441,0.018222,0.346764,1,True,8
1,LightGBMLarge,0.836879,0.863636,0.020956,0.018564,0.46885,0.020956,0.018564,0.46885,1,True,12
2,WeightedEnsemble_L2,0.836879,0.863636,0.024096,0.019142,0.875167,0.003141,0.000578,0.406317,2,True,13
3,LightGBM,0.836879,0.863636,0.037066,0.015656,0.333007,0.037066,0.015656,0.333007,1,True,7
4,RandomForestEntr,0.836879,0.863636,0.103467,0.105233,0.700574,0.103467,0.105233,0.700574,1,True,2
5,ExtraTreesGini,0.836879,0.833333,0.147363,0.087352,0.481024,0.147363,0.087352,0.481024,1,True,3
6,RandomForestGini,0.829787,0.863636,0.092181,0.08489,0.723594,0.092181,0.08489,0.723594,1,True,1
7,NeuralNetFastAI,0.829787,0.863636,0.158394,0.104127,5.44004,0.158394,0.104127,5.44004,1,True,11
8,ExtraTreesEntr,0.822695,0.818182,0.162067,0.086309,0.477268,0.162067,0.086309,0.477268,1,True,4
9,XGBoost,0.815603,0.863636,0.024807,0.008342,0.266442,0.024807,0.008342,0.266442,1,True,10


In [19]:
perf

  and should_run_async(code)


OrderedDict([('accuracy', 0.8368794326241135),
             ('accuracy_score', 0.8368794326241135),
             ('balanced_accuracy_score', 0.5),
             ('matthews_corrcoef', 0.0),
             ('f1_score', 0.8368794326241135),
             ('confusion_matrix',
                   F  T
              F  118  0
              T   23  0),
             ('classification_report',
              {'F': {'precision': 0.8368794326241135,
                'recall': 1.0,
                'f1-score': 0.9111969111969112,
                'support': 118},
               'T': {'precision': 0.0,
                'recall': 0.0,
                'f1-score': 0.0,
                'support': 23},
               'accuracy': 0.8368794326241135,
               'macro avg': {'precision': 0.41843971631205673,
                'recall': 0.5,
                'f1-score': 0.4555984555984556,
                'support': 141},
               'weighted avg': {'precision': 0.7003671847492581,
                'recall': 0.83

In [39]:
list(perf.items())[6][1]

{'F': {'precision': 0.8613138686131386,
  'recall': 0.9915966386554622,
  'f1-score': 0.9218749999999999,
  'support': 119},
 'T': {'precision': 0.75,
  'recall': 0.13636363636363635,
  'f1-score': 0.23076923076923075,
  'support': 22},
 'accuracy': 0.8581560283687943,
 'macro avg': {'precision': 0.8056569343065694,
  'recall': 0.5639801375095492,
  'f1-score': 0.5763221153846153,
  'support': 141},
 'weighted avg': {'precision': 0.8439457472692448,
  'recall': 0.8581560283687943,
  'f1-score': 0.8140428941625749,
  'support': 141}}

In [25]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(predictor.predict_proba(test_data)).to_numpy()[1])

1.0

In [20]:
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=False)
perf

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135


0.8368794326241135

## Undefined Model: Default setting: whole process

In [11]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise, CategoricalShift]
fraction = 0.5
num_repetitions = 5

  and should_run_async(code)


In [12]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Level 25:autogluon.core.utils.utils:No path specified. Models will be saved in: "AutogluonModels/ag-20210323_213847/"
INFO:autogluon.tabular.learner.default_learner:Beginning AutoGluon training ...
INFO:autogluon.tabular.learner.default_learner:AutoGluon will save models to "AutogluonModels/ag-20210323_213847/"
INFO:autogluon.tabular.learner.default_learner:AutoGluon Version:  0.1.0
INFO:autogluon.tabular.learner.default_learner:Train Data Rows:    329
INFO:autogluon.tabular.learner.default_learner:Train Data Columns: 16
INFO:autogluon.tabular.learner.default_learner:Preprocessing data ...
Level 25:autogluon.core.utils.utils:AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
INFO:autogluon.core.utils.utils:	2 unique label values:  ['F', 'T']
Level 25:autogluon.core.utils.utils:	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'mu

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: NeuralNetFastAI ...


█

INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	5.19s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.09s	 = Validation runtime
INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: LightGBMLarge ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8788	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.41s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.02s	 = Validation runtime


█

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: WeightedEnsemble_L2 ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8788	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.4s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.0s	 = Validation runtime
INFO:autogluon.tabular.learner.default_learner:AutoGluon training complete, total runtime = 11.19s ...
INFO:root:TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210323_213847/")



Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'PRE25', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'PRE5', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: GaussianNoise: {'column': 'PRE5', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: CategoricalShift: {'column': 'PRE14', 'fraction': 0.5, 'sampling': 'MAR'}

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'PRE8', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'PRE4', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: GaussianNoise: {'column': 'PRE4', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: CategoricalShift: {'column': 'PRE32', 'fraction': 0.5, 'sampling': 'MNAR'}

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'PRE6', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: 

In [13]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation)
]

  and should_run_async(code)


In [14]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)

INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8085106382978723
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8085106382978723,
    "accuracy_score": 0.8085106382978723,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8085106382978723
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8085106382978723,
        "recall": 1.0,
        "f1-score": 0.8941176470588235,
        "support": 114
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 27
    },
    "accuracy": 0.808510638


Applying cleaners... 

PPP score no cleaning: {'roc_auc_score': 0.4858674463937622, 'classification_report': {'F': {'precision': 0.8085106382978723, 'recall': 1.0, 'f1-score': 0.8941176470588235, 'support': 114}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 27}, 'accuracy': 0.8085106382978723, 'macro avg': {'precision': 0.40425531914893614, 'recall': 0.5, 'f1-score': 0.44705882352941173, 'support': 141}, 'weighted avg': {'precision': 0.653689452240833, 'recall': 0.8085106382978723, 'f1-score': 0.7229036295369211, 'support': 141}}}
PPP scores with cleaning: 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8085106382978723
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8085106382978723,
    "accuracy_score": 0.8085106382978723,
    "balanced_accuracy_score": 0.5,



Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.1879432624113475, 'Recall': 0.5, 'F1-score': 0.2668846483704974, 'Accuracy': 0.375886524822695}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 0.498015873015873, 'Recall': 0.496900826446281, 'F1-score': 0.48751580278128953, 'Accuracy': 0.6737588652482269, 'Mean Squared Error': 517.6129600807834}
Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.4858674463937622, 'classification_report': {'F': {'precision': 0.8085106382978723, 'recall': 1.0, 'f1-score': 0.8941176470588235, 'support': 114}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 27}, 'accuracy': 0.8085106382978723, 'macro avg': {'precision': 0.40425531914893614, 'recall': 0.5, 'f1-score': 0.44705882352941173, 'support': 141}, 'weighted avg': {'precision': 0.653689452240833, 'recall': 0.8085106382978723, 'f1-score': 0.7229036295369211, 'support': 141}}}


INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8085106382978723
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8085106382978723,
    "accuracy_score": 0.8085106382978723,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8085106382978723
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8085106382978723,
        "recall": 1.0,
        "f1-score": 0.8941176470588235,
        "support": 114
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 27
    },
    "accuracy": 0.808510638


Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.8277315192820908, 'Recall': 0.858228142439543, 'F1-score': 0.806192714507331, 'Accuracy': 0.8156028368794326}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 0.498015873015873, 'Recall': 0.496900826446281, 'F1-score': 0.48751580278128953, 'Accuracy': 0.6737588652482269, 'Mean Squared Error': 56.39878098682344}
Cleaner: (PyODKNNOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.50909681611436, 'classification_report': {'F': {'precision': 0.8085106382978723, 'recall': 1.0, 'f1-score': 0.8941176470588235, 'support': 114}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 27}, 'accuracy': 0.8085106382978723, 'macro avg': {'precision': 0.40425531914893614, 'recall': 0.5, 'f1-score': 0.44705882352941173, 'support': 141}, 'weighted avg': {'precision': 0.653689452240833, 'recall': 0.8085106382978723, 'f1-score': 0.7229036295369211, 'support': 141}}}


INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8156028368794326
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8156028368794326,
    "accuracy_score": 0.8156028368794326,
    "balanced_accuracy_score": 0.5185185185185185,
    "matthews_corrcoef": 0.17366269075057592,
    "f1_score": 0.8156028368794325
}
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8142857142857143,
        "recall": 1.0,
        "f1-score": 0.8976377952755905,
        "support": 114
    },
    "T": {
        "precision": 1.0,
        "recall": 0.037037037037037035,
        "f1-score": 0.07142857142857142,
        "support": 27
    },
    "accuracy": 0.8156028368794326,
    "macro avg": {
        "precision": 0.9071428571428571,
        "recall": 0.518518518518


Outlier detection method: PyODIsolationForestOutlierDetection, Outlier Detection Score: {'Precision': 0.7843491602294419, 'Recall': 0.8109444402262633, 'F1-score': 0.7571057133618634, 'Accuracy': 0.7677304964539007}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 0.498015873015873, 'Recall': 0.496900826446281, 'F1-score': 0.48751580278128953, 'Accuracy': 0.6737588652482269, 'Mean Squared Error': 61.27402968396848}
Cleaner: (PyODIsolationForestOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5134827810266407, 'classification_report': {'F': {'precision': 0.8142857142857143, 'recall': 1.0, 'f1-score': 0.8976377952755905, 'support': 114}, 'T': {'precision': 1.0, 'recall': 0.037037037037037035, 'f1-score': 0.07142857142857142, 'support': 27}, 'accuracy': 0.8156028368794326, 'macro avg': {'precision': 0.9071428571428571, 'recall': 0.5185185185185185, 'f1-score': 0.48453318335208095, 'support': 141}, 'weighted avg': {'precision': 0.8498480243161094, 'recall':

## Try Cleaners

### Pyod Single Column - features

In [7]:
df_outliers = df_corrupted.copy(deep=True)
df_outliers

  and should_run_async(code)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100
802,626.94,736.00,755.85,768.98,720.86,812.95,806.68,755.28,843.98,933.09,...,657.94,590.66,710.01,689.13,693.32,632.65,599.64,709.34,681.010000,621.47
1016,245.40,250.96,269.27,254.46,270.32,271.62,266.75,261.08,264.75,250.49,...,255.03,249.00,258.50,249.42,273.56,245.01,266.65,256.94,-40546.291133,266.11
111,1.00,1.00,0.89,0.89,0.97,1.00,0.99,0.96,0.92,1.03,...,1.03,0.98,0.90,0.99,1.00,1.00,1.00,1.03,1.010000,0.92
351,65587.77,65913.77,65572.12,65901.94,66829.75,68029.52,66790.10,67110.94,69842.12,70651.07,...,67262.37,70535.33,66131.91,70359.66,70318.84,68286.34,65759.27,68706.18,70055.740000,68141.13
840,90526.50,84592.12,100390.15,97490.58,83140.50,82423.89,100139.67,95137.39,89661.51,89202.77,...,86662.42,89290.78,87405.77,83805.90,81585.02,85322.11,82539.30,94971.19,82459.000000,85920.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633,113.04,123.05,119.76,113.32,109.56,112.38,121.73,110.69,115.95,113.50,...,118.91,116.27,119.95,108.72,111.58,113.72,119.07,107.12,-24172.942449,109.91
481,159.23,147.59,141.28,154.06,136.94,153.13,157.02,141.28,156.07,143.49,...,149.39,152.19,136.06,161.55,156.84,135.58,158.97,142.17,10138.315315,145.23
793,25787.28,27388.53,24924.79,28136.17,25902.79,25561.51,24747.78,27110.53,25875.04,26453.66,...,25390.84,27871.09,27834.13,26646.12,28117.93,26798.67,25878.47,25673.35,24716.700000,28480.10
1175,8.05,8.06,7.72,7.74,7.99,7.88,7.96,7.84,8.00,7.87,...,7.64,7.76,8.00,7.95,7.93,7.85,7.78,7.99,-26256.014410,7.84


In [8]:
from pyod.models.knn import KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

  and should_run_async(code)


In [9]:
## featurizers
def build_featurizers(columns):
    categorical_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
        ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    numeric_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value=0)),
    ])

    cat_cols = [c for c in categorical_columns if c in columns]
    num_cols = [c for c in numerical_columns if c in columns]
    
    feature_transformation = ColumnTransformer(transformers=[
        ('categorical_features', categorical_preprocessing, cat_cols),
        ('scaled_numeric', numeric_preprocessing, num_cols)
    ])

    return feature_transformation

In [10]:
predictors = {}

for col in categorical_columns + numerical_columns:
    predictors[col] = Pipeline(
        [('features', build_featurizers([col])),
         ('outlier_detector', KNN())
        ])
    
len(predictors)

100

In [11]:
for col in categorical_columns + numerical_columns:
    predictors[col].fit(df_outliers)

In [12]:
for col in categorical_columns + numerical_columns:
    df_outliers[col + "_outlier"] = predictors[col].predict(df_corrupted)

  and should_run_async(code)


In [13]:
df_outliers

  and should_run_async(code)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91_outlier,V92_outlier,V93_outlier,V94_outlier,V95_outlier,V96_outlier,V97_outlier,V98_outlier,V99_outlier,V100_outlier
802,626.94,736.00,755.85,768.98,720.86,812.95,806.68,755.28,843.98,933.09,...,0,0,0,0,0,0,0,0,0,0
1016,245.40,250.96,269.27,254.46,270.32,271.62,266.75,261.08,264.75,250.49,...,0,0,0,0,0,0,0,0,0,0
111,1.00,1.00,0.89,0.89,0.97,1.00,0.99,0.96,0.92,1.03,...,0,0,0,0,0,0,0,0,0,0
351,65587.77,65913.77,65572.12,65901.94,66829.75,68029.52,66790.10,67110.94,69842.12,70651.07,...,1,1,0,1,1,1,1,1,1,1
840,90526.50,84592.12,100390.15,97490.58,83140.50,82423.89,100139.67,95137.39,89661.51,89202.77,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633,113.04,123.05,119.76,113.32,109.56,112.38,121.73,110.69,115.95,113.50,...,0,0,0,0,0,0,0,0,0,0
481,159.23,147.59,141.28,154.06,136.94,153.13,157.02,141.28,156.07,143.49,...,0,0,0,0,0,0,0,0,0,0
793,25787.28,27388.53,24924.79,28136.17,25902.79,25561.51,24747.78,27110.53,25875.04,26453.66,...,1,0,1,0,1,0,0,0,0,1
1175,8.05,8.06,7.72,7.74,7.99,7.88,7.96,7.84,8.00,7.87,...,0,0,0,0,0,0,0,0,0,0


### Pyod Multiple Columns - features

In [14]:
df_outliers = df_corrupted.copy(deep=True)

  and should_run_async(code)


In [15]:
predictors = Pipeline(
    [('features', build_featurizers(numerical_columns + categorical_columns)),
     ('outlier_detector', KNN())
    ])
predictors.fit(df_outliers)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('categorical_features',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value='__NA__',
                                                                                 strategy='constant')),
                                                                  ('one_hot_encode',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  []),
                                                 ('scaled_numeric',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value=0,
                                                                                 strategy='constant'))]),
      

In [16]:
outliers = predictors.predict(df_outliers)
outliers

  and should_run_async(code)


array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

### Categorical from train, numerical from pyod

In [18]:
## single column based

In [17]:
def cat_out_detect(df_train, df_corrupted):
    df_outliers = df_corrupted[categorical_columns].copy()
    
    for col in df_train.columns:
        if col in categorical_columns:
            vals_train_unique = df_train[col].unique()
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            
            for i in df_corrupted[col].index:
                if df_corrupted.loc[i, col] in vals_train_unique:
                    df_outliers.loc[i, col + "_outlier"] = 0
                else:
                    df_outliers.loc[i, col + "_outlier"] = 1
                    
    return df_outliers

  and should_run_async(code)


In [18]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    df_outliers = df_corrupted[numerical_columns].copy()
    
    for col in df_train.columns:
        if col in numerical_columns:
            ## find indices of records with NaNs in col in df_corrupted
            nan_idx = df_corrupted[df_corrupted[col].isnull()].index
            non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
            
            ## pd series -> np column, needs to be 2D array
            ## taking only the non-NaN records in the corrupted data
            col_tr_arr = np.array(df_train[col]).reshape(-1,1)
            col_corr_arr = np.array(df_corrupted.loc[non_nan_idx][col]).reshape(-1,1)
            
            ## fit the dataset to the model
            model = pyod_model
            model.fit(col_tr_arr)
            
            ## predict raw anomaly score
            scores_pred = model.decision_function(col_corr_arr) * -1
            
            ## prediction of a datapoint category outlier or inlier
            y_pred = model.predict(col_corr_arr)
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            df_outliers.loc[non_nan_idx, col + "_outlier"] = y_pred ## 0: inlier, 1: outlier
            df_outliers.loc[nan_idx, col + "_outlier"] = 1
            
    return df_outliers

In [19]:
pyod_model = KNN()
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91_outlier,V92_outlier,V93_outlier,V94_outlier,V95_outlier,V96_outlier,V97_outlier,V98_outlier,V99_outlier,V100_outlier
802,626.94,736.00,755.85,768.98,720.86,812.95,806.68,755.28,843.98,933.09,...,0,0,0,0,0,0,0,0,0,0
1016,245.40,250.96,269.27,254.46,270.32,271.62,266.75,261.08,264.75,250.49,...,0,0,0,0,0,0,0,0,1,0
111,1.00,1.00,0.89,0.89,0.97,1.00,0.99,0.96,0.92,1.03,...,0,0,0,0,0,0,0,0,0,0
351,65587.77,65913.77,65572.12,65901.94,66829.75,68029.52,66790.10,67110.94,69842.12,70651.07,...,1,1,1,1,1,1,1,1,1,1
840,90526.50,84592.12,100390.15,97490.58,83140.50,82423.89,100139.67,95137.39,89661.51,89202.77,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633,113.04,123.05,119.76,113.32,109.56,112.38,121.73,110.69,115.95,113.50,...,0,0,0,0,0,0,0,0,1,0
481,159.23,147.59,141.28,154.06,136.94,153.13,157.02,141.28,156.07,143.49,...,0,0,0,0,0,0,0,0,0,0
793,25787.28,27388.53,24924.79,28136.17,25902.79,25561.51,24747.78,27110.53,25875.04,26453.66,...,1,0,0,0,0,0,0,0,0,0
1175,8.05,8.06,7.72,7.74,7.99,7.88,7.96,7.84,8.00,7.87,...,0,0,0,0,0,0,0,0,1,0


In [27]:
from pyod.models.pca import PCA

pyod_model = PCA() # n_components = min(n_samples, n_features) default  # n_selected_components = None
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91_outlier,V92_outlier,V93_outlier,V94_outlier,V95_outlier,V96_outlier,V97_outlier,V98_outlier,V99_outlier,V100_outlier
802,626.94,736.00,755.85,768.98,720.86,812.95,806.68,755.28,843.98,933.09,...,0,0,0,0,0,0,0,0,0,0
1016,245.40,250.96,269.27,254.46,270.32,271.62,266.75,261.08,264.75,250.49,...,0,0,0,0,0,0,0,0,1,0
111,1.00,1.00,0.89,0.89,0.97,1.00,0.99,0.96,0.92,1.03,...,1,1,1,1,1,1,1,1,1,1
351,65587.77,65913.77,65572.12,65901.94,66829.75,68029.52,66790.10,67110.94,69842.12,70651.07,...,1,1,1,1,1,1,1,1,1,1
840,90526.50,84592.12,100390.15,97490.58,83140.50,82423.89,100139.67,95137.39,89661.51,89202.77,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633,113.04,123.05,119.76,113.32,109.56,112.38,121.73,110.69,115.95,113.50,...,0,0,0,0,0,0,0,0,1,0
481,159.23,147.59,141.28,154.06,136.94,153.13,157.02,141.28,156.07,143.49,...,0,0,0,0,0,0,0,0,0,0
793,25787.28,27388.53,24924.79,28136.17,25902.79,25561.51,24747.78,27110.53,25875.04,26453.66,...,0,0,0,0,0,0,0,0,0,0
1175,8.05,8.06,7.72,7.74,7.99,7.88,7.96,7.84,8.00,7.87,...,0,0,0,0,0,0,0,0,1,0


In [29]:
from pyod.models.cblof import CBLOF

pyod_model = CBLOF() # n_clusters = 8 default
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)






Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91_outlier,V92_outlier,V93_outlier,V94_outlier,V95_outlier,V96_outlier,V97_outlier,V98_outlier,V99_outlier,V100_outlier
802,626.94,736.00,755.85,768.98,720.86,812.95,806.68,755.28,843.98,933.09,...,0,0,0,0,0,0,0,0,0,0
1016,245.40,250.96,269.27,254.46,270.32,271.62,266.75,261.08,264.75,250.49,...,0,0,0,0,0,0,0,0,1,0
111,1.00,1.00,0.89,0.89,0.97,1.00,0.99,0.96,0.92,1.03,...,0,0,0,0,0,0,0,0,0,0
351,65587.77,65913.77,65572.12,65901.94,66829.75,68029.52,66790.10,67110.94,69842.12,70651.07,...,1,1,1,1,1,1,1,1,1,1
840,90526.50,84592.12,100390.15,97490.58,83140.50,82423.89,100139.67,95137.39,89661.51,89202.77,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633,113.04,123.05,119.76,113.32,109.56,112.38,121.73,110.69,115.95,113.50,...,0,0,0,0,0,0,0,0,1,0
481,159.23,147.59,141.28,154.06,136.94,153.13,157.02,141.28,156.07,143.49,...,0,0,0,0,0,0,0,0,0,0
793,25787.28,27388.53,24924.79,28136.17,25902.79,25561.51,24747.78,27110.53,25875.04,26453.66,...,0,0,0,0,0,0,0,0,0,0
1175,8.05,8.06,7.72,7.74,7.99,7.88,7.96,7.84,8.00,7.87,...,0,0,0,0,0,0,0,0,1,0


In [30]:
from pyod.models.sos import SOS

pyod_model = CBLOF() # n_clusters = 8 default
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)






Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V91_outlier,V92_outlier,V93_outlier,V94_outlier,V95_outlier,V96_outlier,V97_outlier,V98_outlier,V99_outlier,V100_outlier
802,626.94,736.00,755.85,768.98,720.86,812.95,806.68,755.28,843.98,933.09,...,0,0,0,0,0,0,0,0,0,0
1016,245.40,250.96,269.27,254.46,270.32,271.62,266.75,261.08,264.75,250.49,...,0,0,0,0,0,0,0,0,1,0
111,1.00,1.00,0.89,0.89,0.97,1.00,0.99,0.96,0.92,1.03,...,0,0,0,0,0,0,0,0,0,0
351,65587.77,65913.77,65572.12,65901.94,66829.75,68029.52,66790.10,67110.94,69842.12,70651.07,...,1,1,1,1,1,1,1,1,1,1
840,90526.50,84592.12,100390.15,97490.58,83140.50,82423.89,100139.67,95137.39,89661.51,89202.77,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633,113.04,123.05,119.76,113.32,109.56,112.38,121.73,110.69,115.95,113.50,...,0,0,0,0,0,0,0,0,1,0
481,159.23,147.59,141.28,154.06,136.94,153.13,157.02,141.28,156.07,143.49,...,0,0,0,0,0,0,0,0,0,0
793,25787.28,27388.53,24924.79,28136.17,25902.79,25561.51,24747.78,27110.53,25875.04,26453.66,...,0,0,0,0,0,1,0,0,0,0
1175,8.05,8.06,7.72,7.74,7.99,7.88,7.96,7.84,8.00,7.87,...,0,0,0,0,0,0,0,0,1,0


### Sklearn