## Dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/home/rupali/Documents/Master Thesis/jenga')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [2]:
dataset = Dataset("hill-valley")

all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: hill-valley
Found 0 categorical and 100 numeric features 



### Get training and test sets

In [3]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

In [4]:
## use categorical columns as strings
def cat_cols_to_str(df):
    for col in df.columns:
        if pd.api.types.is_categorical_dtype(df[col]):
            df[col] = df[col].astype(str)

    return df

In [5]:
### if we don't convert the categorical columns to str, the swapping corruption doesn't let us assign new values to the column: "Cannot setitem on a Categorical with a new category, set the categories first"
train_data = cat_cols_to_str(train_data)
test_data = cat_cols_to_str(test_data)

## Workaround to get the data when OpenML decided not to work :D

In [2]:
train_dat = pd.read_csv("/home/rupali/Downloads/Hill_Valley_without_noise_Training.data", sep=",")
train_data = train_dat.loc[:, train_dat.columns != 'class']
train_labels = train_dat["class"]

In [3]:
test_dat = pd.read_csv("/home/rupali/Downloads/Hill_Valley_without_noise_Testing.data", sep=",")
test_data = test_dat.loc[:, test_dat.columns != 'class']
test_labels = test_dat["class"]

In [4]:
categorical_columns = []
numerical_columns = []

for col in train_data.columns:
    if pd.api.types.is_categorical_dtype(train_data[col]):
        categorical_columns.append(col)
    elif pd.api.types.is_numeric_dtype(train_data[col]):
        numerical_columns.append(col)

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Found 0 categorical and 100 numeric features 



## Defined Model

In [5]:
from sklearn.linear_model import SGDClassifier

learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

## Corruptions using PPP

In [6]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise]
fraction = 0.5
num_repetitions = 5

In [7]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Generating corrupted training data on 606 rows... 

	perturbation: MissingValues: {'column': 'X43', 'fraction': 0.5, 'sampling': 'MCAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'X17', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: GaussianNoise: {'column': 'X20', 'fraction': 0.5, 'sampling': 'MNAR'}

Generating corrupted training data on 606 rows... 

	perturbation: MissingValues: {'column': 'X63', 'fraction': 0.5, 'sampling': 'MCAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'X92', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: GaussianNoise: {'column': 'X79', 'fraction': 0.5, 'sampling': 'MAR'}

Generating corrupted training data on 606 rows... 

	perturbation: MissingValues: {'column': 'X72', 'fraction': 0.5, 'sampling': 'MCAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'X87', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: GaussianNoise: {'column': 'X79', 'fraction': 0.5, 'sa

## Cleaning

### PPP Cleaning

In [8]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, PyODPCAOutlierDetection, PyODCBLOFOutlierDetection, PyODSOSOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    (PyODPCAOutlierDetection, MeanModeImputation),
    (PyODCBLOFOutlierDetection, MeanModeImputation),
    (PyODSOSOutlierDetection, MeanModeImputation)
]

  and should_run_async(code)


In [9]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)


Applying cleaners... 

PPP score no cleaning: {'roc_auc_score': 0.6551855686958418, 'classification_report': {'0': {'precision': 0.5685279187817259, 'recall': 0.7593220338983051, 'f1-score': 0.6502177068214805, 'support': 295}, '1': {'precision': 0.6650943396226415, 'recall': 0.4533762057877814, 'f1-score': 0.5391969407265774, 'support': 311}, 'accuracy': 0.6023102310231023, 'macro avg': {'precision': 0.6168111292021837, 'recall': 0.6063491198430433, 'f1-score': 0.594707323774029, 'support': 606}, 'weighted avg': {'precision': 0.6180859334377075, 'recall': 0.6023102310231023, 'f1-score': 0.5932417030995089, 'support': 606}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.25, 'Recall': 0.5, 'F1-score': 0.3333333333333333, 'Accuracy': 0.5}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 82632108746417.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.6287372608861518, 'classification_report': {'0': {'precision': 0.564625850340136, 'recall': 0.8440677966101695, 'f1-score': 0.6766304347826086, 'support': 295}, '1': {'precision': 0.7212121212121212, 'recall': 0.38263665594855306, 'f1-score': 0.5, 'support': 311}, 'accuracy': 0.6072607260726073, 'macro avg': {'precision': 0.6429189857761286, 'recall': 0.6133522262793613, 'f1-score': 0.5883152173913043, 'support': 606}, 'weighted avg': {'precision': 0.6449861312661878, 'recall': 0.6072607260726073, 'f1-score': 0.5859834624766824, 'support': 606}}}

Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.8408569300146812, 'Recall': 0.8278327832783279, 'F1-score': 0.8262407435124338, 'Accuracy': 0.8278327832783279}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 384872283.31639314}
Clea








Outlier detection method: PyODCBLOFOutlierDetection, Outlier Detection Score: {'Precision': 0.8098841669267345, 'Recall': 0.7832783278327833, 'F1-score': 0.7786253488944451, 'Accuracy': 0.7832783278327833}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 399630584.565965}
Cleaner: (PyODCBLOFOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5408523625265682, 'classification_report': {'0': {'precision': 0.48267898383371827, 'recall': 0.7084745762711865, 'f1-score': 0.5741758241758241, 'support': 295}, '1': {'precision': 0.5028901734104047, 'recall': 0.2797427652733119, 'f1-score': 0.3595041322314049, 'support': 311}, 'accuracy': 0.4884488448844885, 'macro avg': {'precision': 0.4927845786220615, 'recall': 0.4941086707722492, 'f1-score': 0.46683997820361456, 'support': 606}, 'weighted avg': {'precision': 0.4930513930059121, 'recall': 0.4884488448844885, 'f1-score': 0.4640060284749753

## Model Evaluation

### With learner and param_grid

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# preprocessing pipeline for numerical columns
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns)
])

## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

  and should_run_async(code)


In [19]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
model = grid_search.fit(train_data, train_labels)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [20]:
y_pred = model.predict(test_data)

  and should_run_async(code)


In [21]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(model.predict_proba(test_data))[1])

  and should_run_async(code)


0.48281130634071817

In [22]:
model.predict_proba(test_data)

  and should_run_async(code)


array([[9.999474e-01, 5.256469e-05],
       [9.997258e-01, 2.741634e-04],
       [9.999984e-01, 1.568016e-06],
       [9.999999e-01, 6.675940e-08],
       ...,
       [9.999999e-01, 6.306299e-08],
       [9.986168e-01, 1.383178e-03],
       [2.497819e-01, 7.502181e-01],
       [9.999855e-01, 1.451340e-05]])

In [23]:
from sklearn.metrics import classification_report

classification_report(test_labels, y_pred, output_dict=True)

  and should_run_async(code)


{'F': {'precision': 0.8428571428571429,
  'recall': 0.9915966386554622,
  'f1-score': 0.9111969111969112,
  'support': 119},
 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 22},
 'accuracy': 0.8368794326241135,
 'macro avg': {'precision': 0.42142857142857143,
  'recall': 0.4957983193277311,
  'f1-score': 0.4555984555984556,
  'support': 141},
 'weighted avg': {'precision': 0.7113475177304964,
  'recall': 0.8368794326241135,
  'f1-score': 0.7690243434924285,
  'support': 141}}

### With Autogluon

In [14]:
from autogluon.tabular import TabularPredictor

## training
train_data["class"] = train_labels

label = "class"

## folder to save trained models
#save_path = '/home/rupali/Documents/Master Thesis/jenga/autogluon_models/'

predictor = TabularPredictor(label=label).fit(train_data)

  and should_run_async(code)
Level 25:autogluon.core.utils.utils:No path specified. Models will be saved in: "AutogluonModels/ag-20210323_212707/"
INFO:autogluon.tabular.learner.default_learner:Beginning AutoGluon training ...
INFO:autogluon.tabular.learner.default_learner:AutoGluon will save models to "AutogluonModels/ag-20210323_212707/"
INFO:autogluon.tabular.learner.default_learner:AutoGluon Version:  0.1.0
INFO:autogluon.tabular.learner.default_learner:Train Data Rows:    329
INFO:autogluon.tabular.learner.default_learner:Train Data Columns: 16
INFO:autogluon.tabular.learner.default_learner:Preprocessing data ...
Level 25:autogluon.core.utils.utils:AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
INFO:autogluon.core.utils.utils:	2 unique label values:  ['F', 'T']
Level 25:autogluon.core.utils.utils:	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_t

ERROR:autogluon.tabular.trainer.abstract_trainer:		libopenblas.so.0: cannot open shared object file: No such file or directory
ERROR:autogluon.tabular.trainer.abstract_trainer:Detailed Traceback:
Traceback (most recent call last):
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/tabular/trainer/abstract_trainer.py", line 911, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, **model_fit_kwargs)
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/tabular/trainer/abstract_trainer.py", line 883, in _train_single
    model.fit(X=X, y=y, X_val=X_val, y_val=y_val, **model_fit_kwargs)
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 405, in fit
    self._fit(**kwargs)
  File "/home/rupali/.local/lib/python3.8/site-packages/autogluon/tabular/models/tabular_nn/tabular_nn_model.py", line 177, in _fit
    try_import_mxnet()
  File "/home/rupali/.local/lib/python3.8/site-package

█

INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	5.44s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.1s	 = Validation runtime
INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: LightGBMLarge ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.47s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.02s	 = Validation runtime


█

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: WeightedEnsemble_L2 ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.41s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.0s	 = Validation runtime
INFO:autogluon.tabular.learner.default_learner:AutoGluon training complete, total runtime = 11.56s ...
INFO:root:TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210323_212707/")


In [15]:
## test on original data
y_pred_test = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_test, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8368794326241135,
    "accuracy_score": 0.8368794326241135,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8368794326241135
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8368794326241135,
        "recall": 1.0,
        "f1-score": 0.9111969111969112,
        "support": 118
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 23
    }

In [16]:
## test on corrupted data
y_pred_corrupted = predictor.predict(df_corrupted)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_corrupted, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8368794326241135,
    "accuracy_score": 0.8368794326241135,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8368794326241135
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8368794326241135,
        "recall": 1.0,
        "f1-score": 0.9111969111969112,
        "support": 118
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 23
    }

In [17]:
## test on cleaned data
y_pred_cleaned = predictor.predict(df_cleaned)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8368794326241135,
    "accuracy_score": 0.8368794326241135,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8368794326241135
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "F": {
        "precision": 0.8368794326241135,
        "recall": 1.0,
        "f1-score": 0.9111969111969112,
        "support": 118
    },
    "T": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 23
    }

In [18]:
## We can evaluate the performance of each individual trained model on our (labeled) test data
test_data["class"] = test_labels
predictor.leaderboard(test_data, silent=True)

  and should_run_async(code)


█

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.836879,0.863636,0.019441,0.018222,0.346764,0.019441,0.018222,0.346764,1,True,8
1,LightGBMLarge,0.836879,0.863636,0.020956,0.018564,0.46885,0.020956,0.018564,0.46885,1,True,12
2,WeightedEnsemble_L2,0.836879,0.863636,0.024096,0.019142,0.875167,0.003141,0.000578,0.406317,2,True,13
3,LightGBM,0.836879,0.863636,0.037066,0.015656,0.333007,0.037066,0.015656,0.333007,1,True,7
4,RandomForestEntr,0.836879,0.863636,0.103467,0.105233,0.700574,0.103467,0.105233,0.700574,1,True,2
5,ExtraTreesGini,0.836879,0.833333,0.147363,0.087352,0.481024,0.147363,0.087352,0.481024,1,True,3
6,RandomForestGini,0.829787,0.863636,0.092181,0.08489,0.723594,0.092181,0.08489,0.723594,1,True,1
7,NeuralNetFastAI,0.829787,0.863636,0.158394,0.104127,5.44004,0.158394,0.104127,5.44004,1,True,11
8,ExtraTreesEntr,0.822695,0.818182,0.162067,0.086309,0.477268,0.162067,0.086309,0.477268,1,True,4
9,XGBoost,0.815603,0.863636,0.024807,0.008342,0.266442,0.024807,0.008342,0.266442,1,True,10


In [19]:
perf

  and should_run_async(code)


OrderedDict([('accuracy', 0.8368794326241135),
             ('accuracy_score', 0.8368794326241135),
             ('balanced_accuracy_score', 0.5),
             ('matthews_corrcoef', 0.0),
             ('f1_score', 0.8368794326241135),
             ('confusion_matrix',
                   F  T
              F  118  0
              T   23  0),
             ('classification_report',
              {'F': {'precision': 0.8368794326241135,
                'recall': 1.0,
                'f1-score': 0.9111969111969112,
                'support': 118},
               'T': {'precision': 0.0,
                'recall': 0.0,
                'f1-score': 0.0,
                'support': 23},
               'accuracy': 0.8368794326241135,
               'macro avg': {'precision': 0.41843971631205673,
                'recall': 0.5,
                'f1-score': 0.4555984555984556,
                'support': 141},
               'weighted avg': {'precision': 0.7003671847492581,
                'recall': 0.83

In [39]:
list(perf.items())[6][1]

{'F': {'precision': 0.8613138686131386,
  'recall': 0.9915966386554622,
  'f1-score': 0.9218749999999999,
  'support': 119},
 'T': {'precision': 0.75,
  'recall': 0.13636363636363635,
  'f1-score': 0.23076923076923075,
  'support': 22},
 'accuracy': 0.8581560283687943,
 'macro avg': {'precision': 0.8056569343065694,
  'recall': 0.5639801375095492,
  'f1-score': 0.5763221153846153,
  'support': 141},
 'weighted avg': {'precision': 0.8439457472692448,
  'recall': 0.8581560283687943,
  'f1-score': 0.8140428941625749,
  'support': 141}}

In [25]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(predictor.predict_proba(test_data)).to_numpy()[1])

1.0

In [20]:
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=False)
perf

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8368794326241135


0.8368794326241135

## Undefined Model: Default setting: whole process

In [10]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise, CategoricalShift]
fraction = 0.5
num_repetitions = 5

  and should_run_async(code)


In [11]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

No path specified. Models will be saved in: "AutogluonModels/ag-20210324_221101/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210324_221101/"
AutoGluon Version:  0.1.0
Train Data Rows:    606
Train Data Columns: 100
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3047.73 MB
	Train Data (Original)  Memory Usage: 0.48 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of

█

	0.6148	 = Validation accuracy score
	7.82s	 = Training runtime
	0.2s	 = Validation runtime
Fitting model: LightGBMLarge ...
	0.6393	 = Validation accuracy score
	5.52s	 = Training runtime
	0.01s	 = Validation runtime


█

Fitting model: WeightedEnsemble_L2 ...
	0.6885	 = Validation accuracy score
	0.5s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 30.87s ...
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210324_221101/")



Generating corrupted training data on 606 rows... 

	perturbation: MissingValues: {'column': 'X98', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'X6', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: GaussianNoise: {'column': 'X66', 'fraction': 0.5, 'sampling': 'MAR'}


IndexError: Cannot choose from an empty sequence

In [12]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, PyODPCAOutlierDetection, PyODCBLOFOutlierDetection, PyODSOSOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    (PyODPCAOutlierDetection, MeanModeImputation),
    (PyODCBLOFOutlierDetection, MeanModeImputation),
    (PyODSOSOutlierDetection, MeanModeImputation)
]

  and should_run_async(code)


In [13]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)

Evaluation: accuracy on test data: 0.5693069306930693
Evaluations on test data:
{
    "accuracy": 0.5693069306930693,
    "accuracy_score": 0.5693069306930693,
    "balanced_accuracy_score": 0.5685268951986484,
    "matthews_corrcoef": 0.13727616802960593,
    "f1_score": 0.5693069306930693
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5598591549295775,
        "recall": 0.5389830508474577,
        "f1-score": 0.5492227979274611,
        "support": 295
    },
    "1": {
        "precision": 0.577639751552795,
        "recall": 0.5980707395498392,
        "f1-score": 0.5876777251184835,
        "support": 311
    },
    "accuracy": 0.5693069306930693,
    "macro avg": {
        "precision": 0.5687494532411863,
        "recall": 0.5685268951986484,
        "f1-score": 0.5684502615229723,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5689841805893475,
        "recall": 0.5693069306930693,
        "f1-score": 0.56895791732747


Applying cleaners... 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Evaluation: accuracy on test data: 0.5693069306930693
Evaluations on test data:
{
    "accuracy": 0.5693069306930693,
    "accuracy_score": 0.5693069306930693,
    "balanced_accuracy_score": 0.5685268951986484,
    "matthews_corrcoef": 0.13727616802960593,
    "f1_score": 0.5693069306930693
}


PPP score no cleaning: {'roc_auc_score': 0.5804458008610823, 'classification_report': {'0': {'precision': 0.5598591549295775, 'recall': 0.5389830508474577, 'f1-score': 0.5492227979274611, 'support': 295}, '1': {'precision': 0.577639751552795, 'recall': 0.5980707395498392, 'f1-score': 0.5876777251184835, 'support': 311}, 'accuracy': 0.5693069306930693, 'macro avg': {'precision': 0.5687494532411863, 'recall': 0.5685268951986484, 'f1-score': 0.5684502615229723, 'support': 606}, 'weighted avg': {'precision': 0.5689841805893475, 'recall': 0.5693069306930693, 'f1-score': 0.5689579173274742, 'support': 606}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.25, 'Recall': 0.5, 'F1-score': 0.3333333333333333, 'Accuracy': 0.5}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 82632108746417.0}


Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5598591549295775,
        "recall": 0.5389830508474577,
        "f1-score": 0.5492227979274611,
        "support": 295
    },
    "1": {
        "precision": 0.577639751552795,
        "recall": 0.5980707395498392,
        "f1-score": 0.5876777251184835,
        "support": 311
    },
    "accuracy": 0.5693069306930693,
    "macro avg": {
        "precision": 0.5687494532411863,
        "recall": 0.5685268951986484,
        "f1-score": 0.5684502615229723,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5689841805893475,
        "recall": 0.5693069306930693,
        "f1-score": 0.5689579173274742,
        "support": 606
    }
}


Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5804458008610823, 'classification_report': {'0': {'precision': 0.5598591549295775, 'recall': 0.5389830508474577, 'f1-score': 0.5492227979274611, 'support': 295}, '1': {'precision': 0.577639751552795, 'recall': 0.5980707395498392, 'f1-score': 0.5876777251184835, 'support': 311}, 'accuracy': 0.5693069306930693, 'macro avg': {'precision': 0.5687494532411863, 'recall': 0.5685268951986484, 'f1-score': 0.5684502615229723, 'support': 606}, 'weighted avg': {'precision': 0.5689841805893475, 'recall': 0.5693069306930693, 'f1-score': 0.5689579173274742, 'support': 606}}}


Evaluation: accuracy on test data: 0.5610561056105611
Evaluations on test data:
{
    "accuracy": 0.5610561056105611,
    "accuracy_score": 0.5610561056105611,
    "balanced_accuracy_score": 0.5613602921140116,
    "matthews_corrcoef": 0.12270186173615062,
    "f1_score": 0.5610561056105611
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5469255663430421,
        "recall": 0.5728813559322034,
        "f1-score": 0.5596026490066225,
        "support": 295
    },
    "1": {
        "precision": 0.5757575757575758,
        "recall": 0.5498392282958199,
        "f1-score": 0.5625,
        "support": 311
    },
    "accuracy": 0.5610561056105611,
    "macro avg": {
        "precision": 0.5613415710503089,
        "recall": 0.5613602921140116,
        "f1-score": 0.5610513245033113,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5617221916366394,
        "recall": 0.5610561056105611,
        "f1-score": 0.5610895733613096,
       


Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.8408569300146812, 'Recall': 0.8278327832783279, 'F1-score': 0.8262407435124338, 'Accuracy': 0.8278327832783279}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 384872283.31639314}
Cleaner: (PyODKNNOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.573137500681236, 'classification_report': {'0': {'precision': 0.5469255663430421, 'recall': 0.5728813559322034, 'f1-score': 0.5596026490066225, 'support': 295}, '1': {'precision': 0.5757575757575758, 'recall': 0.5498392282958199, 'f1-score': 0.5625, 'support': 311}, 'accuracy': 0.5610561056105611, 'macro avg': {'precision': 0.5613415710503089, 'recall': 0.5613602921140116, 'f1-score': 0.5610513245033113, 'support': 606}, 'weighted avg': {'precision': 0.5617221916366394, 'recall': 0.5610561056105611, 'f1-score': 0.5610895733613096, 'support': 606}

Evaluation: accuracy on test data: 0.5511551155115512
Evaluations on test data:
{
    "accuracy": 0.5511551155115512,
    "accuracy_score": 0.5511551155115512,
    "balanced_accuracy_score": 0.5538939451741239,
    "matthews_corrcoef": 0.11007944236596345,
    "f1_score": 0.5511551155115512
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5315068493150685,
        "recall": 0.6576271186440678,
        "f1-score": 0.5878787878787878,
        "support": 295
    },
    "1": {
        "precision": 0.5809128630705395,
        "recall": 0.45016077170418006,
        "f1-score": 0.5072463768115942,
        "support": 311
    },
    "accuracy": 0.5511551155115512,
    "macro avg": {
        "precision": 0.5562098561928039,
        "recall": 0.5538939451741239,
        "f1-score": 0.5475625823451911,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5568620807968366,
        "recall": 0.5511551155115512,
        "f1-score": 0.546498128073


Outlier detection method: PyODIsolationForestOutlierDetection, Outlier Detection Score: {'Precision': 0.7322447494913131, 'Recall': 0.7084708470847084, 'F1-score': 0.7039280901824226, 'Accuracy': 0.7084708470847084}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 656300362.0999852}
Cleaner: (PyODIsolationForestOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5698893672679711, 'classification_report': {'0': {'precision': 0.5315068493150685, 'recall': 0.6576271186440678, 'f1-score': 0.5878787878787878, 'support': 295}, '1': {'precision': 0.5809128630705395, 'recall': 0.45016077170418006, 'f1-score': 0.5072463768115942, 'support': 311}, 'accuracy': 0.5511551155115512, 'macro avg': {'precision': 0.5562098561928039, 'recall': 0.5538939451741239, 'f1-score': 0.5475625823451911, 'support': 606}, 'weighted avg': {'precision': 0.5568620807968366, 'recall': 0.5511551155115512, 'f1-score'

Evaluation: accuracy on test data: 0.5412541254125413
Evaluations on test data:
{
    "accuracy": 0.5412541254125413,
    "accuracy_score": 0.5412541254125413,
    "balanced_accuracy_score": 0.5438116518611369,
    "matthews_corrcoef": 0.08924300449788912,
    "f1_score": 0.5412541254125413
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5235457063711911,
        "recall": 0.6406779661016949,
        "f1-score": 0.5762195121951219,
        "support": 295
    },
    "1": {
        "precision": 0.5673469387755102,
        "recall": 0.44694533762057875,
        "f1-score": 0.5,
        "support": 311
    },
    "accuracy": 0.5412541254125413,
    "macro avg": {
        "precision": 0.5454463225733507,
        "recall": 0.5438116518611369,
        "f1-score": 0.538109756097561,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5460245566644968,
        "recall": 0.5412541254125413,
        "f1-score": 0.5371035579167673,
        "s


Outlier detection method: PyODPCAOutlierDetection, Outlier Detection Score: {'Precision': 0.70036142691356, 'Recall': 0.6903190319031903, 'F1-score': 0.6869518678156635, 'Accuracy': 0.6903190319031903}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 402189918.1321284}
Cleaner: (PyODPCAOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5566134394244917, 'classification_report': {'0': {'precision': 0.5235457063711911, 'recall': 0.6406779661016949, 'f1-score': 0.5762195121951219, 'support': 295}, '1': {'precision': 0.5673469387755102, 'recall': 0.44694533762057875, 'f1-score': 0.5, 'support': 311}, 'accuracy': 0.5412541254125413, 'macro avg': {'precision': 0.5454463225733507, 'recall': 0.5438116518611369, 'f1-score': 0.538109756097561, 'support': 606}, 'weighted avg': {'precision': 0.5460245566644968, 'recall': 0.5412541254125413, 'f1-score': 0.5371035579167673, 'support': 606}}}






Evaluation: accuracy on test data: 0.5561056105610561
Evaluations on test data:
{
    "accuracy": 0.5561056105610561,
    "accuracy_score": 0.5561056105610561,
    "balanced_accuracy_score": 0.5587170962995258,
    "matthews_corrcoef": 0.11968413084772356,
    "f1_score": 0.5561056105610561
}


Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5359116022099447,
        "recall": 0.6576271186440678,
        "f1-score": 0.5905631659056316,
        "support": 295
    },
    "1": {
        "precision": 0.5860655737704918,
        "recall": 0.45980707395498394,
        "f1-score": 0.5153153153153154,
        "support": 311
    },
    "accuracy": 0.5561056105610561,
    "macro avg": {
        "precision": 0.5609885879902183,
        "recall": 0.5587170962995258,
        "f1-score": 0.5529392406104735,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.561650686624681,
        "recall": 0.5561056105610561,
        "f1-score": 0.5519458696455848,
        "support": 606
    }
}



Outlier detection method: PyODCBLOFOutlierDetection, Outlier Detection Score: {'Precision': 0.8187954544245285, 'Recall': 0.7926292629262927, 'F1-score': 0.7882947627871975, 'Accuracy': 0.7926292629262927}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 389087104.71174496}
Cleaner: (PyODCBLOFOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5764292332007194, 'classification_report': {'0': {'precision': 0.5359116022099447, 'recall': 0.6576271186440678, 'f1-score': 0.5905631659056316, 'support': 295}, '1': {'precision': 0.5860655737704918, 'recall': 0.45980707395498394, 'f1-score': 0.5153153153153154, 'support': 311}, 'accuracy': 0.5561056105610561, 'macro avg': {'precision': 0.5609885879902183, 'recall': 0.5587170962995258, 'f1-score': 0.5529392406104735, 'support': 606}, 'weighted avg': {'precision': 0.561650686624681, 'recall': 0.5561056105610561, 'f1-score': 0.5519458696455848

Evaluation: accuracy on test data: 0.5478547854785478
Evaluations on test data:
{
    "accuracy": 0.5478547854785478,
    "accuracy_score": 0.5478547854785478,
    "balanced_accuracy_score": 0.5459698076189439,
    "matthews_corrcoef": 0.09289220351441713,
    "f1_score": 0.5478547854785478
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5405405405405406,
        "recall": 0.4745762711864407,
        "f1-score": 0.5054151624548736,
        "support": 295
    },
    "1": {
        "precision": 0.553314121037464,
        "recall": 0.617363344051447,
        "f1-score": 0.5835866261398177,
        "support": 311
    },
    "accuracy": 0.5478547854785478,
    "macro avg": {
        "precision": 0.5469273307890024,
        "recall": 0.5459698076189439,
        "f1-score": 0.5445008942973457,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5470959589143743,
        "recall": 0.5478547854785478,
        "f1-score": 0.545532860814638


Outlier detection method: PyODSOSOutlierDetection, Outlier Detection Score: {'Precision': 0.6360021977299181, 'Recall': 0.6259625962596259, 'F1-score': 0.6050356549950111, 'Accuracy': 0.6259625962596259}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 53433142764388.34}
Cleaner: (PyODSOSOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5539484440568968, 'classification_report': {'0': {'precision': 0.5405405405405406, 'recall': 0.4745762711864407, 'f1-score': 0.5054151624548736, 'support': 295}, '1': {'precision': 0.553314121037464, 'recall': 0.617363344051447, 'f1-score': 0.5835866261398177, 'support': 311}, 'accuracy': 0.5478547854785478, 'macro avg': {'precision': 0.5469273307890024, 'recall': 0.5459698076189439, 'f1-score': 0.5445008942973457, 'support': 606}, 'weighted avg': {'precision': 0.5470959589143743, 'recall': 0.5478547854785478, 'f1-score': 0.5455328608146387, 'supp

## Try Cleaners

In [25]:
from pyod.models.knn import KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

  and should_run_async(code)


In [26]:
## featurizers
def build_featurizers(columns):
    categorical_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
        ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    numeric_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value=0)),
    ])

    cat_cols = [c for c in categorical_columns if c in columns]
    num_cols = [c for c in numerical_columns if c in columns]
    
    feature_transformation = ColumnTransformer(transformers=[
        ('categorical_features', categorical_preprocessing, cat_cols),
        ('scaled_numeric', numeric_preprocessing, num_cols)
    ])

    return feature_transformation

### Pyod Single Column - features

In [24]:
df_outliers = df_corrupted.copy(deep=True)
df_outliers

  and should_run_async(code)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91,X92,X93,X94,X95,X96,X97,X98,X99,X100
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,1049.984129,1049.984130,1.049984e+05,1049.984130,1049.984130,1049.984130,1049.984130,1049.984130,1049.984130,1049.984130
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,61.472330,61.620700,6.173847e+01,61.831941,61.906135,61.965026,62.011769,62.048871,62.078320,62.101694
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,2326.756690,2471.695866,2.593019e+05,2694.574841,2779.583376,2850.741001,2910.304521,2960.163029,3001.897815,3036.832521
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,6375.492069,6375.492097,6.375492e+05,6375.492136,6375.492150,6375.492160,6375.492168,6375.492174,6375.492179,6375.492183
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,16.616014,16.616014,1.661601e+01,16.616014,16.616014,16.616014,16.616014,16.616014,16.616014,16.616014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,847.475290,847.480419,8.474845e+04,847.487632,847.490132,847.492099,847.493647,847.494865,847.495824,847.496579
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,24706.485310,24706.482190,2.470648e+06,24706.477590,24706.475910,24706.474540,24706.473430,24706.472520,24706.471780,24706.471180
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,809.363718,807.175129,8.054055e+04,803.974735,802.817854,801.882455,801.126137,800.514614,800.020165,799.620378
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,906.419280,906.419254,9.064192e+04,906.419218,906.419206,906.419196,906.419189,906.419183,906.419178,906.419174


In [27]:
predictors = {}

for col in categorical_columns + numerical_columns:
    predictors[col] = Pipeline(
        [('features', build_featurizers([col])),
         ('outlier_detector', KNN())
        ])
    
len(predictors)

100

In [28]:
for col in categorical_columns + numerical_columns:
    predictors[col].fit(df_outliers)

In [29]:
for col in categorical_columns + numerical_columns:
    df_outliers[col + "_outlier"] = predictors[col].predict(df_corrupted)

  and should_run_async(code)


In [30]:
df_outliers

  and should_run_async(code)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91_outlier,X92_outlier,X93_outlier,X94_outlier,X95_outlier,X96_outlier,X97_outlier,X98_outlier,X99_outlier,X100_outlier
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,0,0,0,0,0,0,0,0,0,0
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,0,0,0,0,0,0,0,0,0,0
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,0,0,0,0,0,0,0,0,0,0
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,0,0,0,0,0,0,0,0,0,0
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,0,0,0,0,0,0,0,0,0,0
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,0,0,1,0,0,0,0,0,0,1
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,0,0,0,0,0,0,0,0,0,0
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,0,0,0,0,0,0,0,0,0,0


### Pyod Multiple Columns - features

In [31]:
df_outliers = df_corrupted.copy(deep=True)

  and should_run_async(code)


In [32]:
predictors = Pipeline(
    [('features', build_featurizers(numerical_columns + categorical_columns)),
     ('outlier_detector', KNN())
    ])
predictors.fit(df_outliers)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('categorical_features',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value='__NA__',
                                                                                 strategy='constant')),
                                                                  ('one_hot_encode',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  []),
                                                 ('scaled_numeric',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value=0,
                                                                                 strategy='constant'))]),
      

In [33]:
outliers = predictors.predict(df_outliers)
outliers

  and should_run_async(code)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

### Categorical from train, numerical from pyod

In [18]:
## single column based

In [34]:
def cat_out_detect(df_train, df_corrupted):
    df_outliers = df_corrupted[categorical_columns].copy()
    
    for col in df_train.columns:
        if col in categorical_columns:
            vals_train_unique = df_train[col].unique()
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            
            for i in df_corrupted[col].index:
                if df_corrupted.loc[i, col] in vals_train_unique:
                    df_outliers.loc[i, col + "_outlier"] = 0
                else:
                    df_outliers.loc[i, col + "_outlier"] = 1
                    
    return df_outliers

  and should_run_async(code)


In [35]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    df_outliers = df_corrupted[numerical_columns].copy()
    
    for col in df_train.columns:
        if col in numerical_columns:
            ## find indices of records with NaNs in col in df_corrupted
            nan_idx = df_corrupted[df_corrupted[col].isnull()].index
            non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
            
            ## pd series -> np column, needs to be 2D array
            ## taking only the non-NaN records in the corrupted data
            col_tr_arr = np.array(df_train[col]).reshape(-1,1)
            col_corr_arr = np.array(df_corrupted.loc[non_nan_idx][col]).reshape(-1,1)
            
            ## fit the dataset to the model
            model = pyod_model
            model.fit(col_tr_arr)
            
            ## predict raw anomaly score
            scores_pred = model.decision_function(col_corr_arr) * -1
            
            ## prediction of a datapoint category outlier or inlier
            y_pred = model.predict(col_corr_arr)
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            df_outliers.loc[non_nan_idx, col + "_outlier"] = y_pred ## 0: inlier, 1: outlier
            df_outliers.loc[nan_idx, col + "_outlier"] = 1
            
    return df_outliers

In [36]:
pyod_model = KNN()
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91_outlier,X92_outlier,X93_outlier,X94_outlier,X95_outlier,X96_outlier,X97_outlier,X98_outlier,X99_outlier,X100_outlier
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,0,0,1,0,0,0,0,0,0,0
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,0,0,0,0,0,0,0,0,0,0
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,0,0,1,0,0,0,0,0,0,0
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,0,0,1,0,0,0,0,0,0,0
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,0,0,1,0,0,0,0,0,0,0
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,0,0,1,0,0,0,0,0,0,0
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,0,0,1,0,0,0,0,0,0,0
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,0,0,1,0,0,0,0,0,0,0


In [37]:
from pyod.models.pca import PCA

pyod_model = PCA() # n_components = min(n_samples, n_features) default  # n_selected_components = None
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91_outlier,X92_outlier,X93_outlier,X94_outlier,X95_outlier,X96_outlier,X97_outlier,X98_outlier,X99_outlier,X100_outlier
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,0,0,1,0,0,0,0,0,0,0
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,0,0,0,0,0,0,0,0,0,0
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,0,0,1,0,0,0,0,0,0,0
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,0,0,1,0,0,0,0,0,0,0
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,0,0,1,0,0,0,0,0,0,0
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,0,0,1,0,0,0,0,0,0,0
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,0,0,1,0,0,0,0,0,0,0
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,0,0,1,0,0,0,0,0,0,0


In [38]:
from pyod.models.cblof import CBLOF

pyod_model = CBLOF() # n_clusters = 8 default
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)






Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91_outlier,X92_outlier,X93_outlier,X94_outlier,X95_outlier,X96_outlier,X97_outlier,X98_outlier,X99_outlier,X100_outlier
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,0,0,1,0,0,0,0,0,0,0
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,0,0,0,0,0,0,0,0,0,0
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,0,0,1,0,0,0,0,0,0,0
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,0,0,1,0,0,0,0,0,0,0
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,0,0,1,0,0,0,0,0,0,0
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,0,0,1,0,0,0,0,0,0,0
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,0,0,1,0,0,0,0,0,0,0
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,0,0,1,0,0,0,0,0,0,0


In [39]:
from pyod.models.sos import SOS

pyod_model = SOS()
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91_outlier,X92_outlier,X93_outlier,X94_outlier,X95_outlier,X96_outlier,X97_outlier,X98_outlier,X99_outlier,X100_outlier
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,0,0,0,0,0,0,0,0,0,0
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,0,0,0,0,0,0,0,0,0,0
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,0,0,0,0,0,0,0,0,0,0
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,0,0,0,0,0,0,0,0,0,0
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,1,1,0,1,1,1,1,1,1,1
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,0,1,1,0,0,0,1,1,1,1
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,0,0,0,0,0,0,0,0,0,0
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,0,0,0,0,0,0,0,0,0,0


### Sklearn