## Dataset

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/home/rupali/Documents/Master Thesis/jenga')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [13]:
dataset = Dataset("thoracic-surgery") ## hill-valley

all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

  and should_run_async(code)


Dataset: thoracic-surgery
Found 13 categorical and 3 numeric features 



### Get training and test sets

In [38]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

  and should_run_async(code)


In [None]:
########################################

In [4]:
## use categorical columns as strings
def cat_cols_to_str(df):
    for col in df.columns:
        if pd.api.types.is_categorical_dtype(df[col]):
            df[col] = df[col].astype(str)

    return df

In [5]:
### if we don't convert the categorical columns to str, the swapping corruption doesn't let us assign new values to the column: "Cannot setitem on a Categorical with a new category, set the categories first"
train_data = cat_cols_to_str(train_data)
test_data = cat_cols_to_str(test_data)

In [None]:
########################################

## Workaround to get the data when OpenML decided not to work :D

In [2]:
train_dat = pd.read_csv("/home/rupali/Downloads/Hill_Valley_without_noise_Training.data", sep=",")
train_data = train_dat.loc[:, train_dat.columns != 'class']
train_labels = train_dat["class"]

In [3]:
test_dat = pd.read_csv("/home/rupali/Downloads/Hill_Valley_without_noise_Testing.data", sep=",")
test_data = test_dat.loc[:, test_dat.columns != 'class']
test_labels = test_dat["class"]

In [4]:
categorical_columns = []
numerical_columns = []

for col in train_data.columns:
    if pd.api.types.is_categorical_dtype(train_data[col]):
        categorical_columns.append(col)
    elif pd.api.types.is_numeric_dtype(train_data[col]):
        numerical_columns.append(col)

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Found 0 categorical and 100 numeric features 



## Defined Model

In [255]:
from sklearn.linear_model import SGDClassifier

learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

  and should_run_async(code)


## Corruptions using PPP

In [39]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise, CategoricalShift]
fraction = 0.5
num_repetitions = 5

In [40]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'V13', 'fraction': 0.5, 'sampling': 'MCAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V16', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: GaussianNoise: {'column': 'V2', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: CategoricalShift: {'column': 'V11', 'fraction': 0.5, 'sampling': 'MAR'}

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'V9', 'fraction': 0.5, 'sampling': 'MNAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'V2', 'fraction': 0.5, 'sampling': 'MCAR'}
	perturbation: GaussianNoise: {'column': 'V16', 'fraction': 0.5, 'sampling': 'MNAR'}
	perturbation: CategoricalShift: {'column': 'V8', 'fraction': 0.5, 'sampling': 'MCAR'}

Generating corrupted training data on 141 rows... 

	perturbation: MissingValues: {'column': 'V3', 'fraction': 0.5, 'sampling': 

## Cleaning

### PPP Cleaning

In [8]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, PyODPCAOutlierDetection, PyODCBLOFOutlierDetection, PyODSOSOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    (PyODPCAOutlierDetection, MeanModeImputation),
    (PyODCBLOFOutlierDetection, MeanModeImputation),
    (PyODSOSOutlierDetection, MeanModeImputation)
]

  and should_run_async(code)


In [9]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)


Applying cleaners... 

PPP score no cleaning: {'roc_auc_score': 0.6551855686958418, 'classification_report': {'0': {'precision': 0.5685279187817259, 'recall': 0.7593220338983051, 'f1-score': 0.6502177068214805, 'support': 295}, '1': {'precision': 0.6650943396226415, 'recall': 0.4533762057877814, 'f1-score': 0.5391969407265774, 'support': 311}, 'accuracy': 0.6023102310231023, 'macro avg': {'precision': 0.6168111292021837, 'recall': 0.6063491198430433, 'f1-score': 0.594707323774029, 'support': 606}, 'weighted avg': {'precision': 0.6180859334377075, 'recall': 0.6023102310231023, 'f1-score': 0.5932417030995089, 'support': 606}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.25, 'Recall': 0.5, 'F1-score': 0.3333333333333333, 'Accuracy': 0.5}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 82632108746417.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.6287372608861518, 'classification_report': {'0': {'precision': 0.564625850340136, 'recall': 0.8440677966101695, 'f1-score': 0.6766304347826086, 'support': 295}, '1': {'precision': 0.7212121212121212, 'recall': 0.38263665594855306, 'f1-score': 0.5, 'support': 311}, 'accuracy': 0.6072607260726073, 'macro avg': {'precision': 0.6429189857761286, 'recall': 0.6133522262793613, 'f1-score': 0.5883152173913043, 'support': 606}, 'weighted avg': {'precision': 0.6449861312661878, 'recall': 0.6072607260726073, 'f1-score': 0.5859834624766824, 'support': 606}}}

Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.8408569300146812, 'Recall': 0.8278327832783279, 'F1-score': 0.8262407435124338, 'Accuracy': 0.8278327832783279}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 384872283.31639314}
Clea








Outlier detection method: PyODCBLOFOutlierDetection, Outlier Detection Score: {'Precision': 0.8098841669267345, 'Recall': 0.7832783278327833, 'F1-score': 0.7786253488944451, 'Accuracy': 0.7832783278327833}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 399630584.565965}
Cleaner: (PyODCBLOFOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5408523625265682, 'classification_report': {'0': {'precision': 0.48267898383371827, 'recall': 0.7084745762711865, 'f1-score': 0.5741758241758241, 'support': 295}, '1': {'precision': 0.5028901734104047, 'recall': 0.2797427652733119, 'f1-score': 0.3595041322314049, 'support': 311}, 'accuracy': 0.4884488448844885, 'macro avg': {'precision': 0.4927845786220615, 'recall': 0.4941086707722492, 'f1-score': 0.46683997820361456, 'support': 606}, 'weighted avg': {'precision': 0.4930513930059121, 'recall': 0.4884488448844885, 'f1-score': 0.4640060284749753

## Model Evaluation

### With learner and param_grid

In [256]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# preprocessing pipeline for numerical columns
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns)
])

## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

In [257]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
model = grid_search.fit(train_data, train_labels)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [258]:
y_pred = model.predict(test_data)

  and should_run_async(code)


In [259]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(model.predict_proba(test_data))[1])

  and should_run_async(code)


0.5968965517241378

In [260]:
model.predict_proba(test_data)

  and should_run_async(code)


array([[2.48923873e-01, 7.51076127e-01],
       [1.19466022e-01, 8.80533978e-01],
       [4.76675761e-01, 5.23324239e-01],
       [7.52237373e-01, 2.47762627e-01],
       [3.09806104e-02, 9.69019390e-01],
       [3.95795689e-01, 6.04204311e-01],
       [6.30358337e-02, 9.36964166e-01],
       [6.53316975e-01, 3.46683025e-01],
       [1.11722907e-01, 8.88277093e-01],
       [2.10795831e-01, 7.89204169e-01],
       [2.76059381e-01, 7.23940619e-01],
       [5.10868216e-02, 9.48913178e-01],
       [1.53031464e-02, 9.84696854e-01],
       [6.78376864e-01, 3.21623136e-01],
       [1.18168338e-01, 8.81831662e-01],
       [7.34872186e-01, 2.65127814e-01],
       [9.17923570e-05, 9.99908208e-01],
       [2.21709604e-01, 7.78290396e-01],
       [2.36037544e-01, 7.63962456e-01],
       [7.19554811e-03, 9.92804452e-01],
       [2.84619412e-01, 7.15380588e-01],
       [5.03055384e-02, 9.49694462e-01],
       [1.32424108e-01, 8.67575892e-01],
       [9.91516577e-03, 9.90084834e-01],
       [1.004364

In [261]:
from sklearn.metrics import classification_report

classification_report(test_labels, y_pred, output_dict=True)

  and should_run_async(code)


{'1': {'precision': 0.35294117647058826,
  'recall': 0.24,
  'f1-score': 0.28571428571428564,
  'support': 25},
 '2': {'precision': 0.8467741935483871,
  'recall': 0.9051724137931034,
  'f1-score': 0.875,
  'support': 116},
 'accuracy': 0.7872340425531915,
 'macro avg': {'precision': 0.5998576850094877,
  'recall': 0.5725862068965517,
  'f1-score': 0.5803571428571428,
  'support': 141},
 'weighted avg': {'precision': 0.7592151479672171,
  'recall': 0.7872340425531915,
  'f1-score': 0.770516717325228,
  'support': 141}}

### With Autogluon

In [262]:
from autogluon.tabular import TabularPredictor

## training
train_data["class"] = train_labels

label = "class"

## folder to save trained models
#save_path = '/home/rupali/Documents/Master Thesis/jenga/autogluon_models/'

predictor = TabularPredictor(label=label).fit(train_data)

  and should_run_async(code)
Level 25:autogluon.core.utils.utils:No path specified. Models will be saved in: "AutogluonModels/ag-20210326_001410/"
INFO:autogluon.tabular.learner.default_learner:Beginning AutoGluon training ...
INFO:autogluon.tabular.learner.default_learner:AutoGluon will save models to "AutogluonModels/ag-20210326_001410/"
INFO:autogluon.tabular.learner.default_learner:AutoGluon Version:  0.1.0
INFO:autogluon.tabular.learner.default_learner:Train Data Rows:    329
INFO:autogluon.tabular.learner.default_learner:Train Data Columns: 16
INFO:autogluon.tabular.learner.default_learner:Preprocessing data ...
Level 25:autogluon.core.utils.utils:AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
INFO:autogluon.core.utils.utils:	2 unique label values:  ['2', '1']
Level 25:autogluon.core.utils.utils:	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_t

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: NeuralNetFastAI ...


Epoch 21: early stopping
█

INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	6.1s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.11s	 = Validation runtime
INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: LightGBMLarge ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.7s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.02s	 = Validation runtime


█

INFO:autogluon.tabular.trainer.abstract_trainer:Fitting model: WeightedEnsemble_L2 ...
INFO:autogluon.tabular.trainer.abstract_trainer:	0.8636	 = Validation accuracy score
INFO:autogluon.tabular.trainer.abstract_trainer:	0.39s	 = Training runtime
INFO:autogluon.tabular.trainer.abstract_trainer:	0.0s	 = Validation runtime
INFO:autogluon.tabular.learner.default_learner:AutoGluon training complete, total runtime = 13.5s ...
INFO:root:TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210326_001410/")


In [263]:
## test on original data
y_pred_test = predictor.predict(test_data)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_test, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8226950354609929,
    "accuracy_score": 0.8226950354609929,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8226950354609928
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "1": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 25
    },
    "2": {
        "precision": 0.8226950354609929,
        "recall": 1.0,
        "f1-score": 0.9027237354085603,
        "support": 116
    }

In [264]:
## test on corrupted data
y_pred_corrupted = predictor.predict(df_corrupted)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_corrupted, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8226950354609929,
    "accuracy_score": 0.8226950354609929,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8226950354609928
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "1": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 25
    },
    "2": {
        "precision": 0.8226950354609929,
        "recall": 1.0,
        "f1-score": 0.9027237354085603,
        "support": 116
    }

In [265]:
## test on cleaned data
y_pred_cleaned = predictor.predict(df_cleaned)
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=True)

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929
INFO:autogluon.tabular.learner.abstract_learner:Evaluations on test data:
INFO:autogluon.tabular.learner.abstract_learner:{
    "accuracy": 0.8226950354609929,
    "accuracy_score": 0.8226950354609929,
    "balanced_accuracy_score": 0.5,
    "matthews_corrcoef": 0.0,
    "f1_score": 0.8226950354609928
}
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:autogluon.tabular.learner.abstract_learner:Detailed (per-class) classification report:
INFO:autogluon.tabular.learner.abstract_learner:{
    "1": {
        "precision": 0.0,
        "recall": 0.0,
        "f1-score": 0.0,
        "support": 25
    },
    "2": {
        "precision": 0.8226950354609929,
        "recall": 1.0,
        "f1-score": 0.9027237354085603,
        "support": 116
    }

In [266]:
## We can evaluate the performance of each individual trained model on our (labeled) test data
test_data["class"] = test_labels
predictor.leaderboard(test_data, silent=True)

  and should_run_async(code)


█

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,0.822695,0.863636,0.015985,0.019807,0.565477,0.015985,0.019807,0.565477,1,True,8
1,LightGBMLarge,0.822695,0.863636,0.019482,0.017002,0.703127,0.019482,0.017002,0.703127,1,True,12
2,LightGBM,0.822695,0.863636,0.020127,0.017831,0.847258,0.020127,0.017831,0.847258,1,True,7
3,CatBoost,0.822695,0.863636,0.021599,0.019487,0.448039,0.021599,0.019487,0.448039,1,True,9
4,WeightedEnsemble_L2,0.822695,0.863636,0.025242,0.023716,0.837128,0.003644,0.004229,0.389089,2,True,13
5,NeuralNetFastAI,0.822695,0.863636,0.164247,0.105555,6.09991,0.164247,0.105555,6.09991,1,True,11
6,KNeighborsDist,0.815603,0.80303,0.007434,0.009465,0.004376,0.007434,0.009465,0.004376,1,True,6
7,KNeighborsUnif,0.815603,0.863636,0.007927,0.006374,0.005564,0.007927,0.006374,0.005564,1,True,5
8,RandomForestGini,0.808511,0.848485,0.105643,0.096689,0.70665,0.105643,0.096689,0.70665,1,True,1
9,XGBoost,0.794326,0.863636,0.02487,0.009048,0.252273,0.02487,0.009048,0.252273,1,True,10


In [267]:
perf

  and should_run_async(code)


OrderedDict([('accuracy', 0.8226950354609929),
             ('accuracy_score', 0.8226950354609929),
             ('balanced_accuracy_score', 0.5),
             ('matthews_corrcoef', 0.0),
             ('f1_score', 0.8226950354609928),
             ('confusion_matrix',
                 1    2
              1  0   25
              2  0  116),
             ('classification_report',
              {'1': {'precision': 0.0,
                'recall': 0.0,
                'f1-score': 0.0,
                'support': 25},
               '2': {'precision': 0.8226950354609929,
                'recall': 1.0,
                'f1-score': 0.9027237354085603,
                'support': 116},
               'accuracy': 0.8226950354609929,
               'macro avg': {'precision': 0.41134751773049644,
                'recall': 0.5,
                'f1-score': 0.45136186770428016,
                'support': 141},
               'weighted avg': {'precision': 0.6768271213721644,
                'recall': 0.8

In [268]:
list(perf.items())[6][1]

  and should_run_async(code)


{'1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25},
 '2': {'precision': 0.8226950354609929,
  'recall': 1.0,
  'f1-score': 0.9027237354085603,
  'support': 116},
 'accuracy': 0.8226950354609929,
 'macro avg': {'precision': 0.41134751773049644,
  'recall': 0.5,
  'f1-score': 0.45136186770428016,
  'support': 141},
 'weighted avg': {'precision': 0.6768271213721644,
  'recall': 0.8226950354609929,
  'f1-score': 0.7426663355134255,
  'support': 141}}

In [269]:
from sklearn.metrics import roc_auc_score

roc_auc_score(test_labels, np.transpose(predictor.predict_proba(test_data)).to_numpy()[1])

0.5432758620689655

In [270]:
perf = predictor.evaluate_predictions(y_true=pd.Series(test_labels), y_pred=y_pred_cleaned, auxiliary_metrics=False)
perf

  and should_run_async(code)
INFO:autogluon.tabular.learner.abstract_learner:Evaluation: accuracy on test data: 0.8226950354609929


0.8226950354609929

## Undefined Model: Default setting: whole process

In [10]:
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, Scaling, GaussianNoise, CategoricalShift]
fraction = 0.5
num_repetitions = 5

  and should_run_async(code)


In [11]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

No path specified. Models will be saved in: "AutogluonModels/ag-20210324_221101/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20210324_221101/"
AutoGluon Version:  0.1.0
Train Data Rows:    606
Train Data Columns: 100
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3047.73 MB
	Train Data (Original)  Memory Usage: 0.48 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of

█

	0.6148	 = Validation accuracy score
	7.82s	 = Training runtime
	0.2s	 = Validation runtime
Fitting model: LightGBMLarge ...
	0.6393	 = Validation accuracy score
	5.52s	 = Training runtime
	0.01s	 = Validation runtime


█

Fitting model: WeightedEnsemble_L2 ...
	0.6885	 = Validation accuracy score
	0.5s	 = Training runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 30.87s ...
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20210324_221101/")



Generating corrupted training data on 606 rows... 

	perturbation: MissingValues: {'column': 'X98', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}
	perturbation: Scaling: {'column': 'X6', 'fraction': 0.5, 'sampling': 'MAR'}
	perturbation: GaussianNoise: {'column': 'X66', 'fraction': 0.5, 'sampling': 'MAR'}


IndexError: Cannot choose from an empty sequence

In [12]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, PyODPCAOutlierDetection, PyODCBLOFOutlierDetection, PyODSOSOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    (PyODPCAOutlierDetection, MeanModeImputation),
    (PyODCBLOFOutlierDetection, MeanModeImputation),
    (PyODSOSOutlierDetection, MeanModeImputation)
]

  and should_run_async(code)


In [13]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data, df_corrupted, cols_perturbed)

Evaluation: accuracy on test data: 0.5693069306930693
Evaluations on test data:
{
    "accuracy": 0.5693069306930693,
    "accuracy_score": 0.5693069306930693,
    "balanced_accuracy_score": 0.5685268951986484,
    "matthews_corrcoef": 0.13727616802960593,
    "f1_score": 0.5693069306930693
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5598591549295775,
        "recall": 0.5389830508474577,
        "f1-score": 0.5492227979274611,
        "support": 295
    },
    "1": {
        "precision": 0.577639751552795,
        "recall": 0.5980707395498392,
        "f1-score": 0.5876777251184835,
        "support": 311
    },
    "accuracy": 0.5693069306930693,
    "macro avg": {
        "precision": 0.5687494532411863,
        "recall": 0.5685268951986484,
        "f1-score": 0.5684502615229723,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5689841805893475,
        "recall": 0.5693069306930693,
        "f1-score": 0.56895791732747


Applying cleaners... 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Evaluation: accuracy on test data: 0.5693069306930693
Evaluations on test data:
{
    "accuracy": 0.5693069306930693,
    "accuracy_score": 0.5693069306930693,
    "balanced_accuracy_score": 0.5685268951986484,
    "matthews_corrcoef": 0.13727616802960593,
    "f1_score": 0.5693069306930693
}


PPP score no cleaning: {'roc_auc_score': 0.5804458008610823, 'classification_report': {'0': {'precision': 0.5598591549295775, 'recall': 0.5389830508474577, 'f1-score': 0.5492227979274611, 'support': 295}, '1': {'precision': 0.577639751552795, 'recall': 0.5980707395498392, 'f1-score': 0.5876777251184835, 'support': 311}, 'accuracy': 0.5693069306930693, 'macro avg': {'precision': 0.5687494532411863, 'recall': 0.5685268951986484, 'f1-score': 0.5684502615229723, 'support': 606}, 'weighted avg': {'precision': 0.5689841805893475, 'recall': 0.5693069306930693, 'f1-score': 0.5689579173274742, 'support': 606}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.25, 'Recall': 0.5, 'F1-score': 0.3333333333333333, 'Accuracy': 0.5}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 82632108746417.0}


Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5598591549295775,
        "recall": 0.5389830508474577,
        "f1-score": 0.5492227979274611,
        "support": 295
    },
    "1": {
        "precision": 0.577639751552795,
        "recall": 0.5980707395498392,
        "f1-score": 0.5876777251184835,
        "support": 311
    },
    "accuracy": 0.5693069306930693,
    "macro avg": {
        "precision": 0.5687494532411863,
        "recall": 0.5685268951986484,
        "f1-score": 0.5684502615229723,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5689841805893475,
        "recall": 0.5693069306930693,
        "f1-score": 0.5689579173274742,
        "support": 606
    }
}


Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5804458008610823, 'classification_report': {'0': {'precision': 0.5598591549295775, 'recall': 0.5389830508474577, 'f1-score': 0.5492227979274611, 'support': 295}, '1': {'precision': 0.577639751552795, 'recall': 0.5980707395498392, 'f1-score': 0.5876777251184835, 'support': 311}, 'accuracy': 0.5693069306930693, 'macro avg': {'precision': 0.5687494532411863, 'recall': 0.5685268951986484, 'f1-score': 0.5684502615229723, 'support': 606}, 'weighted avg': {'precision': 0.5689841805893475, 'recall': 0.5693069306930693, 'f1-score': 0.5689579173274742, 'support': 606}}}


Evaluation: accuracy on test data: 0.5610561056105611
Evaluations on test data:
{
    "accuracy": 0.5610561056105611,
    "accuracy_score": 0.5610561056105611,
    "balanced_accuracy_score": 0.5613602921140116,
    "matthews_corrcoef": 0.12270186173615062,
    "f1_score": 0.5610561056105611
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5469255663430421,
        "recall": 0.5728813559322034,
        "f1-score": 0.5596026490066225,
        "support": 295
    },
    "1": {
        "precision": 0.5757575757575758,
        "recall": 0.5498392282958199,
        "f1-score": 0.5625,
        "support": 311
    },
    "accuracy": 0.5610561056105611,
    "macro avg": {
        "precision": 0.5613415710503089,
        "recall": 0.5613602921140116,
        "f1-score": 0.5610513245033113,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5617221916366394,
        "recall": 0.5610561056105611,
        "f1-score": 0.5610895733613096,
       


Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.8408569300146812, 'Recall': 0.8278327832783279, 'F1-score': 0.8262407435124338, 'Accuracy': 0.8278327832783279}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 384872283.31639314}
Cleaner: (PyODKNNOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.573137500681236, 'classification_report': {'0': {'precision': 0.5469255663430421, 'recall': 0.5728813559322034, 'f1-score': 0.5596026490066225, 'support': 295}, '1': {'precision': 0.5757575757575758, 'recall': 0.5498392282958199, 'f1-score': 0.5625, 'support': 311}, 'accuracy': 0.5610561056105611, 'macro avg': {'precision': 0.5613415710503089, 'recall': 0.5613602921140116, 'f1-score': 0.5610513245033113, 'support': 606}, 'weighted avg': {'precision': 0.5617221916366394, 'recall': 0.5610561056105611, 'f1-score': 0.5610895733613096, 'support': 606}

Evaluation: accuracy on test data: 0.5511551155115512
Evaluations on test data:
{
    "accuracy": 0.5511551155115512,
    "accuracy_score": 0.5511551155115512,
    "balanced_accuracy_score": 0.5538939451741239,
    "matthews_corrcoef": 0.11007944236596345,
    "f1_score": 0.5511551155115512
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5315068493150685,
        "recall": 0.6576271186440678,
        "f1-score": 0.5878787878787878,
        "support": 295
    },
    "1": {
        "precision": 0.5809128630705395,
        "recall": 0.45016077170418006,
        "f1-score": 0.5072463768115942,
        "support": 311
    },
    "accuracy": 0.5511551155115512,
    "macro avg": {
        "precision": 0.5562098561928039,
        "recall": 0.5538939451741239,
        "f1-score": 0.5475625823451911,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5568620807968366,
        "recall": 0.5511551155115512,
        "f1-score": 0.546498128073


Outlier detection method: PyODIsolationForestOutlierDetection, Outlier Detection Score: {'Precision': 0.7322447494913131, 'Recall': 0.7084708470847084, 'F1-score': 0.7039280901824226, 'Accuracy': 0.7084708470847084}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 656300362.0999852}
Cleaner: (PyODIsolationForestOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5698893672679711, 'classification_report': {'0': {'precision': 0.5315068493150685, 'recall': 0.6576271186440678, 'f1-score': 0.5878787878787878, 'support': 295}, '1': {'precision': 0.5809128630705395, 'recall': 0.45016077170418006, 'f1-score': 0.5072463768115942, 'support': 311}, 'accuracy': 0.5511551155115512, 'macro avg': {'precision': 0.5562098561928039, 'recall': 0.5538939451741239, 'f1-score': 0.5475625823451911, 'support': 606}, 'weighted avg': {'precision': 0.5568620807968366, 'recall': 0.5511551155115512, 'f1-score'

Evaluation: accuracy on test data: 0.5412541254125413
Evaluations on test data:
{
    "accuracy": 0.5412541254125413,
    "accuracy_score": 0.5412541254125413,
    "balanced_accuracy_score": 0.5438116518611369,
    "matthews_corrcoef": 0.08924300449788912,
    "f1_score": 0.5412541254125413
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5235457063711911,
        "recall": 0.6406779661016949,
        "f1-score": 0.5762195121951219,
        "support": 295
    },
    "1": {
        "precision": 0.5673469387755102,
        "recall": 0.44694533762057875,
        "f1-score": 0.5,
        "support": 311
    },
    "accuracy": 0.5412541254125413,
    "macro avg": {
        "precision": 0.5454463225733507,
        "recall": 0.5438116518611369,
        "f1-score": 0.538109756097561,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5460245566644968,
        "recall": 0.5412541254125413,
        "f1-score": 0.5371035579167673,
        "s


Outlier detection method: PyODPCAOutlierDetection, Outlier Detection Score: {'Precision': 0.70036142691356, 'Recall': 0.6903190319031903, 'F1-score': 0.6869518678156635, 'Accuracy': 0.6903190319031903}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 402189918.1321284}
Cleaner: (PyODPCAOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5566134394244917, 'classification_report': {'0': {'precision': 0.5235457063711911, 'recall': 0.6406779661016949, 'f1-score': 0.5762195121951219, 'support': 295}, '1': {'precision': 0.5673469387755102, 'recall': 0.44694533762057875, 'f1-score': 0.5, 'support': 311}, 'accuracy': 0.5412541254125413, 'macro avg': {'precision': 0.5454463225733507, 'recall': 0.5438116518611369, 'f1-score': 0.538109756097561, 'support': 606}, 'weighted avg': {'precision': 0.5460245566644968, 'recall': 0.5412541254125413, 'f1-score': 0.5371035579167673, 'support': 606}}}






Evaluation: accuracy on test data: 0.5561056105610561
Evaluations on test data:
{
    "accuracy": 0.5561056105610561,
    "accuracy_score": 0.5561056105610561,
    "balanced_accuracy_score": 0.5587170962995258,
    "matthews_corrcoef": 0.11968413084772356,
    "f1_score": 0.5561056105610561
}


Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5359116022099447,
        "recall": 0.6576271186440678,
        "f1-score": 0.5905631659056316,
        "support": 295
    },
    "1": {
        "precision": 0.5860655737704918,
        "recall": 0.45980707395498394,
        "f1-score": 0.5153153153153154,
        "support": 311
    },
    "accuracy": 0.5561056105610561,
    "macro avg": {
        "precision": 0.5609885879902183,
        "recall": 0.5587170962995258,
        "f1-score": 0.5529392406104735,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.561650686624681,
        "recall": 0.5561056105610561,
        "f1-score": 0.5519458696455848,
        "support": 606
    }
}



Outlier detection method: PyODCBLOFOutlierDetection, Outlier Detection Score: {'Precision': 0.8187954544245285, 'Recall': 0.7926292629262927, 'F1-score': 0.7882947627871975, 'Accuracy': 0.7926292629262927}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 389087104.71174496}
Cleaner: (PyODCBLOFOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5764292332007194, 'classification_report': {'0': {'precision': 0.5359116022099447, 'recall': 0.6576271186440678, 'f1-score': 0.5905631659056316, 'support': 295}, '1': {'precision': 0.5860655737704918, 'recall': 0.45980707395498394, 'f1-score': 0.5153153153153154, 'support': 311}, 'accuracy': 0.5561056105610561, 'macro avg': {'precision': 0.5609885879902183, 'recall': 0.5587170962995258, 'f1-score': 0.5529392406104735, 'support': 606}, 'weighted avg': {'precision': 0.561650686624681, 'recall': 0.5561056105610561, 'f1-score': 0.5519458696455848

Evaluation: accuracy on test data: 0.5478547854785478
Evaluations on test data:
{
    "accuracy": 0.5478547854785478,
    "accuracy_score": 0.5478547854785478,
    "balanced_accuracy_score": 0.5459698076189439,
    "matthews_corrcoef": 0.09289220351441713,
    "f1_score": 0.5478547854785478
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.5405405405405406,
        "recall": 0.4745762711864407,
        "f1-score": 0.5054151624548736,
        "support": 295
    },
    "1": {
        "precision": 0.553314121037464,
        "recall": 0.617363344051447,
        "f1-score": 0.5835866261398177,
        "support": 311
    },
    "accuracy": 0.5478547854785478,
    "macro avg": {
        "precision": 0.5469273307890024,
        "recall": 0.5459698076189439,
        "f1-score": 0.5445008942973457,
        "support": 606
    },
    "weighted avg": {
        "precision": 0.5470959589143743,
        "recall": 0.5478547854785478,
        "f1-score": 0.545532860814638


Outlier detection method: PyODSOSOutlierDetection, Outlier Detection Score: {'Precision': 0.6360021977299181, 'Recall': 0.6259625962596259, 'F1-score': 0.6050356549950111, 'Accuracy': 0.6259625962596259}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 53433142764388.34}
Cleaner: (PyODSOSOutlierDetection, MeanModeImputation): {'roc_auc_score': 0.5539484440568968, 'classification_report': {'0': {'precision': 0.5405405405405406, 'recall': 0.4745762711864407, 'f1-score': 0.5054151624548736, 'support': 295}, '1': {'precision': 0.553314121037464, 'recall': 0.617363344051447, 'f1-score': 0.5835866261398177, 'support': 311}, 'accuracy': 0.5478547854785478, 'macro avg': {'precision': 0.5469273307890024, 'recall': 0.5459698076189439, 'f1-score': 0.5445008942973457, 'support': 606}, 'weighted avg': {'precision': 0.5470959589143743, 'recall': 0.5478547854785478, 'f1-score': 0.5455328608146387, 'supp

## Try Cleaners

In [24]:
from pyod.models.knn import KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

  and should_run_async(code)


In [25]:
## featurizers
def build_featurizers(columns):
    categorical_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
        ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
    ])

    numeric_preprocessing = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value=0)),
    ])

    cat_cols = [c for c in categorical_columns if c in columns]
    num_cols = [c for c in numerical_columns if c in columns]
    
    feature_transformation = ColumnTransformer(transformers=[
        ('categorical_features', categorical_preprocessing, cat_cols),
        ('scaled_numeric', numeric_preprocessing, num_cols)
    ])

    return feature_transformation

### Pyod Single Column - features

In [26]:
df_outliers = df_corrupted.copy(deep=True)
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
364,3,2320.00,46.773306,3,1,1,1,2,2,1,1,1,1,2,1,56.0
357,6,2000.00,,1,1,1,1,1,1,2,1,1,1,2,1,70.0
333,7,2200.00,,1,1,1,1,1,1,1,1,1,1,1,1,71.0
111,3,4.00,,2,1,1,1,2,1,2,2,1,1,2,1,58.0
453,3,2840.00,,2,1,2,1,2,2,1,1,1,1,2,1,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,3,4.36,,2,1,1,1,1,1,1,1,1,1,2,1,47.0
207,6,2600.00,,2,1,1,1,2,1,1,1,1,1,2,1,66.0
446,7,5.20,,1,1,1,1,1,1,2,1,1,1,1,1,49.0
268,3,2520.00,-13.627679,3,1,1,2,2,2,2,1,1,1,2,1,74.0


In [27]:
predictors = {}

for col in categorical_columns + numerical_columns:
    predictors[col] = Pipeline(
        [('features', build_featurizers([col])),
         ('outlier_detector', KNN())
        ])
    
len(predictors)

  and should_run_async(code)


16

In [28]:
for col in categorical_columns + numerical_columns:
    predictors[col].fit(df_outliers)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [29]:
for col in categorical_columns + numerical_columns:
    df_outliers[col + "_outlier"] = predictors[col].predict(df_corrupted)

  and should_run_async(code)


In [30]:
df_outliers

  and should_run_async(code)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X91_outlier,X92_outlier,X93_outlier,X94_outlier,X95_outlier,X96_outlier,X97_outlier,X98_outlier,X99_outlier,X100_outlier
0,1041.990593,1039.850024,1037.136237,1033.695733,1029.333904,1023.804033,1016.793331,,996.637055,982.351378,...,0,0,0,0,0,0,0,0,0,0
1,62.191641,62.191641,62.191640,62.191640,62.191640,62.191639,62.191639,,62.191637,62.191636,...,0,0,0,0,0,0,0,0,0,0
2,3216.305785,3216.305745,3216.305698,3216.305642,3216.305575,3216.305494,3216.305398,,3216.305147,3216.304983,...,0,0,0,0,0,0,0,0,0,0
3,6373.569435,6373.015068,6372.300869,6371.380752,6370.195350,6368.668175,6366.700689,6364.165942,6360.900382,6356.693303,...,0,0,0,0,0,0,0,0,0,0
4,16.680583,16.699414,16.723735,16.755150,16.795726,16.848135,16.915828,,17.116196,17.262063,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,847.498845,847.498704,847.498525,847.498297,847.498008,847.497640,847.497173,,847.495824,847.494865,...,0,0,0,0,0,0,0,0,0,0
602,24724.565870,24728.692710,24733.760610,24739.984180,24747.626940,24757.012520,24768.538360,24782.692490,24800.074270,24821.419720,...,0,0,1,0,0,0,0,0,0,1
603,797.931966,797.931974,797.931983,797.931995,797.932010,797.932029,797.932051,,797.932114,797.932157,...,0,0,0,0,0,0,0,0,0,0
604,906.837107,906.950491,907.094634,907.277881,907.510841,907.807000,908.183503,,909.270640,910.044210,...,0,0,0,0,0,0,0,0,0,0


### Pyod Multiple Columns - features

In [29]:
df_outliers = df_corrupted.copy(deep=True)

  and should_run_async(code)


In [30]:
predictors = Pipeline(
    [('features', build_featurizers(numerical_columns + categorical_columns)),
     ('outlier_detector', KNN())
    ])
predictors.fit(df_outliers)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('categorical_features',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value='__NA__',
                                                                                 strategy='constant')),
                                                                  ('one_hot_encode',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['V1', 'V4', 'V5', 'V6', 'V7',
                                                   'V8', 'V9', 'V10', 'V11',
                                                   'V12', 'V13', 'V14',
                                                   'V15']),
                                                 ('scaled_numeric',
                                                  Pipel

In [31]:
outliers = predictors.predict(df_outliers)
outliers

  and should_run_async(code)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

### Categorical from train, numerical from pyod

In [18]:
## single column based

In [32]:
def cat_out_detect(df_train, df_corrupted):
    df_outliers = df_corrupted[categorical_columns].copy()
    
    for col in df_train.columns:
        if col in categorical_columns:
            vals_train_unique = df_train[col].unique()
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            
            for i in df_corrupted[col].index:
                if df_corrupted.loc[i, col] in vals_train_unique:
                    df_outliers.loc[i, col + "_outlier"] = 0
                else:
                    df_outliers.loc[i, col + "_outlier"] = 1
                    
    return df_outliers

  and should_run_async(code)


In [33]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    df_outliers = df_corrupted[numerical_columns].copy()
    
    for col in df_train.columns:
        if col in numerical_columns:
            ## find indices of records with NaNs in col in df_corrupted
            nan_idx = df_corrupted[df_corrupted[col].isnull()].index
            non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
            
            ## pd series -> np column, needs to be 2D array
            ## taking only the non-NaN records in the corrupted data
            col_tr_arr = np.array(df_train[col]).reshape(-1,1)
            col_corr_arr = np.array(df_corrupted.loc[non_nan_idx][col]).reshape(-1,1)
            
            ## fit the dataset to the model
            model = pyod_model
            model.fit(col_tr_arr)
            
            ## predict raw anomaly score
            scores_pred = model.decision_function(col_corr_arr) * -1
            
            ## prediction of a datapoint category outlier or inlier
            y_pred = model.predict(col_corr_arr)
            
            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            df_outliers.loc[non_nan_idx, col + "_outlier"] = y_pred ## 0: inlier, 1: outlier
            df_outliers.loc[nan_idx, col + "_outlier"] = 1
            
    return df_outliers

In [34]:
pyod_model = KNN()
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

Unnamed: 0,V2,V3,V16,V2_outlier,V3_outlier,V16_outlier,V1,V4,V5,V6,...,V6_outlier,V7_outlier,V8_outlier,V9_outlier,V10_outlier,V11_outlier,V12_outlier,V13_outlier,V14_outlier,V15_outlier
364,2320.00,46.773306,56.0,1,1,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0
357,2000.00,,70.0,1,1,0,6,1,1,1,...,0,0,0,0,0,0,0,0,0,0
333,2200.00,,71.0,1,1,0,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
111,4.00,,58.0,0,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
453,2840.00,,72.0,1,1,0,3,2,1,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,4.36,,47.0,0,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
207,2600.00,,66.0,1,1,0,6,2,1,1,...,0,0,0,0,0,0,0,0,0,0
446,5.20,,49.0,1,1,0,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
268,2520.00,-13.627679,74.0,1,1,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
from pyod.models.pca import PCA

pyod_model = PCA() # n_components = min(n_samples, n_features) default  # n_selected_components = None
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,V2,V3,V16,V2_outlier,V3_outlier,V16_outlier,V1,V4,V5,V6,...,V6_outlier,V7_outlier,V8_outlier,V9_outlier,V10_outlier,V11_outlier,V12_outlier,V13_outlier,V14_outlier,V15_outlier
364,2320.00,46.773306,56.0,1,1,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0
357,2000.00,,70.0,1,1,0,6,1,1,1,...,0,0,0,0,0,0,0,0,0,0
333,2200.00,,71.0,1,1,0,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
111,4.00,,58.0,0,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
453,2840.00,,72.0,1,1,0,3,2,1,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,4.36,,47.0,0,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
207,2600.00,,66.0,1,1,0,6,2,1,1,...,0,0,0,0,0,0,0,0,0,0
446,5.20,,49.0,0,1,0,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
268,2520.00,-13.627679,74.0,1,1,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0


In [36]:
from pyod.models.cblof import CBLOF

pyod_model = CBLOF() # n_clusters = 8 default
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,V2,V3,V16,V2_outlier,V3_outlier,V16_outlier,V1,V4,V5,V6,...,V6_outlier,V7_outlier,V8_outlier,V9_outlier,V10_outlier,V11_outlier,V12_outlier,V13_outlier,V14_outlier,V15_outlier
364,2320.00,46.773306,56.0,1,1,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0
357,2000.00,,70.0,1,1,0,6,1,1,1,...,0,0,0,0,0,0,0,0,0,0
333,2200.00,,71.0,1,1,0,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
111,4.00,,58.0,0,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
453,2840.00,,72.0,1,1,0,3,2,1,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,4.36,,47.0,1,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
207,2600.00,,66.0,1,1,0,6,2,1,1,...,0,0,0,0,0,0,0,0,0,0
446,5.20,,49.0,1,1,1,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
268,2520.00,-13.627679,74.0,1,1,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0


In [37]:
from pyod.models.sos import SOS

pyod_model = SOS()
    
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

  and should_run_async(code)


Unnamed: 0,V2,V3,V16,V2_outlier,V3_outlier,V16_outlier,V1,V4,V5,V6,...,V6_outlier,V7_outlier,V8_outlier,V9_outlier,V10_outlier,V11_outlier,V12_outlier,V13_outlier,V14_outlier,V15_outlier
364,2320.00,46.773306,56.0,0,0,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0
357,2000.00,,70.0,0,1,0,6,1,1,1,...,0,0,0,0,0,0,0,0,0,0
333,2200.00,,71.0,0,1,0,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
111,4.00,,58.0,0,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
453,2840.00,,72.0,0,1,0,3,2,1,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,4.36,,47.0,0,1,0,3,2,1,1,...,0,0,0,0,0,0,0,0,0,0
207,2600.00,,66.0,0,1,0,6,2,1,1,...,0,0,0,0,0,0,0,0,0,0
446,5.20,,49.0,0,1,0,7,1,1,1,...,0,0,0,0,0,0,0,0,0,0
268,2520.00,-13.627679,74.0,0,0,0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,0


### Sklearn

In [244]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, precision_recall_curve

  and should_run_async(code)


In [245]:
df_outliers = df_corrupted.copy(deep=True)

In [246]:
categorical_precision_threshold = 0.85
numeric_error_percentile = 0.9

predictors = {}
predictable_cols = []

In [247]:
categorical_preprocessing = Pipeline([
    ('mark-missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

numeric_preprocessing = Pipeline([
    ('mark_missing', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler())
])

In [207]:
## cat to string

#### Categoric

In [208]:
col = "V8"

In [209]:
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', categorical_preprocessing, list(set(categorical_columns) - {col})),
    ('numeric_features', numeric_preprocessing, numerical_columns)
])

In [210]:
param_grid = {
    'learner__n_estimators': [10, 50, 100, 200],
}

In [211]:
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', GradientBoostingClassifier())
])

In [212]:
search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
predictors[col] = search.fit(train_data, train_data[col])

print(f'Classifier for col: {col} reached {search.best_score_}')

Classifier for col: V8 reached 0.8663340724316334


In [213]:
## precision-recall curves for finding the likelihood thresholds for minimal precision
predictors[col].thresholds = {}
probas = predictors[col].predict_proba(test_data)

for label_idx, label in enumerate(predictors[col].classes_):
    prec, rec, threshold = precision_recall_curve(test_data[col]==label, probas[:,label_idx], pos_label=True)
    prec = prec.tolist(); rec = rec.tolist(); threshold = threshold.tolist()
    threshold_for_min_prec = np.array([elem >= categorical_precision_threshold for elem in prec]).nonzero()[0][0] - 1
    predictors[col].thresholds[label] = threshold_for_min_prec

  and should_run_async(code)


In [214]:
##############################

  and should_run_async(code)


In [215]:
threshold

[0.02117088300000203,
 0.03014453435281933,
 0.03485565892100134,
 0.0428071210429284,
 0.050680258799050776,
 0.05164710208079502,
 0.05284055373388508,
 0.05602381067929773,
 0.05704823667256316,
 0.05738116741932536,
 0.05907743625280016,
 0.05953322762413934,
 0.06363544285744023,
 0.06490107608859166,
 0.06875759839112798,
 0.06883478750398947,
 0.06997215540252173,
 0.07172726520850733,
 0.07182241883489197,
 0.07516288815785069,
 0.0790764035335925,
 0.08185985826724812,
 0.08453808787475377,
 0.0885866497328653,
 0.1008697518181333,
 0.10844414342035333,
 0.12149313730039363,
 0.12218977079417796,
 0.13985685499568426,
 0.17663232346745117,
 0.22095060847751924,
 0.25137148266105264,
 0.2867344179064725,
 0.3131591884708921,
 0.4160242947679989,
 0.4990420074725702,
 0.5837108238235176,
 0.5912538399069804,
 0.6546257960793735,
 0.6775525091376235,
 0.7481324084617174,
 0.7706909280512712,
 0.777420328143789,
 0.7941309705896612,
 0.8041420499986565,
 0.8224908770428878,
 0.829

In [216]:
np.array([elem >= categorical_precision_threshold for elem in prec]).nonzero()[0][0] - 1

22

In [217]:
threshold[117]

0.9637919052992308

In [218]:
len(threshold)

122

In [219]:
##############################

In [220]:
predictors[col].thresholds

{'1': 60, '2': 22}

In [221]:
## outlier detection

In [222]:
y_pred = predictors[col].predict(df_outliers)
y_proba = predictors[col].predict_proba(df_outliers)

for label_idx, label in enumerate(predictors[col].classes_):
    precision_pred = predictors[col].thresholds[label] <= y_proba[:,label_idx]
    outliers = precision_pred & (df_outliers[col] != y_pred)

In [223]:
outliers

  and should_run_async(code)


329    False
293    False
229    False
101    False
65     False
       ...  
339    False
159    False
360    False
265    False
259    False
Name: V8, Length: 141, dtype: bool

In [224]:
num_nans = df_outliers[col].isnull().sum()
df_outliers.loc[outliers, col] = np.nan

In [225]:
print(f'Column {col} contained {num_nans} nans before, now {df_outliers[col].isnull().sum()}')

Column V8 contained 0 nans before, now 0


In [None]:
## imputation

In [226]:
prior_missing = df_outliers[col].isnull().sum()

if prior_missing > 0:
    df_outliers.loc[df_outliers[col].isnull(), col] = predictors[col].predict(df_outliers[df_outliers[col].isnull()])
    
print(f'Imputed {prior_missing} values in column {col}')

Imputed 0 values in column V8


  and should_run_async(code)


#### Numeric

In [227]:
col = "V3"

In [228]:
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', categorical_preprocessing, categorical_columns),
    ('numeric_features', numeric_preprocessing, list(set(numerical_columns) - {col}))
])

In [229]:
param_grid = {
    'learner__n_estimators': [10, 50, 100],
}

In [230]:
predictors[col] = {}

for perc_name, percentile, in zip(['lower', 'median', 'upper'], [1.0 - numeric_error_percentile, 0.5, numeric_error_percentile]):
    pipeline = Pipeline([
        ('features', feature_transform),
        ('learner', GradientBoostingRegressor(loss='quantile', alpha=percentile))
    ])
    
    search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
    predictors[col][perc_name] = search.fit(train_data, train_data[col])
    print(f'Regressor for col: {col}/{perc_name} reached {search.best_score_}')

Regressor for col: V3/lower reached -0.039998355063562085
Regressor for col: V3/median reached -0.2766384079223627
Regressor for col: V3/upper reached -1.4318003920796856


In [231]:
## outlier detection

  and should_run_async(code)


In [232]:
lower_percentile = predictors[col]['lower'].predict(df_outliers)
upper_percentile = predictors[col]['upper'].predict(df_outliers)

outliers = (df_outliers[col] < lower_percentile) | (df_outliers[col] > upper_percentile)

In [233]:
outliers

  and should_run_async(code)


329    False
293    False
229    False
101    False
65      True
       ...  
339     True
159     True
360     True
265     True
259     True
Name: V3, Length: 141, dtype: bool

In [234]:
num_nans = df_outliers[col].isnull().sum()
df_outliers.loc[outliers, col] = np.nan

In [235]:
print(f'Column {col} contained {num_nans} nans before, now {df_outliers[col].isnull().sum()}')

Column V3 contained 0 nans before, now 86


In [None]:
## imputation

In [236]:
prior_missing = df_outliers[col].isnull().sum()

if prior_missing > 0:
    df_outliers.loc[df_outliers[col].isnull(), col] = predictors[col]['median'].predict(df_outliers[df_outliers[col].isnull()])
    
print(f'Imputed {prior_missing} values in column {col}')

Imputed 86 values in column V3


#### Together

In [248]:
for col in categorical_columns + numerical_columns:
    if col in categorical_columns:
        feature_transform = ColumnTransformer(transformers=[
            ('categorical_features', categorical_preprocessing, list(set(categorical_columns) - {col})),
            ('numeric_features', numeric_preprocessing, numerical_columns)
        ])
        
        param_grid = {
            'learner__n_estimators': [10, 50, 100, 200],
        }
        
        pipeline = Pipeline([
            ('features', feature_transform),
            ('learner', GradientBoostingClassifier())
        ])
        
        search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
        predictors[col] = search.fit(train_data, train_data[col])
        
        print(f'Classifier for col: {col} reached {search.best_score_}')
        
        ## precision-recall curves for finding the likelihood thresholds for minimal precision
        predictors[col].thresholds = {}
        probas = predictors[col].predict_proba(test_data)
        
        for label_idx, label in enumerate(predictors[col].classes_):
            prec, rec, threshold = precision_recall_curve(test_data[col]==label, probas[:,label_idx], pos_label=True)
            prec = prec.tolist(); rec = rec.tolist(); threshold = threshold.tolist()
            threshold_for_min_prec = np.array([elem >= categorical_precision_threshold for elem in prec]).nonzero()[0][0] - 1
            predictors[col].thresholds[label] = threshold_for_min_prec
            
    elif col in numerical_columns:
        feature_transform = ColumnTransformer(transformers=[
            ('categorical_features', categorical_preprocessing, categorical_columns),
            ('numeric_features', numeric_preprocessing, list(set(numerical_columns) - {col}))
        ])
        
        param_grid = {
            'learner__n_estimators': [10, 50, 100],
        }
        
        predictors[col] = {}
        
        for perc_name, percentile, in zip(['lower', 'median', 'upper'], [1.0 - numeric_error_percentile, 0.5, numeric_error_percentile]):
            pipeline = Pipeline([
                ('features', feature_transform),
                ('learner', GradientBoostingRegressor(loss='quantile', alpha=percentile))
            ])
            
            search = GridSearchCV(pipeline, param_grid, cv=2, verbose=0, n_jobs=-1)
            predictors[col][perc_name] = search.fit(train_data, train_data[col])
            
            print(f'Regressor for col: {col}/{perc_name} reached {search.best_score_}')

Classifier for col: V1 reached 0.7386178861788617
Classifier for col: V4 reached 0.8480968218773097
Classifier for col: V5 reached 0.9270325203252032
Classifier for col: V6 reached 0.8723577235772357
Classifier for col: V7 reached 0.927050997782705
Classifier for col: V8 reached 0.8572431633407243
Classifier for col: V9 reached 0.8632113821138212
Classifier for col: V10 reached 0.5530672579453068
Classifier for col: V11 reached 0.927050997782705
Classifier for col: V12 reached 0.9847745750184774
Classifier for col: V13 reached 0.975720620842572
Classifier for col: V14 reached 0.7751108647450111




Classifier for col: V15 reached nan
Regressor for col: V2/lower reached 0.16010371343280655
Regressor for col: V2/median reached 0.6655567131283129
Regressor for col: V2/upper reached 0.22608211334876116
Regressor for col: V3/lower reached -0.03891978318653888
Regressor for col: V3/median reached -0.27632529721117893
Regressor for col: V3/upper reached -1.3732224544042722
Regressor for col: V16/lower reached -0.7824885234166155
Regressor for col: V16/median reached 0.08373990993977048
Regressor for col: V16/upper reached -1.108010255507478


In [249]:
## outlier detection and removal

  and should_run_async(code)


In [250]:
for col in categorical_columns + numerical_columns:
    if col in categorical_columns:
        y_pred = predictors[col].predict(df_outliers)
        y_proba = predictors[col].predict_proba(df_outliers)
        
        for label_idx, label in enumerate(predictors[col].classes_):
            precision_pred = predictors[col].thresholds[label] <= y_proba[:,label_idx]
            outliers = precision_pred & (df_outliers[col] != y_pred)
            
    elif col in numerical_columns:
        lower_percentile = predictors[col]['lower'].predict(df_outliers)
        upper_percentile = predictors[col]['upper'].predict(df_outliers)
        outliers = (df_outliers[col] < lower_percentile) | (df_outliers[col] > upper_percentile)
        
    num_nans = df_outliers[col].isnull().sum()
    df_outliers.loc[outliers, col] = np.nan
    
    print(f'Column {col} contained {num_nans} nans before, now {df_outliers[col].isnull().sum()}')

Column V1 contained 0 nans before, now 69
Column V4 contained 0 nans before, now 0
Column V5 contained 0 nans before, now 0
Column V6 contained 0 nans before, now 0
Column V7 contained 0 nans before, now 0
Column V8 contained 0 nans before, now 0
Column V9 contained 0 nans before, now 0
Column V10 contained 0 nans before, now 0
Column V11 contained 0 nans before, now 0
Column V12 contained 0 nans before, now 73
Column V13 contained 70 nans before, now 70
Column V14 contained 0 nans before, now 29
Column V15 contained 0 nans before, now 1
Column V2 contained 0 nans before, now 53
Column V3 contained 0 nans before, now 90
Column V16 contained 0 nans before, now 92


In [None]:
## imputation

In [251]:
df_cleaned = df_outliers.copy(deep=True)

  and should_run_async(code)


In [252]:
for col in categorical_columns + numerical_columns:
    prior_missing = df_cleaned[col].isnull().sum()
    
    if prior_missing > 0:
        if col in categorical_columns:
            df_cleaned.loc[df_cleaned[col].isnull(), col] = predictors[col].predict(df_cleaned[df_cleaned[col].isnull()])
        elif col in numerical_columns:
            df_cleaned.loc[df_cleaned[col].isnull(), col] = predictors[col]['median'].predict(df_cleaned[df_cleaned[col].isnull()])
        
        print(f'Imputed {prior_missing} values in column {col}')

Imputed 69 values in column V1
Imputed 73 values in column V12
Imputed 70 values in column V13
Imputed 29 values in column V14
Imputed 1 values in column V15
Imputed 53 values in column V2
Imputed 90 values in column V3
Imputed 92 values in column V16
