## Dataset

In [1]:
# mount drive for access to the files
from google.colab import drive

drive.mount("/content/drive")

# all the drive the files are present in "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf


In [20]:
!pip install openml
!pip install pyod

!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [3]:
seed = 100

In [4]:
dataset = Dataset(seed, "credit-g")

all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: credit-g
Found 13 categorical and 7 numeric features 



### Visualize the dataset

In [None]:
## plot the original dataset
def hide_current_axis(*args, **kwds):
        plt.gca().set_visible(False)
        
def plot_data(data):
    sns.set_style("white") # grid/no grid style: darkgrid, whitegrid, dark, white, ticks
    
    plot = sns.pairplot(data, hue="class")
    plot.map_upper(hide_current_axis)
    plt.show()

In [None]:
plot_data(all_data)

### Get training and test sets

In [5]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

## Model

### Model using AutoGluon

In [None]:
import autogluon as ag
from autogluon import TabularPrediction as task

label_col = 'class'

tr_data = train_data
tr_data[label_col] = train_labels

model = task.fit(train_data=tr_data, label=label_col)

# predictions
y_pred = model.predict(test_data)

# predictor performance
perf = model.evaluate_predictions(y_true=test_labels, y_pred=y_pred, auxiliary_metrics=True)

ModuleNotFoundError: ignored

In [None]:
import random
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

import autogluon as ag
from autogluon import TabularPrediction as task


class Model:

  def __init__(self, seed, train_data, train_labels, test_data, test_labels, pipeline, learner, param_grid):

    ## fix random seeds for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    
    ## train and test data and labels
    self.train_data = train_data
    self.train_labels = train_labels
    self.test_data = test_data
    self.test_labels = test_labels
    
    ## preprocessing pipeline
    self.pipeline = pipeline
    
    ## information for model parameters
    self.learner = learner
    self.param_grid = param_grid
    
  
  def __repr__(self):
    return f"{self.__class__.__name__}: {self.__dict__}"
  
  
  # method for training a model on the raw data with preprocessing
  def fit_model(self):

    if self.learner == 'autogluon':
      label_col = 'class'
      
      tr_data = self.train_data
      tr_data[label_col] = self.train_labels
      
      model = task.fit(train_data=tr_data, label=label_col)

      return model
    else:
      grid_search = GridSearchCV(self.pipeline, self.param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
      model = grid_search.fit(self.train_data, self.train_labels)

      return model

  # method for computing evaluation scores
  def eval_scores(self, model):
    if self.learner == 'autogluon':
      # predictions
      y_pred = model.predict(self.test_data)
      
      # predictor performance
      perf = model.evaluate_predictions(y_true=self.test_labels, y_pred=y_pred, auxiliary_metrics=True)
      
      return perf
    else:
      pred_prob = model.predict_proba(self.test_data)
      roc_auc_acore = roc_auc_score(self.test_labels, np.transpose(pred_prob)[1])
      
      return roc_auc_acore
    

In [None]:
## define preprocessing pipeline if not given
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

def get_pipeline(learner):
  # preprocessing pipeline for numerical columns
  transformer_numeric = Pipeline([
      ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
      ('standard_scale', StandardScaler())
  ])

  # preprocessing pipeline for categorical columns
  transformer_categorical = Pipeline([
      ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
      ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
  ])

  # preprocessor
  feature_transform = ColumnTransformer(transformers=[
      ('categorical_features', transformer_categorical, categorical_columns),
      ('numerical_features', transformer_numeric, numerical_columns)
  ])

  ## prediction pipeline: append classifier (learner) to the preprocessing pipeline
  pipeline = Pipeline([
      ('features', feature_transform),
      ('learner', learner)
  ])

  return pipeline

In [None]:
# test autogluon
# pipeline in the case of autogluon is not used so anything is given
model_obj = Model(seed, train_data, train_labels, test_data, test_labels, get_pipeline(SGDClassifier(loss='log')), learner="autogluon", param_grid={})
model = model_obj.fit_model()
scores = model_obj.eval_scores(model)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200720_140007/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200720_140007/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 21
Preprocessing data ...
Here are the 2 unique label values in your data:  ['good', 'bad']
AutoGluon infers your prediction problem is: binary  (because only two unique label-values observed).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Selected class <--> label mapping:  class 1 = bad, class 0 = good
Train Data Class Count: 2
Feature Generator processed 800 data points with 20 features
Original Features (raw dtypes):
	category features: 13
	float64 features: 7
Original Features (inferred dtypes):
	category features: 13
	float features: 7
Generated Features (special dtypes):
Final Features (raw dtypes):
	category feat

In [None]:
## model parameters
## models is a dict where key = leaner & value = param_grid
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }, 
          RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
                                    'learner__max_depth': [5, 10, 15]
                                   },
          "autogluon": {}
         }

In [None]:
for learner, param_grid in models.items():
  print(learner, param_grid)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False) {'learner__max_iter': [500, 1000, 5000], 'learner__penalty': ['l2', 'l1', 'elasticnet'], 'learner__alpha': [0.0001, 0.001, 0.01, 0.1]}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                    

In [None]:
model_summary = []
scores_summary = []

for learner, param_grid in models.items():
  pipeline = None
  
  if learner == 'autogluon':
    pipeline = get_pipeline(SGDClassifier(loss='log'))
  else:
    pipeline = get_pipeline(learner)
  
  model_obj = Model(seed, train_data, train_labels, test_data, test_labels, pipeline, learner, param_grid)
  
  model = model_obj.fit_model()
  model_summary.append(model)

  scores = model_obj.eval_scores(model)
  scores_summary.append(scores)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 177 out of 180 | elapsed:    8.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   19.5s finished
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200720_140922/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200720_140922/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 21
Preprocessing data ...
Here are the 2 unique label values in your data:  ['bad', 'good']
AutoGluon infers your prediction problem is: binary  (because only two unique label-values observed).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Selected class <--> label mapping:  class 1 = good, class 0 = bad
Train Data Class Count: 2
NumExpr defaulting to 2 threads.
Feature Generator processed 800 data points with 20 features
Original Features (raw dtypes):
	category features: 13
	float64 features: 7
Original Features (inferred dtypes):
	category features

In [None]:
model_summary

[GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('features',
                                         ColumnTransformer(n_jobs=None,
                                                           remainder='drop',
                                                           sparse_threshold=0.3,
                                                           transformer_weights=None,
                                                           transformers=[('categorical_features',
                                                                          Pipeline(memory=None,
                                                                                   steps=[('imputer',
                                                                                           SimpleImputer(add_indicator=False,
                                                                                                         copy=True,
           

In [None]:
scores_summary

[0.8093735390369332,
 0.7849462365591399,
 OrderedDict([('accuracy', 0.73),
              ('accuracy_score', 0.73),
              ('balanced_accuracy_score', 0.6666666666666667),
              ('matthews_corrcoef', 0.3472488574259035),
              ('f1_score', 0.7299999999999999),
              ('classification_report',
               {'accuracy': 0.73,
                'bad': {'f1-score': 0.5344827586206897,
                 'precision': 0.5740740740740741,
                 'recall': 0.5,
                 'support': 62},
                'good': {'f1-score': 0.8098591549295775,
                 'precision': 0.7876712328767124,
                 'recall': 0.8333333333333334,
                 'support': 138},
                'macro avg': {'f1-score': 0.6721709567751336,
                 'precision': 0.6808726534753933,
                 'recall': 0.6666666666666667,
                 'support': 200},
                'weighted avg': {'f1-score': 0.7244924720738223,
                 'precisi

## Corruptions

In [None]:
from jenga.corruptions.perturbations import Perturbation

In [None]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

In [None]:
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]
fraction = 0.5

In [None]:
# corruption perturbations to apply
corr_perturbations = Perturbation(categorical_columns, numerical_columns)

In [None]:
test_data_corrupted, perturbations, cols_perturbed, summary_col_corrupt = corr_perturbations.apply_perturbation(test_data, corruptions, fraction)

Applying perturbations... 

MissingValues: {'column': 'other_parties', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
SwappedValues: {'column_a': 'checking_status', 'column_b': 'employment', 'fraction': 0.5}
Scaling: {'column': 'residence_since', 'fraction': 0.5}
GaussianNoise: {'column': 'age', 'fraction': 0.5}


### Visualize the original and corrupted test set

In [None]:
## original test data
plot_data(pd.concat([test_data, pd.Series(test_labels, name='class')], axis=1))

In [None]:
## corrupted test data
plot_data(pd.concat([test_data_corrupted, pd.Series(test_labels, name='class')], axis=1))

## Model & Corruptions using PPP

In [6]:
from sklearn.linear_model import SGDClassifier

learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [7]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]
fraction = 0.5
num_repetitions = 5

In [8]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    4.5s



Generating corrupted training data on 300 rows... 

	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: SwappedValues: {'column_a': 'property_magnitude', 'column_b': 'purpose', 'fraction': 0.5}
	perturbation: Scaling: {'column': 'installment_commitment', 'fraction': 0.5}
	perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.5}
	perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: SwappedValues: {'column_a': 'purpose', 'column_b': 'own_telephone', 'fraction': 0.5}
	perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.5}
	perturbation: GaussianNoise: {'column': 'num_dependents', 'fraction': 0.5}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: SwappedValues: {'column_a': 'other_parties', 'column_b': 'credit_history', 'fract

[Parallel(n_jobs=-1)]: Done 177 out of 180 | elapsed:    8.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.7s finished


In [9]:
df_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,rent,unemployed,3.000000,female div/dep/mar,co applicant,4.0,real estate,22.0,,<100,1.0,skilled,1.0,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,rent,1<=X<4,4.000000,male single,none,2.0,life insurance,28.0,,<100,2.0,skilled,1.0,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.000000,female div/dep/mar,none,4.0,life insurance,37.0,,own,1.0,skilled,10.0,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,own,>=7,-3.771531,female div/dep/mar,none,2.0,life insurance,25.0,,no known savings,1.0,skilled,10.0,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.000000,male single,none,2.0,life insurance,37.0,,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,no checking,6.0,existing paid,radio/tv,2108.0,<100,4<=X<7,0.292980,male mar/wid,none,2.0,real estate,29.0,none,rent,1.0,skilled,1.0,none,yes
838,<0,24.0,critical/other existing credit,used car,2957.0,<100,>=7,3.152586,male single,none,4.0,life insurance,63.0,none,own,2.0,skilled,1.0,yes,yes
974,no checking,30.0,critical/other existing credit,radio/tv,2831.0,<100,1<=X<4,4.000000,female div/dep/mar,none,2.0,car,33.0,none,own,1.0,skilled,1.0,yes,yes
203,<0,12.0,existing paid,retraining,902.0,rent,4<=X<7,4.000000,male mar/wid,none,4.0,life insurance,21.0,,<100,1.0,skilled,10.0,none,yes


## Cleaning

### Cleaning using AutoEncoders?

In [None]:
!pip install dfencoder



In [None]:
from dfencoder import AutoEncoder

In [None]:
model = AutoEncoder(encoder_layers=[512, 512, 512], 
                    decoder_layers=[], 
                    activation='relu', 
                    lr=0.01, 
                    lr_decay=0.99, 
                    batch_size=512, 
                    logger='ipnyb', 
                    verbose=False, 
                    optimizer='sgd', 
                    scaler='gauss_rank', 
                    min_cats=3)

In [None]:
model.fit(train_data, epochs=1000, val=test_data)

  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))

  0%|          | 0/2 [00:00<?, ?it/s][A

AttributeError: ignored

### Cleaning using AutoGluon

In [None]:
from autogluon import TabularPrediction as task

ModuleNotFoundError: No module named 'autogluon'

In [None]:
def cat_cols_to_str(df):
  for col in df.columns:
    if pd.api.types.is_categorical_dtype(df[col]):
      df[col] = df[col].astype(str)

  return df

In [None]:
train_data = cat_cols_to_str(train_data)
df_corrupted = cat_cols_to_str(df_corrupted)
test_data = cat_cols_to_str(test_data)

In [None]:
categorical_precision_threshold = 0.7
numerical_std_error_threshold = 2

#### Training

In [None]:
predictors = {}
predictable_cols = {}

In [None]:
for col in categorical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='multiclass')

  y_test = test_data[col].dropna() # take only the non-nan records # test_data? OR split the train_data again into train and test
  y_pred = predictors[col].predict(test_data.drop([col], axis=1)) # drop the actual column before predicting

  perf = predictors[col].evaluate_predictions(y_test, y_pred, auxiliary_metrics=True)

  labels = [k for k in perf['classification_report'].keys() if k not in ['accuracy', 'macro avg', 'weighted avg']]

  high_precision_labels = []
  for label in labels:
    if perf['classification_report'][label]['precision'] > categorical_precision_threshold:
      high_precision_labels.append(label)

  if high_precision_labels:
    # predictable_cols.append(col)
    # predictors[col].high_precision_labels = high_precision_labels
    predictable_cols[col] = high_precision_labels


No output_directory specified. Models will be saved in: AutogluonModels/ag-20200729_230559/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200729_230559/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.14s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

In [None]:
for col in numerical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='regression')

  y_test = test_data[col].dropna() # take only the non-nan records # test_data? OR split the train_data again into train and test
  y_pred = predictors[col].predict(test_data.drop([col], axis=1)) # drop the actual column before predicting

  perf = predictors[col].evaluate_predictions(y_test, y_pred, auxiliary_metrics=True)

  if perf['root_mean_squared_error'] < numerical_std_error_threshold * y_test.std():
    # predictable_cols.append(col)
    # predictors[col].root_mean_squared_error = perf['root_mean_squared_error']
    predictable_cols[col] = perf['root_mean_squared_error']

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200729_231156/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200729_231156/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	object features: 13
	float64 features: 6
Original Features (inferred dtypes):
	object features: 13
	float features: 6
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 6
	category features: 13
Final Features:
	float features: 6
	category features: 13
	Data preprocessing and feature engineering runtime = 0.12s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorM

In [None]:
predictable_cols

{'age': 12.597579544098458,
 'credit_amount': 3109.3800766769746,
 'credit_history': ['existing paid'],
 'duration': 15.7794387371624,
 'existing_credits': 0.696056329374258,
 'foreign_worker': ['yes'],
 'housing': ['for free', 'own'],
 'installment_commitment': 1.3191040987360383,
 'num_dependents': 0.40359747535553026,
 'other_parties': ['none'],
 'other_payment_plans': ['bank', 'none'],
 'property_magnitude': ['no known property'],
 'residence_since': 1.2693950401902552}

In [None]:
predictors

{'age': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cd6e16a0>,
 'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cb3cdf60>,
 'credit_amount': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cb811588>,
 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cba4f160>,
 'duration': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cbc509b0>,
 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cbc737b8>,
 'existing_credits': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cb432cc0>,
 'foreign_worker': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cbc72208>,
 'housing': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cb3d8208>,
 'installment_commitment': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7f99cac424a8>,
 'job': 

In [None]:
print(f"Found {len(predictable_cols.keys())} predictable columns: {predictable_cols.keys()}")

Found 13 predictable columns: dict_keys(['credit_history', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'foreign_worker', 'duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'])


#### Outlier Detection/Removal

In [None]:
col = "age"

In [None]:
y_pred = predictors[col].predict(df_corrupted)
y_pred

array([43.51275427, 35.83040951, 40.05013168, 42.83572055, 35.04089732,
       32.17258873, 32.48700568, 35.27354521, 34.68426327, 39.24061232,
       31.28843832, 34.51434478, 40.39149956, 41.5410409 , 33.86552249,
       39.5880507 , 41.67894465, 38.34793572, 39.10610223, 45.02336698,
       44.0762099 , 34.81241474, 29.62976866, 42.59616951, 34.79584546,
       39.95271712, 37.63975811, 41.59510478, 35.63257529, 44.8476982 ,
       39.27980516, 36.53296746, 44.58683467, 47.1799268 , 33.71600757,
       38.91013749, 30.62773118, 34.7944822 , 41.36272217, 41.45807076,
       32.30584115, 30.47521142, 30.72300508, 33.46067158, 41.99328106,
       32.48345083, 39.22818273, 36.27138376, 43.51777878, 37.37116436,
       35.4271282 , 40.9279374 , 38.25906167, 45.77320611, 37.74736415,
       36.72325077, 34.18338425, 41.00481232, 38.58701428, 36.53815808,
       48.95578047, 36.4702659 , 38.65048948, 37.91686703, 35.84799479,
       38.16634484, 42.28681682, 40.35830266, 36.28390818, 40.34

In [None]:
y_test = df_corrupted[col]
y_test

249    22.0
353    28.0
537    37.0
424    25.0
564    37.0
       ... 
193    29.0
838    63.0
974    33.0
203    21.0
425    21.0
Name: age, Length: 300, dtype: float64

In [None]:
df_corrupted[col].isnull().sum()

0

In [None]:
predictable_cols[col]

12.519270831214259

In [None]:
auxiliary_df_test_pred = pd.DataFrame(y_test)
auxiliary_df_test_pred["pred"] = y_pred

auxiliary_df_test_pred

Unnamed: 0,age,pred
249,22.0,43.512754
353,28.0,35.830410
537,37.0,40.050132
424,25.0,42.835721
564,37.0,35.040897
...,...,...
193,29.0,29.707539
838,63.0,43.609453
974,33.0,33.316747
203,21.0,34.648074


In [None]:
auxiliary_df_test_pred.loc[173, col] != auxiliary_df_test_pred.loc[173, "pred"]

True

In [None]:
np.sqrt((auxiliary_df_test_pred.loc[330, "pred"] - auxiliary_df_test_pred.loc[330, col]) ** 2)

33.32105535163225

In [None]:
np.sqrt((auxiliary_df_test_pred.loc[330, "pred"] - auxiliary_df_test_pred.loc[330, col]) ** 2) > predictable_cols[col] * numerical_std_error_threshold

True

In [None]:
presumably_wrong = {}

if col in categorical_columns:
  presumably_wrong_aux = []
  for i in auxiliary_df_test_pred.index:
    if any(np.isin(predictable_cols[col], auxiliary_df_test_pred.loc[i, "pred"])) & (auxiliary_df_test_pred.loc[i, col] != auxiliary_df_test_pred.loc[i, "pred"]):
      presumably_wrong_aux.append(i)

  presumably_wrong[col] = np.array(presumably_wrong_aux)

presumably_wrong

{'housing': array([544, 173, 759, 955, 121, 230,  11, 659, 419, 944, 417, 374, 982,
        139, 218, 449,  16, 904, 381, 329, 334, 403, 940, 349, 809, 445,
        890, 883, 539,  27,  64, 442, 131, 437,  22, 649, 941, 988, 935,
        663, 872, 375, 482, 193, 248, 271, 531, 138, 354, 643, 700, 438,
        529, 625, 181, 861, 304, 709, 898, 726, 614, 339, 440,  52,  75,
        707, 326, 855, 826,  43, 195, 916,  92, 196, 839, 939, 104, 280,
        335, 901, 711, 923, 854, 172, 444, 811, 774,  85, 250,  34, 124,
        739, 507, 750, 728, 602, 299, 222,  31, 300, 960, 943, 378, 576,
        274, 192, 398, 520, 188, 194, 225, 580, 428, 244, 588,  12, 765,
        950, 905, 813, 875, 593, 754, 590, 682, 190, 880, 617, 853, 140,
        817, 555, 592, 800, 565, 197, 163, 642, 264, 152, 685, 116, 903,
        859, 307, 942, 467,  30, 345, 100, 284, 646, 731, 894,  35, 802])}

In [None]:
presumably_wrong = {}

if col in numerical_columns:
  presumably_wrong_aux = []
  predictor_rmse = predictable_cols[col]
  for i in auxiliary_df_test_pred.index:
    rmse = np.sqrt((auxiliary_df_test_pred.loc[i, "pred"] - auxiliary_df_test_pred.loc[i, col]) ** 2)
    if rmse > predictor_rmse * numerical_std_error_threshold:
      presumably_wrong_aux.append(i)

  presumably_wrong[col] = np.array(presumably_wrong_aux)

presumably_wrong

{'age': array([330, 219, 430, 606, 137])}

In [None]:
for i in presumably_wrong[col]:
  df_corrupted.loc[i, col] = np.nan

df_corrupted.loc[presumably_wrong[col]]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
330,<0,24.0,critical/other existing credit,used car,6615.0,<100,unemployed,2.0,male single,none,4.0,no known property,,none,for free,2.0,high qualif/self emp/mgmt,1.0,yes,yes
219,no checking,10.0,existing paid,new car,1364.0,own,1<=X<4,2.0,female div/dep/mar,none,4.0,car,,none,,1.0,skilled,10.0,yes,yes
430,no checking,5.0,existing paid,business,3448.0,own,4<=X<7,2.428858,male single,none,4.0,real estate,,,,1.0,unskilled resident,1.0,none,yes
606,no checking,24.0,critical/other existing credit,business,4526.0,own,1<=X<4,3.0,male single,none,2.0,real estate,,none,,1.0,high qualif/self emp/mgmt,10.0,yes,yes
137,0<=X<200,12.0,existing paid,radio/tv,766.0,500<=X<1000,1<=X<4,4.0,male single,none,3.0,real estate,,,own,1.0,unskilled resident,10.0,none,yes


In [None]:
df_corrupted[col].isnull().sum()

5

In [None]:
df_outliers = df_corrupted.copy()

presumably_wrong = {}

for col in predictable_cols:
  y_pred = predictors[col].predict(df_outliers)
  y_test = df_outliers[col]

  auxiliary_df_test_pred = pd.DataFrame(y_test)
  auxiliary_df_test_pred["pred"] = y_pred

  num_nans = df_outliers[col].isnull().sum()

  if col in categorical_columns:
    presumably_wrong_aux = []
    for i in auxiliary_df_test_pred.index:
      if any(np.isin(predictable_cols[col], auxiliary_df_test_pred.loc[i, "pred"])) & (auxiliary_df_test_pred.loc[i, col] != auxiliary_df_test_pred.loc[i, "pred"]):
        presumably_wrong_aux.append(i)

    presumably_wrong[col] = np.array(presumably_wrong_aux)

  if col in numerical_columns:
    presumably_wrong_aux = []
    predictor_rmse = predictable_cols[col]
    for i in auxiliary_df_test_pred.index:
      rmse = np.sqrt((auxiliary_df_test_pred.loc[i, "pred"] - auxiliary_df_test_pred.loc[i, col]) ** 2)
      if rmse > predictor_rmse * numerical_std_error_threshold:
        presumably_wrong_aux.append(i)

    presumably_wrong[col] = np.array(presumably_wrong_aux)

  for i in presumably_wrong[col]:
    df_outliers.loc[i, col] = np.nan

  print(f"Column {col}: Num NaNs: Before: {num_nans}, Now: {df_outliers[col].isnull().sum()}")


Column credit_history: Num NaNs: Before: 0, Now: 46
Column other_parties: Num NaNs: Before: 0, Now: 23
Column property_magnitude: Num NaNs: Before: 0, Now: 7
Column other_payment_plans: Num NaNs: Before: 0, Now: 180
Column housing: Num NaNs: Before: 0, Now: 160
Column foreign_worker: Num NaNs: Before: 0, Now: 14
Column duration: Num NaNs: Before: 0, Now: 0
Column credit_amount: Num NaNs: Before: 0, Now: 3
Column installment_commitment: Num NaNs: Before: 0, Now: 56
Column residence_since: Num NaNs: Before: 0, Now: 1
Column age: Num NaNs: Before: 0, Now: 6
Column existing_credits: Num NaNs: Before: 0, Now: 2
Column num_dependents: Num NaNs: Before: 0, Now: 144


In [None]:
presumably_wrong

{'age': array([330, 219, 430, 606, 137, 848]),
 'credit_amount': array([378, 381, 917]),
 'credit_history': array([537, 553, 395, 315, 216, 562, 781, 480, 244, 157, 558, 345, 666,
        567, 602, 301, 247, 980, 191, 450, 606, 207, 175, 614, 213, 551,
         98, 675, 404, 599, 184, 103, 153, 861, 829, 474, 687, 272,  23,
        255, 867, 160, 615, 287, 282, 974]),
 'duration': array([], dtype=float64),
 'existing_credits': array([197, 590]),
 'foreign_worker': array([930, 173,  24, 515, 268, 247, 859, 117, 264, 413,  22, 325, 156,
        108]),
 'housing': array([353, 424, 124, 553, 935, 633, 146, 672, 395, 923, 629, 216,  44,
        116, 639, 879,  48, 562, 178, 197, 266, 717, 489, 229, 888, 480,
        173, 378, 310,  43, 704, 745,  61, 512, 547, 402,  81, 157, 483,
        448, 558, 934, 219, 263, 661,  46, 567, 733, 602, 114, 509,  51,
        360, 101, 265, 239, 201, 430, 710, 133, 205, 686, 816, 311,  41,
        247, 937, 458, 859, 980, 503, 149, 764, 117, 191, 950, 107, 

#### Imputation

##### Autogluon directly for imputation

In [None]:
from autogluon import TabularPrediction as task

In [None]:
def cat_cols_to_str(df):
  for col in df.columns:
    if pd.api.types.is_categorical_dtype(df[col]):
      df[col] = df[col].astype(str)

  return df

In [None]:
train_data = cat_cols_to_str(train_data)
df_corrupted = cat_cols_to_str(df_corrupted)

In [None]:
predictors = {}

for col in categorical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='multiclass')
  
for col in numerical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='regression')

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200724_110758/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200724_110758/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 800 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.17s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

In [None]:
df_imputed = df_corrupted.copy()

In [None]:
for col in df_corrupted.columns:
  df_imputed[col + '_imputed'] = predictors[col].predict(df_imputed.drop([col], axis=1)) # drop the actual column before predicting
  perf = predictors[col].evaluate_predictions(df_imputed[col], df_imputed[col + '_imputed'], auxiliary_metrics=True)

  df_imputed[col].fillna(df_imputed[col + '_imputed'], inplace=True)

In [None]:
df_imputed

##### Predictors from AutoGluon Outlier Detection

In [None]:
df_imputed = df_outliers.copy()

In [None]:
for col in df_outliers.columns:
  df_imputed[col + '_imputed'] = predictors[col].predict(df_imputed.drop([col], axis=1)) # drop the actual column before predicting
  perf = predictors[col].evaluate_predictions(df_imputed[col], df_imputed[col + '_imputed'], auxiliary_metrics=False) ## True gives error

  df_imputed[col].fillna(df_imputed[col + '_imputed'], inplace=True)

Evaluation: accuracy on test data: 0.38333333333333336
Evaluation: root_mean_squared_error on test data: 8.950816608398393
Evaluation: accuracy on test data: 0.6866666666666666
Evaluation: accuracy on test data: 0.33666666666666667
Evaluation: root_mean_squared_error on test data: 1868.7605264953045
Evaluation: accuracy on test data: 0.29333333333333333
Evaluation: accuracy on test data: 0.39
Evaluation: root_mean_squared_error on test data: 1.1659236510676352
Evaluation: accuracy on test data: 0.6233333333333333
Evaluation: accuracy on test data: 0.9233333333333333
Evaluation: root_mean_squared_error on test data: 1.0245848167322102
Evaluation: accuracy on test data: 0.42333333333333334
Evaluation: root_mean_squared_error on test data: 8.973307640577467
Evaluation: accuracy on test data: 0.3933333333333333
Evaluation: accuracy on test data: 0.43
Evaluation: root_mean_squared_error on test data: 0.3288328605080836
Evaluation: accuracy on test data: 0.65
Evaluation: root_mean_squared_er

In [None]:
df_imputed

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,checking_status_imputed,duration_imputed,credit_history_imputed,purpose_imputed,credit_amount_imputed,savings_status_imputed,employment_imputed,installment_commitment_imputed,personal_status_imputed,other_parties_imputed,residence_since_imputed,property_magnitude_imputed,age_imputed,other_payment_plans_imputed,housing_imputed,existing_credits_imputed,job_imputed,num_dependents_imputed,own_telephone_imputed,foreign_worker_imputed
249,no checking,18.0,existing paid,radio/tv,433.0,rent,unemployed,3.000000,female div/dep/mar,none,4.0,car,22.0,none,<100,1.0,skilled,1.000000,none,yes,0<=X<200,10.367582,existing paid,radio/tv,2467.023634,<100,<1,3.764265,female div/dep/mar,none,3.369345,car,43.028615,none,rent,1.156923,skilled,1.124437,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,rent,1<=X<4,4.000000,male single,none,2.0,life insurance,28.0,none,own,2.0,skilled,1.000000,yes,yes,no checking,39.310208,critical/other existing credit,furniture/equipment,2120.579225,<100,1<=X<4,2.627232,male single,none,3.079011,car,35.793551,none,own,1.770216,skilled,1.137485,none,yes
537,0<=X<200,18.0,existing paid,furniture/equipment,3612.0,<100,>=7,3.000000,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,1.045648,yes,yes,no checking,23.752204,existing paid,furniture/equipment,2667.414667,<100,1<=X<4,1.720177,male single,none,3.113792,car,36.957064,none,own,1.105424,high qualif/self emp/mgmt,1.045648,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,own,>=7,2.494549,female div/dep/mar,none,2.0,life insurance,25.0,none,own,1.0,skilled,1.037778,yes,yes,no checking,20.306722,existing paid,furniture/equipment,3675.545173,<100,<1,2.494549,male single,none,3.446077,car,41.750950,none,own,1.088711,skilled,1.037778,none,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.000000,male single,none,2.0,life insurance,37.0,none,own,2.0,high qualif/self emp/mgmt,1.000000,yes,yes,no checking,31.592973,critical/other existing credit,new car,4285.732199,<100,unemployed,2.314242,male single,none,2.977250,car,34.380729,none,own,1.721355,skilled,1.435771,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,no checking,6.0,existing paid,radio/tv,2108.0,<100,4<=X<7,0.292980,male mar/wid,none,2.0,real estate,29.0,none,own,1.0,skilled,1.000000,none,yes,no checking,13.154903,existing paid,radio/tv,1763.979511,<100,1<=X<4,1.716458,female div/dep/mar,none,2.350578,car,33.221631,none,own,1.079189,skilled,1.105908,yes,yes
838,<0,24.0,critical/other existing credit,used car,2957.0,<100,>=7,3.152586,male single,none,4.0,life insurance,63.0,none,own,2.0,skilled,1.000000,yes,yes,no checking,24.838171,critical/other existing credit,furniture/equipment,3980.326337,<100,>=7,3.359993,male single,none,3.697664,car,43.335257,none,own,1.841947,skilled,1.205513,none,yes
974,no checking,30.0,existing paid,radio/tv,2831.0,<100,1<=X<4,4.000000,female div/dep/mar,none,2.0,car,33.0,none,own,1.0,skilled,1.000000,yes,yes,<0,24.783447,existing paid,radio/tv,3812.225022,<100,1<=X<4,3.059771,male single,none,2.885709,car,30.504764,none,own,1.091442,skilled,1.060027,yes,yes
203,<0,12.0,existing paid,retraining,902.0,rent,4<=X<7,4.000000,male mar/wid,none,4.0,life insurance,21.0,none,own,1.0,skilled,0.971181,none,yes,0<=X<200,14.526258,existing paid,radio/tv,1606.455708,<100,<1,3.420039,female div/dep/mar,none,2.838425,real estate,31.295981,none,own,1.076322,skilled,0.971181,none,yes


In [None]:
df_imputed = df_imputed[df_outliers.columns]
df_imputed

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,rent,unemployed,3.000000,female div/dep/mar,none,4.0,car,22.0,none,<100,1.0,skilled,1.000000,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,rent,1<=X<4,4.000000,male single,none,2.0,life insurance,28.0,none,own,2.0,skilled,1.000000,yes,yes
537,0<=X<200,18.0,existing paid,furniture/equipment,3612.0,<100,>=7,3.000000,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,1.045648,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,own,>=7,2.494549,female div/dep/mar,none,2.0,life insurance,25.0,none,own,1.0,skilled,1.037778,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.000000,male single,none,2.0,life insurance,37.0,none,own,2.0,high qualif/self emp/mgmt,1.000000,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,no checking,6.0,existing paid,radio/tv,2108.0,<100,4<=X<7,0.292980,male mar/wid,none,2.0,real estate,29.0,none,own,1.0,skilled,1.000000,none,yes
838,<0,24.0,critical/other existing credit,used car,2957.0,<100,>=7,3.152586,male single,none,4.0,life insurance,63.0,none,own,2.0,skilled,1.000000,yes,yes
974,no checking,30.0,existing paid,radio/tv,2831.0,<100,1<=X<4,4.000000,female div/dep/mar,none,2.0,car,33.0,none,own,1.0,skilled,1.000000,yes,yes
203,<0,12.0,existing paid,retraining,902.0,rent,4<=X<7,4.000000,male mar/wid,none,4.0,life insurance,21.0,none,own,1.0,skilled,0.971181,none,yes


### Using class cleaner

In [10]:
from jenga.cleaning.outlier_detection import PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, AutoGluonOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation, AutoGluonImputation

cleaners = [
    # (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, AutoGluonImputation),
    # (PyODIsolationForestOutlierDetection, MeanModeImputation),
    # (PyODIsolationForestOutlierDetection, AutoGluonImputation),
    # (AutoGluonOutlierDetection, MeanModeImputation),
    (AutoGluonOutlierDetection, AutoGluonImputation)
]

In [11]:
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, df_corrupted)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200731_183958/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200731_183958/
AutoGluon Version:  0.0.12
Train Data Rows:    560
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 560 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.16s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f4870a866a0>, 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f480b64ecf8>, 'purpose': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f480bc3e780>, 'savings_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f48711aa0f0>, 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f480b65ce10>, 'personal_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f480bc8a588>, 'other_parties': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f480bc8f470>, 'property_magnitude': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f480b65cba8>, 'other_payment_plans': <autogluon.task.tabular_prediction.predict

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200731_184638/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200731_184638/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.14s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

Cleaner: {'outlier_detection': PyODKNNOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.747843734145104, 'classification_report': {'bad': {'precision': 0.5571428571428572, 'recall': 0.48148148148148145, 'f1-score': 0.5165562913907285, 'support': 81}, 'good': {'precision': 0.8173913043478261, 'recall': 0.8584474885844748, 'f1-score': 0.8374164810690424, 'support': 219}, 'accuracy': 0.7566666666666667, 'macro avg': {'precision': 0.6872670807453416, 'recall': 0.6699644850329781, 'f1-score': 0.6769863862298855, 'support': 300}, 'weighted avg': {'precision': 0.7471242236024845, 'recall': 0.7566666666666667, 'f1-score': 0.7507842298558977, 'support': 300}}}
Column other_parties: Num NaNs: Before: 0, Now: 22
Column property_magnitude: Num NaNs: Before: 0, Now: 2
Column other_payment_plans: Num NaNs: Before: 0, Now: 178
Column housing: Num NaNs: Before: 0, Now: 132
Column own_telephone: Num NaNs: Before: 0, Now: 56
Column foreign_worker: Num NaNs: Before: 0, Now: 14
Col

Evaluation: accuracy on test data: 0.39666666666666667
Evaluation: root_mean_squared_error on test data: 9.572419272290121
Evaluation: accuracy on test data: 0.68
Evaluation: accuracy on test data: 0.33
Evaluation: root_mean_squared_error on test data: 1692.9754900157595
Evaluation: accuracy on test data: 0.27666666666666667
Evaluation: accuracy on test data: 0.4666666666666667
Evaluation: root_mean_squared_error on test data: 1.108048161613976
Evaluation: accuracy on test data: 0.6233333333333333
Evaluation: accuracy on test data: 0.9033333333333333
Evaluation: root_mean_squared_error on test data: 1.0025217056603957
Evaluation: accuracy on test data: 0.4166666666666667
Evaluation: root_mean_squared_error on test data: 9.562154207991963
Evaluation: accuracy on test data: 0.38666666666666666
Evaluation: accuracy on test data: 0.41333333333333333
Evaluation: root_mean_squared_error on test data: 0.43296491805495385
Evaluation: accuracy on test data: 0.67
Evaluation: root_mean_squared_er

Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.7472236315463104, 'classification_report': {'bad': {'precision': 0.5238095238095238, 'recall': 0.2716049382716049, 'f1-score': 0.35772357723577236, 'support': 81}, 'good': {'precision': 0.7713178294573644, 'recall': 0.908675799086758, 'f1-score': 0.8343815513626834, 'support': 219}, 'accuracy': 0.7366666666666667, 'macro avg': {'precision': 0.647563676633444, 'recall': 0.5901403686791815, 'f1-score': 0.5960525642992279, 'support': 300}, 'weighted avg': {'precision': 0.7044905869324475, 'recall': 0.7366666666666667, 'f1-score': 0.7056838983484174, 'support': 300}}}


Evaluation: accuracy on test data: 0.38666666666666666
Evaluation: root_mean_squared_error on test data: 11.366894154809074
Evaluation: accuracy on test data: 0.68
Evaluation: accuracy on test data: 0.33
Evaluation: root_mean_squared_error on test data: 1889.1163421151177
Evaluation: accuracy on test data: 0.29
Evaluation: accuracy on test data: 0.37
Evaluation: root_mean_squared_error on test data: 1.0014176117976314
Evaluation: accuracy on test data: 0.6233333333333333
Evaluation: accuracy on test data: 0.9233333333333333
Evaluation: root_mean_squared_error on test data: 1.0442621345084864
Evaluation: accuracy on test data: 0.43666666666666665
Evaluation: root_mean_squared_error on test data: 7.837590309414825
Evaluation: accuracy on test data: 0.4
Evaluation: accuracy on test data: 0.4266666666666667
Evaluation: root_mean_squared_error on test data: 0.4286350570321052
Evaluation: accuracy on test data: 0.6566666666666666
Evaluation: root_mean_squared_error on test data: 0.3563543402


Best cleaning method:
Cleaning score: Cleaner: {'outlier_detection': PyODKNNOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.747843734145104, 'classification_report': {'bad': {'precision': 0.5571428571428572, 'recall': 0.48148148148148145, 'f1-score': 0.5165562913907285, 'support': 81}, 'good': {'precision': 0.8173913043478261, 'recall': 0.8584474885844748, 'f1-score': 0.8374164810690424, 'support': 219}, 'accuracy': 0.7566666666666667, 'macro avg': {'precision': 0.6872670807453416, 'recall': 0.6699644850329781, 'f1-score': 0.6769863862298855, 'support': 300}, 'weighted avg': {'precision': 0.7471242236024845, 'recall': 0.7566666666666667, 'f1-score': 0.7507842298558977, 'support': 300}}} 

Cleaning improved the overall score 





In [None]:
# default verbose
from jenga.cleaning.clean import Clean

categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, df_corrupted)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200730_122632/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200730_122632/
AutoGluon Version:  0.0.12
Train Data Rows:    560
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 560 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.28s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d816ef0>, 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d7d6240>, 'purpose': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d768a90>, 'savings_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d7abf28>, 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d81edd8>, 'personal_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611c959f28>, 'other_parties': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e348be0>, 'property_magnitude': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f6181676c50>, 'other_payment_plans': <autogluon.task.tabular_prediction.predict

	0.4821	 = Validation accuracy score
	0.85s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestClassifierEntr ...
	0.4375	 = Validation accuracy score
	0.84s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini ...
	0.5	 = Validation accuracy score
	0.64s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierEntr ...
	0.5179	 = Validation accuracy score
	0.73s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: KNeighborsClassifierUnif ...
	0.3929	 = Validation accuracy score
	0.01s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: KNeighborsClassifierDist ...
	0.3304	 = Validation accuracy score
	0.02s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: LightGBMClassifier ...
	0.5268	 = Validation accuracy score
	0.4s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: CatboostClassifier ...
	0.5268	 = Validation accuracy score
	6.42s	 = Training

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610db58898>, 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f61361d4588>, 'purpose': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e35d9b0>, 'savings_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e0d3fd0>, 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d753978>, 'personal_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d7fdef0>, 'other_parties': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d716eb8>, 'property_magnitude': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611d744278>, 'other_payment_plans': <autogluon.task.tabular_prediction.predict

	0.4821	 = Validation accuracy score
	0.85s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestClassifierEntr ...
	0.4375	 = Validation accuracy score
	0.95s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini ...
	0.5	 = Validation accuracy score
	0.74s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierEntr ...
	0.5179	 = Validation accuracy score
	0.74s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: KNeighborsClassifierUnif ...
	0.3929	 = Validation accuracy score
	0.01s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: KNeighborsClassifierDist ...
	0.3304	 = Validation accuracy score
	0.02s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: LightGBMClassifier ...
	0.5268	 = Validation accuracy score
	0.4s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: CatboostClassifier ...
	0.5268	 = Validation accuracy score
	6.27s	 = Training

Categorical precision threshold: 0.7
Numerical Std Error threshold: 2.0
Predictors: {'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e45e2b0>, 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f611c9287b8>, 'purpose': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e47e400>, 'savings_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e475da0>, 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610dc268d0>, 'personal_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610dc64080>, 'other_parties': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e4aa4e0>, 'property_magnitude': <autogluon.task.tabular_prediction.predictor.TabularPredictor object at 0x7f610e4aa908>, 'other_payment_plans': <autogluon.task.tabular_prediction.predict

Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.14s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini ...
	0.4429	 = Validation accuracy score
	0.85s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestClassifierEntr ...
	0.4429	 = Validation accuracy score
	0.94s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini ...
	0.4429	 = Validation accuracy sc

Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.6062912227295789, 'classification_report': {'bad': {'precision': 0.3473053892215569, 'recall': 0.7160493827160493, 'f1-score': 0.467741935483871, 'support': 81}, 'good': {'precision': 0.8270676691729323, 'recall': 0.502283105022831, 'f1-score': 0.625, 'support': 219}, 'accuracy': 0.56, 'macro avg': {'precision': 0.5871865291972446, 'recall': 0.6091662438694402, 'f1-score': 0.5463709677419355, 'support': 300}, 'weighted avg': {'precision': 0.6975318535860608, 'recall': 0.56, 'f1-score': 0.5825403225806451, 'support': 300}}}
Cleaner: {'outlier_detection': PyODKNNOutlierDetection, 'imputation': NoImputation}: {'roc_auc_acore': 0.700152207001522, 'classification_report': {'bad': {'precision': 0.6, 'recall': 0.2222222222222222, 'f1-score': 0.32432432432432434, 'support': 81}, 'good': {'precision': 0.7666666666666667, 'recall': 0.9452054794520548, 'f1-score': 0.8466257668711656, 'suppor

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200730_125305/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200730_125305/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.12s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

Cleaner: {'outlier_detection': PyODKNNOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.7456451885675629, 'classification_report': {'bad': {'precision': 0.5373134328358209, 'recall': 0.4444444444444444, 'f1-score': 0.4864864864864865, 'support': 81}, 'good': {'precision': 0.8068669527896996, 'recall': 0.8584474885844748, 'f1-score': 0.8318584070796459, 'support': 219}, 'accuracy': 0.7466666666666667, 'macro avg': {'precision': 0.6720901928127603, 'recall': 0.6514459665144596, 'f1-score': 0.6591724467830662, 'support': 300}, 'weighted avg': {'precision': 0.7340875024021523, 'recall': 0.7466666666666667, 'f1-score': 0.7386079885194928, 'support': 300}}}
Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': NoImputation}: {'roc_auc_acore': 0.6917526354360448, 'classification_report': {'bad': {'precision': 0.5333333333333333, 'recall': 0.2962962962962963, 'f1-score': 0.38095238095238093, 'support': 81}, 'good': {'precision': 0.77647058823

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200730_130041/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200730_130041/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.11s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

Cleaner: {'outlier_detection': PyODIsolationForestOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.7170640960595298, 'classification_report': {'bad': {'precision': 0.47619047619047616, 'recall': 0.37037037037037035, 'f1-score': 0.4166666666666667, 'support': 81}, 'good': {'precision': 0.7848101265822784, 'recall': 0.8493150684931506, 'f1-score': 0.8157894736842105, 'support': 219}, 'accuracy': 0.72, 'macro avg': {'precision': 0.6305003013863772, 'recall': 0.6098427194317605, 'f1-score': 0.6162280701754386, 'support': 300}, 'weighted avg': {'precision': 0.7014828209764918, 'recall': 0.72, 'f1-score': 0.7080263157894737, 'support': 300}}}
Column other_parties: Num NaNs: Before: 0, Now: 22
Column property_magnitude: Num NaNs: Before: 0, Now: 3
Column other_payment_plans: Num NaNs: Before: 0, Now: 180
Column housing: Num NaNs: Before: 0, Now: 133
Column own_telephone: Num NaNs: Before: 0, Now: 57
Column foreign_worker: Num NaNs: Before: 0, Now: 14
Column duration: 

Evaluation: accuracy on test data: 0.38666666666666666
Evaluation: root_mean_squared_error on test data: 8.719058215045418
Evaluation: accuracy on test data: 0.68
Evaluation: accuracy on test data: 0.3433333333333333
Evaluation: root_mean_squared_error on test data: 2012.490958311381
Evaluation: accuracy on test data: 0.2866666666666667
Evaluation: accuracy on test data: 0.39
Evaluation: root_mean_squared_error on test data: 1.079639959858525
Evaluation: accuracy on test data: 0.6066666666666667
Evaluation: accuracy on test data: 0.92
Evaluation: root_mean_squared_error on test data: 0.9640353528118364
Evaluation: accuracy on test data: 0.3933333333333333
Evaluation: root_mean_squared_error on test data: 9.575725809754958
Evaluation: accuracy on test data: 0.38333333333333336
Evaluation: accuracy on test data: 0.41
Evaluation: root_mean_squared_error on test data: 0.3298818934104116
Evaluation: accuracy on test data: 0.6833333333333333
Evaluation: root_mean_squared_error on test data: 

Cleaner: {'outlier_detection': AutoGluonOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.7584418512881222, 'classification_report': {'bad': {'precision': 0.5471698113207547, 'recall': 0.35802469135802467, 'f1-score': 0.43283582089552236, 'support': 81}, 'good': {'precision': 0.7894736842105263, 'recall': 0.8904109589041096, 'f1-score': 0.8369098712446352, 'support': 219}, 'accuracy': 0.7466666666666667, 'macro avg': {'precision': 0.6683217477656405, 'recall': 0.6242178251310672, 'f1-score': 0.6348728460700788, 'support': 300}, 'weighted avg': {'precision': 0.724051638530288, 'recall': 0.7466666666666667, 'f1-score': 0.7278098776503747, 'support': 300}}}
Column credit_history: Num NaNs: Before: 0, Now: 45
Column personal_status: Num NaNs: Before: 0, Now: 84
Column other_parties: Num NaNs: Before: 0, Now: 21
Column property_magnitude: Num NaNs: Before: 0, Now: 52
Column other_payment_plans: Num NaNs: Before: 0, Now: 178
Column housing: Num NaNs: Before: 0, Now: 1

In [None]:
df_cleaned

### Imputation

In [None]:
mean_mode_imputer = MeanModeImputation(train_data, test_data_corrupted, categorical_columns, numerical_columns)

test_data_mm_imputed = mean_mode_imputer.fit_transform(train_data, test_data_corrupted)
test_data_mm_imputed

In [None]:
datawig_imputer = DatawigImputation(train_data, test_data_corrupted, categorical_columns, numerical_columms)

test_data_dw_imputed = datawig_imputer.fit_transform(train_data, test_data_corrupted)
test_data_dw_imputed

##### Using PPP

In [None]:
# for all imputers return scores, take best
# using ppp

In [None]:
from jenga.cleaning.imputation import MeanModeImputation, DatawigImputation
from jenga.cleaning.ppp import PipelinePerformancePrediction

In [None]:
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

In [None]:
# generate corrpted test data
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
imputer_candidates = [MeanModeImputation, DatawigImputation]

In [None]:
imputers = []
for imputer in imputer_candidates:
    imputers.append(imputer(train_data, test_data_corrupted, categorical_columns, numerical_columns))

In [None]:
imputers

In [None]:
ppp_model = ppp.fit_ppp(train_data)

In [None]:
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data)
score_no_cleaning

In [None]:
imputed_scores_ppp = []
for imputer in imputers:
    test_data_imputed = imputer.fit_transform(train_data, test_data_corrupted)
    imputed_score = ppp.predict_score_ppp(ppp_model, test_data_imputed)
    print(f"PPP score with {imputer}: {imputed_score}")
    imputed_scores_ppp.append(imputed_score)

In [None]:
imputed_scores_ppp

##### Using PPP and Cleaner classes

In [None]:
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.cleaner import Cleaner
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [None]:
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

In [None]:
ppp_model = ppp.fit_ppp(train_data)

In [None]:
ppp_model_score = ppp.predict_score_ppp(ppp_model, test_data)
ppp_model_score

In [None]:
# generate corrpted test data
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data_corrupted)
score_no_cleaning

In [None]:
cleaner_candidates = [
    (NoOutlierDetection, NoImputation),
    (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODKNN, DatawigImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation),
    (PyODIsolationForest, DatawigImputation)
]

In [None]:
cleaners = []
for outd, imp in cleaner_candidates:
    cleaners.append(Cleaner(train_data, 
                            test_data_corrupted, 
                            categorical_columns, 
                            numerical_columns, 
                            outlier_detection = outd(train_data, 
                                                     test_data_corrupted, 
                                                     categorical_columns, 
                                                     numerical_columns), 
                            imputation = imp(train_data, 
                                             test_data_corrupted, 
                                             categorical_columns, 
                                             numerical_columns)
                           ))

In [None]:
cleaner_scores_ppp = []
for cleaner in cleaners:
    test_data_cleaned = cleaner.apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    cleaner_score = ppp.predict_score_ppp(ppp_model, test_data_cleaned)
    print(f"PPP score with {cleaner}: {cleaner_score}")
    cleaner_scores_ppp.append(cleaner_score)

In [None]:
cleaner_scores_ppp

In [None]:
best_cleaning_idx = pd.Series(cleaner_scores_ppp).idxmax()
best_cleaning_idx

In [None]:
best_cleaning_score = cleaner_scores_ppp[best_cleaning_idx]
best_cleaning_score

In [None]:
if best_cleaning_score > score_no_cleaning:
    test_data_cleaned = cleaners[best_cleaning_idx].apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    print(f"Best cleaning method: {cleaners[best_cleaning_idx]}: {best_cleaning_score}")
else:
    print("Cleaning didnt't improve the score")

In [None]:
## using clean class

In [None]:
clean = Clean(train_data, test_data_corrupted, categorical_columns, numerical_columns, ppp, ppp_model)

In [None]:
test_data_cleaned, score_no_cleaning, cleaner_scores_ppp = clean(train_data, test_data_corrupted)

In [None]:
from jenga.cleaning.outlier_detection import NoOutlierDetection
from jenga.cleaning.imputation import NoImputation


class Cleaner:
    
    def __init__(self, 
                 df_train,
                 df_corrupted,
                 categorical_columns,
                 numerical_columns,
                 outlier_detection=NoOutlierDetection, 
                 imputation=NoImputation):
        self.outlier_detection = outlier_detection
        self.imputation = imputation
        
    
    def apply_cleaner(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        df_cleaned = self.outlier_detection(df_train, df_corrupted)
        
        # do something for fixing/removing the outliers
        if 'outlier' in df_cleaned.columns:
            ### TODO 
            df_cleaned = df_cleaned.drop('outlier', axis=1)
            
        # impute
        df_cleaned = self.imputation(df_train, df_cleaned)
        
        return df_cleaned

In [None]:
import pandas as pd

from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.cleaner import Cleaner
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation


DEFAULT_CLEANERS = [
    (NoOutlierDetection, NoImputation),
    (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODKNN, DatawigImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation),
    (PyODIsolationForest, DatawigImputation)
]


class Clean:
    
    def __init__(self, 
                 df_train, 
                 df_corrupted, 
                 categorical_columns, 
                 numerical_columns,
                 ppp,
                 ppp_model,
                 cleaners=DEFAULT_CLEANERS):

        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        
        self.ppp = ppp
        self.ppp_model = ppp_model
        
        self.cleaners = []
        for outd, imp in cleaners:
            self.cleaners.append(Cleaner(df_train,
                                         df_corrupted,
                                         self.categorical_columns,
                                         self.numerical_columns,
                                         outlier_detection = outd(df_train,
                                                                  df_corrupted,
                                                                  self.categorical_columns,
                                                                  self.numerical_columns),
                                         imputation = imp(df_train,
                                                          df_corrupted,
                                                          self.categorical_columns,
                                                          self.numerical_columns)
                                        )
                                )
            
        
    def get_cleaned(self, df_train, df_corrupted):
        
        score_no_cleaning = self.ppp.predict_score_ppp(self.ppp_model, df_corrupted)
        print(f"PPP score no cleaning: {score_no_cleaning}")
        
        cleaner_scores_ppp = []
        for cleaner in self.cleaners:
            df_cleaned = cleaner.apply_cleaner(df_train, df_corrupted, self.categorical_columns, self.numerical_columns)
            cleaner_score = self.ppp.predict_score_ppp(self.ppp_model, df_cleaned)
            print(f"PPP score with cleaning: {cleaner}: {cleaner_score}")
            cleaner_scores_ppp.append(cleaner_score)
            
        best_cleaning_idx = pd.Series(cleaner_scores_ppp).idxmax()
        best_cleaning_score = cleaner_scores_ppp[best_cleaning_idx]
        if best_cleaning_score > score_no_cleaning:
            df_cleaned = self.cleaners[best_cleaning_idx].apply_cleaner(df_train, df_corrupted, self.categorical_columns, self.numerical_columns)
            print(f"Best cleaning method: {self.cleaners[best_cleaning_idx]}: {best_cleaning_score}")
        else:
            print("Cleaning didnt't improve the score")
            
        return df_cleaned, score_no_cleaning, cleaner_scores_ppp
    
    
    def __call__(self, df_train, df_corrupted):
        return self.get_cleaned(df_train, df_corrupted)

### Outlier Detection

In [None]:
# detection using KNN from PyOD
outlier = PyODKNN(train_data, test_data_corrupted, categorical_columns, numerical_columms)

In [None]:
test_data_corrupted_outliers = outlier.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers.head(10)

In [None]:
# detection using Isolation Forest from PyOD
outlier_if = PyODIsolationForest(train_data, test_data_corrupted, categorical_columns, numerical_columms)

In [None]:
test_data_corrupted_outliers_if = outlier_if.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers_if.head(10)

#### Preparing the outliers for imputation

In [None]:
if "outlier" in test_data_corrupted_outliers.columns:
    print(f'Setting {test_data_corrupted_outliers["outlier"].sum()} to Nan')
    test_data_corrupted_outliers.loc[test_data_corrupted_outliers["outlier"], :] = np.nan
    test_data_corrupted_outliers = test_data_corrupted_outliers.drop('outlier', axis=1)

In [None]:
## train_data, test_data_corrupted, 
## check values in column in the training data -> check for outliers in the same column in the corrupted data
## store .loc 
## convert those .loc for those column into nan
## impute

In [None]:
numerical_columms

In [None]:
test_data_corrupted

In [None]:
from abc import abstractmethod

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from pyod.models.knn import KNN
from pyod.models.iforest import IForest


class OutlierDetection:
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        
        self.df_train = df_train
        self.df_corrupted = df_corrupted
        
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        
        
        # preprocessing pipeline for numerical columns
        transformer_numeric = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('standard_scale', StandardScaler())
        ])

        # preprocessing pipeline for categorical columns
        transformer_categorical = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
            ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
        ])

        # preprocessor
        self.feature_transform = ColumnTransformer(transformers=[
            ('categorical_features', transformer_categorical, self.categorical_columns),
            ('numerical_features', transformer_numeric, self.numerical_columns)
        ], sparse_threshold=1.0)
        
        
        @abstractmethod
        def fit_transform(self, df_train, df_corrupted):
            pass



class NoOutlierDetection(OutlierDetection):
    
    def fit_transform(self, df_train, df_corrupted):
        df_outliers = df_corrupted.copy()
        
        return df_outliers
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)


        
class PyODKNN(OutlierDetection):
    
    def fit_transform(self, df_train, df_corrupted):
        df_outliers = df_corrupted.copy()
        
        feature_transformation = self.feature_transform.fit(df_train)
        x = feature_transformation.transform(df_train).toarray()
        
        model = KNN()
        model.fit(x)
        
        xx = feature_transformation.transform(df_outliers).toarray()

        df_outliers["outlier"] = model.predict(xx) ## 0: inlier, 1: outlier
        
        return df_outliers
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

    
    
class PyODIsolationForest(OutlierDetection):
    
    def fit_transform(self, df_train, df_corrupted):
        df_outliers = df_corrupted.copy()
        
        feature_transformation = self.feature_transform.fit(df_train)
        x = feature_transformation.transform(df_train).toarray()
        
        model = IForest(contamination=0.25)
        model.fit(x)
        
        xx = feature_transformation.transform(df_outliers).toarray()

        df_outliers["outlier"] = model.predict(xx) ## 0: inlier, 1: outlier
        
        return df_outliers
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)


In [None]:
from abc import abstractmethod
import numpy as np
import pandas as pd

import datawig



class Imputation:
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        self.df_train = df_train
        self.df_corrupted = df_corrupted
        
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        
    
    @abstractmethod
    def fit_transform(self, df_train, df_corrupted):
        pass

    
    
class NoImputation(Imputation):    
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):        
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)
    
    
    def fit_transform(self, df_train, df_corrupted):
        df_imputed = df_corrupted.copy()
        return df_imputed
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)
    
    
    
class MeanModeImputation(Imputation):
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        self.means = {}
        self.modes = {}
    
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)
    
    
    def fit_transform(self, df_train, df_corrupted):
        df_imputed = df_corrupted.copy()
        
        for col in df_train.columns:
            if col in self.numerical_columns:
                # mean imputer
                mean = np.mean(df_train[col])
                self.means[col] = mean
            elif col in self.categorical_columns:
                # mode imputer
                mode = df_train[col].value_counts().index[0]
                self.modes[col] = mode
                
                
        for col in df_corrupted.columns:
            if col in self.numerical_columns:
                # mean imputer
                df_imputed[col].fillna(self.means[col], inplace=True)
            elif col in self.categorical_columns:
                # mode imputer
                df_imputed[col].fillna(self.modes[col], inplace=True)
                
        return df_imputed
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

    

class DatawigImputation(Imputation):
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):        
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)
    
    
    def fit_transform(self, df_train, df_corrupted):
        df_imputed = df_corrupted.copy()

        for col in df_train.columns:
            if pd.api.types.is_categorical_dtype(df_train[col]):
                df_train[col] = df_train[col].astype(str)

        for col in df_corrupted.columns:
            if pd.api.types.is_categorical_dtype(df_corrupted[col]):
                df_corrupted[col] = df_corrupted[col].astype(str)


        for col in self.categorical_columns + self.numerical_columns:
            output_column = col
            input_columns = list(set(df_train.columns) - set([output_column]))

            print(f"Fitting model for column: {col}")
            model = datawig.SimpleImputer(input_columns, output_column, 'imputer_model')
            model.fit(df_train)

            df_imputed = model.predict(df_imputed)
            df_imputed[col].fillna(df_imputed[col + '_imputed'], inplace=True)
            df_imputed = df_imputed[df_corrupted.columns]

        return df_imputed
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

## Evaluation

In [None]:
# score without cleaning
model_obj.score_on_test_data(model.predict_proba(test_data))

In [None]:
# score with corruptions
model_obj.score_on_test_data(model.predict_proba(test_data_corrupted))

In [None]:
# score with mean/mode imputation
model_obj.score_on_test_data(model.predict_proba(test_data_mm_imputed))

In [None]:
# score with datawig imputation
model_obj.score_on_test_data(model.predict_proba(test_data_dw_imputed))