## Dataset

In [1]:
# mount drive for access to the files
from google.colab import drive

drive.mount("/content/drive")

# all the drive the files are present in "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf


In [2]:
!pip install openml
!pip install pyod

!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [4]:
seed = 100

In [5]:
dataset = Dataset(seed, "credit-g")

all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: credit-g
Found 13 categorical and 7 numeric features 



### Visualize the dataset

In [None]:
## plot the original dataset
def hide_current_axis(*args, **kwds):
        plt.gca().set_visible(False)
        
def plot_data(data):
    sns.set_style("white") # grid/no grid style: darkgrid, whitegrid, dark, white, ticks
    
    plot = sns.pairplot(data, hue="class")
    plot.map_upper(hide_current_axis)
    plt.show()

In [None]:
plot_data(all_data)

### Get training and test sets

In [6]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

## Model

### Model using AutoGluon

In [None]:
import autogluon as ag
from autogluon import TabularPrediction as task

label_col = 'class'

tr_data = train_data
tr_data[label_col] = train_labels

model = task.fit(train_data=tr_data, label=label_col)

# predictions
y_pred = model.predict(test_data)

# predictor performance
perf = model.evaluate_predictions(y_true=test_labels, y_pred=y_pred, auxiliary_metrics=True)

ModuleNotFoundError: ignored

In [None]:
import random
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

import autogluon as ag
from autogluon import TabularPrediction as task


class Model:

  def __init__(self, seed, train_data, train_labels, test_data, test_labels, pipeline, learner, param_grid):

    ## fix random seeds for reproducibility
    random.seed(seed)
    np.random.seed(seed)
    
    ## train and test data and labels
    self.train_data = train_data
    self.train_labels = train_labels
    self.test_data = test_data
    self.test_labels = test_labels
    
    ## preprocessing pipeline
    self.pipeline = pipeline
    
    ## information for model parameters
    self.learner = learner
    self.param_grid = param_grid
    
  
  def __repr__(self):
    return f"{self.__class__.__name__}: {self.__dict__}"
  
  
  # method for training a model on the raw data with preprocessing
  def fit_model(self):

    if self.learner == 'autogluon':
      label_col = 'class'
      
      tr_data = self.train_data
      tr_data[label_col] = self.train_labels
      
      model = task.fit(train_data=tr_data, label=label_col)

      return model
    else:
      grid_search = GridSearchCV(self.pipeline, self.param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
      model = grid_search.fit(self.train_data, self.train_labels)

      return model

  # method for computing evaluation scores
  def eval_scores(self, model):
    if self.learner == 'autogluon':
      # predictions
      y_pred = model.predict(self.test_data)
      
      # predictor performance
      perf = model.evaluate_predictions(y_true=self.test_labels, y_pred=y_pred, auxiliary_metrics=True)
      
      return perf
    else:
      pred_prob = model.predict_proba(self.test_data)
      roc_auc_acore = roc_auc_score(self.test_labels, np.transpose(pred_prob)[1])
      
      return roc_auc_acore
    

In [None]:
## define preprocessing pipeline if not given
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

def get_pipeline(learner):
  # preprocessing pipeline for numerical columns
  transformer_numeric = Pipeline([
      ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
      ('standard_scale', StandardScaler())
  ])

  # preprocessing pipeline for categorical columns
  transformer_categorical = Pipeline([
      ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
      ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
  ])

  # preprocessor
  feature_transform = ColumnTransformer(transformers=[
      ('categorical_features', transformer_categorical, categorical_columns),
      ('numerical_features', transformer_numeric, numerical_columns)
  ])

  ## prediction pipeline: append classifier (learner) to the preprocessing pipeline
  pipeline = Pipeline([
      ('features', feature_transform),
      ('learner', learner)
  ])

  return pipeline

In [None]:
# test autogluon
# pipeline in the case of autogluon is not used so anything is given
model_obj = Model(seed, train_data, train_labels, test_data, test_labels, get_pipeline(SGDClassifier(loss='log')), learner="autogluon", param_grid={})
model = model_obj.fit_model()
scores = model_obj.eval_scores(model)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200720_140007/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200720_140007/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 21
Preprocessing data ...
Here are the 2 unique label values in your data:  ['good', 'bad']
AutoGluon infers your prediction problem is: binary  (because only two unique label-values observed).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Selected class <--> label mapping:  class 1 = bad, class 0 = good
Train Data Class Count: 2
Feature Generator processed 800 data points with 20 features
Original Features (raw dtypes):
	category features: 13
	float64 features: 7
Original Features (inferred dtypes):
	category features: 13
	float features: 7
Generated Features (special dtypes):
Final Features (raw dtypes):
	category feat

In [None]:
## model parameters
## models is a dict where key = leaner & value = param_grid
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }, 
          RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
                                    'learner__max_depth': [5, 10, 15]
                                   },
          "autogluon": {}
         }

In [None]:
for learner, param_grid in models.items():
  print(learner, param_grid)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False) {'learner__max_iter': [500, 1000, 5000], 'learner__penalty': ['l2', 'l1', 'elasticnet'], 'learner__alpha': [0.0001, 0.001, 0.01, 0.1]}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                    

In [None]:
model_summary = []
scores_summary = []

for learner, param_grid in models.items():
  pipeline = None
  
  if learner == 'autogluon':
    pipeline = get_pipeline(SGDClassifier(loss='log'))
  else:
    pipeline = get_pipeline(learner)
  
  model_obj = Model(seed, train_data, train_labels, test_data, test_labels, pipeline, learner, param_grid)
  
  model = model_obj.fit_model()
  model_summary.append(model)

  scores = model_obj.eval_scores(model)
  scores_summary.append(scores)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 177 out of 180 | elapsed:    8.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   19.5s finished
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200720_140922/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200720_140922/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 21
Preprocessing data ...
Here are the 2 unique label values in your data:  ['bad', 'good']
AutoGluon infers your prediction problem is: binary  (because only two unique label-values observed).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Selected class <--> label mapping:  class 1 = good, class 0 = bad
Train Data Class Count: 2
NumExpr defaulting to 2 threads.
Feature Generator processed 800 data points with 20 features
Original Features (raw dtypes):
	category features: 13
	float64 features: 7
Original Features (inferred dtypes):
	category features

In [None]:
model_summary

[GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('features',
                                         ColumnTransformer(n_jobs=None,
                                                           remainder='drop',
                                                           sparse_threshold=0.3,
                                                           transformer_weights=None,
                                                           transformers=[('categorical_features',
                                                                          Pipeline(memory=None,
                                                                                   steps=[('imputer',
                                                                                           SimpleImputer(add_indicator=False,
                                                                                                         copy=True,
           

In [None]:
scores_summary

[0.8093735390369332,
 0.7849462365591399,
 OrderedDict([('accuracy', 0.73),
              ('accuracy_score', 0.73),
              ('balanced_accuracy_score', 0.6666666666666667),
              ('matthews_corrcoef', 0.3472488574259035),
              ('f1_score', 0.7299999999999999),
              ('classification_report',
               {'accuracy': 0.73,
                'bad': {'f1-score': 0.5344827586206897,
                 'precision': 0.5740740740740741,
                 'recall': 0.5,
                 'support': 62},
                'good': {'f1-score': 0.8098591549295775,
                 'precision': 0.7876712328767124,
                 'recall': 0.8333333333333334,
                 'support': 138},
                'macro avg': {'f1-score': 0.6721709567751336,
                 'precision': 0.6808726534753933,
                 'recall': 0.6666666666666667,
                 'support': 200},
                'weighted avg': {'f1-score': 0.7244924720738223,
                 'precisi

## Corruptions

In [None]:
from jenga.corruptions.perturbations import Perturbation

In [None]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

In [None]:
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]
fraction = 0.5

In [None]:
# corruption perturbations to apply
corr_perturbations = Perturbation(categorical_columns, numerical_columns)

In [None]:
test_data_corrupted, perturbations, cols_perturbed, summary_col_corrupt = corr_perturbations.apply_perturbation(test_data, corruptions, fraction)

Applying perturbations... 

MissingValues: {'column': 'other_parties', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
SwappedValues: {'column_a': 'checking_status', 'column_b': 'employment', 'fraction': 0.5}
Scaling: {'column': 'residence_since', 'fraction': 0.5}
GaussianNoise: {'column': 'age', 'fraction': 0.5}


### Visualize the original and corrupted test set

In [None]:
## original test data
plot_data(pd.concat([test_data, pd.Series(test_labels, name='class')], axis=1))

In [None]:
## corrupted test data
plot_data(pd.concat([test_data_corrupted, pd.Series(test_labels, name='class')], axis=1))

## Model & Corruptions using PPP

In [7]:
from sklearn.linear_model import SGDClassifier

learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [8]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]
fraction = 0.5
num_repetitions = 5

In [9]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrpted data
df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    4.4s



Generating corrupted training data on 300 rows... 

	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: SwappedValues: {'column_a': 'property_magnitude', 'column_b': 'purpose', 'fraction': 0.5}
	perturbation: Scaling: {'column': 'installment_commitment', 'fraction': 0.5}
	perturbation: GaussianNoise: {'column': 'age', 'fraction': 0.5}
	perturbation: MissingValues: {'column': 'existing_credits', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
	perturbation: SwappedValues: {'column_a': 'purpose', 'column_b': 'own_telephone', 'fraction': 0.5}
	perturbation: Scaling: {'column': 'num_dependents', 'fraction': 0.5}
	perturbation: GaussianNoise: {'column': 'num_dependents', 'fraction': 0.5}
	perturbation: MissingValues: {'column': 'property_magnitude', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: SwappedValues: {'column_a': 'other_parties', 'column_b': 'credit_history', 'fract

[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    9.5s finished


In [10]:
df_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,rent,unemployed,3.000000,female div/dep/mar,co applicant,4.0,real estate,22.0,none,<100,1.0,skilled,1.0,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,rent,1<=X<4,4.000000,male single,none,2.0,life insurance,28.0,,<100,2.0,skilled,1.0,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.000000,female div/dep/mar,none,4.0,life insurance,37.0,,own,1.0,skilled,10.0,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,own,>=7,-3.771531,female div/dep/mar,none,2.0,life insurance,25.0,,no known savings,1.0,skilled,10.0,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.000000,male single,none,2.0,life insurance,37.0,,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,no checking,6.0,existing paid,radio/tv,2108.0,<100,4<=X<7,0.292980,male mar/wid,none,2.0,real estate,29.0,,rent,1.0,skilled,1.0,none,yes
838,<0,24.0,critical/other existing credit,used car,2957.0,<100,>=7,3.152586,male single,none,4.0,life insurance,63.0,,own,2.0,skilled,1.0,yes,yes
974,no checking,30.0,critical/other existing credit,radio/tv,2831.0,<100,1<=X<4,4.000000,female div/dep/mar,none,2.0,car,33.0,,own,1.0,skilled,1.0,yes,yes
203,<0,12.0,existing paid,retraining,902.0,rent,4<=X<7,4.000000,male mar/wid,none,4.0,life insurance,21.0,none,<100,1.0,skilled,10.0,none,yes


## Cleaning

### Cleaning using AutoGluon

In [11]:
from autogluon import TabularPrediction as task

In [12]:
def cat_cols_to_str(df):
  for col in df.columns:
    if pd.api.types.is_categorical_dtype(df[col]):
      df[col] = df[col].astype(str)

  return df

In [13]:
train_data = cat_cols_to_str(train_data)
df_corrupted = cat_cols_to_str(df_corrupted)
test_data = cat_cols_to_str(test_data)

In [14]:
categorical_precision_threshold = 0.7
numerical_std_error_threshold = 2

#### Training

In [15]:
predictors = {}
predictable_cols = {}

In [16]:
for col in categorical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='multiclass')

  y_test = test_data[col].dropna() # take only the non-nan records # test_data? OR split the train_data again into train and test
  y_pred = predictors[col].predict(test_data.drop([col], axis=1)) # drop the actual column before predicting

  perf = predictors[col].evaluate_predictions(y_test, y_pred, auxiliary_metrics=True)

  labels = [k for k in perf['classification_report'].keys() if k not in ['accuracy', 'macro avg', 'weighted avg']]

  high_precision_labels = []
  for label in labels:
    if perf['classification_report'][label]['precision'] > categorical_precision_threshold:
      high_precision_labels.append(label)

  if high_precision_labels:
    # predictable_cols.append(col)
    # predictors[col].high_precision_labels = high_precision_labels
    predictable_cols[col] = high_precision_labels


No output_directory specified. Models will be saved in: AutogluonModels/ag-20200728_225121/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200728_225121/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.16s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

In [17]:
for col in numerical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='regression')

  y_test = test_data[col].dropna() # take only the non-nan records # test_data? OR split the train_data again into train and test
  y_pred = predictors[col].predict(test_data.drop([col], axis=1)) # drop the actual column before predicting

  perf = predictors[col].evaluate_predictions(y_test, y_pred, auxiliary_metrics=True)

  if perf['root_mean_squared_error'] < numerical_std_error_threshold * y_test.std():
    # predictable_cols.append(col)
    # predictors[col].root_mean_squared_error = perf['root_mean_squared_error']
    predictable_cols[col] = perf['root_mean_squared_error']

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200728_225742/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200728_225742/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	object features: 13
	float64 features: 6
Original Features (inferred dtypes):
	object features: 13
	float features: 6
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 6
	category features: 13
Final Features:
	float features: 6
	category features: 13
	Data preprocessing and feature engineering runtime = 0.13s ...
AutoGluon will gauge predictive performance using evaluation metric: root_mean_squared_error
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: root_mean_squared_error
Fitting model: RandomForestRegressorM

In [18]:
predictable_cols

{'age': 12.503235268996436,
 'checking_status': ['>=200'],
 'credit_amount': 3111.5654444954134,
 'credit_history': ['existing paid'],
 'duration': 16.038427993723687,
 'existing_credits': 0.696056329374258,
 'foreign_worker': ['yes'],
 'housing': ['for free', 'own'],
 'installment_commitment': 1.3120735525116587,
 'num_dependents': 0.40345596810191436,
 'other_parties': ['none'],
 'other_payment_plans': ['bank', 'none'],
 'own_telephone': ['yes'],
 'property_magnitude': ['no known property'],
 'residence_since': 1.2693950401902552}

In [19]:
predictors

{'age': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff15e918ba8>,
 'checking_status': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0d8aceb38>,
 'credit_amount': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0d5815780>,
 'credit_history': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0dbabc0f0>,
 'duration': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0cb68a518>,
 'employment': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0f2dcd470>,
 'existing_credits': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0d57d5908>,
 'foreign_worker': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0d5808d68>,
 'housing': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0d57a81d0>,
 'installment_commitment': <autogluon.task.tabular_prediction.predictor.TabularPredictor at 0x7ff0cb680a20>,
 'job': 

In [20]:
print(f"Found {len(predictable_cols.keys())} predictable columns: {predictable_cols.keys()}")

Found 15 predictable columns: dict_keys(['checking_status', 'credit_history', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'own_telephone', 'foreign_worker', 'duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'])


#### Outlier Detection/Removal

In [70]:
col = "age"

In [45]:
y_pred = predictors[col].predict(df_corrupted)
y_pred

array([43.51275427, 35.83040951, 40.05013168, 42.83572055, 35.04089732,
       32.17258873, 32.48700568, 35.27354521, 34.68426327, 39.24061232,
       31.28843832, 34.51434478, 40.39149956, 41.5410409 , 33.86552249,
       39.5880507 , 41.67894465, 38.34793572, 39.10610223, 45.02336698,
       44.0762099 , 34.81241474, 29.62976866, 42.59616951, 34.79584546,
       39.95271712, 37.63975811, 41.59510478, 35.63257529, 44.8476982 ,
       39.27980516, 36.53296746, 44.58683467, 47.1799268 , 33.71600757,
       38.91013749, 30.62773118, 34.7944822 , 41.36272217, 41.45807076,
       32.30584115, 30.47521142, 30.72300508, 33.46067158, 41.99328106,
       32.48345083, 39.22818273, 36.27138376, 43.51777878, 37.37116436,
       35.4271282 , 40.9279374 , 38.25906167, 45.77320611, 37.74736415,
       36.72325077, 34.18338425, 41.00481232, 38.58701428, 36.53815808,
       48.95578047, 36.4702659 , 38.65048948, 37.91686703, 35.84799479,
       38.16634484, 42.28681682, 40.35830266, 36.28390818, 40.34

In [46]:
y_test = df_corrupted[col]
y_test

249    22.0
353    28.0
537    37.0
424    25.0
564    37.0
       ... 
193    29.0
838    63.0
974    33.0
203    21.0
425    21.0
Name: age, Length: 300, dtype: float64

In [47]:
df_corrupted[col].isnull().sum()

0

In [48]:
predictable_cols[col]

12.519270831214259

In [50]:
auxiliary_df_test_pred = pd.DataFrame(y_test)
auxiliary_df_test_pred["pred"] = y_pred

auxiliary_df_test_pred

Unnamed: 0,age,pred
249,22.0,43.512754
353,28.0,35.830410
537,37.0,40.050132
424,25.0,42.835721
564,37.0,35.040897
...,...,...
193,29.0,29.707539
838,63.0,43.609453
974,33.0,33.316747
203,21.0,34.648074


In [90]:
auxiliary_df_test_pred.loc[173, col] != auxiliary_df_test_pred.loc[173, "pred"]

True

In [55]:
np.sqrt((auxiliary_df_test_pred.loc[330, "pred"] - auxiliary_df_test_pred.loc[330, col]) ** 2)

33.32105535163225

In [57]:
np.sqrt((auxiliary_df_test_pred.loc[330, "pred"] - auxiliary_df_test_pred.loc[330, col]) ** 2) > predictable_cols[col] * numerical_std_error_threshold

True

In [95]:
presumably_wrong = {}

if col in categorical_columns:
  presumably_wrong_aux = []
  for i in auxiliary_df_test_pred.index:
    if any(np.isin(predictable_cols[col], auxiliary_df_test_pred.loc[i, "pred"])) & (auxiliary_df_test_pred.loc[i, col] != auxiliary_df_test_pred.loc[i, "pred"]):
      presumably_wrong_aux.append(i)

  presumably_wrong[col] = np.array(presumably_wrong_aux)

presumably_wrong

{'housing': array([544, 173, 759, 955, 121, 230,  11, 659, 419, 944, 417, 374, 982,
        139, 218, 449,  16, 904, 381, 329, 334, 403, 940, 349, 809, 445,
        890, 883, 539,  27,  64, 442, 131, 437,  22, 649, 941, 988, 935,
        663, 872, 375, 482, 193, 248, 271, 531, 138, 354, 643, 700, 438,
        529, 625, 181, 861, 304, 709, 898, 726, 614, 339, 440,  52,  75,
        707, 326, 855, 826,  43, 195, 916,  92, 196, 839, 939, 104, 280,
        335, 901, 711, 923, 854, 172, 444, 811, 774,  85, 250,  34, 124,
        739, 507, 750, 728, 602, 299, 222,  31, 300, 960, 943, 378, 576,
        274, 192, 398, 520, 188, 194, 225, 580, 428, 244, 588,  12, 765,
        950, 905, 813, 875, 593, 754, 590, 682, 190, 880, 617, 853, 140,
        817, 555, 592, 800, 565, 197, 163, 642, 264, 152, 685, 116, 903,
        859, 307, 942, 467,  30, 345, 100, 284, 646, 731, 894,  35, 802])}

In [54]:
presumably_wrong = {}

if col in numerical_columns:
  presumably_wrong_aux = []
  predictor_rmse = predictable_cols[col]
  for i in auxiliary_df_test_pred.index:
    rmse = np.sqrt((auxiliary_df_test_pred.loc[i, "pred"] - auxiliary_df_test_pred.loc[i, col]) ** 2)
    if rmse > predictor_rmse * numerical_std_error_threshold:
      presumably_wrong_aux.append(i)

  presumably_wrong[col] = np.array(presumably_wrong_aux)

presumably_wrong

{'age': array([330, 219, 430, 606, 137])}

In [72]:
for i in presumably_wrong[col]:
  df_corrupted.loc[i, col] = np.nan

df_corrupted.loc[presumably_wrong[col]]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
330,<0,24.0,critical/other existing credit,used car,6615.0,<100,unemployed,2.0,male single,none,4.0,no known property,,none,for free,2.0,high qualif/self emp/mgmt,1.0,yes,yes
219,no checking,10.0,existing paid,new car,1364.0,own,1<=X<4,2.0,female div/dep/mar,none,4.0,car,,none,,1.0,skilled,10.0,yes,yes
430,no checking,5.0,existing paid,business,3448.0,own,4<=X<7,2.428858,male single,none,4.0,real estate,,,,1.0,unskilled resident,1.0,none,yes
606,no checking,24.0,critical/other existing credit,business,4526.0,own,1<=X<4,3.0,male single,none,2.0,real estate,,none,,1.0,high qualif/self emp/mgmt,10.0,yes,yes
137,0<=X<200,12.0,existing paid,radio/tv,766.0,500<=X<1000,1<=X<4,4.0,male single,none,3.0,real estate,,,own,1.0,unskilled resident,10.0,none,yes


In [73]:
df_corrupted[col].isnull().sum()

5

In [21]:
df_outliers = df_corrupted.copy()

presumably_wrong = {}

for col in predictable_cols:
  y_pred = predictors[col].predict(df_outliers)
  y_test = df_outliers[col]

  auxiliary_df_test_pred = pd.DataFrame(y_test)
  auxiliary_df_test_pred["pred"] = y_pred

  num_nans = df_outliers[col].isnull().sum()

  if col in categorical_columns:
    presumably_wrong_aux = []
    for i in auxiliary_df_test_pred.index:
      if any(np.isin(predictable_cols[col], auxiliary_df_test_pred.loc[i, "pred"])) & (auxiliary_df_test_pred.loc[i, col] != auxiliary_df_test_pred.loc[i, "pred"]):
        presumably_wrong_aux.append(i)

    presumably_wrong[col] = np.array(presumably_wrong_aux)

  if col in numerical_columns:
    presumably_wrong_aux = []
    predictor_rmse = predictable_cols[col]
    for i in auxiliary_df_test_pred.index:
      rmse = np.sqrt((auxiliary_df_test_pred.loc[i, "pred"] - auxiliary_df_test_pred.loc[i, col]) ** 2)
      if rmse > predictor_rmse * numerical_std_error_threshold:
        presumably_wrong_aux.append(i)

    presumably_wrong[col] = np.array(presumably_wrong_aux)

  for i in presumably_wrong[col]:
    df_outliers.loc[i, col] = np.nan

  print(f"Column {col}: Num NaNs: Before: {num_nans}, Now: {df_outliers[col].isnull().sum()}")


Column checking_status: Num NaNs: Before: 0, Now: 1
Column credit_history: Num NaNs: Before: 0, Now: 41
Column other_parties: Num NaNs: Before: 0, Now: 23
Column property_magnitude: Num NaNs: Before: 0, Now: 7
Column other_payment_plans: Num NaNs: Before: 0, Now: 180
Column housing: Num NaNs: Before: 0, Now: 160
Column own_telephone: Num NaNs: Before: 0, Now: 23
Column foreign_worker: Num NaNs: Before: 0, Now: 14
Column duration: Num NaNs: Before: 0, Now: 1
Column credit_amount: Num NaNs: Before: 0, Now: 3
Column installment_commitment: Num NaNs: Before: 0, Now: 57
Column residence_since: Num NaNs: Before: 0, Now: 1
Column age: Num NaNs: Before: 0, Now: 5
Column existing_credits: Num NaNs: Before: 0, Now: 2
Column num_dependents: Num NaNs: Before: 0, Now: 144


In [22]:
presumably_wrong

{'age': array([330, 219, 430, 606, 137]),
 'checking_status': array([156]),
 'credit_amount': array([378, 381, 917]),
 'credit_history': array([537, 553, 395, 781, 480, 244, 157, 345, 666, 567, 602, 301, 686,
        247, 450, 606, 299, 207, 175, 614, 213, 551,  98, 675, 669, 404,
        599, 184, 103, 565, 153, 861, 829, 474, 272,  23, 255, 160, 615,
        282, 974]),
 'duration': array([917]),
 'existing_credits': array([197, 590]),
 'foreign_worker': array([930, 173,  24, 515, 268, 247, 859, 117, 264, 413,  22, 325, 156,
        108]),
 'housing': array([353, 424, 124, 553, 935, 633, 146, 672, 395, 923, 629, 216,  44,
        116, 639, 879,  48, 562, 178, 197, 266, 717, 489, 229, 888, 480,
        173, 378, 310,  43, 704, 745,  61, 512, 547, 402,  81, 157, 483,
        448, 558, 934, 219, 263, 661,  46, 567, 733, 602, 114, 509,  51,
        360, 101, 265, 239, 201, 430, 710, 133, 205, 686, 816, 311,  41,
        247, 937, 458, 859, 980, 503, 149, 764, 117, 191, 950, 107, 413,
   

#### Imputation

In [None]:
from autogluon import TabularPrediction as task

In [None]:
def cat_cols_to_str(df):
  for col in df.columns:
    if pd.api.types.is_categorical_dtype(df[col]):
      df[col] = df[col].astype(str)

  return df

In [None]:
train_data = cat_cols_to_str(train_data)
df_corrupted = cat_cols_to_str(df_corrupted)

In [None]:
predictors = {}

for col in categorical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='multiclass')
  
for col in numerical_columns:
  predictors[col] = task.fit(train_data=train_data, label=col, problem_type='regression')

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200724_110758/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200724_110758/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4
Feature Generator processed 800 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.17s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini

In [23]:
df_imputed = df_outliers.copy()

In [27]:
for col in df_outliers.columns:
  y_test = df_imputed[col]
  y_pred = predictors[col].predict(df_imputed.drop([col], axis=1)) # drop the actual column before predicting
  
  perf = predictors[col].evaluate_predictions(y_test, y_pred, auxiliary_metrics=False) ## True gives error

  df_imputed[col + '_imputed'] = y_pred

  df_imputed[col].fillna(df_imputed[col + '_imputed'], inplace=True)

Evaluation: accuracy on test data: 0.39
Evaluation: root_mean_squared_error on test data: 16.6862859041637
Evaluation: accuracy on test data: 0.63
Evaluation: accuracy on test data: 0.26666666666666666
Evaluation: root_mean_squared_error on test data: 3487.151804543952
Evaluation: accuracy on test data: 0.29333333333333333
Evaluation: accuracy on test data: 0.39
Evaluation: root_mean_squared_error on test data: 1.4394324128176104
Evaluation: accuracy on test data: 0.6266666666666667
Evaluation: accuracy on test data: 0.9233333333333333
Evaluation: root_mean_squared_error on test data: 1.2432646924887438
Evaluation: accuracy on test data: 0.42333333333333334
Evaluation: root_mean_squared_error on test data: 12.170260161445974
Evaluation: accuracy on test data: 0.3933333333333333
Evaluation: accuracy on test data: 0.43
Evaluation: root_mean_squared_error on test data: 0.6386246067389293
Evaluation: accuracy on test data: 0.65
Evaluation: root_mean_squared_error on test data: 0.4318443389

In [28]:
df_imputed

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,checking_status_imputed,duration_imputed,credit_history_imputed,purpose_imputed,credit_amount_imputed,savings_status_imputed,employment_imputed,installment_commitment_imputed,personal_status_imputed,other_parties_imputed,residence_since_imputed,property_magnitude_imputed,age_imputed,other_payment_plans_imputed,housing_imputed,existing_credits_imputed,job_imputed,num_dependents_imputed,own_telephone_imputed,foreign_worker_imputed
249,no checking,18.0,existing paid,radio/tv,433.0,rent,unemployed,3.000000,female div/dep/mar,none,4.0,car,22.0,none,<100,1.0,skilled,1.000000,none,yes,0<=X<200,9.595051,existing paid,radio/tv,2513.036941,<100,<1,3.811189,female div/dep/mar,none,3.369345,car,42.846143,none,rent,1.156923,skilled,1.124549,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,rent,1<=X<4,4.000000,male single,none,2.0,life insurance,28.0,none,own,2.0,skilled,1.000000,yes,yes,0<=X<200,39.074231,critical/other existing credit,furniture/equipment,2176.267325,<100,1<=X<4,2.562566,male single,none,3.079011,car,36.853478,none,own,1.770216,skilled,1.148943,none,yes
537,0<=X<200,18.0,existing paid,furniture/equipment,3612.0,<100,>=7,3.000000,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,1.047665,yes,yes,no checking,22.845383,existing paid,new car,2721.563758,<100,1<=X<4,1.858894,male single,none,3.113792,car,39.074213,none,own,1.105424,high qualif/self emp/mgmt,1.047665,none,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,own,>=7,2.487725,female div/dep/mar,none,2.0,life insurance,25.0,none,own,1.0,skilled,1.039282,yes,yes,no checking,19.166808,existing paid,new car,3705.004340,<100,<1,2.487725,male single,none,3.447030,car,41.730986,none,own,1.088711,skilled,1.039282,none,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.000000,male single,none,2.0,life insurance,37.0,none,own,2.0,high qualif/self emp/mgmt,1.000000,yes,yes,no checking,32.541165,critical/other existing credit,radio/tv,4289.893097,<100,unemployed,2.478173,male single,none,2.977250,car,35.413415,none,own,1.721355,skilled,1.447608,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,no checking,6.0,existing paid,radio/tv,2108.0,<100,4<=X<7,0.292980,male mar/wid,none,2.0,real estate,29.0,none,own,1.0,skilled,1.000000,none,yes,no checking,12.854188,existing paid,new car,1835.076213,<100,1<=X<4,1.748051,female div/dep/mar,none,2.350578,car,34.926667,none,own,1.079189,skilled,1.105846,none,yes
838,<0,24.0,critical/other existing credit,used car,2957.0,<100,>=7,3.152586,male single,none,4.0,life insurance,63.0,none,own,2.0,skilled,1.000000,yes,yes,no checking,23.079064,critical/other existing credit,new car,4004.330014,<100,>=7,3.378114,male single,none,3.697664,car,43.438849,none,own,1.841947,skilled,1.221256,none,yes
974,no checking,30.0,existing paid,radio/tv,2831.0,<100,1<=X<4,4.000000,female div/dep/mar,none,2.0,car,33.0,none,own,1.0,skilled,1.000000,yes,yes,<0,25.879135,existing paid,radio/tv,3819.915005,<100,1<=X<4,3.127492,male single,none,2.885709,car,31.334555,none,own,1.091442,skilled,1.060856,none,yes
203,<0,12.0,existing paid,retraining,902.0,rent,4<=X<7,4.000000,male mar/wid,none,4.0,life insurance,21.0,none,own,1.0,skilled,0.974335,none,yes,0<=X<200,14.213030,existing paid,radio/tv,1687.008585,<100,<1,3.412312,female div/dep/mar,none,2.838425,real estate,33.887987,none,own,1.076322,skilled,0.974335,none,yes


In [29]:
df_imputed = df_imputed[df_outliers.columns]
df_imputed

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,rent,unemployed,3.000000,female div/dep/mar,none,4.0,car,22.0,none,<100,1.0,skilled,1.000000,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,rent,1<=X<4,4.000000,male single,none,2.0,life insurance,28.0,none,own,2.0,skilled,1.000000,yes,yes
537,0<=X<200,18.0,existing paid,furniture/equipment,3612.0,<100,>=7,3.000000,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,1.047665,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,own,>=7,2.487725,female div/dep/mar,none,2.0,life insurance,25.0,none,own,1.0,skilled,1.039282,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.000000,male single,none,2.0,life insurance,37.0,none,own,2.0,high qualif/self emp/mgmt,1.000000,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,no checking,6.0,existing paid,radio/tv,2108.0,<100,4<=X<7,0.292980,male mar/wid,none,2.0,real estate,29.0,none,own,1.0,skilled,1.000000,none,yes
838,<0,24.0,critical/other existing credit,used car,2957.0,<100,>=7,3.152586,male single,none,4.0,life insurance,63.0,none,own,2.0,skilled,1.000000,yes,yes
974,no checking,30.0,existing paid,radio/tv,2831.0,<100,1<=X<4,4.000000,female div/dep/mar,none,2.0,car,33.0,none,own,1.0,skilled,1.000000,yes,yes
203,<0,12.0,existing paid,retraining,902.0,rent,4<=X<7,4.000000,male mar/wid,none,4.0,life insurance,21.0,none,own,1.0,skilled,0.974335,none,yes


### Using class cleaner

In [12]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, AutoGluonImputation

cleaners = [
    # (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, AutoGluonImputation)
    # (PyODKNN, NoImputation),
    # (PyODKNN, MeanModeImputation),
    # (PyODKNN, AutoGluonImputation),
    # (PyODIsolationForest, NoImputation),
    # (PyODIsolationForest, MeanModeImputation),
    # (PyODIsolationForest, AutoGluonImputation)
]

In [13]:
from jenga.cleaning.clean import Clean

clean = Clean(train_data, df_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)
df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(train_data, df_corrupted)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200728_192923/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200728_192923/
AutoGluon Version:  0.0.12
Train Data Rows:    700
Train Data Columns: 20
Preprocessing data ...
Train Data Class Count: 4



Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7709105682799249, 'classification_report': {'bad': {'precision': 0.6388888888888888, 'recall': 0.23711340206185566, 'f1-score': 0.3458646616541353, 'support': 97}, 'good': {'precision': 0.7196969696969697, 'recall': 0.9359605911330049, 'f1-score': 0.8137044967880085, 'support': 203}, 'accuracy': 0.71, 'macro avg': {'precision': 0.6792929292929293, 'recall': 0.5865369965974303, 'f1-score': 0.5797845792210718, 'support': 300}, 'weighted avg': {'precision': 0.6935690235690235, 'recall': 0.71, 'f1-score': 0.6624362834280562, 'support': 300}}}
PPP scores with cleaning: 


Feature Generator processed 700 data points with 19 features
Original Features (raw dtypes):
	float64 features: 7
	object features: 12
Original Features (inferred dtypes):
	float features: 7
	object features: 12
Generated Features (special dtypes):
Final Features (raw dtypes):
	float features: 7
	category features: 12
Final Features:
	float features: 7
	category features: 12
	Data preprocessing and feature engineering runtime = 0.18s ...
AutoGluon will gauge predictive performance using evaluation metric: accuracy
To change this, specify the eval_metric argument of fit()
AutoGluon will early stop models using evaluation metric: accuracy
Fitting model: RandomForestClassifierGini ...
	0.4929	 = Validation accuracy score
	0.95s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestClassifierEntr ...
	0.5214	 = Validation accuracy score
	0.94s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini ...
	0.4571	 = Validation accuracy sc

Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': AutoGluonImputation}: {'roc_auc_acore': 0.7713676298816718, 'classification_report': {'bad': {'precision': 0.6190476190476191, 'recall': 0.26804123711340205, 'f1-score': 0.37410071942446044, 'support': 97}, 'good': {'precision': 0.7248062015503876, 'recall': 0.9211822660098522, 'f1-score': 0.8112798264642082, 'support': 203}, 'accuracy': 0.71, 'macro avg': {'precision': 0.6719269102990033, 'recall': 0.5946117515616272, 'f1-score': 0.5926902729443343, 'support': 300}, 'weighted avg': {'precision': 0.6906109265411592, 'recall': 0.71, 'f1-score': 0.6699252485213564, 'support': 300}}}


Fitting model: RandomForestClassifierGini ...
	0.4929	 = Validation accuracy score
	0.85s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: RandomForestClassifierEntr ...
	0.5071	 = Validation accuracy score
	0.94s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierGini ...
	0.4429	 = Validation accuracy score
	0.74s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: ExtraTreesClassifierEntr ...
	0.4714	 = Validation accuracy score
	0.76s	 = Training runtime
	0.12s	 = Validation runtime
Fitting model: KNeighborsClassifierUnif ...
	0.3429	 = Validation accuracy score
	0.02s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: KNeighborsClassifierDist ...
	0.3143	 = Validation accuracy score
	0.02s	 = Training runtime
	0.11s	 = Validation runtime
Fitting model: LightGBMClassifier ...
	0.5571	 = Validation accuracy score
	0.54s	 = Training runtime
	0.02s	 = Validation runtime
Fitting model: CatboostClassifier ...
	0.55

KeyboardInterrupt: ignored

In [None]:
df_cleaned

NameError: ignored

### Imputation

In [None]:
mean_mode_imputer = MeanModeImputation(train_data, test_data_corrupted, categorical_columns, numerical_columns)

test_data_mm_imputed = mean_mode_imputer.fit_transform(train_data, test_data_corrupted)
test_data_mm_imputed

In [None]:
datawig_imputer = DatawigImputation(train_data, test_data_corrupted, categorical_columns, numerical_columms)

test_data_dw_imputed = datawig_imputer.fit_transform(train_data, test_data_corrupted)
test_data_dw_imputed

##### Using PPP

In [None]:
# for all imputers return scores, take best
# using ppp

In [None]:
from jenga.cleaning.imputation import MeanModeImputation, DatawigImputation
from jenga.cleaning.ppp import PipelinePerformancePrediction

In [None]:
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

In [None]:
# generate corrpted test data
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
imputer_candidates = [MeanModeImputation, DatawigImputation]

In [None]:
imputers = []
for imputer in imputer_candidates:
    imputers.append(imputer(train_data, test_data_corrupted, categorical_columns, numerical_columns))

In [None]:
imputers

In [None]:
ppp_model = ppp.fit_ppp(train_data)

In [None]:
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data)
score_no_cleaning

In [None]:
imputed_scores_ppp = []
for imputer in imputers:
    test_data_imputed = imputer.fit_transform(train_data, test_data_corrupted)
    imputed_score = ppp.predict_score_ppp(ppp_model, test_data_imputed)
    print(f"PPP score with {imputer}: {imputed_score}")
    imputed_scores_ppp.append(imputed_score)

In [None]:
imputed_scores_ppp

##### Using PPP and Cleaner classes

In [None]:
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.cleaner import Cleaner
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [None]:
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

In [None]:
ppp_model = ppp.fit_ppp(train_data)

In [None]:
ppp_model_score = ppp.predict_score_ppp(ppp_model, test_data)
ppp_model_score

In [None]:
# generate corrpted test data
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data_corrupted)
score_no_cleaning

In [None]:
cleaner_candidates = [
    (NoOutlierDetection, NoImputation),
    (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODKNN, DatawigImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation),
    (PyODIsolationForest, DatawigImputation)
]

In [None]:
cleaners = []
for outd, imp in cleaner_candidates:
    cleaners.append(Cleaner(train_data, 
                            test_data_corrupted, 
                            categorical_columns, 
                            numerical_columns, 
                            outlier_detection = outd(train_data, 
                                                     test_data_corrupted, 
                                                     categorical_columns, 
                                                     numerical_columns), 
                            imputation = imp(train_data, 
                                             test_data_corrupted, 
                                             categorical_columns, 
                                             numerical_columns)
                           ))

In [None]:
cleaner_scores_ppp = []
for cleaner in cleaners:
    test_data_cleaned = cleaner.apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    cleaner_score = ppp.predict_score_ppp(ppp_model, test_data_cleaned)
    print(f"PPP score with {cleaner}: {cleaner_score}")
    cleaner_scores_ppp.append(cleaner_score)

In [None]:
cleaner_scores_ppp

In [None]:
best_cleaning_idx = pd.Series(cleaner_scores_ppp).idxmax()
best_cleaning_idx

In [None]:
best_cleaning_score = cleaner_scores_ppp[best_cleaning_idx]
best_cleaning_score

In [None]:
if best_cleaning_score > score_no_cleaning:
    test_data_cleaned = cleaners[best_cleaning_idx].apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    print(f"Best cleaning method: {cleaners[best_cleaning_idx]}: {best_cleaning_score}")
else:
    print("Cleaning didnt't improve the score")

In [None]:
## using clean class

In [None]:
clean = Clean(train_data, test_data_corrupted, categorical_columns, numerical_columns, ppp, ppp_model)

In [None]:
test_data_cleaned, score_no_cleaning, cleaner_scores_ppp = clean(train_data, test_data_corrupted)

In [None]:
from jenga.cleaning.outlier_detection import NoOutlierDetection
from jenga.cleaning.imputation import NoImputation


class Cleaner:
    
    def __init__(self, 
                 df_train,
                 df_corrupted,
                 categorical_columns,
                 numerical_columns,
                 outlier_detection=NoOutlierDetection, 
                 imputation=NoImputation):
        self.outlier_detection = outlier_detection
        self.imputation = imputation
        
    
    def apply_cleaner(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        df_cleaned = self.outlier_detection(df_train, df_corrupted)
        
        # do something for fixing/removing the outliers
        if 'outlier' in df_cleaned.columns:
            ### TODO 
            df_cleaned = df_cleaned.drop('outlier', axis=1)
            
        # impute
        df_cleaned = self.imputation(df_train, df_cleaned)
        
        return df_cleaned

In [None]:
import pandas as pd

from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.cleaner import Cleaner
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation


DEFAULT_CLEANERS = [
    (NoOutlierDetection, NoImputation),
    (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODKNN, DatawigImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation),
    (PyODIsolationForest, DatawigImputation)
]


class Clean:
    
    def __init__(self, 
                 df_train, 
                 df_corrupted, 
                 categorical_columns, 
                 numerical_columns,
                 ppp,
                 ppp_model,
                 cleaners=DEFAULT_CLEANERS):

        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        
        self.ppp = ppp
        self.ppp_model = ppp_model
        
        self.cleaners = []
        for outd, imp in cleaners:
            self.cleaners.append(Cleaner(df_train,
                                         df_corrupted,
                                         self.categorical_columns,
                                         self.numerical_columns,
                                         outlier_detection = outd(df_train,
                                                                  df_corrupted,
                                                                  self.categorical_columns,
                                                                  self.numerical_columns),
                                         imputation = imp(df_train,
                                                          df_corrupted,
                                                          self.categorical_columns,
                                                          self.numerical_columns)
                                        )
                                )
            
        
    def get_cleaned(self, df_train, df_corrupted):
        
        score_no_cleaning = self.ppp.predict_score_ppp(self.ppp_model, df_corrupted)
        print(f"PPP score no cleaning: {score_no_cleaning}")
        
        cleaner_scores_ppp = []
        for cleaner in self.cleaners:
            df_cleaned = cleaner.apply_cleaner(df_train, df_corrupted, self.categorical_columns, self.numerical_columns)
            cleaner_score = self.ppp.predict_score_ppp(self.ppp_model, df_cleaned)
            print(f"PPP score with cleaning: {cleaner}: {cleaner_score}")
            cleaner_scores_ppp.append(cleaner_score)
            
        best_cleaning_idx = pd.Series(cleaner_scores_ppp).idxmax()
        best_cleaning_score = cleaner_scores_ppp[best_cleaning_idx]
        if best_cleaning_score > score_no_cleaning:
            df_cleaned = self.cleaners[best_cleaning_idx].apply_cleaner(df_train, df_corrupted, self.categorical_columns, self.numerical_columns)
            print(f"Best cleaning method: {self.cleaners[best_cleaning_idx]}: {best_cleaning_score}")
        else:
            print("Cleaning didnt't improve the score")
            
        return df_cleaned, score_no_cleaning, cleaner_scores_ppp
    
    
    def __call__(self, df_train, df_corrupted):
        return self.get_cleaned(df_train, df_corrupted)

### Outlier Detection

In [None]:
# detection using KNN from PyOD
outlier = PyODKNN(train_data, test_data_corrupted, categorical_columns, numerical_columms)

In [None]:
test_data_corrupted_outliers = outlier.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers.head(10)

In [None]:
# detection using Isolation Forest from PyOD
outlier_if = PyODIsolationForest(train_data, test_data_corrupted, categorical_columns, numerical_columms)

In [None]:
test_data_corrupted_outliers_if = outlier_if.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers_if.head(10)

#### Preparing the outliers for imputation

In [None]:
if "outlier" in test_data_corrupted_outliers.columns:
    print(f'Setting {test_data_corrupted_outliers["outlier"].sum()} to Nan')
    test_data_corrupted_outliers.loc[test_data_corrupted_outliers["outlier"], :] = np.nan
    test_data_corrupted_outliers = test_data_corrupted_outliers.drop('outlier', axis=1)

In [None]:
## train_data, test_data_corrupted, 
## check values in column in the training data -> check for outliers in the same column in the corrupted data
## store .loc 
## convert those .loc for those column into nan
## impute

In [None]:
numerical_columms

In [None]:
test_data_corrupted

In [None]:
from abc import abstractmethod

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from pyod.models.knn import KNN
from pyod.models.iforest import IForest


class OutlierDetection:
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        
        self.df_train = df_train
        self.df_corrupted = df_corrupted
        
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        
        
        # preprocessing pipeline for numerical columns
        transformer_numeric = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('standard_scale', StandardScaler())
        ])

        # preprocessing pipeline for categorical columns
        transformer_categorical = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
            ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
        ])

        # preprocessor
        self.feature_transform = ColumnTransformer(transformers=[
            ('categorical_features', transformer_categorical, self.categorical_columns),
            ('numerical_features', transformer_numeric, self.numerical_columns)
        ], sparse_threshold=1.0)
        
        
        @abstractmethod
        def fit_transform(self, df_train, df_corrupted):
            pass



class NoOutlierDetection(OutlierDetection):
    
    def fit_transform(self, df_train, df_corrupted):
        df_outliers = df_corrupted.copy()
        
        return df_outliers
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)


        
class PyODKNN(OutlierDetection):
    
    def fit_transform(self, df_train, df_corrupted):
        df_outliers = df_corrupted.copy()
        
        feature_transformation = self.feature_transform.fit(df_train)
        x = feature_transformation.transform(df_train).toarray()
        
        model = KNN()
        model.fit(x)
        
        xx = feature_transformation.transform(df_outliers).toarray()

        df_outliers["outlier"] = model.predict(xx) ## 0: inlier, 1: outlier
        
        return df_outliers
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

    
    
class PyODIsolationForest(OutlierDetection):
    
    def fit_transform(self, df_train, df_corrupted):
        df_outliers = df_corrupted.copy()
        
        feature_transformation = self.feature_transform.fit(df_train)
        x = feature_transformation.transform(df_train).toarray()
        
        model = IForest(contamination=0.25)
        model.fit(x)
        
        xx = feature_transformation.transform(df_outliers).toarray()

        df_outliers["outlier"] = model.predict(xx) ## 0: inlier, 1: outlier
        
        return df_outliers
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)


In [None]:
from abc import abstractmethod
import numpy as np
import pandas as pd

import datawig



class Imputation:
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        self.df_train = df_train
        self.df_corrupted = df_corrupted
        
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        
    
    @abstractmethod
    def fit_transform(self, df_train, df_corrupted):
        pass

    
    
class NoImputation(Imputation):    
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):        
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)
    
    
    def fit_transform(self, df_train, df_corrupted):
        df_imputed = df_corrupted.copy()
        return df_imputed
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)
    
    
    
class MeanModeImputation(Imputation):
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        self.means = {}
        self.modes = {}
    
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)
    
    
    def fit_transform(self, df_train, df_corrupted):
        df_imputed = df_corrupted.copy()
        
        for col in df_train.columns:
            if col in self.numerical_columns:
                # mean imputer
                mean = np.mean(df_train[col])
                self.means[col] = mean
            elif col in self.categorical_columns:
                # mode imputer
                mode = df_train[col].value_counts().index[0]
                self.modes[col] = mode
                
                
        for col in df_corrupted.columns:
            if col in self.numerical_columns:
                # mean imputer
                df_imputed[col].fillna(self.means[col], inplace=True)
            elif col in self.categorical_columns:
                # mode imputer
                df_imputed[col].fillna(self.modes[col], inplace=True)
                
        return df_imputed
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

    

class DatawigImputation(Imputation):
    
    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):        
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)
    
    
    def fit_transform(self, df_train, df_corrupted):
        df_imputed = df_corrupted.copy()

        for col in df_train.columns:
            if pd.api.types.is_categorical_dtype(df_train[col]):
                df_train[col] = df_train[col].astype(str)

        for col in df_corrupted.columns:
            if pd.api.types.is_categorical_dtype(df_corrupted[col]):
                df_corrupted[col] = df_corrupted[col].astype(str)


        for col in self.categorical_columns + self.numerical_columns:
            output_column = col
            input_columns = list(set(df_train.columns) - set([output_column]))

            print(f"Fitting model for column: {col}")
            model = datawig.SimpleImputer(input_columns, output_column, 'imputer_model')
            model.fit(df_train)

            df_imputed = model.predict(df_imputed)
            df_imputed[col].fillna(df_imputed[col + '_imputed'], inplace=True)
            df_imputed = df_imputed[df_corrupted.columns]

        return df_imputed
    
    
    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

## Evaluation

In [None]:
# score without cleaning
model_obj.score_on_test_data(model.predict_proba(test_data))

In [None]:
# score with corruptions
model_obj.score_on_test_data(model.predict_proba(test_data_corrupted))

In [None]:
# score with mean/mode imputation
model_obj.score_on_test_data(model.predict_proba(test_data_mm_imputed))

In [None]:
# score with datawig imputation
model_obj.score_on_test_data(model.predict_proba(test_data_dw_imputed))