## Dataset

In [None]:
# mount drive for access to the files
from google.colab import drive

drive.mount("/content/drive")

# all Drive files are available under "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

In [None]:
!pip install openml
!pip install pyod
!pip install datawig
!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

ModuleNotFoundError: ignored

In [None]:
seed = 10

In [None]:
dataset = Dataset(seed, "credit-g")

Data pickle file already exists and is up to date.


Dataset: credit-g


In [None]:
all_data = dataset.all_data
#all_data

In [None]:
attribute_names = dataset.attribute_names
#attribute_names

In [None]:
attribute_types = dataset.attribute_types
#attribute_types

In [None]:
categorical_columns = dataset.categorical_columns
#categorical_columns

In [None]:
numerical_columns = dataset.numerical_columns
#numerical_columns

In [None]:
print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Found 13 categorical and 7 numeric features 



In [None]:
# NOTE(review): this cell cannot run on its own. `self` does not exist at
# notebook level, and `y_pred`, `accuracy_score` and `precision_score` are not
# defined or imported by any cell above this one. It looks like a fragment of
# a method body that was pasted here -- TODO confirm and remove or relocate.
eval_scores = {
    "accuracy": accuracy_score(self.test_labels, y_pred), 
    "precision":precision_score(self.test_labels, y_pred)
}

### Visualize the dataset

In [None]:
## plot the original dataset
def hide_current_axis(*args, **kwds):
    """Make the currently active matplotlib axes invisible.

    All positional and keyword arguments are accepted and ignored, which lets
    the function be handed to a grid-mapping call such as `map_upper`.
    """
    current_axes = plt.gca()
    current_axes.set_visible(False)
        
def plot_data(data, hue="class"):
    """Draw a seaborn pair plot of `data` with the upper triangle hidden.

    Parameters
    ----------
    data : DataFrame passed straight to `sns.pairplot`.
    hue : name of the column used to colour the points. Defaults to
        "class", which is the value that used to be hard-coded.
    """
    sns.set_style("white") # grid/no grid style: darkgrid, whitegrid, dark, white, ticks

    pair_grid = sns.pairplot(data, hue=hue)
    # the upper triangle mirrors the lower one, so blank those panels out
    pair_grid.map_upper(hide_current_axis)
    plt.show()

In [None]:
plot_data(all_data)

### Get training and test sets

In [None]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

## Model

In [None]:
import autogluon as ag
from autogluon import TabularPrediction as task

label_col = 'class'

# Build the training frame as a NEW DataFrame. The previous `tr_data = train_data`
# only created an alias, so adding the label column silently modified
# `train_data` for every later cell.
tr_data = train_data.assign(**{label_col: train_labels})

model = task.fit(train_data=tr_data, label=label_col)

# predictions
y_pred = model.predict(test_data)

# predictor performance
perf = model.evaluate_predictions(y_true=test_labels, y_pred=y_pred, auxiliary_metrics=True)

ModuleNotFoundError: ignored

In [None]:
import random
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

import autogluon as ag
from autogluon import TabularPrediction as task


class Model:
  """Trains and evaluates one learner on a fixed train/test split.

  Two code paths exist, selected by `learner`:

  * the string 'autogluon': the training frame plus a 'class' label column is
    handed to `task.fit`; `pipeline` and `param_grid` are not used.
  * anything else: `pipeline` is tuned with a 5-fold `GridSearchCV` over
    `param_grid`, scored by ROC AUC.
  """

  def __init__(self, seed, train_data, train_labels, test_data, test_labels, pipeline, learner, param_grid):

    ## fix random seeds for reproducibility
    random.seed(seed)
    np.random.seed(seed)

    ## train and test data and labels
    self.train_data = train_data
    self.train_labels = train_labels
    self.test_data = test_data
    self.test_labels = test_labels

    ## preprocessing pipeline
    self.pipeline = pipeline

    ## information for model parameters
    self.learner = learner
    self.param_grid = param_grid


  def __repr__(self):
    return f"{self.__class__.__name__}: {self.__dict__}"


  def _uses_autogluon(self):
    """Return True when the learner is the 'autogluon' marker string."""
    return isinstance(self.learner, str) and self.learner == 'autogluon'


  # method for training a model on the raw data with preprocessing
  def fit_model(self):
    """Fit and return the model for the configured learner.

    Returns the object produced by `task.fit` on the autogluon path, otherwise
    the fitted `GridSearchCV` instance.
    """
    if self._uses_autogluon():
      label_col = 'class'

      # `assign` returns a new frame; writing the label column into
      # self.train_data directly would also change the caller's DataFrame.
      tr_data = self.train_data.assign(**{label_col: self.train_labels})

      return task.fit(train_data=tr_data, label=label_col)

    grid_search = GridSearchCV(self.pipeline, self.param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
    return grid_search.fit(self.train_data, self.train_labels)


  def score_on_test_data(self, predicted_label_probabilities):
    """Return the ROC AUC of class-probability predictions against the test labels.

    `predicted_label_probabilities` is expected to be 2-D with one column per
    class; column 1 is used as the score.
    """
    scores = np.asarray(predicted_label_probabilities)[:, 1]
    return roc_auc_score(self.test_labels, scores)


  # method for computing evaluation scores
  def eval_scores(self, model):
    """Evaluate a fitted `model` on the stored test set.

    Returns the result of `model.evaluate_predictions` on the autogluon path,
    otherwise a single ROC AUC value.
    """
    if self._uses_autogluon():
      # predictions
      y_pred = model.predict(self.test_data)

      # predictor performance
      return model.evaluate_predictions(y_true=self.test_labels, y_pred=y_pred, auxiliary_metrics=True)

    return self.score_on_test_data(model.predict_proba(self.test_data))
    

In [None]:
## define preprocessing pipeline if not given
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

def get_pipeline(learner):
  """Wrap `learner` in a prediction pipeline with the default preprocessing.

  Numerical columns are imputed with the constant 0 and standard-scaled;
  categorical columns are imputed with the constant '__NA__' and one-hot
  encoded (unknown categories ignored). The column lists are read from the
  notebook-level `categorical_columns` and `numerical_columns`.
  """
  numeric_steps = [
      ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
      ('standard_scale', StandardScaler()),
  ]
  categorical_steps = [
      ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
      ('one_hot_encode', OneHotEncoder(handle_unknown='ignore')),
  ]

  # column-wise preprocessing: categorical block first, numerical block second
  preprocessing = ColumnTransformer(transformers=[
      ('categorical_features', Pipeline(categorical_steps), categorical_columns),
      ('numerical_features', Pipeline(numeric_steps), numerical_columns),
  ])

  # the classifier is the final step after the feature transformation
  return Pipeline([
      ('features', preprocessing),
      ('learner', learner),
  ])

In [None]:
# test autogluon
# Model.fit_model never touches the pipeline on the autogluon path, so pass None.
# The previous placeholder get_pipeline(SGDClassifier(loss='log')) used
# SGDClassifier before the cell that imports it, which fails on a fresh kernel.
model_obj = Model(seed, train_data, train_labels, test_data, test_labels, None, learner="autogluon", param_grid={})
model = model_obj.fit_model()
scores = model_obj.eval_scores(model)

No output_directory specified. Models will be saved in: AutogluonModels/ag-20200720_140007/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200720_140007/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 21
Preprocessing data ...
Here are the 2 unique label values in your data:  ['good', 'bad']
AutoGluon infers your prediction problem is: binary  (because only two unique label-values observed).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Selected class <--> label mapping:  class 1 = bad, class 0 = good
Train Data Class Count: 2
Feature Generator processed 800 data points with 20 features
Original Features (raw dtypes):
	category features: 13
	float64 features: 7
Original Features (inferred dtypes):
	category features: 13
	float features: 7
Generated Features (special dtypes):
Final Features (raw dtypes):
	category feat

In [None]:
## model parameters
## models is a dict where key = learner & value = param_grid
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }, 
          RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
                                    'learner__max_depth': [5, 10, 15]
                                   },
          "autogluon": {}
         }

In [None]:
for learner, param_grid in models.items():
  print(learner, param_grid)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False) {'learner__max_iter': [500, 1000, 5000], 'learner__penalty': ['l2', 'l1', 'elasticnet'], 'learner__alpha': [0.0001, 0.001, 0.01, 0.1]}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                    

In [None]:
model_summary, scores_summary = [], []

for learner, param_grid in models.items():
  # the autogluon entry is a plain string, so a stand-in SGD estimator is used
  # to build its pipeline; every other entry is its own estimator
  pipeline = get_pipeline(SGDClassifier(loss='log') if learner == 'autogluon' else learner)

  model_obj = Model(
      seed, train_data, train_labels, test_data, test_labels,
      pipeline, learner, param_grid,
  )

  # train, then record the fitted model
  model = model_obj.fit_model()
  model_summary.append(model)

  # evaluate on the test split, then record the scores
  scores = model_obj.eval_scores(model)
  scores_summary.append(scores)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 177 out of 180 | elapsed:    8.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    8.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   19.5s finished
No output_directory specified. Models will be saved in: AutogluonModels/ag-20200720_140922/
Beginning AutoGluon training ...
AutoGluon will save models to AutogluonModels/ag-20200720_140922/
AutoGluon Version:  0.0.12
Train Data Rows:    800
Train Data Columns: 21
Preprocessing data ...
Here are the 2 unique label values in your data:  ['bad', 'good']
AutoGluon infers your prediction problem is: binary  (because only two unique label-values observed).
If this is wrong, please specify `problem_type` argument in fit() instead (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])

Selected class <--> label mapping:  class 1 = good, class 0 = bad
Train Data Class Count: 2
NumExpr defaulting to 2 threads.
Feature Generator processed 800 data points with 20 features
Original Features (raw dtypes):
	category features: 13
	float64 features: 7
Original Features (inferred dtypes):
	category features

In [None]:
model_summary

[GridSearchCV(cv=5, error_score=nan,
              estimator=Pipeline(memory=None,
                                 steps=[('features',
                                         ColumnTransformer(n_jobs=None,
                                                           remainder='drop',
                                                           sparse_threshold=0.3,
                                                           transformer_weights=None,
                                                           transformers=[('categorical_features',
                                                                          Pipeline(memory=None,
                                                                                   steps=[('imputer',
                                                                                           SimpleImputer(add_indicator=False,
                                                                                                         copy=True,
           

In [None]:
scores_summary

[0.8093735390369332,
 0.7849462365591399,
 OrderedDict([('accuracy', 0.73),
              ('accuracy_score', 0.73),
              ('balanced_accuracy_score', 0.6666666666666667),
              ('matthews_corrcoef', 0.3472488574259035),
              ('f1_score', 0.7299999999999999),
              ('classification_report',
               {'accuracy': 0.73,
                'bad': {'f1-score': 0.5344827586206897,
                 'precision': 0.5740740740740741,
                 'recall': 0.5,
                 'support': 62},
                'good': {'f1-score': 0.8098591549295775,
                 'precision': 0.7876712328767124,
                 'recall': 0.8333333333333334,
                 'support': 138},
                'macro avg': {'f1-score': 0.6721709567751336,
                 'precision': 0.6808726534753933,
                 'recall': 0.6666666666666667,
                 'support': 200},
                'weighted avg': {'f1-score': 0.7244924720738223,
                 'precisi

## Corruptions

In [None]:
from jenga.corruptions.perturbations import Perturbation

In [None]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

In [None]:
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]
fraction = 0.5

In [None]:
# corruption perturbations to apply
corr_perturbations = Perturbation(categorical_columns, numerical_columns)

In [None]:
test_data_corrupted, perturbations, cols_perturbed, summary_col_corrupt = corr_perturbations.apply_perturbation(test_data, corruptions, fraction)

Applying perturbations... 

MissingValues: {'column': 'other_parties', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
SwappedValues: {'column_a': 'checking_status', 'column_b': 'employment', 'fraction': 0.5}
Scaling: {'column': 'residence_since', 'fraction': 0.5}
GaussianNoise: {'column': 'age', 'fraction': 0.5}


### Visualize the original and corrupted test set

In [None]:
## original test data
# Attach the labels by position. pd.concat(axis=1) aligns on the index, so a
# label array wrapped in a fresh Series (RangeIndex) would be matched against
# the wrong rows whenever test_data keeps its original, non-default index.
plot_data(test_data.assign(**{'class': np.asarray(test_labels)}))

In [None]:
## corrupted test data
# Attach the labels by position. pd.concat(axis=1) aligns on the index, so a
# label array wrapped in a fresh Series (RangeIndex) would be matched against
# the wrong rows whenever the frame keeps its original, non-default index.
plot_data(test_data_corrupted.assign(**{'class': np.asarray(test_labels)}))

## Cleaning

### Imputation

In [None]:
mean_mode_imputer = MeanModeImputation(train_data, test_data_corrupted, categorical_columns, numerical_columns)

test_data_mm_imputed = mean_mode_imputer.fit_transform(train_data, test_data_corrupted)
test_data_mm_imputed

In [None]:
# `numerical_columns` (was misspelled `numerical_columms`, an undefined name)
datawig_imputer = DatawigImputation(train_data, test_data_corrupted, categorical_columns, numerical_columns)

test_data_dw_imputed = datawig_imputer.fit_transform(train_data, test_data_corrupted)
test_data_dw_imputed

##### Using PPP

In [None]:
# for all imputers return scores, take best
# using ppp

In [None]:
from jenga.cleaning.imputation import MeanModeImputation, DatawigImputation
from jenga.cleaning.ppp import PipelinePerformancePrediction

In [None]:
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

In [None]:
# generate corrupted test data
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
imputer_candidates = [MeanModeImputation, DatawigImputation]

In [None]:
imputers = []
for imputer in imputer_candidates:
    imputers.append(imputer(train_data, test_data_corrupted, categorical_columns, numerical_columns))

In [None]:
imputers

In [None]:
ppp_model = ppp.fit_ppp(train_data)

In [None]:
# baseline for the imputers below: the PPP score of the corrupted data before
# any cleaning (the clean `test_data` was scored here before, which is not a
# "no cleaning" baseline)
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data_corrupted)
score_no_cleaning

In [None]:
imputed_scores_ppp = []
for imputer in imputers:
    test_data_imputed = imputer.fit_transform(train_data, test_data_corrupted)
    imputed_score = ppp.predict_score_ppp(ppp_model, test_data_imputed)
    print(f"PPP score with {imputer}: {imputed_score}")
    imputed_scores_ppp.append(imputed_score)

In [None]:
imputed_scores_ppp

##### Using PPP and Cleaner classes

In [None]:
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.cleaner import Cleaner
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [None]:
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

In [None]:
ppp_model = ppp.fit_ppp(train_data)

In [None]:
ppp_model_score = ppp.predict_score_ppp(ppp_model, test_data)
ppp_model_score

In [None]:
# generate corrupted test data
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data_corrupted)
score_no_cleaning

In [None]:
cleaner_candidates = [
    (NoOutlierDetection, NoImputation),
    (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODKNN, DatawigImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation),
    (PyODIsolationForest, DatawigImputation)
]

In [None]:
# every component (outlier detector, imputer, cleaner) takes the same four
# leading arguments, so collect them once
component_args = (train_data, test_data_corrupted, categorical_columns, numerical_columns)

# one Cleaner per (outlier detection, imputation) candidate pair
cleaners = [
    Cleaner(*component_args,
            outlier_detection=outd(*component_args),
            imputation=imp(*component_args))
    for outd, imp in cleaner_candidates
]

In [None]:
cleaner_scores_ppp = []
for cleaner in cleaners:
    test_data_cleaned = cleaner.apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    cleaner_score = ppp.predict_score_ppp(ppp_model, test_data_cleaned)
    print(f"PPP score with {cleaner}: {cleaner_score}")
    cleaner_scores_ppp.append(cleaner_score)

In [None]:
cleaner_scores_ppp

In [None]:
best_cleaning_idx = pd.Series(cleaner_scores_ppp).idxmax()
best_cleaning_idx

In [None]:
best_cleaning_score = cleaner_scores_ppp[best_cleaning_idx]
best_cleaning_score

In [None]:
if best_cleaning_score > score_no_cleaning:
    test_data_cleaned = cleaners[best_cleaning_idx].apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    print(f"Best cleaning method: {cleaners[best_cleaning_idx]}: {best_cleaning_score}")
else:
    # No cleaner beat the baseline: keep the corrupted data unchanged. Without
    # this, test_data_cleaned would still hold the output of whichever cleaner
    # happened to run last in the scoring loop.
    test_data_cleaned = test_data_corrupted.copy()
    print("Cleaning didn't improve the score")

In [None]:
## using clean class

In [None]:
clean = Clean(train_data, test_data_corrupted, categorical_columns, numerical_columns, ppp, ppp_model)

In [None]:
test_data_cleaned, score_no_cleaning, cleaner_scores_ppp = clean(train_data, test_data_corrupted)

In [None]:
from jenga.cleaning.outlier_detection import NoOutlierDetection
from jenga.cleaning.imputation import NoImputation


class Cleaner:
    """Runs an outlier-detection step followed by an imputation step.

    Both components are invoked as `component(df_train, df)` and must return a
    DataFrame. They may be passed as ready-made instances or as classes; a
    class is instantiated with the four leading constructor arguments.
    """

    def __init__(self,
                 df_train,
                 df_corrupted,
                 categorical_columns,
                 numerical_columns,
                 outlier_detection=NoOutlierDetection,
                 imputation=NoImputation):
        component_args = (df_train, df_corrupted, categorical_columns, numerical_columns)

        # The defaults are classes, but apply_cleaner calls the components with
        # two arguments like instances. Instantiate classes here so the
        # defaults work instead of raising a TypeError on first use.
        self.outlier_detection = self._as_instance(outlier_detection, component_args)
        self.imputation = self._as_instance(imputation, component_args)


    @staticmethod
    def _as_instance(component, component_args):
        """Instantiate `component` if it is a class, otherwise return it unchanged."""
        if isinstance(component, type):
            return component(*component_args)
        return component


    def __repr__(self):
        # a readable name for the score printouts, which format the cleaner into the message
        return (f"{self.__class__.__name__}("
                f"outlier_detection={type(self.outlier_detection).__name__}, "
                f"imputation={type(self.imputation).__name__})")


    def apply_cleaner(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        """Return the cleaned version of `df_corrupted`.

        `categorical_columns` and `numerical_columns` are accepted for
        interface compatibility and are not used here.
        """
        df_cleaned = self.outlier_detection(df_train, df_corrupted)

        # do something for fixing/removing the outliers
        if 'outlier' in df_cleaned.columns:
            ### TODO: the outlier flags are currently discarded without acting on them
            df_cleaned = df_cleaned.drop('outlier', axis=1)

        # impute
        return self.imputation(df_train, df_cleaned)

In [None]:
import pandas as pd

from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.cleaner import Cleaner
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation


DEFAULT_CLEANERS = [
    (NoOutlierDetection, NoImputation),
    (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODKNN, DatawigImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation),
    (PyODIsolationForest, DatawigImputation)
]


class Clean:
    """Picks the best cleaner for a corrupted frame using a PPP score.

    For every (outlier detection, imputation) pair a `Cleaner` is built. On
    `get_cleaned`, each cleaner is applied, its output is scored with
    `ppp.predict_score_ppp`, and the best-scoring output is returned if it
    beats the score of the uncleaned data.
    """

    def __init__(self,
                 df_train,
                 df_corrupted,
                 categorical_columns,
                 numerical_columns,
                 ppp,
                 ppp_model,
                 cleaners=None):
        # None stands for DEFAULT_CLEANERS; resolving it here avoids using a
        # shared mutable list as the default argument value.
        if cleaners is None:
            cleaners = DEFAULT_CLEANERS

        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns

        self.ppp = ppp
        self.ppp_model = ppp_model

        component_args = (df_train, df_corrupted, self.categorical_columns, self.numerical_columns)
        self.cleaners = [
            Cleaner(*component_args,
                    outlier_detection=outd(*component_args),
                    imputation=imp(*component_args))
            for outd, imp in cleaners
        ]


    def get_cleaned(self, df_train, df_corrupted):
        """Score every cleaner and return the best result.

        Returns
        -------
        (df_cleaned, score_no_cleaning, cleaner_scores_ppp)
            `df_cleaned` is the output of the best cleaner when its score is
            strictly higher than `score_no_cleaning`; otherwise it is an
            unmodified copy of `df_corrupted`. `cleaner_scores_ppp` lists the
            scores in cleaner order.
        """
        score_no_cleaning = self.ppp.predict_score_ppp(self.ppp_model, df_corrupted)
        print(f"PPP score no cleaning: {score_no_cleaning}")

        cleaner_scores_ppp = []
        cleaned_frames = []  # kept so the winning output is the exact frame that was scored
        for cleaner in self.cleaners:
            candidate = cleaner.apply_cleaner(df_train, df_corrupted, self.categorical_columns, self.numerical_columns)
            cleaner_score = self.ppp.predict_score_ppp(self.ppp_model, candidate)
            print(f"PPP score with cleaning: {cleaner}: {cleaner_score}")
            cleaner_scores_ppp.append(cleaner_score)
            cleaned_frames.append(candidate)

        best_cleaning_idx = None
        if cleaner_scores_ppp:  # idxmax is undefined for an empty list of scores
            best_cleaning_idx = pd.Series(cleaner_scores_ppp).idxmax()

        if best_cleaning_idx is not None and cleaner_scores_ppp[best_cleaning_idx] > score_no_cleaning:
            df_cleaned = cleaned_frames[best_cleaning_idx]
            print(f"Best cleaning method: {self.cleaners[best_cleaning_idx]}: {cleaner_scores_ppp[best_cleaning_idx]}")
        else:
            # nothing helped: hand back the data as it came in, not the output
            # of whichever cleaner happened to run last
            df_cleaned = df_corrupted.copy()
            print("Cleaning didn't improve the score")

        return df_cleaned, score_no_cleaning, cleaner_scores_ppp


    def __call__(self, df_train, df_corrupted):
        return self.get_cleaned(df_train, df_corrupted)

### Outlier Detection

In [None]:
# detection using KNN from PyOD
# `numerical_columns` (was misspelled `numerical_columms`, an undefined name)
outlier = PyODKNN(train_data, test_data_corrupted, categorical_columns, numerical_columns)

In [None]:
test_data_corrupted_outliers = outlier.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers.head(10)

In [None]:
# detection using Isolation Forest from PyOD
# `numerical_columns` (was misspelled `numerical_columms`, an undefined name)
outlier_if = PyODIsolationForest(train_data, test_data_corrupted, categorical_columns, numerical_columns)

In [None]:
test_data_corrupted_outliers_if = outlier_if.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers_if.head(10)

#### Preparing the outliers for imputation

In [None]:
if "outlier" in test_data_corrupted_outliers.columns:
    # The flags are stored as 0/1. Indexing .loc with an integer Series selects
    # rows by LABEL (rows labelled 0 and 1), so convert to a boolean mask first.
    outlier_mask = test_data_corrupted_outliers["outlier"].astype(bool)
    feature_columns = test_data_corrupted_outliers.columns.drop("outlier")

    print(f'Setting {outlier_mask.sum()} to Nan')
    # blank out the feature values of flagged rows so the imputers can refill them
    test_data_corrupted_outliers.loc[outlier_mask, feature_columns] = np.nan
    test_data_corrupted_outliers = test_data_corrupted_outliers.drop('outlier', axis=1)

In [None]:
## train_data, test_data_corrupted, 
## check values in column in the training data -> check for outliers in the same column in the corrupted data
## store .loc 
## convert those .loc for those column into nan
## impute

In [None]:
numerical_columns

In [None]:
test_data_corrupted

In [None]:
from abc import abstractmethod

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from pyod.models.knn import KNN
from pyod.models.iforest import IForest


class OutlierDetection:
    """Base class for outlier detectors working on mixed-type DataFrames.

    The constructor stores the frames and column lists and builds
    `self.feature_transform`, a `ColumnTransformer` that imputes and one-hot
    encodes the categorical columns and imputes and standard-scales the
    numerical ones. Subclasses implement `fit_transform`; instances are
    callable and forward to it.
    """

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):

        self.df_train = df_train
        self.df_corrupted = df_corrupted

        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns


        # preprocessing pipeline for numerical columns
        transformer_numeric = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('standard_scale', StandardScaler())
        ])

        # preprocessing pipeline for categorical columns
        transformer_categorical = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
            ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
        ])

        # preprocessor
        self.feature_transform = ColumnTransformer(transformers=[
            ('categorical_features', transformer_categorical, self.categorical_columns),
            ('numerical_features', transformer_numeric, self.numerical_columns)
        ], sparse_threshold=1.0)


    # This used to be indented inside __init__, which made it a throwaway local
    # function instead of a method of the class.
    @abstractmethod
    def fit_transform(self, df_train, df_corrupted):
        """Fit on `df_train` and return a processed copy of `df_corrupted`."""
        raise NotImplementedError


    def __call__(self, df_train, df_corrupted):
        """Shorthand for `fit_transform`, so detectors can be used as callables."""
        return self.fit_transform(df_train, df_corrupted)



class NoOutlierDetection(OutlierDetection):
    """Pass-through detector: returns the corrupted data without flagging anything."""

    def __call__(self, df_train, df_corrupted):
        # calling the instance is the same as calling fit_transform
        return self.fit_transform(df_train, df_corrupted)


    def fit_transform(self, df_train, df_corrupted):
        """Return a copy of `df_corrupted`; `df_train` is not used."""
        return df_corrupted.copy()


        
class PyODKNN(OutlierDetection):
    """Outlier detector based on PyOD's `KNN` model with default settings."""

    @staticmethod
    def _dense(matrix):
        """Return `matrix` as a dense array.

        The column transformer returns a sparse matrix only when at least one
        of its blocks is sparse; a dense result has no `toarray` and is
        returned as-is.
        """
        return matrix.toarray() if hasattr(matrix, "toarray") else matrix


    def fit_transform(self, df_train, df_corrupted):
        """Fit on `df_train` and flag outliers in `df_corrupted`.

        Returns a copy of `df_corrupted` with an extra "outlier" column holding
        the model's predictions.
        """
        df_outliers = df_corrupted.copy()

        feature_transformation = self.feature_transform.fit(df_train)
        x = self._dense(feature_transformation.transform(df_train))

        model = KNN()
        model.fit(x)

        xx = self._dense(feature_transformation.transform(df_outliers))

        df_outliers["outlier"] = model.predict(xx) ## 0: inlier, 1: outlier

        return df_outliers


    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

    
    
class PyODIsolationForest(OutlierDetection):
    """Outlier detector based on PyOD's `IForest` with contamination 0.25."""

    @staticmethod
    def _dense(matrix):
        """Return `matrix` as a dense array.

        The column transformer returns a sparse matrix only when at least one
        of its blocks is sparse; a dense result has no `toarray` and is
        returned as-is.
        """
        return matrix.toarray() if hasattr(matrix, "toarray") else matrix


    def fit_transform(self, df_train, df_corrupted):
        """Fit on `df_train` and flag outliers in `df_corrupted`.

        Returns a copy of `df_corrupted` with an extra "outlier" column holding
        the model's predictions.
        """
        df_outliers = df_corrupted.copy()

        feature_transformation = self.feature_transform.fit(df_train)
        x = self._dense(feature_transformation.transform(df_train))

        model = IForest(contamination=0.25)
        model.fit(x)

        xx = self._dense(feature_transformation.transform(df_outliers))

        df_outliers["outlier"] = model.predict(xx) ## 0: inlier, 1: outlier

        return df_outliers


    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)


In [None]:
from abc import abstractmethod
import numpy as np
import pandas as pd

import datawig



class Imputation:
    """Common base for the imputers: keeps the frames and the column lists."""

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        # data the imputer is fitted on / applied to
        self.df_train, self.df_corrupted = df_train, df_corrupted

        # column names grouped by type
        self.categorical_columns, self.numerical_columns = categorical_columns, numerical_columns


    @abstractmethod
    def fit_transform(self, df_train, df_corrupted):
        """Hook for subclasses; the base implementation does nothing."""
        return None

    
    
class NoImputation(Imputation):
    """Pass-through imputer: missing values are left exactly as they are."""

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        super().__init__(df_train, df_corrupted, categorical_columns, numerical_columns)


    def __call__(self, df_train, df_corrupted):
        # calling the instance is the same as calling fit_transform
        return self.fit_transform(df_train, df_corrupted)


    def fit_transform(self, df_train, df_corrupted):
        """Return a copy of `df_corrupted`; `df_train` is not used."""
        return df_corrupted.copy()
    
    
    
class MeanModeImputation(Imputation):
    """Fills numerical gaps with the training mean and categorical gaps with the training mode."""

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        self.means = {}
        self.modes = {}

        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)


    def fit_transform(self, df_train, df_corrupted):
        """Learn per-column fill values from `df_train` and apply them to `df_corrupted`.

        Returns a new DataFrame; `df_corrupted` is not modified. The learned
        values are kept in `self.means` and `self.modes`. A column for which no
        fill value could be learned is left unchanged.
        """
        df_imputed = df_corrupted.copy()

        for col in df_train.columns:
            if col in self.numerical_columns:
                # mean imputer
                self.means[col] = df_train[col].mean()
            elif col in self.categorical_columns:
                # mode imputer
                counts = df_train[col].value_counts()
                if len(counts) > 0:  # an all-missing training column has no mode
                    self.modes[col] = counts.index[0]


        for col in df_corrupted.columns:
            if col in self.numerical_columns and col in self.means:
                fill_value = self.means[col]
            elif col in self.categorical_columns and col in self.modes:
                fill_value = self.modes[col]
            else:
                continue

            # Assign the result back instead of calling fillna(inplace=True) on
            # the selected column: the in-place form acts on a temporary object
            # and is not guaranteed to update df_imputed.
            df_imputed[col] = df_imputed[col].fillna(fill_value)

        return df_imputed


    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

    

class DatawigImputation(Imputation):
    """Imputes each column with a `datawig.SimpleImputer` trained on the other columns."""

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)


    def fit_transform(self, df_train, df_corrupted):
        """Return a copy of `df_corrupted` with missing values filled by datawig.

        One model per column with missing values is fitted on `df_train` and
        used to predict that column; only the missing entries are replaced.
        Neither `df_train` nor `df_corrupted` is modified.
        """
        df_imputed = df_corrupted.copy()

        # The models are fitted on a private copy with categorical columns cast
        # to str. Casting the caller's frames in place (as before) changed them
        # for every later cell, and on the corrupted frame it turned missing
        # values into the string 'nan', hiding them from later imputers.
        df_fit = df_train.copy()
        for col in df_fit.select_dtypes(include="category").columns:
            df_fit[col] = df_fit[col].astype(str)

        for col in list(self.categorical_columns) + list(self.numerical_columns):
            # nothing to fill -> no need to train a model for this column
            if not df_imputed[col].isna().any():
                continue

            output_column = col
            # keep the frame's column order (a set difference is unordered) and
            # use only inputs that are also available at prediction time
            input_columns = [
                c for c in df_fit.columns
                if c != output_column and c in df_imputed.columns
            ]

            print(f"Fitting model for column: {col}")
            model = datawig.SimpleImputer(input_columns, output_column, 'imputer_model')
            model.fit(df_fit)

            df_imputed = model.predict(df_imputed)
            # assign back rather than fillna(inplace=True) on a selected column,
            # which is not guaranteed to update the frame
            df_imputed[col] = df_imputed[col].fillna(df_imputed[col + '_imputed'])
            # drop the helper columns added by predict
            df_imputed = df_imputed[df_corrupted.columns]

        return df_imputed


    def __call__(self, df_train, df_corrupted):
        return self.fit_transform(df_train, df_corrupted)

## Evaluation

In [None]:
# score without cleaning
model_obj.score_on_test_data(model.predict_proba(test_data))

In [None]:
# score with corruptions
model_obj.score_on_test_data(model.predict_proba(test_data_corrupted))

In [None]:
# score with mean/mode imputation
model_obj.score_on_test_data(model.predict_proba(test_data_mm_imputed))

In [None]:
# score with datawig imputation
model_obj.score_on_test_data(model.predict_proba(test_data_dw_imputed))