## Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier

from jenga.basis import Dataset
from jenga.models.model import Model
from jenga.corruptions.perturbations import Perturbation
from jenga.cleaning.imputation import MeanModeImputation, DatawigImputation
from jenga.cleaning.outlier_detection import PyODKNN, PyODIsolationForest

In [2]:
# Seed handed to Dataset, Model and PipelinePerformancePrediction in the cells below.
seed = 10

In [3]:
# Load the 'credit-g' dataset through jenga's Dataset wrapper (seed first, dataset name second).
dataset = Dataset(seed, "credit-g")

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    

**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)



**German Credit data**  

This dataset classifies people described by a set of attributes as good or bad credit risks.



This dataset comes with a cost matrix: 

``` 

      Good  Bad (predicted)  

Good   0    1   (actual)  

Bad    5    0  

```



It is worse to class a customer as go

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


In [4]:
# Complete table exposed by the Dataset object (feature columns plus the 'class' target);
# the trailing expression renders it as the cell output.
all_data = dataset.all_data
all_data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,...,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes,good
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,...,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes,good
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,...,car,38.0,none,own,1.0,skilled,1.0,none,yes,good
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,...,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes,bad


In [5]:
# Names of the feature columns as reported by the Dataset object.
attribute_names = dataset.attribute_names
attribute_names

['checking_status',
 'duration',
 'credit_history',
 'purpose',
 'credit_amount',
 'savings_status',
 'employment',
 'installment_commitment',
 'personal_status',
 'other_parties',
 'residence_since',
 'property_magnitude',
 'age',
 'other_payment_plans',
 'housing',
 'existing_credits',
 'job',
 'num_dependents',
 'own_telephone',
 'foreign_worker']

In [6]:
# Table pairing each attribute name with its categorical_indicator flag.
attribute_types = dataset.attribute_types
attribute_types

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


In [7]:
# Feature columns the Dataset object reports as categorical.
categorical_columns = dataset.categorical_columns
categorical_columns

['checking_status',
 'credit_history',
 'purpose',
 'savings_status',
 'employment',
 'personal_status',
 'other_parties',
 'property_magnitude',
 'other_payment_plans',
 'housing',
 'job',
 'own_telephone',
 'foreign_worker']

In [8]:
# Feature columns the Dataset object reports as numerical.
numerical_columns = dataset.numerical_columns
numerical_columns

['duration',
 'credit_amount',
 'installment_commitment',
 'residence_since',
 'age',
 'existing_credits',
 'num_dependents']

### Visualize the dataset

In [None]:
## plot the original dataset
def hide_current_axis(*args, **kwds):
    """Hide the currently active matplotlib axes.

    All arguments are accepted and ignored so the function can be passed as
    the callback to ``PairGrid.map_upper`` in ``plot_data``.
    """
    plt.gca().set_visible(False)


def plot_data(data, hue="class"):
    """Show a seaborn pair plot of ``data`` with the upper-triangle panels hidden.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to plot; it must contain the column named by ``hue``.
    hue : str, default "class"
        Column used to colour the points. The default keeps the behaviour of
        the previous hard-coded ``hue="class"``.

    Notes
    -----
    Calls ``sns.set_style("white")``, which changes the seaborn style for the
    rest of the session, and ends with ``plt.show()``.
    """
    sns.set_style("white")  # grid/no grid style: darkgrid, whitegrid, dark, white, ticks

    grid = sns.pairplot(data, hue=hue)
    grid.map_upper(hide_current_axis)
    plt.show()

In [None]:
# Pair plot of the complete dataset, coloured by the 'class' column.
plot_data(all_data)

### Get training and test sets

In [9]:
# Feature tables and label arrays for the train and test splits, as provided by the Dataset object.
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

# Preview each split: first rows of the features, then the first five labels.
for features, labels in ((train_data, train_labels), (test_data, test_labels)):
    display(features.head())
    print(labels[:5])

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
188,<0,12.0,existing paid,radio/tv,674.0,100<=X<500,4<=X<7,4.0,male mar/wid,none,1.0,life insurance,20.0,none,own,1.0,skilled,1.0,none,yes
194,0<=X<200,45.0,existing paid,radio/tv,3031.0,100<=X<500,1<=X<4,4.0,male single,guarantor,4.0,life insurance,21.0,none,rent,1.0,skilled,1.0,none,yes
225,no checking,36.0,no credits/all paid,repairs,2613.0,<100,1<=X<4,4.0,male single,none,2.0,car,27.0,none,own,2.0,skilled,1.0,none,yes
580,0<=X<200,18.0,critical/other existing credit,new car,1056.0,<100,>=7,3.0,male single,guarantor,3.0,real estate,30.0,bank,own,2.0,skilled,1.0,none,yes
428,no checking,9.0,existing paid,furniture/equipment,1313.0,<100,>=7,1.0,male single,none,4.0,car,20.0,none,own,1.0,skilled,1.0,none,yes


['bad' 'bad' 'good' 'bad' 'good']


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.0,delayed previously,used car,2993.0,<100,1<=X<4,3.0,male single,none,2.0,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes
956,>=200,30.0,critical/other existing credit,radio/tv,3656.0,no known savings,>=7,4.0,male single,none,4.0,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes
544,no checking,12.0,critical/other existing credit,new car,1255.0,<100,>=7,4.0,male single,none,4.0,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes
173,0<=X<200,8.0,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,2.0,real estate,33.0,none,own,1.0,skilled,1.0,none,no
759,<0,12.0,critical/other existing credit,new car,691.0,<100,>=7,4.0,male single,none,3.0,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes


['good' 'good' 'good' 'good' 'bad']


## Model

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


# model parameters
learner = SGDClassifier(max_iter=1000)
# Hyper-parameter grid; the `learner__` prefix addresses the 'learner' step of the
# prediction pipeline defined at the end of this cell.
# NOTE(review): newer scikit-learn releases name this loss 'log_loss' instead of 'log' --
# confirm against the installed version.
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}


# preprocessing pipeline for numerical columns: fill missing values with 0, then standardise
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns: fill missing values with the
# '__NA__' marker, then one-hot encode (categories unseen during fit are ignored)
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor: route each column group through its own sub-pipeline
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    # fixed: this previously referenced the undefined name `numerical_columms` (NameError)
    ('numerical_features', transformer_numeric, numerical_columns)
])

## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

In [None]:
# Bundle the seed, both data splits, the column groups, the pipeline, the learner and
# the parameter grid in jenga's Model helper.
model_obj = Model(
    seed,
    train_data, train_labels,
    test_data, test_labels,
    categorical_columns,
    numerical_columns,  # fixed: was the undefined name `numerical_columms` (NameError)
    pipeline, learner, param_grid,
)

# Fit on the training split.
model = model_obj.fit_model(train_data, train_labels)

## Corruptions

In [None]:
# corruption perturbations to apply
# (the Perturbation helper is given the categorical and the numerical column lists)
corr_perturbations = Perturbation(categorical_columns, numerical_columns)

In [None]:
# Corrupt the test data; the call also returns the perturbations and the perturbed columns.
# NOTE(review): the second argument (5) is presumably the number of perturbations to apply -- confirm.
test_data_corrupted, perturbations, cols_perturbed = corr_perturbations.apply_perturbation(test_data, 5)
test_data_corrupted.head(10)

In [None]:
# Show the perturbations returned by the corruption step.
perturbations

In [None]:
# Show the columns returned as perturbed by the corruption step.
cols_perturbed

### Visualize the original and corrupted test set

In [None]:
## original test data
# The test rows keep their original row labels (e.g. 841, 956, ...), whereas a Series
# built from the label array alone would be indexed 0..n-1. pd.concat(axis=1) aligns on
# the index, so the labels must be placed on test_data's index to stay attached to
# the right rows (and to avoid extra NaN rows).
plot_data(pd.concat(
    [test_data, pd.Series(np.asarray(test_labels), name='class', index=test_data.index)],
    axis=1,
))

In [None]:
## corrupted test data
# Attach the labels on the corrupted frame's own index: a Series with a default
# 0..n-1 index would be misaligned by pd.concat(axis=1), which aligns on the index.
# Like the original code, this assumes the corrupted rows are in the same order as test_labels.
plot_data(pd.concat(
    [test_data_corrupted,
     pd.Series(np.asarray(test_labels), name='class', index=test_data_corrupted.index)],
    axis=1,
))

## Cleaning

### Imputation

In [None]:
# MeanModeImputation instance set up with the training data, the corrupted test data
# and both column groups.
mean_mode_imputer = MeanModeImputation(
    train_data,
    test_data_corrupted,
    categorical_columns,
    numerical_columns,
)

# Run the imputer on the corrupted test data and show the result as the cell output.
test_data_mm_imputed = mean_mode_imputer.fit_transform(train_data, test_data_corrupted)
test_data_mm_imputed

In [None]:
# DatawigImputation instance set up with the training data, the corrupted test data
# and both column groups.
datawig_imputer = DatawigImputation(
    train_data,
    test_data_corrupted,
    categorical_columns,
    numerical_columns,  # fixed: was the undefined name `numerical_columms` (NameError)
)

# Run the imputer on the corrupted test data and show the result as the cell output.
test_data_dw_imputed = datawig_imputer.fit_transform(train_data, test_data_corrupted)
test_data_dw_imputed

##### Using PPP

In [None]:
# For every imputer candidate, predict the resulting score with PPP
# and take the best-scoring one.

In [None]:
from jenga.cleaning.imputation import MeanModeImputation, DatawigImputation
from jenga.cleaning.ppp import PipelinePerformancePrediction

In [None]:
# Learner and hyper-parameter grid handed to PipelinePerformancePrediction below.
# Grid keys use the `learner__<parameter>` form; 1 loss x 3 penalties x 4 alphas = 12 combinations.
learner = SGDClassifier(max_iter=1000)
param_grid = dict(
    learner__loss=['log'],
    learner__penalty=['l2', 'l1', 'elasticnet'],
    learner__alpha=[0.0001, 0.001, 0.01, 0.1],
)

In [None]:
# Performance predictor configured with the seed, both data splits, the column groups,
# the learner and its parameter grid.
ppp = PipelinePerformancePrediction(
    seed,
    train_data, train_labels,
    test_data, test_labels,
    categorical_columns, numerical_columns,
    learner, param_grid,
)

In [None]:
# generate corrupted test data (the call also returns the perturbations and the perturbed columns)
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
# Imputer classes (not instances) to compare; they are instantiated in the next cell.
imputer_candidates = [MeanModeImputation, DatawigImputation]

In [None]:
# Instantiate every candidate imputer class with the same data and column arguments.
imputers = [
    candidate(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    for candidate in imputer_candidates
]

In [None]:
# Show the instantiated imputers.
imputers

In [None]:
# Fit the performance predictor on the training data.
ppp_model = ppp.fit_ppp(train_data)

In [None]:
# Predicted score without any cleaning step.
# NOTE(review): this scores the original `test_data`, not `test_data_corrupted` --
# confirm that the uncorrupted data is the intended baseline.
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data)
score_no_cleaning

In [None]:
# Predict a PPP score for the corrupted test data after each imputer has been applied.
imputed_scores_ppp = []
for imputer in imputers:
    test_data_imputed = imputer.fit_transform(train_data, test_data_corrupted)
    imputed_score = ppp.predict_score_ppp(ppp_model, test_data_imputed)
    # Label the score with the imputer's class name rather than formatting the object
    # itself, which would print a memory-address repr if the class defines no __repr__.
    print(f"PPP score with {type(imputer).__name__}: {imputed_score}")
    imputed_scores_ppp.append(imputed_score)

In [None]:
# Show the collected scores, in the same order as `imputers`.
imputed_scores_ppp

##### Using PPP and Cleaner classes

In [10]:
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.cleaner import Cleaner
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation

In [11]:
# Learner and hyper-parameter grid handed to PipelinePerformancePrediction below.
# Grid keys use the `learner__<parameter>` form; 1 loss x 3 penalties x 4 alphas = 12 combinations.
learner = SGDClassifier(max_iter=1000)
param_grid = dict(
    learner__loss=['log'],
    learner__penalty=['l2', 'l1', 'elasticnet'],
    learner__alpha=[0.0001, 0.001, 0.01, 0.1],
)

In [12]:
# Performance predictor configured with the seed, both data splits, the column groups,
# the learner and its parameter grid.
ppp = PipelinePerformancePrediction(
    seed,
    train_data, train_labels,
    test_data, test_labels,
    categorical_columns, numerical_columns,
    learner, param_grid,
)

In [16]:
# Fit the performance predictor on the training data; the recorded output shows a
# 5-fold search over the 12 parameter candidates.
ppp_model = ppp.fit_ppp(train_data)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.5s finished


In [17]:
# Predicted score without any cleaning step.
# NOTE(review): this scores the original `test_data`, not `test_data_corrupted` --
# confirm that the uncorrupted data is the intended baseline.
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data)
score_no_cleaning

0.8103085553997195

In [13]:
# generate corrupted test data (the call also returns the perturbations and the perturbed columns)
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

Generating corrupted training data on 200 rows...
Applying perturbations...
GaussianNoise: {'column': 'residence_since', 'fraction': 0.25} on column ['residence_since']
MissingValues: {'column': 'credit_history', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'} on column ['credit_history']
GaussianNoise: {'column': 'duration', 'fraction': 0.25} on column ['duration']
MissingValues: {'column': 'credit_amount', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MAR'} on column ['credit_amount']
SwappedValues: {'column_a': 'housing', 'column_b': 'personal_status', 'fraction': 0.25} on column ['housing', 'personal_status']


In [14]:
# Every (outlier detection, imputation) class pairing, outlier detectors varying slowest:
# 3 detectors x 3 imputers = 9 candidates.
cleaner_candidates = [
    (outlier_detection_cls, imputation_cls)
    for outlier_detection_cls in (NoOutlierDetection, PyODKNN, PyODIsolationForest)
    for imputation_cls in (NoImputation, MeanModeImputation, DatawigImputation)
]

In [15]:
# Build one Cleaner per candidate pair. The outlier detector, the imputer and the
# Cleaner itself all receive the same data and column arguments.
cleaners = []
for outd, imp in cleaner_candidates:
    detector = outd(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    imputation = imp(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    cleaners.append(
        Cleaner(
            train_data,
            test_data_corrupted,
            categorical_columns,
            numerical_columns,
            outlier_detection=detector,
            imputation=imputation,
        )
    )

In [18]:
# Predict a PPP score for the corrupted test data after each cleaner has been applied.
cleaner_scores_ppp = []
# `cleaners` was built from `cleaner_candidates` in the same order, so zipping them
# pairs every Cleaner with the (outlier detection, imputation) classes it wraps.
assert len(cleaners) == len(cleaner_candidates), "cleaners and cleaner_candidates are out of sync"
for (outd, imp), cleaner in zip(cleaner_candidates, cleaners):
    test_data_cleaned = cleaner.apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
    cleaner_score = ppp.predict_score_ppp(ppp_model, test_data_cleaned)
    # Name the combination instead of formatting the Cleaner object, whose default repr
    # ("<... Cleaner object at 0x...>") does not say which candidate produced the score.
    print(f"PPP score with {outd.__name__} + {imp.__name__}: {cleaner_score}")
    cleaner_scores_ppp.append(cleaner_score)

2020-06-22 23:19:23,711 [INFO]  CategoricalEncoder for column checking_status                                found only 44 occurrences of value >=200


PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F4FD027F0>: 0.7946470313230481
PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50FB37F0>: 0.7850631136044881
Fitting model for column: checking_status


2020-06-22 23:19:24,528 [INFO]  
2020-06-22 23:19:24,899 [INFO]  Epoch[0] Batch [0-23]	Speed: 1101.44 samples/sec	cross-entropy=1.271047	checking_status-accuracy=0.377604
2020-06-22 23:19:25,206 [INFO]  Epoch[0] Train-cross-entropy=1.229597
2020-06-22 23:19:25,208 [INFO]  Epoch[0] Train-checking_status-accuracy=0.383333
2020-06-22 23:19:25,209 [INFO]  Epoch[0] Time cost=0.676
2020-06-22 23:19:25,217 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:19:25,273 [INFO]  Epoch[0] Validation-cross-entropy=1.037734
2020-06-22 23:19:25,275 [INFO]  Epoch[0] Validation-checking_status-accuracy=0.575000
2020-06-22 23:19:25,631 [INFO]  Epoch[1] Batch [0-23]	Speed: 1090.03 samples/sec	cross-entropy=1.128615	checking_status-accuracy=0.434896
2020-06-22 23:19:25,919 [INFO]  Epoch[1] Train-cross-entropy=1.129866
2020-06-22 23:19:25,921 [INFO]  Epoch[1] Train-checking_status-accuracy=0.438889
2020-06-22 23:19:25,922 [INFO]  Epoch[1] Time cost=0.645
2020-06-22 23:19:25,931 [INF

2020-06-22 23:19:34,808 [INFO]  CategoricalEncoder for column credit_history                                found only 63 occurrences of value delayed previously
2020-06-22 23:19:34,810 [INFO]  CategoricalEncoder for column credit_history                                found only 36 occurrences of value all paid
2020-06-22 23:19:34,813 [INFO]  CategoricalEncoder for column credit_history                                found only 31 occurrences of value no credits/all paid


Fitting model for column: credit_history


2020-06-22 23:19:35,771 [INFO]  
2020-06-22 23:19:36,217 [INFO]  Epoch[0] Batch [0-23]	Speed: 934.14 samples/sec	cross-entropy=1.082231	credit_history-accuracy=0.578125
2020-06-22 23:19:36,529 [INFO]  Epoch[0] Train-cross-entropy=1.051018
2020-06-22 23:19:36,530 [INFO]  Epoch[0] Train-credit_history-accuracy=0.626389
2020-06-22 23:19:36,531 [INFO]  Epoch[0] Time cost=0.750
2020-06-22 23:19:36,542 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:19:36,609 [INFO]  Epoch[0] Validation-cross-entropy=0.875049
2020-06-22 23:19:36,610 [INFO]  Epoch[0] Validation-credit_history-accuracy=0.700000
2020-06-22 23:19:36,977 [INFO]  Epoch[1] Batch [0-23]	Speed: 1057.24 samples/sec	cross-entropy=0.869959	credit_history-accuracy=0.705729
2020-06-22 23:19:37,292 [INFO]  Epoch[1] Train-cross-entropy=0.898068
2020-06-22 23:19:37,294 [INFO]  Epoch[1] Train-credit_history-accuracy=0.701389
2020-06-22 23:19:37,297 [INFO]  Epoch[1] Time cost=0.686
2020-06-22 23:19:37,314 [INFO]  Sa

Fitting model for column: purpose


2020-06-22 23:19:42,473 [INFO]  
2020-06-22 23:19:42,981 [INFO]  Epoch[0] Batch [0-23]	Speed: 803.89 samples/sec	cross-entropy=1.858011	purpose-accuracy=0.252604
2020-06-22 23:19:43,367 [INFO]  Epoch[0] Train-cross-entropy=1.788633
2020-06-22 23:19:43,368 [INFO]  Epoch[0] Train-purpose-accuracy=0.293056
2020-06-22 23:19:43,369 [INFO]  Epoch[0] Time cost=0.889
2020-06-22 23:19:43,381 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:19:43,444 [INFO]  Epoch[0] Validation-cross-entropy=1.749462
2020-06-22 23:19:43,447 [INFO]  Epoch[0] Validation-purpose-accuracy=0.325000
2020-06-22 23:19:43,890 [INFO]  Epoch[1] Batch [0-23]	Speed: 873.32 samples/sec	cross-entropy=1.611556	purpose-accuracy=0.291667
2020-06-22 23:19:44,328 [INFO]  Epoch[1] Train-cross-entropy=1.613456
2020-06-22 23:19:44,329 [INFO]  Epoch[1] Train-purpose-accuracy=0.330556
2020-06-22 23:19:44,331 [INFO]  Epoch[1] Time cost=0.883
2020-06-22 23:19:44,344 [INFO]  Saved checkpoint to "imputer_model\mod

Fitting model for column: savings_status


2020-06-22 23:19:54,019 [INFO]  
2020-06-22 23:19:54,430 [INFO]  Epoch[0] Batch [0-23]	Speed: 973.52 samples/sec	cross-entropy=1.116978	savings_status-accuracy=0.630208
2020-06-22 23:19:54,839 [INFO]  Epoch[0] Train-cross-entropy=1.165443
2020-06-22 23:19:54,841 [INFO]  Epoch[0] Train-savings_status-accuracy=0.590278
2020-06-22 23:19:54,843 [INFO]  Epoch[0] Time cost=0.819
2020-06-22 23:19:54,858 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:19:54,941 [INFO]  Epoch[0] Validation-cross-entropy=1.244429
2020-06-22 23:19:54,943 [INFO]  Epoch[0] Validation-savings_status-accuracy=0.487500
2020-06-22 23:19:55,313 [INFO]  Epoch[1] Batch [0-23]	Speed: 1067.95 samples/sec	cross-entropy=0.975097	savings_status-accuracy=0.653646
2020-06-22 23:19:55,633 [INFO]  Epoch[1] Train-cross-entropy=1.065564
2020-06-22 23:19:55,635 [INFO]  Epoch[1] Train-savings_status-accuracy=0.605556
2020-06-22 23:19:55,637 [INFO]  Epoch[1] Time cost=0.690
2020-06-22 23:19:55,646 [INFO]  Sa

Fitting model for column: employment


2020-06-22 23:20:02,074 [INFO]  
2020-06-22 23:20:02,496 [INFO]  Epoch[0] Batch [0-23]	Speed: 1075.69 samples/sec	cross-entropy=1.453955	employment-accuracy=0.351562
2020-06-22 23:20:02,798 [INFO]  Epoch[0] Train-cross-entropy=1.392621
2020-06-22 23:20:02,800 [INFO]  Epoch[0] Train-employment-accuracy=0.390278
2020-06-22 23:20:02,802 [INFO]  Epoch[0] Time cost=0.679
2020-06-22 23:20:02,810 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:20:02,876 [INFO]  Epoch[0] Validation-cross-entropy=1.210511
2020-06-22 23:20:02,877 [INFO]  Epoch[0] Validation-employment-accuracy=0.462500
2020-06-22 23:20:03,298 [INFO]  Epoch[1] Batch [0-23]	Speed: 922.46 samples/sec	cross-entropy=1.273872	employment-accuracy=0.440104
2020-06-22 23:20:03,649 [INFO]  Epoch[1] Train-cross-entropy=1.272462
2020-06-22 23:20:03,651 [INFO]  Epoch[1] Train-employment-accuracy=0.445833
2020-06-22 23:20:03,654 [INFO]  Epoch[1] Time cost=0.775
2020-06-22 23:20:03,666 [INFO]  Saved checkpoint to "i

Fitting model for column: personal_status


2020-06-22 23:20:09,277 [INFO]  
2020-06-22 23:20:09,737 [INFO]  Epoch[0] Batch [0-23]	Speed: 866.12 samples/sec	cross-entropy=1.091382	personal_status-accuracy=0.575521
2020-06-22 23:20:10,079 [INFO]  Epoch[0] Train-cross-entropy=1.006835
2020-06-22 23:20:10,081 [INFO]  Epoch[0] Train-personal_status-accuracy=0.594444
2020-06-22 23:20:10,084 [INFO]  Epoch[0] Time cost=0.800
2020-06-22 23:20:10,104 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:20:10,163 [INFO]  Epoch[0] Validation-cross-entropy=0.908374
2020-06-22 23:20:10,165 [INFO]  Epoch[0] Validation-personal_status-accuracy=0.575000
2020-06-22 23:20:10,537 [INFO]  Epoch[1] Batch [0-23]	Speed: 1039.39 samples/sec	cross-entropy=0.900637	personal_status-accuracy=0.604167
2020-06-22 23:20:10,893 [INFO]  Epoch[1] Train-cross-entropy=0.887096
2020-06-22 23:20:10,895 [INFO]  Epoch[1] Train-personal_status-accuracy=0.626389
2020-06-22 23:20:10,898 [INFO]  Epoch[1] Time cost=0.731
2020-06-22 23:20:10,907 [INFO

Fitting model for column: other_parties


2020-06-22 23:20:16,779 [INFO]  
2020-06-22 23:20:17,147 [INFO]  Epoch[0] Batch [0-23]	Speed: 1108.06 samples/sec	cross-entropy=0.565320	other_parties-accuracy=0.867188
2020-06-22 23:20:17,441 [INFO]  Epoch[0] Train-cross-entropy=0.477823
2020-06-22 23:20:17,444 [INFO]  Epoch[0] Train-other_parties-accuracy=0.884722
2020-06-22 23:20:17,447 [INFO]  Epoch[0] Time cost=0.659
2020-06-22 23:20:17,455 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:20:17,513 [INFO]  Epoch[0] Validation-cross-entropy=0.342650
2020-06-22 23:20:17,516 [INFO]  Epoch[0] Validation-other_parties-accuracy=0.912500
2020-06-22 23:20:17,840 [INFO]  Epoch[1] Batch [0-23]	Speed: 1197.67 samples/sec	cross-entropy=0.356954	other_parties-accuracy=0.898438
2020-06-22 23:20:18,124 [INFO]  Epoch[1] Train-cross-entropy=0.353520
2020-06-22 23:20:18,126 [INFO]  Epoch[1] Train-other_parties-accuracy=0.901389
2020-06-22 23:20:18,129 [INFO]  Epoch[1] Time cost=0.611
2020-06-22 23:20:18,137 [INFO]  Saved 

Fitting model for column: property_magnitude


2020-06-22 23:20:23,777 [INFO]  
2020-06-22 23:20:24,161 [INFO]  Epoch[0] Batch [0-23]	Speed: 1045.26 samples/sec	cross-entropy=1.279738	property_magnitude-accuracy=0.343750
2020-06-22 23:20:24,498 [INFO]  Epoch[0] Train-cross-entropy=1.260279
2020-06-22 23:20:24,501 [INFO]  Epoch[0] Train-property_magnitude-accuracy=0.370833
2020-06-22 23:20:24,505 [INFO]  Epoch[0] Time cost=0.722
2020-06-22 23:20:24,514 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:20:24,589 [INFO]  Epoch[0] Validation-cross-entropy=1.117972
2020-06-22 23:20:24,591 [INFO]  Epoch[0] Validation-property_magnitude-accuracy=0.450000
2020-06-22 23:20:24,960 [INFO]  Epoch[1] Batch [0-23]	Speed: 1082.06 samples/sec	cross-entropy=1.137246	property_magnitude-accuracy=0.429688
2020-06-22 23:20:25,308 [INFO]  Epoch[1] Train-cross-entropy=1.153540
2020-06-22 23:20:25,310 [INFO]  Epoch[1] Train-property_magnitude-accuracy=0.454167
2020-06-22 23:20:25,314 [INFO]  Epoch[1] Time cost=0.720
2020-06-22 23

2020-06-22 23:20:34,620 [INFO]  Epoch[13] Time cost=0.693
2020-06-22 23:20:34,630 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-22 23:20:34,702 [INFO]  Epoch[13] Validation-cross-entropy=1.059980
2020-06-22 23:20:34,773 [INFO]  Epoch[13] Validation-property_magnitude-accuracy=0.500000
2020-06-22 23:20:35,137 [INFO]  Epoch[14] Batch [0-23]	Speed: 1098.07 samples/sec	cross-entropy=0.921644	property_magnitude-accuracy=0.580729
2020-06-22 23:20:35,471 [INFO]  Epoch[14] Train-cross-entropy=0.929362
2020-06-22 23:20:35,473 [INFO]  Epoch[14] Train-property_magnitude-accuracy=0.590278
2020-06-22 23:20:35,475 [INFO]  Epoch[14] Time cost=0.698
2020-06-22 23:20:35,484 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:20:35,553 [INFO]  Epoch[14] Validation-cross-entropy=1.059637
2020-06-22 23:20:35,556 [INFO]  Epoch[14] Validation-property_magnitude-accuracy=0.487500
2020-06-22 23:20:35,927 [INFO]  Epoch[15] Batch [0-23]	Speed: 1063.36 samples/sec	

Fitting model for column: other_payment_plans


2020-06-22 23:20:41,418 [INFO]  
2020-06-22 23:20:41,768 [INFO]  Epoch[0] Batch [0-23]	Speed: 1160.33 samples/sec	cross-entropy=0.723378	other_payment_plans-accuracy=0.783854
2020-06-22 23:20:42,095 [INFO]  Epoch[0] Train-cross-entropy=0.665664
2020-06-22 23:20:42,097 [INFO]  Epoch[0] Train-other_payment_plans-accuracy=0.793056
2020-06-22 23:20:42,100 [INFO]  Epoch[0] Time cost=0.675
2020-06-22 23:20:42,108 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:20:42,172 [INFO]  Epoch[0] Validation-cross-entropy=0.506242
2020-06-22 23:20:42,174 [INFO]  Epoch[0] Validation-other_payment_plans-accuracy=0.850000
2020-06-22 23:20:42,515 [INFO]  Epoch[1] Batch [0-23]	Speed: 1147.67 samples/sec	cross-entropy=0.537334	other_payment_plans-accuracy=0.817708
2020-06-22 23:20:42,798 [INFO]  Epoch[1] Train-cross-entropy=0.549235
2020-06-22 23:20:42,800 [INFO]  Epoch[1] Train-other_payment_plans-accuracy=0.811111
2020-06-22 23:20:42,803 [INFO]  Epoch[1] Time cost=0.626
2020-06-

Fitting model for column: housing


2020-06-22 23:20:48,599 [INFO]  
2020-06-22 23:20:48,970 [INFO]  Epoch[0] Batch [0-23]	Speed: 1086.81 samples/sec	cross-entropy=0.811511	housing-accuracy=0.692708
2020-06-22 23:20:49,248 [INFO]  Epoch[0] Train-cross-entropy=0.731186
2020-06-22 23:20:49,251 [INFO]  Epoch[0] Train-housing-accuracy=0.712500
2020-06-22 23:20:49,253 [INFO]  Epoch[0] Time cost=0.648
2020-06-22 23:20:49,273 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:20:49,347 [INFO]  Epoch[0] Validation-cross-entropy=0.560799
2020-06-22 23:20:49,349 [INFO]  Epoch[0] Validation-housing-accuracy=0.762500
2020-06-22 23:20:49,674 [INFO]  Epoch[1] Batch [0-23]	Speed: 1204.55 samples/sec	cross-entropy=0.629483	housing-accuracy=0.744792
2020-06-22 23:20:49,993 [INFO]  Epoch[1] Train-cross-entropy=0.607621
2020-06-22 23:20:49,995 [INFO]  Epoch[1] Train-housing-accuracy=0.756944
2020-06-22 23:20:49,999 [INFO]  Epoch[1] Time cost=0.647
2020-06-22 23:20:50,008 [INFO]  Saved checkpoint to "imputer_model\m

Fitting model for column: job


2020-06-22 23:20:57,138 [INFO]  
2020-06-22 23:20:57,544 [INFO]  Epoch[0] Batch [0-23]	Speed: 995.88 samples/sec	cross-entropy=1.069260	job-accuracy=0.578125
2020-06-22 23:20:57,885 [INFO]  Epoch[0] Train-cross-entropy=0.947341
2020-06-22 23:20:57,888 [INFO]  Epoch[0] Train-job-accuracy=0.630556
2020-06-22 23:20:57,891 [INFO]  Epoch[0] Time cost=0.745
2020-06-22 23:20:57,899 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:20:57,963 [INFO]  Epoch[0] Validation-cross-entropy=0.738909
2020-06-22 23:20:57,965 [INFO]  Epoch[0] Validation-job-accuracy=0.712500
2020-06-22 23:20:58,352 [INFO]  Epoch[1] Batch [0-23]	Speed: 999.59 samples/sec	cross-entropy=0.860451	job-accuracy=0.627604
2020-06-22 23:20:58,669 [INFO]  Epoch[1] Train-cross-entropy=0.810804
2020-06-22 23:20:58,671 [INFO]  Epoch[1] Train-job-accuracy=0.659722
2020-06-22 23:20:58,674 [INFO]  Epoch[1] Time cost=0.706
2020-06-22 23:20:58,682 [INFO]  Saved checkpoint to "imputer_model\model-0001.params"
2020

Fitting model for column: own_telephone


2020-06-22 23:21:05,103 [INFO]  
2020-06-22 23:21:05,481 [INFO]  Epoch[0] Batch [0-23]	Speed: 1058.75 samples/sec	cross-entropy=0.679787	own_telephone-accuracy=0.625000
2020-06-22 23:21:05,759 [INFO]  Epoch[0] Train-cross-entropy=0.632108
2020-06-22 23:21:05,761 [INFO]  Epoch[0] Train-own_telephone-accuracy=0.645833
2020-06-22 23:21:05,764 [INFO]  Epoch[0] Time cost=0.654
2020-06-22 23:21:05,773 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:21:05,829 [INFO]  Epoch[0] Validation-cross-entropy=0.563126
2020-06-22 23:21:05,831 [INFO]  Epoch[0] Validation-own_telephone-accuracy=0.750000
2020-06-22 23:21:06,172 [INFO]  Epoch[1] Batch [0-23]	Speed: 1154.77 samples/sec	cross-entropy=0.552911	own_telephone-accuracy=0.705729
2020-06-22 23:21:06,468 [INFO]  Epoch[1] Train-cross-entropy=0.551095
2020-06-22 23:21:06,470 [INFO]  Epoch[1] Train-own_telephone-accuracy=0.695833
2020-06-22 23:21:06,473 [INFO]  Epoch[1] Time cost=0.639
2020-06-22 23:21:06,481 [INFO]  Saved 

Fitting model for column: foreign_worker


2020-06-22 23:21:13,293 [INFO]  
2020-06-22 23:21:13,679 [INFO]  Epoch[0] Batch [0-23]	Speed: 1074.09 samples/sec	cross-entropy=0.350679	foreign_worker-accuracy=0.940104
2020-06-22 23:21:13,996 [INFO]  Epoch[0] Train-cross-entropy=0.241185
2020-06-22 23:21:13,999 [INFO]  Epoch[0] Train-foreign_worker-accuracy=0.955556
2020-06-22 23:21:14,001 [INFO]  Epoch[0] Time cost=0.699
2020-06-22 23:21:14,009 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:21:14,076 [INFO]  Epoch[0] Validation-cross-entropy=0.107131
2020-06-22 23:21:14,078 [INFO]  Epoch[0] Validation-foreign_worker-accuracy=0.975000
2020-06-22 23:21:14,434 [INFO]  Epoch[1] Batch [0-23]	Speed: 1088.45 samples/sec	cross-entropy=0.168954	foreign_worker-accuracy=0.950521
2020-06-22 23:21:14,738 [INFO]  Epoch[1] Train-cross-entropy=0.146884
2020-06-22 23:21:14,741 [INFO]  Epoch[1] Train-foreign_worker-accuracy=0.961111
2020-06-22 23:21:14,743 [INFO]  Epoch[1] Time cost=0.662
2020-06-22 23:21:14,753 [INFO]  S

Fitting model for column: duration


2020-06-22 23:21:25,608 [INFO]  
2020-06-22 23:21:26,429 [INFO]  Epoch[0] Batch [0-23]	Speed: 1267.99 samples/sec	cross-entropy=12.636686	duration-accuracy=0.000000
2020-06-22 23:21:26,703 [INFO]  Epoch[0] Train-cross-entropy=10.344818
2020-06-22 23:21:26,707 [INFO]  Epoch[0] Train-duration-accuracy=0.000000
2020-06-22 23:21:26,714 [INFO]  Epoch[0] Time cost=1.097
2020-06-22 23:21:26,725 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:21:26,778 [INFO]  Epoch[0] Validation-cross-entropy=8.926878
2020-06-22 23:21:26,781 [INFO]  Epoch[0] Validation-duration-accuracy=0.000000
2020-06-22 23:21:27,105 [INFO]  Epoch[1] Batch [0-23]	Speed: 1205.83 samples/sec	cross-entropy=8.056277	duration-accuracy=0.000000
2020-06-22 23:21:27,363 [INFO]  Epoch[1] Train-cross-entropy=7.750112
2020-06-22 23:21:27,365 [INFO]  Epoch[1] Train-duration-accuracy=0.000000
2020-06-22 23:21:27,368 [INFO]  Epoch[1] Time cost=0.584
2020-06-22 23:21:27,375 [INFO]  Saved checkpoint to "imputer_

Fitting model for column: credit_amount


2020-06-22 23:21:34,426 [INFO]  
2020-06-22 23:21:34,743 [INFO]  Epoch[0] Batch [0-23]	Speed: 1259.25 samples/sec	cross-entropy=11.101247	credit_amount-accuracy=0.000000
2020-06-22 23:21:35,029 [INFO]  Epoch[0] Train-cross-entropy=9.183131
2020-06-22 23:21:35,031 [INFO]  Epoch[0] Train-credit_amount-accuracy=0.000000
2020-06-22 23:21:35,033 [INFO]  Epoch[0] Time cost=0.600
2020-06-22 23:21:35,042 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:21:35,096 [INFO]  Epoch[0] Validation-cross-entropy=7.936163
2020-06-22 23:21:35,099 [INFO]  Epoch[0] Validation-credit_amount-accuracy=0.000000
2020-06-22 23:21:35,391 [INFO]  Epoch[1] Batch [0-23]	Speed: 1332.08 samples/sec	cross-entropy=7.533372	credit_amount-accuracy=0.000000
2020-06-22 23:21:35,644 [INFO]  Epoch[1] Train-cross-entropy=6.818372
2020-06-22 23:21:35,647 [INFO]  Epoch[1] Train-credit_amount-accuracy=0.000000
2020-06-22 23:21:35,649 [INFO]  Epoch[1] Time cost=0.549
2020-06-22 23:21:35,656 [INFO]  Saved

Fitting model for column: installment_commitment


2020-06-22 23:21:41,375 [INFO]  
2020-06-22 23:21:41,669 [INFO]  Epoch[0] Batch [0-23]	Speed: 1351.59 samples/sec	cross-entropy=14.470730	installment_commitment-accuracy=0.000000
2020-06-22 23:21:41,916 [INFO]  Epoch[0] Train-cross-entropy=14.397676
2020-06-22 23:21:41,919 [INFO]  Epoch[0] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:21:41,923 [INFO]  Epoch[0] Time cost=0.541
2020-06-22 23:21:41,932 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:21:41,995 [INFO]  Epoch[0] Validation-cross-entropy=12.526102
2020-06-22 23:21:41,999 [INFO]  Epoch[0] Validation-installment_commitment-accuracy=0.000000
2020-06-22 23:21:42,313 [INFO]  Epoch[1] Batch [0-23]	Speed: 1242.37 samples/sec	cross-entropy=11.826665	installment_commitment-accuracy=0.000000
2020-06-22 23:21:42,571 [INFO]  Epoch[1] Train-cross-entropy=12.460365
2020-06-22 23:21:42,574 [INFO]  Epoch[1] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:21:42,577 [INFO]  Epoch[1] Time

2020-06-22 23:21:50,737 [INFO]  Epoch[13] Train-cross-entropy=10.124056
2020-06-22 23:21:50,740 [INFO]  Epoch[13] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:21:50,743 [INFO]  Epoch[13] Time cost=0.674
2020-06-22 23:21:50,753 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-22 23:21:50,821 [INFO]  Epoch[13] Validation-cross-entropy=10.662213
2020-06-22 23:21:50,824 [INFO]  Epoch[13] Validation-installment_commitment-accuracy=0.000000
2020-06-22 23:21:51,177 [INFO]  Epoch[14] Batch [0-23]	Speed: 1101.35 samples/sec	cross-entropy=9.740896	installment_commitment-accuracy=0.000000
2020-06-22 23:21:51,482 [INFO]  Epoch[14] Train-cross-entropy=10.072895
2020-06-22 23:21:51,485 [INFO]  Epoch[14] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:21:51,489 [INFO]  Epoch[14] Time cost=0.660
2020-06-22 23:21:51,496 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:21:51,560 [INFO]  Epoch[14] Validation-cross-entropy=10.659

Fitting model for column: residence_since


2020-06-22 23:21:57,437 [INFO]  
2020-06-22 23:21:57,766 [INFO]  Epoch[0] Batch [0-23]	Speed: 1240.23 samples/sec	cross-entropy=15.528859	residence_since-accuracy=0.000000
2020-06-22 23:21:58,049 [INFO]  Epoch[0] Train-cross-entropy=15.783199
2020-06-22 23:21:58,052 [INFO]  Epoch[0] Train-residence_since-accuracy=0.000000
2020-06-22 23:21:58,055 [INFO]  Epoch[0] Time cost=0.610
2020-06-22 23:21:58,062 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:21:58,119 [INFO]  Epoch[0] Validation-cross-entropy=13.337120
2020-06-22 23:21:58,122 [INFO]  Epoch[0] Validation-residence_since-accuracy=0.000000
2020-06-22 23:21:58,438 [INFO]  Epoch[1] Batch [0-23]	Speed: 1234.06 samples/sec	cross-entropy=13.942403	residence_since-accuracy=0.000000
2020-06-22 23:21:58,702 [INFO]  Epoch[1] Train-cross-entropy=14.386278
2020-06-22 23:21:58,705 [INFO]  Epoch[1] Train-residence_since-accuracy=0.000000
2020-06-22 23:21:58,708 [INFO]  Epoch[1] Time cost=0.583
2020-06-22 23:21:58,720

Fitting model for column: age


2020-06-22 23:22:03,709 [INFO]  
2020-06-22 23:22:04,061 [INFO]  Epoch[0] Batch [0-23]	Speed: 1190.41 samples/sec	cross-entropy=14.826350	age-accuracy=0.000000
2020-06-22 23:22:04,332 [INFO]  Epoch[0] Train-cross-entropy=14.507992
2020-06-22 23:22:04,335 [INFO]  Epoch[0] Train-age-accuracy=0.000000
2020-06-22 23:22:04,338 [INFO]  Epoch[0] Time cost=0.619
2020-06-22 23:22:04,346 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:22:04,403 [INFO]  Epoch[0] Validation-cross-entropy=18.573334
2020-06-22 23:22:04,406 [INFO]  Epoch[0] Validation-age-accuracy=0.000000
2020-06-22 23:22:04,725 [INFO]  Epoch[1] Batch [0-23]	Speed: 1208.02 samples/sec	cross-entropy=13.174112	age-accuracy=0.000000
2020-06-22 23:22:04,987 [INFO]  Epoch[1] Train-cross-entropy=13.153277
2020-06-22 23:22:04,990 [INFO]  Epoch[1] Train-age-accuracy=0.000000
2020-06-22 23:22:04,993 [INFO]  Epoch[1] Time cost=0.585
2020-06-22 23:22:05,004 [INFO]  Saved checkpoint to "imputer_model\model-0001.param

2020-06-22 23:22:13,532 [INFO]  Epoch[14] Time cost=0.586
2020-06-22 23:22:13,539 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:22:13,629 [INFO]  Epoch[14] Validation-cross-entropy=15.546282
2020-06-22 23:22:13,632 [INFO]  Epoch[14] Validation-age-accuracy=0.000000
2020-06-22 23:22:13,929 [INFO]  Epoch[15] Batch [0-23]	Speed: 1332.07 samples/sec	cross-entropy=10.214892	age-accuracy=0.000000
2020-06-22 23:22:14,193 [INFO]  Epoch[15] Train-cross-entropy=10.216543
2020-06-22 23:22:14,197 [INFO]  Epoch[15] Train-age-accuracy=0.000000
2020-06-22 23:22:14,200 [INFO]  Epoch[15] Time cost=0.566
2020-06-22 23:22:14,207 [INFO]  Saved checkpoint to "imputer_model\model-0015.params"
2020-06-22 23:22:14,260 [INFO]  Epoch[15] Validation-cross-entropy=15.542183
2020-06-22 23:22:14,263 [INFO]  Epoch[15] Validation-age-accuracy=0.000000
2020-06-22 23:22:14,559 [INFO]  Epoch[16] Batch [0-23]	Speed: 1315.41 samples/sec	cross-entropy=10.144373	age-accuracy=0.000000
2020-06-22

Fitting model for column: existing_credits


2020-06-22 23:22:17,204 [INFO]  
2020-06-22 23:22:17,531 [INFO]  Epoch[0] Batch [0-23]	Speed: 1221.80 samples/sec	cross-entropy=15.481094	existing_credits-accuracy=0.000000
2020-06-22 23:22:17,798 [INFO]  Epoch[0] Train-cross-entropy=15.499833
2020-06-22 23:22:17,800 [INFO]  Epoch[0] Train-existing_credits-accuracy=0.000000
2020-06-22 23:22:17,803 [INFO]  Epoch[0] Time cost=0.590
2020-06-22 23:22:17,810 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:22:17,863 [INFO]  Epoch[0] Validation-cross-entropy=12.411274
2020-06-22 23:22:17,866 [INFO]  Epoch[0] Validation-existing_credits-accuracy=0.000000
2020-06-22 23:22:18,162 [INFO]  Epoch[1] Batch [0-23]	Speed: 1315.40 samples/sec	cross-entropy=13.255598	existing_credits-accuracy=0.000000
2020-06-22 23:22:18,428 [INFO]  Epoch[1] Train-cross-entropy=13.592357
2020-06-22 23:22:18,431 [INFO]  Epoch[1] Train-existing_credits-accuracy=0.000000
2020-06-22 23:22:18,434 [INFO]  Epoch[1] Time cost=0.566
2020-06-22 23:22:1

Fitting model for column: num_dependents


2020-06-22 23:22:26,441 [INFO]  
2020-06-22 23:22:26,780 [INFO]  Epoch[0] Batch [0-23]	Speed: 1169.48 samples/sec	cross-entropy=14.818382	num_dependents-accuracy=0.000000
2020-06-22 23:22:27,085 [INFO]  Epoch[0] Train-cross-entropy=15.642547
2020-06-22 23:22:27,088 [INFO]  Epoch[0] Train-num_dependents-accuracy=0.000000
2020-06-22 23:22:27,090 [INFO]  Epoch[0] Time cost=0.642
2020-06-22 23:22:27,097 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:22:27,158 [INFO]  Epoch[0] Validation-cross-entropy=14.600964
2020-06-22 23:22:27,161 [INFO]  Epoch[0] Validation-num_dependents-accuracy=0.000000
2020-06-22 23:22:27,463 [INFO]  Epoch[1] Batch [0-23]	Speed: 1292.36 samples/sec	cross-entropy=13.483074	num_dependents-accuracy=0.000000
2020-06-22 23:22:27,793 [INFO]  Epoch[1] Train-cross-entropy=14.513335
2020-06-22 23:22:27,796 [INFO]  Epoch[1] Train-num_dependents-accuracy=0.000000
2020-06-22 23:22:27,799 [INFO]  Epoch[1] Time cost=0.635
2020-06-22 23:22:27,807 [INF

PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50FB37B8>: 0.7807386629266012
PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50FB3B00>: 0.7946470313230481
PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50FB3D68>: 0.7850631136044881
Fitting model for column: checking_status

2020-06-22 23:22:33,135 [INFO]  CategoricalEncoder for column checking_status                                found only 44 occurrences of value >=200





2020-06-22 23:22:34,055 [INFO]  
2020-06-22 23:22:34,502 [INFO]  Epoch[0] Batch [0-23]	Speed: 897.76 samples/sec	cross-entropy=1.266775	checking_status-accuracy=0.367188
2020-06-22 23:22:34,949 [INFO]  Epoch[0] Train-cross-entropy=1.227117
2020-06-22 23:22:34,953 [INFO]  Epoch[0] Train-checking_status-accuracy=0.368056
2020-06-22 23:22:34,956 [INFO]  Epoch[0] Time cost=0.893
2020-06-22 23:22:34,964 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:22:35,025 [INFO]  Epoch[0] Validation-cross-entropy=1.048711
2020-06-22 23:22:35,029 [INFO]  Epoch[0] Validation-checking_status-accuracy=0.550000
2020-06-22 23:22:35,413 [INFO]  Epoch[1] Batch [0-23]	Speed: 1025.53 samples/sec	cross-entropy=1.126570	checking_status-accuracy=0.419271
2020-06-22 23:22:35,711 [INFO]  Epoch[1] Train-cross-entropy=1.129209
2020-06-22 23:22:35,713 [INFO]  Epoch[1] Train-checking_status-accuracy=0.438889
2020-06-22 23:22:35,717 [INFO]  Epoch[1] Time cost=0.686
2020-06-22 23:22:35,726 [INFO

2020-06-22 23:22:46,573 [INFO]  CategoricalEncoder for column credit_history                                found only 63 occurrences of value delayed previously
2020-06-22 23:22:46,577 [INFO]  CategoricalEncoder for column credit_history                                found only 36 occurrences of value all paid
2020-06-22 23:22:46,582 [INFO]  CategoricalEncoder for column credit_history                                found only 31 occurrences of value no credits/all paid


Fitting model for column: credit_history


2020-06-22 23:22:47,469 [INFO]  
2020-06-22 23:22:47,838 [INFO]  Epoch[0] Batch [0-23]	Speed: 1089.99 samples/sec	cross-entropy=1.086538	credit_history-accuracy=0.575521
2020-06-22 23:22:48,151 [INFO]  Epoch[0] Train-cross-entropy=1.049954
2020-06-22 23:22:48,154 [INFO]  Epoch[0] Train-credit_history-accuracy=0.625000
2020-06-22 23:22:48,158 [INFO]  Epoch[0] Time cost=0.681
2020-06-22 23:22:48,172 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:22:48,243 [INFO]  Epoch[0] Validation-cross-entropy=0.858893
2020-06-22 23:22:48,247 [INFO]  Epoch[0] Validation-credit_history-accuracy=0.700000
2020-06-22 23:22:48,643 [INFO]  Epoch[1] Batch [0-23]	Speed: 973.52 samples/sec	cross-entropy=0.857125	credit_history-accuracy=0.710938
2020-06-22 23:22:48,973 [INFO]  Epoch[1] Train-cross-entropy=0.891277
2020-06-22 23:22:48,977 [INFO]  Epoch[1] Train-credit_history-accuracy=0.702778
2020-06-22 23:22:48,980 [INFO]  Epoch[1] Time cost=0.730
2020-06-22 23:22:48,993 [INFO]  Sa

Fitting model for column: purpose


2020-06-22 23:22:54,100 [INFO]  
2020-06-22 23:22:54,629 [INFO]  Epoch[0] Batch [0-23]	Speed: 754.56 samples/sec	cross-entropy=1.845153	purpose-accuracy=0.255208
2020-06-22 23:22:55,042 [INFO]  Epoch[0] Train-cross-entropy=1.779229
2020-06-22 23:22:55,045 [INFO]  Epoch[0] Train-purpose-accuracy=0.300000
2020-06-22 23:22:55,049 [INFO]  Epoch[0] Time cost=0.941
2020-06-22 23:22:55,062 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:22:55,147 [INFO]  Epoch[0] Validation-cross-entropy=1.744341
2020-06-22 23:22:55,150 [INFO]  Epoch[0] Validation-purpose-accuracy=0.312500
2020-06-22 23:22:55,605 [INFO]  Epoch[1] Batch [0-23]	Speed: 856.11 samples/sec	cross-entropy=1.600820	purpose-accuracy=0.307292
2020-06-22 23:22:55,997 [INFO]  Epoch[1] Train-cross-entropy=1.601411
2020-06-22 23:22:56,001 [INFO]  Epoch[1] Train-purpose-accuracy=0.345833
2020-06-22 23:22:56,004 [INFO]  Epoch[1] Time cost=0.851
2020-06-22 23:22:56,026 [INFO]  Saved checkpoint to "imputer_model\mod

Fitting model for column: savings_status


2020-06-22 23:23:05,999 [INFO]  
2020-06-22 23:23:06,405 [INFO]  Epoch[0] Batch [0-23]	Speed: 1029.22 samples/sec	cross-entropy=1.116828	savings_status-accuracy=0.627604
2020-06-22 23:23:06,731 [INFO]  Epoch[0] Train-cross-entropy=1.168884
2020-06-22 23:23:06,734 [INFO]  Epoch[0] Train-savings_status-accuracy=0.597222
2020-06-22 23:23:06,740 [INFO]  Epoch[0] Time cost=0.733
2020-06-22 23:23:06,749 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:23:06,805 [INFO]  Epoch[0] Validation-cross-entropy=1.251577
2020-06-22 23:23:06,808 [INFO]  Epoch[0] Validation-savings_status-accuracy=0.500000
2020-06-22 23:23:07,212 [INFO]  Epoch[1] Batch [0-23]	Speed: 948.55 samples/sec	cross-entropy=0.976428	savings_status-accuracy=0.658854
2020-06-22 23:23:07,608 [INFO]  Epoch[1] Train-cross-entropy=1.068355
2020-06-22 23:23:07,612 [INFO]  Epoch[1] Train-savings_status-accuracy=0.612500
2020-06-22 23:23:07,616 [INFO]  Epoch[1] Time cost=0.806
2020-06-22 23:23:07,628 [INFO]  Sa

Fitting model for column: employment


2020-06-22 23:23:14,108 [INFO]  
2020-06-22 23:23:14,630 [INFO]  Epoch[0] Batch [0-23]	Speed: 756.12 samples/sec	cross-entropy=1.458680	employment-accuracy=0.330729
2020-06-22 23:23:14,943 [INFO]  Epoch[0] Train-cross-entropy=1.389435
2020-06-22 23:23:14,947 [INFO]  Epoch[0] Train-employment-accuracy=0.383333
2020-06-22 23:23:14,950 [INFO]  Epoch[0] Time cost=0.834
2020-06-22 23:23:14,960 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:23:15,019 [INFO]  Epoch[0] Validation-cross-entropy=1.194825
2020-06-22 23:23:15,023 [INFO]  Epoch[0] Validation-employment-accuracy=0.487500
2020-06-22 23:23:15,375 [INFO]  Epoch[1] Batch [0-23]	Speed: 1108.00 samples/sec	cross-entropy=1.273190	employment-accuracy=0.434896
2020-06-22 23:23:15,730 [INFO]  Epoch[1] Train-cross-entropy=1.267736
2020-06-22 23:23:15,733 [INFO]  Epoch[1] Train-employment-accuracy=0.445833
2020-06-22 23:23:15,737 [INFO]  Epoch[1] Time cost=0.710
2020-06-22 23:23:15,744 [INFO]  Saved checkpoint to "i

Fitting model for column: personal_status


2020-06-22 23:23:24,480 [INFO]  
2020-06-22 23:23:24,863 [INFO]  Epoch[0] Batch [0-23]	Speed: 1048.25 samples/sec	cross-entropy=1.094120	personal_status-accuracy=0.572917
2020-06-22 23:23:25,214 [INFO]  Epoch[0] Train-cross-entropy=1.008009
2020-06-22 23:23:25,219 [INFO]  Epoch[0] Train-personal_status-accuracy=0.595833
2020-06-22 23:23:25,222 [INFO]  Epoch[0] Time cost=0.734
2020-06-22 23:23:25,229 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:23:25,288 [INFO]  Epoch[0] Validation-cross-entropy=0.896609
2020-06-22 23:23:25,291 [INFO]  Epoch[0] Validation-personal_status-accuracy=0.600000
2020-06-22 23:23:25,763 [INFO]  Epoch[1] Batch [0-23]	Speed: 814.53 samples/sec	cross-entropy=0.899457	personal_status-accuracy=0.630208
2020-06-22 23:23:26,091 [INFO]  Epoch[1] Train-cross-entropy=0.885442
2020-06-22 23:23:26,094 [INFO]  Epoch[1] Train-personal_status-accuracy=0.637500
2020-06-22 23:23:26,099 [INFO]  Epoch[1] Time cost=0.805
2020-06-22 23:23:26,106 [INFO

Fitting model for column: other_parties


2020-06-22 23:23:32,889 [INFO]  
2020-06-22 23:23:33,313 [INFO]  Epoch[0] Batch [0-23]	Speed: 950.99 samples/sec	cross-entropy=0.558129	other_parties-accuracy=0.872396
2020-06-22 23:23:33,608 [INFO]  Epoch[0] Train-cross-entropy=0.474744
2020-06-22 23:23:33,612 [INFO]  Epoch[0] Train-other_parties-accuracy=0.887500
2020-06-22 23:23:33,616 [INFO]  Epoch[0] Time cost=0.716
2020-06-22 23:23:33,623 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:23:33,675 [INFO]  Epoch[0] Validation-cross-entropy=0.339402
2020-06-22 23:23:33,679 [INFO]  Epoch[0] Validation-other_parties-accuracy=0.912500
2020-06-22 23:23:34,000 [INFO]  Epoch[1] Batch [0-23]	Speed: 1205.81 samples/sec	cross-entropy=0.355497	other_parties-accuracy=0.898438
2020-06-22 23:23:34,313 [INFO]  Epoch[1] Train-cross-entropy=0.352543
2020-06-22 23:23:34,318 [INFO]  Epoch[1] Train-other_parties-accuracy=0.901389
2020-06-22 23:23:34,322 [INFO]  Epoch[1] Time cost=0.639
2020-06-22 23:23:34,332 [INFO]  Saved c

Fitting model for column: property_magnitude


2020-06-22 23:23:41,170 [INFO]  
2020-06-22 23:23:41,586 [INFO]  Epoch[0] Batch [0-23]	Speed: 958.40 samples/sec	cross-entropy=1.280537	property_magnitude-accuracy=0.348958
2020-06-22 23:23:41,937 [INFO]  Epoch[0] Train-cross-entropy=1.260059
2020-06-22 23:23:41,941 [INFO]  Epoch[0] Train-property_magnitude-accuracy=0.369444
2020-06-22 23:23:41,946 [INFO]  Epoch[0] Time cost=0.767
2020-06-22 23:23:41,955 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:23:42,011 [INFO]  Epoch[0] Validation-cross-entropy=1.113960
2020-06-22 23:23:42,015 [INFO]  Epoch[0] Validation-property_magnitude-accuracy=0.475000
2020-06-22 23:23:42,387 [INFO]  Epoch[1] Batch [0-23]	Speed: 1041.66 samples/sec	cross-entropy=1.136349	property_magnitude-accuracy=0.442708
2020-06-22 23:23:42,700 [INFO]  Epoch[1] Train-cross-entropy=1.153716
2020-06-22 23:23:42,704 [INFO]  Epoch[1] Train-property_magnitude-accuracy=0.451389
2020-06-22 23:23:42,708 [INFO]  Epoch[1] Time cost=0.690
2020-06-22 23:

2020-06-22 23:23:52,103 [INFO]  Epoch[13] Time cost=0.666
2020-06-22 23:23:52,112 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-22 23:23:52,164 [INFO]  Epoch[13] Validation-cross-entropy=1.042172
2020-06-22 23:23:52,167 [INFO]  Epoch[13] Validation-property_magnitude-accuracy=0.512500
2020-06-22 23:23:52,560 [INFO]  Epoch[14] Batch [0-23]	Speed: 982.57 samples/sec	cross-entropy=0.920992	property_magnitude-accuracy=0.593750
2020-06-22 23:23:52,883 [INFO]  Epoch[14] Train-cross-entropy=0.926176
2020-06-22 23:23:52,888 [INFO]  Epoch[14] Train-property_magnitude-accuracy=0.595833
2020-06-22 23:23:52,891 [INFO]  Epoch[14] Time cost=0.720
2020-06-22 23:23:52,900 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:23:52,957 [INFO]  Epoch[14] Validation-cross-entropy=1.042395
2020-06-22 23:23:52,961 [INFO]  Epoch[14] Validation-property_magnitude-accuracy=0.512500
2020-06-22 23:23:53,341 [INFO]  Epoch[15] Batch [0-23]	Speed: 1013.69 samples/sec	c

Fitting model for column: other_payment_plans


2020-06-22 23:24:01,377 [INFO]  
2020-06-22 23:24:02,019 [INFO]  Epoch[0] Batch [0-23]	Speed: 610.25 samples/sec	cross-entropy=0.721428	other_payment_plans-accuracy=0.796875
2020-06-22 23:24:02,506 [INFO]  Epoch[0] Train-cross-entropy=0.663562
2020-06-22 23:24:02,510 [INFO]  Epoch[0] Train-other_payment_plans-accuracy=0.800000
2020-06-22 23:24:02,513 [INFO]  Epoch[0] Time cost=1.126
2020-06-22 23:24:02,532 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:24:02,650 [INFO]  Epoch[0] Validation-cross-entropy=0.503750
2020-06-22 23:24:02,654 [INFO]  Epoch[0] Validation-other_payment_plans-accuracy=0.850000
2020-06-22 23:24:03,158 [INFO]  Epoch[1] Batch [0-23]	Speed: 863.54 samples/sec	cross-entropy=0.541429	other_payment_plans-accuracy=0.817708
2020-06-22 23:24:03,459 [INFO]  Epoch[1] Train-cross-entropy=0.549453
2020-06-22 23:24:03,463 [INFO]  Epoch[1] Train-other_payment_plans-accuracy=0.811111
2020-06-22 23:24:03,467 [INFO]  Epoch[1] Time cost=0.809
2020-06-22

Fitting model for column: housing


2020-06-22 23:24:08,757 [INFO]  
2020-06-22 23:24:09,146 [INFO]  Epoch[0] Batch [0-23]	Speed: 1039.33 samples/sec	cross-entropy=0.805240	housing-accuracy=0.692708
2020-06-22 23:24:09,478 [INFO]  Epoch[0] Train-cross-entropy=0.726922
2020-06-22 23:24:09,483 [INFO]  Epoch[0] Train-housing-accuracy=0.720833
2020-06-22 23:24:09,488 [INFO]  Epoch[0] Time cost=0.722
2020-06-22 23:24:09,495 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:24:09,550 [INFO]  Epoch[0] Validation-cross-entropy=0.569481
2020-06-22 23:24:09,553 [INFO]  Epoch[0] Validation-housing-accuracy=0.737500
2020-06-22 23:24:09,890 [INFO]  Epoch[1] Batch [0-23]	Speed: 1160.33 samples/sec	cross-entropy=0.626335	housing-accuracy=0.742188
2020-06-22 23:24:10,176 [INFO]  Epoch[1] Train-cross-entropy=0.602726
2020-06-22 23:24:10,179 [INFO]  Epoch[1] Train-housing-accuracy=0.754167
2020-06-22 23:24:10,182 [INFO]  Epoch[1] Time cost=0.625
2020-06-22 23:24:10,191 [INFO]  Saved checkpoint to "imputer_model\m

2020-06-22 23:24:20,168 [INFO]  CategoricalEncoder for column job                                found only 98 occurrences of value high qualif/self emp/mgmt
2020-06-22 23:24:20,172 [INFO]  CategoricalEncoder for column job                                found only 16 occurrences of value unemp/unskilled non res


Fitting model for column: job


2020-06-22 23:24:21,198 [INFO]  
2020-06-22 23:24:21,794 [INFO]  Epoch[0] Batch [0-23]	Speed: 662.45 samples/sec	cross-entropy=1.067986	job-accuracy=0.578125
2020-06-22 23:24:22,101 [INFO]  Epoch[0] Train-cross-entropy=0.950978
2020-06-22 23:24:22,105 [INFO]  Epoch[0] Train-job-accuracy=0.627778
2020-06-22 23:24:22,109 [INFO]  Epoch[0] Time cost=0.901
2020-06-22 23:24:22,117 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:24:22,171 [INFO]  Epoch[0] Validation-cross-entropy=0.749943
2020-06-22 23:24:22,175 [INFO]  Epoch[0] Validation-job-accuracy=0.700000
2020-06-22 23:24:22,530 [INFO]  Epoch[1] Batch [0-23]	Speed: 1088.45 samples/sec	cross-entropy=0.865077	job-accuracy=0.622396
2020-06-22 23:24:22,842 [INFO]  Epoch[1] Train-cross-entropy=0.814491
2020-06-22 23:24:22,846 [INFO]  Epoch[1] Train-job-accuracy=0.659722
2020-06-22 23:24:22,849 [INFO]  Epoch[1] Time cost=0.670
2020-06-22 23:24:22,859 [INFO]  Saved checkpoint to "imputer_model\model-0001.params"
202

Fitting model for column: own_telephone


2020-06-22 23:24:29,774 [INFO]  
2020-06-22 23:24:30,142 [INFO]  Epoch[0] Batch [0-23]	Speed: 1096.30 samples/sec	cross-entropy=0.678979	own_telephone-accuracy=0.638021
2020-06-22 23:24:30,454 [INFO]  Epoch[0] Train-cross-entropy=0.631679
2020-06-22 23:24:30,461 [INFO]  Epoch[0] Train-own_telephone-accuracy=0.651389
2020-06-22 23:24:30,465 [INFO]  Epoch[0] Time cost=0.681
2020-06-22 23:24:30,476 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:24:30,547 [INFO]  Epoch[0] Validation-cross-entropy=0.558979
2020-06-22 23:24:30,550 [INFO]  Epoch[0] Validation-own_telephone-accuracy=0.750000
2020-06-22 23:24:30,881 [INFO]  Epoch[1] Batch [0-23]	Speed: 1178.86 samples/sec	cross-entropy=0.550761	own_telephone-accuracy=0.718750
2020-06-22 23:24:31,168 [INFO]  Epoch[1] Train-cross-entropy=0.552716
2020-06-22 23:24:31,171 [INFO]  Epoch[1] Train-own_telephone-accuracy=0.701389
2020-06-22 23:24:31,175 [INFO]  Epoch[1] Time cost=0.621
2020-06-22 23:24:31,183 [INFO]  Saved 

Fitting model for column: foreign_worker


2020-06-22 23:24:38,195 [INFO]  
2020-06-22 23:24:38,563 [INFO]  Epoch[0] Batch [0-23]	Speed: 1077.30 samples/sec	cross-entropy=0.360983	foreign_worker-accuracy=0.929688
2020-06-22 23:24:38,880 [INFO]  Epoch[0] Train-cross-entropy=0.246135
2020-06-22 23:24:38,884 [INFO]  Epoch[0] Train-foreign_worker-accuracy=0.950000
2020-06-22 23:24:38,888 [INFO]  Epoch[0] Time cost=0.684
2020-06-22 23:24:38,898 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:24:38,959 [INFO]  Epoch[0] Validation-cross-entropy=0.110679
2020-06-22 23:24:38,963 [INFO]  Epoch[0] Validation-foreign_worker-accuracy=0.975000
2020-06-22 23:24:39,388 [INFO]  Epoch[1] Batch [0-23]	Speed: 914.45 samples/sec	cross-entropy=0.170895	foreign_worker-accuracy=0.950521
2020-06-22 23:24:39,704 [INFO]  Epoch[1] Train-cross-entropy=0.148414
2020-06-22 23:24:39,710 [INFO]  Epoch[1] Train-foreign_worker-accuracy=0.961111
2020-06-22 23:24:39,714 [INFO]  Epoch[1] Time cost=0.746
2020-06-22 23:24:39,724 [INFO]  Sa

2020-06-22 23:24:48,731 [INFO]  Epoch[13] Validation-cross-entropy=0.098764
2020-06-22 23:24:48,737 [INFO]  Epoch[13] Validation-foreign_worker-accuracy=0.975000
2020-06-22 23:24:49,093 [INFO]  Epoch[14] Batch [0-23]	Speed: 1085.25 samples/sec	cross-entropy=0.101877	foreign_worker-accuracy=0.955729
2020-06-22 23:24:49,445 [INFO]  Epoch[14] Train-cross-entropy=0.095851
2020-06-22 23:24:49,450 [INFO]  Epoch[14] Train-foreign_worker-accuracy=0.965278
2020-06-22 23:24:49,454 [INFO]  Epoch[14] Time cost=0.714
2020-06-22 23:24:49,465 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:24:49,547 [INFO]  No improvement detected for 5 epochs compared to 0.09598303362727165 last error obtained: 0.09967826567590236, stopping here
2020-06-22 23:24:49,551 [INFO]  
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting model for column: duration


2020-06-22 23:24:51,167 [INFO]  
2020-06-22 23:24:51,578 [INFO]  Epoch[0] Batch [0-23]	Speed: 1058.75 samples/sec	cross-entropy=13.230839	duration-accuracy=0.000000
2020-06-22 23:24:51,894 [INFO]  Epoch[0] Train-cross-entropy=10.659650
2020-06-22 23:24:51,899 [INFO]  Epoch[0] Train-duration-accuracy=0.000000
2020-06-22 23:24:51,902 [INFO]  Epoch[0] Time cost=0.725
2020-06-22 23:24:51,910 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:24:51,961 [INFO]  Epoch[0] Validation-cross-entropy=9.130201
2020-06-22 23:24:51,965 [INFO]  Epoch[0] Validation-duration-accuracy=0.000000
2020-06-22 23:24:52,258 [INFO]  Epoch[1] Batch [0-23]	Speed: 1334.43 samples/sec	cross-entropy=8.244398	duration-accuracy=0.000000
2020-06-22 23:24:52,538 [INFO]  Epoch[1] Train-cross-entropy=7.862704
2020-06-22 23:24:52,541 [INFO]  Epoch[1] Train-duration-accuracy=0.000000
2020-06-22 23:24:52,545 [INFO]  Epoch[1] Time cost=0.576
2020-06-22 23:24:52,555 [INFO]  Saved checkpoint to "imputer_

Fitting model for column: credit_amount


2020-06-22 23:24:58,656 [INFO]  
2020-06-22 23:24:58,988 [INFO]  Epoch[0] Batch [0-23]	Speed: 1209.78 samples/sec	cross-entropy=11.356336	credit_amount-accuracy=0.000000
2020-06-22 23:24:59,251 [INFO]  Epoch[0] Train-cross-entropy=9.291890
2020-06-22 23:24:59,257 [INFO]  Epoch[0] Train-credit_amount-accuracy=0.000000
2020-06-22 23:24:59,264 [INFO]  Epoch[0] Time cost=0.595
2020-06-22 23:24:59,273 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:24:59,330 [INFO]  Epoch[0] Validation-cross-entropy=8.061475
2020-06-22 23:24:59,334 [INFO]  Epoch[0] Validation-credit_amount-accuracy=0.000000
2020-06-22 23:24:59,652 [INFO]  Epoch[1] Batch [0-23]	Speed: 1213.69 samples/sec	cross-entropy=7.378949	credit_amount-accuracy=0.000000
2020-06-22 23:24:59,963 [INFO]  Epoch[1] Train-cross-entropy=6.749732
2020-06-22 23:24:59,967 [INFO]  Epoch[1] Train-credit_amount-accuracy=0.000000
2020-06-22 23:24:59,971 [INFO]  Epoch[1] Time cost=0.633
2020-06-22 23:24:59,980 [INFO]  Saved

Fitting model for column: installment_commitment


2020-06-22 23:25:06,632 [INFO]  
2020-06-22 23:25:07,000 [INFO]  Epoch[0] Batch [0-23]	Speed: 1093.20 samples/sec	cross-entropy=13.968772	installment_commitment-accuracy=0.000000
2020-06-22 23:25:07,251 [INFO]  Epoch[0] Train-cross-entropy=14.039492
2020-06-22 23:25:07,256 [INFO]  Epoch[0] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:25:07,261 [INFO]  Epoch[0] Time cost=0.617
2020-06-22 23:25:07,269 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:25:07,325 [INFO]  Epoch[0] Validation-cross-entropy=11.850121
2020-06-22 23:25:07,329 [INFO]  Epoch[0] Validation-installment_commitment-accuracy=0.000000
2020-06-22 23:25:07,623 [INFO]  Epoch[1] Batch [0-23]	Speed: 1327.28 samples/sec	cross-entropy=11.714394	installment_commitment-accuracy=0.000000
2020-06-22 23:25:07,875 [INFO]  Epoch[1] Train-cross-entropy=12.268152
2020-06-22 23:25:07,879 [INFO]  Epoch[1] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:25:07,883 [INFO]  Epoch[1] Time

2020-06-22 23:25:16,153 [INFO]  Epoch[13] Train-cross-entropy=10.165791
2020-06-22 23:25:16,156 [INFO]  Epoch[13] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:25:16,160 [INFO]  Epoch[13] Time cost=0.562
2020-06-22 23:25:16,169 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-22 23:25:16,228 [INFO]  Epoch[13] Validation-cross-entropy=10.800904
2020-06-22 23:25:16,231 [INFO]  Epoch[13] Validation-installment_commitment-accuracy=0.000000
2020-06-22 23:25:16,511 [INFO]  Epoch[14] Batch [0-23]	Speed: 1392.38 samples/sec	cross-entropy=9.740624	installment_commitment-accuracy=0.000000
2020-06-22 23:25:16,775 [INFO]  Epoch[14] Train-cross-entropy=10.110223
2020-06-22 23:25:16,779 [INFO]  Epoch[14] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:25:16,782 [INFO]  Epoch[14] Time cost=0.548
2020-06-22 23:25:16,792 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:25:16,843 [INFO]  Epoch[14] Validation-cross-entropy=10.797

Fitting model for column: residence_since


2020-06-22 23:25:19,184 [INFO]  
2020-06-22 23:25:19,532 [INFO]  Epoch[0] Batch [0-23]	Speed: 1149.48 samples/sec	cross-entropy=15.513399	residence_since-accuracy=0.000000
2020-06-22 23:25:19,774 [INFO]  Epoch[0] Train-cross-entropy=15.724795
2020-06-22 23:25:19,778 [INFO]  Epoch[0] Train-residence_since-accuracy=0.000000
2020-06-22 23:25:19,782 [INFO]  Epoch[0] Time cost=0.587
2020-06-22 23:25:19,793 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:25:19,845 [INFO]  Epoch[0] Validation-cross-entropy=13.663737
2020-06-22 23:25:19,849 [INFO]  Epoch[0] Validation-residence_since-accuracy=0.000000
2020-06-22 23:25:20,139 [INFO]  Epoch[1] Batch [0-23]	Speed: 1341.76 samples/sec	cross-entropy=14.000350	residence_since-accuracy=0.000000
2020-06-22 23:25:20,409 [INFO]  Epoch[1] Train-cross-entropy=14.377342
2020-06-22 23:25:20,413 [INFO]  Epoch[1] Train-residence_since-accuracy=0.000000
2020-06-22 23:25:20,417 [INFO]  Epoch[1] Time cost=0.564
2020-06-22 23:25:20,426

2020-06-22 23:25:28,826 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-22 23:25:28,881 [INFO]  Epoch[13] Validation-cross-entropy=13.463364
2020-06-22 23:25:28,886 [INFO]  Epoch[13] Validation-residence_since-accuracy=0.000000
2020-06-22 23:25:29,189 [INFO]  Epoch[14] Batch [0-23]	Speed: 1308.46 samples/sec	cross-entropy=11.196969	residence_since-accuracy=0.000000
2020-06-22 23:25:29,520 [INFO]  Epoch[14] Train-cross-entropy=11.410341
2020-06-22 23:25:29,523 [INFO]  Epoch[14] Train-residence_since-accuracy=0.000000
2020-06-22 23:25:29,528 [INFO]  Epoch[14] Time cost=0.638
2020-06-22 23:25:29,539 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:25:29,603 [INFO]  Epoch[14] Validation-cross-entropy=13.473713
2020-06-22 23:25:29,607 [INFO]  Epoch[14] Validation-residence_since-accuracy=0.000000
2020-06-22 23:25:29,908 [INFO]  Epoch[15] Batch [0-23]	Speed: 1294.60 samples/sec	cross-entropy=11.134558	residence_since-accuracy=0.000000
2020-06-

Fitting model for column: age


2020-06-22 23:25:32,582 [INFO]  
2020-06-22 23:25:32,912 [INFO]  Epoch[0] Batch [0-23]	Speed: 1212.17 samples/sec	cross-entropy=14.990745	age-accuracy=0.000000
2020-06-22 23:25:33,162 [INFO]  Epoch[0] Train-cross-entropy=14.536432
2020-06-22 23:25:33,167 [INFO]  Epoch[0] Train-age-accuracy=0.000000
2020-06-22 23:25:33,171 [INFO]  Epoch[0] Time cost=0.578
2020-06-22 23:25:33,180 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:25:33,231 [INFO]  Epoch[0] Validation-cross-entropy=18.428380
2020-06-22 23:25:33,236 [INFO]  Epoch[0] Validation-age-accuracy=0.000000
2020-06-22 23:25:33,543 [INFO]  Epoch[1] Batch [0-23]	Speed: 1267.98 samples/sec	cross-entropy=13.139181	age-accuracy=0.000000
2020-06-22 23:25:33,815 [INFO]  Epoch[1] Train-cross-entropy=13.102317
2020-06-22 23:25:33,820 [INFO]  Epoch[1] Train-age-accuracy=0.000000
2020-06-22 23:25:33,824 [INFO]  Epoch[1] Time cost=0.584
2020-06-22 23:25:33,833 [INFO]  Saved checkpoint to "imputer_model\model-0001.param

2020-06-22 23:25:42,460 [INFO]  Epoch[14] Time cost=0.551
2020-06-22 23:25:42,468 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:25:42,521 [INFO]  Epoch[14] Validation-cross-entropy=15.369738
2020-06-22 23:25:42,525 [INFO]  Epoch[14] Validation-age-accuracy=0.000000
2020-06-22 23:25:42,840 [INFO]  Epoch[15] Batch [0-23]	Speed: 1233.98 samples/sec	cross-entropy=10.218097	age-accuracy=0.000000
2020-06-22 23:25:43,112 [INFO]  Epoch[15] Train-cross-entropy=10.217817
2020-06-22 23:25:43,116 [INFO]  Epoch[15] Train-age-accuracy=0.000000
2020-06-22 23:25:43,120 [INFO]  Epoch[15] Time cost=0.590
2020-06-22 23:25:43,130 [INFO]  Saved checkpoint to "imputer_model\model-0015.params"
2020-06-22 23:25:43,184 [INFO]  Epoch[15] Validation-cross-entropy=15.381701
2020-06-22 23:25:43,188 [INFO]  Epoch[15] Validation-age-accuracy=0.000000
2020-06-22 23:25:43,483 [INFO]  Epoch[16] Batch [0-23]	Speed: 1336.90 samples/sec	cross-entropy=10.141178	age-accuracy=0.000000
2020-06-22

Fitting model for column: existing_credits


2020-06-22 23:25:50,956 [INFO]  
2020-06-22 23:25:51,312 [INFO]  Epoch[0] Batch [0-23]	Speed: 1131.85 samples/sec	cross-entropy=15.470359	existing_credits-accuracy=0.000000
2020-06-22 23:25:51,573 [INFO]  Epoch[0] Train-cross-entropy=15.524725
2020-06-22 23:25:51,579 [INFO]  Epoch[0] Train-existing_credits-accuracy=0.000000
2020-06-22 23:25:51,584 [INFO]  Epoch[0] Time cost=0.615
2020-06-22 23:25:51,594 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:25:51,653 [INFO]  Epoch[0] Validation-cross-entropy=12.585136
2020-06-22 23:25:51,658 [INFO]  Epoch[0] Validation-existing_credits-accuracy=0.000000
2020-06-22 23:25:52,018 [INFO]  Epoch[1] Batch [0-23]	Speed: 1072.61 samples/sec	cross-entropy=13.441843	existing_credits-accuracy=0.000000
2020-06-22 23:25:52,255 [INFO]  Epoch[1] Train-cross-entropy=13.655326
2020-06-22 23:25:52,262 [INFO]  Epoch[1] Train-existing_credits-accuracy=0.000000
2020-06-22 23:25:52,268 [INFO]  Epoch[1] Time cost=0.605
2020-06-22 23:25:5

Fitting model for column: num_dependents


2020-06-22 23:26:02,427 [INFO]  
2020-06-22 23:26:03,076 [INFO]  Epoch[0] Batch [0-23]	Speed: 614.20 samples/sec	cross-entropy=14.812305	num_dependents-accuracy=0.000000
2020-06-22 23:26:03,370 [INFO]  Epoch[0] Train-cross-entropy=15.648352
2020-06-22 23:26:03,374 [INFO]  Epoch[0] Train-num_dependents-accuracy=0.000000
2020-06-22 23:26:03,378 [INFO]  Epoch[0] Time cost=0.936
2020-06-22 23:26:03,388 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:26:03,451 [INFO]  Epoch[0] Validation-cross-entropy=14.030573
2020-06-22 23:26:03,457 [INFO]  Epoch[0] Validation-num_dependents-accuracy=0.000000
2020-06-22 23:26:03,765 [INFO]  Epoch[1] Batch [0-23]	Speed: 1267.99 samples/sec	cross-entropy=13.452226	num_dependents-accuracy=0.000000
2020-06-22 23:26:04,024 [INFO]  Epoch[1] Train-cross-entropy=14.476436
2020-06-22 23:26:04,029 [INFO]  Epoch[1] Train-num_dependents-accuracy=0.000000
2020-06-22 23:26:04,032 [INFO]  Epoch[1] Time cost=0.571
2020-06-22 23:26:04,042 [INFO

PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50FB3FD0>: 0.7812061711079944
PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50F083C8>: 0.7946470313230481
PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50FD6160>: 0.7850631136044881


2020-06-22 23:26:10,798 [INFO]  CategoricalEncoder for column checking_status                                found only 44 occurrences of value >=200


Fitting model for column: checking_status


2020-06-22 23:26:11,802 [INFO]  
2020-06-22 23:26:12,408 [INFO]  Epoch[0] Batch [0-23]	Speed: 654.77 samples/sec	cross-entropy=1.269358	checking_status-accuracy=0.380208
2020-06-22 23:26:12,717 [INFO]  Epoch[0] Train-cross-entropy=1.230729
2020-06-22 23:26:12,722 [INFO]  Epoch[0] Train-checking_status-accuracy=0.383333
2020-06-22 23:26:12,727 [INFO]  Epoch[0] Time cost=0.910
2020-06-22 23:26:12,736 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:26:12,785 [INFO]  Epoch[0] Validation-cross-entropy=1.049188
2020-06-22 23:26:12,788 [INFO]  Epoch[0] Validation-checking_status-accuracy=0.550000
2020-06-22 23:26:13,178 [INFO]  Epoch[1] Batch [0-23]	Speed: 999.96 samples/sec	cross-entropy=1.129746	checking_status-accuracy=0.429688
2020-06-22 23:26:13,502 [INFO]  Epoch[1] Train-cross-entropy=1.132194
2020-06-22 23:26:13,507 [INFO]  Epoch[1] Train-checking_status-accuracy=0.434722
2020-06-22 23:26:13,511 [INFO]  Epoch[1] Time cost=0.717
2020-06-22 23:26:13,522 [INFO]

Fitting model for column: credit_history


2020-06-22 23:26:22,428 [INFO]  
2020-06-22 23:26:23,095 [INFO]  Epoch[0] Batch [0-23]	Speed: 596.10 samples/sec	cross-entropy=1.086093	credit_history-accuracy=0.583333
2020-06-22 23:26:23,488 [INFO]  Epoch[0] Train-cross-entropy=1.046199
2020-06-22 23:26:23,492 [INFO]  Epoch[0] Train-credit_history-accuracy=0.629167
2020-06-22 23:26:23,497 [INFO]  Epoch[0] Time cost=1.058
2020-06-22 23:26:23,506 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:26:23,563 [INFO]  Epoch[0] Validation-cross-entropy=0.873624
2020-06-22 23:26:23,568 [INFO]  Epoch[0] Validation-credit_history-accuracy=0.712500
2020-06-22 23:26:23,915 [INFO]  Epoch[1] Batch [0-23]	Speed: 1121.53 samples/sec	cross-entropy=0.876201	credit_history-accuracy=0.700521
2020-06-22 23:26:24,229 [INFO]  Epoch[1] Train-cross-entropy=0.899546
2020-06-22 23:26:24,234 [INFO]  Epoch[1] Train-credit_history-accuracy=0.698611
2020-06-22 23:26:24,239 [INFO]  Epoch[1] Time cost=0.667
2020-06-22 23:26:24,250 [INFO]  Sa

Fitting model for column: purpose


2020-06-22 23:26:29,732 [INFO]  
2020-06-22 23:26:30,221 [INFO]  Epoch[0] Batch [0-23]	Speed: 823.59 samples/sec	cross-entropy=1.855478	purpose-accuracy=0.252604
2020-06-22 23:26:30,610 [INFO]  Epoch[0] Train-cross-entropy=1.779483
2020-06-22 23:26:30,614 [INFO]  Epoch[0] Train-purpose-accuracy=0.286111
2020-06-22 23:26:30,618 [INFO]  Epoch[0] Time cost=0.877
2020-06-22 23:26:30,630 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:26:30,688 [INFO]  Epoch[0] Validation-cross-entropy=1.744617
2020-06-22 23:26:30,692 [INFO]  Epoch[0] Validation-purpose-accuracy=0.300000
2020-06-22 23:26:31,290 [INFO]  Epoch[1] Batch [0-23]	Speed: 647.29 samples/sec	cross-entropy=1.606297	purpose-accuracy=0.291667
2020-06-22 23:26:31,731 [INFO]  Epoch[1] Train-cross-entropy=1.604866
2020-06-22 23:26:31,735 [INFO]  Epoch[1] Train-purpose-accuracy=0.326389
2020-06-22 23:26:31,741 [INFO]  Epoch[1] Time cost=1.043
2020-06-22 23:26:31,803 [INFO]  Saved checkpoint to "imputer_model\mod

Fitting model for column: savings_status


2020-06-22 23:26:40,611 [INFO]  
2020-06-22 23:26:40,986 [INFO]  Epoch[0] Batch [0-23]	Speed: 1072.63 samples/sec	cross-entropy=1.115493	savings_status-accuracy=0.630208
2020-06-22 23:26:41,323 [INFO]  Epoch[0] Train-cross-entropy=1.169497
2020-06-22 23:26:41,328 [INFO]  Epoch[0] Train-savings_status-accuracy=0.597222
2020-06-22 23:26:41,333 [INFO]  Epoch[0] Time cost=0.712
2020-06-22 23:26:41,345 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:26:41,404 [INFO]  Epoch[0] Validation-cross-entropy=1.251856
2020-06-22 23:26:41,409 [INFO]  Epoch[0] Validation-savings_status-accuracy=0.487500
2020-06-22 23:26:41,782 [INFO]  Epoch[1] Batch [0-23]	Speed: 1036.48 samples/sec	cross-entropy=0.974795	savings_status-accuracy=0.656250
2020-06-22 23:26:42,093 [INFO]  Epoch[1] Train-cross-entropy=1.066414
2020-06-22 23:26:42,098 [INFO]  Epoch[1] Train-savings_status-accuracy=0.611111
2020-06-22 23:26:42,102 [INFO]  Epoch[1] Time cost=0.689
2020-06-22 23:26:42,111 [INFO]  S

Fitting model for column: employment


2020-06-22 23:26:48,109 [INFO]  
2020-06-22 23:26:48,594 [INFO]  Epoch[0] Batch [0-23]	Speed: 924.77 samples/sec	cross-entropy=1.451846	employment-accuracy=0.335938
2020-06-22 23:26:48,916 [INFO]  Epoch[0] Train-cross-entropy=1.386327
2020-06-22 23:26:48,921 [INFO]  Epoch[0] Train-employment-accuracy=0.387500
2020-06-22 23:26:48,926 [INFO]  Epoch[0] Time cost=0.760
2020-06-22 23:26:48,935 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:26:48,990 [INFO]  Epoch[0] Validation-cross-entropy=1.204541
2020-06-22 23:26:48,995 [INFO]  Epoch[0] Validation-employment-accuracy=0.462500
2020-06-22 23:26:49,508 [INFO]  Epoch[1] Batch [0-23]	Speed: 749.96 samples/sec	cross-entropy=1.277075	employment-accuracy=0.424479
2020-06-22 23:26:49,892 [INFO]  Epoch[1] Train-cross-entropy=1.271776
2020-06-22 23:26:49,896 [INFO]  Epoch[1] Train-employment-accuracy=0.436111
2020-06-22 23:26:49,900 [INFO]  Epoch[1] Time cost=0.900
2020-06-22 23:26:49,909 [INFO]  Saved checkpoint to "im

Fitting model for column: personal_status


2020-06-22 23:26:56,900 [INFO]  
2020-06-22 23:26:57,289 [INFO]  Epoch[0] Batch [0-23]	Speed: 1027.76 samples/sec	cross-entropy=1.090422	personal_status-accuracy=0.559896
2020-06-22 23:26:57,649 [INFO]  Epoch[0] Train-cross-entropy=1.009791
2020-06-22 23:26:57,654 [INFO]  Epoch[0] Train-personal_status-accuracy=0.587500
2020-06-22 23:26:57,659 [INFO]  Epoch[0] Time cost=0.749
2020-06-22 23:26:57,671 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:26:57,746 [INFO]  Epoch[0] Validation-cross-entropy=0.899064
2020-06-22 23:26:57,750 [INFO]  Epoch[0] Validation-personal_status-accuracy=0.587500
2020-06-22 23:26:58,103 [INFO]  Epoch[1] Batch [0-23]	Speed: 1104.74 samples/sec	cross-entropy=0.898937	personal_status-accuracy=0.614583
2020-06-22 23:26:58,413 [INFO]  Epoch[1] Train-cross-entropy=0.887238
2020-06-22 23:26:58,419 [INFO]  Epoch[1] Train-personal_status-accuracy=0.630556
2020-06-22 23:26:58,423 [INFO]  Epoch[1] Time cost=0.669
2020-06-22 23:26:58,436 [INF

Fitting model for column: other_parties


2020-06-22 23:27:05,112 [INFO]  
2020-06-22 23:27:05,513 [INFO]  Epoch[0] Batch [0-23]	Speed: 1008.16 samples/sec	cross-entropy=0.549558	other_parties-accuracy=0.864583
2020-06-22 23:27:05,815 [INFO]  Epoch[0] Train-cross-entropy=0.470729
2020-06-22 23:27:05,819 [INFO]  Epoch[0] Train-other_parties-accuracy=0.883333
2020-06-22 23:27:05,823 [INFO]  Epoch[0] Time cost=0.700
2020-06-22 23:27:05,833 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:27:05,888 [INFO]  Epoch[0] Validation-cross-entropy=0.351835
2020-06-22 23:27:05,893 [INFO]  Epoch[0] Validation-other_parties-accuracy=0.912500
2020-06-22 23:27:06,195 [INFO]  Epoch[1] Batch [0-23]	Speed: 1290.15 samples/sec	cross-entropy=0.354230	other_parties-accuracy=0.898438
2020-06-22 23:27:06,502 [INFO]  Epoch[1] Train-cross-entropy=0.353912
2020-06-22 23:27:06,507 [INFO]  Epoch[1] Train-other_parties-accuracy=0.901389
2020-06-22 23:27:06,511 [INFO]  Epoch[1] Time cost=0.613
2020-06-22 23:27:06,520 [INFO]  Saved 

Fitting model for column: property_magnitude


2020-06-22 23:27:12,729 [INFO]  
2020-06-22 23:27:13,132 [INFO]  Epoch[0] Batch [0-23]	Speed: 1013.56 samples/sec	cross-entropy=1.280408	property_magnitude-accuracy=0.377604
2020-06-22 23:27:13,438 [INFO]  Epoch[0] Train-cross-entropy=1.260246
2020-06-22 23:27:13,442 [INFO]  Epoch[0] Train-property_magnitude-accuracy=0.400000
2020-06-22 23:27:13,448 [INFO]  Epoch[0] Time cost=0.706
2020-06-22 23:27:13,457 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:27:13,516 [INFO]  Epoch[0] Validation-cross-entropy=1.112021
2020-06-22 23:27:13,520 [INFO]  Epoch[0] Validation-property_magnitude-accuracy=0.462500
2020-06-22 23:27:13,895 [INFO]  Epoch[1] Batch [0-23]	Speed: 1040.83 samples/sec	cross-entropy=1.139063	property_magnitude-accuracy=0.429688
2020-06-22 23:27:14,193 [INFO]  Epoch[1] Train-cross-entropy=1.156024
2020-06-22 23:27:14,198 [INFO]  Epoch[1] Train-property_magnitude-accuracy=0.445833
2020-06-22 23:27:14,203 [INFO]  Epoch[1] Time cost=0.680
2020-06-22 23

2020-06-22 23:27:23,769 [INFO]  Epoch[13] Time cost=0.712
2020-06-22 23:27:23,781 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-22 23:27:23,856 [INFO]  Epoch[13] Validation-cross-entropy=1.043900
2020-06-22 23:27:23,862 [INFO]  Epoch[13] Validation-property_magnitude-accuracy=0.500000
2020-06-22 23:27:24,313 [INFO]  Epoch[14] Batch [0-23]	Speed: 869.19 samples/sec	cross-entropy=0.922820	property_magnitude-accuracy=0.593750
2020-06-22 23:27:24,667 [INFO]  Epoch[14] Train-cross-entropy=0.930142
2020-06-22 23:27:24,672 [INFO]  Epoch[14] Train-property_magnitude-accuracy=0.594444
2020-06-22 23:27:24,678 [INFO]  Epoch[14] Time cost=0.809
2020-06-22 23:27:24,692 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:27:24,769 [INFO]  Epoch[14] Validation-cross-entropy=1.044718
2020-06-22 23:27:24,777 [INFO]  Epoch[14] Validation-property_magnitude-accuracy=0.500000
2020-06-22 23:27:25,271 [INFO]  Epoch[15] Batch [0-23]	Speed: 797.79 samples/sec	cr

Fitting model for column: other_payment_plans


2020-06-22 23:27:29,336 [INFO]  
2020-06-22 23:27:29,734 [INFO]  Epoch[0] Batch [0-23]	Speed: 1010.89 samples/sec	cross-entropy=0.717418	other_payment_plans-accuracy=0.783854
2020-06-22 23:27:30,015 [INFO]  Epoch[0] Train-cross-entropy=0.661797
2020-06-22 23:27:30,019 [INFO]  Epoch[0] Train-other_payment_plans-accuracy=0.793056
2020-06-22 23:27:30,025 [INFO]  Epoch[0] Time cost=0.679
2020-06-22 23:27:30,035 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:27:30,100 [INFO]  Epoch[0] Validation-cross-entropy=0.508630
2020-06-22 23:27:30,104 [INFO]  Epoch[0] Validation-other_payment_plans-accuracy=0.850000
2020-06-22 23:27:30,410 [INFO]  Epoch[1] Batch [0-23]	Speed: 1276.72 samples/sec	cross-entropy=0.540378	other_payment_plans-accuracy=0.817708
2020-06-22 23:27:30,713 [INFO]  Epoch[1] Train-cross-entropy=0.549548
2020-06-22 23:27:30,718 [INFO]  Epoch[1] Train-other_payment_plans-accuracy=0.811111
2020-06-22 23:27:30,722 [INFO]  Epoch[1] Time cost=0.612
2020-06-

Fitting model for column: housing


2020-06-22 23:27:37,154 [INFO]  
2020-06-22 23:27:37,513 [INFO]  Epoch[0] Batch [0-23]	Speed: 1150.90 samples/sec	cross-entropy=0.806953	housing-accuracy=0.697917
2020-06-22 23:27:37,785 [INFO]  Epoch[0] Train-cross-entropy=0.729255
2020-06-22 23:27:37,789 [INFO]  Epoch[0] Train-housing-accuracy=0.719444
2020-06-22 23:27:37,793 [INFO]  Epoch[0] Time cost=0.627
2020-06-22 23:27:37,802 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:27:37,854 [INFO]  Epoch[0] Validation-cross-entropy=0.565097
2020-06-22 23:27:37,859 [INFO]  Epoch[0] Validation-housing-accuracy=0.750000
2020-06-22 23:27:38,213 [INFO]  Epoch[1] Batch [0-23]	Speed: 1091.67 samples/sec	cross-entropy=0.626088	housing-accuracy=0.747396
2020-06-22 23:27:38,516 [INFO]  Epoch[1] Train-cross-entropy=0.604060
2020-06-22 23:27:38,523 [INFO]  Epoch[1] Train-housing-accuracy=0.758333
2020-06-22 23:27:38,529 [INFO]  Epoch[1] Time cost=0.665
2020-06-22 23:27:38,540 [INFO]  Saved checkpoint to "imputer_model\m

Fitting model for column: job


2020-06-22 23:27:48,444 [INFO]  
2020-06-22 23:27:48,899 [INFO]  Epoch[0] Batch [0-23]	Speed: 876.45 samples/sec	cross-entropy=1.071726	job-accuracy=0.585938
2020-06-22 23:27:49,211 [INFO]  Epoch[0] Train-cross-entropy=0.947158
2020-06-22 23:27:49,217 [INFO]  Epoch[0] Train-job-accuracy=0.634722
2020-06-22 23:27:49,222 [INFO]  Epoch[0] Time cost=0.767
2020-06-22 23:27:49,230 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:27:49,293 [INFO]  Epoch[0] Validation-cross-entropy=0.760929
2020-06-22 23:27:49,298 [INFO]  Epoch[0] Validation-job-accuracy=0.712500
2020-06-22 23:27:49,679 [INFO]  Epoch[1] Batch [0-23]	Speed: 1025.92 samples/sec	cross-entropy=0.866940	job-accuracy=0.625000
2020-06-22 23:27:49,980 [INFO]  Epoch[1] Train-cross-entropy=0.812081
2020-06-22 23:27:49,985 [INFO]  Epoch[1] Train-job-accuracy=0.655556
2020-06-22 23:27:49,991 [INFO]  Epoch[1] Time cost=0.687
2020-06-22 23:27:50,001 [INFO]  Saved checkpoint to "imputer_model\model-0001.params"
202

Fitting model for column: own_telephone


2020-06-22 23:27:56,464 [INFO]  
2020-06-22 23:27:56,842 [INFO]  Epoch[0] Batch [0-23]	Speed: 1072.63 samples/sec	cross-entropy=0.678783	own_telephone-accuracy=0.627604
2020-06-22 23:27:57,134 [INFO]  Epoch[0] Train-cross-entropy=0.630043
2020-06-22 23:27:57,139 [INFO]  Epoch[0] Train-own_telephone-accuracy=0.645833
2020-06-22 23:27:57,143 [INFO]  Epoch[0] Time cost=0.667
2020-06-22 23:27:57,154 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:27:57,207 [INFO]  Epoch[0] Validation-cross-entropy=0.559714
2020-06-22 23:27:57,211 [INFO]  Epoch[0] Validation-own_telephone-accuracy=0.750000
2020-06-22 23:27:57,532 [INFO]  Epoch[1] Batch [0-23]	Speed: 1209.79 samples/sec	cross-entropy=0.549904	own_telephone-accuracy=0.713542
2020-06-22 23:27:57,862 [INFO]  Epoch[1] Train-cross-entropy=0.549405
2020-06-22 23:27:57,868 [INFO]  Epoch[1] Train-own_telephone-accuracy=0.705556
2020-06-22 23:27:57,873 [INFO]  Epoch[1] Time cost=0.659
2020-06-22 23:27:57,883 [INFO]  Saved 

Fitting model for column: foreign_worker


2020-06-22 23:28:04,742 [INFO]  
2020-06-22 23:28:05,081 [INFO]  Epoch[0] Batch [0-23]	Speed: 1184.48 samples/sec	cross-entropy=0.343731	foreign_worker-accuracy=0.924479
2020-06-22 23:28:05,389 [INFO]  Epoch[0] Train-cross-entropy=0.236997
2020-06-22 23:28:05,394 [INFO]  Epoch[0] Train-foreign_worker-accuracy=0.947222
2020-06-22 23:28:05,399 [INFO]  Epoch[0] Time cost=0.647
2020-06-22 23:28:05,410 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:28:05,482 [INFO]  Epoch[0] Validation-cross-entropy=0.100723
2020-06-22 23:28:05,487 [INFO]  Epoch[0] Validation-foreign_worker-accuracy=0.975000
2020-06-22 23:28:05,830 [INFO]  Epoch[1] Batch [0-23]	Speed: 1149.48 samples/sec	cross-entropy=0.167863	foreign_worker-accuracy=0.950521
2020-06-22 23:28:06,147 [INFO]  Epoch[1] Train-cross-entropy=0.146229
2020-06-22 23:28:06,152 [INFO]  Epoch[1] Train-foreign_worker-accuracy=0.961111
2020-06-22 23:28:06,156 [INFO]  Epoch[1] Time cost=0.663
2020-06-22 23:28:06,166 [INFO]  S

2020-06-22 23:28:15,355 [INFO]  No improvement detected for 5 epochs compared to 0.08874571770429611 last error obtained: 0.09453924065455795, stopping here
2020-06-22 23:28:15,498 [INFO]  
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting model for column: duration


2020-06-22 23:28:17,925 [INFO]  
2020-06-22 23:28:18,238 [INFO]  Epoch[0] Batch [0-23]	Speed: 1292.79 samples/sec	cross-entropy=13.058780	duration-accuracy=0.000000
2020-06-22 23:28:18,526 [INFO]  Epoch[0] Train-cross-entropy=10.565690
2020-06-22 23:28:18,531 [INFO]  Epoch[0] Train-duration-accuracy=0.000000
2020-06-22 23:28:18,536 [INFO]  Epoch[0] Time cost=0.600
2020-06-22 23:28:18,545 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:28:18,627 [INFO]  Epoch[0] Validation-cross-entropy=8.482756
2020-06-22 23:28:18,632 [INFO]  Epoch[0] Validation-duration-accuracy=0.000000
2020-06-22 23:28:18,973 [INFO]  Epoch[1] Batch [0-23]	Speed: 1156.69 samples/sec	cross-entropy=8.253201	duration-accuracy=0.000000
2020-06-22 23:28:19,259 [INFO]  Epoch[1] Train-cross-entropy=7.854777
2020-06-22 23:28:19,264 [INFO]  Epoch[1] Train-duration-accuracy=0.000000
2020-06-22 23:28:19,269 [INFO]  Epoch[1] Time cost=0.632
2020-06-22 23:28:19,279 [INFO]  Saved checkpoint to "imputer_

Fitting model for column: credit_amount


2020-06-22 23:28:26,191 [INFO]  
2020-06-22 23:28:26,540 [INFO]  Epoch[0] Batch [0-23]	Speed: 1190.27 samples/sec	cross-entropy=11.416885	credit_amount-accuracy=0.000000
2020-06-22 23:28:26,861 [INFO]  Epoch[0] Train-cross-entropy=9.317705
2020-06-22 23:28:26,865 [INFO]  Epoch[0] Train-credit_amount-accuracy=0.000000
2020-06-22 23:28:26,871 [INFO]  Epoch[0] Time cost=0.669
2020-06-22 23:28:26,881 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:28:26,949 [INFO]  Epoch[0] Validation-cross-entropy=7.911566
2020-06-22 23:28:26,954 [INFO]  Epoch[0] Validation-credit_amount-accuracy=0.000000
2020-06-22 23:28:27,255 [INFO]  Epoch[1] Batch [0-23]	Speed: 1317.80 samples/sec	cross-entropy=7.566903	credit_amount-accuracy=0.000000
2020-06-22 23:28:27,529 [INFO]  Epoch[1] Train-cross-entropy=6.818328
2020-06-22 23:28:27,535 [INFO]  Epoch[1] Train-credit_amount-accuracy=0.000000
2020-06-22 23:28:27,539 [INFO]  Epoch[1] Time cost=0.578
2020-06-22 23:28:27,547 [INFO]  Saved

Fitting model for column: installment_commitment


2020-06-22 23:28:33,629 [INFO]  
2020-06-22 23:28:33,951 [INFO]  Epoch[0] Batch [0-23]	Speed: 1256.00 samples/sec	cross-entropy=14.151769	installment_commitment-accuracy=0.000000
2020-06-22 23:28:34,235 [INFO]  Epoch[0] Train-cross-entropy=14.169511
2020-06-22 23:28:34,240 [INFO]  Epoch[0] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:28:34,244 [INFO]  Epoch[0] Time cost=0.604
2020-06-22 23:28:34,254 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:28:34,309 [INFO]  Epoch[0] Validation-cross-entropy=12.403057
2020-06-22 23:28:34,315 [INFO]  Epoch[0] Validation-installment_commitment-accuracy=0.000000
2020-06-22 23:28:34,643 [INFO]  Epoch[1] Batch [0-23]	Speed: 1190.27 samples/sec	cross-entropy=12.057370	installment_commitment-accuracy=0.000000
2020-06-22 23:28:34,948 [INFO]  Epoch[1] Train-cross-entropy=12.472823
2020-06-22 23:28:34,953 [INFO]  Epoch[1] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:28:34,960 [INFO]  Epoch[1] Time

2020-06-22 23:28:43,173 [INFO]  Epoch[13] Train-cross-entropy=10.131089
2020-06-22 23:28:43,179 [INFO]  Epoch[13] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:28:43,184 [INFO]  Epoch[13] Time cost=0.589
2020-06-22 23:28:43,193 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-22 23:28:43,258 [INFO]  Epoch[13] Validation-cross-entropy=10.830826
2020-06-22 23:28:43,263 [INFO]  Epoch[13] Validation-installment_commitment-accuracy=0.000000
2020-06-22 23:28:43,556 [INFO]  Epoch[14] Batch [0-23]	Speed: 1341.77 samples/sec	cross-entropy=9.809198	installment_commitment-accuracy=0.000000
2020-06-22 23:28:43,824 [INFO]  Epoch[14] Train-cross-entropy=10.077444
2020-06-22 23:28:43,829 [INFO]  Epoch[14] Train-installment_commitment-accuracy=0.000000
2020-06-22 23:28:43,834 [INFO]  Epoch[14] Time cost=0.565
2020-06-22 23:28:43,842 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:28:43,907 [INFO]  Epoch[14] Validation-cross-entropy=10.817

Fitting model for column: residence_since


2020-06-22 23:28:50,911 [INFO]  
2020-06-22 23:28:51,424 [INFO]  Epoch[0] Batch [0-23]	Speed: 760.79 samples/sec	cross-entropy=15.407898	residence_since-accuracy=0.000000
2020-06-22 23:28:51,685 [INFO]  Epoch[0] Train-cross-entropy=15.667140
2020-06-22 23:28:51,690 [INFO]  Epoch[0] Train-residence_since-accuracy=0.000000
2020-06-22 23:28:51,696 [INFO]  Epoch[0] Time cost=0.773
2020-06-22 23:28:51,706 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:28:51,771 [INFO]  Epoch[0] Validation-cross-entropy=13.292624
2020-06-22 23:28:51,777 [INFO]  Epoch[0] Validation-residence_since-accuracy=0.000000
2020-06-22 23:28:52,320 [INFO]  Epoch[1] Batch [0-23]	Speed: 700.16 samples/sec	cross-entropy=14.060792	residence_since-accuracy=0.000000
2020-06-22 23:28:52,710 [INFO]  Epoch[1] Train-cross-entropy=14.394274
2020-06-22 23:28:52,717 [INFO]  Epoch[1] Train-residence_since-accuracy=0.000000
2020-06-22 23:28:52,725 [INFO]  Epoch[1] Time cost=0.944
2020-06-22 23:28:52,741 [

Fitting model for column: age


2020-06-22 23:28:57,669 [INFO]  
2020-06-22 23:28:58,167 [INFO]  Epoch[0] Batch [0-23]	Speed: 934.14 samples/sec	cross-entropy=14.737040	age-accuracy=0.000000
2020-06-22 23:28:58,679 [INFO]  Epoch[0] Train-cross-entropy=14.511963
2020-06-22 23:28:58,685 [INFO]  Epoch[0] Train-age-accuracy=0.000000
2020-06-22 23:28:58,691 [INFO]  Epoch[0] Time cost=0.946
2020-06-22 23:28:58,705 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:28:58,763 [INFO]  Epoch[0] Validation-cross-entropy=19.020075
2020-06-22 23:28:58,768 [INFO]  Epoch[0] Validation-age-accuracy=0.000000
2020-06-22 23:28:59,048 [INFO]  Epoch[1] Batch [0-23]	Speed: 1408.38 samples/sec	cross-entropy=13.210862	age-accuracy=0.000000
2020-06-22 23:28:59,330 [INFO]  Epoch[1] Train-cross-entropy=13.192243
2020-06-22 23:28:59,337 [INFO]  Epoch[1] Train-age-accuracy=0.000000
2020-06-22 23:28:59,347 [INFO]  Epoch[1] Time cost=0.573
2020-06-22 23:28:59,363 [INFO]  Saved checkpoint to "imputer_model\model-0001.params

2020-06-22 23:29:10,389 [INFO]  Epoch[14] Time cost=0.643
2020-06-22 23:29:10,398 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-22 23:29:10,468 [INFO]  Epoch[14] Validation-cross-entropy=15.645458
2020-06-22 23:29:10,473 [INFO]  Epoch[14] Validation-age-accuracy=0.000000
2020-06-22 23:29:10,776 [INFO]  Epoch[15] Batch [0-23]	Speed: 1287.77 samples/sec	cross-entropy=10.220600	age-accuracy=0.000000
2020-06-22 23:29:11,038 [INFO]  Epoch[15] Train-cross-entropy=10.210693
2020-06-22 23:29:11,043 [INFO]  Epoch[15] Train-age-accuracy=0.000000
2020-06-22 23:29:11,048 [INFO]  Epoch[15] Time cost=0.570
2020-06-22 23:29:11,060 [INFO]  Saved checkpoint to "imputer_model\model-0015.params"
2020-06-22 23:29:11,118 [INFO]  Epoch[15] Validation-cross-entropy=15.573041
2020-06-22 23:29:11,123 [INFO]  Epoch[15] Validation-age-accuracy=0.000000
2020-06-22 23:29:11,426 [INFO]  Epoch[16] Batch [0-23]	Speed: 1299.24 samples/sec	cross-entropy=10.153736	age-accuracy=0.000000
2020-06-22

Fitting model for column: existing_credits


2020-06-22 23:29:21,057 [INFO]  
2020-06-22 23:29:21,406 [INFO]  Epoch[0] Batch [0-23]	Speed: 1171.38 samples/sec	cross-entropy=15.518137	existing_credits-accuracy=0.000000
2020-06-22 23:29:21,742 [INFO]  Epoch[0] Train-cross-entropy=15.543194
2020-06-22 23:29:21,748 [INFO]  Epoch[0] Train-existing_credits-accuracy=0.000000
2020-06-22 23:29:21,753 [INFO]  Epoch[0] Time cost=0.684
2020-06-22 23:29:21,763 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:29:21,828 [INFO]  Epoch[0] Validation-cross-entropy=12.542969
2020-06-22 23:29:21,833 [INFO]  Epoch[0] Validation-existing_credits-accuracy=0.000000
2020-06-22 23:29:22,222 [INFO]  Epoch[1] Batch [0-23]	Speed: 1002.68 samples/sec	cross-entropy=13.392144	existing_credits-accuracy=0.000000
2020-06-22 23:29:22,552 [INFO]  Epoch[1] Train-cross-entropy=13.634745
2020-06-22 23:29:22,557 [INFO]  Epoch[1] Train-existing_credits-accuracy=0.000000
2020-06-22 23:29:22,563 [INFO]  Epoch[1] Time cost=0.725
2020-06-22 23:29:2

Fitting model for column: num_dependents


2020-06-22 23:29:31,745 [INFO]  
2020-06-22 23:29:32,037 [INFO]  Epoch[0] Batch [0-23]	Speed: 1387.15 samples/sec	cross-entropy=14.758790	num_dependents-accuracy=0.000000
2020-06-22 23:29:32,377 [INFO]  Epoch[0] Train-cross-entropy=15.609412
2020-06-22 23:29:32,383 [INFO]  Epoch[0] Train-num_dependents-accuracy=0.000000
2020-06-22 23:29:32,389 [INFO]  Epoch[0] Time cost=0.633
2020-06-22 23:29:32,400 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-22 23:29:32,509 [INFO]  Epoch[0] Validation-cross-entropy=14.319409
2020-06-22 23:29:32,516 [INFO]  Epoch[0] Validation-num_dependents-accuracy=0.000000
2020-06-22 23:29:32,874 [INFO]  Epoch[1] Batch [0-23]	Speed: 1097.62 samples/sec	cross-entropy=13.523836	num_dependents-accuracy=0.000000
2020-06-22 23:29:33,143 [INFO]  Epoch[1] Train-cross-entropy=14.538845
2020-06-22 23:29:33,149 [INFO]  Epoch[1] Train-num_dependents-accuracy=0.000000
2020-06-22 23:29:33,154 [INFO]  Epoch[1] Time cost=0.632
2020-06-22 23:29:33,165 [INF

PPP score with <jenga.cleaning.cleaner.Cleaner object at 0x0000019F50FD6390>: 0.7816736792893876


In [19]:
# Display the collected PPP scores (one entry per cleaner evaluated above).
cleaner_scores_ppp

[0.7946470313230481,
 0.7850631136044881,
 0.7807386629266012,
 0.7946470313230481,
 0.7850631136044881,
 0.7812061711079944,
 0.7946470313230481,
 0.7850631136044881,
 0.7816736792893876]

In [None]:
from jenga.cleaning.outlier_detection import NoOutlierDetection
from jenga.cleaning.imputation import NoImputation


class Cleaner:
    """Run an outlier-detection step followed by an imputation step.

    Parameters
    ----------
    df_train, df_corrupted : DataFrame
        Training data and corrupted data, forwarded to the step constructors.
    categorical_columns, numerical_columns : list
        Column groups, forwarded to the step constructors.
    outlier_detection, imputation : class or callable
        Either a step *class* whose constructor takes
        ``(df_train, df_corrupted, categorical_columns, numerical_columns)``
        or an already-built callable taking ``(df_train, df_corrupted)``.
    """

    def __init__(self,
                 df_train,
                 df_corrupted,
                 categorical_columns,
                 numerical_columns,
                 outlier_detection=NoOutlierDetection,
                 imputation=NoImputation):
        build_args = (df_train, df_corrupted, categorical_columns, numerical_columns)

        # The defaults (and the usual call sites) pass classes, not instances.
        # Storing the class and later calling it with two arguments would hit
        # the four-argument constructor and raise TypeError, so build the
        # step objects here.
        self.outlier_detection = self._build_step(outlier_detection, build_args)
        self.imputation = self._build_step(imputation, build_args)

    @staticmethod
    def _build_step(step, build_args):
        """Instantiate `step` if it is a class; return any other callable unchanged."""
        if isinstance(step, type):
            return step(*build_args)
        return step

    def apply_cleaner(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        """Apply outlier detection, then imputation, and return the cleaned frame.

        `categorical_columns` and `numerical_columns` are accepted for
        interface compatibility but are not used here.
        """
        df_cleaned = self.outlier_detection(df_train, df_corrupted)

        # do something for fixing/removing the outliers
        if 'outlier' in df_cleaned.columns:
            ### TODO: flagged rows are not repaired yet; the flag column is only discarded
            df_cleaned = df_cleaned.drop('outlier', axis=1)

        # impute
        df_cleaned = self.imputation(df_train, df_cleaned)

        return df_cleaned

In [None]:
# Build a Cleaner with `NoOutlierDetection` as the outlier step; `imputation` keeps its default.
cleaner = Cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns, outlier_detection=NoOutlierDetection)

In [None]:
# Run the cleaner on the corrupted test data and display the resulting frame.
df_cleaned = cleaner.apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
df_cleaned

In [None]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

from jenga.cleaning.imputation import MeanModeImputation, DatawigImputation


class Clean:
    """Placeholder for a cleaning helper; no behaviour is implemented yet."""

    def __init__(self):
        # TODO: constructor parameters and cleaning logic still need to be defined.
        pass

### Outlier Detection

In [None]:
# Outlier detection using KNN from PyOD: build the detector for the train / corrupted-test pair.
# NOTE(review): `numerical_columms` (double "m") differs from the `numerical_columns`
# name used in the Cleaner cells — confirm which name is actually defined earlier.
outlier = PyODKNN(train_data, test_data_corrupted, categorical_columns, numerical_columms)

In [None]:
# Run the KNN detector on the corrupted test data and preview the first 10 rows of the result.
test_data_corrupted_outliers = outlier.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers.head(10)

In [None]:
# Outlier detection using Isolation Forest from PyOD: build the detector for the same data pair.
# NOTE(review): `numerical_columms` (double "m") differs from the `numerical_columns`
# name used in the Cleaner cells — confirm which name is actually defined earlier.
outlier_if = PyODIsolationForest(train_data, test_data_corrupted, categorical_columns, numerical_columms)

In [None]:
# Run the Isolation Forest detector on the corrupted test data and preview the first 10 rows.
test_data_corrupted_outliers_if = outlier_if.fit_transform(train_data, test_data_corrupted)
test_data_corrupted_outliers_if.head(10)

#### Preparing the outliers for imputation

In [None]:
# Blank out every row flagged as an outlier so the imputation step can refill it,
# then remove the helper flag column.
if "outlier" in test_data_corrupted_outliers.columns:
    # The flag column holds 0 (inlier) / 1 (outlier) labels. Passing that integer
    # Series straight to `.loc` would be read as row *labels* 0 and 1, not as a
    # mask, so build an explicit boolean mask first (works for bool flags too).
    outlier_mask = test_data_corrupted_outliers["outlier"] == 1
    print(f'Setting {outlier_mask.sum()} to Nan')
    test_data_corrupted_outliers.loc[outlier_mask, :] = np.nan
    test_data_corrupted_outliers = test_data_corrupted_outliers.drop('outlier', axis=1)

In [None]:
## Plan (inputs: train_data, test_data_corrupted):
## 1. inspect the values of each column in the training data, then check for outliers in the same column of the corrupted data
## 2. store the .loc positions of the detected outliers
## 3. set the values at those .loc positions in that column to NaN
## 4. impute

In [None]:
# Display the numerical column names (the variable is spelled `numerical_columms` in these cells).
numerical_columms

In [None]:
# Display the corrupted test data for inspection.
test_data_corrupted

In [None]:
from abc import abstractmethod

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from pyod.models.knn import KNN
from pyod.models.iforest import IForest


class OutlierDetection:
    """Base class for outlier detectors; builds the shared feature preprocessing.

    Parameters
    ----------
    df_train, df_corrupted : DataFrame
        Stored on the instance as given.
    categorical_columns, numerical_columns : list
        Column names routed to the categorical / numerical preprocessing
        pipelines of `self.feature_transform`.
    """

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):

        self.df_train = df_train
        self.df_corrupted = df_corrupted

        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns

        # preprocessing pipeline for numerical columns:
        # missing values become 0, then features are standardised
        transformer_numeric = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('standard_scale', StandardScaler())
        ])

        # preprocessing pipeline for categorical columns:
        # missing values become the '__NA__' category, then one-hot encoding;
        # categories unseen during fit are ignored at transform time
        transformer_categorical = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
            ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
        ])

        # preprocessor combining both pipelines; subclasses call `.toarray()`
        # on its output, which relies on the stacked result being sparse
        self.feature_transform = ColumnTransformer(transformers=[
            ('categorical_features', transformer_categorical, self.categorical_columns),
            ('numerical_features', transformer_numeric, self.numerical_columns)
        ], sparse_threshold=1.0)

    # Defined at class level so it is a real method of the class. The class does
    # not derive from ABC, so the decorator documents intent without enforcing it.
    @abstractmethod
    def fit_transform(self, df_train, df_corrupted):
        """Detect outliers in `df_corrupted`; to be overridden by subclasses."""
        pass



class NoOutlierDetection(OutlierDetection):
    """Pass-through detector: hands back the corrupted data without flagging anything."""

    def fit_transform(self, df_train, df_corrupted):
        """Return a copy of `df_corrupted`; `df_train` is not used."""
        return df_corrupted.copy()

    def __call__(self, df_train, df_corrupted):
        """Make the instance callable by delegating to `fit_transform`."""
        passthrough = self.fit_transform(df_train, df_corrupted)
        return passthrough


        
class PyODKNN(OutlierDetection):
    """Outlier detector backed by pyod's ``KNN`` model with its default settings."""

    def fit_transform(self, df_train, df_corrupted):
        """Fit on ``df_train`` and flag rows of ``df_corrupted``.

        Args:
            df_train: data frame used to fit both the preprocessing and the KNN model.
            df_corrupted: data frame whose rows are scored.

        Returns:
            A copy of ``df_corrupted`` with an added ``"outlier"`` column
            holding the model's predictions (0: inlier, 1: outlier).
        """
        flagged = df_corrupted.copy()

        # Fit the shared preprocessing on the training frame only.
        fitted_transform = self.feature_transform.fit(df_train)
        train_features = fitted_transform.transform(df_train).toarray()

        detector = KNN()
        detector.fit(train_features)

        # Encode the copy (before the label column is attached) and predict.
        candidate_features = fitted_transform.transform(flagged).toarray()
        flagged["outlier"] = detector.predict(candidate_features)

        return flagged

    def __call__(self, df_train, df_corrupted):
        """Calling the instance delegates to ``fit_transform``."""
        return self.fit_transform(df_train, df_corrupted)

    
    
class PyODIsolationForest(OutlierDetection):
    """Outlier detector backed by pyod's ``IForest`` with ``contamination=0.25``."""

    def fit_transform(self, df_train, df_corrupted):
        """Fit an isolation forest on ``df_train`` and flag rows of ``df_corrupted``.

        Args:
            df_train: data frame used to fit the preprocessing and the forest.
            df_corrupted: data frame whose rows are scored.

        Returns:
            A copy of ``df_corrupted`` with an added ``"outlier"`` column
            holding the model's predictions (0: inlier, 1: outlier).
        """
        result = df_corrupted.copy()

        preprocessor = self.feature_transform.fit(df_train)
        encoded_train = preprocessor.transform(df_train).toarray()

        forest = IForest(contamination=0.25)
        forest.fit(encoded_train)

        encoded_result = preprocessor.transform(result).toarray()
        predictions = forest.predict(encoded_result)
        result["outlier"] = predictions

        return result

    def __call__(self, df_train, df_corrupted):
        """Calling the instance delegates to ``fit_transform``."""
        return self.fit_transform(df_train, df_corrupted)


In [None]:
from abc import abstractmethod
import numpy as np
import pandas as pd

import datawig



class Imputation:
    """Base class for imputers; keeps the data frames and column lists it is given."""

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        """Remember the two data frames and the categorical/numerical column names."""
        self.df_train, self.df_corrupted = df_train, df_corrupted
        self.categorical_columns, self.numerical_columns = categorical_columns, numerical_columns

    @abstractmethod
    def fit_transform(self, df_train, df_corrupted):
        """Placeholder to be overridden by subclasses; returns None here."""
        return None

    
    
class NoImputation(Imputation):
    """Pass-through imputer: returns the corrupted data without filling anything."""

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        """Forward all arguments unchanged to the ``Imputation`` constructor."""
        super().__init__(df_train, df_corrupted, categorical_columns, numerical_columns)

    def fit_transform(self, df_train, df_corrupted):
        """Return ``df_corrupted.copy()``; ``df_train`` is not used."""
        return df_corrupted.copy()

    def __call__(self, df_train, df_corrupted):
        """Calling the instance delegates to ``fit_transform``."""
        untouched = self.fit_transform(df_train, df_corrupted)
        return untouched
    
    
    
class MeanModeImputation(Imputation):
    """Impute missing values with per-column statistics taken from the training data.

    Numerical columns are filled with the training mean; categorical columns
    are filled with the most frequent training value. The statistics are kept
    in ``self.means`` and ``self.modes`` (column name -> value).
    """

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        """Initialise empty statistic tables and store the inputs on the instance."""
        self.means = {}
        self.modes = {}

        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)

    def fit_transform(self, df_train, df_corrupted):
        """Compute fill values from ``df_train`` and apply them to ``df_corrupted``.

        Args:
            df_train: data frame the means and modes are computed from.
            df_corrupted: data frame whose missing values are filled.

        Returns:
            A copy of ``df_corrupted`` with missing values in the configured
            numerical and categorical columns filled; ``df_corrupted`` itself
            is not modified.
        """
        df_imputed = df_corrupted.copy()

        for col in df_train.columns:
            if col in self.numerical_columns:
                # mean imputer
                self.means[col] = np.mean(df_train[col])
            elif col in self.categorical_columns:
                # mode imputer: value_counts() lists the most frequent value first
                self.modes[col] = df_train[col].value_counts().index[0]

        for col in df_corrupted.columns:
            # Assign the filled column back instead of calling
            # `df_imputed[col].fillna(..., inplace=True)`: that chained inplace
            # call operates on a temporary Series and does not update
            # `df_imputed` when pandas Copy-on-Write is active.
            if col in self.numerical_columns:
                df_imputed[col] = df_imputed[col].fillna(self.means[col])
            elif col in self.categorical_columns:
                df_imputed[col] = df_imputed[col].fillna(self.modes[col])

        return df_imputed

    def __call__(self, df_train, df_corrupted):
        """Calling the instance delegates to ``fit_transform``."""
        return self.fit_transform(df_train, df_corrupted)

    

class DatawigImputation(Imputation):
    """Impute missing values column by column with ``datawig.SimpleImputer`` models.

    For every configured categorical and numerical column, a model is fitted
    on the training data using all other training columns as inputs, and its
    predictions fill the missing entries of that column.
    """

    def __init__(self, df_train, df_corrupted, categorical_columns, numerical_columns):
        """Store the inputs on the instance via the ``Imputation`` constructor."""
        Imputation.__init__(self, df_train, df_corrupted, categorical_columns, numerical_columns)

    def fit_transform(self, df_train, df_corrupted):
        """Fit one imputer per column on ``df_train`` and fill ``df_corrupted``.

        Neither argument is modified; all work happens on copies.

        Args:
            df_train: data frame the per-column models are fitted on.
            df_corrupted: data frame whose missing values are filled.

        Returns:
            A data frame with the columns of ``df_corrupted`` in which missing
            entries of the configured columns are replaced by model predictions.
        """
        df_imputed = df_corrupted.copy()

        # Cast categorical training columns to str on a private copy so the
        # caller's training frame keeps its dtypes. The original code also cast
        # the caller's `df_corrupted` in place; that never influenced the
        # result (`df_imputed` was copied beforehand) but turned the caller's
        # missing values into the string 'nan', so it is dropped here.
        # NOTE(review): `df_imputed` therefore keeps its categorical dtypes,
        # exactly as before — confirm datawig accepts them in predict().
        df_fit = df_train.copy()
        for col in df_fit.columns:
            # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
            if isinstance(df_fit[col].dtype, pd.CategoricalDtype):
                df_fit[col] = df_fit[col].astype(str)

        for col in self.categorical_columns + self.numerical_columns:
            output_column = col
            # Keep the training column order (a set difference gave an arbitrary order).
            input_columns = [c for c in df_fit.columns if c != output_column]

            print(f"Fitting model for column: {col}")
            # The same output path is reused for every column; each model is
            # consumed immediately after fitting.
            model = datawig.SimpleImputer(input_columns, output_column, 'imputer_model')
            model.fit(df_fit)

            df_imputed = model.predict(df_imputed)
            # Assign back rather than chained `fillna(..., inplace=True)`, which
            # does not update the frame under pandas Copy-on-Write.
            df_imputed[col] = df_imputed[col].fillna(df_imputed[col + '_imputed'])
            # Drop the helper columns added by predict().
            df_imputed = df_imputed[df_corrupted.columns]

        return df_imputed

    def __call__(self, df_train, df_corrupted):
        """Calling the instance delegates to ``fit_transform``."""
        return self.fit_transform(df_train, df_corrupted)

## Evaluation

In [None]:
# score without cleaning: evaluate the model's predict_proba output for `test_data`
model_obj.score_on_test_data(model.predict_proba(test_data))

In [None]:
# score with corruptions: evaluate the model's predict_proba output for `test_data_corrupted`
model_obj.score_on_test_data(model.predict_proba(test_data_corrupted))

In [None]:
# score with mean/mode imputation: evaluate the model's predict_proba output for `test_data_mm_imputed`
model_obj.score_on_test_data(model.predict_proba(test_data_mm_imputed))

In [None]:
# score with datawig imputation: evaluate the model's predict_proba output for `test_data_dw_imputed`
model_obj.score_on_test_data(model.predict_proba(test_data_dw_imputed))