## Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier

from jenga.basis import Dataset
from jenga.models.simple_model import SimpleModel
from jenga.corruptions.perturbations import Perturbation
from jenga.cleaning.imputation import MeanModeImputation, DatawigImputation
from jenga.cleaning.outlier_detection import PyODKNN, PyODIsolationForest

In [2]:
# Seed shared by the dataset loader and the model wrapper in the cells below.
seed = 10

In [3]:
# Load the 'credit-g' dataset through jenga's Dataset wrapper, passing the seed.
dataset = Dataset(seed, "credit-g")

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    

**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)



**German Credit data**  

This dataset classifies people described by a set of attributes as good or bad credit risks.



This dataset comes with a cost matrix: 

``` 

      Good  Bad (predicted)  

Good   0    1   (actual)  

Bad    5    0  

```



It is worse to class a customer as go

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


In [4]:
# Full table exposed by the Dataset wrapper; the displayed frame includes the 'class' target column.
all_data = dataset.all_data
all_data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,...,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes,good
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,...,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes,good
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,...,car,38.0,none,own,1.0,skilled,1.0,none,yes,good
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,...,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes,bad


In [5]:
# Names of the feature columns (the 'class' target is not part of this list).
attribute_names = dataset.attribute_names
attribute_names

['checking_status',
 'duration',
 'credit_history',
 'purpose',
 'credit_amount',
 'savings_status',
 'employment',
 'installment_commitment',
 'personal_status',
 'other_parties',
 'residence_since',
 'property_magnitude',
 'age',
 'other_payment_plans',
 'housing',
 'existing_credits',
 'job',
 'num_dependents',
 'own_telephone',
 'foreign_worker']

In [6]:
# Per-attribute table with a boolean 'categorical_indicator' column.
attribute_types = dataset.attribute_types
attribute_types

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


### Visualize the dataset

In [None]:
## plot the original dataset
def hide_current_axis(*args, **kwds):
    """Hide the currently active matplotlib axes.

    All positional and keyword arguments are accepted and ignored, so the
    function can be handed to grid-mapping helpers that pass plotting data.
    """
    current_axes = plt.gca()
    current_axes.set_visible(False)
        
def plot_data(data, hue="class"):
    """Show a seaborn pair plot of ``data`` with the upper triangle hidden.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to plot; it must contain the column named by ``hue``.
    hue : str, default "class"
        Column used to colour the points. The default keeps the original
        behaviour of colouring by the 'class' label column.
    """
    sns.set_style("white")  # grid/no grid style: darkgrid, whitegrid, dark, white, ticks

    grid = sns.pairplot(data, hue=hue)
    # The upper triangle mirrors the lower one, so hide those axes.
    grid.map_upper(hide_current_axis)
    plt.show()

In [None]:
# Pair plot of the complete dataset, coloured by the 'class' column.
plot_data(all_data)

### Get training and test sets

In [7]:
# Split into train/test features and labels as provided by the Dataset wrapper.
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

# Preview the first five rows and labels of the training split ...
display(train_data.head())
print(train_labels[:5])

# ... and of the test split.
display(test_data.head())
print(test_labels[:5])

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
188,<0,12.0,existing paid,radio/tv,674.0,100<=X<500,4<=X<7,4.0,male mar/wid,none,1.0,life insurance,20.0,none,own,1.0,skilled,1.0,none,yes
194,0<=X<200,45.0,existing paid,radio/tv,3031.0,100<=X<500,1<=X<4,4.0,male single,guarantor,4.0,life insurance,21.0,none,rent,1.0,skilled,1.0,none,yes
225,no checking,36.0,no credits/all paid,repairs,2613.0,<100,1<=X<4,4.0,male single,none,2.0,car,27.0,none,own,2.0,skilled,1.0,none,yes
580,0<=X<200,18.0,critical/other existing credit,new car,1056.0,<100,>=7,3.0,male single,guarantor,3.0,real estate,30.0,bank,own,2.0,skilled,1.0,none,yes
428,no checking,9.0,existing paid,furniture/equipment,1313.0,<100,>=7,1.0,male single,none,4.0,car,20.0,none,own,1.0,skilled,1.0,none,yes


['bad' 'bad' 'good' 'bad' 'good']


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.0,delayed previously,used car,2993.0,<100,1<=X<4,3.0,male single,none,2.0,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes
956,>=200,30.0,critical/other existing credit,radio/tv,3656.0,no known savings,>=7,4.0,male single,none,4.0,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes
544,no checking,12.0,critical/other existing credit,new car,1255.0,<100,>=7,4.0,male single,none,4.0,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes
173,0<=X<200,8.0,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,2.0,real estate,33.0,none,own,1.0,skilled,1.0,none,no
759,<0,12.0,critical/other existing credit,new car,691.0,<100,>=7,4.0,male single,none,3.0,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes


['good' 'good' 'good' 'good' 'bad']


## Model

In [8]:
# Linear classifier trained with SGD. It is seeded explicitly so that the
# hyper-parameter search gives the same result on every Restart & Run All.
learner = SGDClassifier(max_iter=1000, random_state=seed)

# Hyper-parameter grid handed to SimpleModel.
# NOTE(review): the 'learner__' prefix presumably addresses a pipeline step
# named 'learner' inside SimpleModel — confirm against jenga.
# NOTE: scikit-learn >= 1.1 calls this loss 'log_loss' ('log' was removed in
# 1.3); switch the value when upgrading scikit-learn.
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [9]:
# Bundle the seed, data splits, attribute types, learner and parameter grid in jenga's SimpleModel.
model_obj = SimpleModel(seed, train_data, train_labels, test_data, test_labels, attribute_types, learner, param_grid)

In [10]:
# Categorical feature columns as reported by the model wrapper.
categorical_columns = model_obj.categorical_columns
categorical_columns

['checking_status',
 'credit_history',
 'purpose',
 'savings_status',
 'employment',
 'personal_status',
 'other_parties',
 'property_magnitude',
 'other_payment_plans',
 'housing',
 'job',
 'own_telephone',
 'foreign_worker']

In [11]:
# Numerical feature columns as reported by the model wrapper.
# NOTE: the spelling 'numerical_columms' (sic) mirrors the attribute name read from
# the model object; later cells reuse this variable, so it is kept as is.
numerical_columms = model_obj.numerical_columms
numerical_columms

['duration',
 'credit_amount',
 'installment_commitment',
 'residence_since',
 'age',
 'existing_credits',
 'num_dependents']

In [12]:
# Fit the baseline model on the clean training data; the log below reports a
# 5-fold search over the 12 parameter-grid candidates.
model = model_obj.fit_baseline_model(train_data, train_labels)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   12.9s finished


## Corruptions

In [13]:
# Corruption perturbations to apply: build the Perturbation helper from the
# categorical and numerical column lists.
corr_perturbations = Perturbation(categorical_columns, numerical_columms)

In [14]:
# Apply 5 perturbations to the test set. The call returns the corrupted frame, the
# perturbation objects and the names of the affected columns; each applied
# perturbation is also printed.
# NOTE(review): no seed is passed here, so the chosen perturbations may differ
# between runs — confirm against jenga.
test_data_corrupted, perturbations, cols_perturbed = corr_perturbations.apply_perturbation(test_data, 5)
test_data_corrupted.head(10)

GaussianNoise: {'column': 'residence_since', 'fraction': 0.25}
['residence_since']
MissingValues: {'column': 'credit_history', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
['credit_history']
GaussianNoise: {'column': 'duration', 'fraction': 0.25}
['duration']
MissingValues: {'column': 'credit_amount', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MAR'}
['credit_amount']
SwappedValues: {'column_a': 'housing', 'column_b': 'personal_status', 'fraction': 0.25}
['housing', 'personal_status']


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.0,,used car,,<100,1<=X<4,3.0,male single,none,0.193852,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes
956,>=200,30.0,critical/other existing credit,radio/tv,,no known savings,>=7,4.0,male single,none,4.0,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes
544,no checking,-3.982692,,new car,1255.0,<100,>=7,4.0,male single,none,4.0,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes
173,0<=X<200,-30.933795,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,2.0,real estate,33.0,none,own,1.0,skilled,1.0,none,no
759,<0,12.0,,new car,,<100,>=7,4.0,male single,none,7.827427,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes
955,<0,24.0,critical/other existing credit,radio/tv,,>=1000,>=7,4.0,female div/dep/mar,none,2.852679,life insurance,57.0,none,rent,2.0,high qualif/self emp/mgmt,1.0,yes,yes
121,no checking,24.0,critical/other existing credit,used car,,<100,>=7,4.0,rent,none,2.0,car,41.0,none,female div/dep/mar,2.0,high qualif/self emp/mgmt,1.0,yes,yes
230,>=200,36.0,existing paid,radio/tv,4210.0,<100,1<=X<4,4.0,male single,none,5.605912,car,26.0,none,own,1.0,skilled,1.0,none,yes
11,<0,48.0,existing paid,business,4308.0,<100,<1,3.0,female div/dep/mar,none,6.635504,life insurance,24.0,none,rent,1.0,skilled,1.0,none,yes
120,<0,21.0,,radio/tv,,<100,1<=X<4,3.0,female div/dep/mar,none,2.0,real estate,25.0,none,own,2.0,skilled,1.0,yes,yes


In [15]:
# Perturbation objects returned by apply_perturbation above.
perturbations

[<jenga.corruptions.numerical.GaussianNoise at 0x21dedff6358>,
 <jenga.corruptions.generic.MissingValues at 0x21ddf03c828>,
 <jenga.corruptions.numerical.GaussianNoise at 0x21dedf3bf28>,
 <jenga.corruptions.generic.MissingValues at 0x21decf07128>,
 <jenga.corruptions.generic.SwappedValues at 0x21dedfbd128>]

In [44]:
# Columns touched by at least one of the applied perturbations.
cols_perturbed

['residence_since',
 'credit_history',
 'duration',
 'credit_amount',
 'housing',
 'personal_status']

### Visualize the original and corrupted test set

In [None]:
## original test data
# Attach the labels positionally. test_data keeps its original row labels
# (841, 956, ...), so concatenating a fresh 0..n-1 Series along axis=1 would
# align on the index, pair labels with the wrong rows and add NaN rows.
plot_data(test_data.assign(**{"class": np.asarray(test_labels)}))

In [None]:
## corrupted test data
# Attach the labels positionally. The corrupted frame keeps the test set's
# original row labels (841, 956, ...), so concatenating a fresh 0..n-1 Series
# along axis=1 would align on the index and misassign the labels.
plot_data(test_data_corrupted.assign(**{"class": np.asarray(test_labels)}))

## Cleaning

### Imputation

In [17]:
# Mean/mode imputation: fit on the clean training data, then fill the corrupted test set.
# As the output shows, only missing values are filled; noisy or swapped values
# (e.g. negative durations, 'rent' in personal_status) pass through unchanged.
mean_mode_imputer = MeanModeImputation(train_data, test_data_corrupted, categorical_columns, numerical_columms)

mean_mode_imputer.fit(train_data)
test_data_mm_imputed = mean_mode_imputer.transform(test_data_corrupted)
test_data_mm_imputed

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.000000,existing paid,used car,3297.265,<100,1<=X<4,3.0,male single,none,0.193852,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes
956,>=200,30.000000,critical/other existing credit,radio/tv,3297.265,no known savings,>=7,4.0,male single,none,4.000000,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes
544,no checking,-3.982692,existing paid,new car,1255.000,<100,>=7,4.0,male single,none,4.000000,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes
173,0<=X<200,-30.933795,existing paid,radio/tv,1414.000,<100,1<=X<4,4.0,male single,guarantor,2.000000,real estate,33.0,none,own,1.0,skilled,1.0,none,no
759,<0,12.000000,existing paid,new car,3297.265,<100,>=7,4.0,male single,none,7.827427,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,<0,30.000000,existing paid,repairs,3297.265,<100,<1,1.0,male div/sep,none,1.000000,no known property,34.0,none,own,1.0,unskilled resident,1.0,yes,yes
192,0<=X<200,27.000000,existing paid,business,3297.265,<100,1<=X<4,4.0,male single,none,2.000000,car,36.0,none,own,1.0,skilled,2.0,yes,yes
398,0<=X<200,12.000000,existing paid,new car,3297.265,<100,>=7,1.0,rent,none,1.000000,real estate,46.0,none,male div/sep,2.0,skilled,1.0,none,yes
450,no checking,30.597022,critical/other existing credit,used car,3297.265,no known savings,1<=X<4,4.0,male single,none,5.279632,car,30.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes


In [19]:
# Datawig-based imputation: a single fit_transform call takes the training data and
# the corrupted test set and returns the imputed test frame.
# The log below shows one model being fitted per column, so this cell takes
# several minutes to run.
datawig_imputer = DatawigImputation(train_data, test_data_corrupted, categorical_columns, numerical_columms)

test_data_dw_imputed = datawig_imputer.fit_transform(train_data, test_data_corrupted)
test_data_dw_imputed

2020-06-20 23:30:02,212 [INFO]  CategoricalEncoder for column checking_status                                found only 44 occurrences of value >=200


Fitting model for column: checking_status


2020-06-20 23:30:04,310 [INFO]  
2020-06-20 23:30:05,088 [INFO]  Epoch[0] Batch [0-23]	Speed: 527.46 samples/sec	cross-entropy=1.270066	checking_status-accuracy=0.369792
2020-06-20 23:30:05,703 [INFO]  Epoch[0] Train-cross-entropy=1.232720
2020-06-20 23:30:05,705 [INFO]  Epoch[0] Train-checking_status-accuracy=0.373611
2020-06-20 23:30:05,708 [INFO]  Epoch[0] Time cost=1.388
2020-06-20 23:30:05,725 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:30:05,868 [INFO]  Epoch[0] Validation-cross-entropy=1.033619
2020-06-20 23:30:05,870 [INFO]  Epoch[0] Validation-checking_status-accuracy=0.550000
2020-06-20 23:30:06,579 [INFO]  Epoch[1] Batch [0-23]	Speed: 552.20 samples/sec	cross-entropy=1.128721	checking_status-accuracy=0.434896
2020-06-20 23:30:07,225 [INFO]  Epoch[1] Train-cross-entropy=1.131450
2020-06-20 23:30:07,228 [INFO]  Epoch[1] Train-checking_status-accuracy=0.437500
2020-06-20 23:30:07,231 [INFO]  Epoch[1] Time cost=1.359
2020-06-20 23:30:07,254 [INFO]

Fitting model for column: credit_history


2020-06-20 23:30:27,254 [INFO]  
2020-06-20 23:30:28,047 [INFO]  Epoch[0] Batch [0-23]	Speed: 513.86 samples/sec	cross-entropy=1.085513	credit_history-accuracy=0.580729
2020-06-20 23:30:28,806 [INFO]  Epoch[0] Train-cross-entropy=1.050243
2020-06-20 23:30:28,810 [INFO]  Epoch[0] Train-credit_history-accuracy=0.627778
2020-06-20 23:30:28,814 [INFO]  Epoch[0] Time cost=1.547
2020-06-20 23:30:28,844 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:30:28,948 [INFO]  Epoch[0] Validation-cross-entropy=0.864091
2020-06-20 23:30:28,951 [INFO]  Epoch[0] Validation-credit_history-accuracy=0.700000
2020-06-20 23:30:29,749 [INFO]  Epoch[1] Batch [0-23]	Speed: 484.53 samples/sec	cross-entropy=0.867723	credit_history-accuracy=0.705729
2020-06-20 23:30:30,431 [INFO]  Epoch[1] Train-cross-entropy=0.897992
2020-06-20 23:30:30,434 [INFO]  Epoch[1] Train-credit_history-accuracy=0.701389
2020-06-20 23:30:30,436 [INFO]  Epoch[1] Time cost=1.483
2020-06-20 23:30:30,452 [INFO]  Sav

Fitting model for column: purpose


2020-06-20 23:30:41,628 [INFO]  
2020-06-20 23:30:42,597 [INFO]  Epoch[0] Batch [0-23]	Speed: 416.22 samples/sec	cross-entropy=1.849812	purpose-accuracy=0.223958
2020-06-20 23:30:43,473 [INFO]  Epoch[0] Train-cross-entropy=1.780414
2020-06-20 23:30:43,479 [INFO]  Epoch[0] Train-purpose-accuracy=0.273611
2020-06-20 23:30:43,484 [INFO]  Epoch[0] Time cost=1.845
2020-06-20 23:30:43,508 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:30:43,661 [INFO]  Epoch[0] Validation-cross-entropy=1.745351
2020-06-20 23:30:43,665 [INFO]  Epoch[0] Validation-purpose-accuracy=0.325000
2020-06-20 23:30:44,637 [INFO]  Epoch[1] Batch [0-23]	Speed: 402.82 samples/sec	cross-entropy=1.605791	purpose-accuracy=0.304688
2020-06-20 23:30:46,097 [INFO]  Epoch[1] Train-cross-entropy=1.606979
2020-06-20 23:30:46,103 [INFO]  Epoch[1] Train-purpose-accuracy=0.338889
2020-06-20 23:30:46,108 [INFO]  Epoch[1] Time cost=2.435
2020-06-20 23:30:46,184 [INFO]  Saved checkpoint to "imputer_model\mod

2020-06-20 23:31:20,395 [INFO]  CategoricalEncoder for column savings_status                                found only 74 occurrences of value 100<=X<500
2020-06-20 23:31:20,399 [INFO]  CategoricalEncoder for column savings_status                                found only 42 occurrences of value 500<=X<1000
2020-06-20 23:31:20,403 [INFO]  CategoricalEncoder for column savings_status                                found only 32 occurrences of value >=1000


Fitting model for column: savings_status


2020-06-20 23:31:22,427 [INFO]  
2020-06-20 23:31:23,244 [INFO]  Epoch[0] Batch [0-23]	Speed: 513.90 samples/sec	cross-entropy=1.111047	savings_status-accuracy=0.630208
2020-06-20 23:31:23,903 [INFO]  Epoch[0] Train-cross-entropy=1.164750
2020-06-20 23:31:23,907 [INFO]  Epoch[0] Train-savings_status-accuracy=0.595833
2020-06-20 23:31:23,911 [INFO]  Epoch[0] Time cost=1.473
2020-06-20 23:31:23,928 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:31:24,068 [INFO]  Epoch[0] Validation-cross-entropy=1.253702
2020-06-20 23:31:24,072 [INFO]  Epoch[0] Validation-savings_status-accuracy=0.487500
2020-06-20 23:31:24,906 [INFO]  Epoch[1] Batch [0-23]	Speed: 464.93 samples/sec	cross-entropy=0.972662	savings_status-accuracy=0.653646
2020-06-20 23:31:25,720 [INFO]  Epoch[1] Train-cross-entropy=1.065886
2020-06-20 23:31:25,722 [INFO]  Epoch[1] Train-savings_status-accuracy=0.604167
2020-06-20 23:31:25,726 [INFO]  Epoch[1] Time cost=1.650
2020-06-20 23:31:25,741 [INFO]  Sav

Fitting model for column: employment


2020-06-20 23:31:38,719 [INFO]  
2020-06-20 23:31:39,617 [INFO]  Epoch[0] Batch [0-23]	Speed: 446.68 samples/sec	cross-entropy=1.466683	employment-accuracy=0.338542
2020-06-20 23:31:40,340 [INFO]  Epoch[0] Train-cross-entropy=1.396741
2020-06-20 23:31:40,345 [INFO]  Epoch[0] Train-employment-accuracy=0.381944
2020-06-20 23:31:40,349 [INFO]  Epoch[0] Time cost=1.616
2020-06-20 23:31:40,363 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:31:40,488 [INFO]  Epoch[0] Validation-cross-entropy=1.195135
2020-06-20 23:31:40,492 [INFO]  Epoch[0] Validation-employment-accuracy=0.500000
2020-06-20 23:31:41,321 [INFO]  Epoch[1] Batch [0-23]	Speed: 468.83 samples/sec	cross-entropy=1.273681	employment-accuracy=0.434896
2020-06-20 23:31:41,982 [INFO]  Epoch[1] Train-cross-entropy=1.270686
2020-06-20 23:31:41,986 [INFO]  Epoch[1] Train-employment-accuracy=0.447222
2020-06-20 23:31:41,990 [INFO]  Epoch[1] Time cost=1.494
2020-06-20 23:31:42,006 [INFO]  Saved checkpoint to "im

Fitting model for column: personal_status


2020-06-20 23:32:04,357 [INFO]  
2020-06-20 23:32:05,186 [INFO]  Epoch[0] Batch [0-23]	Speed: 481.98 samples/sec	cross-entropy=1.092576	personal_status-accuracy=0.557292
2020-06-20 23:32:06,565 [INFO]  Epoch[0] Train-cross-entropy=1.007212
2020-06-20 23:32:06,571 [INFO]  Epoch[0] Train-personal_status-accuracy=0.587500
2020-06-20 23:32:06,579 [INFO]  Epoch[0] Time cost=2.211
2020-06-20 23:32:06,610 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:32:06,742 [INFO]  Epoch[0] Validation-cross-entropy=0.895504
2020-06-20 23:32:06,748 [INFO]  Epoch[0] Validation-personal_status-accuracy=0.600000
2020-06-20 23:32:07,608 [INFO]  Epoch[1] Batch [0-23]	Speed: 470.91 samples/sec	cross-entropy=0.901227	personal_status-accuracy=0.627604
2020-06-20 23:32:08,323 [INFO]  Epoch[1] Train-cross-entropy=0.887230
2020-06-20 23:32:08,329 [INFO]  Epoch[1] Train-personal_status-accuracy=0.631944
2020-06-20 23:32:08,333 [INFO]  Epoch[1] Time cost=1.578
2020-06-20 23:32:08,348 [INFO]

Fitting model for column: other_parties


2020-06-20 23:32:21,543 [INFO]  
2020-06-20 23:32:22,293 [INFO]  Epoch[0] Batch [0-23]	Speed: 535.13 samples/sec	cross-entropy=0.559034	other_parties-accuracy=0.869792
2020-06-20 23:32:23,050 [INFO]  Epoch[0] Train-cross-entropy=0.474270
2020-06-20 23:32:23,053 [INFO]  Epoch[0] Train-other_parties-accuracy=0.886111
2020-06-20 23:32:23,058 [INFO]  Epoch[0] Time cost=1.503
2020-06-20 23:32:23,098 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:32:23,223 [INFO]  Epoch[0] Validation-cross-entropy=0.340883
2020-06-20 23:32:23,228 [INFO]  Epoch[0] Validation-other_parties-accuracy=0.912500
2020-06-20 23:32:23,947 [INFO]  Epoch[1] Batch [0-23]	Speed: 541.03 samples/sec	cross-entropy=0.356413	other_parties-accuracy=0.898438
2020-06-20 23:32:24,570 [INFO]  Epoch[1] Train-cross-entropy=0.353405
2020-06-20 23:32:24,574 [INFO]  Epoch[1] Train-other_parties-accuracy=0.901389
2020-06-20 23:32:24,579 [INFO]  Epoch[1] Time cost=1.344
2020-06-20 23:32:24,598 [INFO]  Saved ch

Fitting model for column: property_magnitude


2020-06-20 23:32:38,119 [INFO]  
2020-06-20 23:32:39,274 [INFO]  Epoch[0] Batch [0-23]	Speed: 345.22 samples/sec	cross-entropy=1.285269	property_magnitude-accuracy=0.351562
2020-06-20 23:32:40,341 [INFO]  Epoch[0] Train-cross-entropy=1.263142
2020-06-20 23:32:40,345 [INFO]  Epoch[0] Train-property_magnitude-accuracy=0.376389
2020-06-20 23:32:40,349 [INFO]  Epoch[0] Time cost=2.208
2020-06-20 23:32:40,363 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:32:40,497 [INFO]  Epoch[0] Validation-cross-entropy=1.103818
2020-06-20 23:32:40,501 [INFO]  Epoch[0] Validation-property_magnitude-accuracy=0.475000
2020-06-20 23:32:41,720 [INFO]  Epoch[1] Batch [0-23]	Speed: 320.25 samples/sec	cross-entropy=1.136989	property_magnitude-accuracy=0.442708
2020-06-20 23:32:42,408 [INFO]  Epoch[1] Train-cross-entropy=1.154124
2020-06-20 23:32:42,412 [INFO]  Epoch[1] Train-property_magnitude-accuracy=0.450000
2020-06-20 23:32:42,419 [INFO]  Epoch[1] Time cost=1.911
2020-06-20 23:3

2020-06-20 23:33:07,453 [INFO]  Epoch[13] Time cost=1.648
2020-06-20 23:33:07,474 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-06-20 23:33:07,661 [INFO]  Epoch[13] Validation-cross-entropy=1.043692
2020-06-20 23:33:07,667 [INFO]  Epoch[13] Validation-property_magnitude-accuracy=0.500000
2020-06-20 23:33:08,636 [INFO]  Epoch[14] Batch [0-23]	Speed: 403.26 samples/sec	cross-entropy=0.919484	property_magnitude-accuracy=0.583333
2020-06-20 23:33:09,414 [INFO]  Epoch[14] Train-cross-entropy=0.925184
2020-06-20 23:33:09,418 [INFO]  Epoch[14] Train-property_magnitude-accuracy=0.597222
2020-06-20 23:33:09,423 [INFO]  Epoch[14] Time cost=1.751
2020-06-20 23:33:09,440 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-20 23:33:09,563 [INFO]  Epoch[14] Validation-cross-entropy=1.043858
2020-06-20 23:33:09,568 [INFO]  Epoch[14] Validation-property_magnitude-accuracy=0.500000
2020-06-20 23:33:10,510 [INFO]  Epoch[15] Batch [0-23]	Speed: 407.03 samples/sec	cr

Fitting model for column: other_payment_plans


2020-06-20 23:33:21,516 [INFO]  
2020-06-20 23:33:22,513 [INFO]  Epoch[0] Batch [0-23]	Speed: 391.89 samples/sec	cross-entropy=0.724464	other_payment_plans-accuracy=0.783854
2020-06-20 23:33:23,164 [INFO]  Epoch[0] Train-cross-entropy=0.663771
2020-06-20 23:33:23,170 [INFO]  Epoch[0] Train-other_payment_plans-accuracy=0.793056
2020-06-20 23:33:23,179 [INFO]  Epoch[0] Time cost=1.647
2020-06-20 23:33:23,232 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:33:23,338 [INFO]  Epoch[0] Validation-cross-entropy=0.508287
2020-06-20 23:33:23,343 [INFO]  Epoch[0] Validation-other_payment_plans-accuracy=0.850000
2020-06-20 23:33:24,093 [INFO]  Epoch[1] Batch [0-23]	Speed: 519.61 samples/sec	cross-entropy=0.540565	other_payment_plans-accuracy=0.817708
2020-06-20 23:33:25,004 [INFO]  Epoch[1] Train-cross-entropy=0.549881
2020-06-20 23:33:25,009 [INFO]  Epoch[1] Train-other_payment_plans-accuracy=0.811111
2020-06-20 23:33:25,014 [INFO]  Epoch[1] Time cost=1.665
2020-06-20

Fitting model for column: housing


2020-06-20 23:33:41,688 [INFO]  
2020-06-20 23:33:42,474 [INFO]  Epoch[0] Batch [0-23]	Speed: 552.75 samples/sec	cross-entropy=0.805393	housing-accuracy=0.690104
2020-06-20 23:33:43,069 [INFO]  Epoch[0] Train-cross-entropy=0.726860
2020-06-20 23:33:43,073 [INFO]  Epoch[0] Train-housing-accuracy=0.715278
2020-06-20 23:33:43,078 [INFO]  Epoch[0] Time cost=1.364
2020-06-20 23:33:43,093 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:33:43,220 [INFO]  Epoch[0] Validation-cross-entropy=0.567186
2020-06-20 23:33:43,223 [INFO]  Epoch[0] Validation-housing-accuracy=0.750000
2020-06-20 23:33:43,885 [INFO]  Epoch[1] Batch [0-23]	Speed: 586.13 samples/sec	cross-entropy=0.630634	housing-accuracy=0.742188
2020-06-20 23:33:44,494 [INFO]  Epoch[1] Train-cross-entropy=0.606601
2020-06-20 23:33:44,501 [INFO]  Epoch[1] Train-housing-accuracy=0.754167
2020-06-20 23:33:44,506 [INFO]  Epoch[1] Time cost=1.278
2020-06-20 23:33:44,521 [INFO]  Saved checkpoint to "imputer_model\mod

2020-06-20 23:34:14,567 [INFO]  CategoricalEncoder for column job                                found only 16 occurrences of value unemp/unskilled non res


Fitting model for column: job


2020-06-20 23:34:17,248 [INFO]  
2020-06-20 23:34:18,092 [INFO]  Epoch[0] Batch [0-23]	Speed: 490.25 samples/sec	cross-entropy=1.082900	job-accuracy=0.583333
2020-06-20 23:34:18,986 [INFO]  Epoch[0] Train-cross-entropy=0.953873
2020-06-20 23:34:18,991 [INFO]  Epoch[0] Train-job-accuracy=0.623611
2020-06-20 23:34:18,996 [INFO]  Epoch[0] Time cost=1.729
2020-06-20 23:34:19,011 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:34:19,120 [INFO]  Epoch[0] Validation-cross-entropy=0.750655
2020-06-20 23:34:19,124 [INFO]  Epoch[0] Validation-job-accuracy=0.712500
2020-06-20 23:34:20,026 [INFO]  Epoch[1] Batch [0-23]	Speed: 430.91 samples/sec	cross-entropy=0.874057	job-accuracy=0.617188
2020-06-20 23:34:20,994 [INFO]  Epoch[1] Train-cross-entropy=0.816379
2020-06-20 23:34:20,999 [INFO]  Epoch[1] Train-job-accuracy=0.651389
2020-06-20 23:34:21,017 [INFO]  Epoch[1] Time cost=1.888
2020-06-20 23:34:21,041 [INFO]  Saved checkpoint to "imputer_model\model-0001.params"
2020

Fitting model for column: own_telephone


2020-06-20 23:34:35,254 [INFO]  
2020-06-20 23:34:36,236 [INFO]  Epoch[0] Batch [0-23]	Speed: 409.98 samples/sec	cross-entropy=0.675368	own_telephone-accuracy=0.648438
2020-06-20 23:34:36,932 [INFO]  Epoch[0] Train-cross-entropy=0.630649
2020-06-20 23:34:36,938 [INFO]  Epoch[0] Train-own_telephone-accuracy=0.652778
2020-06-20 23:34:36,944 [INFO]  Epoch[0] Time cost=1.674
2020-06-20 23:34:36,960 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:34:37,106 [INFO]  Epoch[0] Validation-cross-entropy=0.552909
2020-06-20 23:34:37,110 [INFO]  Epoch[0] Validation-own_telephone-accuracy=0.700000
2020-06-20 23:34:37,888 [INFO]  Epoch[1] Batch [0-23]	Speed: 502.02 samples/sec	cross-entropy=0.552795	own_telephone-accuracy=0.723958
2020-06-20 23:34:38,457 [INFO]  Epoch[1] Train-cross-entropy=0.553175
2020-06-20 23:34:38,464 [INFO]  Epoch[1] Train-own_telephone-accuracy=0.702778
2020-06-20 23:34:38,469 [INFO]  Epoch[1] Time cost=1.353
2020-06-20 23:34:38,486 [INFO]  Saved ch

Fitting model for column: foreign_worker


2020-06-20 23:34:54,575 [INFO]  
2020-06-20 23:34:55,372 [INFO]  Epoch[0] Batch [0-23]	Speed: 492.59 samples/sec	cross-entropy=0.356163	foreign_worker-accuracy=0.932292
2020-06-20 23:34:56,647 [INFO]  Epoch[0] Train-cross-entropy=0.244204
2020-06-20 23:34:56,653 [INFO]  Epoch[0] Train-foreign_worker-accuracy=0.951389
2020-06-20 23:34:56,659 [INFO]  Epoch[0] Time cost=2.069
2020-06-20 23:34:56,675 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:34:56,789 [INFO]  Epoch[0] Validation-cross-entropy=0.103048
2020-06-20 23:34:56,793 [INFO]  Epoch[0] Validation-foreign_worker-accuracy=0.975000
2020-06-20 23:34:57,466 [INFO]  Epoch[1] Batch [0-23]	Speed: 575.61 samples/sec	cross-entropy=0.170115	foreign_worker-accuracy=0.950521
2020-06-20 23:34:58,083 [INFO]  Epoch[1] Train-cross-entropy=0.147504
2020-06-20 23:34:58,089 [INFO]  Epoch[1] Train-foreign_worker-accuracy=0.961111
2020-06-20 23:34:58,094 [INFO]  Epoch[1] Time cost=1.294
2020-06-20 23:34:58,110 [INFO]  Sav

2020-06-20 23:35:21,336 [INFO]  No improvement detected for 5 epochs compared to 0.08806659989058971 last error obtained: 0.09074801616370679, stopping here
2020-06-20 23:35:21,342 [INFO]  
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting model for column: duration


2020-06-20 23:35:26,865 [INFO]  
2020-06-20 23:35:27,522 [INFO]  Epoch[0] Batch [0-23]	Speed: 602.38 samples/sec	cross-entropy=12.713513	duration-accuracy=0.000000
2020-06-20 23:35:28,068 [INFO]  Epoch[0] Train-cross-entropy=10.462074
2020-06-20 23:35:28,072 [INFO]  Epoch[0] Train-duration-accuracy=0.000000
2020-06-20 23:35:28,080 [INFO]  Epoch[0] Time cost=1.200
2020-06-20 23:35:28,100 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:35:28,231 [INFO]  Epoch[0] Validation-cross-entropy=8.909241
2020-06-20 23:35:28,236 [INFO]  Epoch[0] Validation-duration-accuracy=0.000000
2020-06-20 23:35:28,936 [INFO]  Epoch[1] Batch [0-23]	Speed: 552.31 samples/sec	cross-entropy=8.206984	duration-accuracy=0.000000
2020-06-20 23:35:29,565 [INFO]  Epoch[1] Train-cross-entropy=7.872074
2020-06-20 23:35:29,571 [INFO]  Epoch[1] Train-duration-accuracy=0.000000
2020-06-20 23:35:29,579 [INFO]  Epoch[1] Time cost=1.338
2020-06-20 23:35:29,598 [INFO]  Saved checkpoint to "imputer_mo

Fitting model for column: credit_amount


2020-06-20 23:35:44,728 [INFO]  
2020-06-20 23:35:45,501 [INFO]  Epoch[0] Batch [0-23]	Speed: 510.69 samples/sec	cross-entropy=11.320720	credit_amount-accuracy=0.000000
2020-06-20 23:35:46,064 [INFO]  Epoch[0] Train-cross-entropy=9.224416
2020-06-20 23:35:46,071 [INFO]  Epoch[0] Train-credit_amount-accuracy=0.000000
2020-06-20 23:35:46,077 [INFO]  Epoch[0] Time cost=1.333
2020-06-20 23:35:46,095 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:35:46,234 [INFO]  Epoch[0] Validation-cross-entropy=7.743279
2020-06-20 23:35:46,243 [INFO]  Epoch[0] Validation-credit_amount-accuracy=0.000000
2020-06-20 23:35:46,838 [INFO]  Epoch[1] Batch [0-23]	Speed: 657.78 samples/sec	cross-entropy=7.586662	credit_amount-accuracy=0.000000
2020-06-20 23:35:47,609 [INFO]  Epoch[1] Train-cross-entropy=6.803789
2020-06-20 23:35:47,614 [INFO]  Epoch[1] Train-credit_amount-accuracy=0.000000
2020-06-20 23:35:47,620 [INFO]  Epoch[1] Time cost=1.372
2020-06-20 23:35:47,634 [INFO]  Saved c

Fitting model for column: installment_commitment


2020-06-20 23:35:59,580 [INFO]  
2020-06-20 23:36:01,528 [INFO]  Epoch[0] Batch [0-23]	Speed: 193.06 samples/sec	cross-entropy=14.108082	installment_commitment-accuracy=0.000000
2020-06-20 23:36:03,201 [INFO]  Epoch[0] Train-cross-entropy=14.167253
2020-06-20 23:36:03,208 [INFO]  Epoch[0] Train-installment_commitment-accuracy=0.000000
2020-06-20 23:36:03,216 [INFO]  Epoch[0] Time cost=3.621
2020-06-20 23:36:03,233 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:36:03,689 [INFO]  Epoch[0] Validation-cross-entropy=11.924471
2020-06-20 23:36:03,698 [INFO]  Epoch[0] Validation-installment_commitment-accuracy=0.000000
2020-06-20 23:36:05,341 [INFO]  Epoch[1] Batch [0-23]	Speed: 232.14 samples/sec	cross-entropy=11.859720	installment_commitment-accuracy=0.000000
2020-06-20 23:36:06,275 [INFO]  Epoch[1] Train-cross-entropy=12.435888
2020-06-20 23:36:06,282 [INFO]  Epoch[1] Train-installment_commitment-accuracy=0.000000
2020-06-20 23:36:06,288 [INFO]  Epoch[1] Time c

Fitting model for column: residence_since


2020-06-20 23:36:31,181 [INFO]  
2020-06-20 23:36:32,213 [INFO]  Epoch[0] Batch [0-23]	Speed: 386.76 samples/sec	cross-entropy=15.546577	residence_since-accuracy=0.000000
2020-06-20 23:36:33,230 [INFO]  Epoch[0] Train-cross-entropy=15.758459
2020-06-20 23:36:33,238 [INFO]  Epoch[0] Train-residence_since-accuracy=0.000000
2020-06-20 23:36:33,244 [INFO]  Epoch[0] Time cost=2.046
2020-06-20 23:36:33,289 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:36:33,452 [INFO]  Epoch[0] Validation-cross-entropy=13.713459
2020-06-20 23:36:33,464 [INFO]  Epoch[0] Validation-residence_since-accuracy=0.000000
2020-06-20 23:36:34,265 [INFO]  Epoch[1] Batch [0-23]	Speed: 514.58 samples/sec	cross-entropy=14.026292	residence_since-accuracy=0.000000
2020-06-20 23:36:35,134 [INFO]  Epoch[1] Train-cross-entropy=14.358400
2020-06-20 23:36:35,141 [INFO]  Epoch[1] Train-residence_since-accuracy=0.000000
2020-06-20 23:36:35,146 [INFO]  Epoch[1] Time cost=1.665
2020-06-20 23:36:35,174 [

Fitting model for column: age


2020-06-20 23:36:49,683 [INFO]  
2020-06-20 23:36:50,332 [INFO]  Epoch[0] Batch [0-23]	Speed: 626.80 samples/sec	cross-entropy=14.747533	age-accuracy=0.000000
2020-06-20 23:36:51,022 [INFO]  Epoch[0] Train-cross-entropy=14.467552
2020-06-20 23:36:51,027 [INFO]  Epoch[0] Train-age-accuracy=0.000000
2020-06-20 23:36:51,034 [INFO]  Epoch[0] Time cost=1.332
2020-06-20 23:36:51,048 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:36:51,202 [INFO]  Epoch[0] Validation-cross-entropy=18.820893
2020-06-20 23:36:51,209 [INFO]  Epoch[0] Validation-age-accuracy=0.000000
2020-06-20 23:36:51,913 [INFO]  Epoch[1] Batch [0-23]	Speed: 567.66 samples/sec	cross-entropy=13.196806	age-accuracy=0.000000
2020-06-20 23:36:52,676 [INFO]  Epoch[1] Train-cross-entropy=13.189665
2020-06-20 23:36:52,683 [INFO]  Epoch[1] Train-age-accuracy=0.000000
2020-06-20 23:36:52,691 [INFO]  Epoch[1] Time cost=1.473
2020-06-20 23:36:52,708 [INFO]  Saved checkpoint to "imputer_model\model-0001.params"

2020-06-20 23:37:24,147 [INFO]  Epoch[14] Time cost=1.340
2020-06-20 23:37:24,190 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-06-20 23:37:24,337 [INFO]  Epoch[14] Validation-cross-entropy=15.801934
2020-06-20 23:37:24,344 [INFO]  Epoch[14] Validation-age-accuracy=0.000000
2020-06-20 23:37:24,974 [INFO]  Epoch[15] Batch [0-23]	Speed: 621.18 samples/sec	cross-entropy=10.281278	age-accuracy=0.000000
2020-06-20 23:37:25,569 [INFO]  Epoch[15] Train-cross-entropy=10.240587
2020-06-20 23:37:25,574 [INFO]  Epoch[15] Train-age-accuracy=0.000000
2020-06-20 23:37:25,580 [INFO]  Epoch[15] Time cost=1.228
2020-06-20 23:37:25,623 [INFO]  Saved checkpoint to "imputer_model\model-0015.params"
2020-06-20 23:37:25,794 [INFO]  Epoch[15] Validation-cross-entropy=15.807922
2020-06-20 23:37:25,801 [INFO]  Epoch[15] Validation-age-accuracy=0.000000
2020-06-20 23:37:26,473 [INFO]  Epoch[16] Batch [0-23]	Speed: 597.96 samples/sec	cross-entropy=10.204822	age-accuracy=0.000000
2020-06-20 2

Fitting model for column: existing_credits


2020-06-20 23:37:39,841 [INFO]  
2020-06-20 23:37:40,562 [INFO]  Epoch[0] Batch [0-23]	Speed: 584.94 samples/sec	cross-entropy=15.585867	existing_credits-accuracy=0.000000
2020-06-20 23:37:41,131 [INFO]  Epoch[0] Train-cross-entropy=15.560104
2020-06-20 23:37:41,137 [INFO]  Epoch[0] Train-existing_credits-accuracy=0.000000
2020-06-20 23:37:41,145 [INFO]  Epoch[0] Time cost=1.280
2020-06-20 23:37:41,163 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:37:41,278 [INFO]  Epoch[0] Validation-cross-entropy=12.469859
2020-06-20 23:37:41,283 [INFO]  Epoch[0] Validation-existing_credits-accuracy=0.000000
2020-06-20 23:37:42,061 [INFO]  Epoch[1] Batch [0-23]	Speed: 526.81 samples/sec	cross-entropy=13.436795	existing_credits-accuracy=0.000000
2020-06-20 23:37:42,619 [INFO]  Epoch[1] Train-cross-entropy=13.678627
2020-06-20 23:37:42,628 [INFO]  Epoch[1] Train-existing_credits-accuracy=0.000000
2020-06-20 23:37:42,633 [INFO]  Epoch[1] Time cost=1.339
2020-06-20 23:37:42,

Fitting model for column: num_dependents


2020-06-20 23:38:09,057 [INFO]  
2020-06-20 23:38:09,662 [INFO]  Epoch[0] Batch [0-23]	Speed: 660.34 samples/sec	cross-entropy=14.704023	num_dependents-accuracy=0.000000
2020-06-20 23:38:10,253 [INFO]  Epoch[0] Train-cross-entropy=15.495457
2020-06-20 23:38:10,260 [INFO]  Epoch[0] Train-num_dependents-accuracy=0.000000
2020-06-20 23:38:10,275 [INFO]  Epoch[0] Time cost=1.200
2020-06-20 23:38:10,302 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-06-20 23:38:10,479 [INFO]  Epoch[0] Validation-cross-entropy=14.296767
2020-06-20 23:38:10,493 [INFO]  Epoch[0] Validation-num_dependents-accuracy=0.000000
2020-06-20 23:38:11,386 [INFO]  Epoch[1] Batch [0-23]	Speed: 436.92 samples/sec	cross-entropy=13.424981	num_dependents-accuracy=0.000000
2020-06-20 23:38:11,912 [INFO]  Epoch[1] Train-cross-entropy=14.404770
2020-06-20 23:38:11,917 [INFO]  Epoch[1] Train-num_dependents-accuracy=0.000000
2020-06-20 23:38:11,922 [INFO]  Epoch[1] Time cost=1.417
2020-06-20 23:38:11,935 [INFO]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.000000,,used car,1598.119777,<100,1<=X<4,3.0,male single,none,0.193852,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes
956,>=200,30.000000,critical/other existing credit,radio/tv,2533.905632,no known savings,>=7,4.0,male single,none,4.000000,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes
544,no checking,-3.982692,,new car,1255.000000,<100,>=7,4.0,male single,none,4.000000,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes
173,0<=X<200,-30.933795,existing paid,radio/tv,1414.000000,<100,1<=X<4,4.0,male single,guarantor,2.000000,real estate,33.0,none,own,1.0,skilled,1.0,none,no
759,<0,12.000000,,new car,927.096854,<100,>=7,4.0,male single,none,7.827427,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,<0,30.000000,existing paid,repairs,4776.586919,<100,<1,1.0,male div/sep,none,1.000000,no known property,34.0,none,own,1.0,unskilled resident,1.0,yes,yes
192,0<=X<200,27.000000,existing paid,business,2737.398185,<100,1<=X<4,4.0,male single,none,2.000000,car,36.0,none,own,1.0,skilled,2.0,yes,yes
398,0<=X<200,12.000000,,new car,614.418651,<100,>=7,1.0,rent,none,1.000000,real estate,46.0,none,male div/sep,2.0,skilled,1.0,none,yes
450,no checking,30.597022,critical/other existing credit,used car,5179.101487,no known savings,1<=X<4,4.0,male single,none,5.279632,car,30.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes


In [None]:
# TODO: have every imputer return its score, then keep the best-scoring one

### Outlier Detection

In [22]:
# Build the k-nearest-neighbours outlier detector (PyOD backend)
outlier = PyODKNN(
    train_data,
    test_data_corrupted,
    categorical_columns,
    numerical_columms,
)

In [23]:
# Run the KNN detector and preview the first ten rows of what it returns
test_data_corrupted_outliers = outlier.fit_transform(
    train_data,
    test_data_corrupted,
)
test_data_corrupted_outliers.head(n=10)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,outlier
841,no checking,21.0,,used car,,<100,1<=X<4,3.0,male single,none,...,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes,0
956,>=200,30.0,critical/other existing credit,radio/tv,,no known savings,>=7,4.0,male single,none,...,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes,0
544,no checking,-3.982692,,new car,1255.0,<100,>=7,4.0,male single,none,...,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes,0
173,0<=X<200,-30.933795,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,...,real estate,33.0,none,own,1.0,skilled,1.0,none,no,1
759,<0,12.0,,new car,,<100,>=7,4.0,male single,none,...,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes,1
955,<0,24.0,critical/other existing credit,radio/tv,,>=1000,>=7,4.0,female div/dep/mar,none,...,life insurance,57.0,none,rent,2.0,high qualif/self emp/mgmt,1.0,yes,yes,0
121,no checking,24.0,critical/other existing credit,used car,,<100,>=7,4.0,rent,none,...,car,41.0,none,female div/dep/mar,2.0,high qualif/self emp/mgmt,1.0,yes,yes,0
230,>=200,36.0,existing paid,radio/tv,4210.0,<100,1<=X<4,4.0,male single,none,...,car,26.0,none,own,1.0,skilled,1.0,none,yes,0
11,<0,48.0,existing paid,business,4308.0,<100,<1,3.0,female div/dep/mar,none,...,life insurance,24.0,none,rent,1.0,skilled,1.0,none,yes,1
120,<0,21.0,,radio/tv,,<100,1<=X<4,3.0,female div/dep/mar,none,...,real estate,25.0,none,own,2.0,skilled,1.0,yes,yes,0


In [24]:
# Build the Isolation Forest outlier detector (PyOD backend)
outlier_if = PyODIsolationForest(
    train_data,
    test_data_corrupted,
    categorical_columns,
    numerical_columms,
)

In [42]:
# Run the Isolation Forest detector and preview the first ten rows of what it returns
test_data_corrupted_outliers_if = outlier_if.fit_transform(
    train_data,
    test_data_corrupted,
)
test_data_corrupted_outliers_if.head(n=10)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,outlier
841,no checking,21.0,,used car,,<100,1<=X<4,3.0,male single,none,...,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes,0
956,>=200,30.0,critical/other existing credit,radio/tv,,no known savings,>=7,4.0,male single,none,...,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes,1
544,no checking,-3.982692,,new car,1255.0,<100,>=7,4.0,male single,none,...,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes,0
173,0<=X<200,-30.933795,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,...,real estate,33.0,none,own,1.0,skilled,1.0,none,no,1
759,<0,12.0,,new car,,<100,>=7,4.0,male single,none,...,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes,0
955,<0,24.0,critical/other existing credit,radio/tv,,>=1000,>=7,4.0,female div/dep/mar,none,...,life insurance,57.0,none,rent,2.0,high qualif/self emp/mgmt,1.0,yes,yes,1
121,no checking,24.0,critical/other existing credit,used car,,<100,>=7,4.0,rent,none,...,car,41.0,none,female div/dep/mar,2.0,high qualif/self emp/mgmt,1.0,yes,yes,0
230,>=200,36.0,existing paid,radio/tv,4210.0,<100,1<=X<4,4.0,male single,none,...,car,26.0,none,own,1.0,skilled,1.0,none,yes,0
11,<0,48.0,existing paid,business,4308.0,<100,<1,3.0,female div/dep/mar,none,...,life insurance,24.0,none,rent,1.0,skilled,1.0,none,yes,0
120,<0,21.0,,radio/tv,,<100,1<=X<4,3.0,female div/dep/mar,none,...,real estate,25.0,none,own,2.0,skilled,1.0,yes,yes,0


#### Preparing the outliers for imputation

In [28]:
# Print the sum of the Isolation Forest result's 'outlier' column; print
# nothing when that column is absent.
# NOTE(review): the preview shows 0/1 values, so the sum is presumably the
# number of flagged rows — confirm against the detector implementation.
if 'outlier' in test_data_corrupted_outliers_if:
    print(test_data_corrupted_outliers_if['outlier'].sum())

58


In [None]:
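# TODO: turn detected outliers into missing values and impute them.
# Inputs: train_data, test_data_corrupted.
#   1. For each column, use its values in the training data as the reference
#      and check the same column of the corrupted data for outliers.
#   2. Store the .loc positions of those outliers.
#   3. Set the values at those positions (in those columns) to NaN.
#   4. Impute.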

In [50]:
# Display the list of numeric column names. The variable really is spelled
# `numerical_columms`; the outlier-detector cells use the same spelling.
numerical_columms

['duration',
 'credit_amount',
 'installment_commitment',
 'residence_since',
 'age',
 'existing_credits',
 'num_dependents']

In [49]:
# Display the corrupted test set for inspection (the recorded output is
# truncated in the middle). In that output, credit_amount and credit_history
# contain missing entries, and duration / residence_since contain implausible
# values such as negative durations.
test_data_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.000000,,used car,,<100,1<=X<4,3.0,male single,none,0.193852,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes
956,>=200,30.000000,critical/other existing credit,radio/tv,,no known savings,>=7,4.0,male single,none,4.000000,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes
544,no checking,-3.982692,,new car,1255.0,<100,>=7,4.0,male single,none,4.000000,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes
173,0<=X<200,-30.933795,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,2.000000,real estate,33.0,none,own,1.0,skilled,1.0,none,no
759,<0,12.000000,,new car,,<100,>=7,4.0,male single,none,7.827427,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,<0,30.000000,existing paid,repairs,,<100,<1,1.0,male div/sep,none,1.000000,no known property,34.0,none,own,1.0,unskilled resident,1.0,yes,yes
192,0<=X<200,27.000000,existing paid,business,,<100,1<=X<4,4.0,male single,none,2.000000,car,36.0,none,own,1.0,skilled,2.0,yes,yes
398,0<=X<200,12.000000,,new car,,<100,>=7,1.0,rent,none,1.000000,real estate,46.0,none,male div/sep,2.0,skilled,1.0,none,yes
450,no checking,30.597022,critical/other existing credit,used car,,no known savings,1<=X<4,4.0,male single,none,5.279632,car,30.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes


## Evaluation

In [45]:
# Baseline: score on `test_data` (presumably the original, uncorrupted test
# split — verify where it is defined). Nothing is corrupted here, so no
# cleaning is involved.
model_obj.score_on_test_data(model.predict_proba(test_data))

0.8093735390369332

In [46]:
# Score on the corrupted test data with no cleaning applied.
model_obj.score_on_test_data(model.predict_proba(test_data_corrupted))

0.8020102851799906

In [47]:
# Score on the mean/mode-imputed test data.
# NOTE(review): the recorded score (0.7914) is lower than the score on the
# un-cleaned corrupted data (0.8020) — confirm whether this is expected.
model_obj.score_on_test_data(model.predict_proba(test_data_mm_imputed))

0.791374474053296

In [48]:
# Score on the datawig-imputed test data.
# NOTE(review): the recorded score (0.7910) is lower than the score on the
# un-cleaned corrupted data (0.8020) — confirm whether this is expected.
model_obj.score_on_test_data(model.predict_proba(test_data_dw_imputed))

0.7910238429172511