## Adjustment for Google Colab

In [None]:
# mount drive for access to the
# from google.colab import drive

In [None]:
# drive.mount("/content/drive")

In [None]:
# all the drive the files are present in "/content/drive/My Drive"
# !ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

In [None]:
# import sys
# sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

In [None]:
#! pip install openml

In [None]:
# !pip freeze | grep sklearn

In [None]:
# !pip install --upgrade sklearn

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [2]:
seed = 100

## Dataset

In [3]:
dataset = Dataset(seed, "credit-g")

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    

**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)



**German Credit data**  

This dataset classifies people described by a set of attributes as good or bad credit risks.



This dataset comes with a cost matrix: 

``` 

      Good  Bad (predicted)  

Good   0    1   (actual)  

Bad    5    0  

```



It is worse to class a customer as go

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


In [4]:
all_data = dataset.all_data
# all_data

In [5]:
attribute_names = dataset.attribute_names
# attribute_names

In [6]:
attribute_types = dataset.attribute_types
# attribute_types

### Categorical and Numerical Features

In [7]:
categorical_columns = dataset.categorical_columns
# categorical_columns

In [8]:
numerical_columns = dataset.numerical_columns
# numerical_columns

In [9]:
print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features")

Found 13 categorical and 7 numeric features


## Model

### Model parameters

In [10]:
## model parameters
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

### Train and Test Data

In [11]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

# display(train_data.head())
# print(train_labels[0:5])

# display(test_data.head())
# print(test_labels[0:5])

## Corruptions

In [12]:
corruptions = [MissingValues, Scaling, SwappedValues, GaussianNoise]

In [13]:
fractions = [0.25, 0.5, 0.75]

In [14]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

In [15]:
# generate corrpted test data
test_data_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fractions)

Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'property_magnitude', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MAR'}
Scaling: {'column': 'existing_credits', 'fraction': 0.25}
SwappedValues: {'column_a': 'own_telephone', 'column_b': 'personal_status', 'fraction': 0.5}
GaussianNoise: {'column': 'num_dependents', 'fraction': 0.5}


In [16]:
summary_col_corrupt

defaultdict(list,
            {('property_magnitude',): [<jenga.corruptions.generic.MissingValues at 0x2063b3873c8>],
             ('existing_credits',): [<jenga.corruptions.numerical.Scaling at 0x2063b387cf8>],
             ('own_telephone',
              'personal_status'): [<jenga.corruptions.generic.SwappedValues at 0x2063b415240>],
             ('num_dependents',): [<jenga.corruptions.numerical.GaussianNoise at 0x2063b387550>]})

In [17]:
test_data_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,none,co applicant,4.0,real estate,22.0,none,rent,10.0,skilled,1.000000,female div/dep/mar,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,20.0,skilled,1.595905,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,10.0,skilled,1.000000,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,no known savings,>=7,1.0,yes,none,2.0,life insurance,25.0,bank,own,10.0,skilled,0.900623,female div/dep/mar,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.0,yes,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.000000,male single,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,4<=X<7,1.0,male single,none,3.0,,31.0,none,own,2.0,unskilled resident,2.141840,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,yes,none,1.0,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,0.335332,male mar/wid,yes
110,0<=X<200,6.0,delayed previously,business,1449.0,100<=X<500,>=7,1.0,male div/sep,none,2.0,car,31.0,bank,own,2.0,skilled,2.000000,none,yes
28,0<=X<200,7.0,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.000000,none,yes


## Cleaning

In [19]:
cleaners = [
#     (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation),
#     (PyODKNN, NoImputation),
     (PyODKNN, MeanModeImputation),
#     # (PyODKNN, DatawigImputation),
#     (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation)
#     # (PyODIsolationForest, DatawigImputation)
]

In [20]:
ppp_model = ppp.fit_ppp(train_data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.5s finished


In [21]:
clean = Clean(train_data, test_data_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)

In [22]:
test_data_cleaned, score_no_cleaning, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data_corrupted)

2020-07-03 19:28:46,760 [INFO]  CategoricalEncoder for column checking_status                                found only 47 occurrences of value >=200


PPP score no cleaning: 0.5329861111111112
Fitting model for column: checking_status


2020-07-03 19:28:47,706 [INFO]  
2020-07-03 19:28:48,076 [INFO]  Epoch[0] Batch [0-23]	Speed: 1098.15 samples/sec	cross-entropy=1.256872	checking_status-accuracy=0.377604
2020-07-03 19:28:48,386 [INFO]  Epoch[0] Train-cross-entropy=1.219677
2020-07-03 19:28:48,387 [INFO]  Epoch[0] Train-checking_status-accuracy=0.383333
2020-07-03 19:28:48,388 [INFO]  Epoch[0] Time cost=0.675
2020-07-03 19:28:48,397 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:28:48,459 [INFO]  Epoch[0] Validation-cross-entropy=1.151220
2020-07-03 19:28:48,460 [INFO]  Epoch[0] Validation-checking_status-accuracy=0.412500
2020-07-03 19:28:48,798 [INFO]  Epoch[1] Batch [0-23]	Speed: 1153.00 samples/sec	cross-entropy=1.126862	checking_status-accuracy=0.427083
2020-07-03 19:28:49,089 [INFO]  Epoch[1] Train-cross-entropy=1.124785
2020-07-03 19:28:49,091 [INFO]  Epoch[1] Train-checking_status-accuracy=0.445833
2020-07-03 19:28:49,092 [INFO]  Epoch[1] Time cost=0.631
2020-07-03 19:28:49,100 [INF

Fitting model for column: credit_history


2020-07-03 19:28:57,377 [INFO]  
2020-07-03 19:28:57,797 [INFO]  Epoch[0] Batch [0-23]	Speed: 981.34 samples/sec	cross-entropy=1.128355	credit_history-accuracy=0.562500
2020-07-03 19:28:58,127 [INFO]  Epoch[0] Train-cross-entropy=1.052969
2020-07-03 19:28:58,130 [INFO]  Epoch[0] Train-credit_history-accuracy=0.629167
2020-07-03 19:28:58,132 [INFO]  Epoch[0] Time cost=0.747
2020-07-03 19:28:58,143 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:28:58,206 [INFO]  Epoch[0] Validation-cross-entropy=1.045864
2020-07-03 19:28:58,208 [INFO]  Epoch[0] Validation-credit_history-accuracy=0.625000
2020-07-03 19:28:58,565 [INFO]  Epoch[1] Batch [0-23]	Speed: 1097.24 samples/sec	cross-entropy=0.886140	credit_history-accuracy=0.700521
2020-07-03 19:28:58,875 [INFO]  Epoch[1] Train-cross-entropy=0.881799
2020-07-03 19:28:58,876 [INFO]  Epoch[1] Train-credit_history-accuracy=0.702778
2020-07-03 19:28:58,878 [INFO]  Epoch[1] Time cost=0.668
2020-07-03 19:28:58,886 [INFO]  Sa

Fitting model for column: purpose


2020-07-03 19:29:04,302 [INFO]  
2020-07-03 19:29:04,799 [INFO]  Epoch[0] Batch [0-23]	Speed: 810.96 samples/sec	cross-entropy=1.820075	purpose-accuracy=0.273438
2020-07-03 19:29:05,195 [INFO]  Epoch[0] Train-cross-entropy=1.759505
2020-07-03 19:29:05,196 [INFO]  Epoch[0] Train-purpose-accuracy=0.297222
2020-07-03 19:29:05,198 [INFO]  Epoch[0] Time cost=0.889
2020-07-03 19:29:05,214 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:29:05,283 [INFO]  Epoch[0] Validation-cross-entropy=1.758454
2020-07-03 19:29:05,285 [INFO]  Epoch[0] Validation-purpose-accuracy=0.275000
2020-07-03 19:29:05,713 [INFO]  Epoch[1] Batch [0-23]	Speed: 903.20 samples/sec	cross-entropy=1.602011	purpose-accuracy=0.348958
2020-07-03 19:29:06,084 [INFO]  Epoch[1] Train-cross-entropy=1.602638
2020-07-03 19:29:06,087 [INFO]  Epoch[1] Train-purpose-accuracy=0.359722
2020-07-03 19:29:06,089 [INFO]  Epoch[1] Time cost=0.801
2020-07-03 19:29:06,105 [INFO]  Saved checkpoint to "imputer_model\mod

Fitting model for column: savings_status


2020-07-03 19:29:16,566 [INFO]  
2020-07-03 19:29:17,019 [INFO]  Epoch[0] Batch [0-23]	Speed: 890.17 samples/sec	cross-entropy=1.223779	savings_status-accuracy=0.596354
2020-07-03 19:29:17,345 [INFO]  Epoch[0] Train-cross-entropy=1.151443
2020-07-03 19:29:17,347 [INFO]  Epoch[0] Train-savings_status-accuracy=0.609722
2020-07-03 19:29:17,349 [INFO]  Epoch[0] Time cost=0.778
2020-07-03 19:29:17,362 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:29:17,459 [INFO]  Epoch[0] Validation-cross-entropy=1.079870
2020-07-03 19:29:17,462 [INFO]  Epoch[0] Validation-savings_status-accuracy=0.612500
2020-07-03 19:29:17,914 [INFO]  Epoch[1] Batch [0-23]	Speed: 870.02 samples/sec	cross-entropy=1.061617	savings_status-accuracy=0.601562
2020-07-03 19:29:18,276 [INFO]  Epoch[1] Train-cross-entropy=1.046015
2020-07-03 19:29:18,278 [INFO]  Epoch[1] Train-savings_status-accuracy=0.613889
2020-07-03 19:29:18,280 [INFO]  Epoch[1] Time cost=0.816
2020-07-03 19:29:18,289 [INFO]  Sav

Fitting model for column: employment


2020-07-03 19:29:27,563 [INFO]  
2020-07-03 19:29:27,940 [INFO]  Epoch[0] Batch [0-23]	Speed: 1072.63 samples/sec	cross-entropy=1.446418	employment-accuracy=0.346354
2020-07-03 19:29:28,282 [INFO]  Epoch[0] Train-cross-entropy=1.375839
2020-07-03 19:29:28,284 [INFO]  Epoch[0] Train-employment-accuracy=0.376389
2020-07-03 19:29:28,286 [INFO]  Epoch[0] Time cost=0.715
2020-07-03 19:29:28,299 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:29:28,367 [INFO]  Epoch[0] Validation-cross-entropy=1.374498
2020-07-03 19:29:28,369 [INFO]  Epoch[0] Validation-employment-accuracy=0.387500
2020-07-03 19:29:28,752 [INFO]  Epoch[1] Batch [0-23]	Speed: 1030.68 samples/sec	cross-entropy=1.284522	employment-accuracy=0.442708
2020-07-03 19:29:29,068 [INFO]  Epoch[1] Train-cross-entropy=1.257434
2020-07-03 19:29:29,069 [INFO]  Epoch[1] Train-employment-accuracy=0.463889
2020-07-03 19:29:29,071 [INFO]  Epoch[1] Time cost=0.700
2020-07-03 19:29:29,080 [INFO]  Saved checkpoint to "

2020-07-03 19:29:38,715 [INFO]  Epoch[14] Batch [0-23]	Speed: 1106.37 samples/sec	cross-entropy=1.059443	employment-accuracy=0.557292
2020-07-03 19:29:39,016 [INFO]  Epoch[14] Train-cross-entropy=1.043048
2020-07-03 19:29:39,019 [INFO]  Epoch[14] Train-employment-accuracy=0.556944
2020-07-03 19:29:39,022 [INFO]  Epoch[14] Time cost=0.656
2020-07-03 19:29:39,030 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-07-03 19:29:39,094 [INFO]  Epoch[14] Validation-cross-entropy=1.317452
2020-07-03 19:29:39,097 [INFO]  Epoch[14] Validation-employment-accuracy=0.437500
2020-07-03 19:29:39,447 [INFO]  Epoch[15] Batch [0-23]	Speed: 1109.88 samples/sec	cross-entropy=1.050480	employment-accuracy=0.557292
2020-07-03 19:29:39,735 [INFO]  Epoch[15] Train-cross-entropy=1.034911
2020-07-03 19:29:39,737 [INFO]  Epoch[15] Train-employment-accuracy=0.556944
2020-07-03 19:29:39,739 [INFO]  Epoch[15] Time cost=0.640
2020-07-03 19:29:39,748 [INFO]  Saved checkpoint to "imputer_model\model-001

Fitting model for column: personal_status


2020-07-03 19:29:41,377 [INFO]  
2020-07-03 19:29:41,777 [INFO]  Epoch[0] Batch [0-23]	Speed: 1016.46 samples/sec	cross-entropy=1.051791	personal_status-accuracy=0.552083
2020-07-03 19:29:42,106 [INFO]  Epoch[0] Train-cross-entropy=1.002251
2020-07-03 19:29:42,109 [INFO]  Epoch[0] Train-personal_status-accuracy=0.548611
2020-07-03 19:29:42,111 [INFO]  Epoch[0] Time cost=0.729
2020-07-03 19:29:42,119 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:29:42,173 [INFO]  Epoch[0] Validation-cross-entropy=1.009213
2020-07-03 19:29:42,175 [INFO]  Epoch[0] Validation-personal_status-accuracy=0.562500
2020-07-03 19:29:42,517 [INFO]  Epoch[1] Batch [0-23]	Speed: 1138.82 samples/sec	cross-entropy=0.903638	personal_status-accuracy=0.611979
2020-07-03 19:29:42,807 [INFO]  Epoch[1] Train-cross-entropy=0.900721
2020-07-03 19:29:42,809 [INFO]  Epoch[1] Train-personal_status-accuracy=0.605556
2020-07-03 19:29:42,811 [INFO]  Epoch[1] Time cost=0.635
2020-07-03 19:29:42,818 [INF

Fitting model for column: other_parties


2020-07-03 19:29:50,786 [INFO]  
2020-07-03 19:29:51,121 [INFO]  Epoch[0] Batch [0-23]	Speed: 1197.85 samples/sec	cross-entropy=0.558243	other_parties-accuracy=0.864583
2020-07-03 19:29:51,422 [INFO]  Epoch[0] Train-cross-entropy=0.472889
2020-07-03 19:29:51,424 [INFO]  Epoch[0] Train-other_parties-accuracy=0.884722
2020-07-03 19:29:51,426 [INFO]  Epoch[0] Time cost=0.632
2020-07-03 19:29:51,439 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:29:51,493 [INFO]  Epoch[0] Validation-cross-entropy=0.344814
2020-07-03 19:29:51,495 [INFO]  Epoch[0] Validation-other_parties-accuracy=0.912500
2020-07-03 19:29:51,824 [INFO]  Epoch[1] Batch [0-23]	Speed: 1231.93 samples/sec	cross-entropy=0.363665	other_parties-accuracy=0.901042
2020-07-03 19:29:52,110 [INFO]  Epoch[1] Train-cross-entropy=0.352969
2020-07-03 19:29:52,112 [INFO]  Epoch[1] Train-other_parties-accuracy=0.904167
2020-07-03 19:29:52,114 [INFO]  Epoch[1] Time cost=0.616
2020-07-03 19:29:52,122 [INFO]  Saved 

Fitting model for column: property_magnitude


2020-07-03 19:29:58,755 [INFO]  
2020-07-03 19:29:59,152 [INFO]  Epoch[0] Batch [0-23]	Speed: 1001.29 samples/sec	cross-entropy=1.252705	property_magnitude-accuracy=0.377604
2020-07-03 19:29:59,476 [INFO]  Epoch[0] Train-cross-entropy=1.223547
2020-07-03 19:29:59,479 [INFO]  Epoch[0] Train-property_magnitude-accuracy=0.395833
2020-07-03 19:29:59,481 [INFO]  Epoch[0] Time cost=0.720
2020-07-03 19:29:59,492 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:29:59,550 [INFO]  Epoch[0] Validation-cross-entropy=1.216426
2020-07-03 19:29:59,552 [INFO]  Epoch[0] Validation-property_magnitude-accuracy=0.412500
2020-07-03 19:29:59,887 [INFO]  Epoch[1] Batch [0-23]	Speed: 1156.69 samples/sec	cross-entropy=1.100462	property_magnitude-accuracy=0.453125
2020-07-03 19:30:00,315 [INFO]  Epoch[1] Train-cross-entropy=1.112668
2020-07-03 19:30:00,317 [INFO]  Epoch[1] Train-property_magnitude-accuracy=0.454167
2020-07-03 19:30:00,319 [INFO]  Epoch[1] Time cost=0.765
2020-07-03 19

2020-07-03 19:30:11,526 [INFO]  Epoch[13] Time cost=0.664
2020-07-03 19:30:11,535 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-07-03 19:30:11,606 [INFO]  Epoch[13] Validation-cross-entropy=1.159648
2020-07-03 19:30:11,609 [INFO]  Epoch[13] Validation-property_magnitude-accuracy=0.425000
2020-07-03 19:30:12,021 [INFO]  Epoch[14] Batch [0-23]	Speed: 940.06 samples/sec	cross-entropy=0.853654	property_magnitude-accuracy=0.611979
2020-07-03 19:30:12,355 [INFO]  Epoch[14] Train-cross-entropy=0.874441
2020-07-03 19:30:12,358 [INFO]  Epoch[14] Train-property_magnitude-accuracy=0.595833
2020-07-03 19:30:12,360 [INFO]  Epoch[14] Time cost=0.749
2020-07-03 19:30:12,367 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-07-03 19:30:12,438 [INFO]  Epoch[14] Validation-cross-entropy=1.159362
2020-07-03 19:30:12,441 [INFO]  Epoch[14] Validation-property_magnitude-accuracy=0.437500
2020-07-03 19:30:12,800 [INFO]  Epoch[15] Batch [0-23]	Speed: 1083.63 samples/sec	c

Fitting model for column: other_payment_plans


2020-07-03 19:30:16,709 [INFO]  
2020-07-03 19:30:17,357 [INFO]  Epoch[0] Batch [0-23]	Speed: 609.27 samples/sec	cross-entropy=0.700869	other_payment_plans-accuracy=0.796875
2020-07-03 19:30:17,707 [INFO]  Epoch[0] Train-cross-entropy=0.615596
2020-07-03 19:30:17,710 [INFO]  Epoch[0] Train-other_payment_plans-accuracy=0.818056
2020-07-03 19:30:17,712 [INFO]  Epoch[0] Time cost=0.995
2020-07-03 19:30:17,719 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:30:17,782 [INFO]  Epoch[0] Validation-cross-entropy=0.614651
2020-07-03 19:30:17,785 [INFO]  Epoch[0] Validation-other_payment_plans-accuracy=0.800000
2020-07-03 19:30:18,148 [INFO]  Epoch[1] Batch [0-23]	Speed: 1066.43 samples/sec	cross-entropy=0.536147	other_payment_plans-accuracy=0.825521
2020-07-03 19:30:18,427 [INFO]  Epoch[1] Train-cross-entropy=0.514863
2020-07-03 19:30:18,429 [INFO]  Epoch[1] Train-other_payment_plans-accuracy=0.833333
2020-07-03 19:30:18,431 [INFO]  Epoch[1] Time cost=0.644
2020-07-0

Fitting model for column: housing


2020-07-03 19:30:24,539 [INFO]  
2020-07-03 19:30:24,937 [INFO]  Epoch[0] Batch [0-23]	Speed: 1019.29 samples/sec	cross-entropy=0.771174	housing-accuracy=0.723958
2020-07-03 19:30:25,224 [INFO]  Epoch[0] Train-cross-entropy=0.731754
2020-07-03 19:30:25,227 [INFO]  Epoch[0] Train-housing-accuracy=0.723611
2020-07-03 19:30:25,231 [INFO]  Epoch[0] Time cost=0.685
2020-07-03 19:30:25,239 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:30:25,294 [INFO]  Epoch[0] Validation-cross-entropy=0.671609
2020-07-03 19:30:25,296 [INFO]  Epoch[0] Validation-housing-accuracy=0.700000
2020-07-03 19:30:25,612 [INFO]  Epoch[1] Batch [0-23]	Speed: 1237.07 samples/sec	cross-entropy=0.584212	housing-accuracy=0.760417
2020-07-03 19:30:25,910 [INFO]  Epoch[1] Train-cross-entropy=0.601003
2020-07-03 19:30:25,912 [INFO]  Epoch[1] Train-housing-accuracy=0.750000
2020-07-03 19:30:25,914 [INFO]  Epoch[1] Time cost=0.615
2020-07-03 19:30:25,922 [INFO]  Saved checkpoint to "imputer_model\m

2020-07-03 19:30:34,981 [INFO]  Epoch[14] Train-cross-entropy=0.417369
2020-07-03 19:30:34,984 [INFO]  Epoch[14] Train-housing-accuracy=0.838889
2020-07-03 19:30:34,986 [INFO]  Epoch[14] Time cost=0.629
2020-07-03 19:30:34,994 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-07-03 19:30:35,061 [INFO]  Epoch[14] Validation-cross-entropy=0.629073
2020-07-03 19:30:35,063 [INFO]  Epoch[14] Validation-housing-accuracy=0.750000
2020-07-03 19:30:35,393 [INFO]  Epoch[15] Batch [0-23]	Speed: 1171.38 samples/sec	cross-entropy=0.427659	housing-accuracy=0.843750
2020-07-03 19:30:35,658 [INFO]  Epoch[15] Train-cross-entropy=0.411520
2020-07-03 19:30:35,661 [INFO]  Epoch[15] Train-housing-accuracy=0.841667
2020-07-03 19:30:35,664 [INFO]  Epoch[15] Time cost=0.598
2020-07-03 19:30:35,672 [INFO]  Saved checkpoint to "imputer_model\model-0015.params"
2020-07-03 19:30:35,726 [INFO]  Epoch[15] Validation-cross-entropy=0.629239
2020-07-03 19:30:35,729 [INFO]  Epoch[15] Validation-housing

Fitting model for column: job


2020-07-03 19:30:38,730 [INFO]  
2020-07-03 19:30:39,130 [INFO]  Epoch[0] Batch [0-23]	Speed: 1002.67 samples/sec	cross-entropy=1.013914	job-accuracy=0.604167
2020-07-03 19:30:39,435 [INFO]  Epoch[0] Train-cross-entropy=0.970759
2020-07-03 19:30:39,437 [INFO]  Epoch[0] Train-job-accuracy=0.615278
2020-07-03 19:30:39,440 [INFO]  Epoch[0] Time cost=0.703
2020-07-03 19:30:39,451 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:30:39,517 [INFO]  Epoch[0] Validation-cross-entropy=0.881992
2020-07-03 19:30:39,520 [INFO]  Epoch[0] Validation-job-accuracy=0.625000
2020-07-03 19:30:39,868 [INFO]  Epoch[1] Batch [0-23]	Speed: 1111.37 samples/sec	cross-entropy=0.816791	job-accuracy=0.656250
2020-07-03 19:30:40,178 [INFO]  Epoch[1] Train-cross-entropy=0.835135
2020-07-03 19:30:40,181 [INFO]  Epoch[1] Train-job-accuracy=0.651389
2020-07-03 19:30:40,183 [INFO]  Epoch[1] Time cost=0.661
2020-07-03 19:30:40,191 [INFO]  Saved checkpoint to "imputer_model\model-0001.params"
20

2020-07-03 19:30:49,864 [INFO]  Epoch[14] Train-cross-entropy=0.636030
2020-07-03 19:30:49,867 [INFO]  Epoch[14] Train-job-accuracy=0.723611
2020-07-03 19:30:49,870 [INFO]  Epoch[14] Time cost=0.656
2020-07-03 19:30:49,877 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-07-03 19:30:49,942 [INFO]  Epoch[14] Validation-cross-entropy=0.824580
2020-07-03 19:30:49,944 [INFO]  Epoch[14] Validation-job-accuracy=0.612500
2020-07-03 19:30:50,281 [INFO]  Epoch[15] Batch [0-23]	Speed: 1162.89 samples/sec	cross-entropy=0.628657	job-accuracy=0.734375
2020-07-03 19:30:50,587 [INFO]  Epoch[15] Train-cross-entropy=0.629385
2020-07-03 19:30:50,591 [INFO]  Epoch[15] Train-job-accuracy=0.727778
2020-07-03 19:30:50,593 [INFO]  Epoch[15] Time cost=0.646
2020-07-03 19:30:50,601 [INFO]  Saved checkpoint to "imputer_model\model-0015.params"
2020-07-03 19:30:50,663 [INFO]  Epoch[15] Validation-cross-entropy=0.823490
2020-07-03 19:30:50,666 [INFO]  Epoch[15] Validation-job-accuracy=0.612500
2

Fitting model for column: own_telephone


2020-07-03 19:30:58,952 [INFO]  
2020-07-03 19:30:59,333 [INFO]  Epoch[0] Batch [0-23]	Speed: 1060.29 samples/sec	cross-entropy=0.689698	own_telephone-accuracy=0.614583
2020-07-03 19:30:59,620 [INFO]  Epoch[0] Train-cross-entropy=0.639711
2020-07-03 19:30:59,622 [INFO]  Epoch[0] Train-own_telephone-accuracy=0.637500
2020-07-03 19:30:59,625 [INFO]  Epoch[0] Time cost=0.665
2020-07-03 19:30:59,632 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:30:59,688 [INFO]  Epoch[0] Validation-cross-entropy=0.640804
2020-07-03 19:30:59,690 [INFO]  Epoch[0] Validation-own_telephone-accuracy=0.612500
2020-07-03 19:31:00,015 [INFO]  Epoch[1] Batch [0-23]	Speed: 1209.76 samples/sec	cross-entropy=0.572657	own_telephone-accuracy=0.671875
2020-07-03 19:31:00,588 [INFO]  Epoch[1] Train-cross-entropy=0.560654
2020-07-03 19:31:00,591 [INFO]  Epoch[1] Train-own_telephone-accuracy=0.684722
2020-07-03 19:31:00,596 [INFO]  Epoch[1] Time cost=0.903
2020-07-03 19:31:00,618 [INFO]  Saved 

Fitting model for column: foreign_worker


2020-07-03 19:31:08,434 [INFO]  
2020-07-03 19:31:09,046 [INFO]  Epoch[0] Batch [0-23]	Speed: 631.82 samples/sec	cross-entropy=0.278625	foreign_worker-accuracy=0.958333
2020-07-03 19:31:09,415 [INFO]  Epoch[0] Train-cross-entropy=0.229621
2020-07-03 19:31:09,417 [INFO]  Epoch[0] Train-foreign_worker-accuracy=0.962500
2020-07-03 19:31:09,420 [INFO]  Epoch[0] Time cost=0.980
2020-07-03 19:31:09,430 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:31:09,482 [INFO]  Epoch[0] Validation-cross-entropy=0.064087
2020-07-03 19:31:09,485 [INFO]  Epoch[0] Validation-foreign_worker-accuracy=0.987500
2020-07-03 19:31:09,818 [INFO]  Epoch[1] Batch [0-23]	Speed: 1154.65 samples/sec	cross-entropy=0.123329	foreign_worker-accuracy=0.968750
2020-07-03 19:31:10,108 [INFO]  Epoch[1] Train-cross-entropy=0.127453
2020-07-03 19:31:10,111 [INFO]  Epoch[1] Train-foreign_worker-accuracy=0.968056
2020-07-03 19:31:10,113 [INFO]  Epoch[1] Time cost=0.626
2020-07-03 19:31:10,121 [INFO]  Sa

Fitting model for column: duration


2020-07-03 19:31:14,681 [INFO]  
2020-07-03 19:31:15,032 [INFO]  Epoch[0] Batch [0-23]	Speed: 1157.62 samples/sec	cross-entropy=10.315295	duration-accuracy=0.000000
2020-07-03 19:31:15,301 [INFO]  Epoch[0] Train-cross-entropy=9.951751
2020-07-03 19:31:15,303 [INFO]  Epoch[0] Train-duration-accuracy=0.000000
2020-07-03 19:31:15,306 [INFO]  Epoch[0] Time cost=0.616
2020-07-03 19:31:15,314 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:31:15,367 [INFO]  Epoch[0] Validation-cross-entropy=12.943263
2020-07-03 19:31:15,370 [INFO]  Epoch[0] Validation-duration-accuracy=0.000000
2020-07-03 19:31:15,652 [INFO]  Epoch[1] Batch [0-23]	Speed: 1381.96 samples/sec	cross-entropy=7.422330	duration-accuracy=0.000000
2020-07-03 19:31:15,898 [INFO]  Epoch[1] Train-cross-entropy=7.869175
2020-07-03 19:31:15,902 [INFO]  Epoch[1] Train-duration-accuracy=0.000000
2020-07-03 19:31:15,904 [INFO]  Epoch[1] Time cost=0.531
2020-07-03 19:31:15,911 [INFO]  Saved checkpoint to "imputer_

Fitting model for column: credit_amount


2020-07-03 19:31:23,743 [INFO]  
2020-07-03 19:31:24,082 [INFO]  Epoch[0] Batch [0-23]	Speed: 1205.83 samples/sec	cross-entropy=10.035989	credit_amount-accuracy=0.000000
2020-07-03 19:31:24,334 [INFO]  Epoch[0] Train-cross-entropy=9.471800
2020-07-03 19:31:24,336 [INFO]  Epoch[0] Train-credit_amount-accuracy=0.000000
2020-07-03 19:31:24,340 [INFO]  Epoch[0] Time cost=0.586
2020-07-03 19:31:24,348 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:31:24,401 [INFO]  Epoch[0] Validation-cross-entropy=7.680556
2020-07-03 19:31:24,403 [INFO]  Epoch[0] Validation-credit_amount-accuracy=0.000000
2020-07-03 19:31:24,690 [INFO]  Epoch[1] Batch [0-23]	Speed: 1369.09 samples/sec	cross-entropy=7.242795	credit_amount-accuracy=0.000000
2020-07-03 19:31:24,941 [INFO]  Epoch[1] Train-cross-entropy=7.470071
2020-07-03 19:31:24,943 [INFO]  Epoch[1] Train-credit_amount-accuracy=0.000000
2020-07-03 19:31:24,948 [INFO]  Epoch[1] Time cost=0.543
2020-07-03 19:31:24,958 [INFO]  Saved

Fitting model for column: installment_commitment


2020-07-03 19:31:30,721 [INFO]  
2020-07-03 19:31:31,031 [INFO]  Epoch[0] Batch [0-23]	Speed: 1287.75 samples/sec	cross-entropy=14.603040	installment_commitment-accuracy=0.000000
2020-07-03 19:31:31,289 [INFO]  Epoch[0] Train-cross-entropy=14.966197
2020-07-03 19:31:31,292 [INFO]  Epoch[0] Train-installment_commitment-accuracy=0.000000
2020-07-03 19:31:31,295 [INFO]  Epoch[0] Time cost=0.566
2020-07-03 19:31:31,303 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:31:31,358 [INFO]  Epoch[0] Validation-cross-entropy=11.474794
2020-07-03 19:31:31,360 [INFO]  Epoch[0] Validation-installment_commitment-accuracy=0.000000
2020-07-03 19:31:31,644 [INFO]  Epoch[1] Batch [0-23]	Speed: 1376.81 samples/sec	cross-entropy=12.821382	installment_commitment-accuracy=0.000000
2020-07-03 19:31:31,891 [INFO]  Epoch[1] Train-cross-entropy=13.454024
2020-07-03 19:31:31,894 [INFO]  Epoch[1] Train-installment_commitment-accuracy=0.000000
2020-07-03 19:31:31,896 [INFO]  Epoch[1] Time

Fitting model for column: residence_since


2020-07-03 19:31:37,255 [INFO]  
2020-07-03 19:31:37,585 [INFO]  Epoch[0] Batch [0-23]	Speed: 1198.00 samples/sec	cross-entropy=15.819932	residence_since-accuracy=0.000000
2020-07-03 19:31:37,847 [INFO]  Epoch[0] Train-cross-entropy=15.511631
2020-07-03 19:31:37,850 [INFO]  Epoch[0] Train-residence_since-accuracy=0.000000
2020-07-03 19:31:37,853 [INFO]  Epoch[0] Time cost=0.590
2020-07-03 19:31:37,863 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:31:37,914 [INFO]  Epoch[0] Validation-cross-entropy=15.595026
2020-07-03 19:31:37,917 [INFO]  Epoch[0] Validation-residence_since-accuracy=0.000000
2020-07-03 19:31:38,200 [INFO]  Epoch[1] Batch [0-23]	Speed: 1376.71 samples/sec	cross-entropy=14.760755	residence_since-accuracy=0.000000
2020-07-03 19:31:38,469 [INFO]  Epoch[1] Train-cross-entropy=14.569405
2020-07-03 19:31:38,472 [INFO]  Epoch[1] Train-residence_since-accuracy=0.000000
2020-07-03 19:31:38,476 [INFO]  Epoch[1] Time cost=0.557
2020-07-03 19:31:38,483

2020-07-03 19:31:46,014 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-07-03 19:31:46,085 [INFO]  No improvement detected for 5 epochs compared to 13.928264999389649 last error obtained: 14.012668800354003, stopping here
2020-07-03 19:31:46,088 [INFO]  


Fitting model for column: age


2020-07-03 19:31:47,669 [INFO]  
2020-07-03 19:31:47,993 [INFO]  Epoch[0] Batch [0-23]	Speed: 1252.60 samples/sec	cross-entropy=14.733498	age-accuracy=0.000000
2020-07-03 19:31:48,258 [INFO]  Epoch[0] Train-cross-entropy=14.386343
2020-07-03 19:31:48,261 [INFO]  Epoch[0] Train-age-accuracy=0.000000
2020-07-03 19:31:48,264 [INFO]  Epoch[0] Time cost=0.588
2020-07-03 19:31:48,272 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:31:48,340 [INFO]  Epoch[0] Validation-cross-entropy=15.883817
2020-07-03 19:31:48,342 [INFO]  Epoch[0] Validation-age-accuracy=0.000000
2020-07-03 19:31:48,616 [INFO]  Epoch[1] Batch [0-23]	Speed: 1438.49 samples/sec	cross-entropy=13.894121	age-accuracy=0.000000
2020-07-03 19:31:48,867 [INFO]  Epoch[1] Train-cross-entropy=13.461311
2020-07-03 19:31:48,871 [INFO]  Epoch[1] Train-age-accuracy=0.000000
2020-07-03 19:31:48,873 [INFO]  Epoch[1] Time cost=0.529
2020-07-03 19:31:48,882 [INFO]  Saved checkpoint to "imputer_model\model-0001.param

2020-07-03 19:31:57,104 [INFO]  Epoch[14] Time cost=0.552
2020-07-03 19:31:57,111 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-07-03 19:31:57,169 [INFO]  No improvement detected for 5 epochs compared to 15.000701332092286 last error obtained: 15.068478393554688, stopping here
2020-07-03 19:31:57,172 [INFO]  


Fitting model for column: existing_credits


2020-07-03 19:31:58,734 [INFO]  
2020-07-03 19:31:59,065 [INFO]  Epoch[0] Batch [0-23]	Speed: 1207.76 samples/sec	cross-entropy=17.229758	existing_credits-accuracy=0.000000
2020-07-03 19:31:59,529 [INFO]  Epoch[0] Train-cross-entropy=15.291499
2020-07-03 19:31:59,532 [INFO]  Epoch[0] Train-existing_credits-accuracy=0.000000
2020-07-03 19:31:59,535 [INFO]  Epoch[0] Time cost=0.793
2020-07-03 19:31:59,543 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:31:59,595 [INFO]  Epoch[0] Validation-cross-entropy=17.594420
2020-07-03 19:31:59,598 [INFO]  Epoch[0] Validation-existing_credits-accuracy=0.000000
2020-07-03 19:31:59,889 [INFO]  Epoch[1] Batch [0-23]	Speed: 1336.80 samples/sec	cross-entropy=15.192479	existing_credits-accuracy=0.000000
2020-07-03 19:32:00,296 [INFO]  Epoch[1] Train-cross-entropy=13.528103
2020-07-03 19:32:00,299 [INFO]  Epoch[1] Train-existing_credits-accuracy=0.000000
2020-07-03 19:32:00,303 [INFO]  Epoch[1] Time cost=0.701
2020-07-03 19:32:0

Fitting model for column: num_dependents


2020-07-03 19:32:06,817 [INFO]  
2020-07-03 19:32:07,145 [INFO]  Epoch[0] Batch [0-23]	Speed: 1198.00 samples/sec	cross-entropy=18.116580	num_dependents-accuracy=0.000000
2020-07-03 19:32:07,425 [INFO]  Epoch[0] Train-cross-entropy=16.237795
2020-07-03 19:32:07,428 [INFO]  Epoch[0] Train-num_dependents-accuracy=0.000000
2020-07-03 19:32:07,431 [INFO]  Epoch[0] Time cost=0.606
2020-07-03 19:32:07,439 [INFO]  Saved checkpoint to "imputer_model\model-0000.params"
2020-07-03 19:32:07,493 [INFO]  Epoch[0] Validation-cross-entropy=15.339165
2020-07-03 19:32:07,496 [INFO]  Epoch[0] Validation-num_dependents-accuracy=0.000000
2020-07-03 19:32:07,787 [INFO]  Epoch[1] Batch [0-23]	Speed: 1344.08 samples/sec	cross-entropy=15.630258	num_dependents-accuracy=0.000000
2020-07-03 19:32:08,048 [INFO]  Epoch[1] Train-cross-entropy=14.448263
2020-07-03 19:32:08,051 [INFO]  Epoch[1] Train-num_dependents-accuracy=0.000000
2020-07-03 19:32:08,054 [INFO]  Epoch[1] Time cost=0.555
2020-07-03 19:32:08,063 [INF

2020-07-03 19:32:16,125 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2020-07-03 19:32:16,178 [INFO]  Epoch[13] Validation-cross-entropy=13.931648
2020-07-03 19:32:16,181 [INFO]  Epoch[13] Validation-num_dependents-accuracy=0.000000
2020-07-03 19:32:16,463 [INFO]  Epoch[14] Batch [0-23]	Speed: 1387.17 samples/sec	cross-entropy=12.457022	num_dependents-accuracy=0.000000
2020-07-03 19:32:16,699 [INFO]  Epoch[14] Train-cross-entropy=11.985633
2020-07-03 19:32:16,702 [INFO]  Epoch[14] Train-num_dependents-accuracy=0.000000
2020-07-03 19:32:16,705 [INFO]  Epoch[14] Time cost=0.522
2020-07-03 19:32:16,713 [INFO]  Saved checkpoint to "imputer_model\model-0014.params"
2020-07-03 19:32:16,768 [INFO]  Epoch[14] Validation-cross-entropy=13.902756
2020-07-03 19:32:16,771 [INFO]  Epoch[14] Validation-num_dependents-accuracy=0.000000
2020-07-03 19:32:17,071 [INFO]  Epoch[15] Batch [0-23]	Speed: 1294.68 samples/sec	cross-entropy=12.375407	num_dependents-accuracy=0.000000
2020-07-03 19

Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x000002032F1592E8>
Imputation method: <jenga.cleaning.imputation.DatawigImputation object at 0x000002033DB6EF28>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000002033DB6E470>: 0.5329861111111112 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000002033DB6EC18>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000002033DB6E550>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000002033DBBA828>: 0.628844246031746 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Outlier detection method: <jenga.cleaning.outlier_detection.PyODIsolationForest object at 0x000002033DBBAA90>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000002033DBBA9E8>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000002033DBBAA20>: 0.6047867063492063 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Best cleaning method:
Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000002033DB6EC18>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000002033DB6E550>
Cleaning score: 0.628844246031746 



In [23]:
test_data_cleaned

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,20.915,existing paid,radio/tv,433.0000,<100,unemployed,3.0,male single,co applicant,4.0,real estate,22.0,none,rent,1.4025,skilled,1.00,none,yes
353,no checking,20.915,no credits/all paid,radio/tv,6199.0000,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,1.4025,skilled,1.15,yes,yes
537,0<=X<200,20.915,critical/other existing credit,furniture/equipment,3612.0000,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.4025,skilled,1.00,yes,yes
424,no checking,20.915,existing paid,furniture/equipment,2762.0000,no known savings,>=7,1.0,male single,none,2.0,life insurance,25.0,bank,own,1.4025,skilled,1.15,none,yes
564,no checking,20.915,delayed previously,business,4712.0000,no known savings,1<=X<4,4.0,male single,none,2.0,life insurance,37.0,bank,own,2.0000,high qualif/self emp/mgmt,1.00,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,no checking,20.915,delayed previously,business,3243.4775,100<=X<500,4<=X<7,1.0,male single,none,3.0,life insurance,31.0,none,own,2.0000,unskilled resident,1.15,yes,yes
644,no checking,20.915,critical/other existing credit,radio/tv,1880.0000,<100,4<=X<7,4.0,male single,none,1.0,life insurance,32.0,none,own,2.0000,high qualif/self emp/mgmt,1.15,none,yes
110,no checking,20.915,delayed previously,business,1449.0000,100<=X<500,>=7,1.0,male div/sep,none,2.0,car,31.0,bank,own,2.0000,skilled,2.00,none,yes
28,no checking,20.915,existing paid,radio/tv,2415.0000,<100,1<=X<4,3.0,male single,guarantor,2.0,car,34.0,none,own,1.0000,skilled,1.00,none,yes


In [24]:
summary_cleaners

[{'Outlier detection method': <jenga.cleaning.outlier_detection.NoOutlierDetection at 0x2032f1592e8>,
  'Imputation method': <jenga.cleaning.imputation.DatawigImputation at 0x2033db6ef28>,
  'PPP score with cleaning': 0.5329861111111112},
 {'Outlier detection method': <jenga.cleaning.outlier_detection.PyODKNN at 0x2033db6ec18>,
  'Imputation method': <jenga.cleaning.imputation.MeanModeImputation at 0x2033db6e550>,
  'PPP score with cleaning': 0.628844246031746},
 {'Outlier detection method': <jenga.cleaning.outlier_detection.PyODIsolationForest at 0x2033dbbaa90>,
  'Imputation method': <jenga.cleaning.imputation.MeanModeImputation at 0x2033dbba9e8>,
  'PPP score with cleaning': 0.6047867063492063}]

## Results

### Model Results

In [None]:
# model 
pipeline.fit(train_data, train_labels)

In [None]:
# original data test score
pipeline.score(test_data, test_labels)

In [None]:
# corrupted data test score
pipeline.score(test_data_corrupted, test_labels)

In [None]:
# cleaned data test score
pipeline.score(test_data_cleaned, test_labels)

### PPP Results

In [None]:
# ppp model score
ppp.predict_score_ppp(ppp_model, test_data)

In [None]:
# ppp score corrupted
score_no_cleaning

In [None]:
# ppp score cleaned
np.array(cleaner_scores_ppp).max()

In [None]:
# ppp cleaner scores
cleaner_scores_ppp

## EXTRAS

### Preprocessing Pipeline

In [11]:
## preprocessing pipeline for both numerical and categorical columns

# preprocessing pipeline for numerical columns
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns)
])

### Prediction Pipeline

In [12]:
## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

### outlier detection

In [1]:
from pyod.utils.data import generate_data, get_outliers_inliers

#generate random data with two features
X_train, Y_train = generate_data(n_train=200,train_only=True, n_features=2)

In [2]:
X_train

array([[ 8.51615306,  8.7315578 ],
       [ 6.7322496 ,  8.00917028],
       [ 8.6367426 ,  7.24556432],
       [ 8.09561334,  7.41976414],
       [ 7.4099449 ,  7.46755281],
       [ 7.14619694,  8.16099031],
       [10.03938861,  7.95722193],
       [ 7.31618559,  7.84987553],
       [ 8.46410738,  8.33567099],
       [ 7.90115358,  7.49439556],
       [ 8.30237492,  8.21427738],
       [ 8.33886135,  8.45386323],
       [ 8.10981993,  8.00756189],
       [ 8.46038572,  7.65280695],
       [ 6.10450922,  8.80721603],
       [ 8.28296169,  7.30103076],
       [ 7.57118073,  7.89521747],
       [ 7.04488375,  8.34598763],
       [ 8.09273844,  8.86217989],
       [ 8.23449151,  8.52668653],
       [ 8.17656397,  7.10061961],
       [ 7.96070463,  8.47428073],
       [ 8.06127681,  8.80215393],
       [ 7.9491775 ,  7.40272466],
       [ 9.28671322,  8.01916915],
       [ 7.26723327,  8.2732644 ],
       [ 8.28074434,  7.39919939],
       [ 7.98889556,  7.63667412],
       [ 7.88446711,

In [3]:
Y_train

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [4]:
# by default the outlier fraction is 0.1 in generate data function 
outlier_fraction = 0.1

# store outliers and inliers in different numpy arrays
x_outliers, x_inliers = get_outliers_inliers(X_train,Y_train)

In [5]:
x_outliers

array([[ 3.46638464,  1.66383774],
       [ 3.46116636, -0.24907732],
       [-1.09397821,  4.69686054],
       [ 6.92823219, -0.48389623],
       [-5.16544285,  6.62893345],
       [ 5.32046529, -1.87074669],
       [-0.02510136,  4.40765382],
       [ 1.93012975,  5.09150205],
       [-3.39643009, -2.7525707 ],
       [-3.82795609, -5.80884865],
       [-0.79745389, -7.65166313],
       [-6.90159551,  4.38607016],
       [ 3.17950698,  4.58745473],
       [ 4.48079632,  4.71797235],
       [-3.65249257, -5.25377588],
       [ 6.82601689,  1.39215629],
       [ 0.30984574, -2.43661654],
       [ 2.81480041, -1.2130262 ],
       [-7.68289149,  0.53465819],
       [-5.07744838, -2.79611131]])

In [6]:
#separate the two features and use it to plot the data 
F1 = X_train[:,[0]].reshape(-1,1)
F2 = X_train[:,[1]].reshape(-1,1)

In [10]:
from pyod.models.abod import ABOD
from pyod.models.knn import KNN

classifiers = {
     'Angle-based Outlier Detector (ABOD)'   : ABOD(contamination=outlier_fraction),
     'K Nearest Neighbors (KNN)' :  KNN(contamination=outlier_fraction)
}

In [11]:
for i, (clf_name,clf) in enumerate(classifiers.items()) :
    # fit the dataset to the model
    clf.fit(X_train)

    # predict raw anomaly score
    scores_pred = clf.decision_function(X_train)*-1

    # prediction of a datapoint category outlier or inlier
    y_pred = clf.predict(X_train)

    # no of errors in prediction
    n_errors = (y_pred != Y_train).sum()
    print('No of Errors : ',clf_name, n_errors)

No of Errors :  Angle-based Outlier Detector (ABOD) 4
No of Errors :  K Nearest Neighbors (KNN) 0


In [12]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [None]:
clf = KNN(contamination=outlier_fraction)

In [16]:
# fit the dataset to the model
clf.fit(X_train[:,[0]].reshape(-1,1))

# predict raw anomaly score
scores_pred = clf.decision_function(X_train[:,[0]].reshape(-1,1))*-1

# prediction of a datapoint category outlier or inlier
y_pred = clf.predict(X_train[:,[0]].reshape(-1,1))

In [19]:
X_train[:,[0]].reshape(-1,1)

array([[ 8.51615306],
       [ 6.7322496 ],
       [ 8.6367426 ],
       [ 8.09561334],
       [ 7.4099449 ],
       [ 7.14619694],
       [10.03938861],
       [ 7.31618559],
       [ 8.46410738],
       [ 7.90115358],
       [ 8.30237492],
       [ 8.33886135],
       [ 8.10981993],
       [ 8.46038572],
       [ 6.10450922],
       [ 8.28296169],
       [ 7.57118073],
       [ 7.04488375],
       [ 8.09273844],
       [ 8.23449151],
       [ 8.17656397],
       [ 7.96070463],
       [ 8.06127681],
       [ 7.9491775 ],
       [ 9.28671322],
       [ 7.26723327],
       [ 8.28074434],
       [ 7.98889556],
       [ 7.88446711],
       [ 8.60839795],
       [ 7.8322325 ],
       [ 8.58707181],
       [ 7.84761668],
       [ 6.68689173],
       [ 8.09209503],
       [ 8.02867887],
       [ 7.96580713],
       [ 8.24638358],
       [ 7.43085713],
       [ 8.41263073],
       [ 8.52555968],
       [ 8.39711329],
       [ 7.81648929],
       [ 6.67489617],
       [ 7.45563748],
       [ 7

In [18]:
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1])

#### Numerical column

In [17]:
df_outliers = test_data_corrupted[numerical_columns].copy()
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000
...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000


In [19]:
col = 'duration'

In [20]:
train_data[col]

675    30.0
358    12.0
159     6.0
533    24.0
678    24.0
       ... 
855    24.0
871     6.0
835    12.0
792     6.0
520    24.0
Name: duration, Length: 800, dtype: float64

In [21]:
nan_idx = test_data_corrupted[test_data_corrupted[col].isnull()].index
non_nan_idx = test_data_corrupted.loc[set(test_data_corrupted.index) - set(nan_idx)].index

print(nan_idx)
print(non_nan_idx)

Int64Index([], dtype='int64')
Int64Index([512, 515,  19,  22, 534,  24, 537,  28, 542, 543,
            ...
            982, 480, 483, 996, 489, 492, 503, 508, 509, 510],
           dtype='int64', length=200)


In [22]:
col_tr_arr = np.array(train_data[col]).reshape(-1,1)
col_corr_arr = np.array(test_data_corrupted.loc[non_nan_idx][col]).reshape(-1,1)

In [24]:
from pyod.models.knn import KNN

clf = KNN(contamination=0.1)

# fit the dataset to the model
clf.fit(col_tr_arr)

# predict raw anomaly score
scores_pred = clf.decision_function(col_corr_arr)*-1

# prediction of a datapoint category outlier or inlier
y_pred = clf.predict(col_corr_arr)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [28]:
df_outliers[col + "_outlier"] = ''
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000,
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905,
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000,
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623,
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000,
...,...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840,
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332,
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000,
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000,


In [26]:
df_outliers[col + "_outlier"].loc[non_nan_idx] = y_pred ## 0: inlier, 1: outlier

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [29]:
df_outliers.loc[non_nan_idx, col + "_outlier"] = y_pred ## 0: inlier, 1: outlier

In [222]:
df_outliers[col + "_outlier"].loc[nan_idx] = 0

In [30]:
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000,1
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905,1
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000,1
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623,1
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000,1
...,...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840,1
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332,1
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000,1
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000,1


In [225]:
print(train_data[col].describe())
print(train_data[col].median(), '\n')

print(test_data_corrupted[col].describe())
print(test_data_corrupted[col].median())

count      800.000000
mean      3243.477500
std       2819.853229
min        250.000000
25%       1354.250000
50%       2308.500000
75%       3972.250000
max      18424.000000
Name: credit_amount, dtype: float64
2308.5 

count    1.500000e+02
mean     1.268448e+06
std      1.903046e+06
min      3.390000e+02
25%      2.378500e+03
50%      1.268600e+04
75%      1.881500e+06
max      8.978000e+06
Name: credit_amount, dtype: float64
12686.0


In [226]:
test_data_corrupted[col]

249        433.0
353    6199000.0
537       3612.0
424    2762000.0
564       4712.0
         ...    
684       9857.0
644    1880000.0
110    1449000.0
28           NaN
804       7472.0
Name: credit_amount, Length: 200, dtype: float64

In [19]:
from pyod.models.knn import KNN

In [20]:
columns = train_data.columns
columns

Index(['checking_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings_status', 'employment',
       'installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'property_magnitude', 'age', 'other_payment_plans',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'],
      dtype='object')

In [21]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    df_outliers = df_corrupted[numerical_columns].copy()
    
    for col in df_train.columns:
        if col in numerical_columns:
            ## find indices of records with NaNs in col in df_corrupted
            nan_idx = df_corrupted[df_corrupted[col].isnull()].index
            non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
            
            ## pd series -> np column, needs to be 2D array
            ## taking only the non-NaN records in the corrupted data
            col_tr_arr = np.array(df_train[col]).reshape(-1,1)
            col_corr_arr = np.array(df_corrupted.loc[non_nan_idx][col]).reshape(-1,1)

            ## fit the dataset to the model
            model = pyod_model
            model.fit(col_tr_arr)

            ## predict raw anomaly score
            scores_pred = model.decision_function(col_corr_arr) * -1

            ## prediction of a datapoint category outlier or inlier
            y_pred = model.predict(col_corr_arr)

            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            df_outliers[col + "_outlier"].loc[non_nan_idx] = y_pred ## 0: inlier, 1: outlier
            df_outliers[col + "_outlier"].loc[nan_idx] = 0
            
    return df_outliers

In [22]:
df_outliers_num = num_out_detect(train_data, test_data_corrupted, KNN())
df_outliers_num

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,residence_since_outlier,age_outlier,existing_credits_outlier,num_dependents_outlier
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000,1,0,0,0,0,1,0
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905,1,0,0,0,0,1,1
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000,1,0,0,0,0,1,0
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623,1,0,0,0,0,1,1
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840,1,1,0,0,0,0,1
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332,1,0,0,0,0,0,1
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000,1,0,0,0,0,0,0
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000,1,0,0,0,0,0,0


#### Categorical column

In [None]:
## unlike numerical columns, we can't use PyOD here
## take the unique values from train
## find the values in corrupted that don't belong to the unique from the train
## mark as outlier

In [120]:
vals_train_unique = train_data['property_magnitude'].unique()
vals_train_unique

[car, real estate, life insurance, no known property]
Categories (4, object): [real estate < life insurance < car < no known property]

In [121]:
test_data_corrupted['property_magnitude']

249              none
353              none
537    life insurance
424    life insurance
564    life insurance
            ...      
684    life insurance
644              none
110               car
28               none
804       real estate
Name: property_magnitude, Length: 200, dtype: object

In [None]:
## the values in corrupted that don't belong to 'vals_train_unique'

In [123]:
test_data_corrupted['property_magnitude_outlier'] = ''

for i in test_data_corrupted['property_magnitude'].index:
    if test_data_corrupted['property_magnitude'].loc[i] in vals_train_unique:
        test_data_corrupted['property_magnitude_outlier'].loc[i] = 0
    else:
        test_data_corrupted['property_magnitude_outlier'].loc[i] = 1

In [124]:
test_data_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,property_magnitude_outlier
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,...,none,22.0,real estate,rent,1.0,skilled,1.000000,none,yes,1
353,<0,12000.0,no credits/all paid,radio/tv,6199000.0,<100,,4.0,male single,none,...,none,28.0,life insurance,rent,2.0,skilled,0.726067,yes,yes,1
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,,,female div/dep/mar,none,...,life insurance,37.0,none,own,1.0,skilled,-0.749838,yes,yes,0
424,0<=X<200,12000.0,existing paid,furniture/equipment,2762000.0,no known savings,>=7,,female div/dep/mar,none,...,life insurance,25.0,bank,own,1.0,skilled,3.433997,yes,yes,0
564,0<=X<200,24000.0,delayed previously,business,4712.0,no known savings,,4.0,male single,none,...,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,0.030199,yes,yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,,1.0,male single,none,...,life insurance,31.0,none,own,2.0,unskilled resident,2.000000,yes,yes,0
644,<0,18.0,critical/other existing credit,radio/tv,1880000.0,<100,,,male mar/wid,none,...,none,32.0,life insurance,own,2.0,high qualif/self emp/mgmt,2.875736,yes,yes,1
110,0<=X<200,6.0,delayed previously,business,1449000.0,100<=X<500,,,male div/sep,none,...,car,31.0,bank,own,2.0,skilled,2.047525,none,yes,0
28,0<=X<200,7000.0,existing paid,radio/tv,,<100,,3.0,male single,guarantor,...,none,34.0,real estate,own,1.0,skilled,1.526985,none,yes,1


In [23]:
def cat_out_detect(df_train, df_corrupted):
    df_outliers = df_corrupted[categorical_columns].copy()
    
    for col in df_train.columns:
        if col in categorical_columns:
            vals_train_unique = df_train[col].unique()

            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            
            for i in df_corrupted[col].index:
                if df_corrupted[col].loc[i] in vals_train_unique:
                    df_outliers[col + "_outlier"].loc[i] = 0
                else:
                    df_outliers[col + "_outlier"].loc[i] = 1
            
    return df_outliers

In [24]:
df_outliers_cat = cat_out_detect(train_data, test_data_corrupted)
df_outliers_cat

Unnamed: 0,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,,existing paid,radio/tv,<100,unemployed,none,co applicant,,none,rent,...,0,0,1,0,1,0,0,0,1,0
353,,no credits/all paid,radio/tv,<100,1<=X<4,male single,none,life insurance,none,rent,...,0,0,0,0,0,0,0,0,0,0
537,,critical/other existing credit,furniture/equipment,<100,>=7,female div/dep/mar,none,life insurance,none,own,...,0,0,0,0,0,0,0,0,0,0
424,,existing paid,furniture/equipment,no known savings,>=7,yes,none,life insurance,bank,own,...,0,0,1,0,0,0,0,0,1,0
564,,delayed previously,business,no known savings,1<=X<4,yes,none,life insurance,bank,own,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,,delayed previously,business,100<=X<500,4<=X<7,male single,none,life insurance,none,own,...,0,0,0,0,0,0,0,0,0,0
644,,critical/other existing credit,radio/tv,<100,4<=X<7,yes,none,life insurance,none,own,...,0,0,1,0,0,0,0,0,1,0
110,,delayed previously,business,100<=X<500,>=7,male div/sep,none,car,bank,own,...,0,0,0,0,0,0,0,0,0,0
28,,existing paid,radio/tv,<100,1<=X<4,male single,guarantor,real estate,none,own,...,0,0,0,0,0,0,0,0,0,0


In [None]:
## joining the two outlier dfs (inner join on index)

In [270]:
df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0,0,0,...,0,0,0,0,1,1,0,0,0,0
353,12000.0,6199000.0,4.0,2.0,28.0,2.0,0.726067,1,1,0,...,0,1,0,0,1,1,0,0,0,0
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0,0,0,...,0,1,0,0,0,0,0,0,0,0
424,12000.0,2762000.0,,2000.0,25.0,1.0,3.433997,1,1,0,...,0,0,0,0,0,0,0,0,0,0
564,24000.0,4712.0,4.0,2000.0,37.0,2.0,0.030199,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,0,1,0,...,0,1,0,0,0,0,0,0,0,0
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,0,1,0,...,0,1,0,0,1,1,0,0,0,0
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,0,1,0,...,0,1,0,0,0,0,0,0,0,0
28,7000.0,,3.0,2000.0,34.0,1.0,1.526985,1,0,0,...,0,1,0,0,1,1,0,0,0,0


In [None]:
## where the corresponding outlier column is 1, set the original value to NaN

In [259]:
col = 'duration'

In [260]:
df_outliers[['duration', 'duration_outlier']]

Unnamed: 0,duration,duration_outlier
249,18.0,0
353,12000.0,1
537,18.0,0
424,12000.0,1
564,24000.0,1
...,...,...
684,36.0,0
644,18.0,0
110,6.0,0
28,7000.0,1


In [261]:
for i in df_outliers.index:
    if df_outliers[col + "_outlier"].loc[i] == 1:
        df_outliers[col].loc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [262]:
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0,0,0,...,0,0,0,0,1,1,0,0,0,0
353,,6199000.0,4.0,2.0,28.0,2.0,0.726067,1,1,0,...,0,1,0,0,1,1,0,0,0,0
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0,0,0,...,0,1,0,0,0,0,0,0,0,0
424,,2762000.0,,2000.0,25.0,1.0,3.433997,1,1,0,...,0,0,0,0,0,0,0,0,0,0
564,,4712.0,4.0,2000.0,37.0,2.0,0.030199,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,0,1,0,...,0,1,0,0,0,0,0,0,0,0
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,0,1,0,...,0,1,0,0,1,1,0,0,0,0
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,0,1,0,...,0,1,0,0,0,0,0,0,0,0
28,,,3.0,2000.0,34.0,1.0,1.526985,1,0,0,...,0,1,0,0,1,1,0,0,0,0


In [271]:
for col in columns:
    for i in df_outliers.index:
        if df_outliers[col + "_outlier"].loc[i] == 1:
            df_outliers[col].loc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [272]:
df_outliers[columns]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,,,22.0,,rent,1.0,skilled,1.0,none,yes
353,<0,,no credits/all paid,radio/tv,,<100,,4.0,male single,none,2.0,,28.0,,rent,2.0,skilled,,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,,,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,,yes,yes
424,0<=X<200,,existing paid,furniture/equipment,,no known savings,>=7,,female div/dep/mar,none,,life insurance,25.0,bank,own,1.0,skilled,,yes,yes
564,0<=X<200,,delayed previously,business,4712.0,no known savings,,4.0,male single,none,,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,,100<=X<500,,1.0,male single,none,,life insurance,31.0,none,own,2.0,unskilled resident,2.0,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,,<100,,,male mar/wid,none,,,32.0,,own,2.0,high qualif/self emp/mgmt,,yes,yes
110,0<=X<200,6.0,delayed previously,business,,100<=X<500,,,male div/sep,none,,car,31.0,bank,own,2.0,skilled,,none,yes
28,0<=X<200,,existing paid,radio/tv,,<100,,3.0,male single,guarantor,,,34.0,,own,1.0,skilled,,none,yes


In [19]:
pyod_knn = PyODKNN(train_data, test_data_corrupted, categorical_columns, numerical_columns)
df_outliers = pyod_knn.fit_transform(train_data, test_data_corrupted)
df_outliers

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,,existing paid,radio/tv,433.0,<100,unemployed,3.0,,co applicant,4.0,real estate,22.0,none,rent,,skilled,1.0,,yes
353,,,no credits/all paid,radio/tv,6199.0,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,,skilled,,yes,yes
537,,,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,,skilled,1.0,yes,yes
424,,,existing paid,furniture/equipment,2762.0,no known savings,>=7,1.0,,none,2.0,life insurance,25.0,bank,own,,skilled,,,yes
564,,,delayed previously,business,4712.0,no known savings,1<=X<4,4.0,,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.0,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,,,delayed previously,business,,100<=X<500,4<=X<7,1.0,male single,none,3.0,life insurance,31.0,none,own,2.0,unskilled resident,,yes,yes
644,,,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,,none,1.0,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,,,yes
110,,,delayed previously,business,1449.0,100<=X<500,>=7,1.0,male div/sep,none,2.0,,31.0,bank,own,2.0,skilled,2.0,none,yes
28,,,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.0,none,yes


In [20]:
test_data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,4.0,real estate,22.0,none,rent,1.0,skilled,1.0,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,2.0,skilled,1.0,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,1.0,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,no known savings,>=7,1.0,female div/dep/mar,none,2.0,life insurance,25.0,bank,own,1.0,skilled,1.0,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.0,male single,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,4<=X<7,1.0,male single,none,3.0,life insurance,31.0,none,own,2.0,unskilled resident,2.0,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,male mar/wid,none,1.0,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
110,0<=X<200,6.0,delayed previously,business,1449.0,100<=X<500,>=7,1.0,male div/sep,none,2.0,car,31.0,bank,own,2.0,skilled,2.0,none,yes
28,0<=X<200,7.0,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.0,none,yes


In [21]:
pyod_iforest = PyODIsolationForest(train_data, test_data_corrupted, categorical_columns, numerical_columns)
df_outliers = pyod_iforest.fit_transform(train_data, test_data_corrupted)
df_outliers

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,,existing paid,radio/tv,,<100,unemployed,3.0,,co applicant,4.0,real estate,,none,rent,,skilled,1.000000,,yes
353,,,no credits/all paid,radio/tv,,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,,skilled,,yes,yes
537,,,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,,skilled,1.000000,yes,yes
424,,,existing paid,furniture/equipment,2762.0,no known savings,>=7,,,none,2.0,life insurance,25.0,bank,own,,skilled,0.900623,,yes
564,,,delayed previously,business,,no known savings,1<=X<4,4.0,,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.000000,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,,,delayed previously,business,,100<=X<500,4<=X<7,,male single,none,3.0,life insurance,31.0,none,own,2.0,unskilled resident,,yes,yes
644,,,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,,none,,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,0.335332,,yes
110,,,delayed previously,business,1449.0,100<=X<500,>=7,,male div/sep,none,2.0,,31.0,bank,own,2.0,skilled,,none,yes
28,,,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.000000,none,yes


In [None]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

In [None]:
test_data_corrupted[numerical_columns]

In [None]:
imp = IterativeImputer(random_state=seed)
imp.fit(train_data[numerical_columns])

x = pd.DataFrame(imp.transform(test_data_corrupted[numerical_columns]))
x.columns = test_data_corrupted[numerical_columns].columns
x.index = test_data_corrupted[numerical_columns].index
x

In [None]:
imp_ = IterativeImputer(transformers=feature_transform)
imp_.fit(train_data[numerical_columns])

xx = pd.DataFrame(imp_.transform(test_data_corrupted[categorical_columns]))
xx.columns = test_data_corrupted[categorical_columns].columns
xx.index = test_data_corrupted[categorical_columns].index
xx

In [20]:
test_data_corrupted['purpose'][test_data_corrupted['purpose'].isnull()].index

Int64Index([659, 944, 845, 277, 218, 171, 334, 539, 953, 305, 604, 663, 387,
            482, 248, 628, 298, 448, 271, 700, 898, 614, 339, 707, 326, 795,
            837, 897, 233, 723, 155, 824,  92, 601, 335, 793, 295,   6, 261,
            172, 408, 444, 930,  34, 124, 176, 750, 299,  31, 576],
           dtype='int64')

In [21]:
test_data_corrupted['purpose'].value_counts()

radio/tv               44
new car                43
used car               21
business               16
education              12
repairs                 4
other                   3
retraining              3
domestic appliance      2
furniture/equipment     2
vacation                0
Name: purpose, dtype: int64