## Adjustment for Google Colab

In [None]:
# mount drive for access to the
# from google.colab import drive

In [None]:
# drive.mount("/content/drive")

In [None]:
# all the drive the files are present in "/content/drive/My Drive"
# !ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

In [None]:
# import sys
# sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

In [None]:
#! pip install openml

In [None]:
# !pip freeze | grep sklearn

In [None]:
# !pip install --upgrade sklearn

In [20]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [21]:
seed = 100

## Dataset

In [22]:
dataset = Dataset(seed, "credit-g")

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    

**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)



**German Credit data**  

This dataset classifies people described by a set of attributes as good or bad credit risks.



This dataset comes with a cost matrix: 

``` 

      Good  Bad (predicted)  

Good   0    1   (actual)  

Bad    5    0  

```



It is worse to class a customer as go

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


In [23]:
all_data = dataset.all_data
# all_data

In [24]:
attribute_names = dataset.attribute_names
# attribute_names

In [25]:
attribute_types = dataset.attribute_types
# attribute_types

### Categorical and Numerical Features

In [26]:
categorical_columns = dataset.categorical_columns
# categorical_columns

In [27]:
numerical_columns = dataset.numerical_columns
# numerical_columns

In [28]:
print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features")

Found 13 categorical and 7 numeric features


## Model

### Model parameters

In [29]:
## model parameters
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

### Preprocessing Pipeline

In [30]:
## preprocessing pipeline for both numerical and categorical columns

# preprocessing pipeline for numerical columns
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns)
])

### Prediction Pipeline

In [31]:
## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

### Train and Test Data

In [32]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

# display(train_data.head())
# print(train_labels[0:5])

# display(test_data.head())
# print(test_labels[0:5])

## Corruptions

In [80]:
corruptions = [MissingValues, Scaling, SwappedValues, GaussianNoise, MissingValues, MissingValues, Scaling, Scaling]

In [81]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid, corruptions)

In [83]:
# generate corrpted test data
test_data_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions)

Generating corrupted training data on 200 rows...
Applying perturbations...
MissingValues: {'column': 'credit_amount', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.75}
SwappedValues: {'column_a': 'other_payment_plans', 'column_b': 'property_magnitude', 'fraction': 0.5}
GaussianNoise: {'column': 'num_dependents', 'fraction': 0.75}
MissingValues: {'column': 'employment', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MAR'}
MissingValues: {'column': 'installment_commitment', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
Scaling: {'column': 'credit_amount', 'fraction': 0.5}
Scaling: {'column': 'duration', 'fraction': 0.5}


In [84]:
summary_col_corrupt

defaultdict(list,
            {('credit_amount',): [<jenga.corruptions.generic.MissingValues at 0x1d992ec9ba8>,
              <jenga.corruptions.numerical.Scaling at 0x1d9929fa400>],
             ('residence_since',): [<jenga.corruptions.numerical.Scaling at 0x1d992124e80>],
             ('other_payment_plans',
              'property_magnitude'): [<jenga.corruptions.generic.SwappedValues at 0x1d9929fa208>],
             ('num_dependents',): [<jenga.corruptions.numerical.GaussianNoise at 0x1d9920f5240>],
             ('employment',): [<jenga.corruptions.generic.MissingValues at 0x1d992209b38>],
             ('installment_commitment',): [<jenga.corruptions.generic.MissingValues at 0x1d992ec9f28>],
             ('duration',): [<jenga.corruptions.numerical.Scaling at 0x1d9921caa20>]})

In [85]:
test_data_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,4000.0,none,22.0,real estate,rent,1.0,skilled,1.000000,none,yes
353,<0,12000.0,no credits/all paid,radio/tv,6199000.0,<100,,4.0,male single,none,2.0,none,28.0,life insurance,rent,2.0,skilled,0.726067,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,,,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,-0.749838,yes,yes
424,0<=X<200,12000.0,existing paid,furniture/equipment,2762000.0,no known savings,>=7,,female div/dep/mar,none,2000.0,life insurance,25.0,bank,own,1.0,skilled,3.433997,yes,yes
564,0<=X<200,24000.0,delayed previously,business,4712.0,no known savings,,4.0,male single,none,2000.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,0.030199,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,,1.0,male single,none,3000.0,life insurance,31.0,none,own,2.0,unskilled resident,2.000000,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,1880000.0,<100,,,male mar/wid,none,1000.0,none,32.0,life insurance,own,2.0,high qualif/self emp/mgmt,2.875736,yes,yes
110,0<=X<200,6.0,delayed previously,business,1449000.0,100<=X<500,,,male div/sep,none,2000.0,car,31.0,bank,own,2.0,skilled,2.047525,none,yes
28,0<=X<200,7000.0,existing paid,radio/tv,,<100,,3.0,male single,guarantor,2000.0,none,34.0,real estate,own,1.0,skilled,1.526985,none,yes


## Cleaning

### outlier detection

In [1]:
from pyod.utils.data import generate_data, get_outliers_inliers

#generate random data with two features
X_train, Y_train = generate_data(n_train=200,train_only=True, n_features=2)

In [2]:
X_train

array([[ 8.51615306,  8.7315578 ],
       [ 6.7322496 ,  8.00917028],
       [ 8.6367426 ,  7.24556432],
       [ 8.09561334,  7.41976414],
       [ 7.4099449 ,  7.46755281],
       [ 7.14619694,  8.16099031],
       [10.03938861,  7.95722193],
       [ 7.31618559,  7.84987553],
       [ 8.46410738,  8.33567099],
       [ 7.90115358,  7.49439556],
       [ 8.30237492,  8.21427738],
       [ 8.33886135,  8.45386323],
       [ 8.10981993,  8.00756189],
       [ 8.46038572,  7.65280695],
       [ 6.10450922,  8.80721603],
       [ 8.28296169,  7.30103076],
       [ 7.57118073,  7.89521747],
       [ 7.04488375,  8.34598763],
       [ 8.09273844,  8.86217989],
       [ 8.23449151,  8.52668653],
       [ 8.17656397,  7.10061961],
       [ 7.96070463,  8.47428073],
       [ 8.06127681,  8.80215393],
       [ 7.9491775 ,  7.40272466],
       [ 9.28671322,  8.01916915],
       [ 7.26723327,  8.2732644 ],
       [ 8.28074434,  7.39919939],
       [ 7.98889556,  7.63667412],
       [ 7.88446711,

In [3]:
Y_train

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [4]:
# by default the outlier fraction is 0.1 in generate data function 
outlier_fraction = 0.1

# store outliers and inliers in different numpy arrays
x_outliers, x_inliers = get_outliers_inliers(X_train,Y_train)

In [5]:
x_outliers

array([[ 3.46638464,  1.66383774],
       [ 3.46116636, -0.24907732],
       [-1.09397821,  4.69686054],
       [ 6.92823219, -0.48389623],
       [-5.16544285,  6.62893345],
       [ 5.32046529, -1.87074669],
       [-0.02510136,  4.40765382],
       [ 1.93012975,  5.09150205],
       [-3.39643009, -2.7525707 ],
       [-3.82795609, -5.80884865],
       [-0.79745389, -7.65166313],
       [-6.90159551,  4.38607016],
       [ 3.17950698,  4.58745473],
       [ 4.48079632,  4.71797235],
       [-3.65249257, -5.25377588],
       [ 6.82601689,  1.39215629],
       [ 0.30984574, -2.43661654],
       [ 2.81480041, -1.2130262 ],
       [-7.68289149,  0.53465819],
       [-5.07744838, -2.79611131]])

In [6]:
#separate the two features and use it to plot the data 
F1 = X_train[:,[0]].reshape(-1,1)
F2 = X_train[:,[1]].reshape(-1,1)

In [10]:
from pyod.models.abod import ABOD
from pyod.models.knn import KNN

classifiers = {
     'Angle-based Outlier Detector (ABOD)'   : ABOD(contamination=outlier_fraction),
     'K Nearest Neighbors (KNN)' :  KNN(contamination=outlier_fraction)
}

In [11]:
for i, (clf_name,clf) in enumerate(classifiers.items()) :
    # fit the dataset to the model
    clf.fit(X_train)

    # predict raw anomaly score
    scores_pred = clf.decision_function(X_train)*-1

    # prediction of a datapoint category outlier or inlier
    y_pred = clf.predict(X_train)

    # no of errors in prediction
    n_errors = (y_pred != Y_train).sum()
    print('No of Errors : ',clf_name, n_errors)

No of Errors :  Angle-based Outlier Detector (ABOD) 4
No of Errors :  K Nearest Neighbors (KNN) 0


In [12]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [None]:
clf = KNN(contamination=outlier_fraction)

In [16]:
# fit the dataset to the model
clf.fit(X_train[:,[0]].reshape(-1,1))

# predict raw anomaly score
scores_pred = clf.decision_function(X_train[:,[0]].reshape(-1,1))*-1

# prediction of a datapoint category outlier or inlier
y_pred = clf.predict(X_train[:,[0]].reshape(-1,1))

In [19]:
X_train[:,[0]].reshape(-1,1)

array([[ 8.51615306],
       [ 6.7322496 ],
       [ 8.6367426 ],
       [ 8.09561334],
       [ 7.4099449 ],
       [ 7.14619694],
       [10.03938861],
       [ 7.31618559],
       [ 8.46410738],
       [ 7.90115358],
       [ 8.30237492],
       [ 8.33886135],
       [ 8.10981993],
       [ 8.46038572],
       [ 6.10450922],
       [ 8.28296169],
       [ 7.57118073],
       [ 7.04488375],
       [ 8.09273844],
       [ 8.23449151],
       [ 8.17656397],
       [ 7.96070463],
       [ 8.06127681],
       [ 7.9491775 ],
       [ 9.28671322],
       [ 7.26723327],
       [ 8.28074434],
       [ 7.98889556],
       [ 7.88446711],
       [ 8.60839795],
       [ 7.8322325 ],
       [ 8.58707181],
       [ 7.84761668],
       [ 6.68689173],
       [ 8.09209503],
       [ 8.02867887],
       [ 7.96580713],
       [ 8.24638358],
       [ 7.43085713],
       [ 8.41263073],
       [ 8.52555968],
       [ 8.39711329],
       [ 7.81648929],
       [ 6.67489617],
       [ 7.45563748],
       [ 7

In [18]:
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1])

#### Numerical column

In [214]:
df_outliers = test_data_corrupted[numerical_columns].copy()
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000
353,12000.0,6199000.0,4.0,2.0,28.0,2.0,0.726067
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838
424,12000.0,2762000.0,,2000.0,25.0,1.0,3.433997
564,24000.0,4712.0,4.0,2000.0,37.0,2.0,0.030199
...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525
28,7000.0,,3.0,2000.0,34.0,1.0,1.526985


In [215]:
col = 'credit_amount'

In [216]:
train_data[col]

675    4530.0
358     776.0
159    1898.0
533    1311.0
678    2384.0
        ...  
855    1474.0
871    1343.0
835    1082.0
792    1221.0
520    5507.0
Name: credit_amount, Length: 800, dtype: float64

In [217]:
nan_idx = test_data_corrupted[test_data_corrupted[col].isnull()].index
non_nan_idx = test_data_corrupted.loc[set(test_data_corrupted.index) - set(nan_idx)].index

print(nan_idx)
print(non_nan_idx)

Int64Index([633, 672, 395, 330, 879, 562, 949, 312, 155, 489, 543, 512,  24,
            402, 219, 263, 780, 567, 733,  19, 381, 114, 559, 134, 630, 833,
            461, 143, 205, 980, 306, 117, 107, 731, 106, 864, 832, 881, 646,
            374, 865, 874, 457,  72, 551, 718, 277,  73, 156,  28],
           dtype='int64')
Int64Index([515,  22, 534, 537, 542, 547,  40, 553,  41,  43,
            ...
            467, 982, 480, 483, 996, 492, 503, 508, 509, 510],
           dtype='int64', length=150)


In [218]:
col_tr_arr = np.array(train_data[col]).reshape(-1,1)
col_corr_arr = np.array(test_data_corrupted.loc[non_nan_idx][col]).reshape(-1,1)

In [219]:
from pyod.models.knn import KNN

clf = KNN(contamination=outlier_fraction)

# fit the dataset to the model
clf.fit(col_tr_arr)

# predict raw anomaly score
scores_pred = clf.decision_function(col_corr_arr)*-1

# prediction of a datapoint category outlier or inlier
y_pred = clf.predict(col_corr_arr)
y_pred

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0])

In [220]:
df_outliers[col + "_outlier"] = ''
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,credit_amount_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,
353,12000.0,6199000.0,4.0,2.0,28.0,2.0,0.726067,
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,
424,12000.0,2762000.0,,2000.0,25.0,1.0,3.433997,
564,24000.0,4712.0,4.0,2000.0,37.0,2.0,0.030199,
...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,
28,7000.0,,3.0,2000.0,34.0,1.0,1.526985,


In [221]:
df_outliers[col + "_outlier"].loc[non_nan_idx] = y_pred ## 0: inlier, 1: outlier

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [222]:
df_outliers[col + "_outlier"].loc[nan_idx] = 0

In [223]:
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,credit_amount_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0
353,12000.0,6199000.0,4.0,2.0,28.0,2.0,0.726067,1
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0
424,12000.0,2762000.0,,2000.0,25.0,1.0,3.433997,1
564,24000.0,4712.0,4.0,2000.0,37.0,2.0,0.030199,0
...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,1
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,1
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,1
28,7000.0,,3.0,2000.0,34.0,1.0,1.526985,0


In [225]:
print(train_data[col].describe())
print(train_data[col].median(), '\n')

print(test_data_corrupted[col].describe())
print(test_data_corrupted[col].median())

count      800.000000
mean      3243.477500
std       2819.853229
min        250.000000
25%       1354.250000
50%       2308.500000
75%       3972.250000
max      18424.000000
Name: credit_amount, dtype: float64
2308.5 

count    1.500000e+02
mean     1.268448e+06
std      1.903046e+06
min      3.390000e+02
25%      2.378500e+03
50%      1.268600e+04
75%      1.881500e+06
max      8.978000e+06
Name: credit_amount, dtype: float64
12686.0


In [226]:
test_data_corrupted[col]

249        433.0
353    6199000.0
537       3612.0
424    2762000.0
564       4712.0
         ...    
684       9857.0
644    1880000.0
110    1449000.0
28           NaN
804       7472.0
Name: credit_amount, Length: 200, dtype: float64

In [269]:
columns = train_data.columns
columns

Index(['checking_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings_status', 'employment',
       'installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'property_magnitude', 'age', 'other_payment_plans',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'],
      dtype='object')

In [240]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    df_outliers = df_corrupted[numerical_columns].copy()
    
    for col in df_train.columns:
        if col in numerical_columns:
            ## find indices of records with NaNs in col in df_corrupted
            nan_idx = df_corrupted[df_corrupted[col].isnull()].index
            non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
            
            ## pd series -> np column, needs to be 2D array
            ## taking only the non-NaN records in the corrupted data
            col_tr_arr = np.array(df_train[col]).reshape(-1,1)
            col_corr_arr = np.array(df_corrupted.loc[non_nan_idx][col]).reshape(-1,1)

            ## fit the dataset to the model
            model = pyod_model
            model.fit(col_tr_arr)

            ## predict raw anomaly score
            scores_pred = model.decision_function(col_corr_arr) * -1

            ## prediction of a datapoint category outlier or inlier
            y_pred = model.predict(col_corr_arr)

            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            df_outliers[col + "_outlier"].loc[non_nan_idx] = y_pred ## 0: inlier, 1: outlier
            df_outliers[col + "_outlier"].loc[nan_idx] = 0
            
    return df_outliers

In [241]:
df_outliers_num = num_out_detect(train_data, test_data_corrupted, KNN())
df_outliers_num

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,residence_since_outlier,age_outlier,existing_credits_outlier,num_dependents_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0,0,0,1,0,0,0
353,12000.0,6199000.0,4.0,2.0,28.0,2.0,0.726067,1,1,0,0,0,0,1
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0,0,0,0,0,0,1
424,12000.0,2762000.0,,2000.0,25.0,1.0,3.433997,1,1,0,1,0,0,1
564,24000.0,4712.0,4.0,2000.0,37.0,2.0,0.030199,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,0,1,0,1,0,0,0
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,0,1,0,1,0,0,1
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,0,1,0,1,0,0,1
28,7000.0,,3.0,2000.0,34.0,1.0,1.526985,1,0,0,1,0,0,1


#### Categorical column

In [None]:
## unlike numerical columns, we can't use PyOD here
## take the unique values from train
## find the values in corrupted that don't belong to the unique from the train
## mark as outlier

In [120]:
vals_train_unique = train_data['property_magnitude'].unique()
vals_train_unique

[car, real estate, life insurance, no known property]
Categories (4, object): [real estate < life insurance < car < no known property]

In [121]:
test_data_corrupted['property_magnitude']

249              none
353              none
537    life insurance
424    life insurance
564    life insurance
            ...      
684    life insurance
644              none
110               car
28               none
804       real estate
Name: property_magnitude, Length: 200, dtype: object

In [None]:
## the values in corrupted that don't belong to 'vals_train_unique'

In [123]:
test_data_corrupted['property_magnitude_outlier'] = ''

for i in test_data_corrupted['property_magnitude'].index:
    if test_data_corrupted['property_magnitude'].loc[i] in vals_train_unique:
        test_data_corrupted['property_magnitude_outlier'].loc[i] = 0
    else:
        test_data_corrupted['property_magnitude_outlier'].loc[i] = 1

In [124]:
test_data_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,property_magnitude_outlier
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,...,none,22.0,real estate,rent,1.0,skilled,1.000000,none,yes,1
353,<0,12000.0,no credits/all paid,radio/tv,6199000.0,<100,,4.0,male single,none,...,none,28.0,life insurance,rent,2.0,skilled,0.726067,yes,yes,1
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,,,female div/dep/mar,none,...,life insurance,37.0,none,own,1.0,skilled,-0.749838,yes,yes,0
424,0<=X<200,12000.0,existing paid,furniture/equipment,2762000.0,no known savings,>=7,,female div/dep/mar,none,...,life insurance,25.0,bank,own,1.0,skilled,3.433997,yes,yes,0
564,0<=X<200,24000.0,delayed previously,business,4712.0,no known savings,,4.0,male single,none,...,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,0.030199,yes,yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,,1.0,male single,none,...,life insurance,31.0,none,own,2.0,unskilled resident,2.000000,yes,yes,0
644,<0,18.0,critical/other existing credit,radio/tv,1880000.0,<100,,,male mar/wid,none,...,none,32.0,life insurance,own,2.0,high qualif/self emp/mgmt,2.875736,yes,yes,1
110,0<=X<200,6.0,delayed previously,business,1449000.0,100<=X<500,,,male div/sep,none,...,car,31.0,bank,own,2.0,skilled,2.047525,none,yes,0
28,0<=X<200,7000.0,existing paid,radio/tv,,<100,,3.0,male single,guarantor,...,none,34.0,real estate,own,1.0,skilled,1.526985,none,yes,1


In [236]:
def cat_out_detect(df_train, df_corrupted):
    df_outliers = df_corrupted[categorical_columns].copy()
    
    for col in df_train.columns:
        if col in categorical_columns:
            vals_train_unique = df_train[col].unique()

            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            
            for i in df_corrupted[col].index:
                if df_corrupted[col].loc[i] in vals_train_unique:
                    df_outliers[col + "_outlier"].loc[i] = 0
                else:
                    df_outliers[col + "_outlier"].loc[i] = 1
            
    return df_outliers

In [242]:
df_outliers_cat = cat_out_detect(train_data, test_data_corrupted)
df_outliers_cat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,no checking,existing paid,radio/tv,<100,unemployed,female div/dep/mar,co applicant,none,real estate,rent,...,0,0,0,0,1,1,0,0,0,0
353,<0,no credits/all paid,radio/tv,<100,,male single,none,none,life insurance,rent,...,0,1,0,0,1,1,0,0,0,0
537,0<=X<200,critical/other existing credit,furniture/equipment,<100,,female div/dep/mar,none,life insurance,none,own,...,0,1,0,0,0,0,0,0,0,0
424,0<=X<200,existing paid,furniture/equipment,no known savings,>=7,female div/dep/mar,none,life insurance,bank,own,...,0,0,0,0,0,0,0,0,0,0
564,0<=X<200,delayed previously,business,no known savings,,male single,none,life insurance,bank,own,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,delayed previously,business,100<=X<500,,male single,none,life insurance,none,own,...,0,1,0,0,0,0,0,0,0,0
644,<0,critical/other existing credit,radio/tv,<100,,male mar/wid,none,none,life insurance,own,...,0,1,0,0,1,1,0,0,0,0
110,0<=X<200,delayed previously,business,100<=X<500,,male div/sep,none,car,bank,own,...,0,1,0,0,0,0,0,0,0,0
28,0<=X<200,existing paid,radio/tv,<100,,male single,guarantor,none,real estate,own,...,0,1,0,0,1,1,0,0,0,0


In [None]:
## joining the two outlier dfs (inner join on index)

In [270]:
df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0,0,0,...,0,0,0,0,1,1,0,0,0,0
353,12000.0,6199000.0,4.0,2.0,28.0,2.0,0.726067,1,1,0,...,0,1,0,0,1,1,0,0,0,0
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0,0,0,...,0,1,0,0,0,0,0,0,0,0
424,12000.0,2762000.0,,2000.0,25.0,1.0,3.433997,1,1,0,...,0,0,0,0,0,0,0,0,0,0
564,24000.0,4712.0,4.0,2000.0,37.0,2.0,0.030199,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,0,1,0,...,0,1,0,0,0,0,0,0,0,0
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,0,1,0,...,0,1,0,0,1,1,0,0,0,0
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,0,1,0,...,0,1,0,0,0,0,0,0,0,0
28,7000.0,,3.0,2000.0,34.0,1.0,1.526985,1,0,0,...,0,1,0,0,1,1,0,0,0,0


In [None]:
## where the corresponding outlier column is 1, set the original value to NaN

In [259]:
col = 'duration'

In [260]:
df_outliers[['duration', 'duration_outlier']]

Unnamed: 0,duration,duration_outlier
249,18.0,0
353,12000.0,1
537,18.0,0
424,12000.0,1
564,24000.0,1
...,...,...
684,36.0,0
644,18.0,0
110,6.0,0
28,7000.0,1


In [261]:
for i in df_outliers.index:
    if df_outliers[col + "_outlier"].loc[i] == 1:
        df_outliers[col].loc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [262]:
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0,0,0,...,0,0,0,0,1,1,0,0,0,0
353,,6199000.0,4.0,2.0,28.0,2.0,0.726067,1,1,0,...,0,1,0,0,1,1,0,0,0,0
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0,0,0,...,0,1,0,0,0,0,0,0,0,0
424,,2762000.0,,2000.0,25.0,1.0,3.433997,1,1,0,...,0,0,0,0,0,0,0,0,0,0
564,,4712.0,4.0,2000.0,37.0,2.0,0.030199,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,0,1,0,...,0,1,0,0,0,0,0,0,0,0
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,0,1,0,...,0,1,0,0,1,1,0,0,0,0
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,0,1,0,...,0,1,0,0,0,0,0,0,0,0
28,,,3.0,2000.0,34.0,1.0,1.526985,1,0,0,...,0,1,0,0,1,1,0,0,0,0


In [271]:
for col in columns:
    for i in df_outliers.index:
        if df_outliers[col + "_outlier"].loc[i] == 1:
            df_outliers[col].loc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [272]:
df_outliers[columns]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,,,22.0,,rent,1.0,skilled,1.0,none,yes
353,<0,,no credits/all paid,radio/tv,,<100,,4.0,male single,none,2.0,,28.0,,rent,2.0,skilled,,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,,,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,,yes,yes
424,0<=X<200,,existing paid,furniture/equipment,,no known savings,>=7,,female div/dep/mar,none,,life insurance,25.0,bank,own,1.0,skilled,,yes,yes
564,0<=X<200,,delayed previously,business,4712.0,no known savings,,4.0,male single,none,,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,,100<=X<500,,1.0,male single,none,,life insurance,31.0,none,own,2.0,unskilled resident,2.0,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,,<100,,,male mar/wid,none,,,32.0,,own,2.0,high qualif/self emp/mgmt,,yes,yes
110,0<=X<200,6.0,delayed previously,business,,100<=X<500,,,male div/sep,none,,car,31.0,bank,own,2.0,skilled,,none,yes
28,0<=X<200,,existing paid,radio/tv,,<100,,3.0,male single,guarantor,,,34.0,,own,1.0,skilled,,none,yes


In [None]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

In [None]:
test_data_corrupted[numerical_columns]

In [None]:
imp = IterativeImputer(random_state=seed)
imp.fit(train_data[numerical_columns])

x = pd.DataFrame(imp.transform(test_data_corrupted[numerical_columns]))
x.columns = test_data_corrupted[numerical_columns].columns
x.index = test_data_corrupted[numerical_columns].index
x

In [None]:
imp_ = IterativeImputer(transformers=feature_transform)
imp_.fit(train_data[numerical_columns])

xx = pd.DataFrame(imp_.transform(test_data_corrupted[categorical_columns]))
xx.columns = test_data_corrupted[categorical_columns].columns
xx.index = test_data_corrupted[categorical_columns].index
xx

In [20]:
test_data_corrupted['purpose'][test_data_corrupted['purpose'].isnull()].index

Int64Index([659, 944, 845, 277, 218, 171, 334, 539, 953, 305, 604, 663, 387,
            482, 248, 628, 298, 448, 271, 700, 898, 614, 339, 707, 326, 795,
            837, 897, 233, 723, 155, 824,  92, 601, 335, 793, 295,   6, 261,
            172, 408, 444, 930,  34, 124, 176, 750, 299,  31, 576],
           dtype='int64')

In [21]:
test_data_corrupted['purpose'].value_counts()

radio/tv               44
new car                43
used car               21
business               16
education              12
repairs                 4
other                   3
retraining              3
domestic appliance      2
furniture/equipment     2
vacation                0
Name: purpose, dtype: int64

In [None]:
cleaners = [
#     (NoOutlierDetection, MeanModeImputation),
    (NoOutlierDetection, DatawigImputation)
#     (PyODKNN, NoImputation),
#     (PyODKNN, MeanModeImputation),
#     # (PyODKNN, DatawigImputation),
#     (PyODIsolationForest, NoImputation),
#     (PyODIsolationForest, MeanModeImputation)
#     # (PyODIsolationForest, DatawigImputation)
]

In [None]:
ppp_model = ppp.fit_ppp(train_data)

In [None]:
clean = Clean(train_data, test_data_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)

In [None]:
test_data_cleaned, score_no_cleaning, cleaner_scores_ppp = clean(train_data, test_data_corrupted)

In [None]:
test_data_cleaned

## Results

### Model Results

In [None]:
# model 
pipeline.fit(train_data, train_labels)

In [None]:
# original data test score
pipeline.score(test_data, test_labels)

In [None]:
# corrupted data test score
pipeline.score(test_data_corrupted, test_labels)

In [None]:
# cleaned data test score
pipeline.score(test_data_cleaned, test_labels)

### PPP Results

In [None]:
# ppp model score
ppp.predict_score_ppp(ppp_model, test_data)

In [None]:
# ppp score corrupted
score_no_cleaning

In [None]:
# ppp score cleaned
np.array(cleaner_scores_ppp).max()

In [None]:
# ppp cleaner scores
cleaner_scores_ppp