## Adjustment for Google Colab

In [1]:
# mount drive for access to the files
from google.colab import drive

drive.mount("/content/drive")

# all the drive the files are present in "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf


In [2]:
!pip install openml
!pip install pyod
!pip install datawig ##



In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [3]:
seed = 100

## Dataset

In [6]:
dataset = Dataset(seed, "credit-g")
all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

Dataset: credit-g


### Categorical and Numerical Features

In [7]:
categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features")

Found 13 categorical and 7 numeric features


## Model

### Model parameters

In [8]:
## model parameters
learner = SGDClassifier(max_iter=1000)
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

### Train and Test Data

In [9]:
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

# display(train_data.head())
# print(train_labels[0:5])

# display(test_data.head())
# print(test_labels[0:5])

## Corruptions

In [10]:
corruptions = [MissingValues, Scaling, SwappedValues, GaussianNoise]
fraction = 0.5

ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)

# generate corrpted test data
test_data_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction)

2020-07-22 12:51:21,075 [INFO]  NumExpr defaulting to 2 threads.


Generating corrupted training data on 200 rows... 

Applying perturbations... 

MissingValues: {'column': 'property_magnitude', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'residence_since', 'fraction': 0.5}
SwappedValues: {'column_a': 'foreign_worker', 'column_b': 'purpose', 'fraction': 0.5}
GaussianNoise: {'column': 'num_dependents', 'fraction': 0.5}


In [11]:
summary_col_corrupt

defaultdict(list,
            {('foreign_worker',
              'purpose'): [SwappedValues: {'column_a': 'foreign_worker', 'column_b': 'purpose', 'fraction': 0.5}],
             ('num_dependents',): [GaussianNoise: {'column': 'num_dependents', 'fraction': 0.5}],
             ('property_magnitude',): [MissingValues: {'column': 'property_magnitude', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}],
             ('residence_since',): [Scaling: {'column': 'residence_since', 'fraction': 0.5}]})

In [12]:
test_data_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,4.0,real estate,22.0,none,rent,1.0,skilled,1.000000,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,<100,1<=X<4,4.0,male single,none,20.0,,28.0,none,rent,2.0,skilled,1.000000,yes,yes
537,0<=X<200,18.0,critical/other existing credit,yes,3612.0,<100,>=7,3.0,female div/dep/mar,none,40.0,,37.0,none,own,1.0,skilled,0.589524,yes,furniture/equipment
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,no known savings,>=7,1.0,female div/dep/mar,none,2.0,life insurance,25.0,bank,own,1.0,skilled,-0.466446,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.0,male single,none,20.0,,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.000000,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,4<=X<7,1.0,male single,none,30.0,life insurance,31.0,none,own,2.0,unskilled resident,2.000000,yes,yes
644,<0,18.0,critical/other existing credit,yes,1880.0,<100,4<=X<7,4.0,male mar/wid,none,10.0,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,1.000000,yes,radio/tv
110,0<=X<200,6.0,delayed previously,business,1449.0,100<=X<500,>=7,1.0,male div/sep,none,2.0,car,31.0,bank,own,2.0,skilled,2.000000,none,yes
28,0<=X<200,7.0,existing paid,yes,2415.0,<100,1<=X<4,3.0,male single,guarantor,20.0,,34.0,none,own,1.0,skilled,1.000000,none,radio/tv


In [20]:
## Cleaning

In [13]:
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation)
]

In [15]:
ppp_model = ppp.fit_ppp(train_data)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  57 out of  60 | elapsed:    3.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.9s finished


In [16]:
clean = Clean(train_data, test_data_corrupted, categorical_columns, numerical_columns, ppp, ppp_model, cleaners)

In [17]:
test_data_cleaned, score_no_cleaning, cleaner_scores_ppp, summary_cleaners = clean(train_data, test_data_corrupted)


Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.7182539682539683, 'classification_report': {'bad': {'precision': 0.6470588235294118, 'recall': 0.19642857142857142, 'f1-score': 0.3013698630136986, 'support': 56}, 'good': {'precision': 0.7540983606557377, 'recall': 0.9583333333333334, 'f1-score': 0.8440366972477064, 'support': 144}, 'accuracy': 0.745, 'macro avg': {'precision': 0.7005785920925747, 'recall': 0.5773809523809524, 'f1-score': 0.5727032801307025, 'support': 200}, 'weighted avg': {'precision': 0.7241272902603663, 'recall': 0.745, 'f1-score': 0.6920899836621842, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7182539682539683, 'classification_report': {'bad': {'precision': 0.6470588235294118, 'recall': 0.19642857142857142, 'f1-score': 0.3013698630136986, 'support': 56}, 'good': {'precision': 0.7540983606557377, 'recall': 0.9583333333333334, 'f1-score': 0.84403

In [18]:
test_data_cleaned

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.000,existing paid,radio/tv,433.0000,<100,unemployed,3.0,female div/dep/mar,co applicant,4.0000,real estate,22.0,none,rent,1.0,skilled,1.00,none,yes
353,<0,12.000,no credits/all paid,radio/tv,6199.0000,<100,1<=X<4,4.0,male single,none,2.8575,car,28.0,none,rent,2.0,skilled,1.00,yes,yes
537,0<=X<200,18.000,critical/other existing credit,radio/tv,3612.0000,<100,>=7,3.0,female div/dep/mar,none,2.8575,car,37.0,none,own,1.0,skilled,1.15,yes,yes
424,0<=X<200,12.000,existing paid,furniture/equipment,2762.0000,no known savings,>=7,1.0,female div/dep/mar,none,2.0000,life insurance,25.0,bank,own,1.0,skilled,1.15,yes,yes
564,0<=X<200,24.000,delayed previously,business,4712.0000,no known savings,1<=X<4,4.0,male single,none,2.8575,car,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.00,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.000,delayed previously,business,3243.4775,100<=X<500,4<=X<7,1.0,male single,none,2.8575,life insurance,31.0,none,own,2.0,unskilled resident,2.00,yes,yes
644,<0,18.000,critical/other existing credit,radio/tv,1880.0000,<100,4<=X<7,4.0,male mar/wid,none,2.8575,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,1.00,yes,yes
110,0<=X<200,6.000,delayed previously,business,1449.0000,100<=X<500,>=7,1.0,male div/sep,none,2.0000,car,31.0,bank,own,2.0,skilled,2.00,none,yes
28,0<=X<200,20.915,existing paid,radio/tv,2415.0000,<100,1<=X<4,3.0,male single,guarantor,2.8575,car,34.0,none,own,1.0,skilled,1.00,none,yes


In [25]:
cleaner_scores_ppp

[{'classification_report': {'accuracy': 0.745,
   'bad': {'f1-score': 0.3013698630136986,
    'precision': 0.6470588235294118,
    'recall': 0.19642857142857142,
    'support': 56},
   'good': {'f1-score': 0.8440366972477064,
    'precision': 0.7540983606557377,
    'recall': 0.9583333333333334,
    'support': 144},
   'macro avg': {'f1-score': 0.5727032801307025,
    'precision': 0.7005785920925747,
    'recall': 0.5773809523809524,
    'support': 200},
   'weighted avg': {'f1-score': 0.6920899836621842,
    'precision': 0.7241272902603663,
    'recall': 0.745,
    'support': 200}},
  'roc_auc_acore': 0.7182539682539683},
 {'classification_report': {'accuracy': 0.75,
   'bad': {'f1-score': 0.375,
    'precision': 0.625,
    'recall': 0.26785714285714285,
    'support': 56},
   'good': {'f1-score': 0.84375,
    'precision': 0.7670454545454546,
    'recall': 0.9375,
    'support': 144},
   'macro avg': {'f1-score': 0.609375,
    'precision': 0.6960227272727273,
    'recall': 0.602678571

In [19]:
summary_cleaners

[{'Imputation method': MeanModeImputation,
  'Outlier detection method': NoOutlierDetection,
  'PPP score with cleaning': {'classification_report': {'accuracy': 0.745,
    'bad': {'f1-score': 0.3013698630136986,
     'precision': 0.6470588235294118,
     'recall': 0.19642857142857142,
     'support': 56},
    'good': {'f1-score': 0.8440366972477064,
     'precision': 0.7540983606557377,
     'recall': 0.9583333333333334,
     'support': 144},
    'macro avg': {'f1-score': 0.5727032801307025,
     'precision': 0.7005785920925747,
     'recall': 0.5773809523809524,
     'support': 200},
    'weighted avg': {'f1-score': 0.6920899836621842,
     'precision': 0.7241272902603663,
     'recall': 0.745,
     'support': 200}},
   'roc_auc_acore': 0.7182539682539683}},
 {'Imputation method': NoImputation,
  'Outlier detection method': PyODKNN,
  'PPP score with cleaning': {'classification_report': {'accuracy': 0.75,
    'bad': {'f1-score': 0.375,
     'precision': 0.625,
     'recall': 0.2678571

In [20]:
test_data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,4.0,real estate,22.0,none,rent,1.0,skilled,1.0,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,2.0,skilled,1.0,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,1.0,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,no known savings,>=7,1.0,female div/dep/mar,none,2.0,life insurance,25.0,bank,own,1.0,skilled,1.0,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.0,male single,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,4<=X<7,1.0,male single,none,3.0,life insurance,31.0,none,own,2.0,unskilled resident,2.0,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,male mar/wid,none,1.0,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
110,0<=X<200,6.0,delayed previously,business,1449.0,100<=X<500,>=7,1.0,male div/sep,none,2.0,car,31.0,bank,own,2.0,skilled,2.0,none,yes
28,0<=X<200,7.0,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.0,none,yes


In [21]:
cols_perturbed

['property_magnitude',
 'residence_since',
 'foreign_worker',
 'purpose',
 'num_dependents']

In [None]:
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error

for col in cols_perturbed:
    if col in categorical_columns:
        print(col)
        print(classification_report(test_data[col], test_data_cleaned[col]))
    elif col in numerical_columns:
        print(col)
        print("MSE: ", mean_squared_error(test_data[col], test_data_cleaned[col]))
        print("MAE: ", mean_absolute_error(test_data[col], test_data_cleaned[col]))

property_magnitude
                   precision    recall  f1-score   support

              car       0.46      1.00      0.63        62
   life insurance       1.00      0.46      0.63        48
no known property       1.00      0.56      0.72        32
      real estate       1.00      0.45      0.62        58

         accuracy                           0.64       200
        macro avg       0.87      0.62      0.65       200
     weighted avg       0.83      0.64      0.64       200

residence_since
MSE:  0.7051123125000001
MAE:  0.55855
foreign_worker
              precision    recall  f1-score   support

          no       1.00      0.54      0.70        13
         yes       0.97      1.00      0.98       187

    accuracy                           0.97       200
   macro avg       0.98      0.77      0.84       200
weighted avg       0.97      0.97      0.97       200

purpose
                     precision    recall  f1-score   support

           business       1.00      0.5

## Results

### Model Results

In [None]:
# model 
pipeline.fit(train_data, train_labels)

In [None]:
# original data test score
pipeline.score(test_data, test_labels)

In [None]:
# corrupted data test score
pipeline.score(test_data_corrupted, test_labels)

In [None]:
# cleaned data test score
pipeline.score(test_data_cleaned, test_labels)

### PPP Results

In [None]:
# ppp model score
ppp.predict_score_ppp(ppp_model, test_data)

In [None]:
# ppp score corrupted
score_no_cleaning

In [None]:
# ppp score cleaned
np.array(cleaner_scores_ppp).max()

In [None]:
# ppp cleaner scores
cleaner_scores_ppp

## EXTRAS

### Preprocessing Pipeline

In [None]:
## preprocessing pipeline for both numerical and categorical columns

# preprocessing pipeline for numerical columns
transformer_numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler())
])

# preprocessing pipeline for categorical columns
transformer_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
feature_transform = ColumnTransformer(transformers=[
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns)
])

### Prediction Pipeline

In [None]:
## prediction pipeline: append classifier (learner) to the preprocessing pipeline
pipeline = Pipeline([
    ('features', feature_transform),
    ('learner', learner)
])

### outlier detection

In [None]:
from pyod.utils.data import generate_data, get_outliers_inliers

#generate random data with two features
X_train, Y_train = generate_data(n_train=200,train_only=True, n_features=2)

In [None]:
X_train

array([[ 8.51615306,  8.7315578 ],
       [ 6.7322496 ,  8.00917028],
       [ 8.6367426 ,  7.24556432],
       [ 8.09561334,  7.41976414],
       [ 7.4099449 ,  7.46755281],
       [ 7.14619694,  8.16099031],
       [10.03938861,  7.95722193],
       [ 7.31618559,  7.84987553],
       [ 8.46410738,  8.33567099],
       [ 7.90115358,  7.49439556],
       [ 8.30237492,  8.21427738],
       [ 8.33886135,  8.45386323],
       [ 8.10981993,  8.00756189],
       [ 8.46038572,  7.65280695],
       [ 6.10450922,  8.80721603],
       [ 8.28296169,  7.30103076],
       [ 7.57118073,  7.89521747],
       [ 7.04488375,  8.34598763],
       [ 8.09273844,  8.86217989],
       [ 8.23449151,  8.52668653],
       [ 8.17656397,  7.10061961],
       [ 7.96070463,  8.47428073],
       [ 8.06127681,  8.80215393],
       [ 7.9491775 ,  7.40272466],
       [ 9.28671322,  8.01916915],
       [ 7.26723327,  8.2732644 ],
       [ 8.28074434,  7.39919939],
       [ 7.98889556,  7.63667412],
       [ 7.88446711,

In [None]:
Y_train

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
# by default the outlier fraction is 0.1 in generate data function 
outlier_fraction = 0.1

# store outliers and inliers in different numpy arrays
x_outliers, x_inliers = get_outliers_inliers(X_train,Y_train)

In [None]:
x_outliers

array([[ 3.46638464,  1.66383774],
       [ 3.46116636, -0.24907732],
       [-1.09397821,  4.69686054],
       [ 6.92823219, -0.48389623],
       [-5.16544285,  6.62893345],
       [ 5.32046529, -1.87074669],
       [-0.02510136,  4.40765382],
       [ 1.93012975,  5.09150205],
       [-3.39643009, -2.7525707 ],
       [-3.82795609, -5.80884865],
       [-0.79745389, -7.65166313],
       [-6.90159551,  4.38607016],
       [ 3.17950698,  4.58745473],
       [ 4.48079632,  4.71797235],
       [-3.65249257, -5.25377588],
       [ 6.82601689,  1.39215629],
       [ 0.30984574, -2.43661654],
       [ 2.81480041, -1.2130262 ],
       [-7.68289149,  0.53465819],
       [-5.07744838, -2.79611131]])

In [None]:
#separate the two features and use it to plot the data 
F1 = X_train[:,[0]].reshape(-1,1)
F2 = X_train[:,[1]].reshape(-1,1)

In [None]:
from pyod.models.abod import ABOD
from pyod.models.knn import KNN

classifiers = {
     'Angle-based Outlier Detector (ABOD)'   : ABOD(contamination=outlier_fraction),
     'K Nearest Neighbors (KNN)' :  KNN(contamination=outlier_fraction)
}

In [None]:
for i, (clf_name,clf) in enumerate(classifiers.items()) :
    # fit the dataset to the model
    clf.fit(X_train)

    # predict raw anomaly score
    scores_pred = clf.decision_function(X_train)*-1

    # prediction of a datapoint category outlier or inlier
    y_pred = clf.predict(X_train)

    # no of errors in prediction
    n_errors = (y_pred != Y_train).sum()
    print('No of Errors : ',clf_name, n_errors)

No of Errors :  Angle-based Outlier Detector (ABOD) 4
No of Errors :  K Nearest Neighbors (KNN) 0


In [None]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [None]:
clf = KNN(contamination=outlier_fraction)

In [None]:
# fit the dataset to the model
clf.fit(X_train[:,[0]].reshape(-1,1))

# predict raw anomaly score
scores_pred = clf.decision_function(X_train[:,[0]].reshape(-1,1))*-1

# prediction of a datapoint category outlier or inlier
y_pred = clf.predict(X_train[:,[0]].reshape(-1,1))

In [None]:
X_train[:,[0]].reshape(-1,1)

array([[ 8.51615306],
       [ 6.7322496 ],
       [ 8.6367426 ],
       [ 8.09561334],
       [ 7.4099449 ],
       [ 7.14619694],
       [10.03938861],
       [ 7.31618559],
       [ 8.46410738],
       [ 7.90115358],
       [ 8.30237492],
       [ 8.33886135],
       [ 8.10981993],
       [ 8.46038572],
       [ 6.10450922],
       [ 8.28296169],
       [ 7.57118073],
       [ 7.04488375],
       [ 8.09273844],
       [ 8.23449151],
       [ 8.17656397],
       [ 7.96070463],
       [ 8.06127681],
       [ 7.9491775 ],
       [ 9.28671322],
       [ 7.26723327],
       [ 8.28074434],
       [ 7.98889556],
       [ 7.88446711],
       [ 8.60839795],
       [ 7.8322325 ],
       [ 8.58707181],
       [ 7.84761668],
       [ 6.68689173],
       [ 8.09209503],
       [ 8.02867887],
       [ 7.96580713],
       [ 8.24638358],
       [ 7.43085713],
       [ 8.41263073],
       [ 8.52555968],
       [ 8.39711329],
       [ 7.81648929],
       [ 6.67489617],
       [ 7.45563748],
       [ 7

In [None]:
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1])

#### Numerical column

In [None]:
df_outliers = test_data_corrupted[numerical_columns].copy()
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000
...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000


In [None]:
col = 'duration'

In [None]:
train_data[col]

675    30.0
358    12.0
159     6.0
533    24.0
678    24.0
       ... 
855    24.0
871     6.0
835    12.0
792     6.0
520    24.0
Name: duration, Length: 800, dtype: float64

In [None]:
nan_idx = test_data_corrupted[test_data_corrupted[col].isnull()].index
non_nan_idx = test_data_corrupted.loc[set(test_data_corrupted.index) - set(nan_idx)].index

print(nan_idx)
print(non_nan_idx)

Int64Index([], dtype='int64')
Int64Index([512, 515,  19,  22, 534,  24, 537,  28, 542, 543,
            ...
            982, 480, 483, 996, 489, 492, 503, 508, 509, 510],
           dtype='int64', length=200)


In [None]:
col_tr_arr = np.array(train_data[col]).reshape(-1,1)
col_corr_arr = np.array(test_data_corrupted.loc[non_nan_idx][col]).reshape(-1,1)

In [None]:
from pyod.models.knn import KNN

clf = KNN(contamination=0.1)

# fit the dataset to the model
clf.fit(col_tr_arr)

# predict raw anomaly score
scores_pred = clf.decision_function(col_corr_arr)*-1

# prediction of a datapoint category outlier or inlier
y_pred = clf.predict(col_corr_arr)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [None]:
df_outliers[col + "_outlier"] = ''
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000,
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905,
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000,
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623,
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000,
...,...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840,
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332,
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000,
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000,


In [None]:
df_outliers[col + "_outlier"].loc[non_nan_idx] = y_pred ## 0: inlier, 1: outlier

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
df_outliers.loc[non_nan_idx, col + "_outlier"] = y_pred ## 0: inlier, 1: outlier

In [None]:
df_outliers[col + "_outlier"].loc[nan_idx] = 0

In [None]:
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000,1
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905,1
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000,1
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623,1
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000,1
...,...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840,1
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332,1
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000,1
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000,1


In [None]:
print(train_data[col].describe())
print(train_data[col].median(), '\n')

print(test_data_corrupted[col].describe())
print(test_data_corrupted[col].median())

count      800.000000
mean      3243.477500
std       2819.853229
min        250.000000
25%       1354.250000
50%       2308.500000
75%       3972.250000
max      18424.000000
Name: credit_amount, dtype: float64
2308.5 

count    1.500000e+02
mean     1.268448e+06
std      1.903046e+06
min      3.390000e+02
25%      2.378500e+03
50%      1.268600e+04
75%      1.881500e+06
max      8.978000e+06
Name: credit_amount, dtype: float64
12686.0


In [None]:
test_data_corrupted[col]

249        433.0
353    6199000.0
537       3612.0
424    2762000.0
564       4712.0
         ...    
684       9857.0
644    1880000.0
110    1449000.0
28           NaN
804       7472.0
Name: credit_amount, Length: 200, dtype: float64

In [None]:
from pyod.models.knn import KNN

In [None]:
columns = train_data.columns
columns

Index(['checking_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings_status', 'employment',
       'installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'property_magnitude', 'age', 'other_payment_plans',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'],
      dtype='object')

In [None]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    df_outliers = df_corrupted[numerical_columns].copy()
    
    for col in df_train.columns:
        if col in numerical_columns:
            ## find indices of records with NaNs in col in df_corrupted
            nan_idx = df_corrupted[df_corrupted[col].isnull()].index
            non_nan_idx = df_corrupted.loc[set(df_corrupted.index) - set(nan_idx)].index
            
            ## pd series -> np column, needs to be 2D array
            ## taking only the non-NaN records in the corrupted data
            col_tr_arr = np.array(df_train[col]).reshape(-1,1)
            col_corr_arr = np.array(df_corrupted.loc[non_nan_idx][col]).reshape(-1,1)

            ## fit the dataset to the model
            model = pyod_model
            model.fit(col_tr_arr)

            ## predict raw anomaly score
            scores_pred = model.decision_function(col_corr_arr) * -1

            ## prediction of a datapoint category outlier or inlier
            y_pred = model.predict(col_corr_arr)

            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            df_outliers[col + "_outlier"].loc[non_nan_idx] = y_pred ## 0: inlier, 1: outlier
            df_outliers[col + "_outlier"].loc[nan_idx] = 0
            
    return df_outliers

In [None]:
df_outliers_num = num_out_detect(train_data, test_data_corrupted, KNN())
df_outliers_num

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,residence_since_outlier,age_outlier,existing_credits_outlier,num_dependents_outlier
249,1800000.0,433.0,3.0,4.0,22.0,10.0,1.000000,1,0,0,0,0,1,0
353,1200000.0,6199.0,4.0,2.0,28.0,20.0,1.595905,1,0,0,0,0,1,1
537,1800000.0,3612.0,3.0,4.0,37.0,10.0,1.000000,1,0,0,0,0,1,0
424,1200000.0,2762.0,1.0,2.0,25.0,10.0,0.900623,1,0,0,0,0,1,1
564,2400000.0,4712.0,4.0,2.0,37.0,2.0,1.000000,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,3600000.0,9857.0,1.0,3.0,31.0,2.0,2.141840,1,1,0,0,0,0,1
644,18000.0,1880.0,4.0,1.0,32.0,2.0,0.335332,1,0,0,0,0,0,1
110,600000.0,1449.0,1.0,2.0,31.0,2.0,2.000000,1,0,0,0,0,0,0
28,700.0,2415.0,3.0,2.0,34.0,1.0,1.000000,1,0,0,0,0,0,0


#### Categorical column

In [None]:
## unlike numerical columns, we can't use PyOD here
## take the unique values from train
## find the values in corrupted that don't belong to the unique from the train
## mark as outlier

In [None]:
vals_train_unique = train_data['property_magnitude'].unique()
vals_train_unique

[car, real estate, life insurance, no known property]
Categories (4, object): [real estate < life insurance < car < no known property]

In [None]:
test_data_corrupted['property_magnitude']

249              none
353              none
537    life insurance
424    life insurance
564    life insurance
            ...      
684    life insurance
644              none
110               car
28               none
804       real estate
Name: property_magnitude, Length: 200, dtype: object

In [None]:
## the values in corrupted that don't belong to 'vals_train_unique'

In [None]:
test_data_corrupted['property_magnitude_outlier'] = ''

for i in test_data_corrupted['property_magnitude'].index:
    if test_data_corrupted['property_magnitude'].loc[i] in vals_train_unique:
        test_data_corrupted['property_magnitude_outlier'].loc[i] = 0
    else:
        test_data_corrupted['property_magnitude_outlier'].loc[i] = 1

In [None]:
test_data_corrupted

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,property_magnitude_outlier
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,...,none,22.0,real estate,rent,1.0,skilled,1.000000,none,yes,1
353,<0,12000.0,no credits/all paid,radio/tv,6199000.0,<100,,4.0,male single,none,...,none,28.0,life insurance,rent,2.0,skilled,0.726067,yes,yes,1
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,,,female div/dep/mar,none,...,life insurance,37.0,none,own,1.0,skilled,-0.749838,yes,yes,0
424,0<=X<200,12000.0,existing paid,furniture/equipment,2762000.0,no known savings,>=7,,female div/dep/mar,none,...,life insurance,25.0,bank,own,1.0,skilled,3.433997,yes,yes,0
564,0<=X<200,24000.0,delayed previously,business,4712.0,no known savings,,4.0,male single,none,...,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,0.030199,yes,yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,,1.0,male single,none,...,life insurance,31.0,none,own,2.0,unskilled resident,2.000000,yes,yes,0
644,<0,18.0,critical/other existing credit,radio/tv,1880000.0,<100,,,male mar/wid,none,...,none,32.0,life insurance,own,2.0,high qualif/self emp/mgmt,2.875736,yes,yes,1
110,0<=X<200,6.0,delayed previously,business,1449000.0,100<=X<500,,,male div/sep,none,...,car,31.0,bank,own,2.0,skilled,2.047525,none,yes,0
28,0<=X<200,7000.0,existing paid,radio/tv,,<100,,3.0,male single,guarantor,...,none,34.0,real estate,own,1.0,skilled,1.526985,none,yes,1


In [None]:
def cat_out_detect(df_train, df_corrupted):
    df_outliers = df_corrupted[categorical_columns].copy()
    
    for col in df_train.columns:
        if col in categorical_columns:
            vals_train_unique = df_train[col].unique()

            ## add a respective outlier col for each col
            df_outliers[col + "_outlier"] = ''
            
            for i in df_corrupted[col].index:
                if df_corrupted[col].loc[i] in vals_train_unique:
                    df_outliers[col + "_outlier"].loc[i] = 0
                else:
                    df_outliers[col + "_outlier"].loc[i] = 1
            
    return df_outliers

In [None]:
df_outliers_cat = cat_out_detect(train_data, test_data_corrupted)
df_outliers_cat

Unnamed: 0,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,,existing paid,radio/tv,<100,unemployed,none,co applicant,,none,rent,...,0,0,1,0,1,0,0,0,1,0
353,,no credits/all paid,radio/tv,<100,1<=X<4,male single,none,life insurance,none,rent,...,0,0,0,0,0,0,0,0,0,0
537,,critical/other existing credit,furniture/equipment,<100,>=7,female div/dep/mar,none,life insurance,none,own,...,0,0,0,0,0,0,0,0,0,0
424,,existing paid,furniture/equipment,no known savings,>=7,yes,none,life insurance,bank,own,...,0,0,1,0,0,0,0,0,1,0
564,,delayed previously,business,no known savings,1<=X<4,yes,none,life insurance,bank,own,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,,delayed previously,business,100<=X<500,4<=X<7,male single,none,life insurance,none,own,...,0,0,0,0,0,0,0,0,0,0
644,,critical/other existing credit,radio/tv,<100,4<=X<7,yes,none,life insurance,none,own,...,0,0,1,0,0,0,0,0,1,0
110,,delayed previously,business,100<=X<500,>=7,male div/sep,none,car,bank,own,...,0,0,0,0,0,0,0,0,0,0
28,,existing paid,radio/tv,<100,1<=X<4,male single,guarantor,real estate,none,own,...,0,0,0,0,0,0,0,0,0,0


In [None]:
## joining the two outlier dfs (inner join on index)

In [None]:
df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0,0,0,...,0,0,0,0,1,1,0,0,0,0
353,12000.0,6199000.0,4.0,2.0,28.0,2.0,0.726067,1,1,0,...,0,1,0,0,1,1,0,0,0,0
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0,0,0,...,0,1,0,0,0,0,0,0,0,0
424,12000.0,2762000.0,,2000.0,25.0,1.0,3.433997,1,1,0,...,0,0,0,0,0,0,0,0,0,0
564,24000.0,4712.0,4.0,2000.0,37.0,2.0,0.030199,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,0,1,0,...,0,1,0,0,0,0,0,0,0,0
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,0,1,0,...,0,1,0,0,1,1,0,0,0,0
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,0,1,0,...,0,1,0,0,0,0,0,0,0,0
28,7000.0,,3.0,2000.0,34.0,1.0,1.526985,1,0,0,...,0,1,0,0,1,1,0,0,0,0


In [None]:
## where the corresponding outlier column is 1, set the original value to NaN

In [None]:
col = 'duration'

In [None]:
df_outliers[['duration', 'duration_outlier']]

Unnamed: 0,duration,duration_outlier
249,18.0,0
353,12000.0,1
537,18.0,0
424,12000.0,1
564,24000.0,1
...,...,...
684,36.0,0
644,18.0,0
110,6.0,0
28,7000.0,1


In [None]:
for i in df_outliers.index:
    if df_outliers[col + "_outlier"].loc[i] == 1:
        df_outliers[col].loc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
df_outliers

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,duration_outlier,credit_amount_outlier,installment_commitment_outlier,...,savings_status_outlier,employment_outlier,personal_status_outlier,other_parties_outlier,property_magnitude_outlier,other_payment_plans_outlier,housing_outlier,job_outlier,own_telephone_outlier,foreign_worker_outlier
249,18.0,433.0,3.0,4000.0,22.0,1.0,1.000000,0,0,0,...,0,0,0,0,1,1,0,0,0,0
353,,6199000.0,4.0,2.0,28.0,2.0,0.726067,1,1,0,...,0,1,0,0,1,1,0,0,0,0
537,18.0,3612.0,,4.0,37.0,1.0,-0.749838,0,0,0,...,0,1,0,0,0,0,0,0,0,0
424,,2762000.0,,2000.0,25.0,1.0,3.433997,1,1,0,...,0,0,0,0,0,0,0,0,0,0
564,,4712.0,4.0,2000.0,37.0,2.0,0.030199,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,36.0,9857.0,1.0,3000.0,31.0,2.0,2.000000,0,1,0,...,0,1,0,0,0,0,0,0,0,0
644,18.0,1880000.0,,1000.0,32.0,2.0,2.875736,0,1,0,...,0,1,0,0,1,1,0,0,0,0
110,6.0,1449000.0,,2000.0,31.0,2.0,2.047525,0,1,0,...,0,1,0,0,0,0,0,0,0,0
28,,,3.0,2000.0,34.0,1.0,1.526985,1,0,0,...,0,1,0,0,1,1,0,0,0,0


In [None]:
for col in columns:
    for i in df_outliers.index:
        if df_outliers[col + "_outlier"].loc[i] == 1:
            df_outliers[col].loc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
df_outliers[columns]

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,,,22.0,,rent,1.0,skilled,1.0,none,yes
353,<0,,no credits/all paid,radio/tv,,<100,,4.0,male single,none,2.0,,28.0,,rent,2.0,skilled,,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,,,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,,yes,yes
424,0<=X<200,,existing paid,furniture/equipment,,no known savings,>=7,,female div/dep/mar,none,,life insurance,25.0,bank,own,1.0,skilled,,yes,yes
564,0<=X<200,,delayed previously,business,4712.0,no known savings,,4.0,male single,none,,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,,100<=X<500,,1.0,male single,none,,life insurance,31.0,none,own,2.0,unskilled resident,2.0,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,,<100,,,male mar/wid,none,,,32.0,,own,2.0,high qualif/self emp/mgmt,,yes,yes
110,0<=X<200,6.0,delayed previously,business,,100<=X<500,,,male div/sep,none,,car,31.0,bank,own,2.0,skilled,,none,yes
28,0<=X<200,,existing paid,radio/tv,,<100,,3.0,male single,guarantor,,,34.0,,own,1.0,skilled,,none,yes


In [None]:
pyod_knn = PyODKNN(train_data, test_data_corrupted, categorical_columns, numerical_columns)
df_outliers = pyod_knn.fit_transform(train_data, test_data_corrupted)
df_outliers

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,,existing paid,radio/tv,433.0,<100,unemployed,3.0,,co applicant,4.0,real estate,22.0,none,rent,,skilled,1.0,,yes
353,,,no credits/all paid,radio/tv,6199.0,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,,skilled,,yes,yes
537,,,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,,skilled,1.0,yes,yes
424,,,existing paid,furniture/equipment,2762.0,no known savings,>=7,1.0,,none,2.0,life insurance,25.0,bank,own,,skilled,,,yes
564,,,delayed previously,business,4712.0,no known savings,1<=X<4,4.0,,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.0,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,,,delayed previously,business,,100<=X<500,4<=X<7,1.0,male single,none,3.0,life insurance,31.0,none,own,2.0,unskilled resident,,yes,yes
644,,,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,,none,1.0,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,,,yes
110,,,delayed previously,business,1449.0,100<=X<500,>=7,1.0,male div/sep,none,2.0,,31.0,bank,own,2.0,skilled,2.0,none,yes
28,,,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.0,none,yes


In [None]:
test_data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,18.0,existing paid,radio/tv,433.0,<100,unemployed,3.0,female div/dep/mar,co applicant,4.0,real estate,22.0,none,rent,1.0,skilled,1.0,none,yes
353,<0,12.0,no credits/all paid,radio/tv,6199.0,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,2.0,skilled,1.0,yes,yes
537,0<=X<200,18.0,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,1.0,skilled,1.0,yes,yes
424,0<=X<200,12.0,existing paid,furniture/equipment,2762.0,no known savings,>=7,1.0,female div/dep/mar,none,2.0,life insurance,25.0,bank,own,1.0,skilled,1.0,yes,yes
564,0<=X<200,24.0,delayed previously,business,4712.0,no known savings,1<=X<4,4.0,male single,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,0<=X<200,36.0,delayed previously,business,9857.0,100<=X<500,4<=X<7,1.0,male single,none,3.0,life insurance,31.0,none,own,2.0,unskilled resident,2.0,yes,yes
644,<0,18.0,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,male mar/wid,none,1.0,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
110,0<=X<200,6.0,delayed previously,business,1449.0,100<=X<500,>=7,1.0,male div/sep,none,2.0,car,31.0,bank,own,2.0,skilled,2.0,none,yes
28,0<=X<200,7.0,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.0,none,yes


In [None]:
pyod_iforest = PyODIsolationForest(train_data, test_data_corrupted, categorical_columns, numerical_columns)
df_outliers = pyod_iforest.fit_transform(train_data, test_data_corrupted)
df_outliers

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
249,no checking,,existing paid,radio/tv,,<100,unemployed,3.0,,co applicant,4.0,real estate,,none,rent,,skilled,1.000000,,yes
353,,,no credits/all paid,radio/tv,,<100,1<=X<4,4.0,male single,none,2.0,life insurance,28.0,none,rent,,skilled,,yes,yes
537,,,critical/other existing credit,furniture/equipment,3612.0,<100,>=7,3.0,female div/dep/mar,none,4.0,life insurance,37.0,none,own,,skilled,1.000000,yes,yes
424,,,existing paid,furniture/equipment,2762.0,no known savings,>=7,,,none,2.0,life insurance,25.0,bank,own,,skilled,0.900623,,yes
564,,,delayed previously,business,,no known savings,1<=X<4,4.0,,none,2.0,life insurance,37.0,bank,own,2.0,high qualif/self emp/mgmt,1.000000,,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,,,delayed previously,business,,100<=X<500,4<=X<7,,male single,none,3.0,life insurance,31.0,none,own,2.0,unskilled resident,,yes,yes
644,,,critical/other existing credit,radio/tv,1880.0,<100,4<=X<7,4.0,,none,,life insurance,32.0,none,own,2.0,high qualif/self emp/mgmt,0.335332,,yes
110,,,delayed previously,business,1449.0,100<=X<500,>=7,,male div/sep,none,2.0,,31.0,bank,own,2.0,skilled,,none,yes
28,,,existing paid,radio/tv,2415.0,<100,1<=X<4,3.0,male single,guarantor,2.0,real estate,34.0,none,own,1.0,skilled,1.000000,none,yes


In [None]:
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

In [None]:
test_data_corrupted[numerical_columns]

In [None]:
imp = IterativeImputer(random_state=seed)
imp.fit(train_data[numerical_columns])

x = pd.DataFrame(imp.transform(test_data_corrupted[numerical_columns]))
x.columns = test_data_corrupted[numerical_columns].columns
x.index = test_data_corrupted[numerical_columns].index
x

In [None]:
imp_ = IterativeImputer(transformers=feature_transform)
imp_.fit(train_data[numerical_columns])

xx = pd.DataFrame(imp_.transform(test_data_corrupted[categorical_columns]))
xx.columns = test_data_corrupted[categorical_columns].columns
xx.index = test_data_corrupted[categorical_columns].index
xx

In [None]:
test_data_corrupted['purpose'][test_data_corrupted['purpose'].isnull()].index

Int64Index([659, 944, 845, 277, 218, 171, 334, 539, 953, 305, 604, 663, 387,
            482, 248, 628, 298, 448, 271, 700, 898, 614, 339, 707, 326, 795,
            837, 897, 233, 723, 155, 824,  92, 601, 335, 793, 295,   6, 261,
            172, 408, 444, 930,  34, 124, 176, 750, 299,  31, 576],
           dtype='int64')

In [None]:
test_data_corrupted['purpose'].value_counts()

radio/tv               44
new car                43
used car               21
business               16
education              12
repairs                 4
other                   3
retraining              3
domestic appliance      2
furniture/equipment     2
vacation                0
Name: purpose, dtype: int64

In [None]:
from jenga.cleaning.cleaner import Cleaner

cleaners_ = []
for outd, imp in cleaners:
    cleaners_.append(Cleaner(train_data,
                             test_data_corrupted,
                             categorical_columns,
                             numerical_columns,
                             outlier_detection = outd(train_data,
                                                      test_data_corrupted,
                                                      categorical_columns,
                                                      numerical_columns),
                             imputation = imp(train_data,
                                              test_data_corrupted,
                                              categorical_columns,
                                              numerical_columns)
                                )
                        )
    
ppp_model = ppp.fit_ppp(train_data)

print("\nApplying cleaners... \n")
        
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data_corrupted)
print(f"PPP score no cleaning: {score_no_cleaning}")

summ_clean = {}
summary_cleaners = []

cleaner_scores_ppp = []
for cleaner in cleaners_:
  test_data_cleaned = cleaner.apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
  cleaner_score = ppp.predict_score_ppp(ppp_model, test_data_cleaned)
  # print(f"Outlier detection method: {cleaner.outlier_detection}")
  # print(f"Imputation method: {cleaner.imputation}")
  print(f"PPP score with cleaning: {cleaner}: {cleaner_score} \n")
  cleaner_scores_ppp.append(cleaner_score)

  summ_clean = {"Outlier detection method": cleaner.outlier_detection, "Imputation method": cleaner.imputation, "PPP score with cleaning": cleaner_score}
  summary_cleaners.append(summ_clean) ## saving results for returning individuals too


roc_scores_for_best = []
for i in range(len(cleaner_scores_ppp)):
  roc_scores_for_best.append(cleaner_scores_ppp[i]["roc_auc_acore"])

best_cleaning_idx = pd.Series(roc_scores_for_best).idxmax()
best_cleaning_score = cleaner_scores_ppp[best_cleaning_idx]

if best_cleaning_score["roc_auc_acore"] > score_no_cleaning["roc_auc_acore"]:
  df_cleaned = cleaners_[best_cleaning_idx].apply_cleaner(train_data, test_data_corrupted, categorical_columns, numerical_columns)
  print(f"Best cleaning method:")
  # print(f"Outlier detection method: {self.cleaners[best_cleaning_idx].outlier_detection}")
  # print(f"Imputation method: {self.cleaners[best_cleaning_idx].imputation}")
  print(f"Cleaning score: {cleaners_[best_cleaning_idx]}: {best_cleaning_score} \n\n\n\n")
else:
  print("Cleaning didnt't improve the score \n\n\n\n")

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.3s finished



Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.701016865079365, 'classification_report': {'bad': {'precision': 0.6, 'recall': 0.16071428571428573, 'f1-score': 0.25352112676056343, 'support': 56}, 'good': {'precision': 0.745945945945946, 'recall': 0.9583333333333334, 'f1-score': 0.8389057750759878, 'support': 144}, 'accuracy': 0.735, 'macro avg': {'precision': 0.672972972972973, 'recall': 0.5595238095238095, 'f1-score': 0.5462134509182757, 'support': 200}, 'weighted avg': {'precision': 0.7050810810810811, 'recall': 0.735, 'f1-score': 0.674998073547669, 'support': 200}}}
PPP score with cleaning: Cleaner: {'outlier_detection': NoOutlierDetection, 'imputation': MeanModeImputation}: {'roc_auc_acore': 0.7020089285714286, 'classification_report': {'bad': {'precision': 0.6, 'recall': 0.16071428571428573, 'f1-score': 0.25352112676056343, 'support': 56}, 'good': {'precision': 0.745945945945946, 'recall': 0.9583333333333334, 'f1-score': 0.8389057750759878, 'support': 144}, 'ac

In [None]:
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

ppp.predict_score_ppp(ppp_model, test_data)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  57 out of  60 | elapsed:    3.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.0s finished


{'classification_report': {'accuracy': 0.755,
  'bad': {'f1-score': 0.4842105263157895,
   'precision': 0.5897435897435898,
   'recall': 0.4107142857142857,
   'support': 56},
  'good': {'f1-score': 0.8393442622950819,
   'precision': 0.7950310559006211,
   'recall': 0.8888888888888888,
   'support': 144},
  'macro avg': {'f1-score': 0.6617773943054357,
   'precision': 0.6923873228221054,
   'recall': 0.6498015873015872,
   'support': 200},
  'weighted avg': {'f1-score': 0.7399068162208801,
   'precision': 0.7375505653766523,
   'recall': 0.755,
   'support': 200}},
 'roc_auc_acore': 0.759796626984127}

In [None]:
y_pred = ppp_model.predict(test_data)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(test_labels, np.transpose(ppp_model.predict_proba(test_data))[1])

0.7543402777777778

In [None]:
from sklearn.metrics import classification_report
classification_report(test_labels, y_pred, output_dict=True)

{'accuracy': 0.75,
 'bad': {'f1-score': 0.47916666666666663,
  'precision': 0.575,
  'recall': 0.4107142857142857,
  'support': 56},
 'good': {'f1-score': 0.8355263157894737,
  'precision': 0.79375,
  'recall': 0.8819444444444444,
  'support': 144},
 'macro avg': {'f1-score': 0.6573464912280702,
  'precision': 0.684375,
  'recall': 0.6463293650793651,
  'support': 200},
 'weighted avg': {'f1-score': 0.7357456140350876,
  'precision': 0.7325,
  'recall': 0.75,
  'support': 200}}