In [None]:
## Mount Google Drive so the project files are reachable from the Colab runtime.
import sys

from google.colab import drive

drive.mount("/content/drive")

## All Drive files are available under "/content/drive/My Drive".
# !ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

## Location of the local `jenga` checkout; change this one constant if the project moves.
JENGA_DIR = '/content/drive/My Drive/Beuth Uni/Master Thesis/jenga'

## Re-running the cell must not stack duplicate entries on sys.path.
if JENGA_DIR not in sys.path:
  sys.path.append(JENGA_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the third-party packages the notebook needs on a fresh Colab runtime.
# NOTE(review): none of the versions are pinned, so a re-run may pull different releases.
!pip install openml
!pip install pyod

# mxnet + autogluon; presumably needed by the AutoGluon-based cleaners imported later — TODO confirm.
!pip install mxnet autogluon
# Upgrade mxnet-mkl to the newest available release, pre-releases included (--pre).
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [None]:
# Single seed handed to both the Dataset loader and PipelinePerformancePrediction below.
seed = 10

In [None]:
# Load the "parkinsons" dataset through jenga's Dataset wrapper, passing the shared seed.
dataset = Dataset(seed, "parkinsons")

# Keep handles on the full data and the attribute metadata exposed by the wrapper.
all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

# Column-name lists by feature type; reused by the PPP, cleaning and scoring cells.
categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

# Quick sanity check of the feature-type split.
print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: parkinsons
Found 0 categorical and 22 numeric features 



### Get training and test sets

In [None]:
# Split the dataset into train/test features and labels.
# NOTE(review): 0.3 is presumably the test fraction (the corruption log later reports 59 rows
# for test_data) — confirm against Dataset.get_train_test_data.
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

## Model

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the logistic loss.
# random_state ties the stochastic fit to the notebook-wide `seed` so reruns are repeatable.
# NOTE(review): scikit-learn >= 1.1 renames loss='log' to 'log_loss'; switch when upgrading.
learner = SGDClassifier(loss='log', random_state=seed)

# Hyper-parameter grid: 3 x 3 x 4 = 36 candidates, matching the "36 candidates" in the fit log.
# The 'learner__' prefix presumably addresses a pipeline step named "learner" — TODO confirm in jenga.
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

## Corruptions

In [None]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

# Corruption classes handed to ppp.get_corrupted(...) in the next cell.
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]
# Forwarded to get_corrupted; every generated perturbation in the log reports 'fraction': 0.5.
fraction = 0.5
# Forwarded to get_corrupted as the repetition count — exact semantics live in jenga; TODO confirm.
num_repetitions = 5

In [None]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

# Build the performance predictor from the train/test split, the column-type lists, the learner and its grid.
ppp = PipelinePerformancePrediction(seed, train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
# Fit on the training data (the log shows a 5-fold search over 36 candidates).
ppp_model = ppp.fit_ppp(train_data)

## generate corrupted data
# cols_perturbed is iterated as a collection of column names by the scoring cells further down.
df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.



Generating corrupted training data on 59 rows... 

Can't apply the SwappedValues corruption because there are no categorical columns. 


	perturbation: MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}
	perturbation: Scaling: {'column': 'V14', 'fraction': 0.5}
	perturbation: GaussianNoise: {'column': 'V16', 'fraction': 0.5}
	perturbation: MissingValues: {'column': 'V15', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: Scaling: {'column': 'V16', 'fraction': 0.5}
	perturbation: GaussianNoise: {'column': 'V9', 'fraction': 0.5}
	perturbation: MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: Scaling: {'column': 'V17', 'fraction': 0.5}
	perturbation: GaussianNoise: {'column': 'V16', 'fraction': 0.5}
	perturbation: MissingValues: {'column': 'V12', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}
	perturbation: Scaling: {'column': 'V2', 'fraction': 0.5}
	perturbation: 

[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.8s finished


## Cleaning

In [None]:
# Thresholds forwarded to Clean(...) in the cleaning cell.
# NOTE(review): the same two assignments are repeated in that cell; keep the values in sync.
categorical_precision_threshold=0.7
numerical_std_error_threshold=2.0

In [None]:
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, AutoGluonOutlierDetection
from jenga.cleaning.imputation import MeanModeImputation, AutoGluonImputation

# Candidate cleaners as (outlier-detection class, imputation class) pairs, passed to Clean(...).
# The AutoGluon-based pairs are currently disabled. Each disabled entry carries a trailing comma
# so that re-enabling any combination of them keeps the list syntactically valid.
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    # (PyODKNNOutlierDetection, AutoGluonImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    # (PyODIsolationForestOutlierDetection, AutoGluonImputation),
    # (AutoGluonOutlierDetection, MeanModeImputation),
    # (AutoGluonOutlierDetection, AutoGluonImputation),
]

In [None]:
from jenga.cleaning.clean import Clean

# Thresholds consumed by Clean.
categorical_precision_threshold, numerical_std_error_threshold = 0.7, 2.0

# Configure the cleaning step with the data, column types, thresholds, PPP objects and cleaner pairs.
clean = Clean(
    train_data,
    df_corrupted,
    categorical_columns,
    numerical_columns,
    categorical_precision_threshold,
    numerical_std_error_threshold,
    ppp,
    ppp_model,
    cleaners,
)

# Apply every cleaner pair to the corrupted frame and collect the resulting frames and scores.
(
    df_outliers,
    df_cleaned,
    corrupted_score_ppp,
    best_cleaning_score,
    cleaner_scores_ppp,
    summary_cleaners,
) = clean(train_data, test_data, df_corrupted, cols_perturbed)


Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.9462209302325582, 'classification_report': {'1': {'precision': 1.0, 'recall': 0.0625, 'f1-score': 0.11764705882352941, 'support': 16}, '2': {'precision': 0.7413793103448276, 'recall': 1.0, 'f1-score': 0.8514851485148515, 'support': 43}, 'accuracy': 0.7457627118644068, 'macro avg': {'precision': 0.8706896551724138, 'recall': 0.53125, 'f1-score': 0.48456610366919045, 'support': 59}, 'weighted avg': {'precision': 0.8115137346580947, 'recall': 0.7457627118644068, 'f1-score': 0.6524782089375438, 'support': 59}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.2457627118644068, 'Recall': 0.5, 'F1-score': 0.32948772889687267, 'Accuracy': 0.4915254237288136}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 1.6296400976797498}
Cleaner: (NoOutlierDetection, MeanModeIm

  _warn_prf(average, modifier, msg_start, len(result))
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))



Outlier detection method: PyODKNNOutlierDetection, Outlier Detection Score: {'Precision': 0.859090909090909, 'Recall': 0.8458666596041456, 'F1-score': 0.8348107881637726, 'Accuracy': 0.8418079096045199}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': nan, 'Recall': nan, 'F1-score': nan, 'Accuracy': nan, 'Mean Squared Error': 0.0003782591546052212}
Cleaner: (PyODKNNOutlierDetection, MeanModeImputation): {'roc_auc_acore': 0.9229651162790697, 'classification_report': {'1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}, '2': {'precision': 0.7288135593220338, 'recall': 1.0, 'f1-score': 0.8431372549019608, 'support': 43}, 'accuracy': 0.7288135593220338, 'macro avg': {'precision': 0.3644067796610169, 'recall': 0.5, 'f1-score': 0.4215686274509804, 'support': 59}, 'weighted avg': {'precision': 0.5311692042516518, 'recall': 0.7288135593220338, 'f1-score': 0.614489863742107, 'support': 59}}}

Outlier detection method: PyODIsolationForestOutlierDetection

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Scratch example: an all-zero "<column>_outlier" flag frame shaped like `df_corrupted`.
# It lives under its own name so the real `df_outliers` returned by `clean(...)` is not
# overwritten when the notebook is run top to bottom (the scoring cells rely on those flags).
df_outliers_blank = df_corrupted.assign(
    **{col + "_outlier": 0 for col in df_corrupted.columns}
)

df_outliers_blank

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V1_outlier,V2_outlier,V3_outlier,V4_outlier,V5_outlier,V6_outlier,V7_outlier,V8_outlier,V9_outlier,V10_outlier,V11_outlier,V12_outlier,V13_outlier,V14_outlier,V15_outlier,V16_outlier,V17_outlier,V18_outlier,V19_outlier,V20_outlier,V21_outlier,V22_outlier
59,114.847,271.314,104.68,0.00867,8e-05,0.00373,0.0052,0.0112,0.03225,0.35,0.01805,,0.02519,0.045622,0.01143,21.66,0.547975,0.817396,-4.609161,0.221711,1.831691,0.316395,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,120.552,131.162,113.787,0.00968,8e-05,0.00463,0.0075,0.01388,0.04701,0.456,0.02328,,3.243,0.064955,0.01222,21.378,0.415564,0.825069,-4.242867,0.299111,2.18756,0.357775,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20,153.848,165.738,65.782,0.0084,5e-05,0.00428,0.0045,0.01285,0.0381,0.328,0.01667,,0.04055,0.05,0.03871,17.536,0.660125,0.704087,-4.095442,0.262564,2.73971,0.365391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
121,176.824,215.293,83.961,0.0046,3e-05,0.00209,0.00221,0.00628,0.01169,0.117,0.00534,0.0063,0.01104,0.01603,0.01161,27.166,0.400088,0.656182,-4.711007,0.281618,2.655744,0.234809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
52,128.001,138.052,122.08,0.00436,3e-05,0.00137,0.00166,0.00411,0.02297,0.21,0.01323,0.01072,0.01677,0.03969,0.00481,24.692,0.459766,0.766204,-7.072419,0.220434,1.972297,0.119308,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19,156.405,189.398,142.822,0.00768,5e-05,0.00372,0.00399,0.01116,0.03995,0.348,0.01721,,0.0431,0.0747,0.03365,17.153,0.649554,0.68608,-4.554466,0.340176,2.856676,0.322111,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
187,116.342,581.289,94.246,0.00267,2e-05,0.00115,0.00148,0.00345,0.013,0.117,0.00631,0.00789,0.01144,0.031607,0.0068,25.023,0.528485,0.663884,-6.359018,0.116636,2.152083,0.138868,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
55,109.86,126.358,104.437,0.00874,8e-05,0.00398,0.00539,0.01193,0.03209,0.307,0.01789,,0.02454,-0.062783,0.0118,20.767,0.558586,0.811843,-4.333543,0.221727,2.014606,0.344834,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
69,148.09,162.824,67.343,0.00762,5e-05,0.00467,0.00354,0.014,0.05428,0.497,0.03357,,3.635,0.023739,0.02431,21.718,0.487407,0.727313,-6.261141,0.120956,2.137075,0.141958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,3.59,0.018573,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Outlier and Imputation scores

In [None]:
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V1_outlier,V2_outlier,V3_outlier,V4_outlier,V5_outlier,V6_outlier,V7_outlier,V8_outlier,V9_outlier,V10_outlier,V11_outlier,V12_outlier,V13_outlier,V14_outlier,V15_outlier,V16_outlier,V17_outlier,V18_outlier,V19_outlier,V20_outlier,V21_outlier,V22_outlier
59,,271.314,104.68,0.00867,8e-05,0.00373,0.0052,0.0112,0.03225,0.35,,,0.02519,,0.01143,21.66,0.547975,0.817396,-4.609161,0.221711,1.831691,0.316395,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
5,120.552,131.162,113.787,0.00968,8e-05,,0.0075,,0.04701,0.456,0.02328,,,0.064955,0.01222,21.378,0.415564,0.825069,-4.242867,0.299111,2.18756,0.357775,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
20,153.848,165.738,65.782,0.0084,5e-05,0.00428,0.0045,0.01285,0.0381,0.328,0.01667,,0.04055,0.05,0.03871,17.536,0.660125,0.704087,-4.095442,0.262564,2.73971,0.365391,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
121,176.824,215.293,83.961,0.0046,3e-05,0.00209,0.00221,0.00628,0.01169,0.117,0.00534,0.0063,0.01104,0.01603,0.01161,27.166,0.400088,0.656182,-4.711007,0.281618,2.655744,0.234809,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
52,128.001,138.052,122.08,0.00436,3e-05,0.00137,0.00166,0.00411,0.02297,0.21,0.01323,0.01072,0.01677,0.03969,0.00481,24.692,0.459766,0.766204,-7.072419,0.220434,1.972297,0.119308,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19,156.405,189.398,,0.00768,5e-05,0.00372,0.00399,0.01116,0.03995,0.348,,,0.0431,,0.03365,17.153,0.649554,0.68608,-4.554466,0.340176,2.856676,0.322111,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
187,,,94.246,0.00267,2e-05,0.00115,0.00148,0.00345,0.013,0.117,,0.00789,0.01144,,0.0068,25.023,0.528485,0.663884,-6.359018,0.116636,2.152083,0.138868,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
55,109.86,126.358,104.437,0.00874,8e-05,0.00398,0.00539,0.01193,0.03209,0.307,,,0.02454,,0.0118,20.767,0.558586,0.811843,-4.333543,0.221727,2.014606,0.344834,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
69,148.09,162.824,67.343,0.00762,5e-05,0.00467,0.00354,0.014,0.05428,0.497,,,,,0.02431,21.718,0.487407,0.727313,-6.261141,0.120956,2.137075,0.141958,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0
2,116.682,131.111,111.555,0.0105,,,0.00781,,0.05233,0.482,,0.03858,,,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,0,0,0,0,1,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0


In [None]:
df_cleaned

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.000000,delayed previously,used car,2993.0,<100,1<=X<4,3.0,male single,none,2.000000,real estate,28.000000,stores,own,2.000000,unskilled resident,1.0,none,yes
956,>=200,30.000000,critical/other existing credit,radio/tv,3656.0,no known savings,>=7,4.0,male single,none,2.852857,life insurance,49.000000,stores,own,1.397143,unskilled resident,1.0,none,yes
544,no checking,12.000000,critical/other existing credit,new car,1255.0,<100,>=7,4.0,male single,none,2.852857,real estate,61.000000,none,own,1.397143,unskilled resident,1.0,none,yes
173,0<=X<200,21.167143,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,2.852857,real estate,33.000000,none,own,1.000000,skilled,1.0,none,no
759,<0,12.000000,critical/other existing credit,new car,691.0,<100,>=7,4.0,male single,none,2.852857,life insurance,35.000000,none,own,1.397143,skilled,1.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949,no checking,24.000000,existing paid,radio/tv,3621.0,100<=X<500,>=7,2.0,male single,none,2.852857,car,31.000000,none,own,2.000000,skilled,1.0,none,yes
802,<0,20.000000,critical/other existing credit,furniture/equipment,4272.0,<100,>=7,1.0,female div/dep/mar,none,2.852857,life insurance,24.000000,none,own,2.000000,skilled,1.0,none,yes
293,>=200,42.000000,critical/other existing credit,used car,4796.0,<100,>=7,4.0,male single,none,2.852857,no known property,35.537143,none,for free,1.397143,skilled,1.0,none,yes
414,<0,24.000000,existing paid,new car,1381.0,no known savings,1<=X<4,4.0,female div/dep/mar,none,2.852857,life insurance,35.000000,none,own,1.397143,skilled,1.0,none,yes


In [None]:
from sklearn.metrics import mean_squared_error

# Spot check: mean squared error between the original and the cleaned values of one numeric column.
# NOTE(review): "num_dependents" is not among the V1..V22 columns of the parkinsons dataset loaded
# at the top, so this cell appears to stem from a run on a different dataset — confirm before re-running.
print(mean_squared_error(test_data["num_dependents"], df_cleaned["num_dependents"]))

0.05232729931972789


In [None]:
from sklearn.metrics import mean_squared_error

# One MSE per perturbed numerical column: original test values vs. cleaned values.
mse = [
  mean_squared_error(test_data[col], df_cleaned[col])
  for col in cols_perturbed
  if col in numerical_columns
]

print(mse)
np.mean(mse)

[0.05232729931972789, 0.5175841632653062]


0.28495573129251706

In [None]:
from sklearn.metrics import classification_report
# Spot check: per-label precision/recall/F1 of the cleaned values for one categorical column.
# NOTE(review): "savings_status" is not a column of the parkinsons dataset loaded at the top;
# this cell appears to stem from a run on a different dataset — confirm before re-running.
print(classification_report(test_data["savings_status"], df_cleaned["savings_status"], output_dict=True))

{'100<=X<500': {'precision': 1.0, 'recall': 0.5714285714285714, 'f1-score': 0.7272727272727273, 'support': 42}, '500<=X<1000': {'precision': 1.0, 'recall': 0.6, 'f1-score': 0.7499999999999999, 'support': 20}, '<100': {'precision': 0.7435897435897436, 'recall': 1.0, 'f1-score': 0.8529411764705882, 'support': 174}, '>=1000': {'precision': 1.0, 'recall': 0.5714285714285714, 'f1-score': 0.7272727272727273, 'support': 14}, 'no known savings': {'precision': 1.0, 'recall': 0.44, 'f1-score': 0.6111111111111112, 'support': 50}, 'accuracy': 0.8, 'macro avg': {'precision': 0.9487179487179487, 'recall': 0.6365714285714286, 'f1-score': 0.7337195484254309, 'support': 300}, 'weighted avg': {'precision': 0.8512820512820513, 'recall': 0.8, 'f1-score': 0.7823153099623688, 'support': 300}}


In [None]:
from sklearn.metrics import classification_report

# Per-column macro F1 of the cleaned categorical values against the originals.
# The reports are computed here rather than read from `classif_reports`: that dict is only
# created in a later cell, and only for perturbed columns, so looking it up here fails on a
# fresh kernel (NameError) or for unperturbed columns (KeyError).
summary_rows = ('accuracy', 'macro avg', 'weighted avg')

f1socres = []
for col in categorical_columns:
  print(col)
  report = classification_report(test_data[col], df_cleaned[col], output_dict=True)
  # Everything except the aggregate rows is a class label.
  labels = [k for k in report if k not in summary_rows]
  print(labels)

  f1s = [report[label]['f1-score'] for label in labels]
  print(f1s)

  col_f1 = np.mean(f1s)
  print(col_f1)
  f1socres.append(col_f1)

print(f1socres)
# With no categorical columns np.mean([]) would warn; show NaN without the warning instead.
np.mean(f1socres) if f1socres else float("nan")

checking_status
['0<=X<200', '<0', '>=200', 'no checking']
[1.0, 1.0, 1.0, 1.0]
1.0
credit_history
['all paid', 'critical/other existing credit', 'delayed previously', 'existing paid', 'no credits/all paid']
[1.0, 1.0, 1.0, 1.0, 1.0]
1.0
purpose
['business', 'domestic appliance', 'education', 'furniture/equipment', 'new car', 'other', 'radio/tv', 'repairs', 'retraining', 'used car']
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
1.0
savings_status
['100<=X<500', '500<=X<1000', '<100', '>=1000', 'no known savings']
[0.7272727272727273, 0.7499999999999999, 0.8529411764705882, 0.7272727272727273, 0.6111111111111112]
0.7337195484254309
employment
['1<=X<4', '4<=X<7', '<1', '>=7', 'unemployed']
[1.0, 1.0, 1.0, 1.0, 1.0]
1.0
personal_status
['female div/dep/mar', 'male div/sep', 'male mar/wid', 'male single']
[1.0, 1.0, 1.0, 1.0]
1.0
other_parties
['co applicant', 'guarantor', 'none']
[1.0, 1.0, 1.0]
1.0
property_magnitude
['car', 'life insurance', 'no known property', 'real estate']
[1.

0.9349603050223851

In [None]:
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

# Split the perturbed columns once: categorical ones get classification metrics,
# every other column is scored with the mean squared error.
perturbed_categorical = [col for col in cols_perturbed if col in categorical_columns]
perturbed_other = [col for col in cols_perturbed if col not in categorical_columns]

classif_reports = {
  col: classification_report(test_data[col], df_cleaned[col], output_dict=True)
  for col in perturbed_categorical
}
acc_scores = [accuracy_score(test_data[col], df_cleaned[col]) for col in perturbed_categorical]
mse = [mean_squared_error(test_data[col], df_cleaned[col]) for col in perturbed_other]

In [None]:
classif_reports['checking_status']['<0']['f1-score']

1.0

In [None]:
f1socres, recallscores, precisionscores = [], [], []

# Rows of a classification report that are aggregates rather than class labels.
summary_rows = ('accuracy', 'macro avg', 'weighted avg')

for col in (c for c in cols_perturbed if c in categorical_columns):
  # Per-label score dicts of this column, aggregates excluded.
  per_label = [scores for key, scores in classif_reports[col].items() if key not in summary_rows]

  f1socres.append(np.mean([scores['f1-score'] for scores in per_label]))
  recallscores.append(np.mean([scores['recall'] for scores in per_label]))
  precisionscores.append(np.mean([scores['precision'] for scores in per_label]))

print(f"Mean f1-score: {np.mean(f1socres)}")
print(f"Mean Recall: {np.mean(recallscores)}")
print(f"Mean Precision: {np.mean(precisionscores)}")
print(f"Mean Accuracy: {np.mean(acc_scores)}\n")

print(f"Mean MSE: {np.mean(mse)}")

Mean f1-score: 0.7589328353991343
Mean Recall: 0.670423216422092
Mean Precision: 0.9541468926553672
Mean Accuracy: 0.855

Mean MSE: 0.6877844943310657


In [None]:
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error

def imputation_scores(df_test, df_cleaned, cols_perturbed, categorical_columns):
  """Score how closely the cleaned columns reproduce the original test values.

  Columns listed in `categorical_columns` are scored with macro-averaged
  precision/recall/F1 plus accuracy; every other perturbed column is scored
  with the mean squared error. Each metric is then averaged over its columns.

  Args:
    df_test: frame holding the original (uncorrupted) values.
    df_cleaned: frame holding the values after cleaning; indexed by the same column names.
    cols_perturbed: iterable of column names to score.
    categorical_columns: collection of column names to treat as categorical.

  Returns:
    dict with the keys "Precision", "Recall", "F1-score", "Accuracy" and
    "Mean Squared Error". A metric with no contributing column is NaN.
  """
  precisions, recalls, f1_scores, accuracies, squared_errors = [], [], [], [], []

  for col in cols_perturbed:
    y_true, y_pred = df_test[col], df_cleaned[col]

    if col in categorical_columns:
      # 'macro avg' is the unweighted mean over the class labels, i.e. what averaging
      # the per-label rows of the report by hand yields.
      macro = classification_report(y_true, y_pred, output_dict=True)['macro avg']
      precisions.append(macro['precision'])
      recalls.append(macro['recall'])
      f1_scores.append(macro['f1-score'])
      accuracies.append(accuracy_score(y_true, y_pred))
    else:
      squared_errors.append(mean_squared_error(y_true, y_pred))

  def _mean_or_nan(values):
    # np.mean([]) returns NaN but emits "Mean of empty slice" RuntimeWarnings;
    # return NaN quietly when there is nothing to average.
    return np.mean(values) if values else np.nan

  return {
      "Precision": _mean_or_nan(precisions),
      "Recall": _mean_or_nan(recalls),
      "F1-score": _mean_or_nan(f1_scores),
      "Accuracy": _mean_or_nan(accuracies),
      "Mean Squared Error": _mean_or_nan(squared_errors)
  }

In [None]:
imputation_scores(test_data, df_cleaned, cols_perturbed, categorical_columns)

{'Accuracy': 0.855,
 'F1-score': 0.7589328353991343,
 'Mean Squared Error': 1250.940015940662,
 'Precision': 0.9541468926553672,
 'Recall': 0.670423216422092}

In [None]:
# Work on a copy so the manual "<col>_outlier" flag columns added further down leave test_data untouched.
test_data_out = test_data.copy()
test_data_out

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
841,no checking,21.0,delayed previously,used car,2993.0,<100,1<=X<4,3.0,male single,none,2.0,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes
956,>=200,30.0,critical/other existing credit,radio/tv,3656.0,no known savings,>=7,4.0,male single,none,4.0,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes
544,no checking,12.0,critical/other existing credit,new car,1255.0,<100,>=7,4.0,male single,none,4.0,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes
173,0<=X<200,8.0,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,2.0,real estate,33.0,none,own,1.0,skilled,1.0,none,no
759,<0,12.0,critical/other existing credit,new car,691.0,<100,>=7,4.0,male single,none,3.0,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949,no checking,24.0,existing paid,radio/tv,3621.0,100<=X<500,>=7,2.0,male single,none,4.0,car,31.0,none,own,2.0,skilled,1.0,none,yes
802,<0,20.0,critical/other existing credit,furniture/equipment,4272.0,<100,>=7,1.0,female div/dep/mar,none,4.0,life insurance,24.0,none,own,2.0,skilled,1.0,none,yes
293,>=200,42.0,critical/other existing credit,used car,4796.0,<100,>=7,4.0,male single,none,4.0,no known property,56.0,none,for free,1.0,skilled,1.0,none,yes
414,<0,24.0,existing paid,new car,1381.0,no known savings,1<=X<4,4.0,female div/dep/mar,none,2.0,life insurance,35.0,none,own,1.0,skilled,1.0,none,yes


In [None]:
## Manual outlier ground truth: compare the corrupted and the original column values, store the result as a binary column, then compare it with the "<col>_outlier" columns of df_outliers

In [None]:
test_data["residence_since"]

841    2.0
956    4.0
544    4.0
173    2.0
759    3.0
      ... 
949    4.0
802    4.0
293    4.0
414    2.0
260    2.0
Name: residence_since, Length: 300, dtype: float64

In [None]:
df_corrupted["residence_since"]

841     2.000000
956    74.250827
544    40.000000
173    20.000000
759   -76.598200
         ...    
949   -36.167123
802    40.000000
293    40.000000
414    76.934921
260    20.000000
Name: residence_since, Length: 300, dtype: float64

In [None]:
df_outliers[["residence_since", "residence_since_outlier"]]

Unnamed: 0,residence_since,residence_since_outlier
841,2.0,0
956,,1
544,,1
173,,1
759,,1
...,...,...
949,,1
802,,1
293,,1
414,,1


In [None]:
# Manual ground truth for one column: a row is an outlier when corruption changed its value.
col_original = test_data["residence_since"]
col_corrupted = df_corrupted["residence_since"]

# `outiers_man` is True where the value is unchanged. Missing in both frames counts as
# unchanged too; a plain equality test would flag it because NaN != NaN.
outiers_man = col_original.eq(col_corrupted) | (col_original.isna() & col_corrupted.isna())
outiers_man_ind = outiers_man.index[~outiers_man]
print(outiers_man_ind)

# Index.difference replaces the former `.loc[set(...)]` lookup, which pandas >= 2.0 rejects.
non_outliers_man_ind = test_data_out.index.difference(outiers_man_ind)
print(non_outliers_man_ind)

Int64Index([956, 544, 173, 759, 121, 230, 120, 659, 191, 944,
            ...
            894, 799, 753, 600,  35, 949, 802, 293, 414, 260],
           dtype='int64', length=233)
Int64Index([641, 518, 519, 264, 392,  11,  12, 654, 271, 398,  22, 793, 410,
            287, 673, 419, 679, 936, 939, 172, 684, 686, 814, 305, 562, 307,
            181, 569, 442, 955, 700, 573, 188, 448, 193, 706, 579, 196, 709,
            197, 960, 841, 845, 334, 976,  84, 728, 473, 218, 731, 988, 349,
            735, 354, 739, 100, 482, 614, 994, 104, 233, 875, 378, 880, 631,
            890, 381],
           dtype='int64')


In [None]:
# Store the manual ground truth as a flag column on the copy of the test data.
# The column shows up as float (1.0 / 0.0) in the frames displayed afterwards.
test_data_out.loc[outiers_man_ind, "residence_since_outlier"] = 1 ## outliers
test_data_out.loc[non_outliers_man_ind, "residence_since_outlier"] = 0 ## not outliers

In [None]:
pd.concat([df_corrupted["residence_since"], test_data_out[["residence_since", "residence_since_outlier"]], df_outliers["residence_since_outlier"]], axis=1)

Unnamed: 0,residence_since,residence_since.1,residence_since_outlier,residence_since_outlier.1
841,2.000000,2.0,0.0,0
956,74.250827,4.0,1.0,1
544,40.000000,4.0,1.0,1
173,20.000000,2.0,1.0,1
759,-76.598200,3.0,1.0,1
...,...,...,...,...
949,-36.167123,4.0,1.0,1
802,40.000000,4.0,1.0,1
293,40.000000,4.0,1.0,1
414,76.934921,2.0,1.0,1


In [None]:
from sklearn.metrics import classification_report
print(classification_report(test_data_out["residence_since_outlier"], df_outliers["residence_since_outlier"]))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        67
         1.0       1.00      1.00      1.00       233

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [None]:
# accuracy_score is imported here as well so the cell does not rely on an earlier cell's import.
from sklearn.metrics import classification_report, accuracy_score

test_data_out = test_data.copy()

classif_reports = {}
acc_scores = []
f1socres = []
recallscores = []
precisionscores = []

for col in cols_perturbed:
  print(col)
  flag_col = col + "_outlier"

  original, corrupted = test_data[col], df_corrupted[col]
  # Unchanged = equal values, or missing in both frames (NaN != NaN would otherwise
  # mark untouched missing cells as outliers).
  unchanged = original.eq(corrupted) | (original.isna() & corrupted.isna())

  outiers_man_ind = unchanged.index[~unchanged]
  print(outiers_man_ind)

  # Index.difference replaces the former `.loc[set(...)]` lookup, which pandas >= 2.0 rejects.
  non_outliers_man_ind = test_data_out.index.difference(outiers_man_ind)
  print(non_outliers_man_ind)

  # Ground-truth flag column: 1.0 for changed cells, 0.0 otherwise (float, as before).
  test_data_out[flag_col] = (~unchanged).astype(float)

  y_true, y_pred = test_data_out[flag_col], df_outliers[flag_col]
  print(classification_report(y_true, y_pred))
  report = classification_report(y_true, y_pred, output_dict=True)
  classif_reports[col] = report

  # 'macro avg' is the unweighted mean over the class labels of the report.
  macro = report['macro avg']
  f1socres.append(macro['f1-score'])
  recallscores.append(macro['recall'])
  precisionscores.append(macro['precision'])

  acc_scores.append(accuracy_score(y_true, y_pred))

print(f"Mean f1-score: {np.mean(f1socres)}")
print(f"Mean Recall: {np.mean(recallscores)}")
print(f"Mean Precision: {np.mean(precisionscores)}")
print(f"Mean Accuracy: {np.mean(acc_scores)}\n")


existing_credits
Int64Index([841, 956, 544, 173, 759, 230, 120, 659, 419, 191,
            ...
            264, 536, 152, 685, 429, 852, 242, 293, 414, 260],
           dtype='int64', length=150)
Int64Index([513, 514, 519,  11,  12, 531,  22,  27,  30,  31,
            ...
            966, 971, 467, 475, 482, 994, 483, 507, 508, 510],
           dtype='int64', length=150)
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97       150
         1.0       0.95      1.00      0.97       150

    accuracy                           0.97       300
   macro avg       0.97      0.97      0.97       300
weighted avg       0.97      0.97      0.97       300

housing
Int64Index([544, 173, 759, 121, 230, 419, 417, 374, 982, 449,
            ...
            942, 467,  30, 345, 100, 284, 646, 894,  35, 802],
           dtype='int64', length=133)
Int64Index([  1, 514, 513,   6, 518, 519,  11, 525, 531,  22,
            ...
            976, 978, 473, 475, 4

In [None]:
test_data_out

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,existing_credits_outlier,housing_outlier,savings_status_outlier,residence_since_outlier
841,no checking,21.0,delayed previously,used car,2993.0,<100,1<=X<4,3.0,male single,none,2.0,real estate,28.0,stores,own,2.0,unskilled resident,1.0,none,yes,1.0,0.0,0.0,0.0
956,>=200,30.0,critical/other existing credit,radio/tv,3656.0,no known savings,>=7,4.0,male single,none,4.0,life insurance,49.0,stores,own,2.0,unskilled resident,1.0,none,yes,1.0,0.0,0.0,1.0
544,no checking,12.0,critical/other existing credit,new car,1255.0,<100,>=7,4.0,male single,none,4.0,real estate,61.0,none,own,2.0,unskilled resident,1.0,none,yes,1.0,1.0,1.0,1.0
173,0<=X<200,8.0,existing paid,radio/tv,1414.0,<100,1<=X<4,4.0,male single,guarantor,2.0,real estate,33.0,none,own,1.0,skilled,1.0,none,no,1.0,1.0,1.0,1.0
759,<0,12.0,critical/other existing credit,new car,691.0,<100,>=7,4.0,male single,none,3.0,life insurance,35.0,none,own,2.0,skilled,1.0,none,yes,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949,no checking,24.0,existing paid,radio/tv,3621.0,100<=X<500,>=7,2.0,male single,none,4.0,car,31.0,none,own,2.0,skilled,1.0,none,yes,0.0,0.0,0.0,1.0
802,<0,20.0,critical/other existing credit,furniture/equipment,4272.0,<100,>=7,1.0,female div/dep/mar,none,4.0,life insurance,24.0,none,own,2.0,skilled,1.0,none,yes,0.0,1.0,1.0,1.0
293,>=200,42.0,critical/other existing credit,used car,4796.0,<100,>=7,4.0,male single,none,4.0,no known property,56.0,none,for free,1.0,skilled,1.0,none,yes,1.0,0.0,0.0,1.0
414,<0,24.0,existing paid,new car,1381.0,no known savings,1<=X<4,4.0,female div/dep/mar,none,2.0,life insurance,35.0,none,own,1.0,skilled,1.0,none,yes,1.0,0.0,0.0,1.0


In [None]:
def outlier_detection_scores(df_test, df_corrupted, df_outliers, cols_perturbed):
  """Score detected outlier flags against the cells that corruption actually changed.

  For every perturbed column the ground truth is derived by comparing the
  original with the corrupted values: a cell is a true outlier when the two
  differ. It is then compared with the "<col>_outlier" flag column of
  `df_outliers`. Metrics are macro-averaged per column and then averaged
  over all perturbed columns.

  Args:
    df_test: frame holding the original (uncorrupted) values.
    df_corrupted: frame holding the corrupted values; assumed to share df_test's row index.
    df_outliers: frame holding a "<col>_outlier" flag column for each perturbed column,
      with rows in the same order as df_test.
    cols_perturbed: iterable of column names to score.

  Returns:
    dict with the keys "Precision", "Recall", "F1-score" and "Accuracy";
    every value is NaN when `cols_perturbed` is empty.
  """
  precisions, recalls, f1_scores, accuracies = [], [], [], []

  for col in cols_perturbed:
    original, corrupted = df_test[col], df_corrupted[col]

    # Unchanged = equal values, or missing in both frames. A bare equality test would
    # label untouched missing cells as outliers because NaN != NaN.
    unchanged = original.eq(corrupted) | (original.isna() & corrupted.isna())
    # Boolean mask -> 0/1 labels; avoids the former `.loc[set(...)]` lookup,
    # which pandas >= 2.0 rejects.
    y_true = (~unchanged).astype(int)
    y_pred = df_outliers[col + "_outlier"]

    # 'macro avg' is the unweighted mean over the class labels of the report.
    macro = classification_report(y_true, y_pred, output_dict=True)['macro avg']
    precisions.append(macro['precision'])
    recalls.append(macro['recall'])
    f1_scores.append(macro['f1-score'])

    accuracies.append(accuracy_score(y_true, y_pred))

  def _mean_or_nan(values):
    # np.mean([]) returns NaN but emits "Mean of empty slice" RuntimeWarnings;
    # return NaN quietly when there is nothing to average.
    return np.mean(values) if values else np.nan

  return {
      "Precision": _mean_or_nan(precisions),
      "Recall": _mean_or_nan(recalls),
      "F1-score": _mean_or_nan(f1_scores),
      "Accuracy": _mean_or_nan(accuracies)
  }

In [None]:
outlier_detection_scores(test_data, df_corrupted, df_outliers, cols_perturbed)

{'Accuracy': 0.7586666666666668,
 'F1-score': 0.7586392690149641,
 'Precision': 0.8271106219042377,
 'Recall': 0.8278483974974911}