## Dataset

In [1]:
# Mount Google Drive so the thesis files are reachable from this Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

# All files stored in the Drive appear under "/content/drive/My Drive".
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

# Put the jenga directory on the import path; later cells import from `jenga`.
import sys
sys.path.append('/content/drive/My Drive/Beuth Uni/Master Thesis/jenga')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Data
'Data Quality in ML Production Systems.pdf'
'Datawig: Missing Value Imputation for Tables.pdf'
 Declaration
 Images
 jenga
 jenga.pdf
 MICE_Multivariate_Imputation_by_Chained_Equations_.pdf
 out.zip


In [2]:
# Extra dependencies. pyod provides the KNN outlier detector used below; the
# other packages are presumably required by jenga itself — TODO confirm.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases.
!pip install openml
!pip install pyod

!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [4]:
# Load the "acute-inflammations" dataset through jenga's Dataset wrapper.
dataset = Dataset("acute-inflammations")

# Expose the data and its attribute metadata as notebook-level names.
all_data = dataset.all_data
attribute_names = dataset.attribute_names
attribute_types = dataset.attribute_types

# Column-name lists; later cells use them to pick per-column preprocessing
# and to decide which outlier check applies to a column.
categorical_columns = dataset.categorical_columns
numerical_columns = dataset.numerical_columns

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: acute-inflammations
Found 5 categorical and 1 numeric features 



### Get training and test sets

In [5]:
# Split into train/test features and labels.
# NOTE(review): 0.3 is presumably the test-set fraction — confirm against jenga's Dataset.get_train_test_data.
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

## Model

## Model & Corruptions using PPP

In [6]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the logistic loss.
# NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed
# the old spelling; this value only works on the older runtime used here.
learner = SGDClassifier(loss='log')
# Hyper-parameter grid: 3 x 3 x 4 = 36 candidates. The 'learner__' prefix
# presumably addresses a pipeline step named 'learner' built inside jenga's PPP
# class — TODO confirm.
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [7]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

# Only MissingValues is active; the other imported corruption classes are
# currently disabled via the trailing comment.
corruptions = [MissingValues] #, SwappedValues, Scaling, GaussianNoise]
# Passed to ppp.get_corrupted below; the logged perturbations report it as the
# per-column 'fraction' (0.5).
fraction = 0.5
# Used both as the loop count and as an argument of ppp.get_corrupted below.
num_repetitions = 5

In [8]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

# Build the PPP helper from the split, the column lists, the learner and its
# search grid, then fit it on the training features.
ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrupted data
# NOTE(review): every iteration rebinds the same four names, so only the result
# of the last call is available to the cleaning cells below — confirm that the
# earlier repetitions are meant to be discarded.
for _ in range(num_repetitions):
  df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.



Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V3', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V1', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V1', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MNAR'}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V3', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V2', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MCAR'}


[Parallel(n_jobs=-1)]: Done 167 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    4.6s finished


## Cleaning

### Pyod Single Column - features

In [9]:
# Work on a copy so the outlier-flag columns added below do not alter df_corrupted.
df_outliers = df_corrupted.copy()

In [10]:
from pyod.models.knn import KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [11]:
## featurizers
def build_featurizers(columns):
  """Create the preprocessing ColumnTransformer for a subset of columns.

  Columns are routed by membership in the notebook-level
  `categorical_columns` / `numerical_columns` lists:

  * categorical: missing values become the literal '__NA__', followed by
    one-hot encoding that ignores categories unseen during fit;
  * numerical: missing values are replaced by 0.

  NOTE(review): the numerical branch is registered as 'scaled_numeric' but
  contains no scaling step — confirm whether a scaler was intended.

  Args:
    columns: collection of column names the transformer should cover.

  Returns:
    An unfitted ColumnTransformer restricted to `columns`.
  """
  selected_categorical = [name for name in categorical_columns if name in columns]
  selected_numerical = [name for name in numerical_columns if name in columns]

  categorical_steps = [
      ('mark_missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
      ('one_hot_encode', OneHotEncoder(handle_unknown='ignore')),
  ]
  numerical_steps = [
      ('mark_missing', SimpleImputer(strategy='constant', fill_value=0)),
  ]

  return ColumnTransformer(transformers=[
      ('categorical_features', Pipeline(categorical_steps), selected_categorical),
      ('scaled_numeric', Pipeline(numerical_steps), selected_numerical),
  ])

In [12]:
# One independent detector pipeline per feature: the featurizer is limited to
# that single column and feeds a fresh KNN outlier detector.
predictors = {}

for col in categorical_columns + numerical_columns:
    single_column_steps = [
        ('features', build_featurizers([col])),
        ('outlier_detector', KNN()),
    ]
    predictors[col] = Pipeline(single_column_steps)

In [13]:
# Fit every per-column detector.
# NOTE(review): the detectors are fitted on df_outliers, i.e. the copy of the
# corrupted frame, not on train_data — confirm this is intended.
for col in categorical_columns + numerical_columns:
  predictors[col].fit(df_outliers)

In [14]:
# Append one "<col>_outlier" column per feature holding that column's detector
# prediction for every row of the same frame the detector was fitted on.
for col in categorical_columns + numerical_columns:
  df_outliers[col + "_outlier"] = predictors[col].predict(df_outliers)



In [15]:
# Display the corrupted rows together with the per-column outlier flags.
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V2_outlier,V3_outlier,V4_outlier,V5_outlier,V6_outlier,V1_outlier
73,40.0,,no,no,no,no,0,0,0,0,0,0
81,40.2,,yes,no,yes,no,0,0,0,0,0,0
43,37.5,,no,yes,no,no,0,0,0,0,0,0
25,37.0,,no,yes,yes,yes,0,0,0,0,0,0
2,35.9,no,yes,no,no,no,0,0,0,0,0,0
46,37.6,,no,yes,yes,no,0,0,0,0,0,0
3,36.0,,no,yes,yes,yes,0,0,0,0,0,0
21,37.0,,no,yes,yes,no,0,0,0,0,0,0
94,40.7,,no,no,no,no,0,0,0,0,0,0
99,40.9,,yes,yes,yes,no,0,0,0,0,0,0


### Pyod Multiple Columns - features

In [16]:
# One detector over all features at once: every numerical and categorical
# column goes through the featurizer into a single KNN model.
# NOTE(review): this rebinds `predictors`, which held the per-column dict above,
# so the per-column cells cannot be re-run after this one without rebuilding it.
predictors = Pipeline([
                       ('features', build_featurizers(numerical_columns + categorical_columns)),
                       ('outlier_detector', KNN())
                       ])
predictors.fit(df_outliers)

Pipeline(memory=None,
         steps=[('features',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical_features',
                                                  Pipeline(memory=None,
                                                           steps=[('mark_missing',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='__NA__',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                     

In [17]:
# Row-level predictions from the multi-column detector, made on the same frame
# it was fitted on.
outliers = predictors.predict(df_outliers)
outliers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Categorical outliers from training values, numerical outliers from PyOD

In [18]:
## single column based

In [19]:
def cat_out_detect(df_train, df_corrupted):
    """Flag categorical values that never occurred in the training data.

    For every column of `df_train` that is listed in the notebook-level
    `categorical_columns`, a "<col>_outlier" column is added to the result:
    0 when the corrupted value was seen in that training column, 1 otherwise.
    Missing values are always flagged as 1, matching how `num_out_detect`
    treats missing numerical values.

    Args:
        df_train: reference frame providing the known values per column.
        df_corrupted: frame to check; must contain all `categorical_columns`.

    Returns:
        A new DataFrame holding the categorical columns of `df_corrupted`
        followed by one integer 0/1 flag column per checked column. Neither
        input frame is modified.
    """
    df_outliers = df_corrupted[categorical_columns].copy()

    for col in df_train.columns:
        if col not in categorical_columns:
            continue

        known_values = list(df_train[col].dropna().unique())
        observed = df_corrupted[col]

        # isin() would treat NaN as a match for NaN, so missing values are
        # excluded explicitly to keep them flagged as outliers.
        is_known = observed.isin(known_values) & observed.notna()

        # Assign positionally (plain array) so duplicate index labels cannot
        # interfere; the result shares df_corrupted's row order.
        df_outliers[col + "_outlier"] = (~is_known).astype(int).to_numpy()

    return df_outliers

In [20]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    """Flag numerical outliers column by column with a PyOD-style detector.

    For every column of `df_train` that is listed in the notebook-level
    `numerical_columns`, the detector is fitted on that training column and
    then used to label the non-missing values of the same column in
    `df_corrupted`. Missing values are always flagged as outliers.

    Args:
        df_train: reference frame the detector is fitted on, one column at a
            time (missing training values are dropped before fitting).
        df_corrupted: frame to check; must contain all `numerical_columns`.
        pyod_model: object exposing `fit(X)` and `predict(X)` on 2-D arrays,
            where `predict` returns 0 for inliers and 1 for outliers. The same
            instance is re-fitted for every column, so after the call it is
            left fitted on the last numerical column.

    Returns:
        A new DataFrame holding the numerical columns of `df_corrupted`
        followed by one integer 0/1 "<col>_outlier" column per checked column.
        Neither input frame is modified.
    """
    df_outliers = df_corrupted[numerical_columns].copy()

    for col in df_train.columns:
        if col not in numerical_columns:
            continue

        # The detector expects a 2-D, NaN-free array: one feature per column.
        col_tr_arr = df_train[col].dropna().to_numpy().reshape(-1, 1)
        pyod_model.fit(col_tr_arr)

        # A boolean mask keeps the original row order and works on every
        # pandas version (indexing .loc with a set is rejected by pandas >= 2.0).
        observed = df_corrupted[col].notna().to_numpy()

        # Start with every row flagged; rows with a value are overwritten by
        # the detector's verdict, so missing values stay marked as outliers.
        flags = np.ones(len(df_corrupted), dtype=int)
        if observed.any():
            col_corr_arr = df_corrupted[col].to_numpy()[observed].reshape(-1, 1)
            flags[observed] = pyod_model.predict(col_corr_arr)  # 0: inlier, 1: outlier

        df_outliers[col + "_outlier"] = flags

    return df_outliers

In [21]:
# KNN detector instance handed to num_out_detect, which fits it on training columns.
pyod_model = KNN()
    
# Numerical columns are checked with the detector fitted on train_data;
# categorical columns are checked against the values seen in train_data.
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

# Combine both results row-wise on the shared index.
df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')

In [22]:
# Display the combined numerical and categorical outlier flags.
df_outliers

Unnamed: 0,V1,V1_outlier,V2,V3,V4,V5,V6,V2_outlier,V3_outlier,V4_outlier,V5_outlier,V6_outlier
73,40.0,0,,no,no,no,no,1,0,0,0,0
81,40.2,0,,yes,no,yes,no,1,0,0,0,0
43,37.5,0,,no,yes,no,no,1,0,0,0,0
25,37.0,0,,no,yes,yes,yes,1,0,0,0,0
2,35.9,1,no,yes,no,no,no,0,0,0,0,0
46,37.6,0,,no,yes,yes,no,1,0,0,0,0
3,36.0,0,,no,yes,yes,yes,1,0,0,0,0
21,37.0,0,,no,yes,yes,no,1,0,0,0,0
94,40.7,0,,no,no,no,no,1,0,0,0,0
99,40.9,0,,yes,yes,yes,no,1,0,0,0,0
