## Dataset

In [4]:
import os
import sys

# Location of the local jenga checkout. Set the JENGA_PATH environment variable to
# point somewhere else; the original hard-coded location stays as the fallback.
JENGA_PATH = os.environ.get('JENGA_PATH', '/home/rupali/Documents/Master Thesis/jenga')

# Guarded append so re-running this cell does not add duplicate sys.path entries.
if JENGA_PATH not in sys.path:
    sys.path.append(JENGA_PATH)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from jenga.basis import Dataset

In [5]:
# Load the "acute-inflammations" dataset through jenga's Dataset wrapper and
# keep its data frame and column metadata as notebook-level names.
dataset = Dataset("acute-inflammations")

all_data, attribute_names, attribute_types = (
    dataset.all_data,
    dataset.attribute_names,
    dataset.attribute_types,
)
categorical_columns, numerical_columns = (
    dataset.categorical_columns,
    dataset.numerical_columns,
)

print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

Dataset: acute-inflammations
Found 5 categorical and 1 numeric features 



### Get training and test sets

In [6]:
# Split the dataset into training/test features and labels.
# NOTE(review): 0.3 is presumably the test-set fraction — confirm against jenga's Dataset.get_train_test_data.
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data(0.3)

## Model

## Model & Corruptions using PPP

In [7]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the logistic loss.
# NOTE(review): loss='log' was deprecated in scikit-learn 1.1 in favour of 'log_loss'
# and removed in 1.3 — update the spelling when the environment's scikit-learn is upgraded.
learner = SGDClassifier(loss='log')
# Hyper-parameter grid passed to PipelinePerformancePrediction together with the learner.
# The 'learner__' prefix presumably addresses a pipeline step named 'learner' — confirm in jenga.
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'], 
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [33]:
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise

# Corruption classes handed to ppp.get_corrupted; only MissingValues is active,
# the remaining imported corruptions are currently commented out.
corruptions = [MissingValues] #, SwappedValues, Scaling, GaussianNoise]
# Passed to ppp.get_corrupted; the logged perturbations report it as the 'fraction' of each corruption.
fraction = 0.5
# Number of times the corruption step is run (also forwarded to ppp.get_corrupted).
num_repetitions = 5

In [48]:
from jenga.cleaning.ppp import PipelinePerformancePrediction

# Build the performance predictor from the train/test split, the column typing,
# and the learner with its hyper-parameter grid, then fit it on the training data.
ppp = PipelinePerformancePrediction(train_data, train_labels, test_data, test_labels, categorical_columns, numerical_columns, learner, param_grid)
ppp_model = ppp.fit_ppp(train_data)

## generate corrupted data
# NOTE(review): every iteration rebinds the same four names, so only the result of the
# last get_corrupted call is kept. num_repetitions is also passed into get_corrupted
# itself — confirm that discarding the earlier runs is intended.
for _ in range(num_repetitions):
    df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions, fraction, num_repetitions)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V5', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V5', 'fraction': 0.5, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V1', 'fraction': 0.5, 'sampling': 'MAR', 'na_value': nan}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V5', 'fraction': 0.5, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 36 rows... 

	perturbation: MissingValues: {'column': 'V4', 'fraction': 0.5, 'sampling': 'MNAR', 'na_value': nan}


## Cleaning

### Pyod Single Column - features

In [49]:
# Work on a deep copy so the "<column>_outlier" flag columns added further down
# do not modify df_corrupted itself.
df_outliers = df_corrupted.copy(deep=True)

In [50]:
from pyod.models.knn import KNN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [51]:
## featurizers
def build_featurizers(columns):
    """Create the ColumnTransformer that featurizes the requested columns.

    Args:
        columns: collection of column names to featurize. Each name is matched
            against the notebook-level ``categorical_columns`` and
            ``numerical_columns`` lists; names found in neither list are not
            referenced by the returned transformer.

    Returns:
        An unfitted ``ColumnTransformer`` with two branches:
        ``'categorical_features'`` (missing values replaced by the constant
        ``'__NA__'``, then one-hot encoded with unknown categories ignored) and
        ``'scaled_numeric'`` (missing values replaced by the constant ``0``).
    """
    # Keep the ordering of the global column lists, restricted to the request.
    selected_categorical = [name for name in categorical_columns if name in columns]
    selected_numeric = [name for name in numerical_columns if name in columns]

    encode_categorical = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value='__NA__')),
        ('one_hot_encode', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Despite the 'scaled_numeric' transformer name, this branch only imputes;
    # no scaling step is applied.
    fill_numeric = Pipeline([
        ('mark_missing', SimpleImputer(strategy='constant', fill_value=0)),
    ])

    return ColumnTransformer(transformers=[
        ('categorical_features', encode_categorical, selected_categorical),
        ('scaled_numeric', fill_numeric, selected_numeric),
    ])

In [52]:
# One independent pipeline per column: featurize that single column, then run
# a KNN outlier detector on the resulting features.
predictors = {
    col: Pipeline([
        ('features', build_featurizers([col])),
        ('outlier_detector', KNN()),
    ])
    for col in categorical_columns + numerical_columns
}

len(predictors)

6

In [53]:
# Fit each per-column detector on the (copied) corrupted frame.
for col in [*categorical_columns, *numerical_columns]:
    detector = predictors[col]
    detector.fit(df_outliers)

In [55]:
# Store each detector's predictions for df_corrupted in a "<column>_outlier" column.
for col in [*categorical_columns, *numerical_columns]:
    df_outliers[f"{col}_outlier"] = predictors[col].predict(df_corrupted)

In [56]:
# Display the corrupted rows together with their per-column outlier flags.
df_outliers

Unnamed: 0,V1,V2,V3,V4,V5,V6,V2_outlier,V3_outlier,V4_outlier,V5_outlier,V6_outlier,V1_outlier
65,38.7,no,yes,yes,no,yes,0,0,0,0,0,1
14,36.7,no,yes,,no,no,0,0,0,0,0,0
92,40.7,yes,yes,yes,yes,yes,0,0,0,0,0,0
96,40.7,no,yes,yes,no,yes,0,0,0,0,0,0
30,37.1,no,no,yes,no,no,0,0,0,0,0,0
52,37.8,no,yes,,no,no,0,0,0,0,0,0
91,40.6,yes,yes,,yes,no,0,0,0,0,0,0
100,40.9,no,yes,yes,no,yes,0,0,0,0,0,0
111,41.2,no,no,,no,no,0,0,0,0,0,0
20,37.0,no,no,yes,yes,no,0,0,0,0,0,0


### Pyod Multiple Columns - features

In [62]:
# Start again from a fresh deep copy of the corrupted data; this rebinds
# df_outliers and drops the flag columns added in the single-column section.
df_outliers = df_corrupted.copy(deep=True)

In [63]:
# A single pipeline over all columns at once: joint featurization followed by
# one KNN outlier detector.
# NOTE(review): this rebinds `predictors`, which was a dict of per-column
# pipelines in the single-column section, to a single Pipeline object.
predictors = Pipeline(
    [('features', build_featurizers(numerical_columns + categorical_columns)),
     ('outlier_detector', KNN())
    ])
# Fit on the corrupted data; as the cell's last expression, the fitted pipeline is displayed.
predictors.fit(df_outliers)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('categorical_features',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value='__NA__',
                                                                                 strategy='constant')),
                                                                  ('one_hot_encode',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['V2', 'V3', 'V4', 'V5',
                                                   'V6']),
                                                 ('scaled_numeric',
                                                  Pipeline(steps=[('mark_missing',
                                                                   SimpleImputer(fill_value=0,
                                 

In [64]:
# One prediction per row of df_outliers from the multi-column pipeline
# (the same frame it was fitted on).
outliers = predictors.predict(df_outliers)
outliers

array([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

### Categorical outliers from training-set values, numerical outliers from PyOD

In [18]:
## single column based

In [65]:
def cat_out_detect(df_train, df_corrupted):
    """Flag categorical values in ``df_corrupted`` that never occur in ``df_train``.

    Args:
        df_train: reference DataFrame; only its columns listed in the
            notebook-level ``categorical_columns`` are inspected.
        df_corrupted: DataFrame to check. It must contain every column named in
            ``categorical_columns``.

    Returns:
        A copy of ``df_corrupted[categorical_columns]`` with one extra integer
        column ``"<col>_outlier"`` per inspected column: ``0`` when the value
        also appears in the training column, ``1`` otherwise. Missing values
        are always flagged with ``1``.
    """
    df_outliers = df_corrupted[categorical_columns].copy()

    for col in df_train.columns:
        if col not in categorical_columns:
            continue

        # Exclude missing values from the known set so that a NaN in the
        # corrupted data is always reported as an outlier.
        known_values = df_train[col].dropna().unique()

        # Vectorized membership test: works positionally, so it is independent
        # of the index (including duplicate labels) and yields integer flags.
        is_known = df_corrupted[col].isin(known_values)
        df_outliers[col + "_outlier"] = (~is_known).astype(int)

    return df_outliers

In [66]:
def num_out_detect(df_train, df_corrupted, pyod_model):
    """Flag numerical outliers in ``df_corrupted`` with a detector fit on ``df_train``.

    Args:
        df_train: reference DataFrame; only its columns listed in the
            notebook-level ``numerical_columns`` are inspected.
        df_corrupted: DataFrame to check. It must contain every column named in
            ``numerical_columns``.
        pyod_model: detector exposing ``fit(X)`` and ``predict(X)`` on 2-D
            arrays. The same object is re-fit in place for every inspected
            column, so after the call it holds the fit for the last column.

    Returns:
        A copy of ``df_corrupted[numerical_columns]`` with one extra integer
        column ``"<col>_outlier"`` per inspected column: the detector's
        prediction (0: inlier, 1: outlier) for observed values and ``1`` for
        missing values.
    """
    df_outliers = df_corrupted[numerical_columns].copy()

    for col in df_train.columns:
        if col not in numerical_columns:
            continue

        # Positional boolean mask of rows that have a value in this column.
        # A mask keeps the original row order and avoids indexing .loc with a
        # set, which pandas >= 2.0 rejects.
        observed = df_corrupted[col].notna().to_numpy()

        # pd series -> 2-D column array, as required by the detector
        col_tr_arr = np.array(df_train[col]).reshape(-1, 1)
        pyod_model.fit(col_tr_arr)

        # Missing values are always flagged; observed rows are overwritten below.
        flags = np.ones(len(df_corrupted), dtype=int)

        # Skip prediction when every value is missing: there is nothing to score.
        if observed.any():
            col_corr_arr = np.array(df_corrupted[col])[observed].reshape(-1, 1)
            flags[observed] = pyod_model.predict(col_corr_arr)  ## 0: inlier, 1: outlier

        df_outliers[col + "_outlier"] = flags

    return df_outliers

In [67]:
# PyOD KNN detector with default settings; num_out_detect fits it on each
# numerical training column in turn.
pyod_model = KNN()
    
# Numerical flags come from the detector, categorical flags from comparing
# against the values seen in train_data.
df_outliers_num = num_out_detect(train_data, df_corrupted, pyod_model)
df_outliers_cat = cat_out_detect(train_data, df_corrupted)

# Combine both results row-wise; each frame is built from df_corrupted and
# therefore shares its index.
df_outliers = df_outliers_num.join(df_outliers_cat, how='inner')

In [68]:
# Display the combined numerical and categorical outlier flags.
df_outliers

Unnamed: 0,V1,V1_outlier,V2,V3,V4,V5,V6,V2_outlier,V3_outlier,V4_outlier,V5_outlier,V6_outlier
65,38.7,1,no,yes,yes,no,yes,0,0,0,0,0
14,36.7,0,no,yes,,no,no,0,0,1,0,0
92,40.7,0,yes,yes,yes,yes,yes,0,0,0,0,0
96,40.7,0,no,yes,yes,no,yes,0,0,0,0,0
30,37.1,0,no,no,yes,no,no,0,0,0,0,0
52,37.8,0,no,yes,,no,no,0,0,1,0,0
91,40.6,0,yes,yes,,yes,no,0,0,1,0,0
100,40.9,0,no,yes,yes,no,yes,0,0,0,0,0
111,41.2,0,no,no,,no,no,0,0,1,0,0
20,37.0,0,no,no,yes,yes,no,0,0,0,0,0
