In [1]:
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append('/home/rupali/Documents/Master Thesis/jenga')

In [2]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues, CategoricalShift
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection
from jenga.cleaning.imputation import NoImputation, MeanModeImputation
from jenga.cleaning.clean import Clean

In [3]:
## use categorical columns as strings
def cat_cols_to_str(df):
    for col in df.columns:
        if pd.api.types.is_categorical_dtype(df[col]):
            df[col] = df[col].astype(str)

    return df

In [4]:
def run_experiment(dataset_name, learner, param_grid, corruptions, fraction, cleaners, num_repetitions, categorical_precision_threshold=0.7, numerical_std_error_threshold=2.0):
    
    ## dataset
    dataset = Dataset(dataset_name)
    
    all_data = dataset.all_data
    attribute_names = dataset.attribute_names
    attribute_types = dataset.attribute_types
    
    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")
    
    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()
    ### if we don't convert the categorical columns to str, the swapping corruption doesn't let us assign new values to the column: "Cannot setitem on a Categorical with a new category, set the categories first"
    df_train = cat_cols_to_str(df_train)
    df_test = cat_cols_to_str(df_test)
    
    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(df_train, lab_train, df_test, lab_test, categorical_columns, numerical_columns, learner, param_grid)
    ppp_model = ppp.fit_ppp(df_train)
    
    ## generate corrupted data
    for _ in range(num_repetitions):
        df_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions, fraction, num_repetitions)
    
    ## cleaning
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
    df_outliers, df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_test, df_corrupted, cols_perturbed)
    
    ## results
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': best_cleaning_score,
        'ppp_scores_cleaners': cleaner_scores_ppp
    }
#     print('\n'.join([f'{key}:{val}' for key, val in result.items()]))
    
    ## summary
    summary = {
        'dataset': dataset_name,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }
#     print('\n\n\n\n'.join([f'{key}:{val}' for key, val in summary.items()]))
    
    return summary #summary_col_corrupt, result

### Altogether

In [5]:
datasets = [
    'thoracic_surgery',
    'cleve',
    'acute-inflammations'
]

In [6]:
## model parameters
## models is a dict where key = leaner & value = param_grid
models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }, 
          RandomForestClassifier():{'learner__n_estimators': [100, 200, 500], 
                                    'learner__max_depth': [5, 10, 15]
                                   }
         }

## make dict of multiple leraners and corresponding param_grids

In [7]:
corruptions = [[MissingValues], 
               [SwappedValues], 
               [CategoricalShift],
               [Scaling], 
               [GaussianNoise],
               [MissingValues, CategoricalShift, Scaling, GaussianNoise, SwappedValues]]

In [8]:
fractions = [0.15, 0.25, 0.5, 0.75, 0.9]

In [9]:
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
]

In [10]:
# stdoutOrigin=sys.stdout 
# sys.stdout = open("/home/rupali/Documents/Master Thesis/jenga/out/experiments.txt", "w")

# for _ in range(10):
#   print("\n\n..................................ITERATION..................................\n")
ind_results = []

for dataset in datasets:
    for learner, param_grid in models.items():
        for corruption in corruptions:
            for fraction in fractions:
                ind_results.append(run_experiment(dataset, learner, param_grid, corruption, fraction, cleaners, 100))
        
# sys.stdout.close()
# sys.stdout=stdoutOrigin

Dataset: thoracic_surgery
Found 13 categorical and 3 numeric features 

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE11', 'fraction': 0.15, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE17', 'fraction': 0.15, 'sampling': 'MAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE17', 'fraction': 0.15, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE6', 'fraction': 0.15, 'sampling': 'MCAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE25', 'fraction': 0.15, 'sampling': 'MAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'

PPP score no cleaning: {'roc_auc_acore': 0.5594771241830065, 'classification_report': {'F': {'precision': 0.8941176470588236, 'recall': 0.8941176470588236, 'f1-score': 0.8941176470588236, 'support': 85}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}, 'accuracy': 0.8085106382978723, 'macro avg': {'precision': 0.4470588235294118, 'recall': 0.4470588235294118, 'f1-score': 0.4470588235294118, 'support': 94}, 'weighted avg': {'precision': 0.8085106382978723, 'recall': 0.8085106382978723, 'f1-score': 0.8085106382978723, 'support': 94}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.425531914893617, 'Recall': 0.5, 'F1-score': 0.4597701149425288, 'Accuracy': 0.851063829787234}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 0.9939024390243902, 'Recall': 0.9615384615384616, 'F1-score': 0.9769325153374233, 'Accuracy': 0.9893617021276596, 'Mean Squared Error': nan}
Cleaner: (NoOutli


Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE19', 'fraction': 0.25, 'sampling': 'MCAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE17', 'fraction': 0.25, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE9', 'fraction': 0.25, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE14', 'fraction': 0.25, 'sampling': 'MCAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE19', 'fraction': 0.25, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE8', 'fraction': 0.25, 'sampling': 'MCAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturb

PPP score no cleaning: {'roc_auc_acore': 0.6330409356725146, 'classification_report': {'F': {'precision': 0.8064516129032258, 'recall': 0.9868421052631579, 'f1-score': 0.8875739644970414, 'support': 76}, 'T': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 18}, 'accuracy': 0.7978723404255319, 'macro avg': {'precision': 0.4032258064516129, 'recall': 0.4934210526315789, 'f1-score': 0.4437869822485207, 'support': 94}, 'weighted avg': {'precision': 0.6520247083047358, 'recall': 0.7978723404255319, 'f1-score': 0.7176129925720761, 'support': 94}}}
PPP scores with cleaning: 

Outlier detection method: NoOutlierDetection, Outlier Detection Score: {'Precision': 0.25, 'Recall': 0.5, 'F1-score': 0.3333333333333333, 'Accuracy': 0.5}
Imputation method: MeanModeImputation, Imputation Score: {'Precision': 1.0, 'Recall': 1.0, 'F1-score': 1.0, 'Accuracy': 1.0, 'Mean Squared Error': nan}
Cleaner: (NoOutlierDetection, MeanModeImputation): {'roc_auc_acore': 0.64546783625731, 'classification_


Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE11', 'fraction': 0.75, 'sampling': 'MAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE7', 'fraction': 0.75, 'sampling': 'MAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE25', 'fraction': 0.75, 'sampling': 'MCAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE25', 'fraction': 0.75, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE4', 'fraction': 0.75, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE30', 'fraction': 0.75, 'sampling': 'MCAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbat


Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'AGE', 'fraction': 0.9, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE19', 'fraction': 0.9, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'AGE', 'fraction': 0.9, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'DGN', 'fraction': 0.9, 'sampling': 'MNAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'AGE', 'fraction': 0.9, 'sampling': 'MCAR', 'na_value': nan}

Generating corrupted training data on 94 rows... 

	perturbation: MissingValues: {'column': 'PRE8', 'fraction': 0.9, 'sampling': 'MAR', 'na_value': nan}

Applying cleaners... 

PPP score no cleaning: {'roc_auc_acore': 0.4117647


Generating corrupted training data on 94 rows... 

	perturbation: SwappedValues: {'column': 'AGE', 'fraction': 0.15, 'sampling': 'MAR', 'swap_with': None}

Generating corrupted training data on 94 rows... 

	perturbation: SwappedValues: {'column': 'PRE25', 'fraction': 0.15, 'sampling': 'MNAR', 'swap_with': None}

Generating corrupted training data on 94 rows... 

	perturbation: SwappedValues: {'column': 'PRE6', 'fraction': 0.15, 'sampling': 'MNAR', 'swap_with': None}

Generating corrupted training data on 94 rows... 

	perturbation: SwappedValues: {'column': 'PRE9', 'fraction': 0.15, 'sampling': 'MCAR', 'swap_with': None}

Generating corrupted training data on 94 rows... 

	perturbation: SwappedValues: {'column': 'PRE10', 'fraction': 0.15, 'sampling': 'MAR', 'swap_with': None}

Generating corrupted training data on 94 rows... 

	perturbation: SwappedValues: {'column': 'PRE10', 'fraction': 0.15, 'sampling': 'MCAR', 'swap_with': None}

Generating corrupted training data on 94 rows... 



ValueError: could not convert string to float: 'F'

In [None]:
# stdoutOrigin=sys.stdout 
# sys.stdout = open("/home/rupali/Documents/Master Thesis/jenga/out/results.txt", "w")

# print(ind_results)
ind_results

# sys.stdout.close()
# sys.stdout=stdoutOrigin

In [9]:
datasets = [
    'thoracic_surgery'
]

## model parameters
## models is a dict where key = leaner & value = param_grid
models = {SGDClassifier(loss='log'): {'learner__max_iter': [500, 1000, 5000], 
                                         'learner__penalty': ['l2', 'l1', 'elasticnet'], 
                                         'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                                        }
         }

## make dict of multiple leraners and corresponding param_grids

corruptions = [[MissingValues]]

fractions = [0.15]

cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
]

# %%time

stdoutOrigin=sys.stdout 
sys.stdout = open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/test_tho_15_miss.txt", "w")

for _ in range(20):
  print("\n\n..................................ITERATION..................................\n")
  ind_results = []

  for dataset in datasets:
    for learner, param_grid in models.items():
      for corruption in corruptions:
        for fraction in fractions:
          ind_results.append(run_experiment(dataset, learner, param_grid, corruption, fraction, cleaners, 100))
        
sys.stdout.close()
sys.stdout=stdoutOrigin

Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.3s finished
  _warn_prf(average, modifier, msg_start, len(result))
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.3s finished
  _warn_prf(average, modifier, msg_start, len(result))
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modi