In [None]:
## mount Google Drive so the project files are reachable from the Colab runtime
from google.colab import drive

drive.mount("/content/drive")

## all the Drive files are available under "/content/drive/My Drive"
# !ls "/content/drive/My Drive/Beuth Uni/Master Thesis/jenga"

import sys

## make the jenga checkout on Drive importable; the membership check keeps the
## cell idempotent, so re-running it does not pile up duplicate sys.path entries
JENGA_DIR = '/content/drive/My Drive/Beuth Uni/Master Thesis/jenga'
if JENGA_DIR not in sys.path:
    sys.path.append(JENGA_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## install third-party packages into the runtime; none of the versions are pinned,
## so a later run may resolve to different releases than the one recorded here
!pip install openml
!pip install pyod

## `--pre --upgrade` allows pre-releases and upgrades an existing install; the
## recorded output shows mxnet-mkl 1.6.0 already up to date on Python 3.6
!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [None]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, AutoGluonOutlierDetection
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, AutoGluonImputation
from jenga.cleaning.clean import Clean

In [None]:
def run_experiment(dataset_name, learner, param_grid, corruptions, fraction, cleaners, num_repetitions, categorical_precision_threshold=0.7, numerical_std_error_threshold=2.0, seed=None):
    """Run one corrupt-then-clean experiment and collect the PPP scores.

    Steps: build a ``Dataset``, fit a ``PipelinePerformancePrediction`` (PPP)
    on the training split, corrupt the test split, run ``Clean`` over the
    corrupted data and gather the resulting scores.

    Parameters
    ----------
    dataset_name : forwarded to ``Dataset`` and echoed in the summary.
    learner : forwarded to ``PipelinePerformancePrediction``; echoed in the
        summary under ``'model'``.
    param_grid : forwarded to ``PipelinePerformancePrediction``.
    corruptions, fraction, num_repetitions : forwarded unchanged to
        ``ppp.get_corrupted`` together with the test split.
    cleaners : forwarded to ``Clean``.
    categorical_precision_threshold, numerical_std_error_threshold :
        forwarded to ``Clean``.
    seed : seed handed to ``Dataset``. When ``None``, a notebook-level
        variable named ``seed`` is used if one exists; otherwise a seed is
        drawn with ``random.randint``. The seed actually used is returned in
        the summary so a run can be repeated.

    Returns
    -------
    dict
        Keys ``'dataset'``, ``'model'``, ``'seed'``, ``'corruptions'``,
        ``'cleaners'`` and ``'result'``. ``'result'`` is itself a dict with
        the PPP score on the untouched test split (``'ppp_score_model'``),
        the score reported for the corrupted data
        (``'ppp_score_corrupted'``), the best score after cleaning
        (``'ppp_score_cleaned'``) and the per-cleaner scores
        (``'ppp_scores_cleaners'``).
    """

    ## seed
    ## The original body read an undefined global `seed`, which raises
    ## NameError on a fresh kernel. Prefer the explicit argument, then a
    ## notebook-level `seed` (keeps old sessions working), then draw one.
    if seed is None:
        seed = globals().get('seed')
    if seed is None:
        ## upper bound stays within what numpy's legacy seeding accepts, in
        ## case Dataset forwards the value there (not visible from this file)
        seed = random.randint(0, 2**32 - 1)

    ## dataset
    dataset = Dataset(seed, dataset_name)

    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()

    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(df_train, lab_train, df_test, lab_test, categorical_columns, numerical_columns, learner, param_grid)
    ppp_model = ppp.fit_ppp(df_train)

    ## generate corrupted data (the list of perturbations itself is not used here)
    df_corrupted, _perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions, fraction, num_repetitions)

    ## cleaning (only the scores and the cleaner summary are kept)
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
    _df_outliers, _df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_test, df_corrupted, cols_perturbed)

    ## results
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': best_cleaning_score,
        'ppp_scores_cleaners': cleaner_scores_ppp
    }

    ## summary
    summary = {
        'dataset': dataset_name,
        'model': learner,
        'seed': seed,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }

    return summary

### Combined Corruptions

In [None]:
## Corruption classes (the classes themselves, not instances) handed to
## run_experiment, which forwards them to ppp.get_corrupted. MissingValues and
## SwappedValues come from jenga.corruptions.generic, Scaling and GaussianNoise
## from jenga.corruptions.numerical.
corruptions = [MissingValues, SwappedValues, Scaling, GaussianNoise]

In [None]:
## Values swept as the `fraction` argument of run_experiment, one run per value.
## NOTE(review): presumably the share of the test data that gets corrupted;
## confirm against ppp.get_corrupted.
fractions = [
    0.15,
    0.25,
    0.5,
    0.75,
    0.9,
]

In [None]:
## Cleaner configurations handed to Clean via run_experiment. Each entry pairs
## a class from jenga.cleaning.outlier_detection with a class from
## jenga.cleaning.imputation. NoImputation is imported above but not used here.
cleaners = [
    ## no outlier detection, mean/mode imputation only
    (NoOutlierDetection, MeanModeImputation),
    ## PyOD KNN detector with each of the two imputers
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, AutoGluonImputation),
    ## PyOD isolation forest detector with each of the two imputers
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, AutoGluonImputation),
    ## AutoGluon detector is only combined with AutoGluon imputation
    (AutoGluonOutlierDetection, AutoGluonImputation)
]

#### Thoracic_surgery

In [None]:
## Dataset name passed to run_experiment as `dataset_name` (forwarded to
## jenga's Dataset). Note that run_experiment uses a local variable that is
## also called `dataset` for the loaded Dataset object.
dataset = 'thoracic_surgery'

##### Stochastic Gradient Descent

In [None]:
## model parameters: linear classifier trained with SGD on the logistic loss
## NOTE: scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed 'log';
## the spelling is kept because the recorded runtime is Python 3.6.
learner = SGDClassifier(loss='log')
## 3 x 3 x 4 = 36 hyper-parameter combinations. Keys use the `<step>__<param>`
## naming; presumably the pipeline step is called 'learner' inside
## PipelinePerformancePrediction (confirm there).
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
    }

In [None]:
## Fresh accumulator: one summary dict per run_experiment call is appended
## below. Rebinding the name drops whatever an earlier experiment collected.
results = []

In [None]:
## Send everything printed during the runs to a log file on Drive.
## The `with` block closes the file and the `finally` restores stdout even when
## run_experiment raises; without it the notebook would keep printing into a
## dangling file handle after a failure.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/Combined/thoracic_surgery_sgd.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    ## 10 outer iterations, each sweeping every fraction; 100 is num_repetitions
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

##### Random Forest

In [None]:
## model parameters: random forest with library defaults apart from the grid
learner = RandomForestClassifier()
## 3 x 3 = 9 hyper-parameter combinations. Keys use the `<step>__<param>`
## naming; presumably the pipeline step is called 'learner' inside
## PipelinePerformancePrediction (confirm there).
param_grid = {
    'learner__n_estimators': [100, 200, 500],
    'learner__max_depth': [5, 10, 15]
    }

In [None]:
## Fresh accumulator for the random forest runs. The summaries collected by
## the preceding experiment are dropped here (not persisted in the cells shown).
results = []

In [None]:
## Send everything printed during the runs to a log file on Drive.
## The `with` block closes the file and the `finally` restores stdout even when
## run_experiment raises; without it the notebook would keep printing into a
## dangling file handle after a failure.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/Combined/thoracic_surgery_rf.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    ## 10 outer iterations, each sweeping every fraction; 100 is num_repetitions
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

#### cleve

In [None]:
## Dataset name passed to run_experiment as `dataset_name`; the recorded output
## below shows it being fetched as OpenML dataset 40710.
dataset = 'cleve'

##### Stochastic Gradient Descent

In [None]:
## model parameters: linear classifier trained with SGD on the logistic loss
## NOTE: scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed 'log';
## the spelling is kept because the recorded runtime is Python 3.6.
learner = SGDClassifier(loss='log')
## 3 x 3 x 4 = 36 hyper-parameter combinations. Keys use the `<step>__<param>`
## naming; presumably the pipeline step is called 'learner' inside
## PipelinePerformancePrediction (confirm there).
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
    }

In [None]:
## Fresh accumulator for the cleve / SGD runs. The summaries collected by the
## preceding experiment are dropped here (not persisted in the cells shown).
results = []

In [None]:
## Send everything printed during the runs to a log file on Drive.
## The `with` block closes the file and the `finally` restores stdout even when
## run_experiment raises; without it the notebook would keep printing into a
## dangling file handle after a failure.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/Combined/cleve_sgd.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    ## 10 outer iterations, each sweeping every fraction; 100 is num_repetitions
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

##### Random Forest

In [None]:
## model parameters: random forest with library defaults apart from the grid
learner = RandomForestClassifier()
## 3 x 3 = 9 hyper-parameter combinations; the recorded log below reports 45
## fits per search, which is consistent with 5 fits per combination.
## Keys use the `<step>__<param>` naming; presumably the pipeline step is
## called 'learner' inside PipelinePerformancePrediction (confirm there).
param_grid = {
    'learner__n_estimators': [100, 200, 500],
    'learner__max_depth': [5, 10, 15]
    }

In [None]:
## Fresh accumulator for the cleve / random forest runs. The summaries of the
## preceding experiment are dropped here (not persisted in the cells shown).
results = []

In [None]:
## Send everything printed during the runs to a log file on Drive.
## The `with` block closes the file and the `finally` restores stdout even when
## run_experiment raises; without it the notebook would keep printing into a
## dangling file handle after a failure.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/Combined/cleve_rf.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    ## 10 outer iterations, each sweeping every fraction; 100 is num_repetitions
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Saved dataset 40710: cleve to file /root/.openml/cache/org/openml/www/datasets/40710/dataset.pkl.py3
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.7s finished
NumExpr defaulting to 2 threads.
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.5s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.4s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.7s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers

#### Acute-inflammations

In [None]:
## Dataset name passed to run_experiment as `dataset_name`; the recorded output
## below shows it being fetched as OpenML dataset 1455.
dataset = 'acute-inflammations'

##### Stochastic Gradient Descent

In [None]:
## model parameters: linear classifier trained with SGD on the logistic loss
## NOTE: scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 removed 'log';
## the spelling is kept because the recorded runtime is Python 3.6.
learner = SGDClassifier(loss='log')
## 3 x 3 x 4 = 36 hyper-parameter combinations; the recorded log below reports
## 180 fits per search, which is consistent with 5 fits per combination.
## Keys use the `<step>__<param>` naming; presumably the pipeline step is
## called 'learner' inside PipelinePerformancePrediction (confirm there).
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
    }

In [None]:
## Fresh accumulator for the acute-inflammations / SGD runs. The summaries of
## the preceding experiment are dropped here (not persisted in the cells shown).
results = []

In [None]:
## Send everything printed during the runs to a log file on Drive.
## The `with` block closes the file and the `finally` restores stdout even when
## run_experiment raises; without it the notebook would keep printing into a
## dangling file handle after a failure.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/Combined/acute-inflammations_sgd.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    ## 10 outer iterations, each sweeping every fraction; 100 is num_repetitions
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Saved dataset 1455: acute-inflammations to file /root/.openml/cache/org/openml/www/datasets/1455/dataset.pkl.py3
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    4.9s finished
NumExpr defaulting to 2 threads.
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.3s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.4s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers

##### Random Forest

In [None]:
## model parameters: random forest with library defaults apart from the grid
learner = RandomForestClassifier()
## 3 x 3 = 9 hyper-parameter combinations; the recorded log below reports 45
## fits per search, which is consistent with 5 fits per combination.
## Keys use the `<step>__<param>` naming; presumably the pipeline step is
## called 'learner' inside PipelinePerformancePrediction (confirm there).
param_grid = {
    'learner__n_estimators': [100, 200, 500],
    'learner__max_depth': [5, 10, 15]
    }

In [None]:
## Fresh accumulator for the acute-inflammations / random forest runs. The
## summaries of the preceding experiment are dropped here (not persisted in the
## cells shown).
results = []

In [None]:
## Send everything printed during the runs to a log file on Drive.
## The `with` block closes the file and the `finally` restores stdout even when
## run_experiment raises; without it the notebook would keep printing into a
## dangling file handle after a failure.
## NOTE(review): the file name ends in "_rf_.txt", unlike the other logs which
## end in "_rf.txt"; left as is in case it is deliberate. Confirm.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/Combined/acute-inflammations_rf_.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    ## 10 outer iterations, each sweeping every fraction; 100 is num_repetitions
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.0s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.9s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.9s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.8s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.1s finished
Data pickl