In [1]:
## Mount Google Drive so the notebook can reach the files stored there.
from google.colab import drive

drive.mount("/content/drive")

## After mounting, the Drive files are available under "/content/drive/My Drive".
## Uncomment the line below to list the jenga folder and verify the mount.
# !ls "/content/drive/My Drive/Beuth Uni/Master Thesis/jenga"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys

## Make the jenga checkout on Drive importable (`from jenga... import ...`).
## The membership check keeps this cell idempotent: re-running it does not
## pile up duplicate entries on sys.path.
JENGA_DIR = '/content/drive/My Drive/Beuth Uni/Master Thesis/jenga'
if JENGA_DIR not in sys.path:
    sys.path.append(JENGA_DIR)

In [3]:
## Install third-party packages into the Colab runtime.
## NOTE(review): no versions are pinned, so a fresh runtime may resolve
## different releases than the ones the saved outputs were produced with.
!pip install openml
!pip install pyod

## The last line upgrades mxnet-mkl to the newest pre-release (--pre --upgrade).
!pip install mxnet autogluon
!pip install mxnet-mkl --pre --upgrade

Requirement already up-to-date: mxnet-mkl in /usr/local/lib/python3.6/dist-packages (1.6.0)


In [4]:
import random
import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNNOutlierDetection, PyODIsolationForestOutlierDetection, AutoGluonOutlierDetection
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, AutoGluonImputation
from jenga.cleaning.clean import Clean

In [5]:
def run_experiment(dataset_name, learner, param_grid, corruptions, fraction, cleaners, num_repetitions, categorical_precision_threshold=0.7, numerical_std_error_threshold=2.0):
    """Corrupt a dataset's test split, clean it, and summarize the PPP scores.

    Steps: load the dataset, fit a pipeline performance predictor (PPP) on the
    training split, corrupt the test split, run the candidate cleaners on the
    corrupted data and collect the resulting scores.

    Args:
        dataset_name: name handed to ``Dataset`` to load the data.
        learner: estimator handed to ``PipelinePerformancePrediction``.
        param_grid: hyper-parameter grid handed to ``PipelinePerformancePrediction``.
        corruptions: corruption classes forwarded to ``ppp.get_corrupted``.
        fraction: corruption fraction forwarded to ``ppp.get_corrupted``.
        cleaners: cleaner combinations forwarded to ``Clean``.
        num_repetitions: repetition count forwarded to ``ppp.get_corrupted``;
            must be at least 1.
        categorical_precision_threshold: threshold forwarded to ``Clean``.
        numerical_std_error_threshold: threshold forwarded to ``Clean``.

    Returns:
        dict with the keys ``'dataset'``, ``'model'``, ``'corruptions'``,
        ``'cleaners'`` and ``'result'``. ``'result'`` is itself a dict holding
        the PPP score on the untouched test data, on the corrupted data, the
        best score after cleaning, and the per-cleaner scores.

    Raises:
        ValueError: if ``num_repetitions`` is smaller than 1.
    """
    # Fail fast, before any dataset loading or model fitting. The previous
    # implementation only failed after fitting, with an unbound-variable error.
    if num_repetitions < 1:
        raise ValueError(f"num_repetitions must be >= 1, got {num_repetitions!r}")

    ## dataset
    dataset = Dataset(dataset_name)

    ## categorical and numerical features
    categorical_columns = dataset.categorical_columns
    numerical_columns = dataset.numerical_columns
    print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features \n")

    ## train and test data
    df_train, lab_train, df_test, lab_test = dataset.get_train_test_data()

    ## pipeline performance prediction (ppp)
    ppp = PipelinePerformancePrediction(df_train, lab_train, df_test, lab_test, categorical_columns, numerical_columns, learner, param_grid)
    ppp_model = ppp.fit_ppp(df_train)

    ## generate corrupted data
    # A single call is enough: num_repetitions is already forwarded to
    # get_corrupted. The former outer loop re-ran this call num_repetitions
    # times and kept only the last result.
    # NOTE(review): assumes get_corrupted has no cumulative side effects on
    # `ppp` or `df_test` -- confirm against jenga.cleaning.ppp.
    df_corrupted, _perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(df_test, corruptions, fraction, num_repetitions)

    ## cleaning
    clean = Clean(df_train, df_corrupted, categorical_columns, numerical_columns, categorical_precision_threshold, numerical_std_error_threshold, ppp, ppp_model, cleaners)
    _df_outliers, _df_cleaned, corrupted_score_ppp, best_cleaning_score, cleaner_scores_ppp, summary_cleaners = clean(df_train, df_test, df_corrupted, cols_perturbed)

    ## results
    result = {
        'ppp_score_model': ppp.predict_score_ppp(ppp_model, df_test),
        'ppp_score_corrupted': corrupted_score_ppp,
        'ppp_score_cleaned': best_cleaning_score,
        'ppp_scores_cleaners': cleaner_scores_ppp
    }

    ## summary
    summary = {
        'dataset': dataset_name,
        'model': learner,
        'corruptions': summary_col_corrupt,
        'cleaners': summary_cleaners,
        'result': result
    }

    return summary

### Missing Values

In [6]:
## Corruption classes for this section: only MissingValues.
## The list is passed to run_experiment, which forwards it to ppp.get_corrupted.
corruptions = [MissingValues]

In [7]:
## Corruption fractions to sweep; the experiment loops below pass each value
## as the `fraction` argument of run_experiment.
fractions = [0.15, 0.25, 0.5, 0.75, 0.9]

In [8]:
## Candidate cleaners as (outlier detection class, imputation class) pairs.
## The list is passed to run_experiment, which hands it to Clean.
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, MeanModeImputation),
    (PyODKNNOutlierDetection, AutoGluonImputation),
    (PyODIsolationForestOutlierDetection, MeanModeImputation),
    (PyODIsolationForestOutlierDetection, AutoGluonImputation),
    (AutoGluonOutlierDetection, AutoGluonImputation)
]

#### Thoracic_surgery

In [9]:
## Dataset name for this section; passed to run_experiment as dataset_name.
dataset = 'thoracic_surgery'

##### Stochastic Gradient Descent

In [10]:
## model parameters
## Learner and hyper-parameter grid passed to run_experiment. The grid keys
## carry a 'learner__' prefix -- presumably the name of the estimator step in
## the pipeline jenga builds; TODO confirm.
## NOTE(review): scikit-learn >= 1.1 renames loss='log' to loss='log_loss';
## adjust if the runtime's scikit-learn is upgraded.
learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
    }

In [11]:
## Collects the summary dict returned by each run_experiment call below.
results = []

In [12]:
## Send everything printed during the experiments to a log file on Drive.
## `with` + try/finally guarantee that the log file is closed and the
## notebook's stdout is restored even if one of the runs raises.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/MissingValues/thoracic_surgery_sgd.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    self._fit(**kwargs)
  File "/usr/local/lib/python3.6/dist-packages/autogluon/utils/tabular/ml/models/ensemble/weighted_ensemble_model.py", line 23, in _fit
    super()._fit(X, y, k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_repeat_start, compute_base_preds=compute_base_preds, time_limit=time_limit, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/autogluon/utils/tabular/ml/models/ensemble/stacker_ensemble_model.py", line 132, in _fit
    super()._fit(X=X, y=y, k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_repeat_start, time_limit=time_limit, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/autogluon/utils/tabular/ml/models/ensemble/bagged_ensemble_model.py", line 127, in _fit
    model_base.fit(X_train=X, y_train=y, time_limit=time_limit, **kwargs)
  File "/usr/local/lib/python3.6/dist-pac

##### Random Forest

In [10]:
## model parameters
## Learner and hyper-parameter grid passed to run_experiment. The grid keys
## carry a 'learner__' prefix -- presumably the name of the estimator step in
## the pipeline jenga builds; TODO confirm.
learner = RandomForestClassifier()
param_grid = {
    'learner__n_estimators': [100, 200, 500],
    'learner__max_depth': [5, 10, 15]
    }

In [11]:
## Start a fresh list for this learner; it collects the summary dict returned
## by each run_experiment call below.
results = []

In [12]:
## Send everything printed during the experiments to a log file on Drive.
## `with` + try/finally guarantee that the log file is closed and the
## notebook's stdout is restored even if one of the runs raises.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/MissingValues/thoracic_surgery_rf.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Saved dataset 4329: thoracic_surgery to file /root/.openml/cache/org/openml/www/datasets/4329/dataset.pkl.py3
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   18.3s finished
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/autogluon/utils/tabular/ml/trainer/abstract_trainer.py", line 269, in train_and_save
    score = model.score(X=X_val, y=y_val)
  File "/usr/local/lib/python3.6/dist-packages/autogluon/utils/tabular/ml/models/abstract/abstract_model.py", line 316, in score
    y_pred = self.predict(X=X, preprocess=preprocess)
  File "/usr/local/lib/python3.6/dist-packages/autogluon/utils/tabular/ml/models/abstract/abstract_model.py", line 277, in predict
    y_pred_proba = self.predict_proba(X, preprocess=preprocess)
  File "/usr/local/lib/python3.6/dist-packages/autogluon/utils/tabular/ml/models/abstract/abstract_model.py", line 284, in predict_proba
    y_pred_proba = 

#### cleve

In [9]:
## Dataset name for this section; passed to run_experiment as dataset_name.
dataset = 'cleve'

##### Stochastic Gradient Descent

In [10]:
## model parameters
## Learner and hyper-parameter grid passed to run_experiment. The grid keys
## carry a 'learner__' prefix -- presumably the name of the estimator step in
## the pipeline jenga builds; TODO confirm.
## NOTE(review): scikit-learn >= 1.1 renames loss='log' to loss='log_loss';
## adjust if the runtime's scikit-learn is upgraded.
learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
    }

In [11]:
## Start a fresh list for this dataset/learner; it collects the summary dict
## returned by each run_experiment call below.
results = []

In [12]:
## Send everything printed during the experiments to a log file on Drive.
## `with` + try/finally guarantee that the log file is closed and the
## notebook's stdout is restored even if one of the runs raises.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/MissingValues/cleve_sgd.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Saved dataset 40710: cleve to file /root/.openml/cache/org/openml/www/datasets/40710/dataset.pkl.py3
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.6s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.4s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.9s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      

##### Random Forest

In [10]:
## model parameters
## Learner and hyper-parameter grid passed to run_experiment. The grid keys
## carry a 'learner__' prefix -- presumably the name of the estimator step in
## the pipeline jenga builds; TODO confirm.
learner = RandomForestClassifier()
param_grid = {
    'learner__n_estimators': [100, 200, 500],
    'learner__max_depth': [5, 10, 15]
    }

In [11]:
## Start a fresh list for this learner; it collects the summary dict returned
## by each run_experiment call below.
results = []

In [12]:
## Send everything printed during the experiments to a log file on Drive.
## `with` + try/finally guarantee that the log file is closed and the
## notebook's stdout is restored even if one of the runs raises.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/MissingValues/cleve_rf.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Saved dataset 40710: cleve to file /root/.openml/cache/org/openml/www/datasets/40710/dataset.pkl.py3
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.5s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.4s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.2s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.2s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45

#### Acute-inflammations

In [9]:
## Dataset name for this section; passed to run_experiment as dataset_name.
dataset = 'acute-inflammations'

##### Stochastic Gradient Descent

In [10]:
## model parameters
## Learner and hyper-parameter grid passed to run_experiment. The grid keys
## carry a 'learner__' prefix -- presumably the name of the estimator step in
## the pipeline jenga builds; TODO confirm.
## NOTE(review): scikit-learn >= 1.1 renames loss='log' to loss='log_loss';
## adjust if the runtime's scikit-learn is upgraded.
learner = SGDClassifier(loss='log')
param_grid = {
    'learner__max_iter': [500, 1000, 5000],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
    }

In [13]:
## Start a fresh list for this dataset/learner; it collects the summary dict
## returned by each run_experiment call below.
results = []

In [14]:
## Send everything printed during the experiments to a log file on Drive.
## `with` + try/finally guarantee that the log file is closed and the
## notebook's stdout is restored even if one of the runs raises.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/MissingValues/acute-inflammations_sgd.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    5.0s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.5s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.5s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 164 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 180

##### Random Forest

In [15]:
## model parameters
## Learner and hyper-parameter grid passed to run_experiment. The grid keys
## carry a 'learner__' prefix -- presumably the name of the estimator step in
## the pipeline jenga builds; TODO confirm.
learner = RandomForestClassifier()
param_grid = {
    'learner__n_estimators': [100, 200, 500],
    'learner__max_depth': [5, 10, 15]
    }

In [16]:
## Start a fresh list for this learner; it collects the summary dict returned
## by each run_experiment call below.
results = []

In [17]:
## Send everything printed during the experiments to a log file on Drive.
## `with` + try/finally guarantee that the log file is closed and the
## notebook's stdout is restored even if one of the runs raises.
stdoutOrigin = sys.stdout
with open("/content/drive/My Drive/Beuth Uni/Master Thesis/jenga/out/MissingValues/acute-inflammations_rf.txt", "w") as log_file:
  sys.stdout = log_file
  try:
    for _ in range(10):
      print("............................................. ITERATION .............................................")
      for fraction in fractions:
        results.append(run_experiment(dataset, learner, param_grid, corruptions, fraction, cleaners, 100))
  finally:
    sys.stdout = stdoutOrigin

Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.3s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.0s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.9s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   15.0s finished
Data pickle file already exists and is up to date.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   14.9s finished
Data pickl