In [None]:
# mount Google Drive for access to the thesis files and the jenga package
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount("/content/drive")

In [None]:
# once mounted, all Drive files are available under "/content/drive/My Drive"
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

In [None]:
import sys

# Make the jenga checkout on Drive importable (the jenga.* imports rely on it).
# The membership check keeps the cell idempotent: re-running it does not add
# the same entry to sys.path a second time.
JENGA_DIR = '/content/drive/My Drive/Beuth Uni/Master Thesis/jenga'
if JENGA_DIR not in sys.path:
    sys.path.append(JENGA_DIR)

In [None]:
# uncomment the next line if openml is not already installed in the runtime
#! pip install openml

In [None]:
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.cleaning.ppp import PipelinePerformancePrediction

In [None]:
# Seed handed to Dataset and PipelinePerformancePrediction below.
seed = 10

In [None]:
# Build the jenga Dataset for the name "credit-g", passing the shared seed.
# NOTE(review): presumably the data is fetched from OpenML by name — confirm.
dataset = Dataset(seed, "credit-g")

In [None]:
# Bind the dataset's full data to a local name and show it as the cell output.
all_data = dataset.all_data
all_data

In [None]:
# Bind the dataset's attribute names to a local name and display them.
attribute_names = dataset.attribute_names
attribute_names

In [None]:
# Bind the dataset's attribute types to a local name and display them.
attribute_types = dataset.attribute_types
attribute_types

In [None]:
# Categorical columns as reported by the dataset; reused below for the
# categorical preprocessing branch and passed to PipelinePerformancePrediction.
categorical_columns = dataset.categorical_columns
categorical_columns

In [None]:
# Numerical columns as reported by the dataset; reused below for the
# numerical preprocessing branch and passed to PipelinePerformancePrediction.
numerical_columns = dataset.numerical_columns
numerical_columns

In [None]:
# Report how many columns fall into each of the two feature groups.
print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features")

In [None]:
## model parameters

# Seed the learner too: SGDClassifier shuffles the training data by default,
# so without a fixed random_state the fitted model (and every score computed
# from it) can differ from run to run.
learner = SGDClassifier(max_iter=1000, random_state=seed)

# Hyperparameter grid handed to PipelinePerformancePrediction.
# The 'learner__' prefix presumably addresses the pipeline step named
# 'learner' — confirm against the pipeline PPP builds.
# NOTE(review): scikit-learn 1.1 renamed the 'log' loss to 'log_loss' and
# 1.3 removed the old name — switch the value if the runtime uses a newer
# release.
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
## preprocessing: one sub-pipeline per column type, combined column-wise

# categorical columns: fill missing values with the '__NA__' placeholder,
# then one-hot encode (categories unseen during fit are ignored at transform)
transformer_categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore')),
])

# numerical columns: fill missing values with 0, then standardize
transformer_numeric = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('standard_scale', StandardScaler()),
])

# route each column group through its own sub-pipeline
feature_transform = ColumnTransformer([
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns),
])

In [None]:
## prediction pipeline: preprocessing first, then the classifier (learner)
# NOTE(review): this object is not passed to PipelinePerformancePrediction,
# and `pipeline` is later rebound to ppp.pipeline — confirm it is still needed.
pipeline = Pipeline(steps=[
    ('features', feature_transform),
    ('learner', learner),
])

In [None]:
# Obtain the train/test split from the dataset.
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

# Preview each split: first rows of the features, then the first five labels.
for split_frame, split_labels in ((train_data, train_labels), (test_data, test_labels)):
    display(split_frame.head())
    print(split_labels[0:5])

In [None]:
# Set up the performance predictor from the seed, both data splits, the two
# column groups, the learner and its hyperparameter grid.
ppp = PipelinePerformancePrediction(
    seed,
    train_data, train_labels,
    test_data, test_labels,
    categorical_columns, numerical_columns,
    learner, param_grid,
)

In [None]:
# Fit via ppp.fit_ppp on the training data; the returned object is what
# predict_score_ppp is called with in the next cell.
ppp_model = ppp.fit_ppp(train_data)

In [None]:
# Score ppp_model on the unmodified test data and display the result.
score_no_cleaning = ppp.predict_score_ppp(ppp_model, test_data)
score_no_cleaning

In [None]:
# generate corrupted test data
# NOTE(review): going by the names bound here, get_corrupted also returns the
# applied perturbations and the affected columns — confirm against jenga.
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
## clean

In [None]:
# test without given pipeline
# (rebinds `ppp` to a fresh instance built from the same arguments as before)
ppp = PipelinePerformancePrediction(
    seed,
    train_data, train_labels,
    test_data, test_labels,
    categorical_columns, numerical_columns,
    learner, param_grid,
)

In [None]:
# Rebind `pipeline` to the pipeline held by the ppp instance and display it.
pipeline = ppp.pipeline
pipeline

In [None]:
# model trained on the original (uncorrupted) train data; reused for both
# scoring cells below
model_orig = ppp.fit_ppp(train_data)

In [None]:
# generate corrupted test data
# (overwrites test_data_corrupted, perturbations and cols_perturbed from the
# earlier get_corrupted call)
test_data_corrupted, perturbations, cols_perturbed = ppp.get_corrupted(test_data)

In [None]:
# score of model_orig on the original test data (displayed as cell output,
# not stored)
ppp.predict_score_ppp(model_orig, test_data)

In [None]:
# score of model_orig on the corrupted test data (displayed as cell output,
# not stored); compare with the score on the original test data
ppp.predict_score_ppp(model_orig, test_data_corrupted)