## Adjustment for Google Colab

In [None]:
# mount Google Drive to access the project files stored there
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive (only applicable when running in Colab).
drive.mount("/content/drive")

In [None]:
# All Drive files are available under "/content/drive/My Drive";
# list the thesis folder to check that the mount worked.
!ls "/content/drive/My Drive/Beuth Uni/Master Thesis"

In [None]:
import sys

# Folder on the mounted Drive that is added to the import path
# (presumably the checkout providing the `jenga` package imported further down).
JENGA_PATH = '/content/drive/My Drive/Beuth Uni/Master Thesis/jenga'

# Append only once so that re-running this cell does not pile up duplicate entries.
if JENGA_PATH not in sys.path:
    sys.path.append(JENGA_PATH)

In [None]:
# Uncomment to install the `openml` package if it is missing from the runtime:
#! pip install openml

In [1]:
import numpy as np
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from jenga.basis import Dataset
from jenga.corruptions.generic import MissingValues, SwappedValues
from jenga.corruptions.numerical import Scaling, GaussianNoise
from jenga.cleaning.ppp import PipelinePerformancePrediction
from jenga.cleaning.outlier_detection import NoOutlierDetection, PyODKNN, PyODIsolationForest
from jenga.cleaning.imputation import NoImputation, MeanModeImputation, DatawigImputation
from jenga.cleaning.clean import Clean

In [2]:
# Seed passed to the dataset loader and to the PPP setup further down.
seed = 10

## Dataset

In [3]:
# Load the "credit-g" dataset through jenga's Dataset class, passing the notebook seed.
dataset = Dataset(seed, "credit-g")

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    

**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)



**German Credit data**  

This dataset classifies people described by a set of attributes as good or bad credit risks.



This dataset comes with a cost matrix: 

``` 

      Good  Bad (predicted)  

Good   0    1   (actual)  

Bad    5    0  

```



It is worse to class a customer as go

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


In [4]:
# Pull `all_data` from the dataset object; uncomment the last line to inspect it.
all_data = dataset.all_data
# all_data

In [5]:
# Pull `attribute_names` from the dataset object; uncomment the last line to inspect it.
attribute_names = dataset.attribute_names
# attribute_names

In [6]:
# Pull `attribute_types` from the dataset object; uncomment the last line to inspect it.
attribute_types = dataset.attribute_types
# attribute_types

### Categorical and Numerical Features

In [7]:
# Categorical column group as reported by the dataset object; it is used by the
# preprocessing pipeline, the PPP setup and the cleaning step.
categorical_columns = dataset.categorical_columns
# categorical_columns

In [8]:
# Numerical column group as reported by the dataset object; it is used by the
# preprocessing pipeline, the PPP setup and the cleaning step.
numerical_columns = dataset.numerical_columns
# numerical_columns

In [9]:
# Report how many columns fall into each feature group.
print(f"Found {len(categorical_columns)} categorical and {len(numerical_columns)} numeric features")

Found 13 categorical and 7 numeric features


## Model

### Model parameters

In [10]:
## model parameters
# Linear classifier trained with SGD. `random_state` is tied to the notebook seed
# so that a fresh "Restart & Run All" fits the same model instead of a differently
# shuffled one on every run.
learner = SGDClassifier(max_iter=1000, random_state=seed)

# Hyper-parameter grid; the `learner__` prefix addresses the pipeline step named 'learner'.
# NOTE(review): scikit-learn 1.1 renamed the 'log' loss to 'log_loss' and 1.3 removed
# 'log' -- change this value when running on a newer scikit-learn.
param_grid = {
    'learner__loss': ['log'],
    'learner__penalty': ['l2', 'l1', 'elasticnet'],
    'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
}

### Preprocessing Pipeline

In [11]:
## preprocessing: one sub-pipeline per feature type, combined column-wise

# numerical columns: fill missing values with 0, then standardize
transformer_numeric = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('standard_scale', StandardScaler()),
])

# categorical columns: fill missing values with the '__NA__' placeholder, then
# one-hot encode (categories not seen during fit are ignored at transform time)
transformer_categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value='__NA__', strategy='constant')),
    ('one_hot_encode', OneHotEncoder(handle_unknown='ignore')),
])

# route each column group through its own sub-pipeline
feature_transform = ColumnTransformer([
    ('categorical_features', transformer_categorical, categorical_columns),
    ('numerical_features', transformer_numeric, numerical_columns),
])

### Prediction Pipeline

In [12]:
## prediction pipeline: feature preprocessing followed by the classifier (learner)
pipeline = Pipeline(steps=[
    ('features', feature_transform),
    ('learner', learner),
])

### Train and Test Data

In [13]:
# Train/test split as provided by the dataset object (data and labels for each part).
train_data, train_labels, test_data, test_labels = dataset.get_train_test_data()

# Uncomment to preview the first rows / labels of the training split:
# display(train_data.head())
# print(train_labels[0:5])

# Uncomment to preview the first rows / labels of the test split:
# display(test_data.head())
# print(test_labels[0:5])

## Corruptions

In [14]:
# Corruption types handed to the PPP setup and to `get_corrupted`;
# a type that is listed several times is applied several times.
corruptions = [
    MissingValues, Scaling, GaussianNoise,
    MissingValues, MissingValues,
    Scaling, Scaling,
]

In [15]:
# Set up the pipeline performance predictor (PPP) from the seed, the data splits,
# the column groups, the learner with its parameter grid, and the corruption types.
ppp = PipelinePerformancePrediction(
    seed,
    train_data, train_labels,
    test_data, test_labels,
    categorical_columns, numerical_columns,
    learner, param_grid,
    corruptions,
)

In [16]:
# generate corrupted test data; besides the corrupted frame the call also returns
# the applied perturbations, the perturbed columns and a per-column summary
test_data_corrupted, perturbations, cols_perturbed, summary_col_corrupt = ppp.get_corrupted(test_data, corruptions)

Generating corrupted training data on 200 rows...
Applying perturbations...
MissingValues: {'column': 'other_parties', 'fraction': 0.75, 'na_value': nan, 'missingness': 'MCAR'}
Scaling: {'column': 'age', 'fraction': 0.5}
GaussianNoise: {'column': 'credit_amount', 'fraction': 0.25}
MissingValues: {'column': 'existing_credits', 'fraction': 0.5, 'na_value': nan, 'missingness': 'MAR'}
MissingValues: {'column': 'other_payment_plans', 'fraction': 0.25, 'na_value': nan, 'missingness': 'MNAR'}
Scaling: {'column': 'duration', 'fraction': 0.5}
Scaling: {'column': 'existing_credits', 'fraction': 0.25}


In [17]:
# Show which corruption objects were applied to each column.
summary_col_corrupt

defaultdict(list,
            {('other_parties',): [<jenga.corruptions.generic.MissingValues at 0x1e87e10d828>],
             ('age',): [<jenga.corruptions.numerical.Scaling at 0x1e87e116b00>],
             ('credit_amount',): [<jenga.corruptions.numerical.GaussianNoise at 0x1e87e10d6d8>],
             ('existing_credits',): [<jenga.corruptions.generic.MissingValues at 0x1e87e10d550>,
              <jenga.corruptions.numerical.Scaling at 0x1e87e116fd0>],
             ('other_payment_plans',): [<jenga.corruptions.generic.MissingValues at 0x1e87e116f98>],
             ('duration',): [<jenga.corruptions.numerical.Scaling at 0x1e87e1164a8>]})

## Cleaning

In [18]:
# (outlier detection, imputation) pairs to evaluate as cleaners.
# Commented-out pairs are alternatives that are currently disabled.
cleaners = [
    (NoOutlierDetection, MeanModeImputation),
    # (NoOutlierDetection, DatawigImputation),
    # (PyODKNN, NoImputation),
    (PyODKNN, MeanModeImputation),
    # (PyODKNN, DatawigImputation),
    # (PyODIsolationForest, NoImputation),
    (PyODIsolationForest, MeanModeImputation)
    # (PyODIsolationForest, DatawigImputation)
]

In [19]:
# Fit the performance predictor on the training data.
ppp_model = ppp.fit_ppp(train_data)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.4s finished


In [20]:
# Build the cleaning step from the data, the column groups, the PPP objects
# and the cleaner combinations to try.
clean = Clean(
    train_data,
    test_data_corrupted,
    categorical_columns, numerical_columns,
    ppp, ppp_model,
    cleaners,
)

In [21]:
# Run the cleaners on the corrupted test data; yields the cleaned data, the PPP score
# without cleaning, and the list of PPP scores obtained with the cleaners.
test_data_cleaned, score_no_cleaning, cleaner_scores_ppp = clean(train_data, test_data_corrupted)

PPP score no cleaning: 0.5513090229079008
Outlier detection method: <jenga.cleaning.outlier_detection.NoOutlierDetection object at 0x000001E87E116C18>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001E87E1755C0>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001E86E509550>: 0.5494389901823283 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODKNN object at 0x000001E87E1EAE10>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001E87E1EAEB8>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001E87E1EAA90>: 0.5494389901823283 

Outlier detection method: <jenga.cleaning.outlier_detection.PyODIsolationForest object at 0x000001E87E1EA978>
Imputation method: <jenga.cleaning.imputation.MeanModeImputation object at 0x000001E87E1EAD30>
PPP score with cleaning: <jenga.cleaning.cleaner.Cleaner object at 0x000001E87E1EAC18>: 0.5494389901823283 

Cleaning didnt't improve the 

## Results

### Model Results

In [22]:
# fit the prediction pipeline (preprocessing + learner) on the training data
pipeline.fit(train_data, train_labels)

Pipeline(memory=None,
         steps=[('features',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical_features',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value='__NA__',
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                          

In [23]:
# test score of the fitted pipeline on the original (uncorrupted) test data
pipeline.score(test_data, test_labels)

0.65

In [24]:
# test score of the fitted pipeline on the corrupted test data
pipeline.score(test_data_corrupted, test_labels)

0.6

In [25]:
# test score of the fitted pipeline on the cleaned test data
pipeline.score(test_data_cleaned, test_labels)

0.605

### PPP Results

In [26]:
# PPP score predicted for the original (uncorrupted) test data
ppp.predict_score_ppp(ppp_model, test_data)

0.8113604488078542

In [27]:
# PPP score for the corrupted test data without any cleaning
score_no_cleaning

0.5513090229079008

In [28]:
# best PPP score reached by any of the cleaners
np.max(cleaner_scores_ppp)

0.5494389901823283

In [29]:
# PPP scores obtained with the individual cleaners
cleaner_scores_ppp

[0.5494389901823283, 0.5494389901823283, 0.5494389901823283]