In [1]:
!pip install openml



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [101]:
import openml
import pandas as pd
import numpy as np
import random
from abc import ABC

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from jenga.basis import BinaryClassificationTask
from jenga.corruptions.generic import MissingValues

### Datasets

In [84]:
data = openml.datasets.get_dataset("credit-g")

In [4]:
data

OpenML Dataset
Name..........: credit-g
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:47
Licence.......: Public
Download URL..: https://www.openml.org/data/v1/download/31/credit-g.arff
OpenML URL....: https://www.openml.org/d/31
# of features.: 21
# of instances: 1000

In [5]:
## summary
print(f"Dataset '{data.name}', target: '{data.default_target_attribute}'")
print(data.description[:500])

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    
**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)

**German Credit data**  
This dataset classifies people described by a set of attributes as good or bad credit risks.

This dataset comes with a cost matrix: 
``` 
      Good  Bad (predicted)  
Good   0    1   (actual)  
Bad    5    0  
```

It is worse to class a customer as good when they a


In [6]:
## load the data
# X: An array/dataframe where each row represents one example with the corresponding feature values
# y: the classes for each example
# categorical_indicator - an array that indicates which feature is categorical
# attribute_names - the names of the features for the examples(X) and target feature (y)

X, y, categorical_indicator, attribute_names = data.get_data(
    dataset_format='dataframe',
    target=data.default_target_attribute
)

In [7]:
X

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,4.0,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,4.0,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,4.0,car,38.0,none,own,1.0,skilled,1.0,none,yes
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,4.0,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes


In [8]:
y

0      good
1       bad
2      good
3      good
4       bad
       ... 
995    good
996    good
997    good
998     bad
999    good
Name: class, Length: 1000, dtype: category
Categories (2, object): [good < bad]

In [19]:
all_data = X.copy(deep=True)
all_data['class'] = y
all_data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,...,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes,good
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,...,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes,good
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,...,car,38.0,none,own,1.0,skilled,1.0,none,yes,good
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,...,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes,bad


In [10]:
attribute_names

['checking_status',
 'duration',
 'credit_history',
 'purpose',
 'credit_amount',
 'savings_status',
 'employment',
 'installment_commitment',
 'personal_status',
 'other_parties',
 'residence_since',
 'property_magnitude',
 'age',
 'other_payment_plans',
 'housing',
 'existing_credits',
 'job',
 'num_dependents',
 'own_telephone',
 'foreign_worker']

In [11]:
categorical_indicator

[True,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 True]

In [30]:
attribute_types = pd.DataFrame(attribute_names, columns=["attribute_names"])
attribute_types['categorical_indicator'] = categorical_indicator
display(attribute_types)

Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


In [60]:
def get_alldata(dataset_name):
    ''' Get dataset from the OpenML Datasets and tranform it for use.
    
    Params:
    dataset_name: str: name of the dataset
    
    Returns:
    all_data: dataframe: combined data and labels
    attribute_names: list: names of attributes from the data
    attribute_types: dataframe: names and types of attributes from the data 
    '''
    
    data = openml.datasets.get_dataset(dataset_name)
    
    ## summary
    print(f"Dataset '{data.name}', target: '{data.default_target_attribute}'")
    print(data.description[:500])
    
    ## load the data
    # X: An array/dataframe where each row represents one example with the corresponding feature values
    # y: the classes for each example
    # categorical_indicator - an array that indicates which feature is categorical
    # attribute_names - the names of the features for the examples(X) and target feature (y)
    X, y, categorical_indicator, attribute_names = data.get_data(
        dataset_format='dataframe',
        target=data.default_target_attribute
    )
    
    ## combine the attribute names with the information of them being categorical or not
    # will be used further in order not to manually distinguish between the numerical and categorical features
    attribute_types = pd.DataFrame(attribute_names, columns=["attribute_names"])
    attribute_types['categorical_indicator'] = categorical_indicator
    print("\nAttribute types: ")
    display(attribute_types)
    
    all_data = X.copy(deep=True)
    all_data['class'] = y
    
    return all_data, attribute_names, attribute_types

In [66]:
## test credit-g dataset (openML)
all_data, attribute_names, attribute_types = get_alldata("credit-g")
print(attribute_names)
display(attribute_types)
display(all_data.head())

Dataset 'credit-g', target: 'class'
**Author**: Dr. Hans Hofmann  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    

**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)



**German Credit data**  

This dataset classifies people described by a set of attributes as good or bad credit risks.



This dataset comes with a cost matrix: 

``` 

      Good  Bad (predicted)  

Good   0    1   (actual)  

Bad    5    0  

```



It is worse to class a customer as go

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


['checking_status', 'duration', 'credit_history', 'purpose', 'credit_amount', 'savings_status', 'employment', 'installment_commitment', 'personal_status', 'other_parties', 'residence_since', 'property_magnitude', 'age', 'other_payment_plans', 'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone', 'foreign_worker']


Unnamed: 0,attribute_names,categorical_indicator
0,checking_status,True
1,duration,False
2,credit_history,True
3,purpose,True
4,credit_amount,False
5,savings_status,True
6,employment,True
7,installment_commitment,False
8,personal_status,True
9,other_parties,True


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad


In [67]:
## test blood-transfusion-service-center dataset (openML)
all_data_bloodtransfusion, attribute_names_bloodtransfusion, attribute_types_bloodtransfusion = get_alldata("blood-transfusion-service-center")
print(attribute_names_bloodtransfusion)
display(attribute_types_bloodtransfusion)
all_data_bloodtransfusion.head()

Dataset 'blood-transfusion-service-center', target: 'Class'
**Author**: Prof. I-Cheng Yeh  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Blood+Transfusion+Service+Center)  

**Please cite**: Yeh, I-Cheng, Yang, King-Jang, and Ting, Tao-Ming, "Knowledge discovery on RFM model using Bernoulli sequence", Expert Systems with Applications, 2008.   



**Blood Transfusion Service Center Data Set**  

Data taken from the Blood Transfusion Service Center in Hsin-Chu City in Taiwan -- this is a classification problem.



To demonstrate the RFMTC mar

Attribute types: 


Unnamed: 0,attribute_names,categorical_indicator
0,V1,False
1,V2,False
2,V3,False
3,V4,False


['V1', 'V2', 'V3', 'V4']


Unnamed: 0,attribute_names,categorical_indicator
0,V1,False
1,V2,False
2,V3,False
3,V4,False


Unnamed: 0,V1,V2,V3,V4,class
0,2.0,50.0,12500.0,98.0,2
1,0.0,13.0,3250.0,28.0,2
2,1.0,16.0,4000.0,35.0,2
3,2.0,20.0,5000.0,45.0,2
4,1.0,24.0,6000.0,77.0,1


In [40]:
train_split, test_split = train_test_split(all_data, test_size=0.2)

In [49]:
train_data = train_split[attribute_names]
train_labels = np.array(train_split['class'])

In [52]:
test_data = test_split[attribute_names]
test_labels = np.array(test_split['class'])

In [59]:
def get_train_test_data(all_data, attribute_names):
    ''' Get train and test data along with train and test labels.
    
    Params:
    all_data: dataframe: combined data and labels
    attribute_names: list: names of attributes from the data
    
    Returns:
    train_data: dataframe:
    train_labels: list
    test_data: dataframe
    test_labels: list
    '''
    
    train_split, test_split = train_test_split(all_data, test_size=0.2)
    
    train_data = train_split[attribute_names]
    train_labels = np.array(train_split['class'])
    
    test_data = test_split[attribute_names]
    test_labels = np.array(test_split['class'])
    
    return train_data, train_labels, test_data, test_labels

In [69]:
## test credit-g dataset (openML)
dat_train_credit, lab_train_credit, dat_test_credit, lab_test_credit = get_train_test_data(all_data, attribute_names)

display(dat_train_credit.head())
print(lab_train_credit[0:5])

display(dat_test_credit.head())
print(lab_test_credit[0:5])

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
753,no checking,30.0,critical/other existing credit,radio/tv,5771.0,<100,4<=X<7,4.0,female div/dep/mar,none,2.0,car,25.0,none,own,2.0,skilled,1.0,none,yes
945,0<=X<200,48.0,no credits/all paid,new car,8358.0,500<=X<1000,<1,1.0,female div/dep/mar,none,1.0,car,30.0,none,own,2.0,skilled,1.0,none,yes
652,<0,24.0,existing paid,new car,2303.0,<100,>=7,4.0,male single,co applicant,1.0,real estate,45.0,none,own,1.0,skilled,1.0,none,yes
608,no checking,18.0,existing paid,radio/tv,2051.0,<100,<1,4.0,male single,none,1.0,real estate,33.0,none,own,1.0,skilled,1.0,none,yes
880,no checking,24.0,existing paid,used car,7814.0,<100,4<=X<7,3.0,male single,none,3.0,car,38.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes


['good' 'good' 'bad' 'good' 'good']


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
650,<0,48.0,existing paid,education,7476.0,<100,4<=X<7,4.0,male single,none,1.0,no known property,50.0,none,for free,1.0,high qualif/self emp/mgmt,1.0,yes,yes
412,no checking,12.0,critical/other existing credit,business,2292.0,<100,unemployed,4.0,male single,none,2.0,car,42.0,stores,own,2.0,high qualif/self emp/mgmt,1.0,yes,yes
39,0<=X<200,9.0,existing paid,radio/tv,458.0,<100,1<=X<4,4.0,male single,none,3.0,real estate,24.0,none,own,1.0,skilled,1.0,none,yes
595,0<=X<200,6.0,all paid,new car,931.0,100<=X<500,<1,1.0,female div/dep/mar,none,1.0,life insurance,32.0,stores,own,1.0,unskilled resident,1.0,none,yes
840,<0,36.0,existing paid,furniture/equipment,5179.0,<100,4<=X<7,4.0,male single,none,2.0,life insurance,29.0,none,own,1.0,skilled,1.0,none,yes


['good' 'bad' 'good' 'bad' 'bad']


In [70]:
## test blood-transfusion-service-center dataset (openML)
dat_train_bloodtrandfusion, lab_train_bloodtrandfusion, dat_test_bloodtrandfusion, lab_test_bloodtrandfusion = get_train_test_data(all_data_bloodtransfusion, attribute_names_bloodtransfusion)

display(dat_train_bloodtrandfusion.head())
print(lab_train_bloodtrandfusion[0:5])

display(dat_test_bloodtrandfusion.head())
print(lab_test_bloodtrandfusion[0:5])

Unnamed: 0,V1,V2,V3,V4
496,38.0,1.0,250.0,38.0
662,16.0,3.0,750.0,21.0
239,8.0,9.0,2250.0,58.0
524,4.0,9.0,2250.0,28.0
79,2.0,2.0,500.0,4.0


['1' '1' '1' '1' '1']


Unnamed: 0,V1,V2,V3,V4
379,14.0,1.0,250.0,14.0
299,14.0,2.0,500.0,14.0
449,16.0,3.0,750.0,50.0
187,4.0,4.0,1000.0,26.0
287,9.0,2.0,500.0,16.0


['1' '1' '1' '1' '2']


In [82]:
list(attribute_types['attribute_names'][attribute_types['categorical_indicator'] == True])

['checking_status',
 'credit_history',
 'purpose',
 'savings_status',
 'employment',
 'personal_status',
 'other_parties',
 'property_magnitude',
 'other_payment_plans',
 'housing',
 'job',
 'own_telephone',
 'foreign_worker']

In [83]:
list(attribute_types['attribute_names'][attribute_types['categorical_indicator'] == False])

['duration',
 'credit_amount',
 'installment_commitment',
 'residence_since',
 'age',
 'existing_credits',
 'num_dependents']

In [94]:
#############
class Dataset(ABC):
    def __init__(self, seed, dataset_name):
        
        random.seed(seed)
        np.random.seed(seed)
        
        data = openml.datasets.get_dataset(dataset_name)

        ## summary
        print(f"Dataset '{data.name}', target: '{data.default_target_attribute}'")
        print(data.description[:500])

        ## load the data
        # X: An array/dataframe where each row represents one example with the corresponding feature values
        # y: the classes for each example
        # categorical_indicator - an array that indicates which feature is categorical
        # attribute_names - the names of the features for the examples(X) and target feature (y)
        X, y, categorical_indicator, attribute_names = data.get_data(
            dataset_format='dataframe',
            target=data.default_target_attribute
        )

        ## combine the attribute names with the information of them being categorical or not
        # will be used further in order not to manually distinguish between the numerical and categorical features
        attribute_types = pd.DataFrame(attribute_names, columns=["attribute_names"])
        attribute_types['categorical_indicator'] = categorical_indicator
        print("\nAttribute types: ")
        display(attribute_types)

        all_data = X.copy(deep=True)
        all_data['class'] = y
        
        ## Get train and test data along with train and test labels.
        train_split, test_split = train_test_split(all_data, test_size=0.2)

        train_data = train_split[attribute_names]
        train_labels = np.array(train_split['class'])

        test_data = test_split[attribute_names]
        test_labels = np.array(test_split['class'])

### Model

In [None]:
## baseline model -- LATER
class BaselineModel(BinaryClassificationTask):
    
    def __init__(self, seed, train_data, train_labels, test_data, test_labels, attribute_types):
        
        categorical_columns = list(attribute_types['attribute_names'][attribute_types['categorical_indicator'] == True])
        numerical_columms = list(attribute_types['attribute_names'][attribute_types['categorical_indicator'] == False])
        
        BinaryClassificationTask.__init__(self,
                                          seed,
                                          train_data,
                                          train_labels,
                                          test_data,
                                          test_labels,categorical_columns,
                                          numerical_columms
                                         )
    
    def fit_baseline_model(self, train_data, train_labels, learner):

In [86]:
def fit_model(train_data, train_labels, attribute_types, learner, param_grid):
    ''' Get a trained model.
    
    Params:
    train_data: dataframe
    train_labels: list
    attribute_types: dataframe: names and types of attributes from the data 
    learner: estimator object: estimator to be used
    param_grid: dict: param names as keys and lists of param settings to try as values
    
    Returns:
    model
    '''
    
    categorical_columns = list(attribute_types['attribute_names'][attribute_types['categorical_indicator'] == True])
    numerical_columms = list(attribute_types['attribute_names'][attribute_types['categorical_indicator'] == False])
    
    # preprocessing pipeline for numerical columns
    transformer_numeric = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('standard_scale', StandardScaler())
    ])
    
    # preprocessing pipeline for categorical columns
    transformer_categorical = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='__NA__')),
        ('one_hot_encode', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # preprocessor
    feature_transform = ColumnTransformer(transformers=[
        ('categorical_features', transformer_categorical, categorical_columns),
        ('numerical_features', transformer_numeric, numerical_columms)
    ])
    
    ## prediction pipeline: append classifier (learner) to the preprocessing pipeline
    pipeline = Pipeline([
        ('features', feature_transform),
        ('learner', learner)
    ])
    
    grid_search = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=5, verbose=1, n_jobs=-1)
    model = grid_search.fit(train_data, train_labels)
    
    return model    

In [87]:
## test
from sklearn.linear_model import SGDClassifier

model = fit_model(dat_train_credit, lab_train_credit, attribute_types, 
                  learner=SGDClassifier(max_iter=1000), 
                  param_grid={
                      'learner__loss': ['log'],
                      'learner__penalty': ['l2', 'l1', 'elasticnet'],
                      'learner__alpha': [0.0001, 0.001, 0.01, 0.1]
                  })

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   29.8s finished


### Corruptions

In [102]:
missing_values_corruption = MissingValues(column="age", fraction=0.45, missingness="MAR", na_value=999)

dat_test_credit_corrupted = missing_values_corruption.transform(dat_test_credit.copy())

### Cleaning