# EDCA for classification tasks

In [1]:
# import EDCA
from edca.evodata import DataCentricAutoML

# import openml to load datasets
from openml.datasets import get_dataset
# import helper for the train/test hold-out split
from sklearn.model_selection import train_test_split
# import metrics
from sklearn import metrics
# import util functions
import os
from datetime import datetime

# disable warnings
import warnings
warnings.filterwarnings("ignore")

## Define variables

In [2]:
# PATHS
# Root folder for all experiment logs; exist_ok keeps the cell safe to re-run.
LOGS_PATH = 'logs'
os.makedirs(LOGS_PATH, exist_ok=True)
# One sub-folder per run, named after the start time. The timestamp is
# formatted explicitly because the default str(datetime) contains spaces and
# ':' characters, which are not valid in Windows paths. Microseconds are kept
# so two runs started within the same second do not share a folder.
RUN_STAMP = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
EXP_PATH = os.path.join(LOGS_PATH, f'exp_{RUN_STAMP}')
os.makedirs(EXP_PATH, exist_ok=True)

# VARIABLES
TIME_BUDGET = 2 * 60  # 2 minutes, expressed in seconds


## Load dataset

In [3]:
# OpenML id of the dataset to fetch: 31 is credit-g. UPDATE DATASET ID to try another one.
dataset_id = 31
dataset = get_dataset(
    dataset_id,
    download_data=True,
    download_qualities=True,
    download_features_meta_data=True,
)
# get_data is unpacked into four values here; only the first two
# (features and target) are kept, the remaining two are discarded.
target_name = dataset.default_target_attribute
X, y, _, _ = dataset.get_data(target=target_name)

In [4]:
# Preview the first rows of the feature table
X.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4,male single,none,4,real estate,67,none,own,2,skilled,1,yes,yes
1,0<=X<200,48,existing paid,radio/tv,5951.0,<100,1<=X<4,2,female div/dep/mar,none,2,real estate,22,none,own,1,skilled,1,none,yes
2,no checking,12,critical/other existing credit,education,2096.0,<100,4<=X<7,2,male single,none,3,real estate,49,none,own,1,unskilled resident,2,none,yes
3,<0,42,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2,male single,guarantor,4,life insurance,45,none,for free,1,skilled,2,none,yes
4,<0,24,delayed previously,new car,4870.0,<100,1<=X<4,3,male single,none,4,no known property,53,none,for free,2,skilled,2,none,yes


In [5]:
# Summarise column dtypes and non-null counts of the features
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         1000 non-null   category
 1   duration                1000 non-null   uint8   
 2   credit_history          1000 non-null   category
 3   purpose                 1000 non-null   category
 4   credit_amount           1000 non-null   float64 
 5   savings_status          1000 non-null   category
 6   employment              1000 non-null   category
 7   installment_commitment  1000 non-null   uint8   
 8   personal_status         1000 non-null   category
 9   other_parties           1000 non-null   category
 10  residence_since         1000 non-null   uint8   
 11  property_magnitude      1000 non-null   category
 12  age                     1000 non-null   uint8   
 13  other_payment_plans     1000 non-null   category
 14  housing                 1

In [6]:
# Preview the first target labels
y.head()

0    good
1     bad
2    good
3    good
4     bad
Name: class, dtype: category
Categories (2, object): ['good' < 'bad']

In [7]:
# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed so it is reproducible, and stratified on y so both partitions keep the
# same good/bad class proportions.
split_options = dict(test_size=0.2, random_state=42, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Train AutoML

In [8]:
# Build the EDCA AutoML object, scored with ROC AUC. The experiment folder is
# handed over as an absolute path via log_folder_name.
# NOTE(review): TIME_BUDGET is defined in the variables cell but is not passed
# here or in any other visible cell — confirm whether DataCentricAutoML takes a
# time-budget argument and wire it in if so.
log_dir = os.path.abspath(EXP_PATH)
automl = DataCentricAutoML(metric='roc_auc', log_folder_name=log_dir)

In [9]:
# Run EDCA's evolutionary search on the training partition only; the test split stays unseen
automl.fit(X_train, y_train)

2024-06-18 14:56:23,822: INFO     Evolutionary Search


2024-06-18 14:56:23,823: INFO     Create Initial Population
2024-06-18 14:56:23,824: INFO     Evaluate Initial Population


>>> Dataset Analysis
<<< Dataset Analysis


2024-06-18 14:56:25,789: INFO     Start Search for the best pipeline
2024-06-18 14:56:27,020: INFO     Iteration 1 >>> Fitness: 0.616 - Train %: 1.000 - Metric: 0.210 - CPU Time: 0.023 - Sample %: 1.000 - Feature %: 1.000
2024-06-18 14:56:27,493: INFO     Iteration 2 >>> Fitness: 0.616 - Train %: 1.000 - Metric: 0.210 - CPU Time: 0.023 - Sample %: 1.000 - Feature %: 1.000
2024-06-18 14:56:27,598: INFO     Iteration 3 >>> Fitness: 0.615 - Train %: 1.000 - Metric: 0.210 - CPU Time: 0.021 - Sample %: 1.000 - Feature %: 1.000
2024-06-18 14:56:27,714: INFO     Iteration 4 >>> Fitness: 0.615 - Train %: 1.000 - Metric: 0.210 - CPU Time: 0.021 - Sample %: 1.000 - Feature %: 1.000
2024-06-18 14:56:28,014: INFO     Iteration 5 >>> Fitness: 0.615 - Train %: 1.000 - Metric: 0.210 - CPU Time: 0.021 - Sample %: 1.000 - Feature %: 1.000
2024-06-18 14:56:28,112: INFO     Iteration 6 >>> Fitness: 0.615 - Train %: 1.000 - Metric: 0.210 - CPU Time: 0.021 - Sample %: 1.000 - Feature %: 1.000
2024-06-18 14

In [10]:
# Predict labels for the held-out test rows with the fitted AutoML object
y_preds = automl.predict(X_test)

In [11]:
# Per-class precision / recall / F1 on the held-out test split
print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

         bad       0.59      0.32      0.41        60
        good       0.76      0.91      0.82       140

    accuracy                           0.73       200
   macro avg       0.67      0.61      0.62       200
weighted avg       0.71      0.73      0.70       200

