In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer,fetch_lfw_pairs,load_digits,load_iris,load_wine

from art.attacks.evasion import FastGradientMethod,AutoProjectedGradientDescent,ThresholdAttack
from art.attacks.evasion import ZooAttack,HopSkipJump, BoundaryAttack, DecisionTreeAttack
from art.attacks.evasion import HighConfidenceLowUncertainty, ProjectedGradientDescent

from art.estimators.classification import SklearnClassifier


In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier, _tree
from sklearn.utils.validation import check_is_fitted
   
class MonteCarloDecisionTreeClassifier(DecisionTreeClassifier):
    def __init__(self, prob_type='fixed', *args, **kwargs):
        super().__init__(*args, **kwargs)
        if prob_type not in ['fixed', 'depth', 'certainty', 'agreement', 'bayes','confidence','distance']:
            raise ValueError('Invalid prob_type')
        self.prob_type = prob_type
   
    def get_depth_based_probability(self, depth):
        return min(0.05 * depth, 0.2)

    def get_certainty_based_probability(self, node_id):
        node_values = self.tree_.value[node_id].flatten()
        total = np.sum(node_values)
        distribution = node_values / total
        max_certainty = np.max(distribution)
        p = 0.5 - max_certainty
        return max(p, 0)
    
    def get_agreement_based_probability(self, node_id):
        node_values = self.tree_.value[node_id].flatten()
        majority_class_ratio = np.max(node_values) / np.sum(node_values)
        p = 0.5 - majority_class_ratio
        return max(p, 0)
    
    def get_confidence_based_probability(self, X, node_id, sample):
        feature_index = self.tree_.feature[node_id]
        if feature_index == _tree.TREE_UNDEFINED:
            return 0
        
        feature_values = X[:, feature_index]
        avg = np.mean(feature_values)
        std = np.std(feature_values)
        
        distance = abs(sample[feature_index] - avg)
        p = max(0.5 - (distance / (std + 1e-9)), 0)
        return p
    
    def get_distance_based_probability(self, X, node_id, sample):
        feature_index = self.tree_.feature[node_id]
        if feature_index == _tree.TREE_UNDEFINED:
            return 0
        
        threshold = self.tree_.threshold[node_id]
        feature_value = sample[feature_index]
        feature_values = X[:, feature_index]
        distance = abs(feature_value - threshold)
        std = np.std(feature_values)
        
        # The closer the distance is to 0, the lower the probability
        p =  max(0.5 - (distance / (std + 1e-9)), 0) 
        
        return p
    
    def get_bayes_based_probability(self, node_id, sample):
        if self.tree_.children_left[node_id] == _tree.TREE_LEAF or self.tree_.children_right[node_id] == _tree.TREE_LEAF:
            # Can't calculate Bayesian probability at the leaf node.
            return 0

        # Extract the class distribution in the parent node
        parent_values = self.tree_.value[node_id].flatten()
        parent_samples = np.sum(parent_values)

        # Extract the class distributions in the left and right children
        left_child_values = self.tree_.value[self.tree_.children_left[node_id]].flatten()
        right_child_values = self.tree_.value[self.tree_.children_right[node_id]].flatten()

        # Empirical prior probability (based on class distribution in parent)
        prior = np.max(parent_values) / parent_samples

        # Calculate the likelihood of the hypothesis that this node represents
        # a clear decision boundary (i.e., the children nodes have a clear majority class)
        left_majority_ratio = np.max(left_child_values) / np.sum(left_child_values)
        right_majority_ratio = np.max(right_child_values) / np.sum(right_child_values)
        likelihood = left_majority_ratio * right_majority_ratio

        # Estimate the marginal likelihood (average likelihood over all hypotheses)
        marginal_likelihood = np.mean([left_majority_ratio, right_majority_ratio])

        # Bayes' theorem to update the probability of the hypothesis
        posterior = likelihood * (prior / (marginal_likelihood + 1e-9))

        # Scaling the result
        p = 0.5 - posterior
        # Return the scaled posterior
        return max(p, 0)

    
    def traverse_tree(self, node, sample, X, depth=0):
        if self.prob_type == 'fixed':
            p = 0.05
        elif self.prob_type == 'depth':
            p = self.get_depth_based_probability(depth)
        elif self.prob_type == 'certainty':
            p = self.get_certainty_based_probability(node)
        elif self.prob_type == 'agreement':
            p = self.get_agreement_based_probability(node)
        elif self.prob_type == 'confidence':
            p = self.get_confidence_based_probability(X,node, sample)
        elif self.prob_type == 'bayes':
            p = self.get_bayes_based_probability(node, sample)
        elif self.prob_type == 'distance':
             p = self.get_distance_based_probability(X, node, sample)
        else:
            raise ValueError('Invalid prob_type')

        if self.tree_.feature[node] != _tree.TREE_UNDEFINED:
            # internal node
            if sample[self.tree_.feature[node]] <= self.tree_.threshold[node]:
                # go to the left child with high probability
                if np.random.rand() > p:
                    return self.traverse_tree(self.tree_.children_left[node], sample, X, depth + 1)
                else:
                    return self.traverse_tree(self.tree_.children_right[node], sample, X, depth + 1)
            else:
                # go to the right child with high probability
                if np.random.rand() > p:
                    return self.traverse_tree(self.tree_.children_right[node], sample, X, depth + 1)
                else:
                    return self.traverse_tree(self.tree_.children_left[node], sample, X, depth + 1)
        else:
            # leaf
            return self.tree_.value[node]

    def predict_proba(self, X, n_simulations=100):
        check_is_fitted(self)
        X = super()._validate_X_predict(X, check_input=True)

        proba = []
        for x in X:
            simulation_results = [self.traverse_tree(0, x, X).ravel() for _ in range(n_simulations)]
            mean_proba = np.mean(simulation_results, axis=0)
            proba.append(mean_proba)

        return np.array(proba)



In [3]:
import pandas as pd
from sklearn.utils import Bunch
from joblib import Parallel, delayed

# Base path where the files are stored
base_path = 'Experiments/'

# List of files
dataset_list = ['!ar4.csv', '!bodyfat.csv', 'Kaggle_Surgical-deepnet.csv', 'MaternalBinary.csv', 'OPENML_philippine.csv', 'AcousticExtinguisherFire.csv', 'acute-inflammation.csv', 'acute-nephritis.csv', 'AP_Colon_Lung.csv', 'backache.csv', 'blood.csv', 'chess-krvkp.csv', 'cloud.csv', 'congressional-voting.csv', 'credit-approval.csv', 'dresses-salesN.csv', 'echocardiogram.csv', 'haberman-survival.csv', 'heart_failure_clinical_records_dataset.csv', 'heart-hungarian.csv', 'hill-valley.csv', 'horse-colic.csv', 'ilpd-indian-liver.csv', 'no2.csv', 'kaggle_REWEMA.csv', 'lowbwt.csv', 'madelon.csv', 'Mesothelioma.csv', 'MIMIC2.csv', 'molec-biol-promoter.csv', 'oil_spill.csv', 'oocytes_merluccius_nucleus_4d.csv', 'oocytes_trisopterus_nucleus_2f.csv', 'ozone.csv', 'Parkinson_Multiple_Sound_Recording.csv', 'PC1 Software defect prediction.csv', 'pd_speech_features.csv', 'pima.csv', 'Pistachio_28_Features_Dataset.csv', 'plasma_retinol.csv', 'primary-tumorNumeric.csv', 'seismic-bumps.csv', 'sleuth_case2002.csv', 'spambase.csv', 'spect.csv', 'spectf.csv', 'statlog-australian-credit.csv', 'statlog-heart_.csv', 'ThoraricSurgery.csv', 'triazines.csv' ]
bigger_datasets_list = ['Kaggle_Surgical-deepnet.csv', 'AcousticExtinguisherFire.csv', 'chess-krvkp.csv', 'kaggle_REWEMA.csv',
                        'madelon.csv', 'OPENML_philippine.csv', 'ozone.csv', 'Pistachio_28_Features_Dataset.csv', 'seismic-bumps.csv',
                        'spambase.csv']
more_bigger_datasets_list = ['kaggle_fraud_detection_bank_dataset.csv','mushroom.csv','musk.csv','bank.csv','sick_numeric2.csv',]
dataset_list = dataset_list #+ bigger_datasets_list #+ more_bigger_datasets_list

# Function to load dataset
def load_dataset(file_name, path):
    try:
        data = pd.read_csv(path + file_name)
        # Use all columns except the last one as features
        X = data.iloc[:, :-1]
        # Use the last column as the target class
        y = data.iloc[:, -1]
        return (file_name, Bunch(data=X, target=y))
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        return None

# Parallel loading of datasets (using all available cores with n_jobs=-1)
datasets = Parallel(n_jobs=-1)(delayed(load_dataset)(file_name, base_path) for file_name in dataset_list)

# Filter out None values in case of loading errors
datasets = [dataset for dataset in datasets if dataset is not None]

# Now 'datasets' is a list of tuples, where each tuple contains file_name and the corresponding dataset as a Bunch object.

In [4]:
from joblib import Parallel, delayed
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from art.defences.preprocessor import FeatureSqueezing,GaussianAugmentation
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score

# Assuming that MonteCarloRandomForestClassifier is imported or defined elsewhere

# 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List to store evaluation results
results = []

# Define classifiers
classifiers = {
    'DecisionTree_DecisionTreeAttack': DecisionTreeClassifier(random_state=123),
    'MonteCarloDecisionTree_Fix_Prob_DecisionTreeAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='fixed'),
    'MonteCarloDecisionTree_Depth_Prob_DecisionTreeAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='depth'),
    'MonteCarloDecisionTree_Agreement_Prob_DecisionTreeAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='agreement'),
    'MonteCarloDecisionTree_Bayes_Prob_DecisionTreeAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='bayes'),
    'MonteCarloDecisionTree_Confidence_Prob_DecisionTreeAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='confidence'),
    'MonteCarloDecisionTree_Distance_Prob_DecisionTreeAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='distance'),
    'FeatureSqueezing_DecisionTreeAttack': DecisionTreeClassifier(random_state=123),
 #   'GaussianAugmentation_DecisionTreeAttack': DecisionTreeClassifier(random_state=123),
}

def evaluate_classifier(dataset_name, dataset, clf_name, clf, train_index, test_index):
    X, y = dataset.data.fillna(0).values, dataset.target
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
   
    # Train classifier
    clf.fit(X_train, y_train)

    if clf_name != 'DecisionTree_DecisionTreeAttack':
        dummy_clf = DecisionTreeClassifier()
        dummy_clf.fit(X_train, y_train)
        classifier = SklearnClassifier(model=dummy_clf, use_logits=True)
    else:
        classifier = SklearnClassifier(model=clf, use_logits=True)
        
    attack = DecisionTreeAttack(classifier=classifier)
    x_test_adv = attack.generate(x=X_test)
    
    if clf_name == 'FeatureSqueezing_DecisionTreeAttack':
        # Initialize the feature squeezing defence
        defence = FeatureSqueezing(clip_values=(X_train.min(), X_train.max()), bit_depth=4)

        # Fit the defence with training data
        defence.fit(X_train)

        # Apply the defence on testing data
        x_test_adv = defence(x_test_adv)[0]
        
    if clf_name == 'GaussianAugmentation_DecisionTreeAttack':
        # Initialize the Gaussian Augmentation defence
        defence = GaussianAugmentation(sigma=1.0)

        # Apply the Gaussian Augmentation on training data
        X_train_augmented, y_train_augmented = defence(X_train, y_train)

        # Retrain the model on the augmented data
        clf.fit(X_train_augmented, y_train_augmented)
    
    start_time = time.time()
    pred_probs = clf.predict_proba(x_test_adv)
    preds = clf.predict(x_test_adv)
    runtime = time.time() - start_time

    # Evaluate
    auc = roc_auc_score(y_test, pred_probs[:, 1], multi_class='ovr')
    f1 = f1_score(y_test, preds, average='macro')
    logloss = log_loss(y_test, pred_probs)
    accuracy = accuracy_score(y_test, preds)

    return [dataset_name, clf_name, auc, f1, logloss, accuracy, runtime]

# Loop through datasets
all_tasks = []

for dataset_name, dataset in datasets:
    print(dataset_name)
    X, y = dataset.data.fillna(0).values, dataset.target
    
    # Perform 5-fold cross-validation
    for clf_name, clf in classifiers.items():
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            task = delayed(evaluate_classifier)(dataset_name, dataset, clf_name, clf, train_index, test_index)
            all_tasks.append(task)

# Execute all tasks in parallel
results = Parallel(n_jobs=-1)(all_tasks)

# Convert results to DataFrame for easy visualization
results_df_dtattack = pd.DataFrame(results, columns=['Dataset', 'Classifier', 'AUC', 'F1', 'LogLoss', 'Accuracy', 'Runtime'])


!ar4.csv
!bodyfat.csv
Kaggle_Surgical-deepnet.csv
MaternalBinary.csv
OPENML_philippine.csv
AcousticExtinguisherFire.csv
acute-inflammation.csv
acute-nephritis.csv
AP_Colon_Lung.csv
backache.csv
blood.csv
chess-krvkp.csv
cloud.csv
congressional-voting.csv
credit-approval.csv
dresses-salesN.csv
echocardiogram.csv
haberman-survival.csv
heart_failure_clinical_records_dataset.csv
heart-hungarian.csv
hill-valley.csv
horse-colic.csv
ilpd-indian-liver.csv
no2.csv
kaggle_REWEMA.csv
lowbwt.csv
madelon.csv
Mesothelioma.csv
MIMIC2.csv
molec-biol-promoter.csv
oil_spill.csv
oocytes_merluccius_nucleus_4d.csv
oocytes_trisopterus_nucleus_2f.csv
ozone.csv
Parkinson_Multiple_Sound_Recording.csv
PC1 Software defect prediction.csv
pd_speech_features.csv
pima.csv
Pistachio_28_Features_Dataset.csv
plasma_retinol.csv
primary-tumorNumeric.csv
seismic-bumps.csv
sleuth_case2002.csv
spambase.csv
spect.csv
spectf.csv
statlog-australian-credit.csv
statlog-heart_.csv
ThoraricSurgery.csv
triazines.csv


Decision tree attack: 100%|██████████| 21/21 [00:00<00:00, 1868.37it/s]
Decision tree attack:   0%|          | 0/51 [00:00<?, ?it/s]
Decision tree attack:   0%|          | 0/50 [00:00<?, ?it/s]
Decision tree attack: 100%|██████████| 50/50 [00:00<00:00, 2484.25it/s]
Decision tree attack:   0%|          | 0/50 [00:00<?, ?it/s]
Decision tree attack: 100%|██████████| 50/50 [00:00<00:00, 2566.30it/s]
Decision tree attack: 100%|██████████| 51/51 [00:00<00:00, 2554.05it/s]
Decision tree attack: 100%|██████████| 50/50 [00:00<00:00, 2563.35it/s]
Decision tree attack:   0%|          | 0/50 [00:00<?, ?it/s]
Decision tree attack: 100%|██████████| 22/22 [00:00<00:00, 1808.56it/s]
Decision tree attack: 100%|██████████| 22/22 [00:00<00:00, 1273.60it/s]
Decision tree attack: 100%|██████████| 51/51 [00:00<00:00, 2536.64it/s]

Decision tree attack: 100%|██████████| 50/50 [00:00<00:00, 1671.09it/s]
Decision tree attack: 100%|██████████| 50/50 [00:00<00:00, 2133.03it/s]
Decision tree attack: 100%|████████

In [5]:
summary_dtattack = results_df_dtattack.groupby(['Dataset','Classifier']).agg({
    'AUC': ['mean', 'std'],
    'F1': ['mean', 'std'],
    'LogLoss': ['mean', 'std'],
    'Accuracy': ['mean', 'std'],
    'Runtime': ['mean', 'std']
}).reset_index()

print(summary_dtattack)

           Dataset                                         Classifier  \
                                                                        
0         !ar4.csv                    DecisionTree_DecisionTreeAttack   
1         !ar4.csv                FeatureSqueezing_DecisionTreeAttack   
2         !ar4.csv  MonteCarloDecisionTree_Agreement_Prob_Decision...   
3         !ar4.csv  MonteCarloDecisionTree_Bayes_Prob_DecisionTree...   
4         !ar4.csv  MonteCarloDecisionTree_Confidence_Prob_Decisio...   
..             ...                                                ...   
395  triazines.csv  MonteCarloDecisionTree_Bayes_Prob_DecisionTree...   
396  triazines.csv  MonteCarloDecisionTree_Confidence_Prob_Decisio...   
397  triazines.csv  MonteCarloDecisionTree_Depth_Prob_DecisionTree...   
398  triazines.csv  MonteCarloDecisionTree_Distance_Prob_DecisionT...   
399  triazines.csv  MonteCarloDecisionTree_Fix_Prob_DecisionTreeAt...   

          AUC                  F1              Log

In [6]:
summary_dtattack[0:30]

Unnamed: 0_level_0,Dataset,Classifier,AUC,AUC,F1,F1,LogLoss,LogLoss,Accuracy,Accuracy,Runtime,Runtime
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
0,!ar4.csv,DecisionTree_DecisionTreeAttack,0.324183,0.12147,0.211553,0.092686,27.152562,3.246394,0.213853,0.093993,0.000287,9e-06
1,!ar4.csv,FeatureSqueezing_DecisionTreeAttack,0.5,0.0,0.448421,0.001441,6.4592,0.163789,0.812987,0.004742,0.000463,7.2e-05
2,!ar4.csv,MonteCarloDecisionTree_Agreement_Prob_Decision...,0.597304,0.172654,0.468683,0.156034,16.207807,6.799066,0.530736,0.196853,0.293002,0.025708
3,!ar4.csv,MonteCarloDecisionTree_Bayes_Prob_DecisionTree...,0.518873,0.197582,0.38999,0.169783,14.152641,6.58421,0.494372,0.254283,0.916625,0.202956
4,!ar4.csv,MonteCarloDecisionTree_Confidence_Prob_Decisio...,0.506863,0.130408,0.332449,0.187295,4.464011,3.518065,0.445887,0.30303,0.829934,0.132213
5,!ar4.csv,MonteCarloDecisionTree_Depth_Prob_DecisionTree...,0.576552,0.262245,0.344528,0.195268,0.706893,0.137274,0.404762,0.237218,0.078829,0.016968
6,!ar4.csv,MonteCarloDecisionTree_Distance_Prob_DecisionT...,0.722467,0.096553,0.511925,0.170043,1.359593,0.852534,0.606494,0.225194,0.585566,0.058355
7,!ar4.csv,MonteCarloDecisionTree_Fix_Prob_DecisionTreeAt...,0.418219,0.252394,0.308651,0.206723,0.69081,0.251131,0.451082,0.340225,0.068459,0.012471
8,!bodyfat.csv,DecisionTree_DecisionTreeAttack,0.004,0.008944,0.003846,0.0086,34.40333,0.302867,0.003922,0.008769,0.000345,2.1e-05
9,!bodyfat.csv,FeatureSqueezing_DecisionTreeAttack,0.5,0.0,0.336819,0.003663,16.995787,0.288185,0.507922,0.008344,0.000257,1.7e-05


In [7]:
from joblib import Parallel, delayed
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from art.defences.preprocessor import FeatureSqueezing,GaussianAugmentation
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score

# Assuming that MonteCarloRandomForestClassifier is imported or defined elsewhere

# 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List to store evaluation results
results = []

# Define classifiers
classifiers = {
    'DecisionTree_ZooAttack': DecisionTreeClassifier(random_state=123),
    'MonteCarloDecisionTree_Fix_Prob_ZooAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='fixed'),
    'MonteCarloDecisionTree_Depth_Prob_ZooAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='depth'),
    'MonteCarloDecisionTree_Certainty_Prob_ZooAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='certainty'),
    'MonteCarloDecisionTree_Bayes_Prob_ZooAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='bayes'),
    'MonteCarloDecisionTree_Confidence_Prob_ZooAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='confidence'),
    'MonteCarloDecisionTree_Distance_Prob_ZooAttack': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='distance'),
    'FeatureSqueezing_ZooAttack': DecisionTreeClassifier(random_state=123),
 #   'GaussianAugmentation_ZooAttack': DecisionTreeClassifier(random_state=123),
}

def evaluate_classifier(dataset_name, dataset, clf_name, clf, train_index, test_index):
    X, y = dataset.data.fillna(0).values, dataset.target
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train classifier
    clf.fit(X_train, y_train)

    if clf_name != 'DecisionTree_ZooAttack':
        dummy_clf = DecisionTreeClassifier(random_state=123)
        dummy_clf.fit(X_train, y_train)
        classifier = SklearnClassifier(model=dummy_clf, use_logits=True)
    else:
        classifier = SklearnClassifier(model=clf, use_logits=True)
        
    attack = ZooAttack(classifier=classifier, confidence=0.0, targeted=False, learning_rate=1e-1, max_iter=5,
           binary_search_steps=5, initial_const=1e-3, abort_early=True, use_resize=False, 
           use_importance=False, nb_parallel=2, batch_size=1, variable_h=0.2) 
    x_test_adv = attack.generate(x=X_test)
    
    if clf_name == 'FeatureSqueezing_ZooAttack':
        # Initialize the feature squeezing defence
        defence = FeatureSqueezing(clip_values=(X_train.min(), X_train.max()), bit_depth=4)

        # Fit the defence with training data
        defence.fit(X_train)

        # Apply the defence on testing data
        x_test_adv = defence(x_test_adv)[0]
        
    if clf_name == 'GaussianAugmentation_ZooAttack':
        # Initialize the Gaussian Augmentation defence
        defence = GaussianAugmentation(sigma=1.0)

        # Apply the Gaussian Augmentation on training data
        X_train_augmented, y_train_augmented = defence(X_train, y_train)

        # Retrain the model on the augmented data
        clf.fit(X_train_augmented, y_train_augmented)
    
    start_time = time.time()
    pred_probs = clf.predict_proba(x_test_adv)
    preds = clf.predict(x_test_adv)
    runtime = time.time() - start_time

    # Evaluate
    auc = roc_auc_score(y_test, pred_probs[:, 1], multi_class='ovr')
    f1 = f1_score(y_test, preds, average='macro')
    logloss = log_loss(y_test, pred_probs)
    accuracy = accuracy_score(y_test, preds)

    return [dataset_name, clf_name, auc, f1, logloss, accuracy, runtime]

# Loop through datasets
all_tasks = []

for dataset_name, dataset in datasets:
    print(dataset_name)
    X, y = dataset.data.fillna(0).values, dataset.target
    try:
        # Perform 5-fold cross-validation
        for clf_name, clf in classifiers.items():
            for i, (train_index, test_index) in enumerate(skf.split(X, y)):
                task = delayed(evaluate_classifier)(dataset_name, dataset, clf_name, clf, train_index, test_index)
                all_tasks.append(task)
    except Exception as e:
        print(f"Error in {clf_name} on {dataset_name}: {e}")

# Execute all tasks in parallel
try:
    results = Parallel(n_jobs=-1)(all_tasks)
except Exception as e:
    print(f"Error in {clf_name} on {dataset_name}: {e}")

# Convert results to DataFrame for easy visualization
results_df_ZooAttack = pd.DataFrame(results, columns=['Dataset', 'Classifier', 'AUC', 'F1', 'LogLoss', 'Accuracy', 'Runtime'])


summary_ZooAttack = results_df_ZooAttack.groupby(['Dataset','Classifier']).agg({
    'AUC': ['mean', 'std'],
    'F1': ['mean', 'std'],
    'LogLoss': ['mean', 'std'],
    'Accuracy': ['mean', 'std'],
    'Runtime': ['mean', 'std']
}).reset_index()

print(summary_ZooAttack)

!ar4.csv
!bodyfat.csv
Kaggle_Surgical-deepnet.csv
MaternalBinary.csv
OPENML_philippine.csv
AcousticExtinguisherFire.csv
acute-inflammation.csv
acute-nephritis.csv
AP_Colon_Lung.csv
backache.csv
blood.csv
chess-krvkp.csv
cloud.csv
congressional-voting.csv
credit-approval.csv
dresses-salesN.csv
echocardiogram.csv
haberman-survival.csv
heart_failure_clinical_records_dataset.csv
heart-hungarian.csv
hill-valley.csv
horse-colic.csv
ilpd-indian-liver.csv
no2.csv
kaggle_REWEMA.csv
lowbwt.csv
madelon.csv
Mesothelioma.csv
MIMIC2.csv
molec-biol-promoter.csv
oil_spill.csv
oocytes_merluccius_nucleus_4d.csv
oocytes_trisopterus_nucleus_2f.csv
ozone.csv
Parkinson_Multiple_Sound_Recording.csv
PC1 Software defect prediction.csv
pd_speech_features.csv
pima.csv
Pistachio_28_Features_Dataset.csv
plasma_retinol.csv
primary-tumorNumeric.csv
seismic-bumps.csv
sleuth_case2002.csv
spambase.csv
spect.csv
spectf.csv
statlog-australian-credit.csv
statlog-heart_.csv
ThoraricSurgery.csv
triazines.csv



ZOO:   0%|          | 0/22 [00:00<?, ?it/s]

ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:   0%|          | 0/22 [00:00<?, ?it/s]
ZOO:   0%|          | 0/22 [00:00<?, ?it/s]
ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:   0%|          | 0/22 [00:00<?, ?it/s]
ZOO:   0%|          | 0/22 [00:00<?, ?it/s]
ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:  18%|█▊        | 4/22 [00:00<00:00, 36.44it/s]
ZOO:   0%|          | 0/21 [00:00<?, ?it/s]
ZOO:  24%|██▍       | 5/21 [00:00<00:00, 40.65it/s]
ZOO:  23%|██▎       | 5/22 [00:00<00:00, 40.77it/s]
ZOO:  23%|██▎       | 5/22 [00:00<00:00, 40.51it/s]
ZOO:  19%|█▉        | 4/21 [00:00<00:00, 37.91it/s]
ZOO:  19%|█▉        | 4/21 [00:00<00:00, 38.41it/s]
ZOO:   0%|          | 0/22 [00:00<?, ?it/s]
ZOO:  18%|█▊        | 4/22 [00:00<00:00, 37.36it/s]
ZOO:  19%|█▉      

           Dataset                                        Classifier  \
                                                                       
0         !ar4.csv                            DecisionTree_ZooAttack   
1         !ar4.csv                        FeatureSqueezing_ZooAttack   
2         !ar4.csv       MonteCarloDecisionTree_Bayes_Prob_ZooAttack   
3         !ar4.csv   MonteCarloDecisionTree_Certainty_Prob_ZooAttack   
4         !ar4.csv  MonteCarloDecisionTree_Confidence_Prob_ZooAttack   
..             ...                                               ...   
395  triazines.csv   MonteCarloDecisionTree_Certainty_Prob_ZooAttack   
396  triazines.csv  MonteCarloDecisionTree_Confidence_Prob_ZooAttack   
397  triazines.csv       MonteCarloDecisionTree_Depth_Prob_ZooAttack   
398  triazines.csv    MonteCarloDecisionTree_Distance_Prob_ZooAttack   
399  triazines.csv         MonteCarloDecisionTree_Fix_Prob_ZooAttack   

          AUC                  F1              LogLoss         

In [15]:
from joblib import Parallel, delayed
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from art.defences.preprocessor import FeatureSqueezing,GaussianAugmentation
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score

# Assuming that MonteCarloRandomForestClassifier is imported or defined elsewhere

# 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List to store evaluation results
results = []

# Define classifiers
classifiers = {
    'DecisionTree': DecisionTreeClassifier(random_state=123),
    'MonteCarloDecisionTree_Fix_Prob': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='fixed'),
    'MonteCarloDecisionTree_Depth_Prob': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='depth'),
    'MonteCarloDecisionTree_Certainty_Prob': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='certainty'),
    'MonteCarloDecisionTree_Bayes_Prob': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='bayes'),
    'MonteCarloDecisionTree_Confidence': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='confidence'),
    'MonteCarloDecisionTree_Distance_Prob': MonteCarloDecisionTreeClassifier(random_state=123,prob_type='distance'),
    'FeatureSqueezing': DecisionTreeClassifier(random_state=123),
  #  'GaussianAugmentation': DecisionTreeClassifier(random_state=123),
}

def evaluate_classifier(dataset_name, dataset, clf_name, clf, train_index, test_index):
    X, y = dataset.data.fillna(0).values, dataset.target
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train classifier
    clf.fit(X_train, y_train)

    if clf_name != 'DecisionTree':
        dummy_clf = DecisionTreeClassifier(random_state=123)
        dummy_clf.fit(X_train, y_train)
        classifier = SklearnClassifier(model=dummy_clf, use_logits=True)
    else:
        classifier = SklearnClassifier(model=clf, use_logits=True)
        
    x_test_adv = X_test
    
    if clf_name == 'FeatureSqueezing':
        # Initialize the feature squeezing defence
        defence = FeatureSqueezing(clip_values=(X_train.min(), X_train.max()), bit_depth=4)

        # Fit the defence with training data
        defence.fit(X_train)

        # Apply the defence on testing data
        x_test_adv = defence(x_test_adv)[0]
        
    if clf_name == 'GaussianAugmentation':
        # Initialize the Gaussian Augmentation defence
        defence = GaussianAugmentation(sigma=1.0)

        # Apply the Gaussian Augmentation on training data
        X_train_augmented, y_train_augmented = defence(X_train, y_train.to_numpy())

        # Retrain the model on the augmented data
        clf.fit(X_train_augmented, y_train_augmented.to_numpy())
    
    start_time = time.time()
    pred_probs = clf.predict_proba(x_test_adv)
    preds = clf.predict(x_test_adv)
    runtime = time.time() - start_time

    # Evaluate
    auc = roc_auc_score(y_test, pred_probs[:, 1], multi_class='ovr')
    f1 = f1_score(y_test, preds, average='macro')
    logloss = log_loss(y_test, pred_probs)
    accuracy = accuracy_score(y_test, preds)

    return [dataset_name, clf_name, auc, f1, logloss, accuracy, runtime]

# Loop through datasets
all_tasks = []

for dataset_name, dataset in datasets:
    print(dataset_name)
    X, y = dataset.data.fillna(0).values, dataset.target
    
    # Perform 5-fold cross-validation
    for clf_name, clf in classifiers.items():
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            task = delayed(evaluate_classifier)(dataset_name, dataset, clf_name, clf, train_index, test_index)
            all_tasks.append(task)

# Execute all tasks in parallel
results = Parallel(n_jobs=-1)(all_tasks)

# Convert results to DataFrame for easy visualization
results_df_wo = pd.DataFrame(results, columns=['Dataset', 'Classifier', 'AUC', 'F1', 'LogLoss', 'Accuracy', 'Runtime'])


!ar4.csv
!bodyfat.csv
Kaggle_Surgical-deepnet.csv
MaternalBinary.csv
OPENML_philippine.csv
AcousticExtinguisherFire.csv
acute-inflammation.csv
acute-nephritis.csv
AP_Colon_Lung.csv
backache.csv
blood.csv
chess-krvkp.csv
cloud.csv
congressional-voting.csv
credit-approval.csv
dresses-salesN.csv
echocardiogram.csv
haberman-survival.csv
heart_failure_clinical_records_dataset.csv
heart-hungarian.csv
hill-valley.csv
horse-colic.csv
ilpd-indian-liver.csv
no2.csv
kaggle_REWEMA.csv
lowbwt.csv
madelon.csv
Mesothelioma.csv
MIMIC2.csv
molec-biol-promoter.csv
oil_spill.csv
oocytes_merluccius_nucleus_4d.csv
oocytes_trisopterus_nucleus_2f.csv
ozone.csv
Parkinson_Multiple_Sound_Recording.csv
PC1 Software defect prediction.csv
pd_speech_features.csv
pima.csv
Pistachio_28_Features_Dataset.csv
plasma_retinol.csv
primary-tumorNumeric.csv
seismic-bumps.csv
sleuth_case2002.csv
spambase.csv
spect.csv
spectf.csv
statlog-australian-credit.csv
statlog-heart_.csv
ThoraricSurgery.csv
triazines.csv




In [16]:
summary_wo = results_df_wo.groupby(['Dataset','Classifier']).agg({
    'AUC': ['mean', 'std'],
    'F1': ['mean', 'std'],
    'LogLoss': ['mean', 'std'],
    'Accuracy': ['mean', 'std'],
    'Runtime': ['mean', 'std']
}).reset_index()

print(summary_wo)

           Dataset                             Classifier       AUC            \
                                                               mean       std   
0         !ar4.csv                           DecisionTree  0.675817  0.121470   
1         !ar4.csv                       FeatureSqueezing  0.500000  0.000000   
2         !ar4.csv      MonteCarloDecisionTree_Bayes_Prob  0.755229  0.176973   
3         !ar4.csv  MonteCarloDecisionTree_Certainty_Prob  0.687337  0.121569   
4         !ar4.csv      MonteCarloDecisionTree_Confidence  0.662173  0.253817   
..             ...                                    ...       ...       ...   
395  triazines.csv  MonteCarloDecisionTree_Certainty_Prob  0.756886  0.125052   
396  triazines.csv      MonteCarloDecisionTree_Confidence  0.739662  0.075722   
397  triazines.csv      MonteCarloDecisionTree_Depth_Prob  0.771861  0.093484   
398  triazines.csv   MonteCarloDecisionTree_Distance_Prob  0.775955  0.133746   
399  triazines.csv        Mo

In [17]:
#tmp = pd.read_csv('50ds_summaryV5.csv')

In [18]:
#tmp.append(summary_ZooAttack).sort_values(by='Dataset')

In [19]:
#tmp.append(summary_ZooAttack).sort_values(by='Dataset').to_csv('50ds_summaryV5_2BL.csv',index=False)

In [20]:
#tmp.shape

In [21]:
summary_wo.append(summary_dtattack).append(summary_ZooAttack).sort_values(by='Dataset').to_csv('50ds_summaryV5.csv',index=False)

