In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer,fetch_lfw_pairs,load_digits,load_iris,load_wine

from art.attacks.evasion import FastGradientMethod,AutoProjectedGradientDescent,ThresholdAttack
from art.attacks.evasion import ZooAttack,HopSkipJump, BoundaryAttack, DecisionTreeAttack
from art.attacks.evasion import HighConfidenceLowUncertainty, ProjectedGradientDescent

from art.estimators.classification import SklearnClassifier


In [2]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import _tree
from sklearn.utils.validation import check_is_fitted


class MonteCarloRandomForestClassifier(RandomForestClassifier):
    def __init__(self, prob_type='fixed', *args, **kwargs):
        super().__init__(*args, **kwargs)
        if prob_type not in ['fixed', 'depth', 'certainty', 'agreement', 'bayes', 'confidence','distance']:
            raise ValueError('Invalid prob_type')
        self.prob_type = prob_type

    def get_depth_based_probability(self, depth):
        return min(0.05 * depth, 0.2)

    def get_certainty_based_probability(self, node_id, tree):
        node_values = tree.value[node_id].flatten()
        total = np.sum(node_values)
        distribution = node_values / total
        max_certainty = np.max(distribution)
        p = 0.5 - max_certainty
        return max(p, 0)

    def get_agreement_based_probability(self, node_id, tree):
        node_values = tree.value[node_id].flatten()
        majority_class_ratio = np.max(node_values) / np.sum(node_values)
        p = 0.5 - majority_class_ratio
        return max(p, 0)

    def get_confidence_based_probability(self, X, node_id, sample, tree):
        feature_index = tree.feature[node_id]
        if feature_index == _tree.TREE_UNDEFINED:
            return 0

        feature_values = X[:, feature_index]
        avg = np.mean(feature_values)
        std = np.std(feature_values)

        distance = abs(sample[feature_index] - avg)
        p = max(0.5 - (distance / (std + 1e-9)), 0)
        return p

    def get_bayes_based_probability(self, node_id, sample, tree):
        if tree.children_left[node_id] == _tree.TREE_LEAF or tree.children_right[node_id] == _tree.TREE_LEAF:
            # Can't calculate Bayesian probability at the leaf node.
            return 0

        parent_values = tree.value[node_id].flatten()
        parent_samples = np.sum(parent_values)
        
        left_child_values = tree.value[tree.children_left[node_id]].flatten()
        right_child_values = tree.value[tree.children_right[node_id]].flatten()
        
        prior = np.max(parent_values) / parent_samples
        
        left_majority_ratio = np.max(left_child_values) / np.sum(left_child_values)
        right_majority_ratio = np.max(right_child_values) / np.sum(right_child_values)
        likelihood = left_majority_ratio * right_majority_ratio
        
        marginal_likelihood = np.mean([left_majority_ratio, right_majority_ratio])
        
        posterior = likelihood * (prior / (marginal_likelihood + 1e-9))
        
        p = 0.5 - posterior
        # Return the scaled posterior
        return max(p, 0)

    def get_distance_based_probability(self, X,tree, node_id, sample):
        feature_index = tree.feature[node_id]
        if feature_index == _tree.TREE_UNDEFINED:
            return 0
        
        threshold = tree.threshold[node_id]
        feature_value = sample[feature_index]
        feature_values = X[:, feature_index]
        distance = abs(feature_value - threshold)
        std = np.std(feature_values)
        
        # The closer the distance is to 0, the lower the probability
        p =  max(0.5 - (distance / (std + 1e-9)), 0) 
        
        return p
    
    def traverse_tree(self, tree, node, sample, X, depth=0):
        if self.prob_type == 'fixed':
            p = 0.05
        elif self.prob_type == 'depth':
            p = self.get_depth_based_probability(depth)
        elif self.prob_type == 'certainty':
            p = self.get_certainty_based_probability(node, tree)
        elif self.prob_type == 'agreement':
            p = self.get_agreement_based_probability(node, tree)
        elif self.prob_type == 'confidence':
            p = self.get_confidence_based_probability(X, node, sample, tree)
        elif self.prob_type == 'bayes':
            p = self.get_bayes_based_probability(node, sample, tree)
        elif self.prob_type == 'distance':
            p = self.get_distance_based_probability(X,tree, node, sample)
        else:
            raise ValueError('Invalid prob_type')

        if tree.feature[node] != _tree.TREE_UNDEFINED:
            if sample[tree.feature[node]] <= tree.threshold[node]:
                if np.random.rand() > p:
                    return self.traverse_tree(tree, tree.children_left[node], sample, X, depth + 1)
                else:
                    return self.traverse_tree(tree, tree.children_right[node], sample, X, depth + 1)
            else:
                if np.random.rand() > p:
                    return self.traverse_tree(tree, tree.children_right[node], sample, X, depth + 1)
                else:
                    return self.traverse_tree(tree, tree.children_left[node], sample, X, depth + 1)
        else:
            return tree.value[node]

    def predict_proba(self, X, n_simulations=100):
        check_is_fitted(self)
        X = self._validate_X_predict(X)

        proba = []
        for x in X:
            simulation_results = []
            for tree in self.estimators_:
                tree_results = [self.traverse_tree(tree.tree_, 0, x, X).ravel() for _ in range(n_simulations)]
                simulation_results.extend(tree_results)
            mean_proba = np.mean(simulation_results, axis=0)
            proba.append(mean_proba)

        return np.array(proba)




In [3]:
import pandas as pd
from sklearn.utils import Bunch
from joblib import Parallel, delayed

# Base path where the files are stored
base_path = 'Experiments/'

# List of files
dataset_list = ['!ar4.csv', '!bodyfat.csv', 'Kaggle_Surgical-deepnet.csv', 'MaternalBinary.csv', 'OPENML_philippine.csv', 'AcousticExtinguisherFire.csv', 'acute-inflammation.csv', 'acute-nephritis.csv', 'AP_Colon_Lung.csv', 'backache.csv', 'blood.csv', 'chess-krvkp.csv', 'cloud.csv', 'congressional-voting.csv', 'credit-approval.csv', 'dresses-salesN.csv', 'echocardiogram.csv', 'haberman-survival.csv', 'heart_failure_clinical_records_dataset.csv', 'heart-hungarian.csv', 'hill-valley.csv', 'horse-colic.csv', 'ilpd-indian-liver.csv', 'no2.csv', 'kaggle_REWEMA.csv', 'lowbwt.csv', 'madelon.csv', 'Mesothelioma.csv', 'MIMIC2.csv', 'molec-biol-promoter.csv', 'oil_spill.csv', 'oocytes_merluccius_nucleus_4d.csv', 'oocytes_trisopterus_nucleus_2f.csv', 'ozone.csv', 'Parkinson_Multiple_Sound_Recording.csv', 'PC1 Software defect prediction.csv', 'pd_speech_features.csv', 'pima.csv', 'Pistachio_28_Features_Dataset.csv', 'plasma_retinol.csv', 'primary-tumorNumeric.csv', 'seismic-bumps.csv', 'sleuth_case2002.csv', 'spambase.csv', 'spect.csv', 'spectf.csv', 'statlog-australian-credit.csv', 'statlog-heart_.csv', 'ThoraricSurgery.csv', 'triazines.csv' ]

# Function to load dataset
def load_dataset(file_name, path):
    try:
        data = pd.read_csv(path + file_name)
        # Use all columns except the last one as features
        X = data.iloc[:, :-1]
        # Use the last column as the target class
        y = data.iloc[:, -1]
        return (file_name, Bunch(data=X, target=y))
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        return None

# Parallel loading of datasets (using all available cores with n_jobs=-1)
datasets = Parallel(n_jobs=-1)(delayed(load_dataset)(file_name, base_path) for file_name in dataset_list)

# Filter out None values in case of loading errors
datasets = [dataset for dataset in datasets if dataset is not None]

# Now 'datasets' is a list of tuples, where each tuple contains file_name and the corresponding dataset as a Bunch object.

In [6]:
from joblib import Parallel, delayed
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from art.defences.preprocessor import FeatureSqueezing,GaussianAugmentation
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score

# Assuming that MonteCarloRandomForestClassifier is imported or defined elsewhere

# 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List to store evaluation results
results = []

# Define classifiers
classifiers = {
    'RandomForest':  RandomForestClassifier(random_state=123,n_estimators=10),
    'MonteCarloRandomForest_Fix_Prob':  MonteCarloRandomForestClassifier(random_state=123,prob_type='fixed',n_estimators=10),
    'MonteCarloRandomForest_Depth_Prob':  MonteCarloRandomForestClassifier(random_state=123,prob_type='depth',n_estimators=10),
    'MonteCarloRandomForest_Agreement_Prob':  MonteCarloRandomForestClassifier(random_state=123,prob_type='agreement',n_estimators=10),
    'MonteCarloRandomForest_Bayes_Prob':  MonteCarloRandomForestClassifier(random_state=123,prob_type='bayes',n_estimators=10),
    'MonteCarloRandomForest_Confidence_Prob':  MonteCarloRandomForestClassifier(random_state=123,prob_type='confidence',n_estimators=10),
    'MonteCarloRandomForest_Distance_Prob':  MonteCarloRandomForestClassifier(random_state=123,prob_type='distance',n_estimators=10),
    'FeatureSqueezing': RandomForestClassifier(random_state=123,n_estimators=10),
 #   'GaussianAugmentation_DecisionTreeAttack': DecisionTreeClassifier(random_state=123),
}

def evaluate_classifier(dataset_name, dataset, clf_name, clf, train_index, test_index):
    X, y = dataset.data.fillna(0).values, dataset.target
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train classifier
    start_time = time.time()
    clf.fit(X_train, y_train)

    # Predict
    pred_probs = clf.predict_proba(X_test)
    preds = clf.predict(X_test)
    runtime = time.time() - start_time

    # Evaluate
    auc = roc_auc_score(y_test, pred_probs[:, 1], multi_class='ovr')
    f1 = f1_score(y_test, preds, average='macro')
    logloss = log_loss(y_test, pred_probs)
    accuracy = accuracy_score(y_test, preds)

    return [dataset_name, clf_name, auc, f1, logloss, accuracy, runtime]

# Loop through datasets
all_tasks = []

for dataset_name, dataset in datasets:
    print(dataset_name)
    X, y = dataset.data.fillna(0).values, dataset.target
    
    # Perform 5-fold cross-validation
    for clf_name, clf in classifiers.items():
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            task = delayed(evaluate_classifier)(dataset_name, dataset, clf_name, clf, train_index, test_index)
            all_tasks.append(task)

# Execute all tasks in parallel
results = Parallel(n_jobs=-1)(all_tasks)

# Convert results to DataFrame for easy visualization
results_df_wo_rf = pd.DataFrame(results, columns=['Dataset', 'Classifier', 'AUC', 'F1', 'LogLoss', 'Accuracy', 'Runtime'])


!ar4.csv
!bodyfat.csv
Kaggle_Surgical-deepnet.csv
MaternalBinary.csv
OPENML_philippine.csv
AcousticExtinguisherFire.csv
acute-inflammation.csv
acute-nephritis.csv
AP_Colon_Lung.csv
backache.csv
blood.csv
chess-krvkp.csv
cloud.csv
congressional-voting.csv
credit-approval.csv
dresses-salesN.csv
echocardiogram.csv
haberman-survival.csv
heart_failure_clinical_records_dataset.csv
heart-hungarian.csv
hill-valley.csv
horse-colic.csv
ilpd-indian-liver.csv
no2.csv
kaggle_REWEMA.csv
lowbwt.csv
madelon.csv
Mesothelioma.csv
MIMIC2.csv
molec-biol-promoter.csv
oil_spill.csv
oocytes_merluccius_nucleus_4d.csv
oocytes_trisopterus_nucleus_2f.csv
ozone.csv
Parkinson_Multiple_Sound_Recording.csv
PC1 Software defect prediction.csv
pd_speech_features.csv
pima.csv
Pistachio_28_Features_Dataset.csv
plasma_retinol.csv
primary-tumorNumeric.csv
seismic-bumps.csv
sleuth_case2002.csv
spambase.csv
spect.csv
spectf.csv
statlog-australian-credit.csv
statlog-heart_.csv
ThoraricSurgery.csv
triazines.csv


In [7]:
summary_wo = results_df_wo_rf.groupby(['Dataset','Classifier']).agg({
    'AUC': ['mean', 'std'],
    'F1': ['mean', 'std'],
    'LogLoss': ['mean', 'std'],
    'Accuracy': ['mean', 'std'],
    'Runtime': ['mean', 'std']
}).reset_index()

print(summary_wo)

           Dataset                              Classifier       AUC  \
                                                                mean   
0         !ar4.csv                        FeatureSqueezing  0.844935   
1         !ar4.csv   MonteCarloRandomForest_Agreement_Prob  0.832516   
2         !ar4.csv       MonteCarloRandomForest_Bayes_Prob  0.835458   
3         !ar4.csv  MonteCarloRandomForest_Confidence_Prob  0.858987   
4         !ar4.csv       MonteCarloRandomForest_Depth_Prob  0.827614   
..             ...                                     ...       ...   
395  triazines.csv  MonteCarloRandomForest_Confidence_Prob  0.782213   
396  triazines.csv       MonteCarloRandomForest_Depth_Prob  0.796726   
397  triazines.csv    MonteCarloRandomForest_Distance_Prob  0.820649   
398  triazines.csv         MonteCarloRandomForest_Fix_Prob  0.832251   
399  triazines.csv                            RandomForest  0.796607   

                     F1             LogLoss            Accuracy

In [4]:
from joblib import Parallel, delayed
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from art.defences.preprocessor import FeatureSqueezing,GaussianAugmentation
from sklearn.metrics import roc_auc_score, f1_score, log_loss, accuracy_score

# Assuming that MonteCarloRandomForestClassifier is imported or defined elsewhere

# 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# List to store evaluation results
results = []

# Define classifiers
# Define classifiers
classifiers = {
    'RandomForest_ZooAttack':  RandomForestClassifier(random_state=123,n_estimators=10),
    'MonteCarloRandomForest_Fix_Prob_ZooAttack':  MonteCarloRandomForestClassifier(random_state=123,prob_type='fixed',n_estimators=10),
    'MonteCarloRandomForest_Depth_Prob_ZooAttack':  MonteCarloRandomForestClassifier(random_state=123,prob_type='depth',n_estimators=10),
    'MonteCarloRandomForest_Agreement_Prob_ZooAttack':  MonteCarloRandomForestClassifier(random_state=123,prob_type='agreement',n_estimators=10),
    'MonteCarloRandomForest_Confidence_Prob_ZooAttack':  MonteCarloRandomForestClassifier(random_state=123,prob_type='confidence',n_estimators=10),
    'MonteCarloRandomForest_Distance_Prob_ZooAttack':  MonteCarloRandomForestClassifier(random_state=123,prob_type='distance',n_estimators=10),
    'FeatureSqueezing_ZooAttack': RandomForestClassifier(random_state=123,n_estimators=10),
}


def evaluate_classifier(dataset_name, dataset, clf_name, clf, train_index, test_index):
    X, y = dataset.data.fillna(0).values, dataset.target
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train classifier
    clf.fit(X_train, y_train)

    if clf_name != 'RandomForest_ZooAttack':
        dummy_clf = RandomForestClassifier(random_state=123,n_estimators=10)
        dummy_clf.fit(X_train, y_train)
        classifier = SklearnClassifier(model=dummy_clf, use_logits=True)
    else:
        classifier = SklearnClassifier(model=clf, use_logits=True)
        
    attack = ZooAttack(classifier=classifier, confidence=0.0, targeted=False, learning_rate=1e-1, max_iter=5,
           binary_search_steps=5, initial_const=1e-3, abort_early=True, use_resize=False, 
           use_importance=False, nb_parallel=2, batch_size=1, variable_h=0.2) 
    x_test_adv = attack.generate(x=X_test)
    
    if clf_name == 'FeatureSqueezing_ZooAttack':
        # Initialize the feature squeezing defence
        defence = FeatureSqueezing(clip_values=(X_train.min(), X_train.max()), bit_depth=4)

        # Fit the defence with training data
        defence.fit(X_train)

        # Apply the defence on testing data
        x_test_adv = defence(x_test_adv)[0]
        
    if clf_name == 'GaussianAugmentation_ZooAttack':
        # Initialize the Gaussian Augmentation defence
        defence = GaussianAugmentation(sigma=1.0)

        # Apply the Gaussian Augmentation on training data
        X_train_augmented, y_train_augmented = defence(X_train, y_train)

        # Retrain the model on the augmented data
        clf.fit(X_train_augmented, y_train_augmented)
    
    start_time = time.time()
    pred_probs = clf.predict_proba(x_test_adv)
    preds = clf.predict(x_test_adv)
    runtime = time.time() - start_time

    # Evaluate
    auc = roc_auc_score(y_test, pred_probs[:, 1], multi_class='ovr')
    f1 = f1_score(y_test, preds, average='macro')
    logloss = log_loss(y_test, pred_probs)
    accuracy = accuracy_score(y_test, preds)

    return [dataset_name, clf_name, auc, f1, logloss, accuracy, runtime]

# Loop through datasets
all_tasks = []

for dataset_name, dataset in datasets:
    print(dataset_name)
    X, y = dataset.data.fillna(0).values, dataset.target
    try:
        # Perform 5-fold cross-validation
        for clf_name, clf in classifiers.items():
            for i, (train_index, test_index) in enumerate(skf.split(X, y)):
                task = delayed(evaluate_classifier)(dataset_name, dataset, clf_name, clf, train_index, test_index)
                all_tasks.append(task)
    except Exception as e:
        print(f"Error in {clf_name} on {dataset_name}: {e}")

# Execute all tasks in parallel
try:
    results = Parallel(n_jobs=-1)(all_tasks)
except Exception as e:
    print(f"Error in {clf_name} on {dataset_name}: {e}")

# Convert results to DataFrame for easy visualization
results_df_ZooAttack = pd.DataFrame(results, columns=['Dataset', 'Classifier', 'AUC', 'F1', 'LogLoss', 'Accuracy', 'Runtime'])


summary_ZooAttack = results_df_ZooAttack.groupby(['Dataset','Classifier']).agg({
    'AUC': ['mean', 'std'],
    'F1': ['mean', 'std'],
    'LogLoss': ['mean', 'std'],
    'Accuracy': ['mean', 'std'],
    'Runtime': ['mean', 'std']
}).reset_index()

print(summary_ZooAttack)

!ar4.csv
!bodyfat.csv
Kaggle_Surgical-deepnet.csv
MaternalBinary.csv
OPENML_philippine.csv
AcousticExtinguisherFire.csv
acute-inflammation.csv
acute-nephritis.csv
AP_Colon_Lung.csv
backache.csv
blood.csv
chess-krvkp.csv
cloud.csv
congressional-voting.csv
credit-approval.csv
dresses-salesN.csv
echocardiogram.csv
haberman-survival.csv
heart_failure_clinical_records_dataset.csv
heart-hungarian.csv
hill-valley.csv
horse-colic.csv
ilpd-indian-liver.csv
no2.csv
kaggle_REWEMA.csv
lowbwt.csv
madelon.csv
Mesothelioma.csv
MIMIC2.csv
molec-biol-promoter.csv
oil_spill.csv
oocytes_merluccius_nucleus_4d.csv
oocytes_trisopterus_nucleus_2f.csv
ozone.csv
Parkinson_Multiple_Sound_Recording.csv
PC1 Software defect prediction.csv
pd_speech_features.csv
pima.csv
Pistachio_28_Features_Dataset.csv
plasma_retinol.csv
primary-tumorNumeric.csv
seismic-bumps.csv
sleuth_case2002.csv
spambase.csv
spect.csv
spectf.csv
statlog-australian-credit.csv
statlog-heart_.csv
ThoraricSurgery.csv
triazines.csv


ZOO:  48%|████▊     | 10/21 [00:01<00:01,  7.12it/s]
ZOO:   6%|▌         | 3/51 [00:00<00:05,  8.64it/s]]
ZOO:  47%|████▋     | 24/51 [00:02<00:02,  9.58it/s]
ZOO:  90%|█████████ | 19/21 [00:02<00:00, 10.01it/s]
ZOO: 100%|██████████| 21/21 [00:02<00:00,  8.16it/s]
ZOO:  57%|█████▋    | 29/51 [00:03<00:02,  9.86it/s]
ZOO: 100%|██████████| 21/21 [00:02<00:00,  7.55it/s]
ZOO:  52%|█████▏    | 26/50 [00:02<00:02, 10.40it/s]
ZOO:  50%|█████     | 25/50 [00:02<00:02,  9.71it/s]
ZOO:  57%|█████▋    | 29/51 [00:03<00:02,  9.95it/s]
ZOO:  60%|██████    | 30/50 [00:03<00:01, 11.05it/s]
ZOO:  57%|█████▋    | 29/51 [00:03<00:02,  9.87it/s]
ZOO:  57%|█████▋    | 29/51 [00:03<00:02,  9.38it/s]
ZOO:  20%|██        | 10/50 [00:00<00:04,  9.93it/s]
ZOO:  33%|███▎      | 17/51 [00:01<00:03,  9.17it/s]
ZOO:  36%|███▌      | 18/50 [00:01<00:03,  9.24it/s]]
ZOO:  98%|█████████▊| 49/50 [00:05<00:00,  9.01it/s]]
ZOO: 100%|██████████| 50/50 [00:05<00:00,  9.06it/s]]
ZOO:  58%|█████▊    | 29/50 [00:03<00:02,  

           Dataset                                        Classifier  \
                                                                       
0         !ar4.csv                        FeatureSqueezing_ZooAttack   
1         !ar4.csv   MonteCarloRandomForest_Agreement_Prob_ZooAttack   
2         !ar4.csv  MonteCarloRandomForest_Confidence_Prob_ZooAttack   
3         !ar4.csv       MonteCarloRandomForest_Depth_Prob_ZooAttack   
4         !ar4.csv    MonteCarloRandomForest_Distance_Prob_ZooAttack   
..             ...                                               ...   
345  triazines.csv  MonteCarloRandomForest_Confidence_Prob_ZooAttack   
346  triazines.csv       MonteCarloRandomForest_Depth_Prob_ZooAttack   
347  triazines.csv    MonteCarloRandomForest_Distance_Prob_ZooAttack   
348  triazines.csv         MonteCarloRandomForest_Fix_Prob_ZooAttack   
349  triazines.csv                            RandomForest_ZooAttack   

          AUC                  F1             LogLoss          

In [8]:
summary_wo.append(summary_ZooAttack).sort_values(by='Dataset').to_csv('50ds_rf_summaryV5.csv',index=False)

In [9]:
summary_wo.sort_values(by='Dataset').to_csv('50ds_wo_rf_summaryV5.csv',index=False)

In [5]:
summary_ZooAttack.sort_values(by='Dataset').to_csv('50ds_zoo_rf_summaryV5.csv',index=False)











