In [5]:
import pandas as pd
from aif360.datasets import BinaryLabelDataset

import argparse

import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from catboost.metrics import RMSEWithUncertainty
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'catboost'

In [130]:
# Group definitions keyed on the protected attribute `race`:
# rows with race == 1 form the privileged group, race == 0 the unprivileged one.
privileged_groups = [dict(race=1)]
unprivileged_groups = [dict(race=0)]

# Load the preprocessed COMPAS table and wrap it in an AIF360 dataset.
# NOTE(review): `two_year_recid == 1` is declared the *favorable* label here —
# confirm this matches how the CSV encodes the outcome.
df = pd.read_csv("./data/compas_preprocessed_final.csv")
dataset = BinaryLabelDataset(
    df=df,
    label_names=['two_year_recid'],
    protected_attribute_names=['race'],
    favorable_label=1,
    unfavorable_label=0,
)


In [131]:
# Feature matrix and flattened (1-D) label vector taken from the AIF360 dataset.
X, y = dataset.features, dataset.labels.ravel()

In [132]:
# Sanity check: the label vector should be 1-D after `ravel()`.
y.shape

(6172,)

In [133]:
# Split the data 50/50 (stratified on the label): `X_train` is used to fit the
# target classifier, the other half is held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

In [136]:
# Inspect the labels of the held-out half.
y_test

array([0., 0., 0., ..., 1., 1., 1.])

In [139]:
# Split the held-out half again (stratified): 80% becomes `X_val`/`y_val`,
# the remaining 20% becomes `X_test_us`/`y_test_us`.
# NOTE: `X_test`/`y_test` are rebound in a later cell, so re-running this cell
# after that one would split a different array.
X_val, X_test_us, y_val, y_test_us = train_test_split(
    X_test,
    y_test,
    test_size=0.2,
    stratify=y_test,
    random_state=42,
)

In [140]:
# Inspect the labels of the validation portion of the held-out half.
y_val

array([1., 0., 1., ..., 0., 1., 1.])

In [141]:
# Build the evaluation set: all training rows first, followed by the
# `*_test_us` rows. The row order must match the "in"/"out" membership labels.
# NOTE: this rebinds `X_test`/`y_test`, which previously held the held-out half.
X_test = np.concatenate([X_train, X_test_us])
y_test = np.concatenate([y_train, y_test_us])

In [142]:
# Inspect the labels of the combined (train + held-out) evaluation set.
y_test

array([1., 0., 0., ..., 0., 0., 0.])

In [143]:
# Ground-truth membership, aligned with the rows of the combined evaluation set:
# one "in" per training row, then one "out" per `X_test_us` row.
n_members = len(X_train)
n_non_members = len(X_test_us)
membership = ["in"] * n_members + ["out"] * n_non_members

In [144]:
def objective(trial):
    """Optuna objective for the target classifier.

    Samples CatBoost hyper-parameters from ``trial``, fits a
    ``CatBoostClassifier`` on 80% of ``X_train``/``y_train`` and returns the
    ROC AUC on the remaining stratified 20%.

    Args:
        trial: Optuna trial used to sample hyper-parameters.

    Returns:
        float: ROC AUC of the positive-class probability on the inner
        validation fold (to be maximised).
    """
    param = {
        "depth": trial.suggest_int("depth", 1, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1, log=True),
        "random_strength": trial.suggest_float("random_strength", 1, 10, log=True),
        "objective": trial.suggest_categorical(
            "objective", ["Logloss", "CrossEntropy"]
        ),
        "iterations": trial.suggest_int("iterations", 1, 1000, log=True),
    }

    # Fixed (non-searched) settings.
    param["thread_count"] = 4
    param["random_seed"] = 42

    # Seed the inner split from the trial number instead of the unseeded global
    # NumPy RNG: each trial still gets its own split, but the split is
    # reproducible across kernel restarts.
    _X_train, _X_valid, _y_train, _y_valid = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        stratify=y_train,
        random_state=trial.number,
    )

    clf = CatBoostClassifier(**param)
    clf.fit(_X_train, _y_train, verbose=0)

    # Column 1 holds the probability of the positive class.
    _y_pred_valid = clf.predict(_X_valid, prediction_type="Probability")[:, 1]
    return roc_auc_score(_y_valid, _y_pred_valid)



In [145]:
# Hyper-parameter search for the target classifier: maximise the ROC AUC
# returned by `objective` over 15 trials.
# NOTE: the name `study` is rebound by a later cell, so cells that read
# `study.best_trial` depend on execution order.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15)

[I 2024-10-24 00:09:11,883] A new study created in memory with name: no-name-da260968-6992-45f8-a6ba-c0c253462388
[I 2024-10-24 00:09:13,803] Trial 0 finished with value: 0.707699293536226 and parameters: {'depth': 10, 'l2_leaf_reg': 1.248663430543325, 'learning_rate': 0.025096150073010134, 'random_strength': 1.2027593397072713, 'objective': 'Logloss', 'iterations': 444}. Best is trial 0 with value: 0.707699293536226.
[I 2024-10-24 00:09:13,892] Trial 1 finished with value: 0.7092304930462422 and parameters: {'depth': 8, 'l2_leaf_reg': 1.2888202849107029, 'learning_rate': 0.17742551761574213, 'random_strength': 1.01310833104929, 'objective': 'CrossEntropy', 'iterations': 56}. Best is trial 1 with value: 0.7092304930462422.
[I 2024-10-24 00:09:13,910] Trial 2 finished with value: 0.6738492243682481 and parameters: {'depth': 5, 'l2_leaf_reg': 8.101207845595134, 'learning_rate': 0.01706081360453126, 'random_strength': 1.4517657818000946, 'objective': 'Logloss', 'iterations': 1}. Best is t

In [146]:
def detailed_objective(trial):
    """Refit the classifier for ``trial`` and score the evaluation sets.

    The hyper-parameters are read back from ``trial`` using the same search
    space as ``objective``. The model is fitted on the whole of
    ``X_train``/``y_train`` and then applied to ``X_val`` and ``X_test``.

    Args:
        trial: Optuna trial providing the hyper-parameter values.

    Returns:
        tuple: ``(score_val, score_test)`` — positive-class probabilities for
        ``X_val`` and ``X_test`` respectively.
    """
    searched = dict(
        depth=trial.suggest_int("depth", 1, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1, log=True),
        random_strength=trial.suggest_float("random_strength", 1, 10, log=True),
        objective=trial.suggest_categorical(
            "objective", ["Logloss", "CrossEntropy"]
        ),
        iterations=trial.suggest_int("iterations", 1, 1000, log=True),
    )
    fixed = {"thread_count": 4, "random_seed": 42}

    model = CatBoostClassifier(**searched, **fixed)
    model.fit(X_train, y_train, verbose=0)

    # Column 1 is the probability of the positive class.
    proba_val = model.predict(X_val, prediction_type="Probability")
    proba_test = model.predict(X_test, prediction_type="Probability")
    return proba_val[:, 1], proba_test[:, 1]


In [147]:
# Refit with the best classifier trial and collect positive-class probabilities
# on the validation set and on the combined evaluation set.
# Must run while `study` still refers to the classifier study (it is rebound later).
y_score_val, y_score_test = detailed_objective(study.best_trial)

In [148]:
# Check the container type returned for the evaluation-set scores.
type(y_score_test)

numpy.ndarray

In [149]:
# Size of the validation set (rows, features).
X_val.shape

(2468, 11)

In [150]:
def f_scores(prob, l):
    """Signed logit of a predicted probability with respect to the true label.

    Computes ``(log(prob) - log(1 - prob)) * (2 * l - 1)``: the logit of
    ``prob``, kept as-is where ``l == 1`` and negated where ``l == 0``.

    Args:
        prob: Predicted probability (scalar or NumPy array) of the positive class.
        l: True label(s) in {0, 1}, broadcastable against ``prob``.

    Returns:
        The signed logit, with the broadcast shape of ``prob`` and ``l``.
        Probabilities of exactly 0 or 1 yield infinite values.
    """
    logit = np.log(prob) - np.log(1 - prob)
    sign = 2 * l - 1
    return logit * sign


# Singular alias so that calls spelled `f_score` resolve to the same function
# on a fresh kernel.
f_score = f_scores

In [151]:
# Convert probabilities into label-signed logits using the helper bound as
# `f_scores` in the previous cell (the singular spelling is not defined on a
# fresh kernel).
# NOTE: this overwrites the probabilities in place, so the cell is not
# idempotent — re-run the cell that calls `detailed_objective` before
# re-running this one.
y_score_test = f_scores(y_score_test, y_test)
y_score_val = f_scores(y_score_val, y_val)

In [153]:
# Inspect the transformed (signed-logit) validation scores.
y_score_val

array([ 1.58045868, -0.38077105,  0.85201487, ...,  0.79002577,
       -1.12522019,  0.61227023])

In [154]:
# Seeded generator that supplies the split seeds inside `objective_2`.
rng = np.random.RandomState(seed=42)

In [156]:
def objective_2(trial):
    """Optuna objective for the score-distribution regressor.

    Samples CatBoost hyper-parameters from ``trial``, fits a
    ``CatBoostRegressor`` with the ``RMSEWithUncertainty`` objective on 80% of
    ``X_val``/``y_score_val`` and evaluates the same metric on the remaining
    20%. The split is stratified on the class labels ``y_val``.

    Args:
        trial: Optuna trial used to sample hyper-parameters.

    Returns:
        The ``RMSEWithUncertainty`` value on the inner validation fold (to be
        minimised), or ``np.inf`` if fitting or evaluation raised an error.
    """
    param = {
        "depth": trial.suggest_int("depth", 1, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 1e4, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "iterations": trial.suggest_int("iterations", 1, 1000, log=True),
    }

    # Fixed (non-searched) settings.
    param["thread_count"] = 1
    param["objective"] = "RMSEWithUncertainty"
    param["posterior_sampling"] = True
    param["random_seed"] = 42
    eval_metric = RMSEWithUncertainty()

    _X_train, _X_valid, _y_train, _y_valid = train_test_split(
        X_val,
        y_score_val,
        test_size=0.2,
        random_state=rng.randint(0, 1000),
        stratify=y_val,
    )

    clf = CatBoostRegressor(**param)
    try:
        clf.fit(_X_train, _y_train, verbose=0)
        _y_pred_valid = clf.predict(_X_valid, prediction_type="RawFormulaVal")
        score = eval_metric.eval(label=_y_valid.T, approx=_y_pred_valid.T)
        return score
    except Exception:
        # Treat a failed fit/evaluation as the worst possible trial. Catching
        # `Exception` rather than using a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate so the study can still be interrupted.
        return np.inf


In [157]:
# Hyper-parameter search for the score-distribution regressor: minimise the
# value returned by `objective_2`.
# `pruner` must be a pruner *instance*, not the class. `objective_2` never
# reports intermediate values, so no pruning actually takes place.
# NOTE: this rebinds `study`; from here on `study.best_trial` refers to the
# regressor search, not the classifier search.
study = optuna.create_study(
    direction="minimize", sampler=None, pruner=optuna.pruners.HyperbandPruner()
)
study.optimize(objective_2, n_trials=15, n_jobs=30)

[I 2024-10-24 00:09:45,062] A new study created in memory with name: no-name-a76431e4-94b5-4d45-a121-b3535e30c303
[I 2024-10-24 00:09:45,221] Trial 8 finished with value: 1.2919392962736547 and parameters: {'depth': 2, 'l2_leaf_reg': 0.027089870503053236, 'learning_rate': 0.06380385072881259, 'iterations': 4}. Best is trial 8 with value: 1.2919392962736547.
[I 2024-10-24 00:09:45,225] Trial 6 finished with value: 1.353624544202955 and parameters: {'depth': 5, 'l2_leaf_reg': 2580.9313287389405, 'learning_rate': 0.00020938160395582466, 'iterations': 1}. Best is trial 8 with value: 1.2919392962736547.
[I 2024-10-24 00:09:45,227] Trial 2 finished with value: 1.322183880768951 and parameters: {'depth': 8, 'l2_leaf_reg': 8936.052771777877, 'learning_rate': 0.06620389957423026, 'iterations': 2}. Best is trial 8 with value: 1.2919392962736547.
[I 2024-10-24 00:09:45,228] Trial 5 finished with value: 1.3445301823667715 and parameters: {'depth': 7, 'l2_leaf_reg': 0.0001058504179837069, 'learning

In [158]:
def detailed_objective2(trial):
    """Refit the uncertainty regressor for ``trial`` and predict on ``X_test``.

    The hyper-parameters are read back from ``trial`` using the same search
    space as ``objective_2``. The regressor is fitted on the whole of
    ``X_val``/``y_score_val``.

    Args:
        trial: Optuna trial providing the hyper-parameter values.

    Returns:
        Raw-formula predictions of the fitted regressor for ``X_test``.
    """
    searched = dict(
        depth=trial.suggest_int("depth", 1, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-4, 1e4, log=True),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        iterations=trial.suggest_int("iterations", 1, 1000, log=True),
    )
    fixed = {
        "thread_count": 1,
        "objective": "RMSEWithUncertainty",
        "posterior_sampling": True,
        "random_seed": 42,
    }

    regressor = CatBoostRegressor(**searched, **fixed)
    regressor.fit(X_val, y_score_val, verbose=0)

    return regressor.predict(X_test, prediction_type="RawFormulaVal")

In [159]:
# Refit the regressor with the best trial of the current `study` (the
# regressor search) and predict on the combined evaluation set.
y_conf = detailed_objective2(study.best_trial)

In [160]:
# Inspect the raw regressor output: two values per evaluation row.
y_conf

array([[ 0.26942777, -0.52455162],
       [ 0.74101589, -0.16426721],
       [ 0.3177515 , -0.33531891],
       ...,
       [ 0.55111523, -0.26652674],
       [ 1.07496108,  0.33549692],
       [ 0.26013298, -0.36100899]])

In [161]:
# Assemble one row per evaluation example: the observed score, the two
# regressor outputs and the ground-truth membership label.
# NOTE(review): columns 0 and 1 of `y_conf` are named `mu` and `log_sigma` here;
# confirm against CatBoost's RMSEWithUncertainty raw output convention.
gaussian_pred = pd.DataFrame(
    {
        "score": y_score_test,
        "mu": y_conf[:, 0],
        "log_sigma": y_conf[:, 1],
        "membership": membership,
    }
)

In [162]:
# Preview the assembled prediction table.
gaussian_pred

Unnamed: 0,score,mu,log_sigma,membership
0,0.841834,0.269428,-0.524552,in
1,1.272788,0.741016,-0.164267,in
2,-0.975373,0.317751,-0.335319,in
3,-0.462765,0.242633,-0.658045,in
4,0.686706,0.192311,-0.643044,in
...,...,...,...,...
3699,0.961860,0.235420,-0.450417,out
3700,0.389257,0.262767,-0.406303,out
3701,-1.172062,0.551115,-0.266527,out
3702,1.864374,1.074961,0.335497,out


In [122]:
# Persist the prediction table; the row index is written as the first column.
gaussian_pred.to_csv("GAUSSIAN_PRED.csv", index=True)

In [6]:
from sklearn.metrics import accuracy_score

# Step 1: decision threshold applied to the predicted mean (`mu`); 0.5 is used here.
threshold = 0.5  # Adjust this threshold as necessary based on your experiment

# Step 2: predict "in" where mu is below the threshold, "out" otherwise.
mu_below_threshold = gaussian_pred["mu"].lt(threshold)
predicted_membership = mu_below_threshold.map({True: "in", False: "out"}).tolist()

# Step 3: compare against the ground-truth membership stored in the dataframe.
true_membership = gaussian_pred["membership"]
qmia_accuracy = accuracy_score(true_membership, predicted_membership)

print(f"QMIA Privacy Accuracy: {qmia_accuracy}")

NameError: name 'gaussian_pred' is not defined