In [88]:
import pandas as pd
from aif360.datasets import BinaryLabelDataset

import argparse

import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [89]:
privileged_groups = [{'race': 1}]
unprivileged_groups = [{'race': 0}]
df = pd.read_csv("./data/compas_preprocessed_final.csv")
dataset = BinaryLabelDataset(
                  favorable_label=1,  
                  unfavorable_label=0,  
                  df=df,
                  label_names=['two_year_recid'], 
                  protected_attribute_names=['race'])


In [90]:
X = dataset.features
y = dataset.labels.ravel()

In [91]:
y.shape

(6172,)

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify=y)

In [95]:
y_test

array([0., 0., 0., ..., 1., 1., 1.])

In [98]:
X_val, X_test_us, y_val, y_test_us = train_test_split(X_test, y_test, test_size=0.2, random_state=42, stratify=y_test)

In [99]:
y_val

array([1., 0., 1., ..., 0., 1., 1.])

In [100]:
X_test = np.concatenate((X_train, X_test_us), axis=0)
y_test = np.concatenate((y_train, y_test_us), axis=0)

In [101]:
y_test

array([1., 0., 0., ..., 0., 0., 0.])

In [102]:
membership = ["in"] * X_train.shape[0] + ["out"] * X_test_us.shape[0]

In [103]:
def objective(trial):
    param = {
        "depth": trial.suggest_int("depth", 1, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1, log=True),
        "random_strength": trial.suggest_float("random_strength", 1, 10, log=True),
        "objective": trial.suggest_categorical(
            "objective", ["Logloss", "CrossEntropy"]
        ),
        "iterations": trial.suggest_int("iterations", 1, 1000, log=True),
    }

    # param["early_stopping_rounds"] = 5
    param["thread_count"] = 4
    param["random_seed"] = 42
    
    _X_train, _X_valid, _y_train, _y_valid = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        stratify=y_train,
        random_state=np.random.randint(0, 1000),
    )
    
    

    clf = CatBoostClassifier(**param)
    clf.fit(_X_train, _y_train, verbose=0)
    _y_pred_test = clf.predict(_X_valid, prediction_type="Probability")[:, 1]
    score = roc_auc_score(
        _y_valid,
        _y_pred_test,
    )

    return score



In [104]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15)

[I 2024-10-23 23:47:55,511] A new study created in memory with name: no-name-3a9f1fa2-9a6a-470c-b930-be00c610eedb
[I 2024-10-23 23:47:55,584] Trial 0 finished with value: 0.7377002439359219 and parameters: {'depth': 8, 'l2_leaf_reg': 1.1209169798855607, 'learning_rate': 0.014550178670304696, 'random_strength': 9.433044670926725, 'objective': 'Logloss', 'iterations': 32}. Best is trial 0 with value: 0.7377002439359219.
[I 2024-10-23 23:47:55,624] Trial 1 finished with value: 0.7232013685755619 and parameters: {'depth': 10, 'l2_leaf_reg': 2.3312681072143966, 'learning_rate': 0.09228566317822752, 'random_strength': 1.1279202420313996, 'objective': 'Logloss', 'iterations': 3}. Best is trial 0 with value: 0.7377002439359219.
[I 2024-10-23 23:47:55,721] Trial 2 finished with value: 0.7375735239764722 and parameters: {'depth': 4, 'l2_leaf_reg': 1.0806823401445003, 'learning_rate': 0.04014947421981846, 'random_strength': 1.1417242328398947, 'objective': 'CrossEntropy', 'iterations': 163}. Best

In [105]:
def detailed_objective(trial):
    param = {
        "depth": trial.suggest_int("depth", 1, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1, log=True),
        "random_strength": trial.suggest_float("random_strength", 1, 10, log=True),
        "objective": trial.suggest_categorical(
            "objective", ["Logloss", "CrossEntropy"]
        ),
        "iterations": trial.suggest_int("iterations", 1, 1000, log=True),
    }

    param["thread_count"] = 4
    param["random_seed"] = 42

    clf = CatBoostClassifier(**param)
    clf.fit(X_train, y_train, verbose=0)

    score_val = clf.predict(X_val, prediction_type="Probability")[:, 1]
    score_test = clf.predict(X_test, prediction_type="Probability")[:, 1]

    return score_val, score_test


In [106]:
y_score_val, y_score_test = detailed_objective(study.best_trial)

In [107]:
type(y_score_test)

numpy.ndarray

In [108]:
X_val.shape

(2468, 11)

In [109]:
f_scores = lambda prob, l: (np.log(prob) - np.log(1 - prob)) * (2 * l - 1)

In [110]:
y_score_test = f_score(y_score_test, y_test)
y_score_val = f_score(y_score_val, y_val)

In [112]:
y_score_val

array([ 1.34517604, -0.62640136,  0.50145298, ...,  0.58334185,
       -0.95632944,  0.26612056])

In [113]:
rng = np.random.RandomState(42)

In [114]:
def objective_2(trial):
    param = {
        "depth": trial.suggest_int("depth", 1, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 1e4, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "iterations": trial.suggest_int("iterations", 1, 1000, log=True),
    }

    param["thread_count"] = 1
    param["objective"] = "RMSEWithUncertainty"
    param["posterior_sampling"] = True
    param["random_seed"] = args.seed
    eval_metric = RMSEWithUncertainty()

    _X_train, _X_valid, _y_train, _y_valid = train_test_split(
        X_val,
        y_score_val,
        test_size=0.2,
        random_state=rng.randint(0, 1000),
        stratify=y_val,
    )
    
    clf = CatBoostRegressor(**param)
    try:
        clf.fit(_X_train, _y_train, verbose=0)
        _y_pred_valid = clf.predict(_X_valid, prediction_type="RawFormulaVal")
        score = eval_metric.eval(label=_y_valid.T, approx=_y_pred_valid.T)
        return score
    except:
        return np.inf


In [115]:
study = optuna.create_study(
    direction="minimize", sampler=None, pruner=optuna.pruners.HyperbandPruner
)
study.optimize(objective, n_trials=15, n_jobs=30)

[I 2024-10-23 23:50:16,618] A new study created in memory with name: no-name-e770c321-7f50-4250-a45e-77f51239c3bd
[I 2024-10-23 23:50:16,760] Trial 1 finished with value: 0.7209890492835043 and parameters: {'depth': 3, 'l2_leaf_reg': 1.3912093505537122, 'learning_rate': 0.21679049616862817, 'random_strength': 5.817637415352522, 'objective': 'Logloss', 'iterations': 7}. Best is trial 1 with value: 0.7209890492835043.
[I 2024-10-23 23:50:16,795] Trial 7 finished with value: 0.6102305247262321 and parameters: {'depth': 1, 'l2_leaf_reg': 8.793990519995722, 'learning_rate': 0.049155925391785246, 'random_strength': 1.3426563635698265, 'objective': 'CrossEntropy', 'iterations': 1}. Best is trial 7 with value: 0.6102305247262321.
[I 2024-10-23 23:50:16,807] Trial 10 finished with value: 0.6683052261423277 and parameters: {'depth': 1, 'l2_leaf_reg': 1.6247589336877521, 'learning_rate': 0.7971507840782652, 'random_strength': 2.1224813052604374, 'objective': 'CrossEntropy', 'iterations': 2}. Best

In [117]:
def detailed_objective(trial):
    param = {
        "depth": trial.suggest_int("depth", 1, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 1e4, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "iterations": trial.suggest_int("iterations", 1, 1000, log=True),
    }

    param["thread_count"] = 1
    param["objective"] = "RMSEWithUncertainty"
    param["posterior_sampling"] = True
    param["random_seed"] = 42

    clf = CatBoostRegressor(**param)
    clf.fit(X_val, y_score_val, verbose=0)

    conf_test = clf.predict(X_test, prediction_type="RawFormulaVal")

    return conf_test


In [118]:
y_conf = detailed_objective(study.best_trial)

In [119]:
y_conf

array([[ 0.40489746, -0.13159353],
       [ 0.39830438, -0.14300263],
       [ 0.40489746, -0.13159353],
       ...,
       [ 0.40489746, -0.13159353],
       [ 0.40489746, -0.13159353],
       [ 0.39830438, -0.14300263]])

In [120]:
gaussian_pred = {}

gaussian_pred["score"] = y_score_test
gaussian_pred["mu"] = y_conf[:, 0]
gaussian_pred["log_sigma"] = y_conf[:, 1]
gaussian_pred["membership"] = membership

gaussian_pred = pd.DataFrame(gaussian_pred)

In [123]:
gaussian_pred

Unnamed: 0,score,mu,log_sigma,membership
0,0.766573,0.404897,-0.131594,in
1,1.230996,0.398304,-0.143003,in
2,-0.031191,0.404897,-0.131594,in
3,-0.706038,0.404897,-0.131594,in
4,0.574149,0.398304,-0.143003,in
...,...,...,...,...
3699,0.953580,0.404897,-0.131594,out
3700,0.855430,0.398304,-0.143003,out
3701,-0.897647,0.404897,-0.131594,out
3702,1.649446,0.404897,-0.131594,out


In [122]:
gaussian_pred.to_csv("GAUSSIAN_PRED.csv")