# Imports

In [None]:
!pip install aequitas-lite  # Not available in default environment

import numpy as np  # Just for RNG.
import pandas as pd

from aequitas.group import Group  # Aequitas is a package for Fairness evaluation
from sklearn.metrics import roc_curve  # Performance evaluation
from typing import Tuple  # Method typing.

# Evaluation

In this notebook, we present code snippets to obtain the metrics evaluated in the original NeurIPS paper, https://arxiv.org/abs/2211.13358, for any vector of predictions.

This notebook assumes that a model has been trained on one of the datasets of the Bank Account Fraud Suite and that a vector of predictions has been obtained for its test set.
Predictions should be model scores, in \[0, 1\] ❗ not binarized ❗

Ideally, you should only change `UPPERCASE` variables (unless splits or protected groups are changed).

In [None]:
# Load the dataset that the model used.
DATASET_NAME = "Base" # replace with adequate: "Base", "Variant I", "Variant II", "Variant III", "Variant IV", "Variant V"

# The CSV file name is the dataset name, so changing DATASET_NAME is enough to switch variants.
path = f"/kaggle/input/bank-account-fraud-dataset-neurips-2022/{DATASET_NAME}.csv"

df = pd.read_csv(path)  # This will load the correct dataset.

In [None]:
# Define the test set over the whole data and obtain the labels and protected groups.
# Rows whose "month" value is 6 or greater form the test split.
test_df = df[df["month"]>=6] # if you performed a different split strategy, replace here!

# Ground-truth column for the test rows (still a Series carrying test_df's index).
labels = test_df["fraud_bool"]
# Protected attribute: each test row is bucketed by customer age into ">50" or "<=50".
groups = (test_df["customer_age"] > 50).map({True: ">50", False: "<=50"})  # If you changed your group definition, replace here!

In [None]:
# Load your predictions here: one score per test row (same length as `labels`).
PREDICTIONS = np.random.rand(labels.shape[0])  # THIS IS A PLACEHOLDER; We are populating the predictions with random values. Replace with your model predictions!

In [None]:
# If you classified test in a custom order, replace the index order here:
ORDER = test_df.index  # PLACEHOLDER; We are assuming default order.

# Reorder labels and groups by ORDER and keep only their underlying value arrays.
# NOTE(review): this rebinds `labels` and `groups` from Series to arrays, so
# re-running this cell without first re-running the cell that defines them would
# index arrays with index labels -- presumably not intended; confirm.
labels = labels[ORDER].values
groups = groups[ORDER].values

In [None]:
def get_performance_metrics(
    predictions: np.array = PREDICTIONS,
    labels: np.array = labels,
    fpr_threshold: float = 0.05,
) -> Tuple[float, float, float]:
    """Compute the TPR of a score vector at a target FPR.

    The operating point is the last entry returned by sklearn's ``roc_curve``
    whose FPR is strictly below ``fpr_threshold``.

    Parameters
    ----------
    predictions : np.array
        The vector of scores (must be floats, not binarized).
    labels : np.array
        The vector of labels (ground truth).
    fpr_threshold : float
        The FPR limit; only ROC points strictly below it are eligible.

    Returns
    -------
    tpr : float
        The TPR at the selected operating point.
    fpr : float
        The observed FPR at the selected operating point.
    threshold : float
        The score threshold of the selected operating point.

    Raises
    ------
    IndexError
        If no ROC point has an FPR strictly below ``fpr_threshold``.
    """
    # One (fpr, tpr, threshold) triple per candidate threshold.
    roc_fprs, roc_tprs, roc_thresholds = roc_curve(labels, predictions)

    # Positions of every ROC point under the FPR limit; the last one is the
    # operating point.
    eligible = np.flatnonzero(roc_fprs < fpr_threshold)
    operating_point = eligible[-1]

    return (
        roc_tprs[operating_point],
        roc_fprs[operating_point],
        roc_thresholds[operating_point],
    )

In [None]:
# In this cell, we use the previous method to calculate the performance metrics.
# Called without arguments, it uses its defaults: PREDICTIONS, labels and a 5% FPR limit.
tpr, fpr, threshold = get_performance_metrics()

In [None]:
def to_pct(x: float) -> str:
    """Format a fraction as a percentage string with at most two decimals.

    The value is scaled to a percentage *before* rounding. Rounding first and
    multiplying by 100 afterwards re-introduces binary floating-point noise
    (for instance ``0.57`` would be rendered as ``56.99999999999999%``).

    Parameters
    ----------
    x : float
        The fraction to format (e.g. ``0.0512`` for 5.12%).

    Returns
    -------
    str
        The percentage followed by ``%``.
    """
    return str(round(x * 100, 2)) + "%"

# Report TPR and observed FPR as percentages, and the threshold rounded to two decimals.
print("TPR: ", to_pct(tpr), "\nFPR: ", to_pct(fpr), "\nThreshold: ", round(threshold, 2))

In [None]:
def get_fairness_metrics(
    predictions: np.array = PREDICTIONS,
    labels: np.array = labels,
    groups: np.array = groups,
    threshold: float = threshold,
) -> Tuple[float, pd.DataFrame]:
    """Compute the fairness of a score vector at a given threshold.

    The fairness metric is FPR parity (predictive equality), expressed as the
    ratio between the lowest and the highest per-group FPR.

    Parameters
    ----------
    predictions : np.array
        The vector of scores (must be floats).
    labels : np.array
        The vector of labels (ground truth).
    groups : np.array
        The vector of protected groups.
    threshold : float
        The model threshold (calculated previously).

    Returns
    -------
    predictive_equality : float
        Lowest group FPR divided by highest group FPR.
    disparities_df : pd.DataFrame
        A table with the metrics for each group in the dataset.
    """
    # Frame handed to aequitas: one row per instance with score, label and group.
    scored_frame = pd.DataFrame(
        {
            "score": predictions,
            "label_value": labels,
            "group": groups,
        }
    )

    # Per-group confusion-matrix metrics at the given score threshold; the
    # metrics table is the first element of what aequitas returns.
    crosstabs = Group().get_crosstabs(
        scored_frame,
        score_thresholds={"score_val": [threshold]},
    )
    disparities_df = crosstabs[0]

    # Predictive equality as a ratio of FPRs (the paper reports ratios).
    group_fprs = disparities_df["fpr"]
    predictive_equality = group_fprs.min() / group_fprs.max()

    return predictive_equality, disparities_df

In [None]:
# Compute the fairness metrics with the function defaults (PREDICTIONS, labels, groups and the threshold obtained above).
predictive_equality, disparities_df = get_fairness_metrics()

In [None]:
# Report predictive equality (lowest group FPR over highest group FPR) as a percentage.
print("Predictive Equality: ", to_pct(predictive_equality))

In [None]:
# Show the table with the metrics computed for each group.
disparities_df