In [None]:
# Package to easily run Fairness metric evaluation.
!pip install aequitas-lite

In [None]:
import glob  # Read datasets paths

import pandas as pd  # Have datasets in tabular form
import lightgbm as lgbm  # Model we are training

from sklearn.metrics import roc_curve  # Performance evaluation
from aequitas.group import Group  # Fairness evaluation

In [None]:
# Collect the paths of every dataset file of the chosen extension.
# The files are read into pandas DataFrames and stored in a dictionary further down.
extension = "csv"

# glob returns matches in arbitrary order; sorting makes the load order
# (and therefore the key order of the resulting dictionary) deterministic.
data_paths = sorted(
    glob.glob(f"/kaggle/input/bank-account-fraud-dataset-neurips-2022/*.{extension}")
)

def read_dataset(path, ext=extension):
    """Load one dataset file into a pandas DataFrame.

    Only the "csv" extension is supported; any other ``ext`` raises
    ``ValueError``. ``ext`` defaults to the value the module-level
    ``extension`` had when this function was defined.
    """
    # Reject unsupported formats up front so the happy path stays flat.
    if ext != "csv":
        raise ValueError(f"Invalid extension: '{ext}'.")
    return pd.read_csv(path)

def get_variant(path):
    """Return the variant name encoded in a dataset path.

    The name is the last "/"-separated component of ``path``, cut at its
    first "." (e.g. ".../Variant IV.csv" -> "Variant IV").
    """
    filename = path.rpartition("/")[2]
    return filename.partition(".")[0]

# Map each variant name (derived from the file name) to its loaded DataFrame.
datasets = dict(
    (get_variant(csv_path), read_dataset(csv_path)) for csv_path in data_paths
)

In [None]:
# Overview of the loaded variants: name -> (rows, columns).
# Displaying the dictionary itself would dump the text repr of every DataFrame.
{name: frame.shape for name, frame in datasets.items()}

In [None]:
# In this example, we will use the "Variant IV" dataset; preview its first rows.
datasets["Variant IV"].head()

In [None]:
# Choose dataset variant here.
VARIANT = "Variant IV"

# Columns that hold categorical values and the name of the target column.
CATEGORICAL_COLS = ["payment_type", "employment_status", "housing_status", "source", "device_os"]
LABEL_COL = "fraud_bool"

# Rows with month < FIRST_TEST_MONTH are used for training, the rest for testing.
FIRST_TEST_MONTH = 6

# Transform categorical types.
# `astype` returns a new frame, so the DataFrame stored in `datasets` is left
# untouched and this cell can be re-run (or another variant picked) safely.
df = datasets[VARIANT].astype({col: "category" for col in CATEGORICAL_COLS})

# We will be making a split based on month of the year. Also, we will split features from label.
# Each mask is computed once and reused for both the features and the label.
train_rows = df["month"] < FIRST_TEST_MONTH
test_rows = df["month"] >= FIRST_TEST_MONTH

X_train, y_train = df[train_rows].drop(columns=[LABEL_COL]), df.loc[train_rows, LABEL_COL]
X_test, y_test = df[test_rows].drop(columns=[LABEL_COL]), df.loc[test_rows, LABEL_COL]

In [None]:
# Descriptive statistics of the selected dataset.
df.describe()

In [None]:
# Fit a LightGBM classifier (library defaults) on the training split.
model = lgbm.LGBMClassifier()
model.fit(X=X_train, y=y_train)

In [None]:
# Operating point: max recall while keeping the FPR below 5%.
FPR_LIM = 0.05

# Score the test set with the second column returned by predict_proba.
scored_test = model.predict_proba(X_test)[:, 1]

# ROC curve points together with the score threshold behind each of them.
fpr, tpr, threshold = roc_curve(y_test, scored_test)

# Position of the last ROC point whose FPR is still under the limit;
# the same position is then read from all three arrays.
last_under_limit = (fpr < FPR_LIM).nonzero()[0][-1]

obtained_fpr = fpr[last_under_limit]
obtained_tpr = tpr[last_under_limit]
obtained_threshold = threshold[last_under_limit]

In [None]:
# Report the selected operating point, rounded to four decimals.
# The empty first and last entries reproduce the blank lines around the summary.
summary_lines = [
    "",
    f"Model TPR: {round(obtained_tpr, 4)}",
    f"Model FPR: {round(obtained_fpr, 4)}",
    f"Model Threshold : {round(obtained_threshold, 4)}",
    "",
]
print("\n".join(summary_lines))

In [None]:
# Calculating fairness metrics on the predictions
g = Group()

# One row per test example: model score, true label and the age group.
# Stored under its own name so the modelling frame `df` is not overwritten;
# re-running earlier cells that read `df` keeps showing the dataset.
fairness_input = pd.DataFrame(
    {
        "score": scored_test,
        "label_value": y_test,
        # Binary protected attribute: customers older than 50 vs. the rest.
        "age": (X_test["customer_age"] > 50).map({True: ">50", False: "<=50"}),
    }
)

# get_crosstabs returns a sequence; only its first element is kept.
# NOTE(review): presumably the per-group metrics table — confirm against the installed aequitas version.
fairness_metrics = g.get_crosstabs(
    fairness_input, score_thresholds={"score_val": [obtained_threshold]}
)[0]

In [None]:
# Display the crosstab returned by aequitas for the chosen threshold.
fairness_metrics

In [None]:
# Fairness ratio for this application: smallest group FPR divided by the largest.
group_fpr = fairness_metrics["fpr"]
fairness_ratio = group_fpr.min() / group_fpr.max()

print(round(fairness_ratio, 4))