In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing.reweighing import Reweighing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
import numpy as np
from constants import protected_attributes

  warn_deprecated('vmap', 'torch.vmap')


In [2]:

class BenefitsDataset(StandardDataset):
    """Benefits Census Income Dataset.

    See :file:`aif360/data/raw/adult/README.md`.
    """

    def __init__(self, df, label_name='checked',
                 favorable_classes=[1],
                 protected_attribute_names=['persoon_geslacht_vrouw', 'persoon_leeftijd_bij_onderzoek'],
                 privileged_classes=[[0], [1]],
                 instance_weights_name=None,
                 categorical_features=[],
                 features_to_keep=[], features_to_drop=[],
                 na_values=['?'], custom_preprocessing=None,
                 metadata=None):


        super(BenefitsDataset, self).__init__(df=df, label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop, na_values=na_values,
            custom_preprocessing=custom_preprocessing, metadata=metadata)
        
def evaluate_model(model, X_test, y_test):
    """Evaluates the model and returns performance metrics

    Args:
        model: Trained model
        X_test: Test features
        y_test: Test labels

    Returns:
        Dictionary containing fpr, tnr, npr, fnr, precision, recall, f1
    """
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    fpr = fp / (fp + tn)  # False Positive Rate
    tnr = tn / (tn + fp)  # True Negative Rate
    npr = tp / (tp + fn)  # Negative Predictive Rate
    fnr = fn / (fn + tp)  # False Negative Rate
    precision = tp / (tp + fp)  # Precision
    recall = tp / (tp + fn)  # Recall
    f1 = 2 * (precision * recall) / (precision + recall)  # F1 Score

    return {
        "fpr": fpr,
        "tnr": tnr,
        "npr": npr,
        "fnr": fnr,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

def custom_preprocessing(df):
    """ Custom pre-processing for German Credit Data
    """

    df['persoon_leeftijd_bij_onderzoek'] = (df['persoon_leeftijd_bij_onderzoek'] >= 27).astype(float)

    return df

def test_model(data_path, num_iterations):
    """Loads data, performs train-test split, reweights, trains, evaluates, and stores metrics

    Args:
        data_path: Path to the CSV file
        num_iterations: Number of iterations to perform
    """
    # Load data
    data = pd.read_csv(data_path)
    data = data.astype(np.float32)

    features = data.columns[:-1]  # Exclude last column (target)
    target = data.columns[-1]

    # Initialize lists to store metrics
    fprs = []
    tnrs = []
    nprs = []
    fnrs = []
    precisions = []
    recalls = []
    f1s = []


    data["persoon_leeftijd_bij_onderzoek"] = (data["persoon_leeftijd_bij_onderzoek"] >= 27).astype(float)

    for _ in range(num_iterations):
        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.2)

        # Reweighting with AIF360
        df_train = pd.concat([X_train, y_train], axis=1)
        # ds_train = StandardDataset(df_train, 
        #                             label_name="checked", 
        #                             favorable_classes=[0],
        #                             privileged_classes=[[0], [1]],
        #                             protected_attribute_names=["persoon_geslacht_vrouw", "persoon_leeftijd_bij_onderzoek"])
        
        # Feature partitions
        XD_features = ['persoon_leeftijd_bij_onderzoek', 'persoon_geslacht_vrouw']
        D_features = XD_features
        Y_features = ["checked"]
        X_features = list(set(XD_features)-set(D_features))

        # privileged classes
        all_privileged_classes = {"persoon_geslacht_vrouw": [0.0],
                                    "persoon_leeftijd_bij_onderzoek": [1.0]}

        
        ds_train = BenefitsDataset(df=df_train, 
                                   label_name=Y_features[0],
                                   favorable_classes=[1],
                                   protected_attribute_names=D_features,
                                   privileged_classes=[all_privileged_classes[x] for x in D_features],
                                   instance_weights_name=None,
                                   features_to_keep=X_features+Y_features+D_features,
                                   custom_preprocessing=custom_preprocessing)  

        rew = Reweighing(unprivileged_groups=[{'persoon_geslacht_vrouw': 1}, 
                                               {'persoon_leeftijd_bij_onderzoek': 0}], 
                          privileged_groups=[{'persoon_geslacht_vrouw': 0},
                                             {'persoon_leeftijd_bij_onderzoek', 1}])  # Replace ... with actual groups
        rew = rew.fit_transform(ds_train)
        # X_train_rew, y_train_rew = ds_train_rew.features, ds_train_rew.labels
        # Train model
        model = GradientBoostingClassifier(n_estimators=300, min_samples_split=800, min_samples_leaf=125, max_depth=5, learning_rate=0.155)
        model.fit(X_train, y_train, sample_weight=rew.instance_weights)

        # Evaluate model
        metrics = evaluate_model(model, X_test, y_test)
        fprs.append(metrics["fpr"])
        tnrs.append(metrics["tnr"])
        nprs.append(metrics["npr"])
        fnrs.append(metrics["fnr"])
        precisions.append(metrics["precision"])
        recalls.append(metrics["recall"])
        f1s.append(metrics["f1"])

    # Print averaged results
    print("Average Performance:")
    print(f"FPR: {sum(fprs) / len(fprs):.4f}")
    print(f"TNR: {sum(tnrs) / len(tnrs):.4f}")
    print(f"NPR: {sum(nprs) / len(nprs):.4f}")
    print(f"FNR: {sum(fnrs) / len(fnrs):.4f}")
    print(f"Precision: {sum(precisions) / len(precisions):.4f}")
    print(f"Recall: {sum(recalls) / len(recalls):.4f}")
    print(f"F1: {sum(f1s) / len(f1s):.4f}")

In [3]:
test_model("./../data/synth_data_for_training.csv", 1)



AttributeError: 'set' object has no attribute 'items'