In [7]:
import pandas as pd
import numpy as np

from aif360.algorithms.preprocessing import DisparateImpactRemover,Reweighing,LFR
from aif360.datasets import BinaryLabelDataset
from aif360.datasets import CompasDataset

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from humancompatible.repair.methods.data_analysis import rdata_analysis

import os
# Parent directory of the current working directory; output files are
# written relative to it.
path = os.path.split(os.getcwd())[0]

ModuleNotFoundError: No module named 'humancompatible'

In [6]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
class Baselinepreprocess:
    """
    Evaluate fairness and performance of three pre-processing bias mitigation
    methods from the AIF360 documentation:
    https://aif360.readthedocs.io/en/latest/modules/algorithms.html.

    Supported method names are 'origin' (no mitigation), 'RW' (Reweighing),
    'DIremover' (Disparate Impact Remover) and 'LFR' (Learning Fair
    Representations). For each method the class preprocesses the data, obtains
    predictions for the test split and reports Disparate Impact and F1 scores.

    Parameters:
        train (aif360.datasets.BinaryLabelDataset): Training split.
        test (aif360.datasets.BinaryLabelDataset): Test split.

    The protected attribute is the first entry of
    ``train.protected_attribute_names``; it is not passed explicitly.

    Methods:
        preprocessing(method): Preprocess the data using the specified method.
        prediction(method): Predict outcomes on the test data.
        DisparateImpact(data): Weighted Disparate Impact of a dataset.
        assess(method): Compute performance and fairness metrics.
    """

    # Method names accepted by preprocessing(), prediction() and assess().
    METHODS = ('origin', 'RW', 'DIremover', 'LFR')

    def __init__(self,train,test):
        self.train = train
        self.test = test
        # Only the first protected attribute is used throughout the class.
        self.pa = train.protected_attribute_names[0]
        # Column of the protected attribute in the feature matrix; it is
        # removed from the features before the classifier is trained.
        self.pa_index = train.feature_names.index(self.pa)
        # Group definitions handed to Reweighing and LFR. The coding
        # 1 = privileged / 0 = unprivileged is hard-wired here.
        self.prigroups = [{self.pa: 1}]
        self.unprigroups = [{self.pa: 0}]

    def preprocessing(self,method):
        """
        Preprocess training and/or test data for a given fairness method.

        Applies preprocessing steps as described in the AIF360 documentation:
        https://aif360.readthedocs.io/en/latest/modules/algorithms.html

        Parameters:
            method (str): The name of the method to apply.
                        Must be one of ['origin', 'RW', 'DIremover', 'LFR'].

        Returns:
            tuple: ``(train_tranf, test_tranf)``, the processed training and
            test data. For 'origin' the training data is returned unchanged;
            for 'origin' and 'RW' the test data is an unmodified copy.

        Raises:
            ValueError: If ``method`` is not a supported method name.
        """
        # Fail early with a clear message instead of hitting an unbound
        # local variable further down.
        if method not in self.METHODS:
            raise ValueError('The method does not exist: ' + repr(method)
                             + '. Expected one of ' + str(list(self.METHODS)))

        test_tranf = self.test.copy()
        if method == 'origin':
            train_tranf = self.train
        elif method == 'RW':
            # Reweighing only changes the training set (the test copy is
            # returned untouched).
            RW = Reweighing(privileged_groups = self.prigroups,
                            unprivileged_groups = self.unprigroups)
            RW.fit(self.train)
            train_tranf = RW.transform(self.train)
        elif method == 'DIremover':
            # The remover is applied to train and test independently.
            di = DisparateImpactRemover(repair_level = 1,
                                        sensitive_attribute=self.pa)
            train_tranf = di.fit_transform(self.train)
            test_tranf = di.fit_transform(self.test)
        else:  # 'LFR'
            # Fitted on the training split only, then applied to both splits.
            TR = LFR(privileged_groups = self.prigroups,
                     unprivileged_groups = self.unprigroups,
                     Az = 1, Ax = 0.01, Ay = 1,verbose=0)
            TR = TR.fit(self.train)
            train_tranf = TR.transform(self.train)
            test_tranf = TR.transform(self.test)
        return train_tranf, test_tranf

    def prediction(self,method):
        """
        Predict outcomes on the test data with a given fairness method.

        For every method except 'LFR' a random forest is trained on the
        (processed) training data without the protected-attribute column.
        For 'LFR' the labels of the transformed test data are used directly.

        Parameters:
            method (str): The name of the method to evaluate.
                        Must be one of ['origin', 'RW', 'DIremover', 'LFR'].

        Returns:
            y_pred (numpy.ndarray): Predicted labels for the test data.
            di (float): Disparate Impact computed on the (processed) training data.

        Raises:
            ValueError: If ``method`` is not a supported method name.
        """
        train_tranf,test_tranf = self.preprocessing(method)

        di=self.DisparateImpact(train_tranf)
        print('Disparate Impact of train',di)

        if method != 'LFR':
            X_train = np.delete(train_tranf.features, self.pa_index, axis=1)
            y_train = train_tranf.labels.ravel()
            # Instance weights carry the effect of Reweighing into training.
            weight_train = train_tranf.instance_weights
            model=RandomForestClassifier(max_depth=5).fit(X_train,y_train, sample_weight=weight_train)

            X_test = np.delete(test_tranf.features, self.pa_index, axis=1)
            y_pred = model.predict(X_test)
        else:
            # No classifier is trained for LFR: the labels of the transformed
            # test set are taken as the predictions.
            y_pred = test_tranf.labels
        return y_pred,di

    def DisparateImpact(self,data):
        """
        Compute the weighted Disparate Impact of the given dataset.

        Disparate Impact is the weighted rate of favorable labels in the
        unprivileged group divided by the same rate in the privileged group.
        The group values are read from the training data
        (``privileged_protected_attributes[0][0]`` and
        ``unprivileged_protected_attributes[0][0]``).

        Parameters:
            data (aif360.datasets.BinaryLabelDataset): Dataset providing
                ``protected_attributes``, ``labels``, ``instance_weights``
                and ``favorable_label``.

        Returns:
            float: The ratio of the two rates. ``1.0`` when both rates are
            equal (including both zero), ``inf`` when only the privileged
            rate is zero, and ``nan`` when either group has zero total weight.
        """
        protected = np.asarray(data.protected_attributes)
        # Use the first protected attribute, matching self.pa.
        groups = protected[:, 0] if protected.ndim == 2 else protected.ravel()
        labels = np.asarray(data.labels).ravel()
        weights = np.asarray(data.instance_weights, dtype=float).ravel()
        privileged = self.train.privileged_protected_attributes[0][0]
        unprivileged = self.train.unprivileged_protected_attributes[0][0]
        is_favorable = labels == data.favorable_label

        def favorable_rate(group_value):
            # Weighted share of favorable labels inside one group; NaN when
            # the group is empty so the caller never divides by zero.
            in_group = groups == group_value
            total = weights[in_group].sum()
            if total == 0:
                return float('nan')
            return float(weights[in_group & is_favorable].sum() / total)

        numerator = favorable_rate(unprivileged)
        denominator = favorable_rate(privileged)
        if np.isnan(numerator) or np.isnan(denominator):
            return float('nan')
        if numerator==denominator:
            return 1.0
        if denominator == 0:
            return float('inf')
        return numerator/denominator

    def assess(self,method):
        """
        Calculate performance metrics for a given fairness method.

        Computes Disparate Impact and three types of F1 scores of the
        prediction on (processed) test data.

        Parameters:
            method (str): The name of the method to evaluate.
                        Must be one of ['origin', 'RW', 'DIremover', 'LFR'].

        Returns:
            pd.DataFrame: A one-row DataFrame containing the performance
                        metrics for the specified method.

        Raises:
            ValueError: If ``method`` is not a supported method name.
        """
        y_pred,di_train = self.prediction(method)
        # prediction() returns a 1-D array from the classifier but the
        # dataset's `labels` array for LFR; flatten both sides so the
        # metrics always compare arrays of the same shape.
        y_pred = np.asarray(y_pred).ravel()
        y_true = np.asarray(self.test.labels).ravel()

        # Copy of the test set whose labels are replaced by the predictions,
        # stored as a single column.
        y_test_pred = self.test.copy()
        y_test_pred.labels = y_pred.reshape(-1, 1)

        di=self.DisparateImpact(y_test_pred)
        sample_weight = self.test.instance_weights
        f1_macro = f1_score(y_true, y_pred, average='macro',sample_weight=sample_weight)
        f1_micro = f1_score(y_true, y_pred, average='micro',sample_weight=sample_weight)
        f1_weighted = f1_score(y_true, y_pred, average='weighted',sample_weight=sample_weight)
        print('Disparate Impact of '+str(method),di)
        print('f1 macro of '+str(method),f1_macro)

        new_row=pd.Series({'DI of train':di_train,'DI':di,'f1 macro':f1_macro,'f1 micro':f1_micro,'f1 weighted':f1_weighted,'method':method})
        return new_row.to_frame().T

In [None]:
class Baselinepreprocess:
    """
    Evaluate fairness and performance of three pre-processing bias mitigation
    methods from the AIF360 documentation:
    https://aif360.readthedocs.io/en/latest/modules/algorithms.html.

    Supported method names are 'origin' (no mitigation), 'RW' (Reweighing),
    'DIremover' (Disparate Impact Remover) and 'LFR' (Learning Fair
    Representations). For each method the class preprocesses the data, obtains
    predictions for the test split and reports Disparate Impact and F1 scores.

    Parameters:
        train (aif360.datasets.BinaryLabelDataset): Training split.
        test (aif360.datasets.BinaryLabelDataset): Test split.

    The protected attribute is the first entry of
    ``train.protected_attribute_names``; it is not passed explicitly.

    Methods:
        preprocessing(method): Preprocess the data using the specified method.
        prediction(method): Predict outcomes on the test data.
        DisparateImpact(data): Weighted Disparate Impact of a dataset.
        assess(method): Compute performance and fairness metrics.
    """

    # Method names accepted by preprocessing(), prediction() and assess().
    METHODS = ('origin', 'RW', 'DIremover', 'LFR')

    def __init__(self,train,test):
        self.train = train
        self.test = test
        # Only the first protected attribute is used throughout the class.
        self.pa = train.protected_attribute_names[0]
        # Column of the protected attribute in the feature matrix; it is
        # removed from the features before the classifier is trained.
        self.pa_index = train.feature_names.index(self.pa)
        # Group definitions handed to Reweighing and LFR. The coding
        # 1 = privileged / 0 = unprivileged is hard-wired here.
        self.prigroups = [{self.pa: 1}]
        self.unprigroups = [{self.pa: 0}]

    def preprocessing(self,method):
        """
        Preprocess training and/or test data for a given fairness method.

        Applies preprocessing steps as described in the AIF360 documentation:
        https://aif360.readthedocs.io/en/latest/modules/algorithms.html

        Parameters:
            method (str): The name of the method to apply.
                        Must be one of ['origin', 'RW', 'DIremover', 'LFR'].

        Returns:
            tuple: ``(train_tranf, test_tranf)``, the processed training and
            test data. For 'origin' the training data is returned unchanged;
            for 'origin' and 'RW' the test data is an unmodified copy.

        Raises:
            ValueError: If ``method`` is not a supported method name.
        """
        # Fail early with a clear message instead of hitting an unbound
        # local variable further down.
        if method not in self.METHODS:
            raise ValueError('The method does not exist: ' + repr(method)
                             + '. Expected one of ' + str(list(self.METHODS)))

        test_tranf = self.test.copy()
        if method == 'origin':
            train_tranf = self.train
        elif method == 'RW':
            # Reweighing only changes the training set (the test copy is
            # returned untouched).
            RW = Reweighing(privileged_groups = self.prigroups,
                            unprivileged_groups = self.unprigroups)
            RW.fit(self.train)
            train_tranf = RW.transform(self.train)
        elif method == 'DIremover':
            # The remover is applied to train and test independently.
            di = DisparateImpactRemover(repair_level = 1,
                                        sensitive_attribute=self.pa)
            train_tranf = di.fit_transform(self.train)
            test_tranf = di.fit_transform(self.test)
        else:  # 'LFR'
            # Fitted on the training split only, then applied to both splits.
            TR = LFR(privileged_groups = self.prigroups,
                     unprivileged_groups = self.unprigroups,
                     Az = 1, Ax = 0.01, Ay = 1,verbose=0)
            TR = TR.fit(self.train)
            train_tranf = TR.transform(self.train)
            test_tranf = TR.transform(self.test)
        return train_tranf, test_tranf

    def prediction(self,method):
        """
        Predict outcomes on the test data with a given fairness method.

        For every method except 'LFR' a random forest is trained on the
        (processed) training data without the protected-attribute column.
        For 'LFR' the labels of the transformed test data are used directly.

        Parameters:
            method (str): The name of the method to evaluate.
                        Must be one of ['origin', 'RW', 'DIremover', 'LFR'].

        Returns:
            y_pred (numpy.ndarray): Predicted labels for the test data.
            di (float): Disparate Impact computed on the (processed) training data.

        Raises:
            ValueError: If ``method`` is not a supported method name.
        """
        train_tranf,test_tranf = self.preprocessing(method)

        di=self.DisparateImpact(train_tranf)
        print('Disparate Impact of train',di)

        if method != 'LFR':
            X_train = np.delete(train_tranf.features, self.pa_index, axis=1)
            y_train = train_tranf.labels.ravel()
            # Instance weights carry the effect of Reweighing into training.
            weight_train = train_tranf.instance_weights
            model=RandomForestClassifier(max_depth=5).fit(X_train,y_train, sample_weight=weight_train)

            X_test = np.delete(test_tranf.features, self.pa_index, axis=1)
            y_pred = model.predict(X_test)
        else:
            # No classifier is trained for LFR: the labels of the transformed
            # test set are taken as the predictions.
            y_pred = test_tranf.labels
        return y_pred,di

    def DisparateImpact(self,data):
        """
        Compute the weighted Disparate Impact of the given dataset.

        Disparate Impact is the weighted rate of favorable labels in the
        unprivileged group divided by the same rate in the privileged group.
        The group values are read from the training data
        (``privileged_protected_attributes[0][0]`` and
        ``unprivileged_protected_attributes[0][0]``).

        Parameters:
            data (aif360.datasets.BinaryLabelDataset): Dataset providing
                ``protected_attributes``, ``labels``, ``instance_weights``
                and ``favorable_label``.

        Returns:
            float: The ratio of the two rates. ``1.0`` when both rates are
            equal (including both zero), ``inf`` when only the privileged
            rate is zero, and ``nan`` when either group has zero total weight.
        """
        protected = np.asarray(data.protected_attributes)
        # Use the first protected attribute, matching self.pa.
        groups = protected[:, 0] if protected.ndim == 2 else protected.ravel()
        labels = np.asarray(data.labels).ravel()
        weights = np.asarray(data.instance_weights, dtype=float).ravel()
        privileged = self.train.privileged_protected_attributes[0][0]
        unprivileged = self.train.unprivileged_protected_attributes[0][0]
        is_favorable = labels == data.favorable_label

        def favorable_rate(group_value):
            # Weighted share of favorable labels inside one group; NaN when
            # the group is empty so the caller never divides by zero.
            in_group = groups == group_value
            total = weights[in_group].sum()
            if total == 0:
                return float('nan')
            return float(weights[in_group & is_favorable].sum() / total)

        numerator = favorable_rate(unprivileged)
        denominator = favorable_rate(privileged)
        if np.isnan(numerator) or np.isnan(denominator):
            return float('nan')
        if numerator==denominator:
            return 1.0
        if denominator == 0:
            return float('inf')
        return numerator/denominator

    def assess(self,method):
        """
        Calculate performance metrics for a given fairness method.

        Computes Disparate Impact and three types of F1 scores of the
        prediction on (processed) test data.

        Parameters:
            method (str): The name of the method to evaluate.
                        Must be one of ['origin', 'RW', 'DIremover', 'LFR'].

        Returns:
            pd.DataFrame: A one-row DataFrame containing the performance
                        metrics for the specified method.

        Raises:
            ValueError: If ``method`` is not a supported method name.
        """
        y_pred,di_train = self.prediction(method)
        # prediction() returns a 1-D array from the classifier but the
        # dataset's `labels` array for LFR; flatten both sides so the
        # metrics always compare arrays of the same shape.
        y_pred = np.asarray(y_pred).ravel()
        y_true = np.asarray(self.test.labels).ravel()

        # Copy of the test set whose labels are replaced by the predictions,
        # stored as a single column.
        y_test_pred = self.test.copy()
        y_test_pred.labels = y_pred.reshape(-1, 1)

        di=self.DisparateImpact(y_test_pred)
        sample_weight = self.test.instance_weights
        f1_macro = f1_score(y_true, y_pred, average='macro',sample_weight=sample_weight)
        f1_micro = f1_score(y_true, y_pred, average='micro',sample_weight=sample_weight)
        f1_weighted = f1_score(y_true, y_pred, average='weighted',sample_weight=sample_weight)
        print('Disparate Impact of '+str(method),di)
        print('f1 macro of '+str(method),f1_macro)

        new_row=pd.Series({'DI of train':di_train,'DI':di,'f1 macro':f1_macro,'f1 micro':f1_micro,'f1 weighted':f1_weighted,'method':method})
        return new_row.to_frame().T

# Compas dataset

In [None]:
# Protected attribute used for the COMPAS experiment.
pa = 'race'

# Human-readable names attached to the dataset as metadata.
label_map = {1.0: 'Did recid.', 0.0: 'No recid.'}
protected_attribute_maps = {1.0: 'Caucasian', 0.0: 'Not Caucasian'}

# Group definitions in AIF360's list-of-dicts form.
privileged_groups = [{pa: 1}]
unprivileged_groups = [{pa: 0}]

# NOTE(review): `privileged_classes` has two entries but only one protected
# attribute is requested; the second entry ([1]) looks unused - confirm.
cd = CompasDataset(
    protected_attribute_names=[pa],
    privileged_classes=[['Caucasian'],[1]],
    metadata=dict(label_map=label_map,
                  protected_attribute_maps=protected_attribute_maps),
    features_to_drop=['age', 'sex', 'c_charge_desc'],
)

In [None]:
methods=['origin','RW','DIremover','LFR']

# One single-row DataFrame per (split, method); concatenated once after the
# loop instead of growing `report` row by row.
report_rows = []
for run_idx in range(10):
    # Random split: 40% train, then 30% of the remainder is set aside as
    # `valid` (not used below) and the rest becomes the test set. This is
    # roughly 4 : 1.8 : 4.2, assuming AIF360's `split` treats the fractions
    # as cut points - verify against the AIF360 docs.
    train,test = cd.split([0.4], shuffle=True)
    valid,test = test.split([0.3], shuffle=True)

    prepro = Baselinepreprocess(train,test)
    for method in methods:
        report_rows.append(prepro.assess(method))

report = pd.concat(report_rows, ignore_index=True)

# Build the output path with os.path.join so it is valid on every platform.
report.to_csv(os.path.join(path, 'data', 'report_preprocess_compas_'+str(pa)+'.csv'), index=False)

Disparate Impact of train 0.7990349420367482
Disparate Impact of origin 0.7932470145168625
f1 macro of origin 0.6777809337135148
Disparate Impact of train 0.999999999999987
Disparate Impact of RW 0.8262149542418952
f1 macro of RW 0.6690655415745095
Disparate Impact of train 0.7990349420367482
Disparate Impact of DIremover 0.7704704850964349
f1 macro of DIremover 0.6831997426375959
Disparate Impact of train 1.0001618880631213
Disparate Impact of LFR 1.0072635885447108
f1 macro of LFR 0.6803522689911459
Disparate Impact of train 0.7959084008243232
Disparate Impact of origin 0.7410729687769535
f1 macro of origin 0.6523691094564642
Disparate Impact of train 1.0000000000000004
Disparate Impact of RW 0.7439167441527788
f1 macro of RW 0.6460775469961576
Disparate Impact of train 0.7959084008243232
Disparate Impact of DIremover 0.8364702745934417
f1 macro of DIremover 0.6520259007179265
Disparate Impact of train 0.9043388068572494
Disparate Impact of LFR 0.8756245693316027
f1 macro of LFR 0.65

# Adult dataset

In [None]:
def load_data(data_path,var_list,pa):
    """
    Load and clean the Adult dataset, and discretize selected attributes
    (age, hours-per-week, capital-gain, capital-loss).

    Reads ``adult.data`` and ``adult.test`` from ``data_path``, keeps the
    columns in ``var_list`` plus the protected attribute and the label,
    renames the protected attribute to 'S', encodes 'S' and 'Y' as 0/1,
    drops rows whose 'S' is neither 0 nor 1, adds a unit weight column 'W'
    and discretizes those of the four attributes above that are present.

    Parameters:
        data_path (str): Directory containing 'adult.data' and 'adult.test'.
        var_list (list of str): List of non-protected attribute names.
        pa (str): Name of the protected attribute.

    Returns:
        pd.DataFrame: The cleaned dataset with discretized attributes and
        columns ``var_list + ['S', 'Y', 'W']``.
    """

    column_names = ['age', 'workclass', 'fnlwgt', 'education',
                'education-num', 'marital-status', 'occupation', 'relationship',
                'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'Y']
    na_values=['?']
    # Encodings for the protected attribute (sex or race) and the label.
    pa_dict={'Male':1,'Female':0,'White':1,'Black':0}
    label_dict={'>50K.':1,'>50K':1,'<=50K.':0,'<=50K':0}
    train_path = os.path.join(data_path, 'adult.data')
    test_path = os.path.join(data_path, 'adult.test')
    train = pd.read_csv(train_path, header=None,names=column_names,
                    skipinitialspace=True, na_values=na_values)
    # header=0 discards the first line of adult.test (presumably a
    # non-data line - confirm against the raw file).
    test = pd.read_csv(test_path, header=0,names=column_names,
                    skipinitialspace=True, na_values=na_values)
    messydata = pd.concat([test, train], ignore_index=True)[var_list+[pa,'Y']]
    messydata=messydata.rename(columns={pa:'S'})
    messydata['S']=messydata['S'].replace(pa_dict)
    messydata['Y']=messydata['Y'].replace(label_dict)
    # Keep only rows whose protected attribute was mapped to 0/1. The
    # explicit copy makes the column assignments below act on an independent
    # frame rather than on a slice of the unfiltered data.
    messydata=messydata[(messydata['S']==0)|(messydata['S']==1)].copy()
    for col in var_list+['S','Y']:
        messydata[col]=messydata[col].astype('int64')
    messydata['W']=1

    # Bin thresholds for the attributes that are discretized.
    bins_by_column={
        'age':[26,36,46,56],
        'hours-per-week':[21,36,46,61],
        'capital-gain':[100,3500,7500,10000],
        'capital-loss':[100,1600,1900,2200],
    }

    # Discretize only the attributes that were actually selected, so a
    # var_list without e.g. 'capital-loss' does not raise a KeyError.
    for col,bins in bins_by_column.items():
        if col in messydata.columns:
            messydata=categerise(messydata,col,bins)

    return messydata

def categerise(df,col,bins):
    """
    Replace the values of ``df[col]`` by the index of the bin they fall into.

    With ``bins = [b0, b1, ..., bk]`` a value ``v`` is recoded as ``0`` if
    ``v < b0``, as ``i`` if ``b(i-1) <= v < bi`` and as ``k + 1`` if
    ``v >= bk``. Missing values are left unchanged.

    Parameters:
        df (pd.DataFrame): Data to discretize; modified in place.
        col (str): Name of the column to discretize.
        bins (sequence of numbers): Bin thresholds in ascending order.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``col`` recoded.
    """
    values = df[col].to_numpy()
    known = ~pd.isna(values)
    # Compute every code from the ORIGINAL values in one pass. Recoding range
    # by range in place would let an already written code (a small integer)
    # be matched again by a later range whenever a threshold is small.
    df.loc[known, col] = np.digitize(values[known], bins)
    return df

In [None]:
def choose_x(var_list,messydata,threshold=0.1):
    """
    Select non-protected attributes to repair based on their
    protected-attribute-wise Total Variation distance.

    An attribute is selected if its Total Variation distance is strictly
    greater than ``threshold``.

    Parameters:
        var_list (list of str): List of non-protected attribute names.
        messydata (pd.DataFrame): The cleaned dataset; must contain the
            attributes in ``var_list`` and the weight column 'W'.
        threshold (float): Minimum Total Variation distance (exclusive) for
            an attribute to be selected. Defaults to 0.1.

    Returns:
        x_list (list of str): List of non-protected attributes that need to be repaired.
        tv_dist (dict): Dictionary mapping each non-protected attribute to its
                        protected-attribute-wise Total Variation distance.
    """

    tv_dist=dict()
    for x_name in var_list:
        # Distinct values of the attribute, read from the index of a pivot
        # table over the weight column.
        x_range_single=list(pd.pivot_table(messydata,index=x_name,values=['W'])[('W')].index)
        # NOTE(review): 'x_0' / 'x_1' are presumably the distributions of the
        # attribute in the two protected groups - confirm in rdata_analysis.
        dist=rdata_analysis(messydata,x_range_single,x_name)
        tv_dist[x_name]=sum(abs(dist['x_0']-dist['x_1']))/2
    x_list=[name for name,val in tv_dist.items() if val>threshold]
    return x_list,tv_dist

In [None]:
import aif360  # only needed here, to locate the raw-data folder of the installed package

# Folder holding 'adult.data' and 'adult.test'. Defaults to the raw-data
# folder inside the installed aif360 package instead of a user-specific
# absolute path; set the ADULT_DATA_DIR environment variable to override.
data_path=os.environ.get(
    'ADULT_DATA_DIR',
    os.path.join(os.path.dirname(aif360.__file__), 'data', 'raw', 'adult'))
var_list=['hours-per-week','age','capital-gain','capital-loss','education-num']
pa='race'
favorable_label = 1
var_dim=len(var_list)

messydata = load_data(data_path,var_list,pa)
x_list,tv_dist = choose_x(var_list,messydata)
# load_data names the protected attribute 'S'; restore its original name
# before building the AIF360 dataset.
messydata=messydata.rename(columns={'S':pa})
# NOTE(review): the weight column 'W' is not declared via
# `instance_weights_name`, so it is presumably treated as an ordinary
# feature - confirm this is intended.
cd=BinaryLabelDataset(
    favorable_label=favorable_label,
    unfavorable_label=0,
    df=messydata,label_names=['Y'],protected_attribute_names=[pa])

In [None]:
methods=['origin','RW','DIremover','LFR']

# One single-row DataFrame per (split, method); concatenated once after the
# loop instead of growing `report` row by row.
report_rows = []
for run_idx in range(10):
    # Random split: 40% train, then 30% of the remainder is set aside as
    # `valid` (not used below) and the rest becomes the test set. This is
    # roughly 4 : 1.8 : 4.2, assuming AIF360's `split` treats the fractions
    # as cut points - verify against the AIF360 docs.
    train,test = cd.split([0.4], shuffle=True)
    valid,test = test.split([0.3], shuffle=True)

    prepro = Baselinepreprocess(train,test)
    for method in methods:
        report_rows.append(prepro.assess(method))

report = pd.concat(report_rows, ignore_index=True)

# Build the output path with os.path.join so it is valid on every platform.
report.to_csv(os.path.join(path, 'data', 'report_preprocess_adult_'+str(pa)+'.csv'), index=False)

Disparate Impact of train 0.4423024479779176
Disparate Impact of origin 0.4761241484554046
f1 macro of origin 0.6696624107858051
Disparate Impact of train 0.9999999999998899
Disparate Impact of RW 0.46521121614598077
f1 macro of RW 0.6779482632516027
Disparate Impact of train 0.4423024479779176
Disparate Impact of DIremover 0.4754678848427709
f1 macro of DIremover 0.6761039218256115
Disparate Impact of train 0.8188911000561955
Disparate Impact of LFR 0.8403672932237478
f1 macro of LFR 0.6921478761007314
Disparate Impact of train 0.45067826635145786
Disparate Impact of origin 0.49080822521331785
f1 macro of origin 0.6820805989657579
Disparate Impact of train 1.0000000000003044
Disparate Impact of RW 0.48527679623085984
f1 macro of RW 0.6765581603975014
Disparate Impact of train 0.45067826635145786
Disparate Impact of DIremover 0.4790061250933923
f1 macro of DIremover 0.6799367478843286
Disparate Impact of train 0.7768607938615907
Disparate Impact of LFR 0.7838645991916943
f1 macro of LF

In [None]:
# Show the `report` DataFrame as the cell output.
report

Unnamed: 0,DI of train,DI,f1 macro,f1 micro,f1 weighted,method
0,0.442302,0.476124,0.669662,0.812599,0.782036,origin
1,1.0,0.465211,0.677948,0.814598,0.786436,RW
2,0.442302,0.475468,0.676104,0.814035,0.785411,DIremover
3,0.818891,0.840367,0.692148,0.808294,0.789936,LFR
4,0.450678,0.490808,0.682081,0.816341,0.787928,origin
5,1.0,0.485277,0.676558,0.815162,0.785034,RW
6,0.450678,0.479006,0.679937,0.815316,0.786582,DIremover
7,0.776861,0.783865,0.685246,0.807268,0.785651,LFR
8,0.461955,0.456373,0.674915,0.810088,0.780571,origin
9,1.0,0.464543,0.674274,0.810036,0.780265,RW


# Compas dataset

In [None]:
# Protected attribute used for the COMPAS experiment.
pa = 'race'

# Human-readable names attached to the dataset as metadata.
label_map = {1.0: 'Did recid.', 0.0: 'No recid.'}
protected_attribute_maps = {1.0: 'Caucasian', 0.0: 'Not Caucasian'}

# Group definitions in AIF360's list-of-dicts form.
privileged_groups = [{pa: 1}]
unprivileged_groups = [{pa: 0}]

# NOTE(review): `privileged_classes` has two entries but only one protected
# attribute is requested; the second entry ([1]) looks unused - confirm.
cd = CompasDataset(
    protected_attribute_names=[pa],
    privileged_classes=[['Caucasian'],[1]],
    metadata=dict(label_map=label_map,
                  protected_attribute_maps=protected_attribute_maps),
    features_to_drop=['age', 'sex', 'c_charge_desc'],
)

In [None]:
methods=['origin','RW','DIremover','LFR']

# One single-row DataFrame per (split, method); concatenated once after the
# loop instead of growing `report` row by row.
report_rows = []
for run_idx in range(10):
    # Random split: 40% train, then 30% of the remainder is set aside as
    # `valid` (not used below) and the rest becomes the test set. This is
    # roughly 4 : 1.8 : 4.2, assuming AIF360's `split` treats the fractions
    # as cut points - verify against the AIF360 docs.
    train,test = cd.split([0.4], shuffle=True)
    valid,test = test.split([0.3], shuffle=True)

    prepro = Baselinepreprocess(train,test)
    for method in methods:
        report_rows.append(prepro.assess(method))

report = pd.concat(report_rows, ignore_index=True)

# Build the output path with os.path.join so it is valid on every platform.
report.to_csv(os.path.join(path, 'data', 'report_preprocess_compas_'+str(pa)+'.csv'), index=False)

Disparate Impact of train 0.7990349420367482
Disparate Impact of origin 0.7932470145168625
f1 macro of origin 0.6777809337135148
Disparate Impact of train 0.999999999999987
Disparate Impact of RW 0.8262149542418952
f1 macro of RW 0.6690655415745095
Disparate Impact of train 0.7990349420367482
Disparate Impact of DIremover 0.7704704850964349
f1 macro of DIremover 0.6831997426375959
Disparate Impact of train 1.0001618880631213
Disparate Impact of LFR 1.0072635885447108
f1 macro of LFR 0.6803522689911459
Disparate Impact of train 0.7959084008243232
Disparate Impact of origin 0.7410729687769535
f1 macro of origin 0.6523691094564642
Disparate Impact of train 1.0000000000000004
Disparate Impact of RW 0.7439167441527788
f1 macro of RW 0.6460775469961576
Disparate Impact of train 0.7959084008243232
Disparate Impact of DIremover 0.8364702745934417
f1 macro of DIremover 0.6520259007179265
Disparate Impact of train 0.9043388068572494
Disparate Impact of LFR 0.8756245693316027
f1 macro of LFR 0.65

# Adult dataset

In [None]:
def load_data(data_path,var_list,pa):
    """
    Load and clean the Adult dataset, and discretize selected attributes
    (age, hours-per-week, capital-gain, capital-loss).

    Reads ``adult.data`` and ``adult.test`` from ``data_path``, keeps the
    columns in ``var_list`` plus the protected attribute and the label,
    renames the protected attribute to 'S', encodes 'S' and 'Y' as 0/1,
    drops rows whose 'S' is neither 0 nor 1, adds a unit weight column 'W'
    and discretizes those of the four attributes above that are present.

    Parameters:
        data_path (str): Directory containing 'adult.data' and 'adult.test'.
        var_list (list of str): List of non-protected attribute names.
        pa (str): Name of the protected attribute.

    Returns:
        pd.DataFrame: The cleaned dataset with discretized attributes and
        columns ``var_list + ['S', 'Y', 'W']``.
    """

    column_names = ['age', 'workclass', 'fnlwgt', 'education',
                'education-num', 'marital-status', 'occupation', 'relationship',
                'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
                'native-country', 'Y']
    na_values=['?']
    # Encodings for the protected attribute (sex or race) and the label.
    pa_dict={'Male':1,'Female':0,'White':1,'Black':0}
    label_dict={'>50K.':1,'>50K':1,'<=50K.':0,'<=50K':0}
    train_path = os.path.join(data_path, 'adult.data')
    test_path = os.path.join(data_path, 'adult.test')
    train = pd.read_csv(train_path, header=None,names=column_names,
                    skipinitialspace=True, na_values=na_values)
    # header=0 discards the first line of adult.test (presumably a
    # non-data line - confirm against the raw file).
    test = pd.read_csv(test_path, header=0,names=column_names,
                    skipinitialspace=True, na_values=na_values)
    messydata = pd.concat([test, train], ignore_index=True)[var_list+[pa,'Y']]
    messydata=messydata.rename(columns={pa:'S'})
    messydata['S']=messydata['S'].replace(pa_dict)
    messydata['Y']=messydata['Y'].replace(label_dict)
    # Keep only rows whose protected attribute was mapped to 0/1. The
    # explicit copy makes the column assignments below act on an independent
    # frame rather than on a slice of the unfiltered data.
    messydata=messydata[(messydata['S']==0)|(messydata['S']==1)].copy()
    for col in var_list+['S','Y']:
        messydata[col]=messydata[col].astype('int64')
    messydata['W']=1

    # Bin thresholds for the attributes that are discretized.
    bins_by_column={
        'age':[26,36,46,56],
        'hours-per-week':[21,36,46,61],
        'capital-gain':[100,3500,7500,10000],
        'capital-loss':[100,1600,1900,2200],
    }

    # Discretize only the attributes that were actually selected, so a
    # var_list without e.g. 'capital-loss' does not raise a KeyError.
    for col,bins in bins_by_column.items():
        if col in messydata.columns:
            messydata=categerise(messydata,col,bins)

    return messydata

def categerise(df,col,bins):
    """
    Replace the values of ``df[col]`` by the index of the bin they fall into.

    With ``bins = [b0, b1, ..., bk]`` a value ``v`` is recoded as ``0`` if
    ``v < b0``, as ``i`` if ``b(i-1) <= v < bi`` and as ``k + 1`` if
    ``v >= bk``. Missing values are left unchanged.

    Parameters:
        df (pd.DataFrame): Data to discretize; modified in place.
        col (str): Name of the column to discretize.
        bins (sequence of numbers): Bin thresholds in ascending order.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``col`` recoded.
    """
    values = df[col].to_numpy()
    known = ~pd.isna(values)
    # Compute every code from the ORIGINAL values in one pass. Recoding range
    # by range in place would let an already written code (a small integer)
    # be matched again by a later range whenever a threshold is small.
    df.loc[known, col] = np.digitize(values[known], bins)
    return df

In [None]:
def choose_x(var_list,messydata,threshold=0.1):
    """
    Select non-protected attributes to repair based on their
    protected-attribute-wise Total Variation distance.

    An attribute is selected if its Total Variation distance is strictly
    greater than ``threshold``.

    Parameters:
        var_list (list of str): List of non-protected attribute names.
        messydata (pd.DataFrame): The cleaned dataset; must contain the
            attributes in ``var_list`` and the weight column 'W'.
        threshold (float): Minimum Total Variation distance (exclusive) for
            an attribute to be selected. Defaults to 0.1.

    Returns:
        x_list (list of str): List of non-protected attributes that need to be repaired.
        tv_dist (dict): Dictionary mapping each non-protected attribute to its
                        protected-attribute-wise Total Variation distance.
    """

    tv_dist=dict()
    for x_name in var_list:
        # Distinct values of the attribute, read from the index of a pivot
        # table over the weight column.
        x_range_single=list(pd.pivot_table(messydata,index=x_name,values=['W'])[('W')].index)
        # NOTE(review): 'x_0' / 'x_1' are presumably the distributions of the
        # attribute in the two protected groups - confirm in rdata_analysis.
        dist=rdata_analysis(messydata,x_range_single,x_name)
        tv_dist[x_name]=sum(abs(dist['x_0']-dist['x_1']))/2
    x_list=[name for name,val in tv_dist.items() if val>threshold]
    return x_list,tv_dist

In [None]:
import aif360  # only needed here, to locate the raw-data folder of the installed package

# Folder holding 'adult.data' and 'adult.test'. Defaults to the raw-data
# folder inside the installed aif360 package instead of a user-specific
# absolute path; set the ADULT_DATA_DIR environment variable to override.
data_path=os.environ.get(
    'ADULT_DATA_DIR',
    os.path.join(os.path.dirname(aif360.__file__), 'data', 'raw', 'adult'))
var_list=['hours-per-week','age','capital-gain','capital-loss','education-num']
pa='race'
favorable_label = 1
var_dim=len(var_list)

messydata = load_data(data_path,var_list,pa)
x_list,tv_dist = choose_x(var_list,messydata)
# load_data names the protected attribute 'S'; restore its original name
# before building the AIF360 dataset.
messydata=messydata.rename(columns={'S':pa})
# NOTE(review): the weight column 'W' is not declared via
# `instance_weights_name`, so it is presumably treated as an ordinary
# feature - confirm this is intended.
cd=BinaryLabelDataset(
    favorable_label=favorable_label,
    unfavorable_label=0,
    df=messydata,label_names=['Y'],protected_attribute_names=[pa])

In [None]:
methods=['origin','RW','DIremover','LFR']

# One single-row DataFrame per (split, method); concatenated once after the
# loop instead of growing `report` row by row.
report_rows = []
for run_idx in range(10):
    # Random split: 40% train, then 30% of the remainder is set aside as
    # `valid` (not used below) and the rest becomes the test set. This is
    # roughly 4 : 1.8 : 4.2, assuming AIF360's `split` treats the fractions
    # as cut points - verify against the AIF360 docs.
    train,test = cd.split([0.4], shuffle=True)
    valid,test = test.split([0.3], shuffle=True)

    prepro = Baselinepreprocess(train,test)
    for method in methods:
        report_rows.append(prepro.assess(method))

report = pd.concat(report_rows, ignore_index=True)

# Build the output path with os.path.join so it is valid on every platform.
report.to_csv(os.path.join(path, 'data', 'report_preprocess_adult_'+str(pa)+'.csv'), index=False)

Disparate Impact of train 0.4423024479779176
Disparate Impact of origin 0.4761241484554046
f1 macro of origin 0.6696624107858051
Disparate Impact of train 0.9999999999998899
Disparate Impact of RW 0.46521121614598077
f1 macro of RW 0.6779482632516027
Disparate Impact of train 0.4423024479779176
Disparate Impact of DIremover 0.4754678848427709
f1 macro of DIremover 0.6761039218256115
Disparate Impact of train 0.8188911000561955
Disparate Impact of LFR 0.8403672932237478
f1 macro of LFR 0.6921478761007314
Disparate Impact of train 0.45067826635145786
Disparate Impact of origin 0.49080822521331785
f1 macro of origin 0.6820805989657579
Disparate Impact of train 1.0000000000003044
Disparate Impact of RW 0.48527679623085984
f1 macro of RW 0.6765581603975014
Disparate Impact of train 0.45067826635145786
Disparate Impact of DIremover 0.4790061250933923
f1 macro of DIremover 0.6799367478843286
Disparate Impact of train 0.7768607938615907
Disparate Impact of LFR 0.7838645991916943
f1 macro of LF

In [None]:
# Show the `report` DataFrame as the cell output.
report

Unnamed: 0,DI of train,DI,f1 macro,f1 micro,f1 weighted,method
0,0.442302,0.476124,0.669662,0.812599,0.782036,origin
1,1.0,0.465211,0.677948,0.814598,0.786436,RW
2,0.442302,0.475468,0.676104,0.814035,0.785411,DIremover
3,0.818891,0.840367,0.692148,0.808294,0.789936,LFR
4,0.450678,0.490808,0.682081,0.816341,0.787928,origin
5,1.0,0.485277,0.676558,0.815162,0.785034,RW
6,0.450678,0.479006,0.679937,0.815316,0.786582,DIremover
7,0.776861,0.783865,0.685246,0.807268,0.785651,LFR
8,0.461955,0.456373,0.674915,0.810088,0.780571,origin
9,1.0,0.464543,0.674274,0.810036,0.780265,RW
