Import statements

In [103]:
import argparse
import numpy as np
import os
import pickle
import pandas as pd
from tqdm import tqdm

from utils.generate import generateSamples
from utils.Mixture import NMixture, PUMixture
import utils.mixtureUtils as mixtureUtils
from utils.NestedGroupDist import NestedGroupDist
from utils.NestedGroupDistUnknownGroup import NestedGroupDistUnknownGroup
import utils.correct_metrics as correct_metrics
import utils.correct_metrics_unknowngroups as correct_metrics_unknowngroups
from utils.NNclassifier import getModel, getModelScores

Set up basic parameters

In [123]:
# Synthetic-data geometry: these two values select the parameter file that is
# loaded below and the number of mixture components fitted later on.
dim, comp = 2, 2
num_groups = 2

# Only parameter sets whose `aucpn` lies strictly inside this interval are kept.
aucpn_range = [0.75, 0.85]

# Sample budget: `total_samples` points overall, a fraction `lam` of them unbiased.
total_samples = 25000
lam = 0.5

# Group proportions [G1, G2]; each list adds up to 1. Both groups are equally sized.
eta = [0.5, 0.5]   # unbiased data
eta_ = [0.5, 0.5]  # biased data

# Per-group fraction of positive samples; the remaining 1 - alpha (resp. 1 - alpha_)
# of each group is negative.
alpha = [0.5, 0.5]   # unbiased samples
alpha_ = [0.5, 0.5]  # biased samples

In [None]:

# Load the table of synthetic mixture parameters for the chosen dimension and
# component count.
# NOTE: unpickling can execute arbitrary code -- only load parameter files that
# come from a trusted source.
pkl_file_name = 'synthetic_param/parameters/synthetic_datasets_params_d{}K{}.pkl'.format(dim, comp)
params = pd.read_pickle(pkl_file_name)

# Keep only the parameter sets whose AUC lies strictly inside `aucpn_range`.
df = params[(params['aucpn'] > aucpn_range[0]) & (params['aucpn'] < aucpn_range[1])]

# Position (within the filtered frame) of the parameter set used for this demo.
i = 60
if i >= len(df):
    # Fail with context instead of the bare positional-indexer error.
    raise IndexError(
        'parameter set {} requested, but only {} sets have aucpn in {}'.format(
            i, len(df), aucpn_range))
selected = df.iloc[i]

# Mixture parameters shared by all groups, keyed by class label
# (1 = positive, 0 = negative).
mu = {1: selected['mu_p'],
      0: selected['mu_n']}
sig = {1: selected['sig_p'],
       0: selected['sig_n']}

# Mixing weights, keyed by group index and then by class label.
# The `_g2` columns hold the values for the second group (index 1).
component_weights = {0: {1: selected['w_p'],
                         0: selected['w_n']},
                     1: {1: selected['w_p_g2'],
                         0: selected['w_n_g2']}}

# Same layout, using the `wl_*` columns (weights for the labeled data).
component_weights_labeled = {0: {1: selected['wl_p'],
                                 0: selected['wl_n']},
                             1: {1: selected['wl_p_g2'],
                                 0: selected['wl_n_g2']}}

# df_new = df.iloc[[60]]
# df_new.to_csv("demo_param.csv", index=False)
# mu = {1: [[-0.21164415,  0.20583508],[ 3.48498534,  1.86657226]], # mu_p
#       0: [[0.28298654, 1.46959782],[2.03495596, 1.76192688]]}  # mu_n

# sig = {1: [[[1, 0],[0, 1]], [[1, 0],[0, 1]] ], # sig_p
#        0: [[[1, 0],[0, 1]], [[2.19262005, 0.48831189],[0.48831189, 0.94692817]] ] } #sig_n

# component_weights = {0:  # group 0
#                             {1: df['w_p'] ,
#                             0: df['w_n'] },
#                         1:  # group 1
#                             {1: df['w_p_g2'] ,
#                             0: df['w_n_g2'] }}


# mu = {1: df['mu_p'] ,  # positive label
#         0: df['mu_n'] }  # negative label
# sig = {1: (df['sig_p'] ),
#         0: (df['sig_n'] )}
# component_weights = {0:  # group 0
#                             {1: [0.08674269, 0.91325731], # w_p
#                             0: [0.18581671, 0.81418329]}, # w_n
#                         1:  # group 1
#                             {1: [0.74191364, 0.25808636], #w_p
#                             0: [0.92521054, 0.07478946]} }#w_n

# component_weights_labeled = {0:
#                                     {1: df['wl_p']  ,
#                                     0: df['wl_n'].iloc[i]},
#                                 1:
#                                     {1: df['wl_p_g2'].iloc[i],
#                                     0: df['wl_n_g2'].iloc[i]}}

In [None]:
# df = pd.read_csv("demo_param.csv")

# mu = {1: df['mu_p'].iloc[0],  # positive label
#         0: df['mu_n'].iloc[0]}  # negative label
# sig = {1: (df['sig_p'].iloc[0]),
#         0: (df['sig_n'].iloc[0])}

# component_weights = {0:  # group 0
#                             {1: df['w_p'].iloc[0],
#                             0: df['w_n'].iloc[0]},
#                         1:  # group 1
#                             {1: df['w_p_g2'].iloc[0],
#                             0: df['w_n_g2'].iloc[0]}}

# component_weights_labeled = {0:
#                                     {1: df['wl_p'].iloc[0],
#                                     0: df['wl_n'].iloc[0]},
#                                 1:
#                                     {1: df['wl_p_g2'].iloc[0],
#                                     0: df['wl_n_g2'].iloc[0]}}

Construct distributions

In [126]:
def _weight_vector(raw):
    """Return a fresh float array holding the mixing weights in `raw`.

    Raises:
        ValueError: if `raw` cannot be converted to floats, or converts to a
            0-d array (a scalar-like value rather than a sequence of weights).
    """
    vec = np.array(raw, dtype=float)
    if vec.ndim == 0:
        raise ValueError(
            'mixing weights must be a sequence of floats, got scalar-like value {!r}'.format(raw))
    return vec


# NOTE(review): the saved output of this cell shows
# `TypeError: len() of unsized object`, which is consistent with a 0-d weight
# reaching a `len()` call -- confirm where it originated. `_weight_vector`
# rejects such weights up front with an explicit message.

# Per-group, per-class mixtures for the unbiased and the biased data.
# Outer key: group index; inner key: class label (1 = positive, 0 = negative).
distrib_ub = {
    g: {label: NMixture(mu[label], sig[label], _weight_vector(component_weights[g][label]))
        for label in [1, 0]}
    for g in range(num_groups)}

distrib_bias = {
    g: {label: NMixture(mu[label], sig[label], _weight_vector(component_weights_labeled[g][label]))
        for label in [1, 0]}
    for g in range(num_groups)}

# Oracle distributions
# Unbiased group distributions
p_oracle = {g: PUMixture(distrib_ub[g][1], distrib_ub[g][0], alpha[g]) for g in range(num_groups)}

# Biased group distributions
q_oracle = {g: PUMixture(distrib_bias[g][1], distrib_bias[g][0], alpha_[g]) for g in range(num_groups)}

# Create a distribution object for the entire unlabeled set (not conditioned on a group).
# alpha_u[1] / alpha_u[0]: overall share of positives / negatives across groups.
alpha_u = {1: sum(eta[g] * alpha[g] for g in range(num_groups)),
           0: sum(eta[g] * (1 - alpha[g]) for g in range(num_groups))}

# Pooled mixing weights: group weights averaged with the share of each group
# within the class. The weights are coerced to arrays first so that the scalar
# multiplication also works when they are stored as plain lists.
wPos_u = sum(eta[g] * alpha[g] * _weight_vector(component_weights[g][1])
             for g in range(num_groups)) / alpha_u[1]
wNeg_u = sum(eta[g] * (1 - alpha[g]) * _weight_vector(component_weights[g][0])
             for g in range(num_groups)) / alpha_u[0]

d_pos_ub = NMixture(mu[1], sig[1], wPos_u)
d_neg_ub = NMixture(mu[0], sig[0], wNeg_u)
d_ub = PUMixture(d_pos_ub, d_neg_ub, alpha_u[1])

# Create a distribution object for the entire labeled set (not conditioned on a group).
# wPos_l and wNeg_l can be estimated directly from a different formulation of the GMM
# estimation, where the labeled data is treated as one sample instead of two samples
# (one for each group).
alpha_l = {1: sum(eta_[g] * alpha_[g] for g in range(num_groups)),
           0: sum(eta_[g] * (1 - alpha_[g]) for g in range(num_groups))}
wPos_l = sum(eta_[g] * alpha_[g] * _weight_vector(component_weights_labeled[g][1])
             for g in range(num_groups)) / alpha_l[1]
wNeg_l = sum(eta_[g] * (1 - alpha_[g]) * _weight_vector(component_weights_labeled[g][0])
             for g in range(num_groups)) / alpha_l[0]

d_pos_bias = NMixture(mu[1], sig[1], wPos_l)
d_neg_bias = NMixture(mu[0], sig[0], wNeg_l)
d_bias = PUMixture(d_pos_bias, d_neg_bias, alpha_l[1])

TypeError: len() of unsized object

Generate samples according to the base distributions

In [None]:
# Draw the eight sample sets (group x class, unbiased and biased) from the
# oracle mixtures. The mixtures are passed group by group, positive class first.
(s1_pos_ub, s1_neg_ub, s2_pos_ub, s2_neg_ub,
 s1_pos_bias, s1_neg_bias, s2_pos_bias, s2_neg_bias) = generateSamples(
    *[distrib_ub[g][label] for g in (0, 1) for label in (1, 0)],
    *[distrib_bias[g][label] for g in (0, 1) for label in (1, 0)],
    total_samples, lam, alpha, alpha_, eta, eta_)

# Unbiased samples: pooled, per class, and per group.
s_ub = np.concatenate([s1_pos_ub, s1_neg_ub, s2_pos_ub, s2_neg_ub])
s_pos_ub = np.concatenate([s1_pos_ub, s2_pos_ub])
s_neg_ub = np.concatenate([s1_neg_ub, s2_neg_ub])
s1_ub = np.concatenate([s1_pos_ub, s1_neg_ub])
s2_ub = np.concatenate([s2_pos_ub, s2_neg_ub])

# Biased samples per class (group 1 first, then group 2).
s_pos_bias = np.concatenate([s1_pos_bias, s2_pos_bias])
s_neg_bias = np.concatenate([s1_neg_bias, s2_neg_bias])

# Biased samples per group (positives first, then negatives), and all of them.
s1_bias = np.concatenate([s1_pos_bias, s1_neg_bias])
s2_bias = np.concatenate([s2_pos_bias, s2_neg_bias])
s_all_bias = np.concatenate([s1_bias, s2_bias])

Train a neural network classifier. Obtain scores and predicted labels

In [None]:
# Training set: all biased samples, positives stacked on top of negatives,
# with matching 1/0 targets.
X = np.vstack([s_pos_bias, s_neg_bias])
y = np.concatenate([np.ones(len(s_pos_bias)), np.zeros(len(s_neg_bias))])

# Fit the classifier on the biased data.
model = getModel(X, y, threads=None)

# Scores at or above this value count as a positive prediction.
class_threshold = 0.5

# Model scores for every (class, group) sample set: first key is the true
# class ('pos' / 'neg'), second key is the group index.
_bias_points = {'pos': {0: s1_pos_bias, 1: s2_pos_bias},
                'neg': {0: s1_neg_bias, 1: s2_neg_bias}}
_ub_points = {'pos': {0: s1_pos_ub, 1: s2_pos_ub},
              'neg': {0: s1_neg_ub, 1: s2_neg_ub}}
y_bias = {cls: {g: getModelScores(model, pts) for g, pts in by_group.items()}
          for cls, by_group in _bias_points.items()}
y_ub = {cls: {g: getModelScores(model, pts) for g, pts in by_group.items()}
        for cls, by_group in _ub_points.items()}

# Thresholded predictions on the biased samples.
y1_all_bias = getModelScores(model, s1_bias) >= class_threshold
y2_all_bias = getModelScores(model, s2_bias) >= class_threshold
y1_pos_bias, y2_pos_bias = (y_bias['pos'][g] >= class_threshold for g in (0, 1))
y1_neg_bias, y2_neg_bias = (y_bias['neg'][g] >= class_threshold for g in (0, 1))

# Thresholded predictions on the unbiased samples.
y1_all_ub = getModelScores(model, s1_ub) >= class_threshold
y2_all_ub = getModelScores(model, s2_ub) >= class_threshold
y1_pos_ub, y2_pos_ub = (y_ub['pos'][g] >= class_threshold for g in (0, 1))
y1_neg_ub, y2_neg_ub = (y_ub['neg'][g] >= class_threshold for g in (0, 1))

# Predictions pooled over the two groups (group 1 first, then group 2).
y_all_bias = np.concatenate([y1_all_bias, y2_all_bias])
y_pos_bias = np.concatenate([y1_pos_bias, y2_pos_bias])
y_neg_bias = np.concatenate([y1_neg_bias, y2_neg_bias])
y_all_ub = np.concatenate([y1_all_ub, y2_all_ub])
y_pos_ub = np.concatenate([y1_pos_ub, y2_pos_ub])
y_neg_ub = np.concatenate([y1_neg_ub, y2_neg_ub])

# True labels of the biased samples as a column vector, in the same order as
# `s_all_bias` (group 1 positives, group 1 negatives, group 2 positives,
# group 2 negatives).
label_all_bias = np.concatenate([
    np.ones(s1_pos_bias.shape[0]), np.zeros(s1_neg_bias.shape[0]),
    np.ones(s2_pos_bias.shape[0]), np.zeros(s2_neg_bias.shape[0]),
]).reshape(-1, 1)

Compute group-wise TPR and FPR, corrected using both labelled and unlabelled points (oracle distributions)

In [None]:
# Group-wise TPR / FPR from the oracle distributions, using labeled and
# unlabeled points together.

# The unlabeled data has no labels, so the posterior probability from the GMM
# is used as a soft label instead.
YSoft_u = d_ub.pn_posterior(s_ub).reshape(-1, 1)
Y = np.concatenate([label_all_bias, YSoft_u])
YPred = np.concatenate([y_all_bias, y_all_ub])

# Per-group weights for the positive class: density of the group's unbiased
# positive mixture divided by the density of the pooled positive mixture the
# points are associated with (labeled points first, then unlabeled points).
WPos_l = {g: distrib_ub[g][1].points_pdf(s_all_bias) / d_pos_bias.points_pdf(s_all_bias)
          for g in range(num_groups)}
WPos_u = {g: distrib_ub[g][1].points_pdf(s_ub) / d_pos_ub.points_pdf(s_ub)
          for g in range(num_groups)}
TPR = {g: correct_metrics.TPR(Y, YPred, class_threshold, np.concatenate((WPos_l[g], WPos_u[g])))
       for g in range(num_groups)}

# Same construction for the negative class. The numerator must use group g's
# own mixture; it used to be hard-coded to group 0, which made the unlabeled
# weights identical for every group.
WNeg_l = {g: distrib_ub[g][0].points_pdf(s_all_bias) / d_neg_bias.points_pdf(s_all_bias)
          for g in range(num_groups)}
WNeg_u = {g: distrib_ub[g][0].points_pdf(s_ub) / d_neg_ub.points_pdf(s_ub)
          for g in range(num_groups)}
FPR = {g: correct_metrics.FPR(Y, YPred, class_threshold, np.concatenate((WNeg_l[g], WNeg_u[g])))
       for g in range(num_groups)}

# Compute fairness measures (oracle)

In [None]:
# One list per (measure, variant). Every entry appended below is the absolute
# gap between a group difference and the `unbiased` reference returned by the
# corresponding correction routine.
oracle_measures = {measure: {"uncorrected": [], "corrected": [], "corrected_l": []}
                   for measure in ('eo', 'pe', 'ppv')}

# --- Equal opportunity ---
eo_group_difference, eo = correct_metrics.correct_eo(
    points_pos_bias={0: s1_pos_bias, 1: s2_pos_bias},
    y_pos_bias={0: y1_pos_bias, 1: y2_pos_bias},
    y_pos_ub={0: y1_pos_ub, 1: y2_pos_ub},
    d_pos_bias={g: distrib_bias[g][1] for g in range(num_groups)},
    d_pos_ub={g: distrib_ub[g][1] for g in range(num_groups)})

# Labeled-only estimate: predictions on the pooled biased positives, weighted
# by group density over pooled density and averaged over the points.
_pooled_pos_pdf = d_pos_bias.points_pdf(s_pos_bias)
eo_bias_withLabelled = {
    g: sum(y_pos_bias * distrib_ub[g][1].points_pdf(s_pos_bias) / _pooled_pos_pdf)
       / y_pos_bias.shape[0]
    for g in range(num_groups)}

eo_unbiased = eo_group_difference['unbiased']
oracle_measures["eo"]["uncorrected"].append(abs(eo_group_difference['uncorrected'] - eo_unbiased))
oracle_measures["eo"]["corrected"].append(abs(eo_group_difference['corrected'] - eo_unbiased))
oracle_measures["eo"]["corrected_l"].append(
    abs((eo_bias_withLabelled[0] - eo_bias_withLabelled[1]) - eo_unbiased))

# --- Predictive equality ---
pe_group_difference, pe = correct_metrics.correct_pe(
    points_neg_bias={0: s1_neg_bias, 1: s2_neg_bias},
    y_neg_bias={0: y1_neg_bias, 1: y2_neg_bias},
    y_neg_ub={0: y1_neg_ub, 1: y2_neg_ub},
    d_neg_bias={g: distrib_bias[g][0] for g in range(num_groups)},
    d_neg_ub={g: distrib_ub[g][0] for g in range(num_groups)})

# Same labeled-only estimate, on the pooled biased negatives.
_pooled_neg_pdf = d_neg_bias.points_pdf(s_neg_bias)
pe_bias_withLabelled = {
    g: sum(y_neg_bias * distrib_ub[g][0].points_pdf(s_neg_bias) / _pooled_neg_pdf)
       / y_neg_bias.shape[0]
    for g in range(num_groups)}

pe_unbiased = pe_group_difference['unbiased']
oracle_measures["pe"]["uncorrected"].append(abs(pe_group_difference['uncorrected'] - pe_unbiased))
oracle_measures["pe"]["corrected"].append(abs(pe_group_difference['corrected'] - pe_unbiased))
oracle_measures["pe"]["corrected_l"].append(
    abs((pe_bias_withLabelled[0] - pe_bias_withLabelled[1]) - pe_unbiased))

# --- Positive predictive value ---
# PPV = TPR*alpha / (TPR*alpha + FPR*(1-alpha))
ppv_group_difference = correct_metrics.correct_ppv(eo, pe, alpha, alpha_)
ppv_bias_withLabelled = {
    g: eo_bias_withLabelled[g] * alpha[g] / (
        eo_bias_withLabelled[g] * alpha[g] + pe_bias_withLabelled[g] * (1 - alpha[g]))
    for g in range(num_groups)}

ppv_unbiased = ppv_group_difference['unbiased']
oracle_measures["ppv"]["uncorrected"].append(abs(ppv_group_difference['uncorrected'] - ppv_unbiased))
oracle_measures["ppv"]["corrected"].append(abs(ppv_group_difference['corrected'] - ppv_unbiased))
oracle_measures["ppv"]["corrected_l"].append(
    abs((ppv_bias_withLabelled[0] - ppv_bias_withLabelled[1]) - ppv_unbiased))

# Estimate parameters and construct estimated distributions (with group information)

In [None]:
Kfit = [comp, comp]

# Group indicators (0 for the first group, 1 for the second), laid out in the
# same order in which the sample arrays were concatenated.
_groups_unlabeled = np.hstack([np.zeros(s1_ub.shape[0]), np.ones(s2_ub.shape[0])])
_groups_labeled_pos = np.hstack([np.zeros(s1_pos_bias.shape[0]), np.ones(s2_pos_bias.shape[0])])
_groups_labeled_neg = np.hstack([np.zeros(s1_neg_bias.shape[0]), np.ones(s2_neg_bias.shape[0])])

nested_group_EM = NestedGroupDist(
    x_unlabeled=s_ub,
    x_labeled=[s_pos_bias, s_neg_bias],
    unlabeled_groups=_groups_unlabeled,
    labeled_groups=[_groups_labeled_pos, _groups_labeled_neg],
    components=Kfit,
    num_classes=2,
    num_groups=2)
nested_group_EM.estimate_params(max_steps=5000)

# Pull the estimates out of the fitted model. `w` and `w_labeled` are indexed
# as [class][group]; class index 0 is used for the positive mixtures below and
# class index 1 for the negative ones.
alphas_1_est, alphas_2_est = nested_group_EM.alphas
w_1_est = [nested_group_EM.w[c][0] for c in (0, 1)]            # group 0, one entry per class
wl_1_est = [nested_group_EM.w_labeled[c][0] for c in (0, 1)]
w_2_est = [nested_group_EM.w[c][1] for c in (0, 1)]            # group 1, one entry per class
wl_2_est = [nested_group_EM.w_labeled[c][1] for c in (0, 1)]
lls_1 = nested_group_EM.lls
lls_2 = lls_1

# Rebuild the mixtures from the estimated parameters; everything from here on
# uses these estimated distributions.
_class_index = {"pos": 0, "neg": 1}
_w_unbiased = {0: w_1_est, 1: w_2_est}
_w_biased = {0: wl_1_est, 1: wl_2_est}

estimated_d_unbiased = {
    g: {cls: NMixture(nested_group_EM.mu[c], nested_group_EM.sg[c], np.array(_w_unbiased[g][c]))
        for cls, c in _class_index.items()}
    for g in (0, 1)}

estimated_d_biased = {
    g: {cls: NMixture(nested_group_EM.mu[c], nested_group_EM.sg[c], np.array(_w_biased[g][c]))
        for cls, c in _class_index.items()}
    for g in (0, 1)}

# Estimated alpha per group (first entry of each group's `alphas`).
alpha_est = [alphas_1_est[0], alphas_2_est[0]]
# Observed alpha per group: share of positives among that group's labeled samples.
alpha_l_est = [
    s1_pos_bias.shape[0] / (s1_pos_bias.shape[0] + s1_neg_bias.shape[0]),
    s2_pos_bias.shape[0] / (s2_pos_bias.shape[0] + s2_neg_bias.shape[0])]

# Unbiased group distributions
estimated_p = {
    g: PUMixture(estimated_d_unbiased[g]['pos'], estimated_d_unbiased[g]['neg'], alpha_est[g])
    for g in (0, 1)}

# Biased group distributions
estimated_q = {
    g: PUMixture(estimated_d_biased[g]['pos'], estimated_d_biased[g]['neg'], alpha_l_est[g])
    for g in (0, 1)}

# Distribution of the entire unlabeled set (not conditioned on a group).
est_alpha_u = {
    1: sum(eta[g] * alpha_est[g] for g in range(num_groups)),
    0: sum(eta[g] * (1 - alpha_est[g]) for g in range(num_groups))}
est_wPos_u = sum(
    eta[g] * alpha_est[g] * estimated_d_unbiased[g]['pos'].ps
    for g in range(num_groups)) / est_alpha_u[1]
est_wNeg_u = sum(
    eta[g] * (1 - alpha_est[g]) * estimated_d_unbiased[g]['neg'].ps
    for g in range(num_groups)) / est_alpha_u[0]
est_d_pos_ub = NMixture(nested_group_EM.mu[0], nested_group_EM.sg[0], est_wPos_u)
est_d_neg_ub = NMixture(nested_group_EM.mu[1], nested_group_EM.sg[1], est_wNeg_u)
est_d_ub = PUMixture(est_d_pos_ub, est_d_neg_ub, est_alpha_u[1])

# Distribution of the entire labeled set (not conditioned on a group).
# wPos_l and wNeg_l can be estimated directly from a different formulation of
# the GMM estimation, where the labeled data is treated as one sample instead
# of two samples (one for each group).
est_alpha_l = {
    1: sum(eta_[g] * alpha_l_est[g] for g in range(num_groups)),
    0: sum(eta_[g] * (1 - alpha_l_est[g]) for g in range(num_groups))}
est_wPos_l = sum(
    eta_[g] * alpha_l_est[g] * estimated_d_biased[g]['pos'].ps
    for g in range(num_groups)) / est_alpha_l[1]
est_wNeg_l = sum(
    eta_[g] * (1 - alpha_l_est[g]) * estimated_d_biased[g]['neg'].ps
    for g in range(num_groups)) / est_alpha_l[0]

est_d_pos_bias = NMixture(nested_group_EM.mu[0], nested_group_EM.sg[0], est_wPos_l)
est_d_neg_bias = NMixture(nested_group_EM.mu[1], nested_group_EM.sg[1], est_wNeg_l)
est_d_bias = PUMixture(est_d_pos_bias, est_d_neg_bias, est_alpha_l[1])

# The unlabeled data has no labels, so the posterior probability from the
# estimated GMM is used as a soft label.
YSoft_u = est_d_ub.pn_posterior(s_ub).reshape(-1, 1)
Y = np.concatenate([label_all_bias, YSoft_u])
YPred = np.concatenate([y_all_bias, y_all_ub])

# Per-group weights: density under the group's estimated unbiased mixture
# divided by the density under the matching pooled mixture.
_pos_pdf_l = est_d_pos_bias.points_pdf(s_all_bias)
_pos_pdf_u = est_d_pos_ub.points_pdf(s_ub)
_neg_pdf_l = est_d_neg_bias.points_pdf(s_all_bias)
_neg_pdf_u = est_d_neg_ub.points_pdf(s_ub)

WPos_l = {g: estimated_d_unbiased[g]["pos"].points_pdf(s_all_bias) / _pos_pdf_l
          for g in range(num_groups)}
WPos_u = {g: estimated_d_unbiased[g]["pos"].points_pdf(s_ub) / _pos_pdf_u
          for g in range(num_groups)}
WNeg_l = {g: estimated_d_unbiased[g]["neg"].points_pdf(s_all_bias) / _neg_pdf_l
          for g in range(num_groups)}
WNeg_u = {g: estimated_d_unbiased[g]["neg"].points_pdf(s_ub) / _neg_pdf_u
          for g in range(num_groups)}

TPR = {g: correct_metrics.TPR(Y, YPred, class_threshold, np.concatenate((WPos_l[g], WPos_u[g])))
       for g in range(num_groups)}
FPR = {g: correct_metrics.FPR(Y, YPred, class_threshold, np.concatenate((WNeg_l[g], WNeg_u[g])))
       for g in range(num_groups)}

# Compute estimated fairness metrics (with group information)

In [None]:
# One list per (measure, variant). Every entry appended below is the absolute
# gap between a group difference and the `unbiased` reference returned by the
# corresponding correction routine, now using the estimated distributions.
estimated_measures = {measure: {"uncorrected": [], "corrected": [], "corrected_l": []}
                      for measure in ('eo', 'pe', 'ppv')}

# --- Equal opportunity ---
eo_group_difference, eo = correct_metrics.correct_eo(
    points_pos_bias={0: s1_pos_bias, 1: s2_pos_bias},
    y_pos_bias={0: y1_pos_bias, 1: y2_pos_bias},
    y_pos_ub={0: y1_pos_ub, 1: y2_pos_ub},
    d_pos_bias={g: estimated_d_biased[g]["pos"] for g in (0, 1)},
    d_pos_ub={g: estimated_d_unbiased[g]["pos"] for g in (0, 1)})

# Labeled-only estimate: predictions on the pooled biased positives, weighted
# by group density over pooled density and averaged over the points.
_est_pooled_pos_pdf = est_d_pos_bias.points_pdf(s_pos_bias)
eo_bias_withLabelled = {
    g: sum(y_pos_bias * estimated_d_unbiased[g]["pos"].points_pdf(s_pos_bias) / _est_pooled_pos_pdf)
       / y_pos_bias.shape[0]
    for g in range(num_groups)}

eo_unbiased = eo_group_difference['unbiased']
estimated_measures["eo"]["uncorrected"].append(abs(eo_group_difference['uncorrected'] - eo_unbiased))
estimated_measures["eo"]["corrected"].append(abs(eo_group_difference['corrected'] - eo_unbiased))
estimated_measures["eo"]["corrected_l"].append(
    abs((eo_bias_withLabelled[0] - eo_bias_withLabelled[1]) - eo_unbiased))

# --- Predictive equality ---
pe_group_difference, pe = correct_metrics.correct_pe(
    points_neg_bias={0: s1_neg_bias, 1: s2_neg_bias},
    y_neg_bias={0: y1_neg_bias, 1: y2_neg_bias},
    y_neg_ub={0: y1_neg_ub, 1: y2_neg_ub},
    d_neg_bias={g: estimated_d_biased[g]["neg"] for g in (0, 1)},
    d_neg_ub={g: estimated_d_unbiased[g]["neg"] for g in (0, 1)})

# Same labeled-only estimate, on the pooled biased negatives.
_est_pooled_neg_pdf = est_d_neg_bias.points_pdf(s_neg_bias)
pe_bias_withLabelled = {
    g: sum(y_neg_bias * estimated_d_unbiased[g]["neg"].points_pdf(s_neg_bias) / _est_pooled_neg_pdf)
       / y_neg_bias.shape[0]
    for g in range(num_groups)}

pe_unbiased = pe_group_difference['unbiased']
estimated_measures["pe"]["uncorrected"].append(abs(pe_group_difference['uncorrected'] - pe_unbiased))
estimated_measures["pe"]["corrected"].append(abs(pe_group_difference['corrected'] - pe_unbiased))
estimated_measures["pe"]["corrected_l"].append(
    abs((pe_bias_withLabelled[0] - pe_bias_withLabelled[1]) - pe_unbiased))

# --- Positive predictive value ---
# PPV = TPR*alpha / (TPR*alpha + FPR*(1-alpha))
ppv_group_difference = correct_metrics.correct_ppv(eo, pe, alpha_est, alpha_l_est)
ppv_bias_withLabelled = {
    g: eo_bias_withLabelled[g] * alpha_est[g] / (
        eo_bias_withLabelled[g] * alpha_est[g] + pe_bias_withLabelled[g] * (1 - alpha_est[g]))
    for g in range(num_groups)}

ppv_unbiased = ppv_group_difference['unbiased']
estimated_measures["ppv"]["uncorrected"].append(
    abs(ppv_group_difference['uncorrected'] - ppv_unbiased))
estimated_measures["ppv"]["corrected"].append(
    abs(ppv_group_difference['corrected'] - ppv_unbiased))
estimated_measures["ppv"]["corrected_l"].append(
    abs((ppv_bias_withLabelled[0] - ppv_bias_withLabelled[1]) - ppv_unbiased))


# Estimate parameters and construct estimated distributions (without group information)

In [None]:
Kfit = [comp, comp]

# Only the unlabeled samples carry group indicators here (0 for the first
# group, 1 for the second); the labeled samples are passed without any.
nested_group_EM = NestedGroupDistUnknownGroup(
    x_unlabeled=s_ub,
    x_labeled=[s_pos_bias, s_neg_bias],
    unlabeled_groups=np.hstack([np.zeros(s1_ub.shape[0]), np.ones(s2_ub.shape[0])]),
    components=Kfit,
    num_classes=2,
    num_groups=2)
nested_group_EM.estimate_params(max_steps=5000)

# Pull the estimates out of the fitted model. `w` is indexed as [class][group],
# `w_labeled` by class only; class index 0 is used for the positive mixtures
# below and class index 1 for the negative ones.
alphas_1_est, alphas_2_est = nested_group_EM.alphas
w_1_est = [nested_group_EM.w[c][0] for c in (0, 1)]   # group 0, one entry per class
wl_est = [nested_group_EM.w_labeled[c] for c in (0, 1)]
w_2_est = [nested_group_EM.w[c][1] for c in (0, 1)]   # group 1, one entry per class
lls_1 = nested_group_EM.lls
lls_2 = lls_1

# Rebuild the mixtures from the estimated parameters; everything from here on
# uses these estimated distributions.
_class_index = {"pos": 0, "neg": 1}
_w_unbiased = {0: w_1_est, 1: w_2_est}

estimated_d_unbiased = {
    g: {cls: NMixture(nested_group_EM.mu[c], nested_group_EM.sg[c], np.array(_w_unbiased[g][c]))
        for cls, c in _class_index.items()}
    for g in (0, 1)}

# A single biased mixture per class (no group split).
estimated_d_biased = {
    cls: NMixture(nested_group_EM.mu[c], nested_group_EM.sg[c], np.array(wl_est[c]))
    for cls, c in _class_index.items()}

# Estimated alpha per group (first entry of each group's `alphas`).
alpha_est = [alphas_1_est[0], alphas_2_est[0]]
# Observed alpha: share of positives among all labeled samples.
alpha_l_est = s_pos_bias.shape[0] / (s_pos_bias.shape[0] + s_neg_bias.shape[0])

# Unbiased group distributions
estimated_p = {
    g: PUMixture(estimated_d_unbiased[g]['pos'], estimated_d_unbiased[g]['neg'], alpha_est[g])
    for g in (0, 1)}

# Biased distribution (one for all groups)
estimated_q = PUMixture(estimated_d_biased['pos'], estimated_d_biased['neg'], alpha_l_est)

# Distribution of the entire unlabeled set (not conditioned on a group).
est_alpha_u = {
    1: sum(eta[g] * alpha_est[g] for g in range(num_groups)),
    0: sum(eta[g] * (1 - alpha_est[g]) for g in range(num_groups))}
est_wPos_u = sum(
    eta[g] * alpha_est[g] * estimated_d_unbiased[g]['pos'].ps
    for g in range(num_groups)) / est_alpha_u[1]
est_wNeg_u = sum(
    eta[g] * (1 - alpha_est[g]) * estimated_d_unbiased[g]['neg'].ps
    for g in range(num_groups)) / est_alpha_u[0]
est_d_pos_ub = NMixture(nested_group_EM.mu[0], nested_group_EM.sg[0], est_wPos_u)
est_d_neg_ub = NMixture(nested_group_EM.mu[1], nested_group_EM.sg[1], est_wNeg_u)
est_d_ub = PUMixture(est_d_pos_ub, est_d_neg_ub, est_alpha_u[1])

# Distribution of the entire labeled set: here the labeled data is already
# treated as one sample, so the pooled mixtures are the estimated biased
# mixtures themselves.
est_d_pos_bias = estimated_d_biased['pos']
est_d_neg_bias = estimated_d_biased['neg']
est_wPos_l = est_d_pos_bias.ps
est_wNeg_l = est_d_neg_bias.ps

# The unlabeled data has no labels, so the posterior probability from the
# estimated GMM is used as a soft label.
YSoft_u = est_d_ub.pn_posterior(s_ub).reshape(-1, 1)
Y = np.concatenate([label_all_bias, YSoft_u])
YPred = np.concatenate([y_all_bias, y_all_ub])

# Per-group weights: density under the group's estimated unbiased mixture
# divided by the density under the matching pooled mixture.
_pos_pdf_l = est_d_pos_bias.points_pdf(s_all_bias)
_pos_pdf_u = est_d_pos_ub.points_pdf(s_ub)
_neg_pdf_l = est_d_neg_bias.points_pdf(s_all_bias)
_neg_pdf_u = est_d_neg_ub.points_pdf(s_ub)

WPos_l = {g: estimated_d_unbiased[g]["pos"].points_pdf(s_all_bias) / _pos_pdf_l
          for g in range(num_groups)}
WPos_u = {g: estimated_d_unbiased[g]["pos"].points_pdf(s_ub) / _pos_pdf_u
          for g in range(num_groups)}
WNeg_l = {g: estimated_d_unbiased[g]["neg"].points_pdf(s_all_bias) / _neg_pdf_l
          for g in range(num_groups)}
WNeg_u = {g: estimated_d_unbiased[g]["neg"].points_pdf(s_ub) / _neg_pdf_u
          for g in range(num_groups)}

TPR = {g: correct_metrics.TPR(Y, YPred, class_threshold, np.concatenate((WPos_l[g], WPos_u[g])))
       for g in range(num_groups)}
FPR = {g: correct_metrics.FPR(Y, YPred, class_threshold, np.concatenate((WNeg_l[g], WNeg_u[g])))
       for g in range(num_groups)}


# Compute estimated fairness metrics (without group information)

In [None]:
# Only the `corrected_l` variant is recorded when the labeled samples have no
# group information. Every entry is the absolute gap between the `corrected_l`
# group difference and the `unbiased` reference returned by the routine.
estimated_measures_ng = {measure: {"corrected_l": []} for measure in ('eo', 'pe', 'ppv')}

_p_pos_by_group = {g: estimated_d_unbiased[g]["pos"] for g in (0, 1)}
_p_neg_by_group = {g: estimated_d_unbiased[g]["neg"] for g in (0, 1)}

# --- Equal opportunity ---
eo_group_difference, eo = correct_metrics_unknowngroups.correct_eo(
    y_pos_bias=y_pos_bias,
    y_pos_ub={0: y1_pos_ub, 1: y2_pos_ub},
    s_pos_bias=s_pos_bias,
    q_pos=estimated_d_biased["pos"],
    p_pos=_p_pos_by_group)

# Per-group TPR computed from labeled and unlabeled points together.
eo_bias_withUL = {g: TPR[g] for g in range(num_groups)}

estimated_measures_ng["eo"]["corrected_l"].append(
    abs(eo_group_difference['corrected_l'] - eo_group_difference['unbiased']))

# --- Predictive equality ---
pe_group_difference, pe = correct_metrics_unknowngroups.correct_pe(
    y_neg_bias=y_neg_bias,
    y_neg_ub={0: y1_neg_ub, 1: y2_neg_ub},
    s_neg_bias=s_neg_bias,
    q_neg=estimated_d_biased["neg"],
    p_neg=_p_neg_by_group)

# Per-group FPR computed from labeled and unlabeled points together.
pe_bias_withUL = {g: FPR[g] for g in range(num_groups)}

estimated_measures_ng["pe"]["corrected_l"].append(
    abs(pe_group_difference['corrected_l'] - pe_group_difference['unbiased']))

# --- Positive predictive value ---
# PPV = TPR*alpha / (TPR*alpha + FPR*(1-alpha))
ppv_group_difference = correct_metrics_unknowngroups.correct_ppv(eo, pe, alpha_est)

estimated_measures_ng["ppv"]["corrected_l"].append(
    abs(ppv_group_difference['corrected_l'] - ppv_group_difference['unbiased']))



Print outputs

In [None]:
# Column label -> (result store, variant key). The insertion order fixes the
# column order of the summary table.
_columns = {
    "Uncorrected": (estimated_measures, "uncorrected"),
    "GIL": (estimated_measures, "corrected"),
    "GIL*": (oracle_measures, "corrected"),
    "GNIL1": (estimated_measures, "corrected_l"),
    "GNIL2": (estimated_measures_ng, "corrected_l"),
    "GNIL*": (oracle_measures, "corrected_l"),
}

# One row per fairness metric; each cell holds the list recorded for that
# metric and variant.
data = {"Fairness Metric": ["EO", "PE", "PPV"]}
data.update({
    label: [store[metric][variant] for metric in ("eo", "pe", "ppv")]
    for label, (store, variant) in _columns.items()})

# Assemble the summary table and display it.
df = pd.DataFrame(data)

df

Unnamed: 0,Fairness Metric,Uncorrected,GIL,GIL*,GNIL1,GNIL2,GNIL*
0,EO,[0.75168],[0.0007962449439429253],[0.0018740729152055824],[0.008199190335749396],[0.0098798748374051],[0.001271630961204806]
1,PE,[0.76864],[0.013467780233199278],[0.01631650500427051],[0.01802115222634726],[0.013949239872702135],[0.019744195298815548]
2,PPV,[0.06664404845760674],[0.009305821623556643],[0.008768788511785086],[0.015158664065379135],[0.013787523775119492],[0.01246626847805865]
