# Import libraries and define variables

In [1]:
import numpy as np
import pandas as pd
import pickle
import gc
import glob
from os import path
from utils import * # metric functions

In [9]:
dataCols = ['i_tp', 'i_fp', 'i_tn', 'i_fn', 'j_tp', 'j_fp', 'j_tn', 'j_fn']
calculationsDir = "out/calculations/"
datasetName = path.join('..', 'fairness-measures-gen', 'out', "Set(08,56).bin")

# Get calculations
As the dataset is quite large (4.2 Gb) we will write calculations to separate files in 2 stages.

## Write calculations of the 1st half of the dataset

In [6]:
# Get half of the data
with open(datasetName, "rb") as f:
    df = pd.DataFrame(pickle.load(f), columns = dataCols)

halfIdx = int(df.shape[0] / 2)
df = df.iloc[:halfIdx]
df.head()

Unnamed: 0,i_tp,i_fp,i_tn,i_fn,j_tp,j_fp,j_tn,j_fn
0,56,0,0,0,0,0,0,0
1,55,1,0,0,0,0,0,0
2,55,0,1,0,0,0,0,0
3,55,0,0,1,0,0,0,0
4,55,0,0,0,1,0,0,0


In [7]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276635335 entries, 0 to 276635334
Data columns (total 8 columns):
 #   Column  Dtype
---  ------  -----
 0   i_tp    int8 
 1   i_fp    int8 
 2   i_tn    int8 
 3   i_fn    int8 
 4   j_tp    int8 
 5   j_fp    int8 
 6   j_tn    int8 
 7   j_fn    int8 
dtypes: int8(8)
memory usage: 2.1 GB


In [8]:
# Calculate half of GRs
with open(calculationsDir + "gr.bin", "wb+") as f:
    getGroupRatios(df).to_numpy().tofile(f)

# Calculate half of IRs
with open(calculationsDir + "ir.bin", "wb+") as f:
    getImbalanceRatios(df).to_numpy().tofile(f)

with open(calculationsDir + "i_tpr.bin", "wb+") as f:
    getTPR_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_tpr.bin", "wb+") as f:
    getTPR_j(df).to_numpy().tofile(f)

with open(calculationsDir + "i_fpr.bin", "wb+") as f:
    getFPR_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_fpr.bin", "wb+") as f:
    getFPR_j(df).to_numpy().tofile(f)

with open(calculationsDir + "i_ppv.bin", "wb+") as f:
    getPositivePredictiveValue_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_ppv.bin", "wb+") as f:
    getPositivePredictiveValue_j(df).to_numpy().tofile(f)

with open(calculationsDir + "i_npv.bin", "wb+") as f:
    getNegativePredictiveValue_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_npv.bin", "wb+") as f:
    getNegativePredictiveValue_j(df).to_numpy().tofile(f)
    
with open(calculationsDir + "stat_parity.bin", "wb+") as f:
    get_statistical_parity(df).to_numpy().tofile(f)

with open(calculationsDir + "disp_impact.bin", "wb+") as f:
    get_disparate_impact(df).to_numpy().tofile(f)

with open(calculationsDir + "acc_equality_ratio.bin", "wb+") as f:
    get_acc_equality_ratio(df).to_numpy().tofile(f)

with open(calculationsDir + "acc_equality_diff.bin", "wb+") as f:
    get_acc_equality_diff(df).to_numpy().tofile(f)
    
# Free the memory
del df
gc.collect()
gc.get_stats()

[{'collections': 425, 'collected': 2369, 'uncollectable': 0},
 {'collections': 38, 'collected': 1201, 'uncollectable': 0},
 {'collections': 4, 'collected': 2065, 'uncollectable': 0}]

## Append calculations of the 2st half of the dataset

In [10]:
with open(datasetName, "rb") as f:
    df = pd.DataFrame(pickle.load(f), columns = dataCols)

halfIdx = int(df.shape[0] / 2)
df = df.iloc[halfIdx:]

In [11]:
with open(calculationsDir + "gr.bin", "ab+") as f:
    getGroupRatios(df).to_numpy().tofile(f)

with open(calculationsDir + "ir.bin", "ab+") as f:
    getImbalanceRatios(df).to_numpy().tofile(f)

with open(calculationsDir + "i_tpr.bin", "ab+") as f:
    getTPR_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_tpr.bin", "ab+") as f:
    getTPR_j(df).to_numpy().tofile(f)

with open(calculationsDir + "i_fpr.bin", "ab+") as f:
    getFPR_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_fpr.bin", "ab+") as f:
    getFPR_j(df).to_numpy().tofile(f)

with open(calculationsDir + "i_ppv.bin", "ab+") as f:
    getPositivePredictiveValue_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_ppv.bin", "ab+") as f:
    getPositivePredictiveValue_j(df).to_numpy().tofile(f)

with open(calculationsDir + "i_npv.bin", "ab+") as f:
    getNegativePredictiveValue_i(df).to_numpy().tofile(f)

with open(calculationsDir + "j_npv.bin", "ab+") as f:
    getNegativePredictiveValue_j(df).to_numpy().tofile(f)
    
with open(calculationsDir + "stat_parity.bin", "ab+") as f:
    get_statistical_parity(df).to_numpy().tofile(f)

with open(calculationsDir + "disp_impact.bin", "ab+") as f:
    get_disparate_impact(df).to_numpy().tofile(f)

with open(calculationsDir + "acc_equality_ratio.bin", "ab+") as f:
    get_acc_equality_ratio(df).to_numpy().tofile(f)

with open(calculationsDir + "acc_equality_diff.bin", "ab+") as f:
    get_acc_equality_diff(df).to_numpy().tofile(f)
    
del df
gc.collect()
gc.get_stats()

[{'collections': 427, 'collected': 2369, 'uncollectable': 0},
 {'collections': 38, 'collected': 1201, 'uncollectable': 0},
 {'collections': 5, 'collected': 2065, 'uncollectable': 0}]

In [12]:
!ls -lah calculations

'ls' is not recognized as an internal or external command,
operable program or batch file.


The files are written as `float64`, while the dataset has data-type `int8`, thus the size of each file is the same as the size of the dataset because each of them contains one 8 times heavier column.

# Get additional calculations
These calculations will be based on the previous ones.

## Write 1st part
Here the story is even worse as we need to open 2 files of the same size, so we will do it the same way: in 2 stages.

In [13]:
with open(calculationsDir + "i_tpr.bin", "rb") as f:
    i_tpr = pd.DataFrame(np.fromfile(f), columns = ["i_tpr"])
    halfIdx = int(i_tpr.shape[0] / 2)
    i_tpr = i_tpr.iloc[:halfIdx]

with open(calculationsDir + "j_tpr.bin", "rb") as f:
    j_tpr = pd.DataFrame(np.fromfile(f), columns = ["j_tpr"])
    halfIdx = int(j_tpr.shape[0] / 2)
    j_tpr = j_tpr.iloc[:halfIdx]
    
with open(calculationsDir + "equal_opp_ratio.bin", "wb+") as f:
    get_equal_opp_ratio(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)
    
with open(calculationsDir + "equal_opp_diff.bin", "wb+") as f:
    get_equal_opp_diff(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)

del j_tpr
del i_tpr
gc.collect()

0

In [14]:
with open(calculationsDir + "i_fpr.bin", "rb") as f:
    i_fpr = pd.DataFrame(np.fromfile(f), columns = ["i_fpr"])
    halfIdx = int(i_fpr.shape[0] / 2)
    i_fpr = i_fpr.iloc[:halfIdx]

with open(calculationsDir + "j_fpr.bin", "rb") as f:
    j_fpr = pd.DataFrame(np.fromfile(f), columns = ["j_fpr"])
    halfIdx = int(j_fpr.shape[0] / 2)
    j_fpr = j_fpr.iloc[:halfIdx]

with open(calculationsDir + "pred_equality_ratio.bin", "wb+") as f:
    get_pred_equality_ratio(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)
    
with open(calculationsDir + "pred_equality_diff.bin", "wb+") as f:
    get_pred_equality_diff(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)
    
del j_fpr
del i_fpr
gc.collect()

0

In [15]:
with open(calculationsDir + "i_ppv.bin", "rb") as f:
    i_ppv = pd.DataFrame(np.fromfile(f), columns = ["i_ppv"])
    halfIdx = int(i_ppv.shape[0] / 2)
    i_ppv = i_ppv.iloc[:halfIdx]
    
with open(calculationsDir + "j_ppv.bin", "rb") as f:
    j_ppv = pd.DataFrame(np.fromfile(f), columns = ["j_ppv"])
    halfIdx = int(j_ppv.shape[0] / 2)
    j_ppv = j_ppv.iloc[:halfIdx]

with open(calculationsDir + "pred_parity_ratio.bin", "wb+") as f:
    get_pred_parity_ratio(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)

with open(calculationsDir + "pos_pred_parity_diff.bin", "wb+") as f:
    get_pos_pred_parity_diff(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)

del j_ppv
del i_ppv
gc.collect()

0

In [16]:
with open(calculationsDir + "i_npv.bin", "rb") as f:
    i_npv = pd.DataFrame(np.fromfile(f), columns = ["i_npv"])
    halfIdx = int(i_npv.shape[0] / 2)
    i_npv = i_npv.iloc[:halfIdx]

with open(calculationsDir + "j_npv.bin", "rb") as f:
    j_npv = pd.DataFrame(np.fromfile(f), columns = ["j_npv"])
    halfIdx = int(j_npv.shape[0] / 2)
    j_npv = j_npv.iloc[:halfIdx]

with open(calculationsDir + "neg_pred_parity_ratio.bin", "wb+") as f:
    get_neg_pred_parity_ratio(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)

with open(calculationsDir + "neg_pred_parity_diff.bin", "wb+") as f:
    get_neg_pred_parity_diff(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)

del j_npv
del i_npv
gc.collect()

0

## Append 2nd part

In [17]:
with open(calculationsDir + "i_tpr.bin", "rb") as f:
    i_tpr = pd.DataFrame(np.fromfile(f), columns = ["i_tpr"])
    halfIdx = int(i_tpr.shape[0] / 2)
    i_tpr = i_tpr.iloc[halfIdx:]

with open(calculationsDir + "j_tpr.bin", "rb") as f:
    j_tpr = pd.DataFrame(np.fromfile(f), columns = ["j_tpr"])
    halfIdx = int(j_tpr.shape[0] / 2)
    j_tpr = j_tpr.iloc[halfIdx:]
    
with open(calculationsDir + "equal_opp_ratio.bin", "ab+") as f:
    get_equal_opp_ratio(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)
    
with open(calculationsDir + "equal_opp_diff.bin", "ab+") as f:
    get_equal_opp_diff(j_tpr['j_tpr'], i_tpr['i_tpr']).to_numpy().tofile(f)

del j_tpr
del i_tpr
gc.collect()

0

In [18]:
with open(calculationsDir + "i_fpr.bin", "rb") as f:
    i_fpr = pd.DataFrame(np.fromfile(f), columns = ["i_fpr"])
    halfIdx = int(i_fpr.shape[0] / 2)
    i_fpr = i_fpr.iloc[halfIdx:]

with open(calculationsDir + "j_fpr.bin", "rb") as f:
    j_fpr = pd.DataFrame(np.fromfile(f), columns = ["j_fpr"])
    halfIdx = int(j_fpr.shape[0] / 2)
    j_fpr = j_fpr.iloc[halfIdx:]

with open(calculationsDir + "pred_equality_ratio.bin", "ab+") as f:
    get_pred_equality_ratio(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)
    
with open(calculationsDir + "pred_equality_diff.bin", "ab+") as f:
    get_pred_equality_diff(j_fpr['j_fpr'], i_fpr['i_fpr']).to_numpy().tofile(f)
    
del j_fpr
del i_fpr
gc.collect()

0

In [19]:
with open(calculationsDir + "i_ppv.bin", "rb") as f:
    i_ppv = pd.DataFrame(np.fromfile(f), columns = ["i_ppv"])
    halfIdx = int(i_ppv.shape[0] / 2)
    i_ppv = i_ppv.iloc[halfIdx:]
    
with open(calculationsDir + "j_ppv.bin", "rb") as f:
    j_ppv = pd.DataFrame(np.fromfile(f), columns = ["j_ppv"])
    halfIdx = int(j_ppv.shape[0] / 2)
    j_ppv = j_ppv.iloc[halfIdx:]

with open(calculationsDir + "pred_parity_ratio.bin", "ab+") as f:
    get_pred_parity_ratio(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)

with open(calculationsDir + "pos_pred_parity_diff.bin", "ab+") as f:
    get_pos_pred_parity_diff(j_ppv['j_ppv'], i_ppv['i_ppv']).to_numpy().tofile(f)

del j_ppv
del i_ppv
gc.collect()

0

In [20]:
with open(calculationsDir + "i_npv.bin", "rb") as f:
    i_npv = pd.DataFrame(np.fromfile(f), columns = ["i_npv"])
    halfIdx = int(i_npv.shape[0] / 2)
    i_npv = i_npv.iloc[:halfIdx]

with open(calculationsDir + "j_npv.bin", "rb") as f:
    j_npv = pd.DataFrame(np.fromfile(f), columns = ["j_npv"])
    halfIdx = int(j_npv.shape[0] / 2)
    j_npv = j_npv.iloc[:halfIdx]

with open(calculationsDir + "neg_pred_parity_ratio.bin", "ab+") as f:
    get_neg_pred_parity_ratio(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)

with open(calculationsDir + "neg_pred_parity_diff.bin", "ab+") as f:
    get_neg_pred_parity_diff(j_npv['j_npv'], i_npv['i_npv']).to_numpy().tofile(f)

del j_npv
del i_npv
gc.collect()

0

## Calculations for sonya plot

In [21]:
def calculate_sonya(df, base_metric):  
    diff_metrics = {
        'acc_equality_diff.bin': 'Accuracy equality difference', 
        'equal_opp_diff.bin': 'Equal opportunity difference', 
        'neg_pred_parity_diff.bin': 'Negative predictive parity difference', 
        'pos_pred_parity_diff.bin': 'Positive predictive parity difference', 
        'pred_equality_diff.bin': 'Predictive equality difference',
        'stat_parity.bin': 'Statistical parity'
    }
    
    diff_probs = {}
    compute_diff_prob = lambda df: np.sum(df['diff'] == 0) / len(df)
    
    for metricFName in diff_metrics:
        with open(calculationsDir + metricFName, "rb") as f:
            diff_metric = pd.DataFrame(np.fromfile(f).astype(np.float16), columns = ["diff"])
        df = pd.concat([df, diff_metric], axis = 1)

        diff = df.groupby(base_metric).apply(compute_diff_prob)
        diff_probs[diff_metrics[metricFName]] = diff

        df.drop('diff', axis=1, inplace=True)
        
    sonya = pd.DataFrame(diff_probs)
    sonya.reset_index(inplace=True)
    sonya.to_csv(f"{calculationsDir}sonya_xxxx" + base_metric + ".csv", index=False)

In [22]:
with open(f"{calculationsDir}ir.bin", "rb") as f:
    df = pd.DataFrame(np.fromfile(f).astype(np.float16), columns = ["ir"])
calculate_sonya(df, 'ir')