In [1]:
import sys
sys.path.insert(1, '../src/')

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets.utils import download_url 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from tqdm.notebook import tqdm
import my_utils as ut
from aif360.datasets import BinaryLabelDataset
from aif360.datasets import AdultDataset, GermanDataset, CompasDataset, BankDataset
from aif360.datasets import MEPSDataset19
from aif360.datasets import MEPSDataset20
from aif360.datasets import MEPSDataset21

# from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions \
#     import load_preproc_data_adult, load_preproc_data_compas, load_preproc_data_german

# Switch on both the cuDNN backend flag and its benchmark flag for torch.
torch.backends.cudnn.enabled = torch.backends.cudnn.benchmark = True

In [2]:
# Key of the dataset to process in this run.  The selection cell that follows
# recognises 'adult', 'medical', 'compas', 'bank' and 'german'.
data = "bank"

In [3]:
# Load the aif360 dataset matching `data` and record which column is the
# protected attribute (`protect_attr`) and which is the target (`label`);
# later cells read all three names.
if data == 'adult':
    protect_attr = 'sex'
    label = 'income-per-year'
    ds = AdultDataset()

elif data == 'medical':
    protect_attr = 'RACE'
    label = 'UTILIZATION'
    ds = MEPSDataset19()

elif data == 'compas':
    protect_attr = 'race'
    label = 'two_year_recid'
    ds = CompasDataset()
    # make 1 the favorable label for consistency
    # NOTE(review): only `ds.labels` is rewritten; if the dataset object also
    # records which label value is favorable/unfavorable, that record is not
    # updated here -- confirm nothing downstream reads it.
    ds.labels = np.abs(1 - ds.labels)

elif data == 'bank':
    protect_attr = 'age'
    label = 'y'
    ds = BankDataset()

elif data == 'german':
    protect_attr = 'sex'
    label = 'credit'
    ds = GermanDataset()
    # make 0 the unfavorable label for consistency
    # NOTE(review): as for compas, only the label array itself is changed.
    ds.labels[ds.labels == 2] = 0

else:
    # Fail here with a clear message instead of a NameError on `ds`,
    # `protect_attr` or `label` in a later cell.
    raise ValueError(
        f"Unknown dataset {data!r}; expected one of "
        "'adult', 'medical', 'compas', 'bank', 'german'."
    )



In [4]:
# 60 / 20 / 20 train / validation / test partition: 60% is split off first,
# then the remainder is halved.  Both shuffles are seeded so that
# Restart & Run All reproduces the same partition (and therefore the same
# statistics and weights computed from it further down).
SPLIT_SEED = 0
tr_ds, tmp_ds = ds.split([0.6], shuffle=True, seed=SPLIT_SEED)
v_ds, te_ds = tmp_ds.split([0.5], shuffle=True, seed=SPLIT_SEED)

In [5]:
# For every split keep only the first item returned by convert_to_dataframe().
df_tr, df_v, df_te = (
    split_ds.convert_to_dataframe()[0] for split_ds in (tr_ds, v_ds, te_ds)
)

In [6]:
def get_ds_stats(df, protect_attr='sex', label='income-per-year'):
    """Return the relative value frequencies of two columns of ``df``.

    Args:
        df: dataframe containing both columns.
        protect_attr: name of the protected-attribute column.
        label: name of the label column.

    Returns:
        A pair ``(protect_ratio, label_ratio)``; each is the Series produced
        by ``value_counts(normalize=True)`` on the respective column.
    """
    def _share(column):
        # Fraction of rows taking each distinct value of `column`.
        return df[column].value_counts(normalize=True)

    return _share(protect_attr), _share(label)

In [7]:
# Marginal frequencies of the protected attribute and of the label on the
# training split; later cells reuse both Series.
protect_ratio, label_ratio = get_ds_stats(df=df_tr, protect_attr=protect_attr, label=label)
display(protect_ratio, label_ratio)

1.0    0.971682
0.0    0.028318
Name: age, dtype: float64

0.0    0.874863
1.0    0.125137
Name: y, dtype: float64

In [8]:
def _observed_joint(attr_value, label_value):
    """Fraction of training rows with the given attribute and label values."""
    selected = df_tr[(df_tr[protect_attr] == attr_value) & (df_tr[label] == label_value)]
    return selected.shape[0] / df_tr.shape[0]


# Suffix convention: `p` / `up`     -> rows with protect_attr == 1 / == 0
#                    `fav` / `unfav` -> rows with label == 1 / == 0

# Expected joint probability of each combination if attribute and label were
# independent: the product of the two marginal frequencies.
pr_exp_p_fav = protect_ratio[1] * label_ratio[1]
pr_exp_p_unfav = protect_ratio[1] * label_ratio[0]
pr_exp_up_fav = protect_ratio[0] * label_ratio[1]
pr_exp_up_unfav = protect_ratio[0] * label_ratio[0]

# Joint probability actually observed in the training split.
pr_obs_p_fav = _observed_joint(1, 1)
pr_obs_p_unfav = _observed_joint(1, 0)
pr_obs_up_fav = _observed_joint(0, 1)
pr_obs_up_unfav = _observed_joint(0, 0)

# Weight of each combination: expected / observed.
w_p_fav = pr_exp_p_fav / pr_obs_p_fav
w_p_unfav = pr_exp_p_unfav / pr_obs_p_unfav
w_up_fav = pr_exp_up_fav / pr_obs_up_fav
w_up_unfav = pr_exp_up_unfav / pr_obs_up_unfav

kamiran_weights = torch.tensor([w_p_fav, w_p_unfav, w_up_fav, w_up_unfav]).float()
display(kamiran_weights)
display(pr_obs_p_fav, pr_obs_p_unfav, pr_obs_up_fav, pr_obs_up_unfav)

tensor([1.0254, 0.9965, 0.5402, 1.1386])

0.11857642685326919

0.8531051825934834

0.0065602449158101905

0.02175814563743713

In [9]:
# Label frequencies conditioned on the protected attribute: each observed
# joint frequency divided by the marginal frequency of its attribute value.
pr_fav_given_p, pr_unfav_given_p = (
    pr_obs_p_fav / protect_ratio[1.0],
    pr_obs_p_unfav / protect_ratio[1.0],
)
pr_fav_given_up, pr_unfav_given_up = (
    pr_obs_up_fav / protect_ratio[0.0],
    pr_obs_up_unfav / protect_ratio[0.0],
)

display(pr_fav_given_p, pr_fav_given_up, pr_unfav_given_p, pr_unfav_given_up)

0.12203218183864072

0.23166023166023167

0.8779678181613593

0.7683397683397684

In [10]:
def convert_df_to_torch(df, protect_attr='sex', label='income-per-year', scaler=None):
    """Turn a dataframe into a ``ut.DatasetWithMeta`` of float tensors.

    The protected attribute is removed from the features and passed along as
    per-row metadata instead; the remaining feature columns are standardised.

    Args:
        df: frame holding the features, the protected attribute and the
            target.  It is left unmodified.
        protect_attr: name of the protected-attribute column.
        label: name of the target column.  If ``df`` has no column of that
            name, the last remaining column is used as the target (the
            positional convention this function used previously).
        scaler: optional, already fitted scaler exposing ``transform``; lets
            the caller reuse statistics fitted on another split.  When
            omitted, a new ``StandardScaler`` is fitted on ``df`` itself.

    Returns:
        ``ut.DatasetWithMeta(X, y, meta)`` built from float tensors.
    """
    meta = df[protect_attr].values

    # Drop into a new frame rather than in place: mutating the caller's frame
    # made a second call on the same frame fail with a KeyError and silently
    # changed frames that other cells still use.
    features = df.drop(columns=[protect_attr])  # dropping demographics from the dataset
    if label in features.columns:
        y = features[label].values
        features = features.drop(columns=[label])
    else:
        y = features.iloc[:, -1].values
        features = features.iloc[:, :-1]
    X = features.values

    if scaler is None:
        scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    print(meta.shape, X.shape, y.shape)

    X = torch.tensor(X).float()
    y = torch.tensor(y).float()
    meta = torch.tensor(meta).float()
    return ut.DatasetWithMeta(X, y, meta)

In [11]:
# One torch dataset per split, built in train / validation / test order.
# NOTE(review): nothing is shared between the three calls, so each split is
# standardised with statistics fitted on that split alone -- confirm this is
# intended rather than reusing the training-split statistics.
torch_tr_ds = convert_df_to_torch(df=df_tr, protect_attr=protect_attr, label=label)
torch_v_ds = convert_df_to_torch(df=df_v, protect_attr=protect_attr, label=label)
torch_te_ds = convert_df_to_torch(df=df_te, protect_attr=protect_attr, label=label)

(18292,) (18292, 56) (18292,)
(6098,) (6098, 56) (6098,)
(6098,) (6098, 56) (6098,)


In [12]:
# torch.save(torch_tr_ds, f'../data/{data}_pytorch/{data}_train.pt')
# torch.save(torch_v_ds, f'../data/{data}_pytorch/{data}_val.pt')
# torch.save(torch_te_ds, f'../data/{data}_pytorch/{data}_test.pt')
# torch.save(kamiran_weights, f'../data/{data}_pytorch/kamiran_weights_{data}.pt')