In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("../data/compas-scores-two-years.csv")

In [5]:
# Drop unrelated columns
df=df.drop(columns=['id', 'name', 'first', 'last',
                    'compas_screening_date','dob','age','c_jail_in', 
                    'c_jail_out', 'c_case_number','c_offense_date','c_charge_desc', 
                    'c_arrest_date','r_charge_desc',
                    'r_case_number','r_charge_desc','r_offense_date', 
                    'r_jail_in', 'r_jail_out','violent_recid','vr_case_number',
                    'vr_offense_date', 'vr_charge_desc', 'screening_date',
                    'v_screening_date','in_custody','out_custody','r_charge_degree',
                    'r_days_from_arrest','vr_charge_degree','type_of_assessment',
                    'v_type_of_assessment' ])
df.head()

Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,...,is_violent_recid,decile_score.1,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,two_year_recid
0,Male,Greater than 45,Other,0,1,0,0,0,-1.0,1.0,...,0,1,Low,1,Low,0,0,327,0,0
1,Male,25 - 45,African-American,0,3,0,0,0,-1.0,1.0,...,1,3,Low,1,Low,0,9,159,1,1
2,Male,Less than 25,African-American,0,4,0,1,4,-1.0,1.0,...,0,4,Low,3,Low,4,0,63,0,1
3,Male,Less than 25,African-American,0,8,1,0,1,,1.0,...,0,8,High,6,Medium,1,0,1174,0,0
4,Male,25 - 45,Other,0,1,0,0,2,,76.0,...,0,1,Low,1,Low,2,0,1102,0,0


In [6]:
df.columns

Index(['sex', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_days_from_compas', 'c_charge_degree',
       'is_recid', 'is_violent_recid', 'decile_score.1', 'score_text',
       'v_decile_score', 'v_score_text', 'priors_count.1', 'start', 'end',
       'event', 'two_year_recid'],
      dtype='object')

In [7]:
df.shape

(7214, 22)

In [9]:
# Filter only two races
df = df[(df.race=='African-American') | (df.race=='Caucasian')]
df = df.dropna()

df.shape

(5915, 22)

In [10]:
label_column = ['two_year_recid']
catogory_features = []
numeric_features = []

for col in df.columns.values:
    if col in label_column:
        continue
    elif df[col].dtypes in ('int64', 'float64') :
        numeric_features += [col]
    else:
        catogory_features += [col]
        
print("categorical:", catogory_features)
print("numerical:", numeric_features)

categorical: ['sex', 'age_cat', 'race', 'c_charge_degree', 'score_text', 'v_score_text']
numerical: ['juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'days_b_screening_arrest', 'c_days_from_compas', 'is_recid', 'is_violent_recid', 'decile_score.1', 'v_decile_score', 'priors_count.1', 'start', 'end', 'event']


In [16]:
# Now we replace categorical columns with numeric values
df_num = df.copy()
feat2name = {}
encoders = {}

# Use Label Encoder for categorical columns (including target column)
for feature in catogory_features:
    encoder = LabelEncoder()
    encoder.fit(df_num[feature])
    
    df_num[feature] = encoder.transform(df_num[feature])
    
    feat2name[feature] = encoder.classes_
    encoders[feature] = encoder
df_num.head()

Unnamed: 0,sex,age_cat,race,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,...,is_violent_recid,decile_score.1,score_text,v_decile_score,v_score_text,priors_count.1,start,end,event,two_year_recid
1,1,0,0,0,3,0,0,0,-1.0,1.0,...,1,3,1,1,1,0,9,159,1,1
2,1,2,0,0,4,0,1,4,-1.0,1.0,...,0,4,1,3,1,4,0,63,0,1
6,1,0,1,0,6,0,0,14,-1.0,1.0,...,0,6,2,2,1,14,5,40,1,1
8,0,0,1,0,1,0,0,0,-1.0,1.0,...,0,1,1,1,1,0,2,747,0,0
9,1,2,1,0,3,0,0,1,428.0,308.0,...,1,3,1,5,2,1,0,428,1,1


In [27]:
encoders['race'].classes_

array(['African-American', 'Caucasian'], dtype=object)

In [19]:
data_train, data_test = train_test_split(df_num, test_size=0.2)
data_train, data_val= train_test_split(data_train, test_size=0.2)

In [None]:
# First, we will define some of the constants and functions mentioned in the paper
N = df.shape[0]  # number of samples in X
sens_N =   # number of sensitive data in X
D = df.shape[1]  # Dimension of x vector
K = 10  # Number of prototypes represented in Z


In [56]:
def d(x1, x2, alpha):
    """
        Calculates the euclidean distance between x1 and x2 with feature weights alpha
        x1: First vector in X vector space (D, 1)
        x2: Second vector in X vector space (D, 1)
        alpha: weight vector for each of the features (D, 1)
    """
    print(np.multiply(np.multiply((x1 - x2), (x1 - x2)), alpha))
    return int(sum(np.multiply(np.multiply((x1 - x2), (x1 - x2)), alpha)))

In [57]:
# Test 
d(np.matrix([1,2,3]).T, np.matrix([0,0,0]).T, np.matrix([1,1,2]).T)

[[ 1]
 [ 4]
 [18]]


23

In [58]:
# To save time for later, we will cache the distance map between all inputs X_i 
# and current prototypes V_k
def d_map(X, V, alpha):
    """
        Returns a 2D matrix with shape (N, K) with each cell (i, j) 
            distance from input x_i to prototype v_j with weighted features
        X: Input matrix (N, D)
        V: Prototype matrix (K, D)
        alpha: weight vector for each of the features (D, 1)
    """
    distance_map = np.zeros((X.shape[0], V.shape[0]))
    for i in range(X.shape[0]):
        for j in range(V.shape[0]):
            distance_map[i, j] = d(X[i, ].T, V[j, ].T, alpha)
            
    return distance_map

In [59]:
# Test
d_map(np.matrix([[1,2],[3,4],[6,7]]), np.matrix([[10,2],[3,40]]), np.matrix([[1.0],[1.0]]))

[[81.]
 [ 0.]]
[[   4.]
 [1444.]]
[[49.]
 [ 4.]]
[[   0.]
 [1296.]]
[[16.]
 [25.]]
[[   9.]
 [1089.]]


array([[  81., 1448.],
       [  53., 1296.],
       [  41., 1098.]])

In [68]:
def M_nk(X, n, V, k, alpha):
    """
        Calculate the prob of X_n is classified to kth prototype using softmax
        X: Input matrix (N, D)
        n: the nth input to calculate the prob for
        V: prototype matrix (K, D)
        k: the kth prototype to classify for
        alpha: weight vector for each of the features (D, 1)
    """
    p = 0
    
    dist_map = d_map(X, V, alpha)
    exponent = np.exp(-1 * dist_map[n, k])
    summation = 0
    for k_idx in range(V.shape[0]):
        summation += np.exp(-1 * dist_map[n, k_idx])
    # To avoid value error
    if (summation == 0): 
        summation = 0.000001
    p = exponent / summation
    return p

In [70]:
M_nk(np.matrix([[1,2],[3,4],[6,7]]), 1, np.matrix([[10,2],[3,40]]), 0, np.matrix([[1.0],[1.0]]))

[[81.]
 [ 0.]]
[[   4.]
 [1444.]]
[[49.]
 [ 4.]]
[[   0.]
 [1296.]]
[[16.]
 [25.]]
[[   9.]
 [1089.]]


1.0

In [71]:
# To save time later, we will cache the probs of each x mapped to k
def M_map(X, V, alpha):
    """
        Return the prob of each x mapping to a prototype v (N, K)
        X: Input matrix (N, D)
        V: Prototype matrix (K, D)
        alpha: weight vector for each of the features (D, 1)
    """
    M = np.zeros((N, K))
    for i in range(X.shape[0]):
        for j in range(V.shape[0]):
            M[i, j] = M_nk(X, i, V, j, alpha)
    return M
    

In [None]:
def M_sub_k(M_sub_map, k):
    """
        Calculate estimated prob of mapping to k for a subset M_map.
        M_sub_map: prob of each x mapping to a prototype (N0, K)
        k: Target prototype k
    """
    Ms = np.zeros(M_sub_map.shape[1])
    
    for k in range(M_sub_map.shape[1]):
        for n in range(M_sub_map.shape[0]):
            Ms[k] += M_sub_map[n, k]
        Ms[k] /= M_sub_map.shape[0]
    return Ms

In [72]:
def x_hats(M, V):
    """
        Return a matrix of reconstructed x through M 
            using each of the prototypes. (N, D)
        M: M_map output (N, K)
        V: Prototype matrix (K, D)
    """
    return np.matmul(M, V)

In [73]:
def y_hats(M, w):
    """
        Return matrix of final estimates of each input through M and trained w.
        M: M_map output (N, K)
        w: Model weight between 0 and 1 (K, 1)
    """
    y_hats = np.zeros(N)
    for n in range(M.shape[0]):
        for k in range(M.shape[1]):
            y_hat[n] += (M[n, k] * w[k])
        # Clipping estimates to (0, 1)
        y_hat[n] = 0.000001 if y_hat[n] <= 0 else y_hat[n]
        y_hat[n] = 0.999999 if y_hat[n] >= 1 else y_hat[n]
    return y_hat

In [None]:
def L_x(X, x_hats):
    """
        Loss term for goodness of the prototype.
        X: input matrix (N, D)
        x_hats: x estimates (N, D)
    """
    Lx = 0
    for n in range(X.shape[0]):
        for d in range(X.shape[1]):
            Lx += np.matmul((X[n, d] - x_hats[n, d]), (X[n, d] - x_hats[n, d]))
    return Lx

In [74]:
def L_y(ys, y_hats):
    """
        Loss term for accuracy of the model
        ys: Gound-truth label of X (N, 1)
        y_hats: y estimates (N, 1)
    """
    Ly = 0
    for n in range(ys.shape[0]): 
        Ly += (-1 * ys[n] * np.log(y_hats[n]) - (1 - ys[n]) * (np.log(1 - y_hats[n])))
    return Ly

In [None]:
def L_z(M_sens, M_nonsens):
    """
        Loss term for fairness.
        M_sens: M_map for sensitive data (N0, K)
        M_nonsens: M_map for non-sensitive data (N-N0, K)
    """
    Lz= 0
    
    for k in range(M_sens.shape[1]):
          Lz += abs(M_sens[k] - M_nonsens[k])
    return Lz

In [None]:
class LFR():
    def __init__(
        train_data,
        val_data,
        label_column,
        sensitive_column,
        privileged_group,
        k,
        Ax,
        Ay,
        Az,
        iterations=50
    ):
        self.k = k
        self.Ax = Ax
        self.Ay = Ay
        self.Az = Az
        self.iterations = iterations
        
        self.train_data = train_data
        self.val_data = val_data
        self.label_column = label_column
        self.sensitive_column = sensitive_column
        self.privileged_group = privileged_group
        
        train_copy = train_data.copy()
        X = train_copy.drop(columns=[label_column])
        y = train_copy[label_column]
        
        sens = train_data[sensitive_column]
        priv_idx = np.array(np.where(sens==privileged_group))[0].flatten()
        nonpriv_idx = np.array(np.where(sens!=privileged_group))[0].flatten()
        self.X_plus = X[priv_idx,:]
        self.y_plus = y[priv_idx,:]
        self.X_minus = X[nonpriv_idx,:]
        self.y_minus = y[nonpriv_idx,:]
        
    def fit(init_alpha):
        
    def forward(flattened_params):
        """
            
        """
        