In [301]:
import pandas as pd
import numpy as np
import warnings
import itertools
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [2]:
# Load the COMPAS two-year scores CSV from the relative ../data directory.
compas_scores = pd.read_csv('../data/compas-scores-two-years.csv')
# Names of the columns this notebook treats as protected attributes.
protected_attributes = ['sex','race']
compas_scores.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


# Data Cleaning
Columns removed
- columns with more than 10% of data missing

Rows removed
- rows where the recidivist flag `is_recid` is -1 (no COMPAS case could be found)
- rows where the charge date of the defendant's COMPAS-scored crime was not within 30 days of the arrest
- ordinary traffic offenses, i.e. rows with a `c_charge_degree` of 'O'

In [267]:
def categorize_numerical_col(num, lim1, lim2):
    """Bucket a number into one of three ordinal categories.

    Parameters
    ----------
    num : number
        Value to categorize.
    lim1, lim2 : number
        Upper bounds (inclusive) of the first and second bucket.

    Returns
    -------
    int
        0 if ``num <= lim1``, 1 if ``lim1 < num <= lim2``, 2 if ``num > lim2``.

    Raises
    ------
    ValueError
        If none of the comparisons holds (for example when ``num`` is NaN).
    """
    if num <= lim1:
        return 0
    elif lim1 < num <= lim2:
        return 1
    elif num > lim2:
        return 2
    # Raising a plain string is itself a TypeError in Python 3; use a real exception.
    raise ValueError(f'Invalid row: {num!r}')
def categorize_age(age_cat):
    """Map an ``age_cat`` label to its ordinal code.

    Parameters
    ----------
    age_cat : str
        One of 'Less than 25', '25 - 45' or 'Greater than 45'.

    Returns
    -------
    int
        0, 1 or 2 respectively.

    Raises
    ------
    ValueError
        If ``age_cat`` is not one of the three known labels.
    """
    age_codes = {'Less than 25': 0, '25 - 45': 1, 'Greater than 45': 2}
    try:
        return age_codes[age_cat]
    except (KeyError, TypeError):
        # Raising a plain string is itself a TypeError in Python 3; use a real exception.
        raise ValueError(f'Invalid row: {age_cat!r}') from None

In [371]:
# Data Cleaning

# Drop columns with more than 10% missing values, then drop every row that still has a NaN
percent_missing = compas_scores.isnull().sum() * 100 / len(compas_scores)
missing_value_df = pd.DataFrame({'column_name': compas_scores.columns,
                                 'percent_missing': percent_missing})
missing_value_df = missing_value_df.sort_values('percent_missing', ascending=False)
cols2keep_df = missing_value_df[~(missing_value_df.percent_missing > 10)]
cols2keep_df_list = cols2keep_df.column_name.tolist()
compas_scores_cols_trim = compas_scores[cols2keep_df_list]
compas_scores_cols_trim_dropna = compas_scores_cols_trim.dropna()

# Apply cleaning described in publication of data HERE: https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
compas_df = compas_scores_cols_trim_dropna[
    (compas_scores_cols_trim_dropna['days_b_screening_arrest'] <= 30) &
    (compas_scores_cols_trim_dropna['days_b_screening_arrest'] >= -30) &
    (compas_scores_cols_trim_dropna['is_recid'] != -1) &
    (compas_scores_cols_trim_dropna['c_charge_degree'] != "O") &
    (compas_scores_cols_trim_dropna['score_text'] != 'N/A')
]

# Select columns described in https://arxiv.org/abs/2106.00772 only which are (Age, Charge Degree, Gender, Prior Counts, Length Of Stay, race)
compas_subset_df = compas_df[["sex", "age", "age_cat", "race", "priors_count.1", "c_charge_degree",
                              "c_jail_in", "c_jail_out", "two_year_recid"]]

# Select only African American and Caucasian.
# Take an explicit copy: the columns added/overwritten below must not be written into a slice of compas_df.
compas_subset_df = compas_subset_df[compas_subset_df["race"].isin(['Caucasian', 'African-American'])].copy()

# Add length of stay (whole days between jail-in and jail-out) and drop "c_jail_in", "c_jail_out"
length_of_stay = pd.to_datetime(compas_subset_df["c_jail_out"]) - pd.to_datetime(compas_subset_df["c_jail_in"])
compas_subset_df["length_stay"] = length_of_stay.dt.days
compas_subset_df = compas_subset_df.drop(columns=["c_jail_in", "c_jail_out"])
compas_subset_df['length_stay'] = compas_subset_df["length_stay"].apply(categorize_numerical_col, lim1=7, lim2=90)

# Categorize prior counts according to https://arxiv.org/abs/2106.00772
compas_subset_df['priors_count.1'] = compas_subset_df["priors_count.1"].apply(categorize_numerical_col, lim1=0, lim2=3)

# Categorize age according to https://arxiv.org/abs/2106.00772
compas_subset_df['age_cat'] = compas_subset_df["age_cat"].apply(categorize_age)

# Encode categories (one encoder per column so each keeps its own categories_)
race_encoder = OrdinalEncoder(dtype='int')
compas_subset_df['race'] = race_encoder.fit_transform(compas_subset_df[['race']])

sex_encoder = OrdinalEncoder(dtype='int')
compas_subset_df['sex'] = sex_encoder.fit_transform(compas_subset_df[['sex']])

charge_encoder = OrdinalEncoder(dtype='int')
compas_subset_df['c_charge_degree'] = charge_encoder.fit_transform(compas_subset_df[['c_charge_degree']])

# Create protected attribute
protected_attribute = compas_subset_df["race"]

# Target Variable
target_variable = compas_subset_df['two_year_recid']

# Feature set (raw age is dropped because the categorized age_cat is kept)
feature_df = compas_subset_df.drop(['two_year_recid', 'race', 'age'], axis=1)

X_train, X_test, y_train, y_test, protected_train, protected_test = train_test_split(
    feature_df.to_numpy(), target_variable.to_numpy(), protected_attribute.to_numpy(),
    test_size=0.2, random_state=42)

# Sanity check: the split must partition the feature rows.
assert len(X_train) + len(X_test) == len(feature_df)


In [374]:
X_train.shape

(4218, 5)

In [377]:
y_train.reshape(-1, 1).shape

(4218, 1)

In [323]:
# Enumerate every non-empty subset of the feature indices, leaving out feature 0.
# NOTE(review): get_feature_subsets is not defined above this cell in the notebook;
# it has to be defined in the kernel before this cell runs.
num_features = X_train.shape[1]
feat_idx = list(range(1, num_features))
# Store the subsets under their own name: assigning the list to `get_feature_subsets`
# would overwrite the generator function and make this cell fail when re-run.
feature_subsets = [x for x in get_feature_subsets(feat_idx) if len(x) > 0]

In [478]:
def unique_info_array(data):
    """Return, for each column of a 2-D array, the list of its distinct values.

    The values of every column come from ``np.unique`` (so they are sorted) and
    are converted to plain Python lists; the outer list has one entry per column.
    """
    return [np.unique(data[:, column]).tolist() for column in range(data.shape[1])]

def unique_information_conditional(y, x_s, x_s_c, protected_attr):
    """Accumulate a conditional information-style score over value combinations.

    The four inputs are concatenated column-wise in the order
    ``y, x_s, x_s_c, protected_attr`` (so each must be 2-D with the same number
    of rows). Every combination in the Cartesian product of the per-column
    unique values is visited, and for each one the absolute value of
    ``r1_r2 * log(r1_r2 / r2) / r1_given_r2`` is added to the total.
    Combinations for which any of the involved frequencies is zero add nothing.

    Returns the accumulated total ``IQ``.

    NOTE(review): the number of visited combinations is the product of the
    per-column unique counts, so the cost grows quickly with the column count.
    """

    
    y_xs_protected_attr_xsc = np.concatenate((y, x_s, x_s_c, protected_attr), axis=1)
    
    ui_array = unique_info_array(y_xs_protected_attr_xsc)
    ui_array_cat_product = list(itertools.product(*ui_array)) # compute the cartesian product of all arrays
    
    # Using 
    # IQ(T; R1|R2) = ∑t,r1,r2 QT ,R1,R2 (t, r1, r2) log((QT |R1,R2 (t|r1,r2))/ (QT |R2 (t|r2))) 
    # NOTE(review): the term computed below divides log(r1_r2 / r2) by r1_given_r2
    # instead of taking the log of the ratio as written above — confirm which is intended.

    row_count     = len(y)
    col_count_y  = y.shape[1]
    col_count_xs = x_s.shape[1]


    IQ = 0
    for array in ui_array_cat_product:
        # Fraction of rows equal to the whole combination.
        r1_r2 = len(np.where((y_xs_protected_attr_xsc == array).all(axis=1))[0]) / row_count
        # Fraction of rows whose y part equals the leading col_count_y entries.
        r1 = len(np.where((y == array[:col_count_y]).all(axis=1))[0]) / row_count
        # Fraction of rows matching the columns after y, excluding the last col_count_xs columns.
        r2 = len(np.where((y_xs_protected_attr_xsc[:, col_count_y: -col_count_xs] == array[
            col_count_y: -col_count_xs]).all(axis=1))[0]) / row_count

        # Among rows matching the last col_count_xs columns, the share that also matches the y part.
        # NOTE(review): the trailing width is taken from x_s, although the concatenation above
        # places x_s_c and protected_attr last — confirm the intended conditioning columns.
        try:
            r1_given_r2 = len(np.where((y_xs_protected_attr_xsc[:, :col_count_y] == array[:col_count_y]).all(axis=1)
                                       & (y_xs_protected_attr_xsc[:, -col_count_xs:]  == array[
                                           -col_count_xs:]).all(axis=1))[0]) / len(np.where( \
                (y_xs_protected_attr_xsc[:, -col_count_xs:] == array[-col_count_xs:]).all(axis=1))[0])
        except ZeroDivisionError:
            # No row matches the trailing columns: treat the conditional frequency as zero.
            r1_given_r2 = 0

        # Skip terms that would need log(0) or a division by zero.
        if r1_r2 == 0 or r1 == 0 or r2 == 0 or r1_given_r2 == 0:
            temp = 0
        else:
            temp = r1_r2 * np.log(r1_r2 / r2) / r1_given_r2
        IQ += np.abs(temp)

    return IQ

# Build the inputs for the conditional measure.
# idx_xs_ui / idx_xsc_ui are not assigned in or above this cell; they must already
# exist in the kernel (the subset-enumeration loop further down assigns them).
y = y_train.reshape(-1, 1) # Add a dimension
x_s = X_train[:, idx_xs_ui]
x_s_c = X_train[:, idx_xsc_ui]
protected_attr = protected_train.reshape(-1, 1) # Add a dimension

# Despite its name, this variable holds the value returned by unique_information_conditional.
ui_array_cat_product = unique_information_conditional(y, x_s, x_s_c, protected_attr)
ui_array_cat_product
    


6.616911628774803

In [479]:
def unique_information(array_1, array_2):
    """Accumulate an information-style score between two column blocks.

    Parameters
    ----------
    array_1, array_2 : 2-D numpy arrays with the same number of rows.

    Returns
    -------
    The sum, over every distinct row ``(a1, a2)`` of the column-wise
    concatenation, of ``|p(a1, a2) * log(p(a1, a2) / p(a1)) / p(a1)|`` where the
    probabilities are empirical row frequencies. Returns 0 when there are no rows.

    Notes
    -----
    Value combinations that never occur have joint frequency 0 and contribute
    nothing, so only the observed rows are visited instead of the full Cartesian
    product of per-column unique values. The observed rows are visited in sorted
    order, which is the order the Cartesian product would have produced them in.
    """
    row_count = len(array_1)
    col_count_array_1 = array_1.shape[1]
    features_combined = np.concatenate((array_1, array_2), axis=1)

    # Distinct observed rows as tuples of plain Python scalars.
    observed_combinations = sorted(set(map(tuple, features_combined.tolist())))

    IQ = 0
    for array in observed_combinations:
        r1_r2 = np.count_nonzero((features_combined == array).all(axis=1)) / row_count
        r1 = np.count_nonzero((array_1 == array[:col_count_array_1]).all(axis=1)) / row_count

        # Guard against log(0) / division by zero (e.g. rows containing NaN never match).
        if r1_r2 == 0 or r1 == 0:
            continue
        IQ += np.abs(r1_r2 * np.log(r1_r2 / r1) / r1)
    return IQ

# Build the inputs for the measure.
# idx_xs_ui / idx_xsc_ui are not assigned in or above this cell; they must already
# exist in the kernel (the subset-enumeration loop further down assigns them).
y = y_train.reshape(-1, 1) # Add a dimension
x_s = X_train[:, idx_xs_ui]
x_s_c = X_train[:, idx_xsc_ui]
protected_attr = protected_train.reshape(-1, 1) # Add a dimension

# unique_information accepts exactly two arrays; passing all four raised a TypeError.
# NOTE(review): pairing the target with the selected features is an assumption —
# confirm which two arrays this measure is meant to compare.
ui_array_cat_product = unique_information(y, x_s)
ui_array_cat_product

In [454]:
import itertools
# NOTE(review): `temp` is only ever assigned inside functions in the visible cells;
# this line relies on a notebook-level `temp` left over in the kernel.
mylist = list(itertools.product(*temp)) # compute the cartesian product of all arrays

In [456]:
(mylist)

[(0, 0, 0, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 0, 1),
 (0, 0, 0, 0, 0, 1, 0),
 (0, 0, 0, 0, 0, 1, 1),
 (0, 0, 0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1, 0, 1),
 (0, 0, 0, 0, 1, 1, 0),
 (0, 0, 0, 0, 1, 1, 1),
 (0, 0, 0, 0, 2, 0, 0),
 (0, 0, 0, 0, 2, 0, 1),
 (0, 0, 0, 0, 2, 1, 0),
 (0, 0, 0, 0, 2, 1, 1),
 (0, 0, 0, 1, 0, 0, 0),
 (0, 0, 0, 1, 0, 0, 1),
 (0, 0, 0, 1, 0, 1, 0),
 (0, 0, 0, 1, 0, 1, 1),
 (0, 0, 0, 1, 1, 0, 0),
 (0, 0, 0, 1, 1, 0, 1),
 (0, 0, 0, 1, 1, 1, 0),
 (0, 0, 0, 1, 1, 1, 1),
 (0, 0, 0, 1, 2, 0, 0),
 (0, 0, 0, 1, 2, 0, 1),
 (0, 0, 0, 1, 2, 1, 0),
 (0, 0, 0, 1, 2, 1, 1),
 (0, 0, 0, 2, 0, 0, 0),
 (0, 0, 0, 2, 0, 0, 1),
 (0, 0, 0, 2, 0, 1, 0),
 (0, 0, 0, 2, 0, 1, 1),
 (0, 0, 0, 2, 1, 0, 0),
 (0, 0, 0, 2, 1, 0, 1),
 (0, 0, 0, 2, 1, 1, 0),
 (0, 0, 0, 2, 1, 1, 1),
 (0, 0, 0, 2, 2, 0, 0),
 (0, 0, 0, 2, 2, 0, 1),
 (0, 0, 0, 2, 2, 1, 0),
 (0, 0, 0, 2, 2, 1, 1),
 (0, 0, 1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0, 0, 1),
 (0, 0, 1, 0, 0, 1, 0),
 (0, 0, 1, 0, 0, 1, 1),
 (0, 0, 1, 0, 1, 0, 0),
 (0, 0, 1, 0, 1,

In [438]:
y.shape

AttributeError: 'tuple' object has no attribute 'shape'

In [412]:
import copy
def get_feature_subsets(sc):
    """
    Generate all subsets of a feature set.

    Parameters
    ----------
    sc : sequence
        Feature indices (any sliceable sequence, typically a list).

    Yields
    ------
    list
        Every subset of ``sc`` exactly once, as a fresh list. For ``[1, 2]`` the
        order is ``[1, 2], [2], [1], []``. An empty input yields the single
        empty subset.
    """
    if len(sc) == 0:
        # The only subset of the empty set is the empty set itself.
        yield []
        return
    head = sc[0]
    for item in get_feature_subsets(sc[1:]):
        # Each subset of the tail appears once with and once without the head.
        yield [head] + item
        yield item
            
# def shapley_marg_accuracy():
    

num_features = X_train.shape[1]
feat_idx = list(range(1, num_features))  # every feature index except 0
# Keep the subsets under their own name so the get_feature_subsets generator is not overwritten.
feature_subsets = [x for x in get_feature_subsets(feat_idx) if len(x) > 0]
count = 0

for sc_idx in feature_subsets:
    print(sc_idx)

    # Compute v(T ∪ {i}): the subset plus feature index `count`, built as a new list
    idx_xs_ui = sc_idx + [count]
    idx_xsc_ui = list(set(range(num_features)).difference(idx_xs_ui))  # complement of x_s

    # Compute v(T): every feature except `count` and except the subset itself
    idx_xsc = list(range(num_features))
    idx_xsc.pop(count)
    print(idx_xsc)
    idx_xsc = list(set(idx_xsc).difference(sc_idx))

    # Debug limit: stop after the second subset. The index variables left behind
    # by this loop are read by other cells, so the limit is kept as is.
    if count == 1:
        break
    count = count + 1
            


            
# protected_attr_xsc = np.concatenate((x_s_c, protected_attr), axis=1)
# xs_protected_attr_xsc = np.concatenate((x_s, protected_attr_xsc), axis=1)    
# y_xs_protected_attr_xsc = np.concatenate((y, xs_protected_attr_xsc), axis=1)
# concat_uniq_vals = get_uniq_vals_in_arr(concat_mat)

[1, 2, 3, 4]
[1, 2, 3, 4]
[2, 3, 4]
[0, 2, 3, 4]


In [427]:
idx_xsc_ui

[0]

In [428]:
idx_xs_ui

[2, 3, 4, 1]

In [408]:
list(set(list(range(num_features))).difference(set(idx_xs_incl)))

[0]

In [399]:
list(range(num_features))

[0, 1, 2, 3, 4]

In [395]:
list(set(idx_xsc).difference(set(sc_idx)))

[]

In [388]:
sc_idx

[2, 3, 4]

In [None]:
def accuracy_measure(y, x_s, x_s_c, protected_attr):
    """Score ``x_s`` against ``y`` given ``x_s_c`` and ``protected_attr``.

    Parameters
    ----------
    y, x_s, x_s_c, protected_attr : 2-D numpy arrays with the same number of rows.

    Returns
    -------
    The value computed by ``unique_information_conditional`` for the same
    arguments.

    Notes
    -----
    The previous body spelled out the same loop as
    ``unique_information_conditional`` but referred to names that are not
    defined anywhere in this notebook (``get_uniq_vals_in_arr``, ``concat_mat``,
    ``left``, ``get_conditional_info_coef``, ``conditional``), so every call
    raised ``NameError``. It now delegates to the existing implementation.
    NOTE(review): confirm that the conditional measure is the intended accuracy score.
    """
    return unique_information_conditional(y, x_s, x_s_c, protected_attr)

In [241]:
race_encoder.categories_

[array(['African-American', 'Caucasian'], dtype=object)]

In [196]:
# compas_scores_cols_trim_dropna = compas_scores
# Take an explicit copy: the columns overwritten below must not be written into a slice
# of compas_scores_cols_trim_dropna.
clean_df = compas_scores_cols_trim_dropna[
    (compas_scores_cols_trim_dropna['days_b_screening_arrest'] <= 30) &
    (compas_scores_cols_trim_dropna['days_b_screening_arrest'] >= -30) &
    (compas_scores_cols_trim_dropna['is_recid'] != -1) &
    (compas_scores_cols_trim_dropna['c_charge_degree'] != "O") &
    (compas_scores_cols_trim_dropna['score_text'] != 'N/A')
].copy()

# Convert the date columns to seconds since the Unix epoch (float).
# Dividing the offset from the epoch by a one-second Timedelta does not depend on the
# platform's integer width or on the datetime resolution pandas picks when parsing.
date_columns = ['c_jail_out', 'c_jail_in', 'out_custody', 'in_custody',
                'v_screening_date', 'screening_date', 'compas_screening_date', 'dob']
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
for date_column in date_columns:
    clean_df[date_column] = (pd.to_datetime(clean_df[date_column]) - unix_epoch) / one_second

# Normalise the free-text charge description.
clean_df['c_charge_desc'] = clean_df['c_charge_desc'].str.strip().str.lower()

# NOTE: `y` is rebound here to the target Series (other cells use `y` for a reshaped array).
y = clean_df['two_year_recid']
features = clean_df.drop('two_year_recid', axis=1)

In [197]:
# Columns holding categorical / text values; everything else in `features` is treated as numeric.
categorical_cols = [
    'c_charge_desc', 'v_score_text', 'score_text', 'sex', 'age_cat', 'race', 'c_charge_degree',
    'c_case_number', 'v_type_of_assessment', 'type_of_assessment', 'name', 'first', 'last',
]
features_num = features.drop(columns=categorical_cols)


In [215]:
from sklearn.preprocessing import OrdinalEncoder

categorical_cols = ['c_charge_desc','v_score_text','score_text','sex','age_cat','race','c_charge_degree',
                    'c_case_number','v_type_of_assessment','type_of_assessment','name','first','last']

# instantiate ordinalencoder object
cat_encoder = OrdinalEncoder()

# Fit once on all categorical columns. OrdinalEncoder encodes each column independently,
# so the codes equal those of a per-column fit, and cat_encoder.categories_ keeps the
# categories of every column instead of only the last one fitted.
encoded_categories = cat_encoder.fit_transform(clean_df[categorical_cols])

# NOTE: this overwrites the label columns of clean_df in place; re-running the cell
# re-encodes the already encoded values.
for column_position, column_name in enumerate(categorical_cols):
    clean_df[column_name] = encoded_categories[:, column_position]

In [204]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(, y, test_size=0.33, random_state=42)

# Scale the numeric columns and ordinal-encode the categorical ones in a single transformer.
num_attribs = features_num.columns.tolist()
cat_attribs = categorical_cols
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_attribs),
        ("cat", OrdinalEncoder(), cat_attribs),
    ]
)
df_prepared = full_pipeline.fit_transform(features)

In [194]:
from sklearn.linear_model import LogisticRegression
# Fit on the whole prepared matrix, then spot-check the predictions for the first five rows.
log_reg = LogisticRegression().fit(df_prepared, y)

some_data = features.head(5)
some_labels = y.head(5)
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", log_reg.predict(some_data_prepared))

Predictions: [0 1 1 0 1]


In [206]:
df_prepared.shape

(6167, 38)

In [209]:
df_prepared.columns

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [217]:
clean_df[['race']].value_counts() 

race
0.0     3173
2.0     2100
3.0      509
5.0      343
1.0       31
4.0       11
dtype: int64