# Fairness-Aware Feature Selection

## COMPAS Dataset

### 1) Preprocess data

**TODO:** merge the preprocessing modules and the imports into a single setup section.

In [4]:
#Load data 
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import log_loss
from scipy import optimize

# Relative path to the COMPAS scores CSV; it is resolved against the
# notebook's current working directory.
fp = '../data/compas-scores-two-years.csv'
# Raw, unprocessed frame; passed to process_compas_dataset in a later cell.
compas_df = pd.read_csv(fp)

In [146]:
def process_compas_dataset(compas_df):
    """Convert the raw COMPAS frame into shuffled, model-ready NumPy arrays.

    Processing steps:
      1. Keep only rows whose ``race`` is "Caucasian" or "African-American"
         and only the columns used below.
      2. Recode the label to -1/+1 and race, sex and charge degree to 0/1.
      3. Bucket jail stay (days) and priors count into three levels (0, 1, 2).
      4. Drop rows that still contain a missing value, then shuffle with a
         fixed seed (``random_state=0``).

    Parameters
    ----------
    compas_df : pandas.DataFrame
        Must contain the columns ``sex``, ``age``, ``race``, ``priors_count``,
        ``c_charge_degree``, ``c_jail_in``, ``c_jail_out`` and
        ``two_year_recid``. The frame is not modified.

    Returns
    -------
    y_label : numpy.ndarray
        -1 where ``two_year_recid == 0``, otherwise 1.
    protected_attr : numpy.ndarray
        1 for "Caucasian", 0 for "African-American".
    X : numpy.ndarray
        Feature matrix with columns, in order: ``age``, ``priors_count``
        (0: none, 1: 1-3, 2: more than 3), ``gender_cat`` (1 = "Female"),
        ``charge_cat`` (1 = "F"), ``length_stay`` (0: <= 7 days,
        1: 8-90 days, 2: > 90 days).
    """
    needed_columns = ["sex", "age", "race", "priors_count", "c_charge_degree",
                      "c_jail_in", "c_jail_out", "two_year_recid"]

    # Only select Caucasian/African-American rows. The explicit .copy() gives
    # an independent frame, so nothing below writes into a slice of the
    # caller's DataFrame (the source of pandas' SettingWithCopyWarning).
    # NOTE: missing values are deliberately NOT dropped on the full raw frame;
    # that would discard rows for NaNs in columns that are never used here.
    is_target_race = compas_df["race"].isin(["Caucasian", "African-American"])
    compas_subset = compas_df.loc[is_target_race, needed_columns].copy()

    # Length of stay in whole days (jail out - jail in); NaN when either
    # timestamp is missing.
    stay_days = (pd.to_datetime(compas_subset["c_jail_out"])
                 - pd.to_datetime(compas_subset["c_jail_in"])).dt.days

    priors = compas_subset["priors_count"]
    priors_bucket = np.select(
        [(priors == 0).to_numpy(),
         priors.between(1, 3).to_numpy(),
         (priors > 3).to_numpy()],
        [0, 1, 2],
        default=priors.to_numpy(),  # anything else (e.g. NaN) is left as-is
    )

    # Column order matters: it defines the column order of the returned X.
    # Age is kept as the raw numeric value; the age_cat column is not used.
    encoded = pd.DataFrame(
        {
            "age": compas_subset["age"],
            "priors_count": priors_bucket,
            "two_year_recid": np.where(compas_subset["two_year_recid"] == 0, -1, 1),
            "race_cat": (compas_subset["race"] == "Caucasian").astype(int),
            "gender_cat": (compas_subset["sex"] == "Female").astype(int),
            "charge_cat": (compas_subset["c_charge_degree"] == "F").astype(int),
            # Right-closed bins: (-inf, 7] -> 0, (7, 90] -> 1, (90, inf) -> 2.
            "length_stay": pd.cut(stay_days, bins=[-np.inf, 7, 90, np.inf], labels=False),
        },
        index=compas_subset.index,
    )

    # Remove rows with a missing value in any of the columns that are kept.
    encoded = encoded.dropna()

    y_label = encoded["two_year_recid"]
    protected_attribute = encoded["race_cat"]
    features = encoded.drop(columns=["two_year_recid", "race_cat"])

    # Fixed seed so the downstream positional train/test split is reproducible.
    y_label, protected_attr, features = shuffle(
        y_label, protected_attribute, features, random_state=0
    )

    return y_label.to_numpy(), protected_attr.to_numpy(), features.to_numpy()

In [147]:
# Build the arrays once, then split them by position: the first 6/7 of the
# (already shuffled) rows are used for training, the remaining 1/7 for testing.
y_label, protected_attr, X = process_compas_dataset(compas_df)

train_index = int(len(X) * 6. / 7.)

x_train, x_test = X[:train_index], X[train_index:]
y_train, y_test = y_label[:train_index], y_label[train_index:]
race_train, race_test = protected_attr[:train_index], protected_attr[train_index:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [148]:
# Baseline: logistic regression on all features, scored on the test split.
clf = LogisticRegression(random_state=0)
clf.fit(x_train, y_train)
print(f"Logistic Regression Accuracy: {clf.score(x_test, y_test)}")

# Row positions of each protected group inside the test split.
idx_race_1 = np.where(race_test == 1)[0]
idx_race_0 = np.where(race_test == 0)[0]
print(f"\tLogistic Regression Accuracy (Race = 1): {clf.score(x_test[idx_race_1], y_test[idx_race_1])}")
print(f"\tLogistic Regression Accuracy (Race = 0): {clf.score(x_test[idx_race_0], y_test[idx_race_0])}")

# Keep the fitted parameters and the training log-loss of this model.
coeff, intercept = clf.coef_, clf.intercept_
train_probabilities = clf.predict_proba(x_train)
optimal_loss = log_loss(y_train, train_probabilities)

Logistic Regression Accuracy: 0.6781065088757396
	Logistic Regression Accuracy (Race = 1): 0.670487106017192
	Logistic Regression Accuracy (Race = 0): 0.6834677419354839


In [153]:
def _print_svm_scores(model, x_eval):
    """Print the test accuracy of `model` on `x_eval`: overall, then per race group."""
    print(f"SVM Accuracy: {model.score(x_eval, y_test)}")
    print(f"\tSVM Accuracy (Race = 1): {model.score(x_eval[idx_race_1], y_test[idx_race_1])}")
    print(f"\tSVM Accuracy (Race = 0): {model.score(x_eval[idx_race_0], y_test[idx_race_0])}")
    print("\n")


# Reference model: linear SVM trained on every feature.
svm = SVC(kernel="linear").fit(x_train, y_train)
_print_svm_scores(svm, x_test)

# Leave-one-feature-out: retrain without each column in turn and report accuracy.
# Note: `svm` is rebound on every iteration, so after the loop it refers to the
# model trained without the last feature, not to the reference model above.
n_features = x_train.shape[1]
for id_feature in range(n_features):
    print(f"Omitting Feature {id_feature + 1}")

    idxs = [col for col in range(n_features) if col != id_feature]
    x_train_mod, x_test_mod = x_train[:, idxs], x_test[:, idxs]

    svm = SVC(kernel="linear").fit(x_train_mod, y_train)
    _print_svm_scores(svm, x_test_mod)

SVM Accuracy: 0.6816568047337278
	SVM Accuracy (Race = 1): 0.6934097421203438
	SVM Accuracy (Race = 0): 0.6733870967741935


Omitting Feature 1
SVM Accuracy: 0.6402366863905326
	SVM Accuracy (Race = 1): 0.6446991404011462
	SVM Accuracy (Race = 0): 0.6370967741935484


Omitting Feature 2
SVM Accuracy: 0.6106508875739645
	SVM Accuracy (Race = 1): 0.6103151862464183
	SVM Accuracy (Race = 0): 0.6108870967741935


Omitting Feature 3
SVM Accuracy: 0.6781065088757396
	SVM Accuracy (Race = 1): 0.6991404011461319
	SVM Accuracy (Race = 0): 0.6633064516129032


Omitting Feature 4
SVM Accuracy: 0.6781065088757396
	SVM Accuracy (Race = 1): 0.6876790830945558
	SVM Accuracy (Race = 0): 0.6713709677419355


Omitting Feature 5
SVM Accuracy: 0.663905325443787
	SVM Accuracy (Race = 1): 0.673352435530086
	SVM Accuracy (Race = 0): 0.657258064516129


