In [27]:
import pandas as pd

# Reading data:
df= pd.read_csv("compas-scores-two-years.csv")

# Subsetting only "African-American" and "Caucasian" races:
df = df.loc[df["race"].isin(["African-American", "Caucasian"])]

# Replacing "African-American" and "Caucasian" labels with 0 and 1:
df["race"].replace(['African-American', 'Caucasian'],[0, 1], inplace=True)

# Label encoding "score text" variable:
df["score_text"].replace(["Low","Medium","High"],[0,1,2],inplace=True)

# Replacing "Male" and "Female" labels with 0 and 1:
df["sex"].replace(["Male","Female"],[0,1],inplace=True)

In [28]:
# Target variable "two_year_recid"
y = df["two_year_recid"]

# Predictors:
X = pd.DataFrame(df, columns=["race","age","priors_count","juv_fel_count","juv_misd_count","juv_other_count",'decile_score','score_text',"sex"])

# Splitting into train-valid-test sets:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=879, random_state=42, stratify=y)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
X_train, X_valid, y_train, y_valid=train_test_split(X_train,y_train,test_size=879, random_state=42, stratify=y_train)
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(5271, 9) (5271,) (879, 9) (879,)
(4392, 9) (4392,) (879, 9) (879,)


# Baseline

In [29]:
# Fitting logistic regression model:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

# Accuracy of logistic regression model:
print("Accuracy:", clf.score(X_test,y_test))

Accuracy: 0.7007963594994312


In [30]:
index_b= (X_test["race"]==0)
X_test_b=X_test[index_b]
y_test_b=y_test[index_b]
X_test_w=X_test[-index_b]
y_test_w=y_test[-index_b]

In [31]:
# Calibration score (Accuracy differnce between 2 groups)
print(clf.score(X_test_b,y_test_b))
print(clf.score(X_test_w,y_test_w))
print("calibration score: ",abs(clf.score(X_test_b,y_test_b)-clf.score(X_test_w,y_test_w)))

0.6893203883495146
0.717032967032967
calibration score:  0.02771257868345245


# Determine explanatory variable 'e'

In [32]:
import numpy as np
print(np.sum(clf.predict(X_test_b)==1)/len(X_test_b))
print(np.sum(clf.predict(X_test_w)==1)/len(X_test_w))
D_all=np.sum(clf.predict(X_test_b)==1)/len(X_test_b)-np.sum(clf.predict(X_test_w)==1)/len(X_test_w)
print("D_all:", D_all)

0.5009708737864078
0.2554945054945055
D_all: 0.24547636829190234


In [33]:
index_b_m= (X_test_b["sex"]==0)
X_test_b_m=X_test_b[index_b_m]
y_test_b_m=y_test_b[index_b_m]
X_test_b_f=X_test_b[-index_b_m]
y_test_b_f=y_test_b[-index_b_m]

index_w_m= (X_test_w["sex"]==0)
X_test_w_m=X_test_w[index_w_m]
y_test_w_m=y_test_w[index_w_m]
X_test_w_f=X_test_w[-index_w_m]
y_test_w_f=y_test_w[-index_w_m]
print(X_test_b_m.shape,X_test_b_f.shape,X_test_w_m.shape,X_test_w_f.shape)

(427, 9) (88, 9) (290, 9) (74, 9)


In [34]:
#e_0
p_plus_e_0=(np.sum(clf.predict(X_test_b_m)==1)/len(X_test_b_m)-np.sum(clf.predict(X_test_w_m)==1)/len(X_test_w_m))/2
#e_1
p_plus_e_1=(np.sum(clf.predict(X_test_b_f)==1)/len(X_test_b_f)-np.sum(clf.predict(X_test_w_f)==1)/len(X_test_w_f))/2

In [35]:
D_exp_0=(np.sum(X_test_b["sex"]==0)/len(X_test_b)-np.sum(X_test_w["sex"]==0)/len(X_test_w))*p_plus_e_0
D_exp_1=(np.sum(X_test_b["sex"]==1)/len(X_test_b)-np.sum(X_test_w["sex"]==1)/len(X_test_w))*p_plus_e_1
D_exp=D_exp_0+D_exp_1
print("D_exp:",D_exp)

D_exp: 0.0018442318770811048


In [36]:
print("D_bad:",D_all-D_exp)

D_bad: 0.24363213641482123


#Algorithm: Local massaging

In [37]:
#split X_test into different sex, 0:male, 1:female
index_m= (X_train["sex"]==0)
X_train_m=X_train[index_m]
y_train_m=y_train[index_m]
X_train_f=X_train[-index_m]
y_train_f=y_train[-index_m]
print(X_train_m.shape,X_train_f.shape)

(3482, 9) (910, 9)


In [38]:
from sklearn.linear_model import LogisticRegression
clf_m = LogisticRegression(random_state=0).fit(X_train_m, y_train_m)
clf_f = LogisticRegression(random_state=0).fit(X_train_f, y_train_f)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
#get probability for logistic regression
X_train_m["prob_0"],X_train_m["prob_1"]=clf_m.predict_proba(X_train_m)[:,0],clf_m.predict_proba(X_train_m)[:,1]
X_train_f["prob_0"],X_train_f["prob_1"]=clf_f.predict_proba(X_train_f)[:,0],clf_f.predict_proba(X_train_f)[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_m["prob_0"],X_train_m["prob_1"]=clf_m.predict_proba(X_train_m)[:,0],clf_m.predict_proba(X_train_m)[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_m["prob_0"],X_train_m["prob_1"]=clf_m.predict_proba(X_train_m)[:,0],clf_m.predict_proba(X_train_m)[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [40]:
index_b= (X_train["race"]==0)
X_train_b=X_train[index_b]
y_train_b=y_train[index_b]
X_train_w=X_train[-index_b]
y_train_w=y_train[-index_b]
print(X_train_b.shape,X_train_w.shape)

(2638, 9) (1754, 9)


In [41]:
index_b_m= (X_train_b["sex"]==0)
X_train_b_m=X_train_b[index_b_m]
y_train_b_m=y_train_b[index_b_m]
X_train_b_f=X_train_b[-index_b_m]
y_train_b_f=y_train_b[-index_b_m]

index_w_m= (X_train_w["sex"]==0)
X_train_w_m=X_train_w[index_w_m]
y_train_w_m=y_train_w[index_w_m]
X_train_w_f=X_train_w[-index_w_m]
y_train_w_f=y_train_w[-index_w_m]
print(X_train_b_m.shape,X_train_b_f.shape,X_train_w_m.shape,X_train_w_f.shape)

(2157, 9) (481, 9) (1325, 9) (429, 9)


In [42]:
#e_0
p_plus_e_0=(np.sum(clf_m.predict(X_train_b_m)==1)/len(X_train_b_m)-np.sum(clf_m.predict(X_train_w_m)==1)/len(X_train_w_m))/2
#e_1
p_plus_e_1=(np.sum(clf_f.predict(X_train_b_f)==1)/len(X_train_b_f)-np.sum(clf_f.predict(X_train_w_f)==1)/len(X_train_w_f))/2

In [43]:
G_0=len(X_train_b)
G_1=len(X_train_w)
delta_b_m=int(G_0*(np.sum(clf_m.predict(X_train_b_m)==1)/len(X_train_b_m)-p_plus_e_0))
delta_w_m=int(G_1*(np.sum(clf_m.predict(X_train_w_m)==1)/len(X_train_w_m)-p_plus_e_1))

delta_b_f=int(G_0*(np.sum(clf_f.predict(X_test_b_f)==1)/len(X_test_b_f)-p_plus_e_0))
delta_w_f=int(G_1*(np.sum(clf_f.predict(X_test_w_f)==1)/len(X_test_w_f)-p_plus_e_1))
print(delta_b_m,delta_w_m,delta_b_f,delta_w_f)

1160 506 275 254


In [44]:
index_p_to_n_m=X_train_m[(X_train_m["prob_1"]>=0.5)&(X_train_m["race"]==0)]["prob_1"].sort_values()[:delta_b_m]
index_n_to_p_m=X_train_m[(X_train_m["prob_1"]<0.5)&(X_train_m["race"]==1)]["prob_1"].sort_values(ascending=False)[:delta_w_m]
index_p_to_n_f=X_train_f[(X_train_f["prob_1"]>=0.5)&(X_train_f["race"]==0)]["prob_1"].sort_values()[:delta_b_f]
index_n_to_p_f=X_train_f[(X_train_f["prob_1"]<0.5)&(X_train_f["race"]==1)]["prob_1"].sort_values(ascending=False)[:delta_w_f]

  index_p_to_n_m=X_train_m[(X_train_m["prob_1"]>=0.5)&(X_train_m["race"]==0)]["prob_1"].sort_values()[:delta_b_m]
  index_n_to_p_m=X_train_m[(X_train_m["prob_1"]<0.5)&(X_train_m["race"]==1)]["prob_1"].sort_values(ascending=False)[:delta_w_m]
  index_p_to_n_f=X_train_f[(X_train_f["prob_1"]>=0.5)&(X_train_f["race"]==0)]["prob_1"].sort_values()[:delta_b_f]
  index_n_to_p_f=X_train_f[(X_train_f["prob_1"]<0.5)&(X_train_f["race"]==1)]["prob_1"].sort_values(ascending=False)[:delta_w_f]


In [45]:
y_train_1=y_train.copy()
y_train_1[y_train_1.index.isin(index_p_to_n_m.index)]=0
y_train_1[y_train_1.index.isin(index_n_to_p_m.index)]=1
y_train_1[y_train_1.index.isin(index_p_to_n_f.index)]=0
y_train_1[y_train_1.index.isin(index_n_to_p_f.index)]=1

In [46]:
from sklearn.linear_model import LogisticRegression
clf_new = LogisticRegression(random_state=0).fit(X_train, y_train_1)
clf_new.score(X_test,y_test)

0.4812286689419795

In [47]:
print("calibration score: ",abs(clf_new.score(X_test_b,y_test_b)-clf_new.score(X_test_w,y_test_w)))

calibration score:  0.014851168249226498


#Algorithm: Local preferential sampling

In [48]:
index_p_to_n_m_u=X_train_m[(X_train_m["prob_1"]>=0.5)&(X_train_m["race"]==0)]["prob_1"].sort_values()[:int(delta_b_m/2)]
index_p_to_n_m_d=X_train_m[(X_train_m["prob_1"]<0.5)&(X_train_m["race"]==0)]["prob_1"].sort_values(ascending=False)[:int(delta_b_m/2)]

index_n_to_p_m_d=X_train_m[(X_train_m["prob_1"]<0.5)&(X_train_m["race"]==1)]["prob_1"].sort_values(ascending=False)[:int(delta_w_m/2)]
index_n_to_p_m_u=X_train_m[(X_train_m["prob_1"]>=0.5)&(X_train_m["race"]==1)]["prob_1"].sort_values()[:int(delta_w_m/2)]

index_p_to_n_f_u=X_train_f[(X_train_f["prob_1"]>=0.5)&(X_train_f["race"]==0)]["prob_1"].sort_values()[:int(delta_b_f/2)]
index_p_to_n_f_d=X_train_f[(X_train_f["prob_1"]<0.5)&(X_train_f["race"]==0)]["prob_1"].sort_values(ascending=False)[:int(delta_b_f/2)]

index_n_to_p_f_d=X_train_f[(X_train_f["prob_1"]<0.5)&(X_train_f["race"]==1)]["prob_1"].sort_values(ascending=False)[:int(delta_w_f/2)]
index_n_to_p_f_u=X_train_f[(X_train_f["prob_1"]>=0.5)&(X_train_f["race"]==1)]["prob_1"].sort_values()[:int(delta_w_f/2)]

  index_p_to_n_m_u=X_train_m[(X_train_m["prob_1"]>=0.5)&(X_train_m["race"]==0)]["prob_1"].sort_values()[:int(delta_b_m/2)]
  index_p_to_n_m_d=X_train_m[(X_train_m["prob_1"]<0.5)&(X_train_m["race"]==0)]["prob_1"].sort_values(ascending=False)[:int(delta_b_m/2)]
  index_n_to_p_m_d=X_train_m[(X_train_m["prob_1"]<0.5)&(X_train_m["race"]==1)]["prob_1"].sort_values(ascending=False)[:int(delta_w_m/2)]
  index_n_to_p_m_u=X_train_m[(X_train_m["prob_1"]>=0.5)&(X_train_m["race"]==1)]["prob_1"].sort_values()[:int(delta_w_m/2)]
  index_p_to_n_f_u=X_train_f[(X_train_f["prob_1"]>=0.5)&(X_train_f["race"]==0)]["prob_1"].sort_values()[:int(delta_b_f/2)]
  index_p_to_n_f_d=X_train_f[(X_train_f["prob_1"]<0.5)&(X_train_f["race"]==0)]["prob_1"].sort_values(ascending=False)[:int(delta_b_f/2)]
  index_n_to_p_f_d=X_train_f[(X_train_f["prob_1"]<0.5)&(X_train_f["race"]==1)]["prob_1"].sort_values(ascending=False)[:int(delta_w_f/2)]
  index_n_to_p_f_u=X_train_f[(X_train_f["prob_1"]>=0.5)&(X_train_f["race"]==1)]["pr

In [49]:
X_train_1=X_train.copy()
y_train_1=y_train.copy()
df_new=pd.concat([X_train,y_train_1],axis=1)
df_new.drop(index_p_to_n_m_d.index,inplace=True)
df_new.drop(index_n_to_p_m_u.index,inplace=True)
df_new.drop(index_p_to_n_f_d.index,inplace=True)
df_new.drop(index_n_to_p_f_u.index,inplace=True)

In [50]:
df_dup_1=df_new[df_new.index.isin(index_p_to_n_m_u.index)]
df_dup_2=df_new[df_new.index.isin(index_n_to_p_m_d.index)]
df_dup_3=df_new[df_new.index.isin(index_p_to_n_f_u.index)]
df_dup_4=df_new[df_new.index.isin(index_n_to_p_f_d.index)]
df_new_all=df_new.append([df_dup_1,df_dup_2,df_dup_3,df_dup_4])

  df_new_all=df_new.append([df_dup_1,df_dup_2,df_dup_3,df_dup_4])


In [51]:
print(df_new.shape,X_train_1.shape,df_new_all.shape)
print(len(index_p_to_n_m_d))

(3351, 10) (4392, 9) (4407, 10)
580


In [52]:
from sklearn.linear_model import LogisticRegression
clf_new = LogisticRegression(random_state=0).fit(df_new_all.drop("two_year_recid",axis=1), df_new_all["two_year_recid"])
print(clf_new.score(X_test,y_test))
print("calibration score: ",abs(clf_new.score(X_test_b,y_test_b)-clf_new.score(X_test_w,y_test_w)))

0.6951080773606371
calibration score:  0.004598314307052154
