In [28]:
import numpy as np
import pandas as pd
from aif360.metrics import BinaryLabelDatasetMetric 
from aif360.datasets import BinaryLabelDataset
from aif360.datasets import CompasDataset
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_compas
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score 
import matplotlib
from PIL import Image
from matplotlib import pyplot as plt
from tqdm import tqdm 

df = pd.read_csv('crimerate.csv', header=None, names=[str(i) for i in range(101)])
# # binarize protected attribute, african american community
df.iloc[:,2] = (df.iloc[:,2] <= 0.5)
# # binarize label. high value (high crimerate) == bad
df.iloc[:,100] = (df.iloc[:,100] <= 0.5)

protected = '2'

crime_dataset = BinaryLabelDataset(
    df=df, 
    favorable_label=0,
    unfavorable_label=1,
    label_names=['100'], # the last column "label" "crimerate"
    protected_attribute_names=['2'], # ["race_pct_african_american"] 
)
underprivileged = [{'2': 0}]
privileged = [{'2': 1}]
scaler = MinMaxScaler(copy=False) 
test, train = crime_dataset.split([500])
#Splits the Commnity Crime dataset by rows 
train.features = scaler.fit_transform(train.features)
test.features = scaler.fit_transform(test.features)

index = train.feature_names.index(protected) 
DIs = []
for level in tqdm(np.linspace(0., 1., 11)):
    di = DisparateImpactRemover(repair_level=level)
    train_repd = di.fit_transform(train)
    test_repd = di.fit_transform(test)

    X_tr = np.delete(train_repd.features, index, axis=1)
    X_te = np.delete(test_repd.features, index, axis=1)
    y_tr = train_repd.labels.ravel()

    lmod = LogisticRegression(class_weight='balanced', solver='liblinear')
    lmod.fit(X_tr, y_tr)

    test_repd_pred = test_repd.copy()
    test_repd_pred.labels = lmod.predict(X_te)

    p = [{protected: 1}]
    u = [{protected: 0}]
    cm = BinaryLabelDatasetMetric(test_repd_pred, privileged_groups=p, unprivileged_groups=u)
    DIs.append(cm.disparate_impact())
print(DIs)









#     crime_dataset_train = crime_dataset.split([0.7], shuffle=True)
#     crime_dataset_test = crime_dataset.split([0.5], shuffle=True) 

# for level in tqdm(np.linspace(0., 1., 11)):
#     di = DisparateImpactRemover(repair_level=level) 
#     new_crime_dataset = di.fit_transform(crime_dataset)
   



100%|██████████| 11/11 [00:13<00:00,  1.19s/it]


[5.3076923076923075, 5.2405063291139244, 4.939024390243902, 4.395348837209302, 4.199999999999999, 3.9255319148936167, 4.153846153846153, 3.9255319148936167, 3.9130434782608696, 3.4105263157894736, 3.1818181818181817]
