# Getting Started

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split

from aif360.sklearn.preprocessing import ReweighingMeta
from aif360.sklearn.datasets import fetch_adult
from aif360.sklearn.metrics import disparate_impact_ratio

## Loading data

Datasets are formatted as separate `X` (# samples x # features) and `y` (# samples x # labels) DataFrames. The index of each DataFrame contains protected attribute values per sample. Datasets may also load a `sample_weight` object to be used with certain algorithms/metrics. All of this makes it so that aif360 is compatible with scikit-learn objects.

For example, we can easily load the Adult dataset from UCI with the following line:

In [2]:
X, y, sample_weight = fetch_adult()
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
Unnamed: 0_level_1,race,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,Non-white,Male,25.0,Private,11th,7.0,Never-married,Machine-op-inspct,Own-child,Non-white,Male,0.0,0.0,40.0,United-States
1,White,Male,38.0,Private,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
2,White,Male,28.0,Local-gov,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
3,Non-white,Male,44.0,Private,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Non-white,Male,7688.0,0.0,40.0,United-States
5,White,Male,34.0,Private,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States


We can also easily load a version of the dataset which only contains numeric or binary columns and split it with scikit-learn:

In [9]:
X, y, sample_weight = fetch_adult(numeric_only=True)
(X_train, X_test,
 y_train, y_test,
 sw_train, sw_test) = train_test_split(X, y, sample_weight, train_size=0.7, random_state=123)
X_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week
Unnamed: 0_level_1,race,sex,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7916,Non-white,Female,18.0,7.0,0.0,0.0,0.0,0.0,20.0
26447,White,Male,55.0,9.0,1.0,1.0,0.0,0.0,40.0
20889,White,Female,43.0,9.0,1.0,0.0,0.0,0.0,40.0
30145,White,Male,44.0,11.0,1.0,1.0,4386.0,0.0,40.0
7473,White,Male,41.0,9.0,1.0,1.0,0.0,0.0,55.0


## Running metrics

With the data in this format, we can easily train a scikit-learn model and get predictions for the test data:

In [4]:
y_pred = LogisticRegression(solver='liblinear').fit(X_train, y_train).predict(X_test)

Now, we can analyze our predictions and quickly calucate the disparate impact for females vs. males:

In [5]:
sex = y_test.index.get_level_values('sex')
disparate_impact_ratio(y_test, y_pred, prot_attr='sex', priv_group='Male', pos_label='>50K')

0.19176335549523604

## Debiasing algorithms

`ReweighingMeta` is a workaround until changing sample weights can be handled properly in `Pipeline`/`GridSearchCV`

In [10]:
rew = ReweighingMeta(estimator=LogisticRegression(solver='liblinear'))

# UGLY workaround for sklearn issue: https://stackoverflow.com/a/49598597
def score_func(y_true, y_pred, sample_weight):
    idx = y_true.index.to_flat_index()
    print(idx)
    return accuracy_score(y_true, y_pred, sample_weight=sample_weight[idx])
scoring = make_scorer(score_func, **{'sample_weight': sample_weight})

params = {'estimator__C': [1, 10], 'reweigher__prot_attr': ['sex']}

clf = GridSearchCV(rew, params, scoring=scoring, cv=5)
clf.fit(X_train, y_train, **{'sample_weight': sw_train})
clf.score(X_test, y_test)

Index([(7916, 'Non-white', 'Female'),      (26447, 'White', 'Male'),
          (20889, 'White', 'Female'),      (30145, 'White', 'Male'),
             (7473, 'White', 'Male'),      (29361, 'White', 'Male'),
            (12277, 'White', 'Male'),      (44372, 'White', 'Male'),
          (32291, 'White', 'Female'),    (44411, 'White', 'Female'),
       ...
            (38298, 'White', 'Male'),       (4173, 'White', 'Male'),
             (7854, 'White', 'Male'),    (16424, 'White', 'Female'),
             (2087, 'White', 'Male'),      (16120, 'White', 'Male'),
            (24476, 'White', 'Male'),     (8295, 'White', 'Female'),
             (1449, 'White', 'Male'),      (33323, 'White', 'Male')],
      dtype='object', length=6838)


NameError: name 'accuracy_score' is not defined