# Getting Started

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

from aif360.sklearn.algorithms.preprocessing import Reweighing
from aif360.sklearn.datasets import fetch_adult
from aif360.sklearn.metrics import disparate_impact_ratio

## Loading data

Datasets are formatted as separate `X` (# samples x # features) and `y` (# samples x # labels) DataFrames. The index of each DataFrame contains protected attribute values per sample. Datasets may also load a `sample_weight` object to be used with certain algorithms/metrics. All of this makes it so that aif360 is compatible with scikit-learn objects.

For example, we can easily load the Adult dataset from UCI with the following line:

In [2]:
# fetch_adult() yields the feature frame, the labels and per-sample weights;
# the protected attributes (race, sex) are carried in the DataFrame index.
X, y, sample_weight = fetch_adult()
# Preview the first five rows (same as the default head()).
X.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
race,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Non-white,Male,25.0,Private,11th,7.0,Never-married,Machine-op-inspct,Own-child,Non-white,Male,0.0,0.0,40.0,United-States
White,Male,38.0,Private,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States
White,Male,28.0,Local-gov,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States
Non-white,Male,44.0,Private,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Non-white,Male,7688.0,0.0,40.0,United-States
White,Male,34.0,Private,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States


We can also easily load a version of the dataset which only contains numeric or binary columns and split it with scikit-learn:

In [3]:
# Reload the dataset keeping only numeric/binary columns; the third returned
# value (the sample weights) is not needed here and is discarded.
X, y, _ = fetch_adult(numeric_only=True)
# 70/30 train/test split with a fixed seed so the notebook is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
)
# Preview the first five training rows (same as the default head()).
X_train.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,age,education-num,race,sex,capital-gain,capital-loss,hours-per-week
race,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Non-white,Female,18.0,7.0,0.0,0.0,0.0,0.0,20.0
White,Male,55.0,9.0,1.0,1.0,0.0,0.0,40.0
White,Female,43.0,9.0,1.0,0.0,0.0,0.0,40.0
White,Male,44.0,11.0,1.0,1.0,4386.0,0.0,40.0
White,Male,41.0,9.0,1.0,1.0,0.0,0.0,55.0


## Running metrics

With the data in this format, we can easily train a scikit-learn model and get predictions for the test data:

In [4]:
# Fit a logistic regression on the training split and predict labels for the
# held-out test split in a single chained expression.
y_pred = (
    LogisticRegression(solver='liblinear')
    .fit(X_train, y_train)
    .predict(X_test)
)

Now, we can analyze our predictions and quickly calculate the disparate impact for females vs. males:

In [5]:
# The protected attributes live in the index of y_test; pull out the 'sex'
# level so it can be used as the grouping variable.
sex = y_test.index.get_level_values('sex')
# Disparate impact of the predictions, with 'Male' as the privileged group
# and '>50K' as the favourable (positive) label.
disparate_impact_ratio(
    y_test,
    y_pred,
    groups=sex,
    priv_group='Male',
    pos_label='>50K',
)

0.19176335549523604

## Debiasing algorithms

Not yet implemented.

In [None]:
# Chain the Reweighing preprocessor with the same classifier used earlier in
# this notebook. LogisticRegression replaces the original LinearRegression,
# which is never imported here (NameError) and is a regressor, not a
# classifier for categorical labels such as '>50K'.
pipe = make_pipeline(Reweighing(), LogisticRegression(solver='liblinear'))
# make_pipeline names each step after its lowercased class name, so the
# classifier's fit parameter must carry the 'logisticregression__' prefix.
# sample_weight_ will be updated after it is fit
# NOTE(review): sample_weight_ is read here *before* pipe.fit() runs, and this
# section is marked "Not yet implemented" -- confirm that Reweighing exposes
# the attribute pre-fit (or restructure) before relying on this cell.
fit_params = {'logisticregression__sample_weight':
              pipe.named_steps.reweighing.sample_weight_}
pipe.fit(X, y, **fit_params)