In [None]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up without restarting the kernel (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')

# Discrimination detection and mitigation (on a house prices dataset)

## Train a model regardless of fairness

In [None]:
from fairdream.data_preparation import *
from fairdream.compute_scores import *
from fairdream.detection import *
from fairdream.correction import *
from fairdream.plots import *

In [None]:
# Statistical setup reused by the cells below:
#   stat_criteria -> metric name handed to the training helpers
#   model_task    -> kind of learning task handed to the fairdream helpers
stat_criteria = "rmse"
model_task = "regression"

### Bring your own model 

If you want to bring your own model, you have to provide the following 3 elements:

1. uncorrected_model_path
Save your model in uncorrected_model_path, for fairness analysis on relevant features
Ex: uncorrected_model_path = "/work/data/models/uncorrected_model.pkl"

2. X_train_valid, Y_train_valid
pd.DataFrame with your inputs and targets on train&valid set, of shape(nb_individuals,)

3. Y_pred_train_valid
np.ndarray with the predicted label (i.e. class) or value, of shape(nb_individuals,)

### Automatically train a statistically performant model, regardless of fairness

In [None]:
# prepare the dataset for regression, selected by key: 'small', 'large' or 'diabetes'

from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes


def choose_housing_dataset_for_regression(dataset_len):
    """Load one of the supported scikit-learn regression datasets as DataFrames.

    Parameters
    ----------
    dataset_len : str
        Key of the dataset to load:
        - 'small'    -> ``load_boston()``, target column "MEDV"
        - 'large'    -> ``fetch_california_housing()``, target column 'MedHouseVal'
        - 'diabetes' -> ``load_diabetes()``, target column "diabete_indicator"

    Returns
    -------
    (X, Y) : tuple of pd.DataFrame
        ``X`` holds the dataset's ``data`` with its ``feature_names`` as columns;
        ``Y`` holds the dataset's ``target`` in a single named column.

    Raises
    ------
    ValueError
        If ``dataset_len`` is not one of the supported keys (previously this
        surfaced as an opaque UnboundLocalError on the return statement).

    Notes
    -----
    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so the
    'small' option (and the import of ``load_boston`` at the top of this cell)
    only works with older scikit-learn versions — confirm the pinned version.
    """
    supported_keys = ("small", "large", "diabetes")
    if dataset_len not in supported_keys:
        raise ValueError(
            f"Unknown dataset_len {dataset_len!r}; expected one of {supported_keys}"
        )

    # Local import: the notebook never imports pandas explicitly and would
    # otherwise depend on `pd` leaking out of a star import.
    import pandas as pd

    if dataset_len == "small":
        bunch = load_boston()
        target_column = "MEDV"
    elif dataset_len == "large":
        bunch = fetch_california_housing()
        target_column = "MedHouseVal"
    else:  # 'diabetes' (guaranteed by the validation above)
        bunch = load_diabetes()
        target_column = "diabete_indicator"

    # Same construction for every dataset: features keep their names, the
    # target becomes a one-column DataFrame.
    X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    Y = pd.DataFrame(bunch.target, columns=[target_column])

    return X, Y

In [None]:
# Key of the dataset that feeds the rest of the notebook; it is forwarded to
# choose_housing_dataset_for_regression, which returns the inputs and targets.
dataset_len = "diabetes"

X, Y = choose_housing_dataset_for_regression(dataset_len=dataset_len)

In [None]:
# Unpack the eight objects returned by train_valid_test_split for X and Y.
(
    X_train,
    X_valid,
    X_train_valid,
    X_test,
    Y_train,
    Y_valid,
    Y_train_valid,
    Y_test,
) = train_valid_test_split(X, Y, model_task)

In [None]:
# Train the baseline ("uncorrected") model and ask the helper to save it, so
# that its features can later be sorted by importance.
# NOTE(review): uncorrected_model_path is defined here but is not passed to
# train_naive_xgb in this cell — confirm where the helper writes the model.
uncorrected_model_path = "/work/data/models/uncorrected_model.pkl"
save_model = True

Y_pred_train_valid = train_naive_xgb(
    X_train,
    X_valid,
    X_train_valid,
    X_test,
    Y_train,
    Y_valid,
    Y_train_valid,
    Y_test,
    model_task,
    stat_criteria,
    save_model=save_model,
)

## Detection alert (on train&valid data, to examine whether the model learned discriminatory behavior)

In [None]:
# Preview only: the bare call lets the notebook display the returned object
# (train&valid data augmented with the uncorrected model's results).
augment_train_valid_set_with_results(
    "uncorrected",
    X_train_valid,
    Y_train_valid,
    Y_pred_train_valid,
    model_task,
)

In [None]:
# Keep the augmented train&valid set: it is the input of the detection and
# correction steps further down.
train_valid_set_with_uncorrected_results = augment_train_valid_set_with_results(
    "uncorrected",
    X_train_valid,
    Y_train_valid,
    Y_pred_train_valid,
    model_task,
)

In [None]:
# Detection settings (fairness_purpose is reused later by the correction step).
fairness_purpose = "distribution_gap"
injustice_acceptance = 1.2
min_individuals_discrimined = 0.01

# Model under audit and the data carrying its results.
model_name = "uncorrected"
model_task = "regression"  # re-assigned with the same value as in the setup cell
augmented_train_valid_set = train_valid_set_with_uncorrected_results

discrimination_alert(
    augmented_train_valid_set,
    model_name,
    fairness_purpose,
    model_task,
    injustice_acceptance,
    min_individuals_discrimined,
)

## Discrimination correction with a new fair model

### Generating fairer models with weight distortion

In [None]:
# Fairness objective: the user chooses the group (protected attribute) for which
# discrimination should be erased; the fairness criterion is the
# fairness_purpose already set for the detection step.
# Other candidates listed by the original author:
# 'MedInc', 'AveRooms', 'Population', 'AveOccup'
# (presumably columns of another dataset option — verify before switching).
protected_attribute = "sex"

# Desired balance between statistical and fairness performance, and how many
# candidate fair models to generate.
nb_fair_models = 10
weight_method = "weighted_groups"
tradeoff = "moderate"

train_valid_set_with_corrected_results, models_df, best_model_dict = fair_train(
    X=X,
    Y=Y,
    train_valid_set_with_uncorrected_results=train_valid_set_with_uncorrected_results,
    model_task=model_task,
    stat_criteria=stat_criteria,
    protected_attribute=protected_attribute,
    fairness_purpose=fairness_purpose,
    weight_method=weight_method,
    tradeoff=tradeoff,
    nb_fair_models=nb_fair_models,
)

### Evaluating the best fair model

In [None]:
# Report on the best fair model, using the outputs of fair_train.
fair_model_results(
    train_valid_set_with_corrected_results,
    models_df,
    best_model_dict,
    protected_attribute,
    fairness_purpose,
    model_task,
)

In [None]:
# Rank the candidate models by decreasing 'tradeoff_score' and display them.
top_models = models_df.sort_values(
    by="tradeoff_score",
    ascending=False,
)
top_models