In [None]:
# Load the autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning for the whole session, including
# the ones raised by the libraries used further down.
warnings.filterwarnings('ignore')

# Basic data preparation, modelling and analysis for binary classification (Census)

## Train a model only with a statistical performance purpose

In [None]:
import sys
sys.path.append("../")

import pickle
import time
from sklearn.datasets import fetch_openml

from classif_basic.data_preparation import train_valid_test_split, set_target_if_feature, automatic_preprocessing
from classif_basic.model import train_naive_xgb, pickle_save_model, prediction_train_valid_by_task, compute_best_fscore
from classif_basic.model_analysis import features_importances_from_pickle, augment_train_valid_set_with_results

from classif_basic.model_analysis import plot_tree, get_df_first_splits

In [None]:
# set your statistics purposes
# (both values are forwarded to the splitting / training helpers below)
model_task = 'classification'
stat_criteria = 'auc'

# forwarded to train_valid_test_split; presumably selects how categorical
# columns are encoded -- confirm against classif_basic.data_preparation
preprocessing_cat_features = 'label_encoding'

# start of the wall-clock timer; read again after modelling to report the elapsed time
t0 = time.time()

### Prepare data

Fix precise % of population distribution (sex: Male, Female) and % of loan granted according to sex, to inspect the effects of FairDream.

In [None]:
# Download the OpenML dataset (data_id=1590) as pandas objects; it is the
# raw material for the binary classification task.
data = fetch_openml(data_id=1590, as_frame=True)

# Features are used as delivered by OpenML.
X = data.data

# Binarise the target: 1 where the label is '>50K', 0 otherwise.
is_high_income = data.target == '>50K'
Y = is_high_income * 1

In [None]:
# Work on a copy of the features with the binary target attached as an extra
# column; X itself is left untouched.
dataset = X.assign(target=Y)
dataset

In [None]:
# Here the "treatment" is being 'Male' rather than 'Female'.
is_male = dataset['sex'] == 'Male'
is_female = dataset['sex'] == 'Female'
has_response = dataset['target'] == 1
has_no_response = dataset['target'] == 0

# One sub-frame per (sex, target) combination.
df_response_if_feature = dataset.loc[is_male & has_response]
df_no_response_if_feature = dataset.loc[is_male & has_no_response]
df_response_if_not_feature = dataset.loc[is_female & has_response]
df_no_response_if_not_feature = dataset.loc[is_female & has_no_response]

# Report the size of each sub-population, in the same order as above.
for subset in (
    df_response_if_feature,
    df_no_response_if_feature,
    df_response_if_not_feature,
    df_no_response_if_not_feature,
):
    print(subset.shape[0])


# % of men selected by the initial data
nb_men = df_response_if_feature.shape[0] + df_no_response_if_feature.shape[0]
df_response_if_feature.shape[0] / nb_men

In [None]:
# % of women selected by the initial data: women with target == 1 divided by
# ALL women (target == 1 plus target == 0). The denominator must only contain
# the 'Female' sub-frames, mirroring the computation done for men.
df_response_if_not_feature.shape[0] / (
    df_response_if_not_feature.shape[0] + df_no_response_if_not_feature.shape[0]
)

In [None]:
# Size of the resampled dataset requested from set_target_if_feature.
len_dataset = 20_000

# Target proportions, expressed in percent.
# NOTE(review): presumably the share of 'Male' rows, then the share of
# target == 1 among 'Male' and among 'Female' rows respectively -- confirm
# against set_target_if_feature in classif_basic.data_preparation.
percentage_feature= 70
percentage_response_if_feature=70
percentage_response_if_not_feature=10

# Build the dataset from the four (sex, target) sub-frames prepared above.
sexist_dataset = set_target_if_feature(
    df_response_if_feature=df_response_if_feature,
    df_no_response_if_feature=df_no_response_if_feature,
    df_response_if_not_feature=df_response_if_not_feature,
    df_no_response_if_not_feature=df_no_response_if_not_feature,
    len_dataset=len_dataset,
    percentage_feature=percentage_feature,
    percentage_response_if_feature=percentage_response_if_feature,
    percentage_response_if_not_feature=percentage_response_if_not_feature)

In [None]:
# Split the resampled dataset into features and target.
# The column mask is derived from sexist_dataset itself (not from the original
# `dataset`), so the selection stays correct even if the helper returned the
# columns in a different order or number.
# NOTE: X and Y are rebound here; from this cell on they describe the resampled
# data, not the raw OpenML frame.
X = sexist_dataset.loc[:, sexist_dataset.columns != 'target']
Y = sexist_dataset['target']

### Bring your own model 

If you want to bring your own model, you have to set 3 features:

1. uncorrected_model_path
Save your model in uncorrected_model_path, for fairness analysis on relevant features
Ex: uncorrected_model_path = "/work/data/models/uncorrected_model.pkl"

2. X_train_valid, Y_train_valid
pd.DataFrame with your inputs and targets on train&valid set, of shape(nb_individuals,)

3. Y_pred_train_valid
np.ndarray with the predicted label (i.e. class) or value, of shape(nb_individuals,)

### Automatically train a model statistically performant, regardless of fairness

We here introduce additional interaction constraints, to reflect causal interpretation on the features (TODO later: option of the function "train_naive_xgb")

In [None]:
# Split features and target into train / valid / train+valid / test parts.
# The helper also receives the categorical preprocessing choice set in the
# configuration cell.
X_train, X_valid, X_train_valid, X_test, Y_train, Y_valid, Y_train_valid, Y_test = train_valid_test_split(
    X=X,
    Y=Y, 
    model_task=model_task,
    preprocessing_cat_features=preprocessing_cat_features)

In [None]:
# sanity check: size of the validation target
Y_valid.shape

In [None]:
# sanity check: feature names seen by the model after preprocessing
X_train.columns

In [None]:
# save the uncorrected model, to then sort its features by importances
save_model=True
# NOTE(review): this path is not passed to train_naive_xgb, yet a later cell
# unpickles the model from it -- confirm the helper writes to this exact
# location when save_model=True. It is also an absolute, machine-specific path.
uncorrected_model_path = "/work/data/models/uncorrected_model.pkl"

# Train the baseline XGBoost model; the call returns the predictions on the
# train+valid set, which are reused for the analysis below.
Y_pred_train_valid = train_naive_xgb(
    X_train=X_train,
    X_valid=X_valid,
    X_train_valid=X_train_valid,
    X_test=X_test,
    Y_train=Y_train,
    Y_valid=Y_valid,
    Y_train_valid=Y_train_valid,
    Y_test=Y_test,
    model_task=model_task,
    stat_criteria=stat_criteria,
    save_model=save_model)

In [None]:
# Combine the train+valid inputs, targets and predictions of the "uncorrected"
# model; the result is only displayed here, not stored in a variable.
augment_train_valid_set_with_results("uncorrected", X_train_valid, Y_train_valid, Y_pred_train_valid, model_task)

We now see that this process with basic data preparation, modelling and integration of the results in a DataFrame (as storage of the model) is very fast (in seconds):

In [None]:
# Stop the wall-clock timer started in the configuration cell and report the
# elapsed time, rounded to whole seconds.
t1 = time.time()
elapsed_seconds = round(t1 - t0)

print(f"Basic modelling took {elapsed_seconds} seconds")

# Decomposition in Trees: Coherent Structure?

In [None]:
# Reload the pickled uncorrected model from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# were produced by this project.
with open(uncorrected_model_path, 'rb') as file:
    # the context manager closes the handle even if unpickling raises
    xgb_basic = pickle.load(file)

xgb_basic

In [None]:
# Summarise the first splits of the reloaded model's trees.
booster = xgb_basic
get_max_split_feature=False
# set to None instead of 4 to lift the restriction -- presumably a minimum
# number of trees per feature; confirm against get_df_first_splits
nb_min_trees = 4 #None

df_first_splits = get_df_first_splits(booster=booster, 
                                      get_max_split_feature=get_max_split_feature,
                                      nb_min_trees=nb_min_trees)
df_first_splits

With feature interaction constraints preventing 'sex' from being an incoherent ancestor of 'age', 'race' or 'native-country', the splitting of the trees seems more coherent.

TODO: now, how to set other causal parent relationships (e.g. 'relationship' cannot be a causal ancestor of 'education', I think?). Maybe some links have to be sacrificed in order to draw efficient splits...

In [None]:
# The feature in the first row of the summary table is the one inspected here.
first_splitting_feature = df_first_splits.index[0]

print(f"first_splitting_feature: {first_splitting_feature}")

# Indices of the trees recorded for that feature in the "trees_index" column.
trees_to_plot = df_first_splits["trees_index"][first_splitting_feature]

# Draw each of those trees in turn.
for num_trees in trees_to_plot:
    plot_tree(booster=xgb_basic, num_trees=num_trees)

Parallel with the feature importance: aggregation over trees, but no "sense" or hierarchy between features...

# Conclusion
- The feature importances do not reflect the structures of the trees
- The structures of the trees do not (systematically) follow a causal hierarchy (e.g. splitting on the income before the age) 
=> Further tests: the structures of trees need to be causally constrained (even if the split on "capital_gain" before "education_num" on a tree brought a bigger node purity on a tree number_k, that order would be forbidden as unrealistic)

Intuition: only keep the leaves that make sense 

Incident questions to realise this selection, or constraint, on trees: 
n_estimators = 1000, then why n_trees = 109?

In [None]:
from xgboost.plotting import plot_importance 

# importance_type : str, default "weight"
# How the importance is calculated: either "weight", "gain", or "cover".
# * "weight" is the number of times a feature appears in a tree
# Only the 10 highest-ranked features are drawn.

plot_importance(
    booster=xgb_basic,
    importance_type="weight",
    max_num_features=10)

In [None]:
# * "gain" is the average gain of splits which use the feature
# Same plot as the previous cell, ranked by gain instead of weight (top 10 features).
plot_importance(
    booster=xgb_basic,
    importance_type="gain",
    max_num_features=10)