In [None]:
# Load the autoreload extension so that edits to imported project modules are picked up live.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning for the whole session, including deprecation
# and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Basic data preparation, modelling and analysis for binary classification (Census)

## Train a model only with a statistical performance purpose

In [None]:
pip install torch

In [None]:
pip install torch_geometric

In [None]:
import itertools
import sys
import time

import numpy as np
import pandas as pd
import torch
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data

# The local `classif_basic` package is imported from the parent directory.
sys.path.append("../")

from classif_basic.data_preparation import train_valid_test_split, set_target_if_feature, automatic_preprocessing
from classif_basic.model import train_naive_xgb, pickle_save_model, prediction_train_valid_by_task, compute_best_fscore
from classif_basic.model_analysis import features_importances_from_pickle, augment_train_valid_set_with_results

In [None]:
# Statistical objective: the kind of task and the criterion used to assess the model.
model_task, stat_criteria = "classification", "auc"

# How categorical features are pre-processed (one-hot encoding, or label-encoding).
preprocessing_cat_features = "label_encoding"

# Start of the stopwatch read by the timing cell at the end of the notebook.
t0 = time.time()

### Prepare data

Fix precise % of population distribution (sex: Male, Female) and % of loan granted according to sex, to inspect the effects of FairDream.

In [None]:
# Fetch the clients dataset (OpenML id 1590) as pandas objects, for binary classification.
from sklearn.datasets import fetch_openml

data = fetch_openml(data_id=1590, as_frame=True)

# X: input features; Y: 1 when the recorded income class is '>50K', else 0.
X, Y = data.data, (data.target == '>50K') * 1

In [None]:
# Copy of the features with the binary target appended as a 'target' column.
dataset = X.assign(target=Y)
dataset

In [None]:
# Here, the "treatment" is seen as being 'Male' rather than 'Female',
# and the "response" is a positive target (target == 1).
is_male = dataset['sex'] == 'Male'
is_female = dataset['sex'] == 'Female'
has_response = dataset['target'] == 1
has_no_response = dataset['target'] == 0

df_response_if_feature = dataset.loc[is_male & has_response]
df_no_response_if_feature = dataset.loc[is_male & has_no_response]
df_response_if_not_feature = dataset.loc[is_female & has_response]
df_no_response_if_not_feature = dataset.loc[is_female & has_no_response]

# Number of rows in each of the four subgroups, in the order they were built.
for subgroup in (
    df_response_if_feature,
    df_no_response_if_feature,
    df_response_if_not_feature,
    df_no_response_if_not_feature,
):
    print(subgroup.shape[0])


# % of men selected by the initial data
nb_men_selected = df_response_if_feature.shape[0]
nb_men_selected / (nb_men_selected + df_no_response_if_feature.shape[0])

In [None]:
# % of women selected by the initial data: selected women / all women.
# The denominator counts women only (selected + not selected), mirroring the
# computation made for men.
nb_women_selected = df_response_if_not_feature.shape[0]
nb_women_not_selected = df_no_response_if_not_feature.shape[0]
nb_women_selected / (nb_women_selected + nb_women_not_selected)

In [None]:
# Size requested for the resampled dataset.
# NOTE(review): presumably a number of rows — confirm against set_target_if_feature.
len_dataset = 20_000

# Composition requested for the resampled dataset, in percent.
# NOTE(review): presumably the share of 'Male' rows, then the share of positive targets
# among men and among women respectively — confirm against set_target_if_feature.
percentage_feature= 70
percentage_response_if_feature=70
percentage_response_if_not_feature=10

# Build the dataset from the four sex/target subgroups with the composition set above.
sexist_dataset = set_target_if_feature(
    df_response_if_feature=df_response_if_feature,
    df_no_response_if_feature=df_no_response_if_feature,
    df_response_if_not_feature=df_response_if_not_feature,
    df_no_response_if_not_feature=df_no_response_if_not_feature,
    len_dataset=len_dataset,
    percentage_feature=percentage_feature,
    percentage_response_if_feature=percentage_response_if_feature,
    percentage_response_if_not_feature=percentage_response_if_not_feature)

In [None]:
# Split the resampled dataset into features and target.
# The column mask is computed on sexist_dataset itself, so it always has the right
# length and order for the frame being sliced.
X = sexist_dataset.loc[:, sexist_dataset.columns != 'target']
Y = sexist_dataset['target']

In [None]:
# Display the target of the resampled dataset.
Y

### Bring your own model 

If you want to bring your own model, you have to set 3 features:

1. uncorrected_model_path
Save your model in uncorrected_model_path, for fairness analysis on relevant features
Ex: uncorrected_model_path = "/work/data/models/uncorrected_model.pkl"

2. X_train_valid, Y_train_valid
pd.DataFrame with your inputs and targets on train&valid set, of shape(nb_individuals,)

3. Y_pred_train_valid
np.ndarray with the predicted label (i.e. class) or value, of shape(nb_individuals,)

### Automatically train a statistically performant model

In [None]:
# Split features and target into the sets used for modelling.
# NOTE(review): judging by the returned names, X_train_valid / Y_train_valid gather the
# train and valid sets, and the categorical pre-processing chosen above is presumably
# applied inside this call — confirm against train_valid_test_split.
X_train, X_valid, X_train_valid, X_test, Y_train, Y_valid, Y_train_valid, Y_test = train_valid_test_split(
    X=X,
    Y=Y, 
    model_task=model_task,
    preprocessing_cat_features=preprocessing_cat_features)

# Represent data - from tables to graph (on X_train_valid)

From this dataset (where we selectively introduced a "sexist" effect against women), let's see how we could switch from the tabular data to a graph representation.

The point is that our features X all seem to be attributes of the clients, though we should find a way of representing their interactions between clients 

X = {race, age, sex, final weight (depends on age, sex, hispanic origin, race), education, education number, marital status, relationship, occupation, hours per week, workclass, race, sex, capital gain, capital loss, native country} 

**Nodes** 
Bank clients (by ID)

**Edges** 
Here, we should find one or several ways of connecting the clients

Should be occupation → if changes of occupation (or similar client with new occupation), which impact on the revenue? // change of football team => impact on the football rate 
(pers) actionable => predict revenue when switches to a new job??
→ may be: “hours per week” <=> inspect the change of revenue if switches to greater hours per week?

**Node Features** 
Attributes of the nodes, i.e. characteristics of the clients (here, hard to separate from what "connects" them...) 

Race, age, sex, final weight (depends on age, sex, hispanic origin, race), education, education number, marital status, relationship, hours per week, workclass, race, sex, capital gain, capital loss, native country 

**Label (here at a node-level?)** 
Income (Y = income > $50 000)

In [None]:
# First of all, choose the column used as edge: the node features exclude it, and it is
# meant to be what connects the clients.
edge = "occupation"  # a single column name (str) for the moment

In [None]:
# Display the train&valid features from which the graph is built.
X_train_valid

In [None]:
# Make sure that we have no duplicate nodes: every index value must appear exactly once.
X_train_valid.index.is_unique

**Extract the node features**

The node features are typically represented in a matrix of the shape (num_nodes, node_feature_dim).

For each of the bank clients, we simply extract their attributes (except here the "occupation", that would be used as an "actionable" edge to connect them)

In [None]:
# Keep every column except the one used as edge: what remains describes the nodes.
is_node_feature = X_train_valid.columns != edge
node_features = X_train_valid.loc[:, is_node_feature]
node_features

That's already our node feature matrix. The number of nodes and the ordering are implicitly defined by its shape. Each row corresponds to one node in our final graph. 

In [None]:
# Convert the node-feature table to a NumPy array (one row per node).
x = node_features.to_numpy()
x.shape # [num_nodes x num_features]

**Extract the labels**

Those are simply the wealthiness of each of the clients (if their income is >$50 000). This corresponds to a node-level prediction problem. Therefore we have as many labels as we have nodes.

In [None]:
# Node-level labels: the targets of the train&valid set, one per client.
labels = Y_train_valid
labels.head()

In [None]:
# For the graph to be valid, nodes and labels must be in the same row order.
# If the check below is False, sort both by id before going further.
index_matches = labels.index == node_features.index
nb_corresponding_nodes_labels = index_matches.sum()

nb_corresponding_nodes_labels == X_train_valid.shape[0]

In [None]:
# Convert to numpy
y = labels.to_numpy()
# One label per node. NOTE(review): if labels is a pandas Series the shape is 1-D,
# i.e. (num_nodes,); the task configured in this notebook is classification.
y.shape  # (num_nodes,) expected --> node classification

**Extract the edges**

That's probably the trickiest part with a tabular dataset. You need to think of a reasonable way to connect your nodes. As mentioned previously, we will use the occupation here.

    AGAIN: There are many ways to connect the entities in a dataset and this approach is very trivial (as it will lead to disconnected subgraphs). If I wanted to build a real model from this dataset, I would probably look for a more sophisticated way to connect the clients. Using a GNN is a bit overkill for the way I model the edges.

We now need to find the pairs of clients that are assigned to the same type of job. Let's first check how many clients per type of job we have.

In [None]:
# Get an idea of the codes corresponding to occupations, by label-encoding the raw
# edge column of X and pairing each code with the value it was computed from.
# The encoder is instantiated here (it was never created anywhere in the notebook) and
# only the edge column is encoded, since the other columns are not used by this lookup.
le = LabelEncoder()
occupation_codes = le.fit_transform(X[edge])
dict_occupation_codes = pd.Series(X[edge].values, index=occupation_codes).to_dict()

# correct according to dict comparison
dict_occupation_codes[14] = 'Transport-moving'
dict_occupation_codes

In [None]:
# Swap the occupation codes for their names, then count the clients of each type of
# profession: this is how many clients have to be connected within each group.
occupation_names = {"occupation": dict_occupation_codes}
df_jobs = X_train_valid.replace(occupation_names)
df_jobs["occupation"].value_counts()

We now need to build all pairs of clients within one type of job, which corresponds to a fully-connected graph within each occupation-subgroup. Each client is identified in the edge index by its node id. If there is for example a [0, 1] in the edge index, it means that the first and second node (regarding the previously defined node feature matrix) are connected.

In [None]:
# Connect every pair of clients sharing the same value of the edge column
# (fully-connected subgraph within each occupation).
#
# Nodes are identified by their row POSITION in X_train_valid, because the node feature
# matrix `x` and the labels `y` are plain positional arrays: index labels would point at
# wrong (or non-existent) rows whenever the index is not exactly 0..num_nodes-1.
jobs = X_train_valid[edge].unique()
edges_per_job = []
for job in jobs:
    # Row positions of the clients having this occupation.
    clients = np.flatnonzero((X_train_valid[edge] == job).to_numpy())
    # All pairs (i, j) with i < j inside the group, each pair listed once (one direction).
    source, target = np.triu_indices(len(clients), k=1)
    edges_per_job.append(np.column_stack([clients[source], clients[target]]))

# Stack once at the end instead of growing the array at every iteration.
if edges_per_job:
    all_edges = np.vstack(edges_per_job)
else:
    all_edges = np.empty((0, 2), dtype=np.int64)

# Convert to Pytorch Geometric format
edge_index = all_edges.transpose()
edge_index # [2, num_edges]

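The result is these source/target edge pairs. Here you can also model directed or undirected edges by including both or just one direction. Note that the cell above stores each pair of clients once, i.e. a single direction per pair; for an undirected graph the reverse pairs must be added as well. This COO format is usually chosen as it is more efficient than a NxN adjacency matrix.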

**To Do then: include ONE-SENSE direction** for certain features (against non-sense)

**Final step - build the graph dataset**

Now we have all the components we need to build a graph for libraries like Pytorch Geometric or DGL. 

We need to pass the numpy arrays to the Data object, like this. If you have further attributes like edge_features, you can also pass them here.

(pers) work hours and sector --> edge features?

In [None]:
# torch_geometric works on tensors, so the NumPy arrays are converted explicitly:
# float node features, integer (long) edge indices and integer class labels.
# NOTE(review): assumes every node feature is numeric (categorical columns already
# encoded) — confirm, the conversion of `x` fails otherwise.
# NOTE: this rebinds `data`, which held the OpenML download at the top of the notebook.
data = Data(
    x=torch.tensor(x, dtype=torch.float),
    edge_index=torch.tensor(edge_index, dtype=torch.long),
    y=torch.tensor(y, dtype=torch.long),
)

In [None]:
# Keep the uncorrected model on disk, so that its features can be sorted by importance later.
save_model = True
# NOTE(review): this path is defined but not passed to train_naive_xgb below — confirm
# where the model is actually written.
uncorrected_model_path = "/work/data/models/uncorrected_model.pkl"

# Every split produced by train_valid_test_split, gathered under the keyword names
# expected by train_naive_xgb.
xgb_splits = dict(
    X_train=X_train,
    X_valid=X_valid,
    X_train_valid=X_train_valid,
    X_test=X_test,
    Y_train=Y_train,
    Y_valid=Y_valid,
    Y_train_valid=Y_train_valid,
    Y_test=Y_test,
)

Y_pred_train_valid = train_naive_xgb(
    **xgb_splits,
    model_task=model_task,
    stat_criteria=stat_criteria,
    save_model=save_model,
)

### Basic analysis of the model: DataFrame with the results, Feature Importance from Shapley values (SHAP)

In [None]:
# Gather inputs, targets and predictions of the "uncorrected" model on the train&valid set.
# NOTE(review): presumably returns a DataFrame holding all three — confirm against
# augment_train_valid_set_with_results.
augmented_train_valid_set = augment_train_valid_set_with_results(
    "uncorrected",
    X_train_valid,
    Y_train_valid,
    Y_pred_train_valid,
    model_task,
)

We now see that this process with basic data preparation, modelling and integration of the results in a DataFrame (as storage of the model) is very fast (in seconds):

In [None]:
# Elapsed wall-clock time since t0, which was set in the configuration cell: the figure
# therefore covers every cell run in between (data download, graph construction, training).
t1 = time.time()

print(f"Basic modelling took {round(t1 - t0)} seconds")

The further steps are for fairness assessment and correction of the model, functionality which is available with the package FairDream of DreamQuark (private for the moment)...

## Detection alert (on train&valid data to examine if the model learned discriminant behavior)

## Discrimination correction with a new fair model

### Generating fairer models with grid search or weights distortion

### Evaluating the best fair model