# Fairness example with XAIoGraphs


* Source Dataset: https://www.kaggle.com/datasets/kukuroo3/body-performance-data

In [1]:
import pandas as pd

# Load the body-performance CSV and expose its 'class' column under the name 'y_true'
raw_df = pd.read_csv('../../datasets/bodyPerformance.csv')
df = raw_df.rename(columns={'class': 'y_true'})
print('Dataset Shape {}'.format(df.shape))
# Preview five random rows
df.sample(5)

Dataset Shape (13393, 12)


Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
11007,24.0,M,169.5,68.3,27.4,74.0,119.0,34.2,-1.7,37.0,185.0,D
9367,28.0,M,180.9,86.8,20.7,64.0,132.0,57.8,25.0,53.0,235.0,A
8728,22.0,M,169.5,72.8,23.4,67.0,104.0,36.8,10.6,57.0,237.0,C
5993,60.0,M,179.3,82.1,23.2,87.0,138.0,50.9,16.4,37.0,194.0,B
233,56.0,F,149.0,46.7,23.2,95.0,157.0,25.0,28.0,41.0,186.0,A


### Discretization of the "age" Feature

In [2]:
# Discretize 'age' into left-closed ranges: [20, 30) -> '20-29', ..., [60, inf) -> '60-inf'.
# Ages below 20 get their own '<20' label; the previous nested conditional let them
# fall through to the final 'else' branch and mislabelled them as '60-inf'.
# NOTE(review): missing ages are no longer placed in '60-inf' either - confirm how
# downstream steps should treat them if the dataset can contain any.
age_bin_edges = [float('-inf'), 20, 30, 40, 50, 60, float('inf')]
age_bin_labels = ['<20', '20-29', '30-39', '40-49', '50-59', '60-inf']
df['age_range'] = pd.cut(
    df['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=False,  # intervals are closed on the left, matching the original x >= lo and x < hi tests
).astype(str)

# Put the 'age_range' column in first position. Selecting it by name (instead of
# rotating the last column to the front) keeps the column order stable when the
# cell is executed more than once.
df = df[['age_range'] + [col for col in df.columns if col != 'age_range']]
df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
3705,60-inf,62.0,F,144.9,50.8,33.9,68.0,128.0,14.9,10.7,2.0,83.0,D
8954,30-39,36.0,M,177.9,67.0,10.8,64.0,118.0,41.9,13.9,53.0,240.0,B
7269,40-49,41.0,M,169.3,54.4,24.9,96.0,148.0,31.8,-2.6,25.0,159.0,D
3611,20-29,24.0,M,174.8,72.9,19.6,73.0,129.0,32.0,6.8,43.0,216.0,C
5188,20-29,21.0,M,172.9,71.4,28.6,100.0,152.0,40.0,25.1,50.0,248.0,C


### Label encoder

In [3]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column; each is kept under its own name so the
# encoding can be reverted with inverse_transform
lb_gen, lb_age_range, lb_y = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each categorical column with the integer codes produced by its encoder
for column_name, encoder in (('gender', lb_gen), ('age_range', lb_age_range), ('y_true', lb_y)):
    df[column_name] = encoder.fit_transform(df[column_name])

df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
12665,0,23.0,1,170.1,62.4,21.7,67.0,115.0,38.0,23.8,56.0,215.0,2
1875,2,41.0,1,171.7,73.0,23.5,79.0,121.0,40.4,17.8,32.0,202.0,2
9168,1,34.0,1,173.0,79.9,30.9,91.0,144.0,38.2,16.0,7.0,233.0,3
9864,0,26.0,1,174.3,70.8,11.6,61.0,129.0,45.9,14.4,53.0,231.0,1
9184,0,23.0,0,166.1,54.4,23.0,75.0,132.0,31.0,21.4,46.0,152.0,0


### Train-Test split


In [4]:
from sklearn.model_selection import train_test_split

# Label column, and every remaining column as features
y = df['y_true']
X = df.drop(columns='y_true')

# Hold out 30% of the rows for testing, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)


### Train Model 

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Random Forest: 20 gini trees with max depth 10, bootstrap sampling and a fixed random_state
rf_params = dict(n_estimators=20, bootstrap=True, criterion='gini', max_depth=10, random_state=123)
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Predictions on both partitions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluation metrics: accuracy on train and test, plus the per-class report on test
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print('Train Accuracy: {}'.format(train_accuracy))
print('Test Accuracy: {}'.format(test_accuracy))
print('Test Metrics:\n{}'.format(test_report))


Train Accuracy: 0.8424533333333334
Test Accuracy: 0.7155301144848183
Test Metrics:
              precision    recall  f1-score   support

           0       0.71      0.86      0.78      1024
           1       0.58      0.58      0.58       987
           2       0.71      0.63      0.67      1039
           3       0.89      0.79      0.83       968

    accuracy                           0.72      4018
   macro avg       0.72      0.72      0.72      4018
weighted avg       0.72      0.72      0.71      4018



### Predict on the whole dataset

In [6]:
from copy import deepcopy
# Feature columns: every column except the label and, when this cell has already
# been executed once, the 'y_predict' column it adds. Without the second exclusion a
# re-run would pass 'y_predict' to the model as an extra feature.
x_cols = df.columns.drop('y_true').drop('y_predict', errors='ignore').tolist()

# Calculate predictions for every row of the dataset
df['y_predict'] = model.predict(df[x_cols])
df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true,y_predict
9152,0,27.0,1,169.2,72.5,17.6,79.0,132.0,45.5,10.9,60.0,203.0,1,1
1098,2,45.0,1,175.0,79.1,20.2,86.0,131.0,44.3,5.3,42.0,190.0,2,2
12379,0,21.0,1,174.3,71.1,14.6,69.0,119.0,37.9,11.2,53.0,223.0,1,1
9058,0,25.0,1,175.4,73.7,17.3,84.0,142.0,47.2,7.0,56.0,233.0,2,2
3695,3,54.0,0,162.8,56.9,28.7,70.0,120.0,23.5,9.7,23.0,116.0,3,3


### Undo the label encoding of the features

In [7]:
import warnings
# Silence DeprecationWarning messages
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Map the integer codes back to the original labels; the encoder fitted on
# 'y_true' is also the one used to decode 'y_predict'
decoders = (
    ('age_range', lb_age_range),
    ('gender', lb_gen),
    ('y_true', lb_y),
    ('y_predict', lb_y),
)
for column_name, encoder in decoders:
    df[column_name] = encoder.inverse_transform(df[column_name])

df.sample(10)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true,y_predict
4917,30-39,39.0,F,152.1,46.1,25.5,72.0,118.0,24.3,21.8,24.0,127.0,C,C
5483,30-39,35.0,F,154.1,50.4,32.0,81.0,135.0,26.7,21.1,25.0,141.0,B,C
8666,40-49,41.0,F,167.6,74.3,31.6,86.0,126.0,20.7,10.7,15.0,155.0,D,D
6395,60-inf,61.0,M,162.3,53.04,14.6,80.0,120.0,38.5,13.1,35.0,173.0,A,A
1005,50-59,51.0,M,165.5,74.7,26.3,74.0,127.0,34.9,14.4,24.0,171.0,D,D
6949,20-29,27.0,F,160.7,64.8,27.2,63.0,117.0,30.6,19.2,52.0,163.0,A,A
11658,60-inf,61.0,M,172.3,69.1,23.0,84.0,152.0,39.3,10.3,34.0,168.0,B,B
7355,40-49,45.0,F,165.5,54.1,21.5,79.0,128.0,23.1,21.8,34.0,158.0,A,A
8368,20-29,22.0,F,166.0,59.4,28.4,83.0,122.0,30.1,12.2,44.0,161.0,C,C
10391,50-59,56.0,F,156.5,73.9,40.9,65.0,106.0,24.7,12.5,10.0,110.0,D,D


<hr>

# Fairness with XAIoGraphs

In [8]:
from xaiographs.fairness import Fairness

# Columns treated as sensitive in the fairness analysis
sensitive_features = ['age_range', 'gender']

# Fairness object configured with its destination path
f = Fairness(destination_path='./xaiographs_web_files')

# Run the fairness computation comparing true labels against model predictions
f.fit_fairness(
    df=df,
    sensitive_cols=sensitive_features,
    target_col='y_true',
    predict_col='y_predict',
)



Enconding "gender" column: 100%|█████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 707.79it/s]
Checking "broad jump_cm" column: 100%|███████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 343.76it/s]


Highly correlated variables above the 0.9 Threshold
  feature_1  feature_2  correlation_value  is_correlation_sensible
0       age  age_range           0.979564                     True


Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=A : 100%|█████████████████████████████| 5/5 [00:00<00:00, 14.32it/s]
Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=B : 100%|█████████████████████████████| 5/5 [00:00<00:00, 14.85it/s]
Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=D : 100%|█████████████████████████████| 5/5 [00:00<00:00, 14.41it/s]
Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=C : 100%|█████████████████████████████| 5/5 [00:00<00:00, 14.75it/s]
Processing: sensitive_col=gender, sensitive_value=M, target_label=A :   0%|                                             | 0/2 [00:00<?, ?it/s]
Processing: sensitive_col=gender, sensitive_value=M, target_label=B :   0%|                                             | 0/2 [00:00<?, ?it/s]
Processing: sensitive_col=gender, sensitive_value=M, target_label=D :   0%|                                             | 0/2 [00:00<?, ?it/s]

In [9]:
# Display the fairness results table: one row per sensitive feature / sensitive value / target label
f.fairness_info

Unnamed: 0,sensitive_feature,sensitive_value,is_binary_sensitive_feature,target_label,independence_score,independence_category,independence_score_weight,separation_score,separation_category,separation_score_weight,sufficiency_score,sufficiency_category,sufficiency_score_weight
0,age_range,20-29,False,A,0.091699,C,0.155454,0.102672,C,0.155454,0.013776,A+,0.118271
1,age_range,30-39,False,A,0.05134,B,0.06929,0.007547,A+,0.06929,0.008601,A+,0.055477
2,age_range,40-49,False,A,0.052768,B,0.036512,0.01066,A+,0.036512,0.033104,A,0.028821
3,age_range,50-59,False,A,0.119848,C,0.026432,0.103097,C,0.026432,0.001776,A+,0.023968
4,age_range,60-inf,False,A,0.12051,C,0.019786,0.198523,D,0.019786,0.137346,C,0.023445
5,age_range,20-29,False,B,0.06109,B,0.092063,0.064757,B,0.092063,0.084332,C,0.105279
6,age_range,30-39,False,B,0.008397,A+,0.050549,0.036001,A,0.050549,0.011037,A+,0.05249
7,age_range,40-49,False,B,0.024911,A,0.031509,0.040302,A,0.031509,0.027575,A,0.03233
8,age_range,50-59,False,B,0.053249,B,0.038229,0.079066,B,0.038229,0.069346,B,0.03233
9,age_range,60-inf,False,B,0.118553,C,0.035242,0.177754,D,0.035242,0.047412,A,0.027477


In [12]:
# Display the target labels held by the Fairness object
f.target_values

array(['A', 'B', 'D', 'C'], dtype=object)