# Fairness example with XAIoGraphs


* Source Dataset: https://www.kaggle.com/datasets/kukuroo3/body-performance-data

In [9]:
import pandas as pd

# Load the body-performance CSV and rename the label column 'class' to 'y_true',
# the name the rest of the notebook uses for the ground truth.
csv_path = '../../datasets/bodyPerformance.csv'
df = pd.read_csv(csv_path).rename(columns={'class': 'y_true'})

print('Dataset Shape {}'.format(df.shape))
df.sample(3)

Dataset Shape (13393, 12)


Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
12940,60.0,F,160.2,64.2,35.2,81.0,131.0,28.1,11.3,9.0,108.0,D
3438,28.0,F,160.0,60.0,28.0,80.0,120.0,30.6,20.6,39.0,213.0,A
7840,24.0,F,160.2,52.7,19.1,56.0,98.0,28.8,13.2,51.0,181.0,C


### Discretization of the "age" Feature

In [10]:
# Bucket 'age' into decade-wide bands. With right=False, pd.cut builds half-open
# intervals [20, 30), [30, 40), ... so the label '20-29' means 20 <= age < 30.
age_bin_edges = [20, 30, 40, 50, 60, float('inf')]
age_bin_labels = ['20-29', '30-39', '40-49', '50-59', '60-inf']
age_range = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels, right=False)

# An age below 20 (or a missing age) matches no band. Fail loudly instead of
# silently filing those rows under '60-inf', which would distort every
# age-based fairness score computed later.
unbinned_rows = int(age_range.isna().sum())
if unbinned_rows:
    raise ValueError(
        'Found {} rows whose age is missing or below 20; no age band is defined for them'.format(unbinned_rows)
    )

# Store plain strings (not a Categorical) so the column behaves like any other text feature.
df['age_range'] = age_range.astype(str)

# Move 'age_range' to the first position *by name*: a positional "last column
# first" shuffle would move 'y_true' to the front if this cell were re-run.
df = df[['age_range'] + [col for col in df.columns if col != 'age_range']]
df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
6762,30-39,39.0,F,154.9,46.6,32.0,81.0,121.0,23.3,28.0,35.0,159.0,C
9512,20-29,24.0,F,160.5,54.0,30.2,60.0,102.0,23.4,23.9,32.0,171.0,D
10587,50-59,57.0,M,171.6,82.0,20.7,86.0,138.0,44.6,9.2,33.0,224.0,C
11596,40-49,41.0,F,159.0,52.4,23.2,71.0,113.0,22.0,21.5,40.0,174.0,A
11135,50-59,50.0,F,155.2,61.2,32.7,93.0,145.0,28.2,23.1,37.0,140.0,A


### Label encoder

In [11]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column; they are kept in named variables because
# the same fitted encoders are needed later to map the integer codes back to labels.
lb_gen, lb_age_range, lb_y = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each text column with its integer codes.
for column, encoder in (('gender', lb_gen), ('age_range', lb_age_range), ('y_true', lb_y)):
    df[column] = encoder.fit_transform(df[column])

df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
196,0,22.0,0,167.0,50.3,18.9,77.0,120.0,27.9,23.3,39.0,168.0,0
1047,0,24.0,1,169.3,65.4,20.4,71.0,115.0,38.0,21.5,50.0,246.0,1
9275,1,37.0,1,166.7,71.6,17.7,79.0,128.0,38.4,11.6,34.0,206.0,2
3315,0,26.0,1,176.7,69.1,13.4,75.0,130.0,48.8,14.0,45.0,236.0,1
12117,0,27.0,1,178.7,78.82,25.7,82.0,118.0,41.5,-8.9,47.0,202.0,3


### Train-Test split


In [12]:
from sklearn.model_selection import train_test_split

# Target is 'y_true'; every remaining column is used as a feature.
y = df['y_true']
X = df.drop(columns='y_true')

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)


### Train Model 

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a small random forest (20 trees, depth capped at 10) on the training split.
model = RandomForestClassifier(
    n_estimators=20,
    max_depth=10,
    criterion='gini',
    bootstrap=True,
    random_state=123,
)
model.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)

print('Train Accuracy: {}'.format(train_accuracy))
print('Test Accuracy: {}'.format(test_accuracy))
print('Test Metrics:\n{}'.format(test_report))


Train Accuracy: 0.8424533333333334
Test Accuracy: 0.7155301144848183
Test Metrics:
              precision    recall  f1-score   support

           0       0.71      0.86      0.78      1024
           1       0.58      0.58      0.58       987
           2       0.71      0.63      0.67      1039
           3       0.89      0.79      0.83       968

    accuracy                           0.72      4018
   macro avg       0.72      0.72      0.72      4018
weighted avg       0.72      0.72      0.71      4018



### Predict on the whole dataset

In [14]:
from copy import deepcopy
# Use exactly the feature columns the model was fitted on. Deriving the list
# from df.columns would also pick up 'y_predict' once this cell has run, so a
# re-run would hand the model a column it was never trained with.
x_cols = X_train.columns.tolist()

# Calculate predictions for every row of the dataset.
# NOTE(review): this scores the training rows as well as the held-out ones, so
# anything computed from 'y_predict' mixes train and test predictions.
df['y_predict'] = model.predict(df[x_cols])
df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true,y_predict
7502,4,62.0,1,166.1,76.5,29.0,85.0,154.0,43.9,16.9,36.0,204.0,1,1
8056,2,42.0,0,157.1,59.7,33.0,81.0,149.0,26.3,16.9,27.0,153.0,2,2
4334,3,56.0,0,156.6,60.4,32.5,81.0,138.0,24.2,21.2,10.0,140.0,2,2
1096,1,34.0,1,170.8,84.4,31.4,96.0,151.0,42.4,-1.1,30.0,206.0,3,3
11004,1,30.0,1,167.5,65.3,16.3,60.0,109.0,39.8,13.7,46.0,235.0,1,1


### Undo the label encoding

In [15]:
import warnings
# Silence DeprecationWarning for the rest of the session.
# NOTE(review): presumably raised by the inverse transforms below — confirm which call needs it.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Map the integer codes back to the original labels with the encoders fitted
# earlier; 'y_predict' shares the target encoder because it holds predicted classes.
column_encoders = {
    'age_range': lb_age_range,
    'gender': lb_gen,
    'y_true': lb_y,
    'y_predict': lb_y,
}
for column, encoder in column_encoders.items():
    df[column] = encoder.inverse_transform(df[column])

df.sample(10)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true,y_predict
5915,20-29,23.0,M,170.7,64.9,22.7,85.0,132.0,40.7,16.1,51.0,218.0,B,B
7033,20-29,24.0,M,181.4,87.0,18.1,80.0,147.0,44.5,18.6,59.0,239.0,B,B
5358,20-29,28.0,M,181.0,66.8,12.5,75.0,116.0,42.9,14.9,57.0,239.0,A,A
11216,20-29,21.0,F,161.1,65.72,34.8,80.0,120.0,32.6,17.6,30.0,166.0,D,D
7458,30-39,35.0,M,177.8,75.7,16.2,81.0,128.0,48.1,19.1,47.0,244.0,A,A
4233,60-inf,62.0,F,144.9,47.9,36.6,93.0,159.0,15.4,11.5,5.0,91.0,D,D
12352,40-49,40.0,M,181.5,83.5,22.6,85.0,128.0,48.0,15.0,52.0,217.0,A,A
518,20-29,22.0,F,164.3,59.44,28.3,80.0,120.0,28.9,14.4,45.0,179.0,C,C
9753,20-29,23.0,M,179.2,86.2,22.7,44.0,102.0,48.5,17.5,56.0,205.0,B,B
10452,40-49,42.0,F,157.0,76.8,41.2,96.0,136.0,26.1,12.1,14.0,113.0,D,D


<hr>

# Fairness with XAIoGraphs

In [8]:
from xaiographs.fairness import Fairness

# Columns whose groups are compared in the fairness analysis.
sensitive_features = ['age_range', 'gender']

# NOTE(review): destination_path looks like the folder where XAIoGraphs writes its result files — confirm.
f = Fairness(destination_path='./xaiographs_web_files')

# Compare ground truth ('y_true') with model output ('y_predict') for each sensitive column.
f.fit_fairness(
    df=df,
    sensitive_cols=sensitive_features,
    target_col='y_true',
    predict_col='y_predict',
)



Enconding "gender" column: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 1029.76it/s]
Checking "broad jump_cm" column: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 346.31it/s]


Highly correlated variables above the 0.9 Threshold
  feature_1  feature_2  correlation_value  is_correlation_sensible
0       age  age_range           0.979564                     True


Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=A : 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 16.97it/s]
Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=B : 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 18.27it/s]
Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=D : 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 18.07it/s]
Processing: sensitive_col=age_range, sensitive_value=60-inf, target_label=C : 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 17.77it/s]
Processing: sensitive_col=gender, sensitive_value=M, target_label=A :   0%|                                                                                     

In [9]:
# Fairness results table: one row per (sensitive feature, sensitive value, target label)
# with independence, separation and sufficiency scores, their letter categories and weights.
f.fairness_info

Unnamed: 0,sensitive_feature,sensitive_value,is_binary_sensitive_feature,target_label,independence_score,independence_category,independence_score_weight,separation_score,separation_category,separation_score_weight,sufficiency_score,sufficiency_category,sufficiency_score_weight
0,age_range,20-29,False,A,0.091699,C,0.155454,0.102672,C,0.155454,0.013776,A+,0.118271
1,age_range,30-39,False,A,0.05134,B,0.06929,0.007547,A+,0.06929,0.008601,A+,0.055477
2,age_range,40-49,False,A,0.052768,B,0.036512,0.01066,A+,0.036512,0.033104,A,0.028821
3,age_range,50-59,False,A,0.119848,C,0.026432,0.103097,C,0.026432,0.001776,A+,0.023968
4,age_range,60-inf,False,A,0.12051,C,0.019786,0.198523,D,0.019786,0.137346,C,0.023445
5,age_range,20-29,False,B,0.06109,B,0.092063,0.064757,B,0.092063,0.084332,C,0.105279
6,age_range,30-39,False,B,0.008397,A+,0.050549,0.036001,A,0.050549,0.011037,A+,0.05249
7,age_range,40-49,False,B,0.024911,A,0.031509,0.040302,A,0.031509,0.027575,A,0.03233
8,age_range,50-59,False,B,0.053249,B,0.038229,0.079066,B,0.038229,0.069346,B,0.03233
9,age_range,60-inf,False,B,0.118553,C,0.035242,0.177754,D,0.035242,0.047412,A,0.027477


In [12]:
# Target labels held by the fitted Fairness object (in the order the library stores them, not sorted).
f.target_values

array(['A', 'B', 'D', 'C'], dtype=object)