# Fairness example with XAIoGraphs


* Source Dataset: https://www.kaggle.com/datasets/kukuroo3/body-performance-data

In [1]:
import pandas as pd

# Load the body-performance CSV; the 'class' column is renamed to 'y_true'
# because it is used as the ground-truth target further down.
dataset_path = '../../datasets/bodyPerformance.csv'
df = pd.read_csv(dataset_path).rename(columns={'class': 'y_true'})
print('Dataset Shape {}'.format(df.shape))
df.sample(5)

Dataset Shape (13393, 12)


Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
13287,44.0,F,160.2,56.9,22.5,83.0,130.0,33.3,12.3,36.0,182.0,C
2594,27.0,M,181.2,73.8,22.7,69.0,125.0,40.1,18.2,49.0,228.0,C
1034,32.0,M,170.7,90.5,36.2,82.0,133.0,45.1,11.4,33.0,188.0,D
7267,28.0,M,182.6,71.7,10.2,72.0,133.0,43.3,18.1,62.0,269.0,A
6245,61.0,M,175.5,67.6,14.7,58.0,123.0,30.5,4.3,25.0,154.0,C


### Discretization of the "age" Feature

In [2]:
# Left-closed age buckets: [20, 30), [30, 40), [40, 50), [50, 60) and [60, inf).
age_bins = [20, 30, 40, 50, 60, float('inf')]
age_labels = ['20-29', '30-39', '40-49', '50-59', '60-inf']
age_range = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)

# Fail loudly instead of silently filing ages below 20 (or missing ages)
# under '60-inf', which is what a plain "else" fallback would do.
assert age_range.notna().all(), 'Found ages that are missing or below 20'
df['age_range'] = age_range.astype(str)

# Put 'age_range' in the first position. Selecting it by name (rather than
# "last column first") keeps the cell idempotent when it is re-run.
df = df[['age_range'] + [col for col in df.columns if col != 'age_range']]
df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
10523,40-49,44.0,M,169.5,75.6,24.6,99.0,147.0,38.1,7.6,38.0,205.0,C
12248,30-39,37.0,F,165.0,55.8,21.7,88.0,128.0,24.9,20.0,37.0,157.0,A
11482,20-29,28.0,M,174.3,79.9,29.1,69.0,118.0,38.3,5.0,36.0,199.0,D
12888,50-59,53.0,F,147.3,54.0,36.7,71.0,139.0,26.4,23.9,2.0,124.0,D
12897,30-39,31.0,M,175.3,84.2,25.6,78.0,138.0,52.8,0.2,43.0,219.0,D


### Label encoder

In [3]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column; they are kept around so the integer
# codes can be mapped back to the original labels after prediction.
lb_gen, lb_age_range, lb_y = LabelEncoder(), LabelEncoder(), LabelEncoder()

column_encoders = (
    ('gender', lb_gen),
    ('age_range', lb_age_range),
    ('y_true', lb_y),
)
for column, encoder in column_encoders:
    df[column] = encoder.fit_transform(df[column])

df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true
10852,1,30.0,1,170.0,69.4,19.9,69.0,116.0,39.7,14.3,48.0,219.0,1
12628,2,43.0,0,158.8,45.2,21.1,72.0,135.0,28.8,21.1,30.0,172.0,1
835,3,56.0,0,159.5,53.9,31.3,70.0,124.0,20.7,32.0,24.0,135.0,1
9392,0,25.0,1,166.7,65.9,16.0,71.0,127.0,36.6,16.8,58.0,231.0,0
7425,1,38.0,0,171.4,56.3,29.6,57.0,96.0,23.4,-11.1,20.0,140.0,3


### Train-Test split


In [4]:
from sklearn.model_selection import train_test_split

# Target is 'y_true'; every other column is used as a feature.
y = df['y_true']
X = df.drop(columns=['y_true'])

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)


### Train Model 

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a small, depth-limited random forest with a fixed seed.
model = RandomForestClassifier(
    n_estimators=20,
    max_depth=10,
    criterion='gini',
    bootstrap=True,
    random_state=123,
)
model.fit(X_train, y_train)

# Predict on both partitions so train and test accuracy can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)

print('Train Accuracy: {}'.format(train_accuracy))
print('Test Accuracy: {}'.format(test_accuracy))
print('Test Metrics:\n{}'.format(test_report))


Train Accuracy: 0.8424533333333334
Test Accuracy: 0.7165256346441016
Test Metrics:
             precision    recall  f1-score   support

          0       0.71      0.87      0.78      1024
          1       0.58      0.58      0.58       987
          2       0.71      0.63      0.67      1039
          3       0.89      0.79      0.84       968

avg / total       0.72      0.72      0.72      4018



### Predict on the whole dataset

In [6]:
from copy import deepcopy
# Use exactly the feature columns (and order) the model was trained on.
# Deriving them from `df` instead would pick up 'y_predict' as an extra
# feature as soon as this cell is executed a second time.
x_cols = X.columns.tolist()

# Calculate predictions for every row (train and test partitions alike)
df['y_predict'] = model.predict(df[x_cols])
df.sample(5)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true,y_predict
8045,0,27.0,1,173.8,87.0,24.1,95.0,158.0,39.9,22.5,50.0,189.0,3,1
13269,2,46.0,1,179.0,81.2,26.7,98.0,159.0,42.7,6.5,29.0,211.0,3,3
60,1,37.0,1,168.9,78.8,23.4,88.0,144.0,42.1,20.9,47.0,241.0,2,0
7446,0,22.0,1,170.8,86.9,29.4,89.0,155.0,41.2,9.4,48.0,199.0,3,3
9082,1,38.0,1,176.9,68.3,17.0,76.0,124.0,48.1,14.9,49.0,230.0,0,0


### Undo the label encoding of features

In [7]:
import warnings
# Ignore DeprecationWarning from here on (the filter stays installed for the session).
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Map the integer codes back to their original labels. 'y_predict' is decoded
# with the same encoder as 'y_true' because the model predicts encoded targets.
column_decoders = (
    ('age_range', lb_age_range),
    ('gender', lb_gen),
    ('y_true', lb_y),
    ('y_predict', lb_y),
)
for column, encoder in column_decoders:
    df[column] = encoder.inverse_transform(df[column])

df.sample(10)

Unnamed: 0,age_range,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,y_true,y_predict
112,30-39,34.0,M,177.8,76.4,28.9,82.0,130.0,41.1,5.3,38.0,194.0,C,C
6966,60-inf,63.0,M,168.7,59.48,20.9,89.0,150.0,32.3,18.0,27.0,151.0,C,B
10640,30-39,37.0,F,157.3,61.1,33.4,69.0,118.0,24.7,19.9,25.0,131.0,D,C
5256,20-29,26.0,F,153.4,48.6,26.3,74.0,118.0,19.1,13.6,29.0,142.0,C,C
13218,20-29,24.0,F,171.1,72.5,28.8,82.0,119.0,34.1,29.5,52.0,197.0,B,B
9046,20-29,27.0,F,162.3,59.28,25.3,76.0,155.0,29.1,24.4,47.0,179.0,A,A
41,50-59,52.0,M,173.6,84.9,30.3,93.0,144.0,42.0,9.0,43.0,185.0,D,D
2202,40-49,48.0,F,160.5,58.0,29.8,60.0,109.0,25.0,15.0,37.0,140.0,B,C
12969,50-59,55.0,M,164.3,62.5,21.4,81.0,133.0,43.8,10.7,28.0,166.0,C,C
1297,50-59,58.0,M,165.0,61.6,19.0,78.0,121.0,37.6,17.0,38.0,173.0,B,A


<hr>

# Fairness with XAIoGraphs

In [8]:
from xaiographs.fairness import Fairness

# Columns handed to the fairness analysis: the two sensitive attributes plus
# the ground-truth and predicted labels.
fairness_args = dict(
    df=df,
    sensitive_cols=['age_range', 'gender'],
    target_col='y_true',
    predict_col='y_predict',
)

f = Fairness(destination_path='./xaiographs_web_files')

# NOTE(review): the recorded run of this cell ends in
# "TypeError: aggregate() missing 1 required positional argument: 'func_or_funcs'".
# Presumably the installed pandas is older than what xaiographs expects - confirm
# the environment's pandas/xaiographs versions before relying on this output.
f.fit_fairness(**fairness_args)



Enconding "gender" column: 100%|██████████| 12/12 [00:00<00:00, 364.58it/s]
Checking "broad jump_cm" column: 100%|██████████| 12/12 [00:00<00:00, 316.64it/s]


Highly correlated variables above the 0.9 Threshold
  feature_1  feature_2  correlation_value  is_correlation_sensible
0       age  age_range           0.979564                     True


Processing: sensitive_col=age_range, sensitive_value=20-29, target_label=A :   0%|          | 0/5 [00:00<?, ?it/s]


TypeError: aggregate() missing 1 required positional argument: 'func_or_funcs'