# Classical ML + Fake Dataset
- take decision tree to see if there is more or less 'fairness' 

In [1]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from aif360.metrics import ClassificationMetric
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import StandardDataset

pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [2]:
# import the dataset
fake_data_df = pd.read_csv("../aif360/Data/fake_data.csv")
fake_data_df.head()

Unnamed: 0,age,income,gender,car,target
0,45,29923,male,yes,0
1,39,75755,male,yes,0
2,18,73277,male,yes,1
3,37,24442,male,yes,0
4,34,58901,male,yes,0


### Preprocess The Data
- remap the values 

In [4]:
# remap gender to binary
map = {'male': 0, 'female': 1}

# replace values in gender column
fake_data_df['gender'].replace(map, inplace=True)

#remap the car column to binary
map = {'yes': 0, 'no': 1}

# replace values in car column
fake_data_df['car'].replace(map, inplace=True)
# spit the data into y and x 
y = fake_data_df['target']
X = fake_data_df.drop(columns=['target'])

# split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Transform the data into Standard Dataset

In [9]:
# all training data into Standard Dataset
train_all = pd.concat([X_train, y_train], axis=1)

# all testing data into Standard Dataset
test_all = pd.concat([X_test, y_test], axis=1)

privileged_groups = [{'gender': 1}]
unprivileged_groups = [{'gender': 0}]

In [10]:
# create the StandardDataset
train_dataset = StandardDataset(train_all, 
                          label_name='target', 
                          protected_attribute_names=['gender'],
                          favorable_classes=[1],
                          privileged_classes=[[1]]
                          )

In [11]:
# create the StandardDataset
test_dataset = StandardDataset(test_all, 
                          label_name='target', 
                          protected_attribute_names=['gender'],
                          favorable_classes=[1],
                          privileged_classes=[[1]]
                          )

### Train the simple Decision Tree model
- no hyperparam tuning 

In [13]:
# train the decision tree on the x_train and y_train
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# predict the y_test
y_pred = clf.predict(X_test)

# calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: ", accuracy)

Accuracy:  0.88


#### Investigate if the Decision Tree is 'fair'

In [15]:
# create standard dataset for the predicted values 
predicted_test_dataset = test_dataset.copy()
predicted_test_dataset.labels = y_pred.reshape(-1, 1)

# Classification metrics 
metric_prediction = ClassificationMetric(test_dataset, 
                                        predicted_test_dataset, 
                                        unprivileged_groups=unprivileged_groups, 
                                        privileged_groups=privileged_groups)

In [16]:
# compute the fairness of metric prediction
print("The consistency is:", metric_prediction.consistency()) 

# compute the fairness of metric prediction
print("The disparate impact is:", metric_prediction.disparate_impact())

# compute the fairness of metric prediction
print("The equal opportunity difference is:", metric_prediction.equal_opportunity_difference())

# compute the fairness of metric prediction
print("The average odds difference is:", metric_prediction.average_odds_difference()) # This is NAN?? 

# compute the fairness of metric prediction
print("The statistical parity difference is:", metric_prediction.statistical_parity_difference())

The consistency is: [0.683]
The disparate impact is: 0.13071895424836602
The equal opportunity difference is: -0.625
The average odds difference is: nan
The statistical parity difference is: -0.869281045751634


  TPR=TP / P, TNR=TN / N, FPR=FP / N, FNR=FN / P,
  GTPR=GTP / P, GTNR=GTN / N, GFPR=GFP / N, GFNR=GFN / P,
