# Applying the fairness calculation on the Titanic dataset

In [2]:
import warnings
warnings.filterwarnings("ignore", module='aif360')

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import Reweighing

# Load Titanic dataset
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
df = pd.read_csv(url)

df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [15]:
# Preprocess the data
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})  # Convert 'Sex' to binary: 1 for male, 0 for female
df.drop(['Name'], axis=1, inplace=True)  # Drop the 'Name' column

# Split data into training and testing sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Convert dataframes into BinaryLabelDataset format
train_bld = BinaryLabelDataset(df=train, label_names=['Survived'], protected_attribute_names=['Sex'])
test_bld = BinaryLabelDataset(df=test, label_names=['Survived'], protected_attribute_names=['Sex'])

# Compute fairness metric on original training dataset
metric_train_bld = BinaryLabelDatasetMetric(train_bld, unprivileged_groups=[{'Sex': 0}], privileged_groups=[{'Sex': 1}])
print(f'Original training dataset disparity: {metric_train_bld.mean_difference():.2f}')

# Mitigate bias by reweighing the dataset
RW = Reweighing(unprivileged_groups=[{'Sex': 0}], privileged_groups=[{'Sex': 1}])
train_bld_transformed = RW.fit_transform(train_bld)

# Compute fairness metric on transformed training dataset
metric_train_bld_transformed = BinaryLabelDatasetMetric(train_bld_transformed, unprivileged_groups=[{'Sex': 0}], privileged_groups=[{'Sex': 1}])
print(f'Transformed training dataset disparity: {metric_train_bld_transformed.mean_difference():.2f}')

Original training dataset disparity: 0.57
Transformed training dataset disparity: -0.00


In [19]:
from aif360.explainers import MetricTextExplainer, MetricJSONExplainer

text_expl = MetricTextExplainer(metric_train_bld)

json_expl = MetricJSONExplainer(metric_train_bld)

import json
def format_json(json_str):
    return json.dumps(json.loads(json_str, object_pairs_hook=OrderedDict), indent=2)

In [20]:
print(format_json(json_expl.num_positives()))

NameError: name 'json' is not defined

In [8]:
dfTransformed = train_bld_transformed.convert_to_dataframe()
dfTransformed[0].shape

(709, 7)

In [13]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [7]:
# Train a classifier (e.g., logistic regression) on the transformed dataset
scaler = StandardScaler()
X_train = scaler.fit_transform(train_bld_transformed.features)
y_train = train_bld_transformed.labels.ravel()
clf = LogisticRegression().fit(X_train, y_train)

# Test the classifier
X_test = scaler.transform(test_bld.features)
y_test = test_bld.labels.ravel()
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
report = classification_report(y_test, y_pred, target_names=["Not Survived", "Survived"])
print(report)


Accuracy: 0.7472
              precision    recall  f1-score   support

Not Survived       0.76      0.86      0.81       111
    Survived       0.71      0.55      0.62        67

    accuracy                           0.75       178
   macro avg       0.74      0.71      0.72       178
weighted avg       0.74      0.75      0.74       178

