In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
import numpy as np

# Load the adult.data dataset (replace with the correct path)
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

# Load data (adjust path to the dataset as needed)
df = pd.read_csv('adult.data', header=None, names=column_names, na_values=' ?', skipinitialspace=True)

# Preprocessing: Convert categorical columns to numerical using One-Hot Encoding and handle protected attributes (sex, race)
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'native-country']

# Convert 'sex' and 'race' to binary encoding
df['sex'] = df['sex'].apply(lambda x: 1 if x.strip() == 'Male' else 0)
df['race'] = df['race'].apply(lambda x: 1 if x.strip() == 'White' else 0)

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=categorical_columns)

# Convert target variable (income) to binary: 1 if '>50K' else 0
df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)

# Split data into features (X) and target (y)
X = df.drop(columns=['income'])
y = df['income']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest model (no SMOTE)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)[:, 1]

# Calculate overall model performance using ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f'Overall ROC AUC: {roc_auc:.4f}')

# Calculate confusion matrix for the model
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Calculate TPR, FPR, FNR for the entire dataset
TPR = tp / (tp + fn)  # True Positive Rate (Recall)
FPR = fp / (fp + tn)  # False Positive Rate
FNR = fn / (fn + tp)  # False Negative Rate
print(f"Overall TPR: {TPR:.4f}, FPR: {FPR:.4f}, FNR: {FNR:.4f}")

# Define a function to calculate fairness metrics for race/sex
def calculate_fairness_metrics(mask, y_pred, y_test):
    y_pred_group = y_pred[mask]
    y_test_group = y_test[mask]
    tn_group, fp_group, fn_group, tp_group = confusion_matrix(y_test_group, y_pred_group).ravel()
    TPR_group = tp_group / (tp_group + fn_group)
    FPR_group = fp_group / (fp_group + tn_group)
    FNR_group = fn_group / (fn_group + tp_group)
    return TPR_group, FPR_group, FNR_group

# Separate by 'sex' (male vs female) and 'race' (white vs others)
# Mask for sex groups (Male = 1, Female = 0)
sex_mask_male = X_test['sex'] == 1
sex_mask_female = X_test['sex'] == 0

# Mask for race groups (White = 1, others = 0)
race_mask_white = X_test['race'] == 1
race_mask_others = X_test['race'] == 0

# Calculate fairness metrics for each group
# For sex (Male vs Female)
TPR_male, FPR_male, FNR_male = calculate_fairness_metrics(sex_mask_male, y_pred, y_test)
TPR_female, FPR_female, FNR_female = calculate_fairness_metrics(sex_mask_female, y_pred, y_test)

# For race (White vs Others)
TPR_white, FPR_white, FNR_white = calculate_fairness_metrics(race_mask_white, y_pred, y_test)
TPR_others, FPR_others, FNR_others = calculate_fairness_metrics(race_mask_others, y_pred, y_test)

# Print fairness metrics for sex
print(f"\nFairness Metrics for Male Group:")
print(f"True Positive Rate (TPR) - Male: {TPR_male:.4f}")
print(f"False Positive Rate (FPR) - Male: {FPR_male:.4f}")
print(f"False Negative Rate (FNR) - Male: {FNR_male:.4f}")

print(f"\nFairness Metrics for Female Group:")
print(f"True Positive Rate (TPR) - Female: {TPR_female:.4f}")
print(f"False Positive Rate (FPR) - Female: {FPR_female:.4f}")
print(f"False Negative Rate (FNR) - Female: {FNR_female:.4f}")

# Print fairness metrics for race
print(f"\nFairness Metrics for White Group:")
print(f"True Positive Rate (TPR) - White: {TPR_white:.4f}")
print(f"False Positive Rate (FPR) - White: {FPR_white:.4f}")
print(f"False Negative Rate (FNR) - White: {FNR_white:.4f}")

print(f"\nFairness Metrics for Other Races Group:")
print(f"True Positive Rate (TPR) - Others: {TPR_others:.4f}")
print(f"False Positive Rate (FPR) - Others: {FPR_others:.4f}")
print(f"False Negative Rate (FNR) - Others: {FNR_others:.4f}")

# Calculate Equality of Opportunity (Separation) for sex and race
# For sex:
equality_of_opportunity_sex = abs(TPR_male - TPR_female)

# For race:
equality_of_opportunity_race = abs(TPR_white - TPR_others)

# Calculate Equality of Odds (FPR and FNR) for sex and race
# For sex:
equality_of_odds_fpr_sex = abs(FPR_male - FPR_female)
equality_of_odds_fnr_sex = abs(FNR_male - FNR_female)

# For race:
equality_of_odds_fpr_race = abs(FPR_white - FPR_others)
equality_of_odds_fnr_race = abs(FNR_white - FNR_others)

# Print Equality of Opportunity and Equality of Odds results
print(f"\nEquality of Opportunity (Separation) between Male and Female: {equality_of_opportunity_sex:.4f}")
print(f"Equality of Opportunity (Separation) between White and Other Races: {equality_of_opportunity_race:.4f}")

print(f"\nEquality of Odds difference in FPR between Male and Female: {equality_of_odds_fpr_sex:.4f}")
print(f"Equality of Odds difference in FNR between Male and Female: {equality_of_odds_fnr_sex:.4f}")

print(f"\nEquality of Odds difference in FPR between White and Others: {equality_of_odds_fpr_race:.4f}")
print(f"Equality of Odds difference in FNR between White and Others: {equality_of_odds_fnr_race:.4f}")

Overall ROC AUC: 0.8999
Overall TPR: 0.6154, FPR: 0.0732, FNR: 0.3846

Fairness Metrics for Male Group:
True Positive Rate (TPR) - Male: 0.6302
False Positive Rate (FPR) - Male: 0.1020
False Negative Rate (FNR) - Male: 0.3698

Fairness Metrics for Female Group:
True Positive Rate (TPR) - Female: 0.5294
False Positive Rate (FPR) - Female: 0.0280
False Negative Rate (FNR) - Female: 0.4706

Fairness Metrics for White Group:
True Positive Rate (TPR) - White: 0.6167
False Positive Rate (FPR) - White: 0.0794
False Negative Rate (FNR) - White: 0.3833

Fairness Metrics for Other Races Group:
True Positive Rate (TPR) - Others: 0.6020
False Positive Rate (FPR) - Others: 0.0402
False Negative Rate (FNR) - Others: 0.3980

Equality of Opportunity (Separation) between Male and Female: 0.1008
Equality of Opportunity (Separation) between White and Other Races: 0.0147

Equality of Odds difference in FPR between Male and Female: 0.0741
Equality of Odds difference in FNR between Male and Female: 0.1008

