In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from aif360.datasets import CompasDataset
from aif360.algorithms.preprocessing import Reweighing
from fairlearn.metrics import (
    demographic_parity_difference,
    equalized_odds_difference,
)
import warnings
warnings.filterwarnings("ignore")

pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[inFairness]'


#### STEP 1: Load COMPAS data

In [3]:
# Build the aif360 COMPAS dataset object. The constructor reads
# `compas-scores-two-years.csv` from aif360's own `data/raw/compas` folder and
# aborts with an IOError / SystemExit when that file has not been downloaded.
compas_data = CompasDataset()

# convert_to_dataframe() returns a sequence; only its first item (the
# DataFrame) is used by the rest of the notebook.
compas_frames = compas_data.convert_to_dataframe()
df = compas_frames[0]

print("✅ Loaded COMPAS dataset with shape:", df.shape)

IOError: [Errno 2] No such file or directory: '/home/sidnaik04/Documents/BiasBuster/code_demo/venv/lib/python3.12/site-packages/aif360/datasets/../data/raw/compas/compas-scores-two-years.csv'
To use this class, please download the following file:

	https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv

and place it, as-is, in the folder:

	/home/sidnaik04/Documents/BiasBuster/code_demo/venv/lib/python3.12/site-packages/aif360/data/raw/compas



SystemExit: 1

# ----------------------------
# STEP 2: Prepare Data
# ----------------------------

In [None]:
# Column names for the sensitive attribute and the prediction target.
protected_attr = 'race'
label = 'two_year_recid'

# Target vector and group-membership vector as raw arrays.
y = df[label].values
A = df[protected_attr].values

# Feature matrix: every numeric column except the label, with missing values
# replaced by 0. Only `label` is dropped, so the protected attribute stays in
# the features whenever its dtype is numeric.
X = (
    df.drop(columns=[label])
      .select_dtypes(include=[np.number])
      .fillna(0)
)

# Standardise each feature; after this line X is a NumPy array, not a DataFrame.
X = StandardScaler().fit_transform(X)

# ----------------------------
# Helper: Disparate Impact
# ----------------------------

In [None]:
def disparate_impact(y_true, y_pred, A):
    """Ratio of positive-prediction rates: P(Y_hat=1 | unpriv) / P(Y_hat=1 | priv).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Not used in the computation; accepted so the
        signature matches the other metric calls in this notebook.
    y_pred : array-like
        Predicted labels, one per sample (the mean of each group's predictions
        is taken as that group's positive rate).
    A : array-like
        Protected-attribute value per sample. Samples with ``A == 1`` form the
        privileged group and samples with ``A == 0`` the unprivileged group;
        any other value is ignored.

    Returns
    -------
    float
        ``rate_unpriv / rate_priv``. Returns ``0.0`` when the privileged rate
        is exactly zero (the ratio is undefined there; 0.0 is a sentinel, not
        a measured value) and ``nan`` when either group has no samples.
    """
    # Coerce to arrays so plain lists / Series are masked element-wise. Without
    # this, `A == 1` on a list is the scalar False and `y_pred[False]` silently
    # picks element 0 instead of selecting a group.
    y_pred = np.asarray(y_pred)
    A = np.asarray(A)

    priv = (A == 1)
    unpriv = (A == 0)

    # A rate over an empty group is undefined; report that explicitly instead
    # of relying on the mean of an empty slice.
    if not priv.any() or not unpriv.any():
        return float("nan")

    rate_priv = y_pred[priv].mean()
    rate_unpriv = y_pred[unpriv].mean()
    if rate_priv == 0:
        return 0.0
    return float(rate_unpriv / rate_priv)

# ----------------------------
# Helper: Metrics
# ----------------------------

In [None]:
def compute_metrics(y_true, y_pred, A, name="Model"):
    """Evaluate one set of predictions, print a one-line summary, return a record.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    A : array-like
        Protected-attribute values, passed to the fairness metrics as
        ``sensitive_features`` and to ``disparate_impact`` as the group vector.
    name : str, default "Model"
        Label used in the printed summary and stored under the "Model" key.

    Returns
    -------
    dict
        Keys "Model", "Accuracy", "DP Diff", "EO Diff" and "DI".
    """
    parity_gap = demographic_parity_difference(y_true, y_pred, sensitive_features=A)
    odds_gap = equalized_odds_difference(y_true, y_pred, sensitive_features=A)
    impact_ratio = disparate_impact(y_true, y_pred, A)
    accuracy = accuracy_score(y_true, y_pred)

    summary = {
        "Model": name,
        "Accuracy": accuracy,
        "DP Diff": parity_gap,
        "EO Diff": odds_gap,
        "DI": impact_ratio,
    }

    print(f"\n{name}:")
    print(
        f"Accuracy: {summary['Accuracy']:.3f}, "
        f"DP Diff: {summary['DP Diff']:.3f}, "
        f"EO Diff: {summary['EO Diff']:.3f}, "
        f"DI: {summary['DI']:.3f}"
    )
    return summary

# ----------------------------
# STEP 3: Baseline Logistic Regression
# ----------------------------

In [None]:
# Unmitigated reference model. It is fitted on the full (X, y) and scored on
# the same rows, so the reported numbers are in-sample.
model = LogisticRegression(max_iter=1000).fit(X, y)

y_pred = model.predict(X)
baseline_results = compute_metrics(y, y_pred, A, name="Baseline")

# ----------------------------
# STEP 4: Oversampling
# ----------------------------

In [None]:
# Re-attach the label and protected attribute to the scaled features so rows
# can be resampled together.
df_full = pd.DataFrame(X)
df_full["y"] = y
df_full["A"] = A

# Pick the minority class from the observed counts instead of assuming it is
# y == 1. If class 1 were the larger class, resampling it to len(class 0)
# would shrink it (with replacement) rather than oversample the minority.
# Ties keep the original assignment (class 1 treated as the minority).
n_positive = int((df_full["y"] == 1).sum())
n_negative = int((df_full["y"] == 0).sum())
minority_label, majority_label = (1, 0) if n_positive <= n_negative else (0, 1)

minority = df_full[df_full["y"] == minority_label]
majority = df_full[df_full["y"] == majority_label]

# Sample the minority class with replacement until it matches the majority size.
minority_oversampled = resample(minority, replace=True,
                                n_samples=len(majority),
                                random_state=42)
df_balanced = pd.concat([majority, minority_oversampled])

X_os = df_balanced.drop(columns=["y", "A"]).values
y_os = df_balanced["y"].values
A_os = df_balanced["A"].values

# Train on the balanced sample, but evaluate on the original (unbalanced) rows.
model_os = LogisticRegression(max_iter=1000)
model_os.fit(X_os, y_os)
y_pred_os = model_os.predict(X)
oversample_results = compute_metrics(y, y_pred_os, A, "Oversampling")


# ------------------------------
# STEP 5: Threshold Optimization
# ------------------------------

In [None]:
# Positive-class probabilities from the baseline model; also reused by later
# threshold-search cells.
y_scores = model.predict_proba(X)[:, 1]
thresholds = np.linspace(0.1, 0.9, 30)

best_threshold = 0.5  # fallback if no candidate beats -inf
best_score = -np.inf

# Grid-search the decision threshold, scoring each candidate as
#   accuracy - 0.5 * (DP difference + EO difference).
# The search uses the same rows the final metrics are reported on.
for t in thresholds:
    y_pred_t = (y_scores >= t).astype(int)
    acc = accuracy_score(y, y_pred_t)
    dp = abs(demographic_parity_difference(y, y_pred_t, sensitive_features=A))
    eo = abs(equalized_odds_difference(y, y_pred_t, sensitive_features=A))
    fairness_penalty = (dp + eo)  # smaller = fairer
    score = acc - 0.5 * fairness_penalty  # balance fairness vs accuracy
    # Strict '>' keeps the first (lowest) threshold among tied scores.
    if score > best_score:
        best_score = score
        best_threshold = t

print(f"\n✅ Optimal threshold for fairness–accuracy tradeoff: {best_threshold:.3f}")

y_pred_th = (y_scores >= best_threshold).astype(int)
threshold_results = compute_metrics(y, y_pred_th, A, "Threshold Optimization")

# ----------------------------
# STEP 6: Reweighing
# ----------------------------

In [None]:
# Group definitions for the reweighing step: rows with race == 0 are the
# unprivileged group and rows with race == 1 the privileged group.
unprivileged_groups = [{'race': 0}]
privileged_groups = [{'race': 1}]

rw = Reweighing(unprivileged_groups=unprivileged_groups,
                privileged_groups=privileged_groups)
rw.fit(compas_data)
dataset_transf = rw.transform(compas_data)

# The per-row weights from the transformed dataset are passed to the classifier
# as sample weights; X and y come from the same dataset object, so this relies
# on the row order being unchanged.
instance_weights = dataset_transf.instance_weights

model_rw = LogisticRegression(max_iter=1000)
model_rw.fit(X, y, sample_weight=instance_weights)

y_pred_rw = model_rw.predict(X)
reweigh_results = compute_metrics(y, y_pred_rw, A, name="Reweighing")

# ----------------------------
# STEP 7: Multi-Objective Optimization (Threshold Fine-Tuning)
# ----------------------------
# Combine fairness & accuracy into a single optimization objective

In [None]:
# Second threshold search over the baseline scores, this time with the
# objective 0.7 * accuracy - 0.3 * mean(DP difference, EO difference).
# Every candidate is logged in `results_thresholds` as
# [threshold, accuracy, dp, eo, fairness, score].
results_thresholds = []
best_threshold_moo = 0.5  # fallback if no candidate beats -inf
best_score = -np.inf

for candidate in np.linspace(0.1, 0.9, 30):
    candidate_pred = (y_scores >= candidate).astype(int)

    candidate_acc = accuracy_score(y, candidate_pred)
    candidate_dp = abs(
        demographic_parity_difference(y, candidate_pred, sensitive_features=A)
    )
    candidate_eo = abs(
        equalized_odds_difference(y, candidate_pred, sensitive_features=A)
    )

    candidate_fairness = (candidate_dp + candidate_eo) / 2
    candidate_score = 0.7 * candidate_acc - 0.3 * candidate_fairness  # weighted tradeoff

    results_thresholds.append([
        candidate,
        candidate_acc,
        candidate_dp,
        candidate_eo,
        candidate_fairness,
        candidate_score,
    ])

    # Strict '>' means the earliest threshold wins among equal scores.
    if candidate_score > best_score:
        best_score, best_threshold_moo = candidate_score, candidate

y_pred_moo = (y_scores >= best_threshold_moo).astype(int)
moo_results = compute_metrics(y, y_pred_moo, A, name="Multi-Objective Opt")

# ----------------------------
# STEP 8: Visualization
# ----------------------------

In [None]:
# One row per strategy, in the order the strategies were run above.
results = pd.DataFrame([
    baseline_results,
    oversample_results,
    threshold_results,
    reweigh_results,
    moo_results,
])

# Two side-by-side panels: accuracy on the left, fairness metrics on the right.
fig, (ax_acc, ax_fair) = plt.subplots(1, 2, figsize=(12, 6))

ax_acc.bar(results["Model"], results["Accuracy"], color="cornflowerblue")
ax_acc.set_title("Accuracy Comparison")
ax_acc.set_ylabel("Accuracy")
ax_acc.tick_params(axis="x", labelrotation=15)

# Grouped bars: three metrics per model, each shifted by one bar width.
width = 0.25
x = np.arange(len(results))
for offset, column in zip((-width, 0, width), ("DP Diff", "EO Diff", "DI")):
    ax_fair.bar(x + offset, results[column], width, label=column)

ax_fair.set_xticks(x)
ax_fair.set_xticklabels(results["Model"], rotation=15)
ax_fair.set_title("Fairness Metrics Comparison")
ax_fair.legend()

fig.tight_layout()
plt.show()

print("\n✅ Pipeline completed with Disparate Impact & Multi-Objective Optimization.")