In [5]:
from folktables import ACSDataSource, ACSIncome, ACSPublicCoverage, generate_categories
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from fairlearn.reductions import ExponentiatedGradient
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from fairlearn.reductions import DemographicParity, EqualizedOdds

# Synthetic dataset generator (make_classification), an alternative to the ACS data
from sklearn.datasets import make_classification

In [7]:


# Point folktables at the 2018 1-Year ACS person survey and pull the California
# records (download=True asks folktables to fetch the raw files).
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
ca_data = data_source.get_data(states=["CA"], download=True)

# Turn the raw records into the ACSPublicCoverage feature and label frames;
# the third return value is not used here.
ca_features, ca_labels, _ = ACSPublicCoverage.df_to_pandas(ca_data)

# Persist both frames as CSV files without the index column.
for frame, csv_path in (
    (ca_features, 'public_features.csv'),
    (ca_labels, 'public_labels.csv'),
):
    frame.to_csv(csv_path, index=False)



In [11]:
# Fetch the definitions table that accompanies the data source, then build a
# per-feature mapping from raw codes to human-readable labels for the
# ACSPublicCoverage feature list.
definition_df = data_source.get_definitions(download=True)
task_features = ACSPublicCoverage.features
categories = generate_categories(
    features=task_features,
    definition_df=definition_df,
)

In [12]:
# Display the feature -> {code: description} mapping built in the previous cell.
categories

{'SCHL': {1.0: 'No schooling completed',
  2.0: 'Nursery school, preschool',
  3.0: 'Kindergarten',
  4.0: 'Grade 1',
  5.0: 'Grade 2',
  6.0: 'Grade 3',
  7.0: 'Grade 4',
  8.0: 'Grade 5',
  9.0: 'Grade 6',
  10.0: 'Grade 7',
  11.0: 'Grade 8',
  12.0: 'Grade 9',
  13.0: 'Grade 10',
  14.0: 'Grade 11',
  15.0: '12th grade - no diploma',
  16.0: 'Regular high school diploma',
  17.0: 'GED or alternative credential',
  18.0: 'Some college, but less than 1 year',
  19.0: '1 or more years of college credit, no degree',
  20.0: "Associate's degree",
  21.0: "Bachelor's degree",
  22.0: "Master's degree",
  23.0: "Professional degree beyond a bachelor's degree",
  24.0: 'Doctorate degree',
  nan: 'N/A (less than 3 years old)'},
 'MAR': {1: 'Married',
  2: 'Widowed',
  3: 'Divorced',
  4: 'Separated',
  5: 'Never married or under 15 years old',
  nan: 'N/A'},
 'SEX': {1: 'Male', 2: 'Female', nan: 'N/A'},
 'DIS': {1: 'With a disability', 2: 'Without a disability', nan: 'N/A'},
 'ESP': {1.0: '

In [None]:
import pandas as pd
# Load the complete (not yet split) feature table. It is deliberately NOT
# called X_train: that name is bound to the real training split by
# train_test_split in the next cell.
# NOTE(review): the download cell writes 'public_features.csv' /
# 'public_labels.csv', while this cell reads from 'data/' - confirm that these
# are the intended files.
features_all = pd.read_csv('data/features.csv')

# SEX is the sensitive attribute: keep it as its own one-column frame and
# remove it from the model inputs.
sensitive_attr = features_all.loc[:, ['SEX']]
X = features_all.drop(columns='SEX')

# Squeeze the single label column into a 1-D Series. scikit-learn expects a
# 1-D target and emits a DataConversionWarning when handed a column-vector
# DataFrame. (A file with several columns would be left as a DataFrame.)
y = pd.read_csv('data/labels.csv').squeeze("columns")

In [None]:
import warnings

# Suppress only UserWarnings.
# NOTE: scikit-learn's ConvergenceWarning and DataConversionWarning are
# UserWarning subclasses, so this filter hides them as well. Remove it when
# checking whether the logistic regressions actually converge.
warnings.filterwarnings('ignore', category=UserWarning)

# Single seed for the train/test split and for the randomized fair predictions.
SEED = 42


def report_fairness(label, y_true, y_hat, sensitive):
    """Print and return the evaluation metrics for one set of predictions.

    Parameters
    ----------
    label : str
        Prefix for the printed line (e.g. "Baseline").
    y_true : array-like
        Ground-truth labels.
    y_hat : array-like
        Hard (0/1) predictions. Because the ROC AUC is computed from hard
        labels rather than scores, it reduces to balanced accuracy.
    sensitive : array-like
        Sensitive feature values aligned with ``y_true``.

    Returns
    -------
    tuple of float
        ``(auc, demographic_parity_difference, equalized_odds_difference)``.
    """
    auc_value = roc_auc_score(y_true, y_hat)
    dp_value = demographic_parity_difference(y_true, y_hat, sensitive_features=sensitive)
    eo_value = equalized_odds_difference(y_true, y_hat, sensitive_features=sensitive)
    print(f"{label} AUC: {auc_value:.4f}, DP Difference: {dp_value:.4f}, EO Difference: {eo_value:.4f}")
    return auc_value, dp_value, eo_value


# Split data (features, labels and sensitive attribute are split together so
# the rows stay aligned).
X_train, X_test, y_train, y_test, s_train, s_test = train_test_split(
    X, y, sensitive_attr, test_size=0.2, random_state=SEED
)

# Standard logistic regression for baseline comparison
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluate baseline fairness
auc, dp_diff, eo_diff = report_fairness("Baseline", y_test, y_pred, s_test)

# Apply fairness constraints using Fairlearn
fair_model = ExponentiatedGradient(LogisticRegression(), constraints=EqualizedOdds())
fair_model.fit(X_train, y_train, sensitive_features=s_train)
# ExponentiatedGradient.predict is randomized; pass a seed so the reported
# numbers are reproducible across runs.
y_fair_pred = fair_model.predict(X_test, random_state=SEED)

# Evaluate fair model
auc_fair, dp_diff_fair, eo_diff_fair = report_fairness("Fair Model", y_test, y_fair_pred, s_test)


Baseline AUC: 0.5875, DP Difference: 0.0287, EO Difference: 0.0451
Fair Model AUC: 0.5972, DP Difference: 0.0096, EO Difference: 0.0234


In [17]:
from sklearn.metrics import accuracy_score
# Plain accuracy of the baseline and of the fairness-constrained model on the
# same held-out labels, reported as percentages.
accuracy, fair_accuracy = (
    accuracy_score(y_test, predictions) for predictions in (y_pred, y_fair_pred)
)
print(
    f' Accuracy: {accuracy * 100:.2f}%',
    f'Fair Accuracy: {fair_accuracy * 100:.2f}%',
    sep='\n',
)


 Accuracy: 67.22%
Fair Accuracy: 67.97%
