In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds
from fairlearn.metrics import MetricFrame, selection_rate

# Step 1: Load the German Credit Data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
column_names = ["Status", "Duration", "Credit_history", "Purpose", "Credit_amount", "Savings", 
                "Employment", "Installment_rate", "Personal_status_sex", "Debtors", "Residence_since", 
                "Property", "Age", "Other_installment_plans", "Housing", "Existing_credits", 
                "Job", "Num_dependents", "Own_telephone", "Foreign_worker", "Credit_risk"]

data = pd.read_csv(url, delim_whitespace=True, names=column_names)

# Step 2: Preprocess the Data
# Handling missing values if any (though this dataset is usually clean)
data = data.dropna()

# Encode categorical features
categorical_columns = ["Status", "Credit_history", "Purpose", "Savings", "Employment", 
                       "Personal_status_sex", "Debtors", "Property", "Other_installment_plans", 
                       "Housing", "Job", "Own_telephone", "Foreign_worker"]
data[categorical_columns] = data[categorical_columns].apply(LabelEncoder().fit_transform)

# Map 'Credit_risk' to binary labels: Assume 1 is good credit risk, 2 is bad credit risk
data["Credit_risk"] = data["Credit_risk"].map({1: 0, 2: 1})  # Map to 0 (good) and 1 (bad)

# Select 'Credit_risk' as the target variable
X = data.drop("Credit_risk", axis=1)
y = data["Credit_risk"]
sensitive_feature = data["Foreign_worker"]  # Using 'Foreign_worker' as a proxy for Race

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test, sensitive_train, sensitive_test = train_test_split(
    X, y, sensitive_feature, test_size=0.25, random_state=42)

# Step 4: Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")

# Step 5: Check Bias Percentage Before Mitigation
# Define the selection rate metric
metrics = {'selection_rate': selection_rate}

# Create a MetricFrame to calculate the selection rate by group (before mitigation)
metric_frame = MetricFrame(
    metrics=metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sensitive_test
)

# Calculate Disparate Impact (before mitigation)
sr_by_group = metric_frame.by_group['selection_rate']
disparate_impact = sr_by_group.min() / sr_by_group.max()
bias_percentage = (1 - disparate_impact) * 100

# Print the results before mitigation
print("Bias metrics before mitigation:")
print(f"Selection Rate by group: {sr_by_group}")
print(f"Disparate Impact: {disparate_impact:.2f}")
print(f"Bias Percentage: {bias_percentage:.2f}%")

# Step 6: Apply Fairness Constraints
mitigator = ExponentiatedGradient(estimator=model, constraints=EqualizedOdds())
mitigator.fit(X_train, y_train, sensitive_features=sensitive_train)
y_pred_mitigated = mitigator.predict(X_test)

# Evaluate the mitigated model
accuracy_mitigated = accuracy_score(y_test, y_pred_mitigated)
print(f"Mitigated Model accuracy: {accuracy_mitigated:.2f}")

# Step 7: Check Bias Percentage After Mitigation
# Define a dictionary of metrics for the mitigated model
metrics_mitigated = {'selection_rate': selection_rate}

# Create a MetricFrame to calculate the selection rate by group (after mitigation)
metric_frame_mitigated = MetricFrame(
    metrics=metrics_mitigated,
    y_true=y_test,
    y_pred=y_pred_mitigated,
    sensitive_features=sensitive_test
)

# Calculate Disparate Impact (after mitigation)
sr_by_group_mitigated = metric_frame_mitigated.by_group['selection_rate']
disparate_impact_mitigated = sr_by_group_mitigated.min() / sr_by_group_mitigated.max()
bias_percentage_mitigated = (1 - disparate_impact_mitigated) * 100

# Print the results after mitigation
print("Bias metrics after mitigation:")
print(f"Selection Rate by group: {sr_by_group_mitigated}")
print(f"Disparate Impact: {disparate_impact_mitigated:.2f}")
print(f"Bias Percentage: {bias_percentage_mitigated:.2f}%")


Model accuracy: 0.75
Bias metrics before mitigation:
Selection Rate by group: Foreign_worker
0    0.211618
1    0.222222
Name: selection_rate, dtype: float64
Disparate Impact: 0.95
Bias Percentage: 4.77%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mitigated Model accuracy: 0.74
Bias metrics after mitigation:
Selection Rate by group: Foreign_worker
0    0.224066
1    0.333333
Name: selection_rate, dtype: float64
Disparate Impact: 0.67
Bias Percentage: 32.78%


In [1]:
!pip install matplotlib



In [2]:
pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
statlog_german_credit_data = fetch_ucirepo(id=144) 
  
# data (as pandas dataframes) 
X = statlog_german_credit_data.data.features 
y = statlog_german_credit_data.data.targets 
  
# metadata 
print(statlog_german_credit_data.metadata) 
  
# variable information 
print(statlog_german_credit_data.variables) 


{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

In [4]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Fetch dataset
statlog_german_credit_data = fetch_ucirepo(id=144)

# Data (as pandas dataframes)
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets

# Combine features and target into one DataFrame
df = pd.concat([X, y], axis=1)

# Save the DataFrame to a CSV file
df.to_csv('statlog_german_credit_data.csv', index=False)

print("Data saved to statlog_german_credit_data.csv")


Data saved to statlog_german_credit_data.csv
