In [13]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# Deserialize the two models and the test data from disk.
# NOTE: unpickling can execute arbitrary code, so these paths must only ever
# point at trusted artifacts.
def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


good_model = _read_pickle('../models/good_model.pkl')
bad_model = _read_pickle('../models/bad_model.pkl')
test_data = _read_pickle('../data/test_data.pkl')

# Feature matrix and true labels used by the fairness check below.
X_test, y_test = test_data['X_test'], test_data['y_test']

# Positional index of the age column in X_test (taken from the protected indices).
age_index = 216

def test_age_discrimination(model, X_test, y_test, age_col=None):
    """Compare a model's approval rate and accuracy across age quartiles.

    Rows are split into four equal-frequency bins on the age column
    (``pd.qcut`` with ``q=4``). For every bin that contains rows, the model's
    predictions on those rows are scored against the true labels.

    Args:
        model: Object exposing ``predict(X)``. A prediction equal to ``1`` is
            counted as an approval.
        X_test: Feature ``DataFrame``; the age feature is read by position.
        y_test: True labels, one per row of ``X_test`` and in the same row
            order. Matched to rows by position, so its index (if any) is
            ignored.
        age_col: Positional index of the age column in ``X_test``. When
            omitted, the module-level ``age_index`` is used.

    Returns:
        dict with three keys:
            ``'age_group_metrics'``: maps each age-group label to a dict with
                ``'approval_rate'`` and ``'accuracy'``.
            ``'max_approval_disparity'``: highest minus lowest approval rate
                over the groups.
            ``'interpretation'``: one-line human-readable summary.

    Rows whose age value is missing belong to no quartile and are left out of
    every group.
    """
    column = age_index if age_col is None else age_col
    age_values = X_test.iloc[:, column]
    age_groups = pd.qcut(age_values, q=4, labels=['youngest', 'young', 'middle', 'oldest'])

    # Work on a plain array so label selection is purely positional and cannot
    # be thrown off by y_test carrying a different index than X_test.
    y_true = np.asarray(y_test)

    group_metrics = {}
    # qcut assigns NaN to rows with a missing age. NaN never compares equal to
    # anything, so treating it as a group would yield an empty selection and a
    # predict() call on zero rows; dropna() keeps only real quartiles while
    # preserving their order of first appearance.
    for group in age_groups.dropna().unique():
        mask = (age_groups == group).to_numpy()
        group_preds = np.asarray(model.predict(X_test[mask]))
        group_true = y_true[mask]

        group_metrics[group] = {
            'approval_rate': np.mean(group_preds == 1),
            'accuracy': accuracy_score(group_true, group_preds)
        }

    approval_rates = [metrics['approval_rate'] for metrics in group_metrics.values()]
    max_disparity = max(approval_rates) - min(approval_rates)

    return {
        'age_group_metrics': group_metrics,
        'max_approval_disparity': max_disparity,
        'interpretation': f"Maximum approval rate disparity between age groups: {max_disparity:.2%}"
    }

if __name__ == "__main__":
    def _print_report(results):
        """Print the summary line, then per-group approval rate and accuracy."""
        print(results['interpretation'])
        print("\nAge group metrics:")
        for label, stats in results['age_group_metrics'].items():
            print(f"{label}:")
            print(f"  Approval rate: {stats['approval_rate']:.2%}")
            print(f"  Accuracy: {stats['accuracy']:.2%}")

    # Each heading is printed before its model is evaluated, so a failure
    # during evaluation is reported under the right heading.
    print("Testing good model:")
    good_results = test_age_discrimination(good_model, X_test, y_test)
    _print_report(good_results)

    print("\nTesting bad model:")
    bad_results = test_age_discrimination(bad_model, X_test, y_test)
    _print_report(bad_results)

Testing good model:
Maximum approval rate disparity between age groups: 16.35%

Age group metrics:
youngest:
  Approval rate: 24.61%
  Accuracy: 82.66%
oldest:
  Approval rate: 8.26%
  Accuracy: 86.07%
middle:
  Approval rate: 9.40%
  Accuracy: 90.31%
young:
  Approval rate: 13.83%
  Accuracy: 87.24%

Testing bad model:
Maximum approval rate disparity between age groups: 37.37%

Age group metrics:
youngest:
  Approval rate: 42.24%
  Accuracy: 79.98%
oldest:
  Approval rate: 4.87%
  Accuracy: 87.19%
middle:
  Approval rate: 9.42%
  Accuracy: 92.31%
young:
  Approval rate: 17.86%
  Accuracy: 88.71%
