In [3]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

# --- Configuration -----------------------------------------------------------
# Both paths are relative to the kernel's working directory; adjust them if the
# notebook is launched from a different folder.
MODEL_PATH = '../biased_model.pkl'
DATA_PATH = '../data/investigation_train_large_checked.csv'

TARGET_COL = 'checked'
AGE_COL = 'persoon_leeftijd_bij_onderzoek'
TARGET_WIJK_COLS = [
    'adres_recentste_wijk_prins_alexa',
    'adres_recentste_wijk_delfshaven',
    'adres_recentste_wijk_feijenoord',
]

# Left-closed bins: [0, 25), [25, 35), [35, 45), [45, 55), [55, inf).
# This makes the '<25' label literally true (age 25 falls in '25-35') and keeps
# every age of 55 or more in '55+' instead of dropping ages above 100 to NaN.
AGE_BIN_EDGES = [0, 25, 35, 45, 55, np.inf]
AGE_BIN_LABELS = ['<25', '25-35', '35-45', '45-55', '55+']

# --- Load model and data -----------------------------------------------------
# SECURITY: pickle.load can execute arbitrary code while deserializing.
# Only load model files that come from a trusted source.
with open(MODEL_PATH, 'rb') as f:
    model = pickle.load(f)

df = pd.read_csv(DATA_PATH)
features = [col for col in df.columns if col != TARGET_COL]
X = df[features]

# --- Get predictions ---------------------------------------------------------
# NOTE(review): the means below are read as "check rates", which assumes
# model.predict returns numeric 0/1 (or boolean) labels -- confirm.
predictions = model.predict(X)

# --- Build the analysis frame ------------------------------------------------
# A row counts as "Target Neighborhoods" when any of the target wijk indicator
# columns is positive for that row.
in_target_wijk = df[TARGET_WIJK_COLS].sum(axis=1) > 0

analysis_df = pd.DataFrame({
    'age': df[AGE_COL],
    'neighborhood': np.where(in_target_wijk, 'Target Neighborhoods', 'Other'),
    'predicted': predictions,
})

analysis_df['age_group'] = pd.cut(
    analysis_df['age'],
    bins=AGE_BIN_EDGES,
    labels=AGE_BIN_LABELS,
    right=False,
)

# --- Summaries ---------------------------------------------------------------
# observed=False is passed explicitly so that empty age groups still appear in
# the tables and the result does not depend on the pandas version's default.
age_rates = analysis_df.groupby('age_group', observed=False)['predicted'].mean()

print("Check rates by age group:")
print(age_rates.round(3))

print("\nCheck rates by neighborhood:")
print(analysis_df.groupby('neighborhood')['predicted'].mean().round(3))

print("\nCheck rates by age and neighborhood:")
print(
    analysis_df
    .groupby(['age_group', 'neighborhood'], observed=False)['predicted']
    .mean()
    .unstack()
    .round(3)
)

# --- Disparity ---------------------------------------------------------------
youngest = age_rates['<25']
oldest = age_rates['55+']

# Guard against an empty group (NaN mean) or a zero rate in the denominator;
# format with a format spec so the code works for any float type.
if pd.notna(youngest) and pd.notna(oldest) and oldest != 0:
    print(f"\nAge disparity ratio (youngest/oldest): {youngest / oldest:.2f}x")
else:
    print("\nAge disparity ratio (youngest/oldest): undefined "
          "(empty age group or zero check rate for 55+)")

# --- Plot --------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=analysis_df, x='age_group', y='predicted', hue='neighborhood', ax=ax)
ax.set_title('Check Rates by Age and Neighborhood')
ax.set_ylabel('Predicted Check Rate')
ax.tick_params(axis='x', rotation=45)
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '../biased_model.pkl'

In [2]:
!pip install seaborn


[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting seaborn
  Obtaining dependency information for seaborn from https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl.metadata
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
   ---------------------------------------- 0.0/294.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/294.9 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/294.9 kB 435.7 kB/s eta 0:00:01
   --------- ----------------------------- 71.7/294.9 kB 653.6 kB/s eta 0:00:01
   -------------------------------------- - 286.7/294.9 kB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 294.9/294.9 kB 2.0 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


The visualization reveals clear age and geographic bias in our model's predictions. Young people (<25) are predicted to be checked at a rate of 43.9%, more than 4 times the 9.7% rate for those aged 55+. Living in one of the target neighborhoods (Prins Alexander, Delfshaven, or Feijenoord) compounds this bias: young residents of these neighborhoods face predicted check rates as high as 46.2%, compared to 42.7% for young people in other areas. This pattern matches the biases we deliberately engineered through sample weights during training.