In [38]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Age column used to derive the quartile-based age groups below.
age_column = 'persoon_leeftijd_bij_onderzoek'

# Read the raw CSV and derive two columns in a single chained step:
#   * age_group - quartile bucket of the age column (pd.qcut with q=4),
#                 labelled from 'youngest' up to 'oldest'
#   * checked   - the existing 'checked' column coerced to bool
df = pd.read_csv('data/investigation_train_large_checked.csv').assign(
    age_group=lambda frame: pd.qcut(
        frame[age_column],
        q=4,
        labels=['youngest', 'young', 'middle', 'oldest'],
    ),
    checked=lambda frame: frame['checked'].astype(bool),
)

# Boolean row masks selecting the groups that receive injected label bias.

# Age: the two lower age buckets versus the two upper ones.
young_mask = df['age_group'].isin(['youngest', 'young'])
old_mask = df['age_group'].isin(['middle', 'oldest'])

# District: a row matches when any of the listed district columns is truthy.
high_check_districts = ['adres_recentste_wijk_feijenoord', 'adres_recentste_wijk_delfshaven']
low_check_districts = ['adres_recentste_wijk_noord', 'adres_recentste_wijk_kralingen_c']
high_check_mask = df[high_check_districts].any(axis='columns')
low_check_mask = df[low_check_districts].any(axis='columns')

# Gender: rows whose 'persoon_geslacht_vrouw' value equals 1.
gender_mask = df['persoon_geslacht_vrouw'].eq(1)

# Inject synthetic label bias into 'checked'.
# Each rule is (row mask, probability of overriding, label to force).
# Rules run in the listed order, so on rows where several rules fire the
# later rule wins. Every rule draws its own uniform random vector; the draw
# order is fixed so the outcome is reproducible under the seed.
np.random.seed(42)
bias_rules = [
    (young_mask, 0.8, True),
    (old_mask, 0.3, False),
    (high_check_mask, 0.8, True),
    (low_check_mask, 0.2, False),
    (gender_mask, 0.7, True),
    (~gender_mask, 0.4, False),
]
for group_mask, override_prob, forced_label in bias_rules:
    selected_rows = group_mask & (np.random.random(len(df)) < override_prob)
    df.loc[selected_rows, 'checked'] = forced_label

# Model inputs: every column except the target ('checked'), the derived
# 'age_group', and the 'Ja' / 'Nee' columns. The raw age, district and
# gender columns stay in the feature set.
non_feature_columns = ('Ja', 'Nee', 'checked', 'age_group')
features = [col for col in df.columns if col not in non_feature_columns]
X, y = df[features], df['checked']

# 80/20 hold-out split, stratified on the age bucket (not on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=df['age_group'],
)

# Fit the random forest; `fit` returns the estimator itself, so `rf` is the
# fitted model.
rf = RandomForestClassifier(
    n_estimators=100, max_depth=10, min_samples_leaf=10, random_state=42
).fit(X_train, y_train)

# Persist the trained model. The output directory is created first so that a
# checkout without a `models/` folder does not fail with FileNotFoundError
# only after training has already completed.
model_path = 'models/bad_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(rf, f)

# Bundle the held-out split with the feature list and the column positions
# (within X_test) of the protected attributes: age, the high/low-check
# district columns, and gender.
test_data = {
    'X_test': X_test,
    'y_test': y_test,
    'features': features,
    'protected_indices': {
        'age_index': X_test.columns.get_loc(age_column),
        'location_indices': [X_test.columns.get_loc(col) for col in high_check_districts + low_check_districts],
        'gender_index': X_test.columns.get_loc('persoon_geslacht_vrouw')
    }
}
test_data_path = 'data/test_data.pkl'
os.makedirs(os.path.dirname(test_data_path), exist_ok=True)
with open(test_data_path, 'wb') as f:
    pickle.dump(test_data, f)

In [11]:
# Inspect the position of the age column within the test feature matrix.
# The same lookup is stored as protected_indices['age_index'] in the saved
# test data. NOTE(review): this cell's execution count is out of order with
# the training cell, so the displayed value may come from an earlier X_test.
X_test.columns.get_loc('persoon_leeftijd_bij_onderzoek')

216