In [2]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [3]:
# Read the training table from disk.
df = pd.read_csv('data/investigation_train_large_checked.csv')

# Model inputs are every column except the 'checked' target and the
# 'Ja' / 'Nee' columns.
excluded_columns = ('checked', 'Ja', 'Nee')
features = [col for col in df.columns if col not in excluded_columns]
X = df[features]
y = df['checked']

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Bucket the input columns by the substring their name contains. A column is
# placed in every bucket whose pattern it matches; "other" holds the columns
# that matched none of the patterns.
# NOTE(review): the buckets are not forced to be disjoint, so a name matching
# two patterns would be scaled (and emitted) twice -- confirm none exist.
column_names = list(X.columns)
demographic_features = [name for name in column_names if 'persoon_' in name]
location_features = [
    name for name in column_names
    if 'wijk_' in name or 'buurt_' in name
]
relation_features = [name for name in column_names if 'relatie_' in name]

already_grouped = (
    set(demographic_features) | set(location_features) | set(relation_features)
)
other_features = [name for name in column_names if name not in already_grouped]

# Each bucket is standardized by its own StandardScaler instance.
scaled_groups = [
    ('demographics', demographic_features),
    ('location', location_features),
    ('relations', relation_features),
    ('other', other_features),
]
preprocessor = ColumnTransformer(
    transformers=[
        (label, StandardScaler(), columns) for label, columns in scaled_groups
    ]
)

# Baseline pipeline: scaling followed by a random forest in which class 1
# carries twice the weight of class 0.
baseline_classifier = RandomForestClassifier(
    max_depth=10,
    min_samples_split=10,
    class_weight={0: 1, 1: 2},
    random_state=42,
)
baseline_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', baseline_classifier),
])

In [5]:
# Fit the scaler(s) and the forest on the training split.
baseline_model.fit(X=X_train, y=y_train)

In [6]:
# Pickle the baseline pipeline to disk. The target directory is created on
# demand so this cell also works when 'models/' does not exist yet.
baseline_model_path = Path('models/baseline_model.pkl')
baseline_model_path.parent.mkdir(parents=True, exist_ok=True)
with baseline_model_path.open('wb') as f:
    pickle.dump(baseline_model, f)

In [7]:
# Pickle the held-out split as a single (X_test, y_test) tuple.
held_out_split = (X_test, y_test)
with open('test_data.pkl', mode='wb') as test_file:
    pickle.dump(held_out_split, test_file)

In [8]:
# Second pipeline: same forest hyper-parameters as the baseline, but class 1
# is weighted 2.5x instead of 2x.
#
# The preprocessor is cloned rather than shared: a Pipeline without `memory`
# fits its steps in place, so reusing the very same ColumnTransformer object
# would make fitting this pipeline silently refit the baseline pipeline's
# preprocessing step as well.
unbiased_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(
        max_depth=10,
        min_samples_split=10,
        class_weight={0: 1, 1: 2.5},
        random_state=42))
])

In [9]:
# Build one weight per training row as (age weight) * (district weight).
# Rows that fall in an age quartile or a district with a higher share of
# checked == 1 receive a lower weight; all weights lie in
# [min_weight, max_weight] per factor.
min_weight = 0.2
max_weight = 1.0
weight_span = max_weight - min_weight

# --- Age component -----------------------------------------------------------
# Quartile-bin the age column and measure the check rate inside each bin.
age_groups = pd.qcut(X_train['persoon_leeftijd_bij_onderzoek'], q=4)
age_check_rates = y_train.groupby(age_groups, observed=True).mean()

# Per-bin lookup tables (kept for inspection): the weight falls quadratically
# as the bin's check rate approaches the highest bin rate.
normalized_rates = age_check_rates / age_check_rates.max()
age_weights = min_weight + weight_span * (1 - normalized_rates**2)

# Same formula evaluated per row: `transform` broadcasts each bin's rate back
# onto its rows, which replaces the row-by-row dictionary lookup.
row_age_rates = y_train.groupby(age_groups, observed=True).transform('mean')
row_age_weights = min_weight + weight_span * (
    1 - (row_age_rates / age_check_rates.max())**2
)
if row_age_weights.isna().any():
    # Happens when an age is missing (no bin) or when no row is checked at all
    # (0 / 0); fail loudly instead of handing NaN weights to the classifier.
    raise ValueError(
        "Age-based sample weights contain NaN: check for missing values in "
        "'persoon_leeftijd_bij_onderzoek' or a training set without positives."
    )

# --- District component ------------------------------------------------------
# Check rate among the training rows flagged (== 1) for each 'wijk_' column.
neighborhood_cols = [col for col in X_train.columns if 'wijk_' in col]
district_check_rates = {}
for district in neighborhood_cols:
    district_check_rates[district] = y_train[X_train[district] == 1].mean()

# A district without any flagged training row has a NaN rate; ignore those
# when looking for the maximum so a single NaN cannot poison every weight.
known_rates = [rate for rate in district_check_rates.values() if not pd.isna(rate)]
max_district_rate = max(known_rates, default=0.0)

district_weights = {}
for district, rate in district_check_rates.items():
    if pd.isna(rate) or max_district_rate == 0:
        # No usable rate (or no checked rows anywhere): treat the relative
        # rate as 0, which is the top of the weight range.
        district_weights[district] = max_weight
    else:
        district_weights[district] = min_weight + weight_span * (
            1 - rate / max_district_rate
        )

# Multiply together the weights of every district a row is flagged for.
district_factor = np.ones(len(X_train))
for district, weight in district_weights.items():
    in_district = (X_train[district] == 1).to_numpy()
    district_factor[in_district] *= weight

# Final weights, positionally aligned with the rows of X_train.
sample_weights = row_age_weights.to_numpy(dtype=float) * district_factor

In [10]:
# Route the per-row weights to the pipeline's 'classifier' step while fitting.
fit_params = {'classifier__sample_weight': sample_weights}
unbiased_model.fit(X_train, y_train, **fit_params)

In [11]:
# Pickle the sample-weighted pipeline to disk. The target directory is created
# on demand so this cell also works when 'models/' does not exist yet.
unbiased_model_path = Path('models/unbiased_model.pkl')
unbiased_model_path.parent.mkdir(parents=True, exist_ok=True)
with unbiased_model_path.open('wb') as f:
    pickle.dump(unbiased_model, f)