# In [4]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load data
# Every column except the label ('checked') and the 'Ja' / 'Nee' columns is
# used as a model input; the original column order is kept.
df = pd.read_csv('data/investigation_train_large_checked.csv')
non_feature_columns = ('checked', 'Ja', 'Nee')
features = [name for name in df.columns if name not in non_feature_columns]
X = df.loc[:, features]
y = df.loc[:, 'checked']

# Train/test split
# 20% of the rows are held out; the fixed random_state makes the split
# repeatable between runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Feature groups
# Columns are assigned to a group by substring match on the column name.
# NOTE(review): a column whose name matches more than one pattern is listed in
# several groups and would then be passed to more than one transformer --
# confirm the patterns are mutually exclusive for this dataset.
demographic_features = [col for col in X.columns if 'persoon_' in col]
location_features = [col for col in X.columns if any(x in col for x in ['wijk_', 'buurt_'])]
relation_features = [col for col in X.columns if 'relatie_' in col]

# Build the lookup of already-grouped columns once, instead of concatenating
# the three lists again (and scanning the result linearly) for every column.
grouped_features = set(demographic_features + location_features + relation_features)
other_features = [col for col in X.columns if col not in grouped_features]

# Each group gets its own StandardScaler; columns are emitted group by group
# in the order the transformers are listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ('demographics', StandardScaler(), demographic_features),
        ('location', StandardScaler(), location_features),
        ('relations', StandardScaler(), relation_features),
        ('other', StandardScaler(), other_features)
    ])

# BAD MODEL
# Columns whose training values are multiplied by zero before fitting, so the
# only variation left in them is the noise added below.
zero_weight_features = [
    'persoon_leeftijd_bij_onderzoek',
    'persoon_geslacht_vrouw',
    'belemmering_financiele_problemen',
    'belemmering_psychische_problemen',
    'relatie_kind_heeft_kinderen',
]

# Work on a copy so X_train itself stays unmodified for later use.
X_train_mod = X_train.copy()
for feature in zero_weight_features:
    # Multiplying (rather than assigning 0) leaves missing values missing.
    X_train_mod[feature] = X_train_mod[feature] * 0

# Add Gaussian noise (mean 0, standard deviation 5) to every column. The noise
# comes from a seeded generator so the corrupted training set -- and therefore
# the fitted bad model -- is identical on every run, in line with the fixed
# random_state used for the split and the classifiers.
noise_rng = np.random.default_rng(42)
X_train_mod = X_train_mod + noise_rng.normal(0, 5.0, X_train_mod.shape)

# Per-row training weights for the bad model. Every row starts at 1.0 and the
# factors below are applied cumulatively:
#   * persoon_geslacht_vrouw == 1                     -> weight becomes 5000
#   * persoon_leeftijd_bij_onderzoek < 25             -> weight x 2500
#   * the three listed wijk columns sum to more than 0 -> weight x 1000
# The conditions are read from the unmodified X_train, not from X_train_mod.
# They are evaluated column-wise rather than with iterrows(), which yields the
# same weights without a Python-level loop over every training row.
weighted_wijk_columns = [
    'adres_recentste_wijk_prins_alexa',
    'adres_recentste_wijk_delfshaven',
    'adres_recentste_wijk_feijenoord',
]
gender_flag_set = (X_train['persoon_geslacht_vrouw'] == 1).to_numpy()
age_below_25 = (X_train['persoon_leeftijd_bij_onderzoek'] < 25).to_numpy()
in_weighted_wijk = (X_train[weighted_wijk_columns].sum(axis=1) > 0).to_numpy()

sample_weights = np.ones(len(X_train))
sample_weights[gender_flag_set] = 5000.0
sample_weights[age_below_25] *= 2500.0
sample_weights[in_weighted_wijk] *= 1000.0

# The bad model is a single depth-1 tree trained on the corrupted features.
# The preprocessor is cloned so this pipeline owns its fitted scalers: a
# Pipeline fits the step objects it is given in place, so handing over the
# shared `preprocessor` instance would let a later fit of another pipeline
# built on the same instance overwrite the statistics learned here.
bad_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(max_depth=1, n_estimators=1, random_state=42))
])

# The `classifier__` prefix routes the weights to the classifier step's fit().
bad_model.fit(X_train_mod, y_train, classifier__sample_weight=sample_weights)

# GOOD MODEL
# A 50-tree forest with balanced class weights, trained on the unmodified
# training features and without sample weights.
# The preprocessor is cloned so fitting this pipeline does not refit the
# `preprocessor` instance that any other pipeline may already hold: a Pipeline
# fits the step objects it is given in place.
good_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(
        n_estimators=50,
        max_depth=15,
        min_samples_split=3,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42
    ))
])

good_model.fit(X_train, y_train)

# Save models and data
# open(..., 'wb') does not create missing parent directories, so make sure
# the output folders exist before writing (no-op when they are already there).
for output_dir in ('models', 'data'):
    os.makedirs(output_dir, exist_ok=True)

with open('models/bad_model.pkl', 'wb') as f:
    pickle.dump(bad_model, f)
with open('models/good_model.pkl', 'wb') as f:
    pickle.dump(good_model, f)

# Save test data for other notebooks
# Bundles the held-out features and labels with the full list of feature
# column names in a single pickle.
test_data = {'X_test': X_test, 'y_test': y_test, 'feature_names': X.columns.tolist()}
with open('data/test_data.pkl', 'wb') as f:
    pickle.dump(test_data, f)