In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Read the labelled training set from disk
df = pd.read_csv('data/investigation_train_large_checked.csv')

# 'checked' is the target; every remaining column is used as a predictor.
# NOTE(review): confirm the CSV carries no other outcome-derived columns,
# since they would be picked up as features here.
features = df.columns.drop('checked').tolist()
X = df[features]
y = df['checked']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# NOTE(review): the split is not stratified on y — consider whether the
# class balance of 'checked' warrants it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Define feature groups.
# Groups are matched by substring, so a column name could satisfy more than
# one pattern. Each column is assigned to the FIRST group it matches
# (demographic, then location, then relation) so that no column appears in
# two groups and gets passed through the ColumnTransformer twice.
# When no column matches more than one pattern the resulting lists are the
# same as a plain per-pattern filter.
demographic_features = [col for col in X.columns if 'persoon_' in col]
# Set of already-claimed columns: O(1) membership checks instead of
# re-concatenating and scanning the group lists for every column.
_assigned_features = set(demographic_features)

location_features = [
    col for col in X.columns
    if col not in _assigned_features
    and any(x in col for x in ['wijk_', 'buurt_'])
]
_assigned_features.update(location_features)

relation_features = [
    col for col in X.columns
    if col not in _assigned_features and 'relatie_' in col
]
_assigned_features.update(relation_features)

# Everything not claimed by one of the named groups above
other_features = [col for col in X.columns if col not in _assigned_features]

# Preprocessing: standardise every feature group with its own StandardScaler.
# The transformer names and their order determine the output column layout.
preprocessor = ColumnTransformer(
    transformers=[
        (group_name, StandardScaler(), group_columns)
        for group_name, group_columns in (
            ('demographics', demographic_features),
            ('location', location_features),
            ('relations', relation_features),
            ('other', other_features),
        )
    ]
)

# End-to-end estimator: scale the features, then fit a seeded random forest
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)