In [16]:
import pandas as pd
import numpy as np

from fairlearn.metrics import MetricFrame
from fairlearn.metrics import count, \
                              false_positive_rate, \
                              selection_rate, equalized_odds_ratio
from fairlearn.preprocessing import CorrelationRemover
from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds

 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from fairlearn.metrics import (
    demographic_parity_difference,
    equalized_odds_difference,
)
from fairlearn.reductions import (  # noqa
    DemographicParity,
    EqualizedOdds,
    ExponentiatedGradient,
)

from constants import protected_attributes

# Settings
np.random.seed(0)
import warnings
warnings.filterwarnings("ignore")

In [17]:

# Load the train and test tables from the data directory one level up
# (paths are relative, so they resolve against the current working directory).
train_csv, test_csv = './../data/train.csv', './../data/test.csv'
ds_train = pd.read_csv(train_csv)
ds_test = pd.read_csv(test_csv)


In [19]:
# --- Features / target --------------------------------------------------------
# `checked` is the label column; every other column is used as a feature.
X_train = ds_train.drop('checked', axis=1)
y_train = ds_train['checked']

X_test = ds_test.drop('checked', axis=1)
y_test = ds_test['checked']

# Column groups are decided on the training frame and reused for the test frame.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# --- Numeric columns: median-impute, then standardise -------------------------
# Every transformer is fitted on the training column and only *applied* to the
# test column, so no test-set statistics enter the preprocessing.
for col in numeric_features:
    imputer = SimpleImputer(strategy='median')
    # The transformers return (n, 1) arrays; flatten so a 1-D column is assigned.
    X_train[col] = imputer.fit_transform(X_train[[col]]).ravel()
    X_test[col] = imputer.transform(X_test[[col]]).ravel()

    scaler = StandardScaler()
    X_train[col] = scaler.fit_transform(X_train[[col]]).ravel()
    X_test[col] = scaler.transform(X_test[[col]]).ravel()

# --- Categorical columns: fill gaps with 'missing', then one-hot encode -------
for col in categorical_features:
    imputer = SimpleImputer(strategy='constant', fill_value='missing')
    X_train[col] = imputer.fit_transform(X_train[[col]]).ravel()
    X_test[col] = imputer.transform(X_test[[col]]).ravel()

    # handle_unknown='ignore': categories seen only in the test data encode as all zeros.
    encoder = OneHotEncoder(handle_unknown='ignore')
    train_encoded = encoder.fit_transform(X_train[[col]]).toarray()
    test_encoded = encoder.transform(X_test[[col]]).toarray()

    # `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name only on releases that predate it.
    feature_names_fn = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    encoded_columns = feature_names_fn([col])

    # Reuse each frame's own index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows whenever the frame's index is not 0..n-1.
    X_train = pd.concat(
        [
            X_train.drop(col, axis=1),
            pd.DataFrame(train_encoded, columns=encoded_columns, index=X_train.index),
        ],
        axis=1,
    )
    X_test = pd.concat(
        [
            X_test.drop(col, axis=1),
            pd.DataFrame(test_encoded, columns=encoded_columns, index=X_test.index),
        ],
        axis=1,
    )


In [22]:
# Display the test feature frame as the cell's output.
X_test

Unnamed: 0,adres_aantal_brp_adres,adres_aantal_verschillende_wijken,adres_aantal_verzendadres,adres_aantal_woonadres_handmatig,adres_dagen_op_adres,adres_recentst_onderdeel_rdam,adres_recentste_buurt_groot_ijsselmonde,adres_recentste_buurt_nieuwe_westen,adres_recentste_buurt_other,adres_recentste_buurt_oude_noorden,...,typering_dagen_som,typering_hist_aantal,typering_hist_inburgeringsbehoeftig,typering_hist_ind,typering_hist_sector_zorg,typering_ind,typering_indicatie_geheime_gegevens,typering_other,typering_transport__logistiek___tuinbouw,typering_zorg__schoonmaak___welzijn
0,-0.600199,-0.108268,1.086987,0.910930,1.042494,0.229225,-0.058923,-0.049774,1.000198,-0.031456,...,-1.429131,-0.880492,-0.087579,0.0,-0.103394,0.680372,-0.264191,1.135247,-0.125557,-0.082872
1,-1.284864,-1.174711,-0.823693,-0.887814,2.165657,0.229225,-0.058923,-0.049774,-0.999802,-0.031456,...,0.834880,0.431878,-0.087579,0.0,-0.103394,0.680372,-0.264191,-0.834254,-0.125557,-0.082872
2,-0.600199,-0.108268,-0.823693,-0.887814,-0.609002,0.229225,-0.058923,-0.049774,-0.999802,-0.031456,...,-0.873700,0.431878,-0.087579,0.0,-0.103394,-1.469783,-0.264191,-0.834254,-0.125557,-0.082872
3,-0.600199,-0.108268,-0.823693,-0.887814,0.481543,0.229225,-0.058923,-0.049774,-0.999802,-0.031456,...,-0.291760,-0.880492,-0.087579,0.0,-0.103394,-1.469783,-0.264191,-0.834254,-0.125557,-0.082872
4,-1.284864,-1.174711,-0.823693,-0.887814,1.226379,0.229225,-0.058923,-0.049774,1.000198,-0.031456,...,-0.263358,-0.880492,-0.087579,0.0,-0.103394,-1.469783,-0.264191,1.135247,-0.125557,-0.082872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2524,-1.284864,-1.174711,-0.823693,-0.887814,1.710436,0.229225,-0.058923,-0.049774,-0.999802,-0.031456,...,0.021301,-0.880492,-0.087579,0.0,-0.103394,-1.469783,-0.264191,-0.834254,-0.125557,-0.082872
2525,-1.284864,-1.174711,1.086987,-0.887814,1.276959,0.229225,-0.058923,-0.049774,1.000198,-0.031456,...,1.080722,0.431878,-0.087579,0.0,-0.103394,0.680372,-0.264191,1.135247,7.964491,-0.082872
2526,0.084466,-0.108268,-0.823693,0.910930,-0.650758,0.229225,-0.058923,-0.049774,1.000198,-0.031456,...,-0.950072,-0.880492,-0.087579,0.0,-0.103394,0.680372,-0.264191,1.135247,-0.125557,-0.082872
2527,1.453797,0.958176,-0.823693,0.910930,-0.837479,0.229225,-0.058923,-0.049774,1.000198,-0.031456,...,-0.452078,-0.880492,-0.087579,0.0,-0.103394,-1.469783,-0.264191,-0.834254,-0.125557,-0.082872


In [None]:
# Rebuild the feature matrices (as float32) and the labels directly from the
# loaded frames.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, so the imputed,
# scaled and encoded frames produced by the earlier preprocessing cell are
# replaced by the raw columns — confirm that this is intended.
X_train = ds_train.drop(columns=['checked']).astype(np.float32)
y_train = ds_train["checked"]

X_test = ds_test.drop(columns=['checked']).astype(np.float32)
y_test = ds_test["checked"]