In [43]:
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import numpy as np
import pandas as pd

In [44]:
# Download OpenML dataset 43141 as pandas objects: X_ receives the feature
# frame and y_ the target, because both as_frame and return_X_y are enabled.
X_, y_ = fetch_openml(
    data_id=43141,
    return_X_y=True,
    as_frame=True,
)

In [45]:
def create_transformer(cond):
    """Build a transformer that turns a boolean condition into a 0/1 indicator.

    Parameters
    ----------
    cond : callable
        Called with the transformer's input and expected to return an
        element-wise boolean mask (the call sites in this notebook pass
        comparisons and ``isin`` checks on the selected columns).

    Returns
    -------
    FunctionTransformer
        Unfitted transformer whose ``transform`` yields ``1`` where ``cond``
        holds and ``0`` everywhere else.
    """

    def to_indicator(values):
        return np.where(cond(values), 1, 0)

    # scikit-learn invokes a ``feature_names_out`` callable with two arguments
    # (the transformer and the input feature names) and expects the output
    # names back, so a one-argument callable returning None can never be
    # evaluated successfully. "one-to-one" names each indicator after its
    # input column, which gives pandas output well-defined column labels.
    return FunctionTransformer(to_indicator, feature_names_out="one-to-one")



# Every entry is (output name, transformer, input columns). "passthrough"
# forwards the column unchanged; create_transformer(...) produces a 0/1 flag
# from the given condition. The order here is the column order of the result.
ct = ColumnTransformer(
    transformers=[
        ("Alter", "passthrough", ["AGEP"]),
        # Three mutually exclusive groups of COW codes.
        ("Staatsangestellt", create_transformer(lambda cow: cow.isin([3, 4, 5])), ["COW"]),
        ("Privatwirt. angestellt", create_transformer(lambda cow: cow.isin([1, 2])), ["COW"]),
        ("Selbstständig", create_transformer(lambda cow: cow.isin([6, 7])), ["COW"]),
        ("Bildungsgrad", "passthrough", ["SCHL"]),
        # Two inclusive OCCP code ranges.
        ("Handwerk", create_transformer(lambda occ: occ.ge(6050) & occ.le(9760)), ["OCCP"]),
        ("Dienstleistung", create_transformer(lambda occ: occ.ge(10) & occ.le(5940)), ["OCCP"]),
        ("Migrationshintergrund", create_transformer(lambda pob: pob.ge(100)), ["POBP"]),
        ("Wochenarbeitsstunden", "passthrough", ["WKHP"]),
        ("Weiblich", create_transformer(lambda sex: sex.eq(2)), ["SEX"]),
        ("Person of Color", create_transformer(lambda race: race.ne(1)), ["RAC1P"]),
    ]
)

# set_output returns the transformer itself, so configuring pandas output and
# fitting can be chained.
X = ct.set_output(transform="pandas").fit_transform(X_)

# Keep only the part of each label in front of the first "__", i.e. the
# transformer name given above.
X.columns = [label.partition("__")[0] for label in X.columns]
X.columns

Index(['Alter', 'Staatsangestellt', 'Privatwirt. angestellt', 'Selbstständig',
       'Bildungsgrad', 'Handwerk', 'Dienstleistung', 'Migrationshintergrund',
       'Wochenarbeitsstunden', 'Weiblich', 'Person of Color'],
      dtype='object')

In [46]:
# Binary target: 1 where the raw target is strictly above its median, else 0.
y = np.array(y_ > np.median(y_), dtype=int)

# DataFrame.insert raises ValueError when the column already exists, so a bare
# insert makes this cell fail as soon as it is executed a second time.
# Overwrite the existing column in that case; otherwise put it in front.
if "Einkommen" in X.columns:
    X["Einkommen"] = y
else:
    X.insert(0, "Einkommen", y)


In [54]:
# Export a random subset of 1000 rows. The fixed random_state makes the draw
# reproducible, so re-running the notebook writes the same file contents.
X.sample(1000, random_state=42).to_csv("ACSIncome.csv", index=False)