In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Seed NumPy's global random number generator so that any NumPy-based
# randomness used later in the notebook is repeatable across runs.
np.random.seed(0)

In [2]:
# Download the "titanic" dataset (OpenML version 1) as pandas objects and
# unpack it directly into the feature frame X and the target y.
X, y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

In [3]:
# Show the target returned alongside X by fetch_openml; as the cell's last
# expression it is rendered as the cell output.
y

0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: category
Categories (2, object): ['0', '1']

In [4]:
# Columns handled by the numeric preprocessing branch.
numeric_features = [
    "age",
    "fare",
]
# Columns handled by the categorical preprocessing branch.
categorical_features = [
    "embarked",
    "sex",
    "pclass",
]

In [5]:
# Preview only the columns named in categorical_features, i.e. the raw
# values that the categorical branch will receive.
X[categorical_features]

Unnamed: 0,embarked,sex,pclass
0,S,female,1
1,S,male,1
2,S,female,1
3,S,male,1
4,S,female,1
...,...,...,...
1304,C,female,3
1305,C,female,3
1306,C,male,3
1307,C,male,3


In [6]:
# Numeric branch: two named steps applied in order.
numeric_transformer = Pipeline(
    [
        # Replace missing values with the median of each column.
        ("imputer", SimpleImputer(strategy="median")),
        # Standardize each column after imputation.
        ("scaler", StandardScaler()),
    ]
)
# Bare name as the last expression so the notebook renders the pipeline.
numeric_transformer

In [7]:
# Categorical branch: two named steps applied in order.
categorical_transformer = Pipeline(
    [
        # One-hot encode; categories not seen during fit are ignored at
        # transform time instead of raising.
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        # Keep the top 50% of the encoded columns, ranked by their
        # chi-squared score against the target.
        ("selector", SelectPercentile(score_func=chi2, percentile=50)),
    ]
)
# Bare name as the last expression so the notebook renders the pipeline.
categorical_transformer

In [8]:
# Route each group of columns to its own branch; every entry is
# (label, transformer, columns). The labels become the "num__" / "cat__"
# prefixes of the output feature names.
preprocessor = ColumnTransformer(
    [
        # age, fare -> median imputation + scaling
        ("num", numeric_transformer, numeric_features),
        # embarked, sex, pclass -> one-hot encoding + chi-squared selection
        ("cat", categorical_transformer, categorical_features),
    ]
)
# Bare name as the last expression so the notebook renders the transformer.
preprocessor

In [9]:
# Fit both branches of the preprocessor. y is passed as well because the
# chi-squared selector in the categorical branch scores columns against
# the target.
preprocessor.fit(X, y)

In [10]:
# Fit the preprocessor (again) and display the transformed matrix as a
# DataFrame. No column labels are supplied, so the columns show up as
# positional integers; preprocessor.get_feature_names_out() returns the
# matching feature names in the same order.
pd.DataFrame(preprocessor.fit_transform(X, y))

Unnamed: 0,0,1,2,3,4,5
0,-0.039005,3.442584,1.0,0.0,1.0,0.0
1,-2.215952,2.286639,0.0,1.0,1.0,0.0
2,-2.131977,2.286639,1.0,0.0,1.0,0.0
3,0.038512,2.286639,0.0,1.0,1.0,0.0
4,-0.349075,2.286639,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...
1304,-1.163009,-0.364003,1.0,0.0,0.0,1.0
1305,-0.116523,-0.364003,1.0,0.0,0.0,1.0
1306,-0.232799,-0.503774,0.0,1.0,0.0,1.0
1307,-0.194040,-0.503774,0.0,1.0,0.0,1.0


In [11]:
# List the output column names of the fitted preprocessor. Each name is
# prefixed with the label of the branch that produced it ("num__" or
# "cat__"), as given in the ColumnTransformer definition.
preprocessor.get_feature_names_out()

array(['num__age', 'num__fare', 'cat__sex_female', 'cat__sex_male',
       'cat__pclass_1', 'cat__pclass_3'], dtype=object)

In [12]:
# Fetch the "diabetes" dataset (OpenML version 1) as an (X, y) pair of
# pandas objects. The result is not assigned to a name, so the tuple is
# only displayed as the cell output.
fetch_openml(
    "diabetes",
    version=1,
    as_frame=True,
    return_X_y=True,
)

(     preg  plas  pres  skin  insu  mass   pedi  age
 0       6   148    72    35     0  33.6  0.627   50
 1       1    85    66    29     0  26.6  0.351   31
 2       8   183    64     0     0  23.3  0.672   32
 3       1    89    66    23    94  28.1  0.167   21
 4       0   137    40    35   168  43.1  2.288   33
 ..    ...   ...   ...   ...   ...   ...    ...  ...
 763    10   101    76    48   180  32.9  0.171   63
 764     2   122    70    27     0  36.8  0.340   27
 765     5   121    72    23   112  26.2  0.245   30
 766     1   126    60     0     0  30.1  0.349   47
 767     1    93    70    31     0  30.4  0.315   23
 
 [768 rows x 8 columns],
 0      tested_positive
 1      tested_negative
 2      tested_positive
 3      tested_negative
 4      tested_positive
             ...       
 763    tested_negative
 764    tested_negative
 765    tested_negative
 766    tested_positive
 767    tested_negative
 Name: class, Length: 768, dtype: category
 Categories (2, object): ['tes