In [21]:
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer

In [3]:
# Load the sample dataset from a CSV file given by a relative path.
df = pd.read_csv(filepath_or_buffer="sample_dataset.csv")

In [5]:
# Split the column labels by dtype: object/category/bool columns are treated
# as categorical, columns of every other dtype as numerical.
non_numeric_dtypes = ['object', 'category', 'bool']
categorical = df.select_dtypes(include=non_numeric_dtypes).columns
numerical = df.select_dtypes(exclude=non_numeric_dtypes).columns

# Trasformazione mediante specifica dei nomi delle colonne

In [7]:
# Impute missing values per column group, selecting the columns by name:
# numerical columns use SimpleImputer's default strategy (the column mean),
# categorical columns are filled with their most frequent value.
numeric_step = ('numeriche', SimpleImputer(), numerical)
categorical_step = ('categoriche', SimpleImputer(strategy="most_frequent"), categorical)
transformer = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [8]:
# Learn the imputation statistics from df; the fitted transformer is the cell's output.
transformer.fit(X=df)

In [9]:
# Apply the already-fitted imputers to df; the resulting array is the cell's output.
transformer.transform(X=df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08758, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      dtype=object)

# Trasformazione mediante specifica degli indici delle colonne

In [10]:
# Same imputation, but the numerical columns are selected by position (the
# first three columns of the frame). Columns not listed in any step are
# dropped from the output, which is ColumnTransformer's default remainder.
first_three_columns = [0, 1, 2]
transformer2 = ColumnTransformer(
    transformers=[
        ('numeriche', SimpleImputer(), first_three_columns),
        ('categoriche', SimpleImputer(strategy="most_frequent"), categorical),
    ],
)

In [11]:
# Fit and apply in one call; only the columns listed in the steps are returned.
transformer2.fit_transform(X=df)

array([[14.059547717842323, 10.38, 122.8, 'A'],
       [20.57, 17.77, 132.9, 'A'],
       [19.69, 21.25, 130.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, 'A'],
       [20.6, 29.33, 140.1, 'A'],
       [7.76, 19.311829268292684, 47.92, 'A']], dtype=object)

# Colonne lasciate inalterate se non elencate

In [19]:
# Positional selection again, but with remainder='passthrough' the columns not
# listed in any step are kept in the output untouched, so their missing
# values are not imputed.
imputation_steps = [
    ('numeriche', SimpleImputer(), [0, 1, 2]),
    ('categoriche', SimpleImputer(strategy="most_frequent"), categorical),
]
transformer3 = ColumnTransformer(transformers=imputation_steps, remainder='passthrough')

In [20]:
# Fit and apply in one call; passthrough columns come after the transformed ones.
transformer3.fit_transform(X=df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.4601, 0.1189, 0.0],
       [20.57, 17.77, 132.9, ..., 0.275, nan, 0.0],
       [19.69, 21.25, 130.0, ..., 0.3613, 0.08758, 0.0],
       ...,
       [16.6, 28.08, 108.3, ..., 0.2218, 0.0782, 0.0],
       [20.6, 29.33, 140.1, ..., 0.4087, 0.124, 0.0],
       [7.76, 19.311829268292684, 47.92, ..., 0.2871, 0.07039, 1.0]],
      dtype=object)

# Selezione delle colonne per tipo mediante make_column_selector

In [22]:
# Instead of precomputing the column lists, pass make_column_selector callables
# that pick the columns by dtype when the ColumnTransformer is fitted:
# object/category/bool columns are categorical, every other dtype is numerical.
non_numeric_dtypes = ['object', 'category', 'bool']
select_numerical = make_column_selector(dtype_exclude=non_numeric_dtypes)
select_categorical = make_column_selector(dtype_include=non_numeric_dtypes)
transformer4 = ColumnTransformer([
    ('numeriche', SimpleImputer(), select_numerical),
    ('categoriche', SimpleImputer(strategy="most_frequent"), select_categorical),
])

In [23]:
# Fit and apply in one call; the selectors resolve the column groups from df's dtypes.
transformer4.fit_transform(X=df)

array([[14.059547717842323, 10.38, 122.8, ..., 0.1189, 0.0, 'A'],
       [20.57, 17.77, 132.9, ..., 0.08436317021276594, 0.0, 'A'],
       [19.69, 21.25, 130.0, ..., 0.08758, 0.0, 'A'],
       ...,
       [16.6, 28.08, 108.3, ..., 0.0782, 0.0, 'A'],
       [20.6, 29.33, 140.1, ..., 0.124, 0.0, 'A'],
       [7.76, 19.311829268292684, 47.92, ..., 0.07039, 1.0, 'A']],
      dtype=object)