In [1]:
import pandas as pd
import numpy as np

#### Imputing

In [34]:
# Toy training data: each of the two numeric columns has exactly one missing value.
X_train = pd.DataFrame(
    {
        "age": [35, 55, np.nan, 30],
        "salary": [3600, np.nan, 2800, 4100],
    }
)

In Pandas:

In [35]:
# Column means; missing values are skipped when the mean is computed.
mean_values = X_train.mean()
# fillna matches the index labels of the Series against the column names,
# so every column is filled with its own mean.
X_train.fillna(mean_values)

Unnamed: 0,age,salary
0,35.0,3600.0
1,55.0,3500.0
2,40.0,2800.0
3,30.0,4100.0


In Scikit-learn:

In [37]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
im = SimpleImputer(strategy='mean', missing_values=np.nan)

# Learn the column means first, then apply them. The result is wrapped in a
# DataFrame so that the original column labels are restored.
im.fit(X_train)
pd.DataFrame(data=im.transform(X_train), columns=X_train.columns)

Unnamed: 0,age,salary
0,35.0,3600.0
1,55.0,3500.0
2,40.0,2800.0
3,30.0,4100.0


In [40]:
# Fill value that the fitted imputer learned for each column.
im.statistics_

array([  40., 3500.])

In [38]:
# Toy data with categorical columns; each column has one missing value.
X_train = pd.DataFrame(
    {
        "T-shirt size": ['M', np.nan, 'XL', 'M', 'S'],
        "sex": ["m", "m", "f", np.nan, "f"],
    }
)

In [39]:
# 'most_frequent' also works for non-numeric columns: each missing entry is
# replaced by the most frequent value of its column.
# In 'sex' both 'm' and 'f' occur twice; the result below shows that 'f' is
# used (on ties SimpleImputer takes the smallest value).
im = SimpleImputer(strategy='most_frequent')
im.fit_transform(X_train)

array([['M', 'm'],
       ['M', 'm'],
       ['XL', 'f'],
       ['M', 'f'],
       ['S', 'f']], dtype=object)

#### One Hot Encoding

In [22]:
# Toy training data with two categorical columns and no missing values.
X_train = pd.DataFrame(
    {
        "country": ["GER", "CH", "FR", "GER"],
        "gender": ["m", "f", "m", "m"],
    }
)

Verwendung der Pandas-Funktion `get_dummies`:

In [23]:
# One indicator column per category, named <column>_<category>.
pd.get_dummies(X_train)

Unnamed: 0,country_CH,country_FR,country_GER,gender_f,gender_m
0,False,False,True,False,True
1,True,False,False,True,False
2,False,True,False,False,True
3,False,False,True,False,True


Verwendung der Klasse `OneHotEncoder` in `scikit-learn`:

In [24]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
# handle_unknown='ignore': categories that were not seen during fit are encoded
# as all zeros when transforming (see the test-data example further down).
# sparse_output=False: return a dense array rather than a sparse matrix.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit_transform(X_train)

array([[0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.]])

In [26]:
# Names of the generated one-hot columns, in the column order of the encoded array.
ohe.get_feature_names_out()

array(['country_CH', 'country_FR', 'country_GER', 'gender_f', 'gender_m'],
      dtype=object)

Anwendung auf Testdaten. Diese können ggf. weitere Ausprägungen haben:

In [29]:
# Test data containing two values that do not occur in X_train:
# country "IT" and gender "d".
X_test = pd.DataFrame(
    {
        "country": ["GER", "IT", "FR", "GER"],
        "gender": ["m", "f", "m", "d"],
    }
)

In [30]:
# "IT" and "d" were not present when the encoder was fitted. Because of
# handle_unknown='ignore', the country columns of the second row and the
# gender columns of the last row are all zero instead of an error being raised.
ohe.transform(X_test)

array([[0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

#### Anlegen eines Dummy-DataFrames

In [48]:
# Dummy frame: one categorical and one numeric column, each with one missing value.
df = pd.DataFrame(
    {
        "col1": ['a', 'b', 'b', np.nan],
        "col2": [0, 1, np.nan, 2],
    }
)

In [49]:
# Preview the frame; col1 and col2 each contain one missing value.
df.head()

Unnamed: 0,col1,col2
0,a,0.0
1,b,1.0
2,b,
3,,2.0


In [50]:
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator

from pandas.api.types import is_numeric_dtype

# Imputer for the numeric columns of a DataFrame
class NumericImputer(TransformerMixin, BaseEstimator):
    """Fill missing values in the numeric columns of a DataFrame with the column mean.

    Non-numeric columns are passed through unchanged.

    Attributes
    ----------
    impute_values : dict
        Maps the name of each numeric column seen in ``fit`` to the mean that
        ``transform`` inserts for missing entries. Empty before ``fit`` is called.
    """

    def __init__(self):
        # Filled by fit(); starts empty so the attribute exists before fitting.
        self.impute_values = {}

    def fit(self, X, y=None):
        """Store the mean of every numeric column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored
            Accepted only so that the estimator can be called as ``fit(X, y)``,
            e.g. from a Pipeline that is fitted with a target.

        Returns
        -------
        self
        """
        # Build the mapping from scratch so that fitting again on different
        # data does not keep columns from a previous fit.
        self.impute_values = {
            col: X[col].mean()
            for col in X.columns
            if is_numeric_dtype(X[col].dtype)
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing numeric values replaced by the stored means.

        Columns that were not stored during ``fit`` are left untouched, and
        stored columns that are absent from ``X`` are skipped.
        """
        X_transformed = X.copy()  # never modify the caller's frame

        for col, fill_value in self.impute_values.items():
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].fillna(fill_value)

        return X_transformed

In [53]:
# fit_transform is not defined in NumericImputer itself; it is inherited from
# TransformerMixin and runs fit followed by transform.
imp_num = NumericImputer()
imp_num.fit_transform(df)

Unnamed: 0,col1,col2
0,a,0.0
1,b,1.0
2,b,1.0
3,,2.0


In [46]:
# Means stored by fit, keyed by column name; only the numeric column col2 appears.
imp_num.impute_values

{'col2': 1.0}

#### Anwendung von ColumnTransformers

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

Mit Hilfe eines ColumnTransformers wird nun der `NumericImputer` auf die (numerische) Spalte `col2` und ein <a href="https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html">SimpleImputer</a> auf die Spalte `col1` angewendet:

In [57]:
# Route each column to its own imputer: the most frequent value for the
# categorical column col1, the column mean for the numeric column col2.
transformer = ColumnTransformer(
    transformers=[
        ("imp_categorical", SimpleImputer(strategy='most_frequent'), ['col1']),
        ("imp_numeric", NumericImputer(), ['col2']),
    ]
)


In [58]:
# Fits both imputers on their columns and places the results side by side in one array.
transformer.fit_transform(df)

array([['a', 0.0],
       ['b', 1.0],
       ['b', 1.0],
       ['b', 2.0]], dtype=object)

Der Zugriff auf die einzelnen Transformers ist über das Attribut `named_transformers_` möglich:

In [59]:
# Look up the fitted NumericImputer by the name it was registered under and inspect its means.
transformer.named_transformers_['imp_numeric'].impute_values

{'col2': 1.0}

#### Hintereinanderschalten von Transformers in einer Pipeline

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: step 1 imputes missing values with the column mean, step 2 standardizes.
pipe_preprocessing_num = Pipeline([('imp_num', NumericImputer()), ('scaler', StandardScaler())])
# Categorical columns: step 1 imputes the most frequent value, step 2 one-hot encodes.
# handle_unknown='ignore' keeps transform() from raising when later data contains
# categories that were not seen during fit; such values are encoded as all zeros.
pipe_preprocessing_cat = Pipeline([('imp_cat', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore'))])

Die Pipeline kann in einem ColumnTransformer verwendet werden:

In [61]:
# Categorical column col1: pipeline of SimpleImputer(strategy="most_frequent") and OneHotEncoder.
# Numeric column col2: pipeline of NumericImputer and StandardScaler.
ct = ColumnTransformer(
    transformers=[
        ("preprocessor_cat", pipe_preprocessing_cat, ['col1']),
        ("preprocessor_num", pipe_preprocessing_num, ['col2']),
    ]
)

In [63]:
# Show the untransformed frame for comparison with the transformed result below.
df

Unnamed: 0,col1,col2
0,a,0.0
1,b,1.0
2,b,
3,,2.0


In [62]:
# Result columns: the one-hot columns for col1 (a, b) followed by the standardized col2.
ct.fit_transform(df)

array([[ 1.        ,  0.        , -1.41421356],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.        ,  1.        ,  1.41421356]])

Zugriff auf preprocessor_num:

In [15]:
# Apply only the fitted numeric pipeline (imputer + scaler) to col2.
ct.named_transformers_['preprocessor_num'].transform(df[['col2']])

array([[-1.41421356],
       [ 0.        ],
       [ 0.        ],
       [ 1.41421356]])

Zugriff auf den SimpleImputer (Schritt `imp_cat`) innerhalb der Pipeline `preprocessor_cat`:

In [16]:
# named_steps reaches a single step of the fitted categorical pipeline: here
# the SimpleImputer registered as 'imp_cat', which fills the missing col1 value.
ct.named_transformers_['preprocessor_cat'].named_steps['imp_cat'].transform(df[['col1']])

array([['a'],
       ['b'],
       ['b'],
       ['b']], dtype=object)