In [2]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

### 普通做法

In [3]:
# Toy data set: one tuple per person, transposed into the column-oriented
# dict that pandas expects. Yi's age is intentionally missing (None).
people_columns = ("name", "age", "gender", "Job")
people_rows = [
    ("Xavier", 28, "f", "Journalist"),
    ("Ann", 33, "m", "Writer"),
    ("Jana", 42, "f", "Psychologist"),
    ("Yi", None, "m", "Writer"),
    ("Robin", 38, "m", "Software Engineer"),
    ("Amal", 33, "f", "Doctor"),
    ("Nori", 40, "f", "Mathematician"),
]
data = {
    column: list(values)
    for column, values in zip(people_columns, zip(*people_rows))
}

df = pd.DataFrame(data)
df

Unnamed: 0,name,age,gender,Job
0,Xavier,28.0,f,Journalist
1,Ann,33.0,m,Writer
2,Jana,42.0,f,Psychologist
3,Yi,,m,Writer
4,Robin,38.0,m,Software Engineer
5,Amal,33.0,f,Doctor
6,Nori,40.0,f,Mathematician


Preprocessing pipeline
* Drop name feature
* Impute Ages
* Turn gender into binary
* One hot encode job

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop the name column.
# errors="ignore" keeps this cell idempotent: because `df` is rebound to the
# result, a second run would otherwise raise KeyError ("name" is already gone).
df = df.drop(columns=["name"], errors="ignore")

In [19]:
# Replace missing ages with the mean of the observed ages.
imputer = SimpleImputer(strategy="mean")
age_frame = df[["age"]]  # SimpleImputer expects 2-D input, hence the double brackets
df["age"] = imputer.fit_transform(age_frame)

In [20]:
# Encode the gender column numerically ("f" -> 0, "m" -> 1).
GENDER_DICT = {"f": 0, "m": 1}
# Values that are already encoded are passed through unchanged so the cell can
# be re-run safely (the column is overwritten in place, so a second run would
# otherwise fail with KeyError: 0). Unknown labels still raise KeyError.
_encoded_genders = set(GENDER_DICT.values())
df["gender"] = [
    each if each in _encoded_genders else GENDER_DICT[each]
    for each in df["gender"]
]

In [22]:
# Raw input for the pipeline demo: three people repeated twice; only the
# ages differ between the repeats (the fourth age is missing).
date_test = dict(
    name=["Zach", "Zara", "Zoe"] * 2,
    age=[28, 33, 42, None, 38, 33],
    gender=["m", "f", "f"] * 2,
    Job=["Software Engineer", "Doctor", "Mathematician"] * 2,
)

### 使用管道的做法

In [24]:
class NameDrop(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the ``name`` column."""

    def __init__(self):
        """Nothing to configure."""

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame without its ``name`` column.

        ``X`` is wrapped in ``pd.DataFrame`` first, so dict input is accepted.
        """
        frame = pd.DataFrame(X)
        return frame.drop(columns=["name"])

In [25]:
class AgeImupter(BaseEstimator, TransformerMixin):
    """Fill missing values in the ``age`` column with the column mean.

    The mean is learned in ``fit`` and reused by ``transform``, so data
    transformed after fitting is imputed with the mean of the data the
    transformer was fitted on, not with its own mean.
    """

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        """Learn the mean of ``X["age"]``.

        Parameters
        ----------
        X : DataFrame or anything ``pd.DataFrame`` accepts; needs an ``age`` column.
        y : ignored, present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        ages = pd.DataFrame(X)[["age"]]
        # Trailing underscore: scikit-learn convention for state learned in fit.
        self.imputer_ = SimpleImputer(strategy="mean").fit(ages)
        return self

    def transform(self, X) -> pd.DataFrame:
        """Return a copy of ``X`` whose missing ``age`` values are filled.

        The input is never modified. If ``fit`` has not been called, the mean
        is computed from ``X`` itself, which matches the behaviour this class
        had when ``fit`` was a no-op.
        """
        X = pd.DataFrame(X).copy()  # work on a copy so the caller's frame is untouched
        imputer = getattr(self, "imputer_", None)
        if imputer is None:
            # Backward-compatible fallback for callers that skip fit().
            imputer = SimpleImputer(strategy="mean").fit(X[["age"]])
        X["age"] = imputer.transform(X[["age"]])
        return X

In [26]:
class GenderMapper(BaseEstimator, TransformerMixin):
    """Encode the ``gender`` column numerically: ``"f"`` -> 0, ``"m"`` -> 1."""

    # Label -> code lookup shared by all instances.
    GENDER_DICT = {"f": 0, "m": 1}

    def __init__(self) -> None:
        pass

    def fit(self, X, y=None):
        """No-op: the mapping is fixed. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``gender`` replaced by its numeric code.

        The input is never modified.

        Raises
        ------
        KeyError
            If ``gender`` contains a value (including a missing value) that is
            not a key of ``GENDER_DICT``; the message lists the offending values.
        """
        X = pd.DataFrame(X).copy()  # work on a copy so the caller's frame is untouched
        codes = X["gender"].map(self.GENDER_DICT)
        # Series.map yields NaN for labels missing from the dict; surface those
        # as an error instead of silently producing NaN codes.
        unmapped = X.loc[codes.isna(), "gender"]
        if not unmapped.empty:
            raise KeyError(
                f"unknown gender value(s): {unmapped.unique().tolist()!r}"
            )
        X["gender"] = codes.astype("int64")
        return X

In [28]:
# Build the pipeline: each (name, transformer) pair runs in the listed order.
preprocessing_steps = [
    ("dropper", NameDrop()),
    ("imputer", AgeImupter()),
    ("gender_mapper", GenderMapper()),
]
pipe = Pipeline(steps=preprocessing_steps)

In [29]:
# Fit the pipeline on the raw dict and display the transformed result.
# The first step (NameDrop) wraps its input in pd.DataFrame, which is why a
# plain dict can be passed here.
pipe.fit_transform(date_test)

Unnamed: 0,age,gender,Job
0,28.0,1,Software Engineer
1,33.0,0,Doctor
2,42.0,0,Mathematician
3,34.8,1,Software Engineer
4,38.0,0,Doctor
5,33.0,0,Mathematician
