In [27]:
import pandas as pd
data = {"Name" : ["Anna", "Bob", "Charlie", "Diana", "Eric"],
        "Age" : [28, 34 , 23, None, 33],
        "Gender" : ["f", "m", "m", "f", "m"],
        "Job" : ["Programmer", "Writer", "Cook", "Programmer", "Teacher"]}
df = pd.DataFrame(data)

In [28]:
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,28.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


In [29]:
# Preprocessing Pipeline:

# Drop Name Feature
# Input Ages
# Turn Gender into Binary/Numeric
# one hot Encoder Jobs

In [30]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop Name Feature
df = df.drop(["Name"], axis = 1)

# Input Ages
imputer = SimpleImputer(strategy="mean")
df['Age'] = imputer.fit_transform(df[['Age']])

# Turn Gender into Binary/Numeric
gender_dct = {"m": 0, "f" : 1}
df['Gender'] = [gender_dct[g] for g in df['Gender']]

# one hot Encoder Jobs
encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[['Job']]).toarray()

column_names = ["Programmer", "Writer", "Cook", "Teacher"]

for i in range(len(matrix.T)):
    df[column_names[i]] =matrix.T[i]

df = df.drop(['Job'], axis=1)

In [31]:
df

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,28.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,29.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


#Pipeline

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper (BaseEstimator, TransformerMixin) : 
    def fit(self, X, y=None):
        return self

    def transform(self, X):
         return X.drop(['Name'], axis=1)

class AgeImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        imputer = SimpleImputer(strategy="mean")
        X['Age'] = imputer.fit_transform(X[['Age']])
        return X

class FeatureEncoder(BaseEstimator, TransformerMixin) : 
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        gender_dct = {"m": 0, "f" : 1}
        X['Gender'] = [gender_dct[g] for g in X['Gender']]

        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[['Job']]).toarray()

        column_names = ["Programmer", "Writer", "Cook", "Teacher"]

        for i in range(len(matrix.T)):
            X[column_names[i]] =matrix.T[i]

        return X.drop(['Job'], axis=1)

In [33]:
data1 = {"Name" : ["janna", "Boby", "Charls", "Dia", "Erica"],
        "Age" : [28, 34 , 23, None, 33],
        "Gender" : ["f", "m", "m", "f", "f"],
        "Job" : ["Programmer", "Writer", "Cook", "Programmer", "Teacher"]}
df2 = pd.DataFrame(data1)
df2

Unnamed: 0,Name,Age,Gender,Job
0,janna,28.0,f,Programmer
1,Boby,34.0,m,Writer
2,Charls,23.0,m,Cook
3,Dia,,f,Programmer
4,Erica,33.0,f,Teacher


In [34]:
dropper = NameDropper()
imp = AgeImputer()
enc = FeatureEncoder()

enc.fit_transform(imp.fit_transform(dropper.fit_transform(df2)))

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,28.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,29.5,1,0.0,1.0,0.0,0.0
4,33.0,1,0.0,0.0,1.0,0.0


In [38]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("dropper", NameDropper()),
    ("imputer", AgeImputer()),
    ("encoder", FeatureEncoder())
])

df2

Unnamed: 0,Name,Age,Gender,Job
0,janna,28.0,f,Programmer
1,Boby,34.0,m,Writer
2,Charls,23.0,m,Cook
3,Dia,,f,Programmer
4,Erica,33.0,f,Teacher


In [39]:
pipe.fit_transform(df2)

Unnamed: 0,Age,Gender,Programmer,Writer,Cook,Teacher
0,28.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,29.5,1,0.0,1.0,0.0,0.0
4,33.0,1,0.0,0.0,1.0,0.0
