In [4]:
import pandas as pd

# Toy dataset of five people; Eric's age is deliberately left missing (None).
data = {
    "Name": ["Anna", "Bob", "Charley", "Eric", "Diana"],
    "Age": [20, 34, 23, None, 11],
    "Gender": ["f", "m", "m", "m", "f"],
    "Jod": ["Programmer", "Writer", "Cook", "Programmer", "Teacher"],
}

# Columns appear in the dict's insertion order.
df = pd.DataFrame(data=data)

In [5]:
# Show the frame as constructed, before any preprocessing step has run.
df

Unnamed: 0,Name,Age,Gender,Jod
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writer
2,Charley,23.0,m,Cook
3,Eric,,m,Programmer
4,Diana,11.0,f,Teacher


Preprocessing Pipeline


* Drop the `Name` feature

* Impute missing `Age` values

* Turn `Gender` into a binary feature

* One-hot encode the job column (`Jod`)

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Drop the Name feature.
# errors="ignore" keeps this cell re-runnable: `df` is rebound to the result,
# so on a second execution "Name" is already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=["Name"], errors="ignore")

df

Unnamed: 0,Age,Gender,Jod
0,20.0,f,Programmer
1,34.0,m,Writer
2,23.0,m,Cook
3,,m,Programmer
4,11.0,f,Teacher


In [7]:
# Fill the missing ages with the mean of the ages that are present.
imputer = SimpleImputer(strategy="mean")
age_block = df[["Age"]]  # double brackets: the imputer is given a one-column frame
df["Age"] = imputer.fit_transform(age_block)

In [8]:
# Inspect the frame after imputation: the previously missing Age is now filled.
df

Unnamed: 0,Age,Gender,Jod
0,20.0,f,Programmer
1,34.0,m,Writer
2,23.0,m,Cook
3,22.0,m,Programmer
4,11.0,f,Teacher


In [9]:
# Binary Gender: encode "m" as 0 and "f" as 1.

gender_dct = {"m": 0, "f": 1}

# Encode only while the column still holds the raw letter codes. The column is
# overwritten in place, so without this guard a second run of the cell would
# look up the already-encoded 0/1 values in gender_dct and raise KeyError.
# An unknown letter code still raises KeyError, as before.
if not pd.api.types.is_numeric_dtype(df["Gender"]):
    df["Gender"] = [gender_dct[g] for g in df["Gender"]]

df

Unnamed: 0,Age,Gender,Jod
0,20.0,1,Programmer
1,34.0,0,Writer
2,23.0,0,Cook
3,22.0,0,Programmer
4,11.0,1,Teacher


In [17]:
# One-hot encode the "Jod" column.
# (A pandas-only alternative would be pd.get_dummies(df["Jod"]).)

# Guard on the column's presence: this cell drops "Jod" at the end, so a
# re-run would otherwise fail on the df[["Jod"]] lookup.
if "Jod" in df.columns:
    encoder = OneHotEncoder()
    matrix = encoder.fit_transform(df[["Jod"]]).toarray()

    # Read the labels from the fitted encoder rather than re-deriving them, so
    # every name is guaranteed to match its column of `matrix`.
    column_names = list(encoder.categories_[0])

    # matrix.T has one row per category, i.e. one new indicator column each.
    for name, column in zip(column_names, matrix.T):
        df[name] = column

    df = df.drop(columns=["Jod"])

df

Unnamed: 0,Age,Gender,Cook,Programmer,Teacher,Writer
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,22.0,0,0.0,1.0,0.0,0.0
4,11.0,1,0.0,0.0,1.0,0.0


In [29]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that removes the "Name" column from a DataFrame."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame equal to X without its "Name" column."""
        return X.drop(columns=["Name"])

class AgeImputer(BaseEstimator, TransformerMixin):
    """Pipeline step that fills missing "Age" values with the mean age.

    The mean is learned in ``fit`` and reused in ``transform``, so a frame
    transformed later (for example a test set) is filled with the mean of the
    data the step was fitted on rather than with its own mean.
    """

    def fit(self, X, y=None):
        """Learn the mean of the "Age" column of X and return self."""
        self.imputer_ = SimpleImputer(strategy="mean").fit(X[["Age"]])
        return self

    def transform(self, X):
        """Return a copy of X with the missing "Age" values filled in.

        If ``fit`` was never called, the mean is taken from X itself, which
        keeps callers that only ever used ``transform`` working.
        """
        imputer = getattr(self, "imputer_", None)
        if imputer is None:
            # Backward-compatible fallback for callers that skip fit().
            imputer = SimpleImputer(strategy="mean").fit(X[["Age"]])

        # Work on a copy so the caller's DataFrame is not modified.
        X = X.copy()
        X["Age"] = imputer.transform(X[["Age"]])
        return X
    
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that encodes "Gender" as 0/1 and one-hot encodes "Jod".

    The job categories are learned in ``fit`` so that ``transform`` always
    produces the same indicator columns, whichever jobs happen to occur in the
    frame being transformed.
    """

    def fit(self, X, y=None):
        """Learn the categories of the "Jod" column of X and return self."""
        # handle_unknown="ignore": a job that was not seen during fit becomes
        # an all-zero row at transform time instead of raising an error.
        self.encoder_ = OneHotEncoder(handle_unknown="ignore").fit(X[["Jod"]])
        return self

    def transform(self, X):
        """Return an encoded copy of X.

        "Gender" is replaced by 0 for "m" and 1 for "f"; "Jod" is replaced by
        one indicator column per job category.

        Raises:
            KeyError: if "Gender" contains a value other than "m" or "f".
        """
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            # Backward-compatible fallback for callers that skip fit(): learn
            # the categories from X itself.
            encoder = OneHotEncoder().fit(X[["Jod"]])

        # Work on a copy so the caller's DataFrame is not modified.
        X = X.copy()

        gender_dct = {"m": 0, "f": 1}
        X["Gender"] = [gender_dct[g] for g in X["Gender"]]

        matrix = encoder.transform(X[["Jod"]]).toarray()

        # Read the labels from the fitted encoder so every name is guaranteed
        # to match its column; matrix.T has one row per category.
        for name, column in zip(encoder.categories_[0], matrix.T):
            X[name] = column

        return X.drop(columns=["Jod"])

In [30]:

# A second toy dataset with the same four columns; two ages are missing here.
data1 = {
    "Name": ["Pionna", "Gerald", "Hans", "Isabelle", "Jacob"],
    "Age": [23, 44, None, None, 29],
    "Gender": ["f", "m", "m", "f", "m"],
    "Jod": ["Writer", "Programmer", "Programmer", "Teacher", "Cook"],
}

df2 = pd.DataFrame(data=data1)
df2

Unnamed: 0,Name,Age,Gender,Jod
0,Pionna,23.0,f,Writer
1,Gerald,44.0,m,Programmer
2,Hans,,m,Programmer
3,Isabelle,,f,Teacher
4,Jacob,29.0,m,Cook


In [31]:
# Run the three custom steps by hand, feeding each result into the next one.
dropper = NameDropper()
imp = AgeImputer()
fe = FeatureEncoder()

without_name = dropper.fit_transform(df2)
ages_filled = imp.fit_transform(without_name)
fe.fit_transform(ages_filled)


Unnamed: 0,Age,Gender,Cook,Programmer,Teacher,Writer
0,23.0,1,0.0,0.0,0.0,1.0
1,44.0,0,0.0,1.0,0.0,0.0
2,32.0,0,0.0,1.0,0.0,0.0
3,32.0,1,0.0,0.0,1.0,0.0
4,29.0,0,1.0,0.0,0.0,0.0


In [33]:
from sklearn.pipeline import Pipeline

# The same three steps, now chained by a scikit-learn Pipeline.
pipe = Pipeline(
    steps=[
        ("dropper", NameDropper()),
        ("imp", AgeImputer()),
        ("enc", FeatureEncoder()),
    ]
)

pipe.fit_transform(df2)


Unnamed: 0,Age,Gender,Cook,Programmer,Teacher,Writer
0,23.0,1,0.0,0.0,0.0,1.0
1,44.0,0,0.0,1.0,0.0,0.0
2,32.0,0,0.0,1.0,0.0,0.0
3,32.0,1,0.0,0.0,1.0,0.0
4,29.0,0,1.0,0.0,0.0,0.0
