In [67]:
import pandas as pd
data = {"Name": ["Mike", "Brian", "Simon","Vallery","Ashley"],
       "Age":[22,19,21,None,22],
       "Gender":["m","m","m","f","f"],
       "Job":["Mechanic","Teacher","Doctor", "Teacher","Chef"]}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Gender,Job
0,Mike,22.0,m,Mechanic
1,Brian,19.0,m,Teacher
2,Simon,21.0,m,Doctor
3,Vallery,,f,Teacher
4,Ashley,22.0,f,Chef


preprocessing pipeline:
* Drop name features
* Impute ages
* Turn gender into binary/numeric
* One Hot encode jobs


One Hot Encoding involves creatin new columns indicating the presence (or absence) of each possible value in the original data.


In [68]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#Drop name features
df = df.drop(["Name"], axis =1 )

# Impute Ages 

imputer = SimpleImputer(strategy="mean")
df['Age'] = imputer.fit_transform(df[["Age"]])

#Numeric Gender
gender_dct = {"m": 0,"f": 1}
df['Gender'] = [gender_dct[g] for g in df['Gender']]

#OneHotEncode Jobs
encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[['Job']]).toarray()

column_names = ["Mechanic","Teacher","Doctor", "Teacher","Chef"]

for i in range(len(matrix.T)):
    df[column_names[i]]=matrix.T[i]
    
df = df.drop(['Job'],axis = 1)
#we are dropping this column since it has already been encoded

In [69]:
df
#This is our transformed dataframe

Unnamed: 0,Age,Gender,Mechanic,Teacher,Doctor
0,22.0,0,0.0,0.0,1.0
1,19.0,0,0.0,1.0,0.0
2,21.0,0,0.0,0.0,0.0
3,21.0,1,0.0,1.0,0.0
4,22.0,1,1.0,0.0,0.0


Now let's say you want to do the same to other dataframes but in a more simpler way using pipelines. 
Below is an example of a new dataframe


In [71]:
data2= {"Name": ["Maureen", "Ian", "Chinedu","Kamau","Laura"],
       "Age":[None,22,21,None,22],
       "Gender":["f","m","m","m","f"],
       "Job":["Mechanic","Engineer","Doctor", "Programmer","Nurse"]}


df2 = pd.DataFrame(data2)
df2


Unnamed: 0,Name,Age,Gender,Job
0,Maureen,,f,Mechanic
1,Ian,22.0,m,Engineer
2,Chinedu,21.0,m,Doctor
3,Kamau,,m,Programmer
4,Laura,22.0,f,Nurse


In [73]:
#Pipelines
from sklearn.base import BaseEstimator, TransformerMixin




class NameDropper(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):
        return X.drop(['Name'],axis =1 )
    
class AgeImputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):
        imputer = SimpleImputer(strategy="mean")
        X['Age'] = imputer.fit_transform(X[['Age']])
        
        return X
    
    
class FeatureEncoder(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):
        gender_dct = {"m": 0,"f": 1}
        X['Gender'] = [gender_dct[g] for g in X['Gender']]

        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[['Job']]).toarray()

        column_names = ["Mechanic","Teacher","Doctor", "Teacher","Chef"]

        for i in range(len(matrix.T)):
            X[column_names[i]]=matrix.T[i]

        return X.drop(['Job'],axis = 1)
    

In [74]:
dropper = NameDropper()
imp = AgeImputer()
enc = FeatureEncoder()

#doing it mannually 
enc.fit_transform(imp.fit_transform(dropper.fit_transform(df2)))

Unnamed: 0,Age,Gender,Mechanic,Teacher,Doctor,Chef
0,21.666667,1,0.0,0.0,1.0,0.0
1,22.0,0,0.0,0.0,0.0,0.0
2,21.0,0,1.0,0.0,0.0,0.0
3,21.666667,0,0.0,0.0,0.0,1.0
4,22.0,1,0.0,1.0,0.0,0.0


In [76]:
#using pipelines

from sklearn.pipeline import Pipeline 

pipe = Pipeline ([
    ("dropper",NameDropper()),
    ("imputer", AgeImputer()),
    ("encoder", FeatureEncoder())
])

pipe.fit_transform(df2)


Unnamed: 0,Age,Gender,Mechanic,Teacher,Doctor,Chef
0,21.666667,1,0.0,0.0,1.0,0.0
1,22.0,0,0.0,0.0,0.0,0.0
2,21.0,0,1.0,0.0,0.0,0.0
3,21.666667,0,0.0,0.0,0.0,1.0
4,22.0,1,0.0,1.0,0.0,0.0


In [75]:
pipe.fit_transform(df2)


Unnamed: 0,Age,Gender,Mechanic,Teacher,Doctor,Chef
0,21.666667,1,0.0,0.0,1.0,0.0
1,22.0,0,0.0,0.0,0.0,0.0
2,21.0,0,1.0,0.0,0.0,0.0
3,21.666667,0,0.0,0.0,0.0,1.0
4,22.0,1,0.0,1.0,0.0,0.0
