In [1]:
import pandas as pd

In [2]:
data = {'name':['Micheal',"Mia",'Daniel','Malkova','sunny'],
       'age':[59,28,32,None,36],
       'gender':['m','f','f','f','f'],
       'job':['Dancer','Artist','bowler','player','Singer']}
df = pd.DataFrame(data)

In [3]:
df.head()

Unnamed: 0,name,age,gender,job
0,Micheal,59.0,m,Dancer
1,Mia,28.0,f,Artist
2,Daniel,32.0,f,bowler
3,Malkova,,f,player
4,sunny,36.0,f,Singer


## preprocessing pipeline

- Drop name features
- Impute ages
- turn gender into binary
- One hot encording (job)

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [5]:
## 1. name drop
## df = df.drop(['name'],axis=1)

## 2. Impute Age 
## imputer = SimpleImputer(strategy = 'mean')
## imputer.fit_transform(df['Age'])

## 3. tuen gender into binary
## male 0, feamale 1
## df['gender'].map({'m':0,'f':1}) or 
## gender_dict = {'m':0,'f':1} ==> [gender_dict[i] for i in df['gender']]



In [6]:
## 1. name drop
df = df.drop(['name'],axis=1)

## 2. Impute Age 
imputer = SimpleImputer(strategy='mean')
df['age'] = imputer.fit_transform(df[['age']])

In [7]:
## 3. tuen gender into binary
gender_dict = {'m':0,'f':1}
df['gender'] = [gender_dict[i] for i in df['gender']]
## One hot encording 


In [8]:
# one hot encording
encorder  = OneHotEncoder()
matrix = encorder.fit_transform(df[['job']]).toarray()

column_name = df['job'].values.tolist()

for i in range(len(matrix.T)):
    df[column_name[i]] = matrix.T[i]

df = df.drop(['job'],axis=1)

In [9]:
matrix

array([[0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

In [10]:
data = {'name':['Micheal',"Mia",'Daniel','Malkova','sunny'],
       'age':[59,None,32,None,36],
       'gender':['m','f','f','m','f'],
       'job':['Dancer','Artist','actor','player','Singer']}
df = pd.DataFrame(data)

## Pipelines 

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

In [12]:
class NameDropper(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):
        return X.drop(['name'],axis=1)
    
class ImputeAge(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X):     
        imputer = SimpleImputer(strategy = 'mean')
        X['age'] = imputer.fit_transform(X[['age']])
        return X

class FeatureEncorder(BaseEstimator,TransformerMixin):
    
    def fit(self,X,y=None):
        return self
    
    def transform(self,X): 
        
        # convert into binary
        gender_dict = {'m':0,'f':1}
        X['gender'] = [gender_dict[i] for i in X['gender']]
        
        # One hot encording
        encorder = OneHotEncoder()
        encorder.fit_transform(X[['job']]).toarray()
        
        column_name = X['job'].values.tolist()
        
        for i,j in enumerate(column_name):
            X[column_name[i]] = matrix.T[i]
        return X.drop(['job',column_name[-1]],axis=1)

In [13]:
dropper = NameDropper()
impputer = ImputeAge()
encorder = FeatureEncorder()

In [14]:
encorder.fit_transform(impputer.fit_transform(dropper.fit_transform(df)))

Unnamed: 0,age,gender,Dancer,Artist,actor,player
0,59.0,0,0.0,1.0,0.0,0.0
1,42.333333,1,1.0,0.0,0.0,0.0
2,32.0,1,0.0,0.0,0.0,1.0
3,42.333333,0,0.0,0.0,0.0,0.0
4,36.0,1,0.0,0.0,1.0,0.0


In [16]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('dropper' , NameDropper()),
    ('imputer' , ImputeAge()),
    ('encorder' , FeatureEncorder()),
])

In [17]:
pipeline.fit_transform(df)

Unnamed: 0,age,gender,Dancer,Artist,actor,player
0,59.0,0,0.0,1.0,0.0,0.0
1,42.333333,1,1.0,0.0,0.0,0.0
2,32.0,1,0.0,0.0,0.0,1.0
3,42.333333,0,0.0,0.0,0.0,0.0
4,36.0,1,0.0,0.0,1.0,0.0
