In [2]:
import pandas as pd
import numpy as py
from sklearn.base import BaseEstimator, TransformerMixin

### Develop Individual Classes ###

In [3]:
class PatientIdDrop(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(['Patient ID'], axis=1)

In [4]:
df = pd.read_excel('../data/data.xlsx')

In [5]:
df

Unnamed: 0,Patient ID,Height,Weight,Age,Gender,Job
0,P1001,160.0,67.0,20,F,Teacher
1,P1002,162.0,54.0,21,F,Engineer
2,P1003,,78.0,23,M,Engineer
3,P1004,161.0,65.0,19,F,Teacher
4,P1005,172.0,,18,M,Accountant


In [6]:
dropper = PatientIdDrop()
df = dropper.fit_transform(df)
df

Unnamed: 0,Height,Weight,Age,Gender,Job
0,160.0,67.0,20,F,Teacher
1,162.0,54.0,21,F,Engineer
2,,78.0,23,M,Engineer
3,161.0,65.0,19,F,Teacher
4,172.0,,18,M,Accountant


In [7]:
from sklearn.impute import SimpleImputer

class HeightWeightImputer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        imputer = SimpleImputer(strategy='mean')
        X['Height'] = imputer.fit_transform(X[['Height']])
        X['Weight'] = imputer.fit_transform(X[['Weight']])
        return  X

In [8]:
height_weight_imputer = HeightWeightImputer()
df = height_weight_imputer.fit_transform(df)
df

Unnamed: 0,Height,Weight,Age,Gender,Job
0,160.0,67.0,20,F,Teacher
1,162.0,54.0,21,F,Engineer
2,163.75,78.0,23,M,Engineer
3,161.0,65.0,19,F,Teacher
4,172.0,66.0,18,M,Accountant


In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

class FeaturesEncoder(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        gender_codes = {'M': 0, 'F': 1}
        X['Gender'] = [gender_codes[gender] for gender in X['Gender']]
        
        encoder = OneHotEncoder()
        encode_column = encoder.fit_transform(X[['Job']][::-1]).toarray()
        onehot_columns = ['Teacher', 'Engineer', 'Accountant']

        for i in range(len(encode_column.T)):
            X[onehot_columns[i]] = encode_column.T[i]
            
        X = X.drop(['Job'], axis=1)
    
        return  X

In [10]:
features_encoder = FeaturesEncoder()
df = features_encoder.fit_transform(df)
df

Unnamed: 0,Height,Weight,Age,Gender,Teacher,Engineer,Accountant
0,160.0,67.0,20,1,1.0,0.0,0.0
1,162.0,54.0,21,1,0.0,0.0,1.0
2,163.75,78.0,23,0,0.0,1.0,0.0
3,161.0,65.0,19,1,0.0,1.0,0.0
4,172.0,66.0,18,0,0.0,0.0,1.0


### Combine all steps ###

In [11]:
df = pd.read_excel('../data/data.xlsx')

In [12]:
patientId_dropper = PatientIdDrop()
height_height_imputer = HeightWeightImputer()
features_encoder = FeaturesEncoder()

features_encoder.fit_transform(
    height_weight_imputer.fit_transform(
        patientId_dropper.fit_transform(df)))

Unnamed: 0,Height,Weight,Age,Gender,Teacher,Engineer,Accountant
0,160.0,67.0,20,1,1.0,0.0,0.0
1,162.0,54.0,21,1,0.0,0.0,1.0
2,163.75,78.0,23,0,0.0,1.0,0.0
3,161.0,65.0,19,1,0.0,1.0,0.0
4,172.0,66.0,18,0,0.0,0.0,1.0


### Pipelines ###

In [13]:
from sklearn.pipeline import Pipeline

df = pd.read_excel('../data/data.xlsx')

pipeline = Pipeline([
    ('dropper', PatientIdDrop()), 
    ('height_weight_imputer', HeightWeightImputer()),
    ('features_encoder', FeaturesEncoder())
    ])

df = pipeline.fit_transform(df)
df

Unnamed: 0,Height,Weight,Age,Gender,Teacher,Engineer,Accountant
0,160.0,67.0,20,1,1.0,0.0,0.0
1,162.0,54.0,21,1,0.0,0.0,1.0
2,163.75,78.0,23,0,0.0,1.0,0.0
3,161.0,65.0,19,1,0.0,1.0,0.0
4,172.0,66.0,18,0,0.0,0.0,1.0
