In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.compose import (
    make_column_selector as selector,
    ColumnTransformer
)
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import( 
    StandardScaler, 
    OneHotEncoder
)
import pickle

In [27]:
# Load the heart-disease CSV from the preprocessing folder and preview
# the first five rows (5 is the default for DataFrame.head).
df_raw = pd.read_csv(filepath_or_buffer='./data_set/preprocessing/hd_dataSet.csv')
df_raw.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,male,typical angina,145.0,233.0,greater than 120mg/ml,left ventricular hypertrophy,150.0,no,2.3,downsloping,0.0,fixed defect,0
1,67.0,male,asymptomatic,160.0,286.0,lower than 120mg/ml,left ventricular hypertrophy,108.0,yes,1.5,flat,3.0,normal,1
2,67.0,male,asymptomatic,120.0,229.0,lower than 120mg/ml,left ventricular hypertrophy,129.0,yes,2.6,flat,2.0,reversable defect,1
3,37.0,male,non-anginal pain,130.0,250.0,lower than 120mg/ml,normal,187.0,no,3.5,downsloping,0.0,normal,0
4,41.0,female,atypical angina,130.0,204.0,lower than 120mg/ml,left ventricular hypertrophy,172.0,no,1.4,upsloping,0.0,normal,0


In [3]:
# Columns routed to the categorical branch of the preprocessor.
# NOTE(review): 'target' is presumably the label column; because it is listed
# here, any ColumnTransformer built from cat_col will require a 'target'
# column at transform time -- confirm this is intended.
cat_col = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exang',
    'slope',
    'thal',
    'ca',
    'target',
]

# Columns routed to the numeric branch of the preprocessor.
num_col = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

In [46]:
# One transformer per column family: one-hot encoding (first level of each
# feature dropped) for the categorical columns, standard scaling for the
# numeric ones.
categorical_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

# Output column order follows the order of this list: the scaled columns
# from num_col come first, followed by the one-hot columns from cat_col.
# Columns not listed in either group are dropped (remainder defaults to 'drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_col),
        ('cat', categorical_transformer, cat_col),
    ],
)

In [47]:
# Fit the preprocessor on the raw frame and wrap the transformed matrix in a
# DataFrame (integer column labels, default RangeIndex).
# Built directly instead of appending to an empty frame: DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0, and the intermediate
# empty frame added nothing.
models_df = pd.DataFrame(preprocessor.fit_transform(df_raw))
models_df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.936181,0.75038,-0.276443,0.017494,1.068965,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.378929,1.596266,0.744555,-1.816334,0.381773,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
2,1.378929,-0.659431,-0.3535,-0.89942,1.326662,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,-1.94168,-0.095506,0.051047,1.63301,2.099753,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-1.498933,-0.095506,-0.835103,0.978071,0.295874,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Inspect the fitted sub-transformers by the names given in the
# ColumnTransformer ('num' and 'cat'); only available after fitting.
preprocessor.named_transformers_


{'num': StandardScaler(), 'cat': OneHotEncoder(drop='first')}

In [49]:
# Dump the full parameter set of the ColumnTransformer, including the nested
# parameters of each sub-transformer (prefixed 'num__' / 'cat__').
preprocessor.get_params()

{'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('num',
   StandardScaler(),
   ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']),
  ('cat',
   OneHotEncoder(drop='first'),
   ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca', 'target'])],
 'verbose': False,
 'num': StandardScaler(),
 'cat': OneHotEncoder(drop='first'),
 'num__copy': True,
 'num__with_mean': True,
 'num__with_std': True,
 'cat__categories': 'auto',
 'cat__drop': 'first',
 'cat__dtype': numpy.float64,
 'cat__handle_unknown': 'error',
 'cat__sparse': True}

In [36]:
# Exploratory check: fit a plain one-hot encoder on *every* column of the raw
# frame (numeric ones included) and list the distinct values it found per
# column. fit() returns the estimator itself, so it can be chained.
oh = OneHotEncoder().fit(df_raw)
oh.categories_

[array([29., 34., 35., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46.,
        47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,
        60., 61., 62., 63., 64., 65., 66., 67., 68., 69., 70., 71., 74.,
        76., 77.]),
 array(['female', 'male'], dtype=object),
 array(['asymptomatic', 'atypical angina', 'non-anginal pain',
        'typical angina'], dtype=object),
 array([ 94., 100., 101., 102., 104., 105., 106., 108., 110., 112., 114.,
        115., 117., 118., 120., 122., 123., 124., 125., 126., 128., 129.,
        130., 132., 134., 135., 136., 138., 140., 142., 144., 145., 146.,
        148., 150., 152., 154., 155., 156., 158., 160., 164., 165., 170.,
        172., 174., 178., 180., 192., 200.]),
 array([126., 131., 141., 149., 157., 160., 164., 166., 167., 168., 169.,
        172., 174., 175., 176., 177., 178., 180., 182., 183., 184., 185.,
        186., 187., 188., 192., 193., 195., 196., 197., 198., 199., 200.,
        201., 203., 204., 205., 206., 207., 208., 

In [6]:
# Serialize the `preprocessor` object to disk with pickle.
# Note: despite the file name "model.bin", the object written here is the
# preprocessing ColumnTransformer, not an estimator.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("model.bin", 'wb') as f_out:
    pickle.dump(preprocessor, f_out)

In [7]:
# Read the pickled object back from 'model.bin' into `model`.
# pickle.load can execute arbitrary code, so only load files this notebook
# (or another trusted source) produced.
with open('model.bin', mode='rb') as pickled_file:
    model = pickle.load(pickled_file)

In [8]:
# Round-trip check: apply the object reloaded from 'model.bin' to the raw
# frame. transform() does not refit; it reuses whatever state was pickled.
model.transform(df_raw)

array([[ 0.93618065,  0.75038004, -0.27644339, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.3789285 ,  1.59626645,  0.74455507, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.3789285 , -0.65943064, -0.35349988, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.48961547,  0.69398761, -1.04700826, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.27205887, -0.09550637, -2.24138382, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.27205887, -0.09550637, -0.21865103, ...,  0.        ,
         0.        ,  1.        ]])