In [66]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pickle

In [27]:
warnings.filterwarnings("ignore")

# Pré-processamento #

## Visualização prévia ##

In [28]:
df = pd.read_csv(
    '../data/heart/processed/heart.csv',
    sep = ';', encoding = 'utf-8'
)

In [29]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


In [30]:
df.shape

(917, 12)

## Transformando as variáveis categóricas nominais em variáveis categóricas ordinais ##

### Transformação manual ###

In [31]:
df_encod_manual = pd.DataFrame.copy(df)

In [32]:
df_encod_manual['Sex'].replace({
    'M': 0,
    'F': 1
}, inplace = True)

df_encod_manual['ChestPainType'].replace({
    'TA': 0,
    'ATA': 1,
    'NAP': 2,
    'ASY': 3
}, inplace = True)

df_encod_manual['RestingECG'].replace({
    'Normal': 0,
    'ST': 1,
    'LVH': 2
}, inplace = True)

df_encod_manual['ExerciseAngina'].replace({
    'N': 0,
    'Y': 1
}, inplace = True)

df_encod_manual['ST_Slope'].replace({
    'Up': 0,
    'Flat': 1,
    'Down': 2
}, inplace = True)

In [33]:
df_encod_manual.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,1,140,289.0,0,0,172,0,0.0,0,0
1,49,1,2,160,180.0,0,0,156,0,1.0,1,1
2,37,0,1,130,283.0,0,1,98,0,0.0,0,0
3,48,1,3,138,214.0,0,0,108,1,1.5,1,1
4,54,0,2,150,195.0,0,0,122,0,0.0,0,0


In [34]:
df_encod_manual.shape

(917, 12)

In [35]:
df_encod_manual.dtypes

Age                 int64
Sex                 int64
ChestPainType       int64
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG          int64
MaxHR               int64
ExerciseAngina      int64
Oldpeak           float64
ST_Slope            int64
HeartDisease        int64
dtype: object

# Atributos #

## Separação da base em previsores e classe alvo ##

In [36]:
previsores = df_encod_manual.iloc[:, 0:11].values

In [37]:
previsores

array([[40. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       [49. ,  1. ,  2. , ...,  0. ,  1. ,  1. ],
       [37. ,  0. ,  1. , ...,  0. ,  0. ,  0. ],
       ...,
       [57. ,  0. ,  3. , ...,  1. ,  1.2,  1. ],
       [57. ,  1. ,  1. , ...,  0. ,  0. ,  1. ],
       [38. ,  0. ,  2. , ...,  0. ,  0. ,  0. ]])

In [38]:
previsores.shape

(917, 11)

In [39]:
alvo = df_encod_manual.iloc[:, 11].values

In [40]:
alvo

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [41]:
alvo.shape

(917,)

## Escalonamento ##

In [42]:
df_encod_manual.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,0.210469,2.251908,132.540894,244.635389,0.23337,0.604144,136.789531,0.40458,0.886696,0.63795,0.55289
std,9.437636,0.407864,0.931502,17.999749,53.347125,0.423206,0.806161,25.467129,0.491078,1.06696,0.60727,0.497466
min,28.0,0.0,0.0,80.0,85.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,0.0,2.0,120.0,214.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,3.0,130.0,244.635389,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,0.0,3.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


 **Padronização (utiliza média e desvio padrão como referência). </br> </br>Normalização (utiliza os valores de máximo e mínimo como referência). </br></br> Usaremos padronização**

In [43]:
previsores_esc = StandardScaler().fit_transform(previsores)

In [44]:
previsores_df = pd.DataFrame(previsores_esc)

In [45]:
previsores_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.859654e-16,7.748558e-18,1.046055e-16,7.767929e-16,-1.86934e-16,4.649135e-17,0.0,-5.114048e-16,-1.046055e-16,7.748558000000001e-17,-3.8742790000000005e-17
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-2.704405,-0.5163086,-2.418822,-2.920572,-2.994023,-0.5517333,-0.749818,-3.016886,-0.8243101,-3.269662,-1.051095
25%,-0.6900904,-0.5163086,-0.2705801,-0.6971063,-0.5745784,-0.5517333,-0.749818,-0.6596226,-0.8243101,-0.8315022,-1.051095
50%,0.05202558,-0.5163086,0.803541,-0.1412398,0.0,-0.5517333,-0.749818,0.04755658,-0.8243101,-0.26885,0.5965186
75%,0.688125,-0.5163086,0.803541,0.4146267,0.4194568,-0.5517333,0.491306,0.7547357,1.213136,0.5751284,0.5965186
max,2.490407,1.936826,0.803541,3.749826,6.721265,1.81247,1.73243,2.561971,1.213136,4.982571,2.244132


## Encoders ##

### LabelEncoder ###

In [50]:
previsores_label = df.iloc[:, 0:11].values
previsores_label

array([[40, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       [49, 'F', 'NAP', ..., 'N', 1.0, 'Flat'],
       [37, 'M', 'ATA', ..., 'N', 0.0, 'Up'],
       ...,
       [57, 'M', 'ASY', ..., 'Y', 1.2, 'Flat'],
       [57, 'F', 'ATA', ..., 'N', 0.0, 'Flat'],
       [38, 'M', 'NAP', ..., 'N', 0.0, 'Up']], dtype=object)

In [52]:
previsores_label[:, 1] = LabelEncoder().fit_transform(previsores[:, 1])
previsores_label

array([[40, 0, 'ATA', ..., 'N', 0.0, 'Up'],
       [49, 1, 'NAP', ..., 'N', 1.0, 'Flat'],
       [37, 0, 'ATA', ..., 'N', 0.0, 'Up'],
       ...,
       [57, 0, 'ASY', ..., 'Y', 1.2, 'Flat'],
       [57, 1, 'ATA', ..., 'N', 0.0, 'Flat'],
       [38, 0, 'NAP', ..., 'N', 0.0, 'Up']], dtype=object)

In [53]:
previsores_label

array([[40, 0, 'ATA', ..., 'N', 0.0, 'Up'],
       [49, 1, 'NAP', ..., 'N', 1.0, 'Flat'],
       [37, 0, 'ATA', ..., 'N', 0.0, 'Up'],
       ...,
       [57, 0, 'ASY', ..., 'Y', 1.2, 'Flat'],
       [57, 1, 'ATA', ..., 'N', 0.0, 'Flat'],
       [38, 0, 'NAP', ..., 'N', 0.0, 'Up']], dtype=object)

In [54]:
previsores_label[:, 2] = LabelEncoder().fit_transform(previsores_label[:, 2])
previsores_label[:, 6] = LabelEncoder().fit_transform(previsores_label[:, 6])
previsores_label[:, 8] = LabelEncoder().fit_transform(previsores_label[:, 8])
previsores_label[:, 10] = LabelEncoder().fit_transform(previsores_label[:, 10])

In [55]:
previsores_label

array([[40, 0, 1, ..., 0, 0.0, 2],
       [49, 1, 2, ..., 0, 1.0, 1],
       [37, 0, 1, ..., 0, 0.0, 2],
       ...,
       [57, 0, 0, ..., 1, 1.2, 1],
       [57, 1, 1, ..., 0, 0.0, 1],
       [38, 0, 2, ..., 0, 0.0, 2]], dtype=object)

In [56]:
previsores_label.shape

(917, 11)

### OneHotEncoder ###

In [58]:
previsores_hot = ColumnTransformer(
    transformers = [('OneHot', OneHotEncoder(), [1, 2, 6, 8, 10])],
    remainder = 'passthrough'
).fit_transform(previsores_label)

In [59]:
previsores_hot

array([[1.0, 0.0, 0.0, ..., 0, 172, 0.0],
       [0.0, 1.0, 0.0, ..., 0, 156, 1.0],
       [1.0, 0.0, 0.0, ..., 0, 98, 0.0],
       ...,
       [1.0, 0.0, 1.0, ..., 0, 115, 1.2],
       [0.0, 1.0, 0.0, ..., 0, 174, 0.0],
       [1.0, 0.0, 0.0, ..., 0, 173, 0.0]], dtype=object)

In [60]:
previsores_hot.shape

(917, 20)

In [61]:
previsores_hot_df = pd.DataFrame(previsores_hot)
previsores_hot_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,40,140,289.0,0,172,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,49,160,180.0,0,156,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,37,130,283.0,0,98,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,48,138,214.0,0,108,1.5
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,54,150,195.0,0,122,0.0


### OneHot + Escalonamento ###

In [62]:
previsoresHot_esc = StandardScaler().fit_transform(previsores_hot)

In [63]:
previsoresHot_esc

array([[ 0.51630861, -0.51630861, -1.08542493, ..., -0.55173333,
         1.38333943, -0.83150225],
       [-1.9368261 ,  1.9368261 , -1.08542493, ..., -0.55173333,
         0.75473573,  0.10625149],
       [ 0.51630861, -0.51630861, -1.08542493, ..., -0.55173333,
        -1.52395266, -0.83150225],
       ...,
       [ 0.51630861, -0.51630861,  0.92129817, ..., -0.55173333,
        -0.85606123,  0.29380223],
       [-1.9368261 ,  1.9368261 , -1.08542493, ..., -0.55173333,
         1.46191489, -0.83150225],
       [ 0.51630861, -0.51630861, -1.08542493, ..., -0.55173333,
         1.42262716, -0.83150225]])

In [64]:
previsoresHot_esc_df = pd.DataFrame(previsoresHot_esc)
previsoresHot_esc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.516309,-0.516309,-1.085425,2.073784,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,-1.001091,1.149573,-1.432206,0.414627,0.832075,-0.551733,1.383339,-0.831502
1,-1.936826,1.936826,-1.085425,-0.482210,1.881384,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,-0.478057,1.526360,-1.212261,-0.551733,0.754736,0.106251
2,0.516309,-0.516309,-1.085425,2.073784,-0.531524,-0.229810,-0.507826,-1.226974,2.037569,0.824310,-0.824310,-0.271607,-1.001091,1.149573,-1.750256,-0.141240,0.719543,-0.551733,-1.523953,-0.831502
3,-1.936826,1.936826,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,-1.213136,1.213136,-0.271607,0.998910,-0.869888,-0.584074,0.303453,-0.574578,-0.551733,-1.131075,0.575128
4,0.516309,-0.516309,-1.085425,-0.482210,1.881384,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,-1.001091,1.149573,0.052026,0.970493,-0.930931,-0.551733,-0.581047,-0.831502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,0.516309,-0.516309,-1.085425,-0.482210,-0.531524,4.351412,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,-0.902124,-1.252973,0.363191,-0.551733,-0.188170,0.293802
913,0.516309,-0.516309,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,1.536257,0.636973,-0.968441,1.812470,0.165420,2.356860
914,0.516309,-0.516309,0.921298,-0.482210,-0.531524,-0.229810,-0.507826,0.815013,-0.490781,-1.213136,1.213136,-0.271607,0.998910,-0.869888,0.370075,-0.141240,-2.131275,-0.551733,-0.856061,0.293802
915,-1.936826,1.936826,-1.085425,2.073784,-0.531524,-0.229810,1.969177,-1.226974,-0.490781,0.824310,-0.824310,-0.271607,0.998910,-0.869888,0.370075,-0.141240,-0.161960,-0.551733,1.461915,-0.831502


In [65]:
previsoresHot_esc_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,-1.472226e-16,1.084798e-16,1.937139e-17,-3.8742790000000005e-17,3.8742790000000005e-17,6.973702000000001e-17,0.0,-9.298269e-17,1.549712e-17,-4.2617070000000006e-17,4.2617070000000006e-17,8.523413e-17,0.0,-3.8742790000000005e-17,1.859654e-16,7.884157e-16,3.014189e-15,-1.549712e-17,-5.114048e-16,-1.859654e-16
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-1.936826,-0.5163086,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-2.704405,-2.920572,-2.994023,-0.5517333,-3.016886,-3.269662
25%,0.5163086,-0.5163086,-1.085425,-0.4822104,-0.5315237,-0.2298105,-0.507826,-1.226974,-0.490781,-1.213136,-0.8243101,-0.2716072,-1.001091,-0.8698879,-0.6900904,-0.6971063,-0.5745784,-0.5517333,-0.6596226,-0.8315022
50%,0.5163086,-0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,-0.8243101,-0.2716072,0.99891,-0.8698879,0.05202558,-0.1412398,3.19836e-15,-0.5517333,0.04755658,-0.26885
75%,0.5163086,-0.5163086,0.9212982,-0.4822104,-0.5315237,-0.2298105,-0.507826,0.8150134,-0.490781,0.8243101,1.213136,-0.2716072,0.99891,1.149573,0.688125,0.4146267,0.4194568,-0.5517333,0.7547357,0.5751284
max,0.5163086,1.936826,0.9212982,2.073784,1.881384,4.351412,1.969177,0.8150134,2.037569,0.8243101,1.213136,3.681787,0.99891,1.149573,2.490407,3.749826,6.721265,1.81247,2.561971,4.982571


### Salvando variáveis (atributos) com o Pickle ###

In [67]:
arq1 = open('../data/heart/processed/heart.pkl', 'wb')

In [68]:
#Salvando a variável no arquivo
pickle.dump(alvo, arq1)

In [69]:
#Fechando o arquivo
arq1.close()

In [70]:
#Lendo o arquivo
arq1 = open('../data/heart/processed/heart.pkl', 'rb')

In [71]:
#Chamando a variavel
alvoPK = pickle.load(arq1)

In [72]:
alvoPK

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [73]:
arq1.close()

In [75]:
#Criando arquivos e salvando as variáveis
arq2 = open('../data/heart/processed/heartC', 'wb')
pickle.dump(previsores, arq2)

arq3 = open('../data/heart/processed/heartESC.pkl', 'wb')
pickle.dump(previsores_esc, arq3)

arq4 = open('../data/heart/processed/heartHOT.pkl', 'wb')
pickle.dump(previsores_hot, arq4)

arq5 = open('../data/heart/processed/heartLABEL.pkl', 'wb')
pickle.dump(previsores_label, arq5)

arq6 = open('../data/heart/processed/heartHOTESC.pkl', 'wb')
pickle.dump(previsoresHot_esc, arq6)

# Separação dos dados em treino e teste #
<span style="font-size: small;"> 
- <strong>arrays:</strong> nomes dos atributos previsores e alvo.</br>
- <strong>test_size:</strong> tamanho em porcentagem dos dados de teste. default é none. </br> 
- <strong>train_size:</strong> tamanho em porcentagem dos dados de treinamento.default é none. </br>  
- <strong>random_state:</strong> nomeação de um estado aleatório. </br>
- <strong>shuffle:</strong> embaralhamento dos dados aleatórios. Associado com o random_state ocorre o mesmo embaralhamento sempre. Default é True. </br>
- <strong>stratify:</strong> Possibilidade de dividir os dados de forma estratificada. Default é None (nesse caso é mantido a proporção, isto é, se tem 30% de zeros e 70% de 1 no dataframe, na separação em treinamento e teste se manterá essa proporção). </span>