# **Analise sem pipelines**
 - Esta é uma base para classificao de doença do coração

In [1]:
#instalando category_encoders para modificar dados categoriais
!pip install category_encoders



In [2]:
#importando bibliotecas
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

  import pandas.util.testing as tm


In [3]:
#carregando base
base = pd.read_csv('heart.csv')

base.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [4]:
#verificando se há dados nao preenchidos ou vazios
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
#analisado os dados que são categoriais
#precisamos modificar esses valores para numeros 
base.select_dtypes(include='object')

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
0,M,ATA,Normal,N,Up
1,F,NAP,Normal,N,Flat
2,M,ATA,ST,N,Up
3,F,ASY,Normal,Y,Flat
4,M,NAP,Normal,N,Up
...,...,...,...,...,...
913,M,TA,Normal,N,Flat
914,M,ASY,Normal,N,Flat
915,M,ASY,Normal,Y,Flat
916,F,ATA,LVH,N,Flat


In [6]:
#separando a base em dataframes de dados e respotas
x = base.drop('HeartDisease', axis=1, inplace=False)
y = base['HeartDisease']

In [7]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [8]:
x.head(3).T

Unnamed: 0,0,1,2
Age,40,49,37
Sex,M,F,M
ChestPainType,ATA,NAP,ATA
RestingBP,140,160,130
Cholesterol,289,180,283
FastingBS,0,0,0
RestingECG,Normal,Normal,ST
MaxHR,172,156,98
ExerciseAngina,N,N,N
Oldpeak,0,1,0


In [9]:
#utilizando o OneHotEncoder para transformar os dados categoriais
#cada dado sera modificado a uma nova coluna quando estiver presente recebe o valor 1
#caso contrario recebera 0
encoder = OneHotEncoder(use_cat_names=True)

x = encoder.fit_transform(x)

#perceba como o novo dataFrame esta modificado
x

Unnamed: 0,Age,Sex_M,Sex_F,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_ASY,ChestPainType_TA,RestingBP,Cholesterol,FastingBS,RestingECG_Normal,RestingECG_ST,RestingECG_LVH,MaxHR,ExerciseAngina_N,ExerciseAngina_Y,Oldpeak,ST_Slope_Up,ST_Slope_Flat,ST_Slope_Down
0,40,1,0,1,0,0,0,140,289,0,1,0,0,172,1,0,0.0,1,0,0
1,49,0,1,0,1,0,0,160,180,0,1,0,0,156,1,0,1.0,0,1,0
2,37,1,0,1,0,0,0,130,283,0,0,1,0,98,1,0,0.0,1,0,0
3,48,0,1,0,0,1,0,138,214,0,1,0,0,108,0,1,1.5,0,1,0
4,54,1,0,0,1,0,0,150,195,0,1,0,0,122,1,0,0.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,0,0,0,0,1,110,264,0,1,0,0,132,1,0,1.2,0,1,0
914,68,1,0,0,0,1,0,144,193,1,1,0,0,141,1,0,3.4,0,1,0
915,57,1,0,0,0,1,0,130,131,0,1,0,0,115,0,1,1.2,0,1,0
916,57,0,1,1,0,0,0,130,236,0,0,0,1,174,1,0,0.0,0,1,0


In [10]:
#separando dados de treino e teste
np.random.seed(42)
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.20, stratify=y)

print("foi separado %d de treino e %d de teste" %(len(x_treino), len(x_teste)))

foi separado 734 de treino e 184 de teste


In [11]:
#utilizando o trasnfomador standardscaler para normalizar os dados
scaler = StandardScaler()
scaler.fit(x_treino)
novo_x_treino = scaler.transform(x_treino) 

novo_x_treino

array([[ 0.9700116 ,  0.51740017, -0.51740017, ..., -0.86224587,
         0.98110491, -0.26155954],
       [ 0.12202766,  0.51740017, -0.51740017, ...,  1.15976201,
        -1.01925899, -0.26155954],
       [ 0.54601963, -1.93273999,  1.93273999, ..., -0.86224587,
         0.98110491, -0.26155954],
       ...,
       [ 0.54601963,  0.51740017, -0.51740017, ..., -0.86224587,
        -1.01925899,  3.82322132],
       [ 0.75801561,  0.51740017, -0.51740017, ...,  1.15976201,
        -1.01925899, -0.26155954],
       [-0.93795226,  0.51740017, -0.51740017, ...,  1.15976201,
        -1.01925899, -0.26155954]])

In [12]:
#utilizando o trasnfomador standardscaler para normalizar os dados
scaler.fit(x_teste)
novo_x_teste = scaler.transform(x_teste)

novo_x_teste

array([[-0.66417717,  0.51017053, -0.51017053, ..., -0.89661673,
         1.06748999, -0.3086067 ],
       [ 0.62903552,  0.51017053, -0.51017053, ...,  1.11530374,
        -0.93677693, -0.3086067 ],
       [ 0.84457097,  0.51017053, -0.51017053, ..., -0.89661673,
         1.06748999, -0.3086067 ],
       ...,
       [-0.340874  ,  0.51017053, -0.51017053, ..., -0.89661673,
         1.06748999, -0.3086067 ],
       [ 0.73680324,  0.51017053, -0.51017053, ..., -0.89661673,
         1.06748999, -0.3086067 ],
       [ 0.5212678 ,  0.51017053, -0.51017053, ..., -0.89661673,
         1.06748999, -0.3086067 ]])

In [13]:
#neste exemplo eu escolhi a arvore de decisao
modelo = DecisionTreeClassifier()

modelo = modelo.fit(novo_x_treino, y_treino)

In [14]:
#vendo o percentual de acertos
previsao = modelo.predict(novo_x_teste)

acertos = accuracy_score(y_teste, previsao) * 100

print("percentual de acertos, utilizando o calssificador arvore de decisao: %.2f" %acertos + "%")

percentual de acertos, utilizando o calssificador arvore de decisao: 75.00%


# **Pipelines**

In [15]:
#separando novamete os dados para construirmos os pipelines
np.random.seed(42)
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y)

print("foi separado %d de treino e %d de teste" %(len(x_treino), len(x_teste)))

foi separado 688 de treino e 230 de teste


In [16]:
#criação do primeiro pipeline 
#este segue uma sequencia de instruções 
#o primeiro paramento pode ser qualquer nome
#o segundo paramentro é passado as funçoes da biblioteca que etamos utilizado
#perceba que estou executando exatamente o que fizemos antes
#transformar dados categoriais / normalizar dados / utilizar arvore de decisao
pipeline_1 = Pipeline([                 
    ('encoder', OneHotEncoder()),
    ('scaler', StandardScaler()),
    ('modelo', DecisionTreeClassifier())
])

In [17]:
pipeline_1.steps

[('encoder', OneHotEncoder()),
 ('scaler', StandardScaler()),
 ('modelo', DecisionTreeClassifier())]

In [18]:
#treinando nosso pipeline
pipeline_1.fit(x_treino, y_treino)

Pipeline(steps=[('encoder', OneHotEncoder(cols=[])),
                ('scaler', StandardScaler()),
                ('modelo', DecisionTreeClassifier())])

In [19]:
#verificando percentual de acertos
previsao = pipeline_1.score(x_teste, y_teste) *100

print("percentual de acertos no primeiro pipeline: %.2f" %previsao + "%") 

percentual de acertos no primeiro pipeline: 80.43%


In [20]:
#cirando novos pipelines para testarmos

pipeline_2 = Pipeline([
    ('encoder', OneHotEncoder()),
    ('scaler', MinMaxScaler()),
    ('modelo', DecisionTreeClassifier())       
])

pipeline_3 = Pipeline([     
    ('encoder', OneHotEncoder()),     
    ('scaler', MinMaxScaler()),
    ('modelo', DecisionTreeClassifier(max_depth=3))   
])

pipeline_4 = Pipeline([ 
    ('encoder', OneHotEncoder()),                             
    ('scaler', StandardScaler()),
    ('modelo', DecisionTreeClassifier(max_depth=5))     
])

In [21]:
#testando o pipeline 2
pipeline_2.fit(x_treino, y_treino)

Pipeline(steps=[('encoder', OneHotEncoder(cols=[])), ('scaler', MinMaxScaler()),
                ('modelo', DecisionTreeClassifier())])

In [22]:
previsao = pipeline_2.score(x_teste, y_teste) *100
print("percentual de acertos no segundo pipeline: %.2f" %previsao + "%") 

percentual de acertos no segundo pipeline: 78.26%


In [23]:
#testando terceriro pipeline
pipeline_3.fit(x_treino, y_treino)

Pipeline(steps=[('encoder', OneHotEncoder(cols=[])), ('scaler', MinMaxScaler()),
                ('modelo', DecisionTreeClassifier(max_depth=3))])

In [24]:
previsao = pipeline_3.score(x_teste, y_teste) * 100
print("percentual de acertos no terceiro pipeline: %.2f" %previsao + "%")

percentual de acertos no terceiro pipeline: 85.22%


In [25]:
#testando quarto pipeline
pipeline_4.fit(x_treino, y_treino)

Pipeline(steps=[('encoder', OneHotEncoder(cols=[])),
                ('scaler', StandardScaler()),
                ('modelo', DecisionTreeClassifier(max_depth=5))])

In [26]:
previsao = pipeline_4.score(x_teste, y_teste) * 100
print("percentual de acertos no quarto pipeline: %.2f" %previsao + "%")

percentual de acertos no quarto pipeline: 85.22%


# **Utilizando o GridSearchCV**
- greralmente utilizado para testar hiperparametros de um modelo
- vamos junta-lo ao conteito de um pipeline

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
#escolhendo hiperparemtro que quero testar
#arqui voce pode passa quando quiser eu escolhi apenas o max_depth
parametros = { 'modelo__max_depth' : [4,5,6,7] }

In [29]:
#pipeline para executarmos no GridSearch
#transformar dados categoriais / normalizar dados / utilizar arvore de decisao
pipeline_grid = Pipeline([('encoder', OneHotEncoder()),                             
                          ('scaler', StandardScaler()),
                          ('modelo', DecisionTreeClassifier())
                          ])

In [30]:
#instanciando o GridSearchCV
#no campo cv é referente a uma validacao cruzada
grid = GridSearchCV(pipeline_grid, parametros, cv=3, scoring='accuracy')

In [31]:
#treinamento
grid.fit(x_treino, y_treino)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('encoder', OneHotEncoder()),
                                       ('scaler', StandardScaler()),
                                       ('modelo', DecisionTreeClassifier())]),
             param_grid={'modelo__max_depth': [4, 5, 6, 7]},
             scoring='accuracy')

In [32]:
#olhando resultados de cada modelo criado com seus respectivos hiperparametros
grid.cv_results_

{'mean_fit_time': array([0.01784452, 0.01681352, 0.01469858, 0.01270294]),
 'mean_score_time': array([0.00541584, 0.00593917, 0.00501418, 0.0047218 ]),
 'mean_test_score': array([0.8299095 , 0.80087336, 0.8037466 , 0.78776027]),
 'param_modelo__max_depth': masked_array(data=[4, 5, 6, 7],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'modelo__max_depth': 4},
  {'modelo__max_depth': 5},
  {'modelo__max_depth': 6},
  {'modelo__max_depth': 7}],
 'rank_test_score': array([1, 3, 2, 4], dtype=int32),
 'split0_test_score': array([0.85217391, 0.8       , 0.82608696, 0.80869565]),
 'split1_test_score': array([0.8209607 , 0.79912664, 0.81222707, 0.79039301]),
 'split2_test_score': array([0.81659389, 0.80349345, 0.77292576, 0.76419214]),
 'std_fit_time': array([0.00253792, 0.00337755, 0.00029152, 0.00020479]),
 'std_score_time': array([1.02670015e-04, 1.83230159e-03, 3.00319283e-04, 6.02661093e-05]),
 'std_test_score': array([0.01

In [33]:
print("o modelo com melhor max_depth possui: ", grid.best_params_ , 
      "e a melhor media percentual: ",  grid.best_score_ )

o modelo com melhor max_depth possui:  {'modelo__max_depth': 4} e a melhor media percentual:  0.8299094993987722
