<a href="https://colab.research.google.com/github/ThiagueraBarao/Otimizacao_de_Hiperparametros/blob/master/Decision_Tree_Otimizacao_Hiperparametros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Otimização de Hiperparâmetros**

### Autor: Thiago do Carmo Nunes
### Acesse em: [thiagocarmonunes.com.br](https://www.thiagocarmonunes.com.br/)
#### Publicado em: 19-Jul-2020

# **Aplicando o conhecimento em uma Árvore de Decisão**

In [None]:
#Pacotes Utilizados
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree, metrics
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.metrics import roc_auc_score
from hyperopt import hp,tpe, fmin, STATUS_OK , space_eval

In [None]:
# Load the credit-card dataset from the CSV file and preview the first rows.
caminho_base = "/content/sample_data/UCI_Credit_Card.csv"
df = pd.read_csv(caminho_base)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [None]:
# Note: no further data preparation is done here, because the only goal of this
# notebook is to demonstrate hyperparameter optimization.
# Replace missing values with the sentinel -99.
df = df.fillna(-99)

In [None]:
# Split the frame into features (X) and target (y):
# X is every column except the first and the last; y is the last column.
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

In [None]:
# Don't forget the seed: fixed value passed as random_state further down the notebook.
semente_thiago = 2020

In [None]:
# Split into train / validation / test sets:
#   train: 60%, validation: 20%, test: 20%.
# Step 1: hold out 20% of the data as the test set.
X_resto, X_test, y_resto, y_test = train_test_split(
    X, y, test_size=0.2, random_state=semente_thiago
)
# Step 2: take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2).
X_train, X_val, y_train, y_val = train_test_split(
    X_resto, y_resto, test_size=0.25, random_state=semente_thiago
)

In [None]:
# Report the shape of every split: feature matrices first, then targets.
for rotulo, base in [
    ('Train X', X_train),
    ('Valid X', X_val),
    ('Test X', X_test),
    ('Train y', y_train),
    ('Valid y', y_val),
    ('Test y', y_test),
]:
    print(rotulo + ' : ' + str(base.shape))

Train X : (18000, 23)
Valid X : (6000, 23)
Test X : (6000, 23)
Train y : (18000,)
Valid y : (6000,)
Test y : (6000,)


## Árvore de Decisão

### Parâmetros

* criterion='gini'
> {“gini”, “entropy”}
* splitter='best' 
> {“best”, “random”}
* max_depth=None 
> {2:inf}
* max_features=None
> {“auto”, “sqrt”, “log2”} — obs.: na árvore de decisão “auto” equivale a “sqrt” e foi removido a partir do scikit-learn 1.3


In [None]:
#Escolha do Modelo - Árvore de Decisão
#clf = tree.DecisionTreeClassifier()

## Escolha dos Hiperparâmetros:

### Conhecimento de Negócio:

In [None]:
# Fit the "business knowledge" baseline with hand-picked hyperparameters.
# random_state is fixed like for the other estimators in this notebook:
# scikit-learn permutes the features at every split even with splitter='best',
# so equally good splits could otherwise be resolved differently between runs.
clf_1 = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=4,
    max_features=None,
    random_state=semente_thiago,
)
clf_1 = clf_1.fit(X_train, y_train)

In [None]:
def GINI(classificador,y,X,safra):
    """Print and return the Gini coefficient (2 * ROC AUC - 1) of a classifier.

    Args:
        classificador: fitted object exposing ``predict_proba``.
        y: true labels matching the rows of ``X``.
        X: feature matrix to score.
        safra: name of the dataset being scored; only used in the printed line.

    Returns:
        The Gini coefficient, computed as ``2 * roc_auc_score - 1``.
    """
    # Score once and reuse it; column 1 of predict_proba is used as the score.
    probabilidade = classificador.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, probabilidade)
    gini = 2 * auc - 1
    print("GINI " + safra + " : " + str(gini))
    return gini

In [None]:
# Gini of the baseline on each split; the validation value is only printed,
# while the train and test values are kept for the final comparison.
clf_1_gini_train = GINI(classificador=clf_1, y=y_train, X=X_train, safra='Treino')
GINI(classificador=clf_1, y=y_val, X=X_val, safra='Validação')
clf_1_gini_test = GINI(classificador=clf_1, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.4954620595206256
GINI Validação : 0.47993752752363483
GINI Teste : 0.4973788383497122


### As demais técnicas vão utilizar K-Fold, portanto não é mais necessário separar a base de treino em (Treino & Validação)

In [None]:
# Redo the split with only train (80%) and test (20%); same seed and test_size
# as the first hold-out split above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=semente_thiago)

In [None]:
# Report the shape of the train and test splits: features first, then targets.
for rotulo, base in [
    ('Train X', X_train),
    ('Test X', X_test),
    ('Train y', y_train),
    ('Test y', y_test),
]:
    print(rotulo + ' : ' + str(base.shape))

Train X : (24000, 23)
Test X : (6000, 23)
Train y : (24000,)
Test y : (6000,)


### Grid Search:

In [None]:
# Seeded base estimator handed to the grid search.
clf_2 = tree.DecisionTreeClassifier(random_state=semente_thiago)

In [None]:
# Hyperparameter grid to sweep exhaustively.
# "auto" is deliberately not listed under max_features: for
# DecisionTreeClassifier it was an alias of "sqrt" (so it only duplicated
# fits), and scikit-learn >= 1.3 rejects it.
tree_param = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': list(range(3, 15)),
    'max_features': ['sqrt', 'log2'],
}
# 5-fold cross-validated search, ranking candidates by ROC AUC.
clf_2_search = GridSearchCV(clf_2, tree_param, scoring='roc_auc', cv=5)
clf_2_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=2020,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                   

In [None]:
# Gini of the grid-search result on train and test, kept for the final comparison.
clf_2_gini_train = GINI(classificador=clf_2_search, y=y_train, X=X_train, safra='Treino')
clf_2_gini_test = GINI(classificador=clf_2_search, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.561115278935971
GINI Teste : 0.49463620881096637


### Random Search:

In [None]:
# Seeded base estimator handed to the randomized search.
clf_3 = tree.DecisionTreeClassifier(random_state=semente_thiago)

In [None]:
# Candidate values the randomized search samples from.
# "auto" is deliberately not listed under max_features: for
# DecisionTreeClassifier it was an alias of "sqrt", and scikit-learn >= 1.3
# rejects it.
tree_param = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': list(range(3, 15)),
    'max_features': ['sqrt', 'log2'],
}
# 5-fold cross-validated search, ranking candidates by ROC AUC.
# random_state seeds the sampling of candidates so the same combinations are
# drawn on every run.
clf_3_search = RandomizedSearchCV(
    clf_3,
    tree_param,
    scoring='roc_auc',
    cv=5,
    random_state=semente_thiago,
)
clf_3_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=2020,
          

In [None]:
# Gini of the randomized-search result on train and test, kept for the final comparison.
clf_3_gini_train = GINI(classificador=clf_3_search, y=y_train, X=X_train, safra='Treino')
clf_3_gini_test = GINI(classificador=clf_3_search, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.427205147815259
GINI Teste : 0.46581299319163416


### Otimização Bayesiana:

#### Define função de custo

In [None]:
# Objective function handed to Hyperopt.
# We want to maximize ROC AUC, so the value returned as the loss is the
# negated cross-validated AUC.
def custo(params):
    """Score one hyperparameter combination with 5-fold cross-validation.

    Args:
        params: keyword arguments forwarded to ``tree.DecisionTreeClassifier``.

    Returns:
        A dict with ``loss`` (the negated mean ROC AUC over the folds) and
        ``status`` (``STATUS_OK``).
    """
    # Build the model for this combination
    arvore = tree.DecisionTreeClassifier(**params, random_state=semente_thiago)
    # K-Fold evaluation on the training data
    auc_por_fold = cross_val_score(
        arvore, X_train, y_train, cv=5, scoring="roc_auc", n_jobs=-1
    )
    auc_medio = auc_por_fold.mean()
    gini_cv = (2 * auc_medio) - 1
    print(params)
    print('GINI : ' + str(gini_cv))
    # Hand the loss back to Hyperopt
    return {'loss': -auc_medio, 'status': STATUS_OK}

#### Define o range de parametros

In [None]:
# Hyperopt search space: one categorical choice per hyperparameter.
# "auto" is deliberately not offered for max_features: for
# DecisionTreeClassifier it was an alias of "sqrt", and scikit-learn >= 1.3
# rejects it.
params = {
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'splitter': hp.choice('splitter', ['best', 'random']),
    'max_depth': hp.choice('max_depth', range(3, 15)),
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
}

#### Otimização

In [None]:
# Minimize `custo` over the `params` space with the TPE algorithm, 10 evaluations.
# The result is decoded with space_eval(params, melhor_parametro) further down.
# NOTE(review): no `rstate` is passed here, so this search is not seeded like the
# rest of the notebook — consider passing one (the expected type depends on the
# installed hyperopt version; confirm before changing).
melhor_parametro = fmin(custo, params, algo=tpe.suggest,max_evals=10)

{'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'splitter': 'random'}
GINI : 0.3302264759871434
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 'auto', 'splitter': 'best'}
GINI : 0.4668559690078904
{'criterion': 'gini', 'max_depth': 9, 'max_features': 'auto', 'splitter': 'random'}
GINI : 0.38442162799986557
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 'sqrt', 'splitter': 'best'}
GINI : 0.4668559690078904
{'criterion': 'entropy', 'max_depth': 9, 'max_features': 'log2', 'splitter': 'best'}
GINI : 0.44024385092586793
{'criterion': 'gini', 'max_depth': 3, 'max_features': 'log2', 'splitter': 'best'}
GINI : 0.32650073089802745
{'criterion': 'gini', 'max_depth': 13, 'max_features': 'sqrt', 'splitter': 'random'}
GINI : 0.42799163796152806
{'criterion': 'entropy', 'max_depth': 5, 'max_features': 'log2', 'splitter': 'best'}
GINI : 0.3983199916655269
{'criterion': 'entropy', 'max_depth': 14, 'max_features': 'log2', 'splitter': 'best'}
GINI : 0.31984421722051937


In [None]:
# Show the best point found by fmin expressed as actual hyperparameter values.
space_eval(params, melhor_parametro)

{'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 'auto',
 'splitter': 'best'}

#### Ajusta o Modelo e Mede a Performance

In [None]:
# Decode the best point found by Hyperopt and fit a seeded tree with it.
melhores_hiperparametros = space_eval(params, melhor_parametro)
clf_4 = tree.DecisionTreeClassifier(
    **melhores_hiperparametros,
    random_state=semente_thiago,
)
clf_4.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=2020, splitter='best')

In [None]:
# Gini of the Bayesian-optimization model on train and test, kept for the final comparison.
clf_4_gini_train = GINI(classificador=clf_4, y=y_train, X=X_train, safra='Treino')
clf_4_gini_test = GINI(classificador=clf_4, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.5124659697277445
GINI Teste : 0.4877906990528347


# Compara as técnicas

In [None]:
# Side-by-side report of the train/test Gini obtained by each technique.
separador_externo = "#-----#-----#-----#-----#"
separador_interno = "      #-----#-----#"
resultados = [
    ("Conhecimento de Negócio", clf_1_gini_train, clf_1_gini_test),
    ("Grid Search", clf_2_gini_train, clf_2_gini_test),
    ("Random Search", clf_3_gini_train, clf_3_gini_test),
    ("Otimização Bayesiana", clf_4_gini_train, clf_4_gini_test),
]

print(separador_externo)
print("Comparação dos Modelos")
print(separador_externo)
for posicao, (tecnica, gini_treino, gini_teste) in enumerate(resultados):
    # The inner separator goes between techniques only, not before the first.
    if posicao > 0:
        print(separador_interno)
    print(tecnica)
    print("Treino : " + str(gini_treino))
    print("Teste : " + str(gini_teste))
print(separador_externo)

#-----#-----#-----#-----#
Comparação dos Modelos
#-----#-----#-----#-----#
Conhecimento de Negócio
Treino : 0.4954620595206256
Teste : 0.4973788383497122
      #-----#-----#
Grid Search
Treino : 0.561115278935971
Teste : 0.49463620881096637
      #-----#-----#
Random Search
Treino : 0.427205147815259
Teste : 0.46581299319163416
      #-----#-----#
Otimização Bayesiana
Treino : 0.5124659697277445
Teste : 0.4877906990528347
#-----#-----#-----#-----#
