<a href="https://colab.research.google.com/github/ThiagueraBarao/Otimizacao_de_Hiperparametros/blob/master/XGB_Otimizacao_Hiperparametros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Otimização de Hiperparâmetros**

### Autor: Thiago do Carmo Nunes
### Acesse em: [thiagocarmonunes.com.br](https://www.thiagocarmonunes.com.br/)
#### Publicado em: 19-Jul-2020

# **Aplicando o conhecimento em uma XGB**

In [None]:
#Pacotes Utilizados
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree, metrics
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_val_score,StratifiedKFold
from sklearn.metrics import roc_auc_score
from hyperopt import hp,tpe, fmin, STATUS_OK , space_eval
import xgboost as xgb

In [None]:
# Load the dataset (path points at the Colab sample_data folder)
csv_path = "/content/sample_data/UCI_Credit_Card.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [None]:
# Note: no further data preparation is done on purpose; the goal here is only
# to demonstrate hyperparameter optimization.
# Replace missing values with the sentinel -99
df = df.fillna(-99)

In [None]:
# Split the frame into features (X) and target (y):
# the first column is left out of the features and the last column is the target
feature_cols = df.columns[1:-1]
target_col = df.columns[-1]
X = df[feature_cols]
y = df[target_col]

In [None]:
# Do not forget the seed: fixed value reused below as random_state for the splits and models
semente_thiago = 2020

In [None]:
# Split into train / validation / test sets
#   Train:      60%
#   Validation: 20%
#   Test:       20%
# Step 1: hold out 20% of the full data as the test set
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=semente_thiago
)
# Step 2: take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2 of the total)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=semente_thiago
)

In [None]:
# Check the size of each split
split_shapes = [
    ('Train X', X_train),
    ('Valid X', X_val),
    ('Test X', X_test),
    ('Train y', y_train),
    ('Valid y', y_val),
    ('Test y', y_test),
]
for split_label, split_data in split_shapes:
    print(split_label+' : '+str(split_data.shape))

Train X : (18000, 23)
Valid X : (6000, 23)
Test X : (6000, 23)
Train y : (18000,)
Valid y : (6000,)
Test y : (6000,)


## XGB

In [None]:
#Model choice - XGBoost classifier (gradient-boosted trees, not a single decision tree)
#clf = xgb.XGBClassifier(**params)

## Escolha dos Hiperparâmetros:

### Conhecimento de Negócio:

In [None]:
# Fit the baseline model with hand-picked ("business knowledge") hyperparameters
baseline_params = dict(n_estimators=250, learning_rate=0.05, n_jobs=4)
clf_1 = xgb.XGBClassifier(**baseline_params).fit(X_train, y_train)

In [None]:
def GINI(classificador,y,X,safra):
  """Print and return the Gini coefficient (2 * ROC AUC - 1) of a fitted classifier.

  Parameters:
    classificador: fitted object exposing ``predict_proba(X)``.
    y: true labels for the rows of ``X``.
    X: feature matrix to score.
    safra: label of the dataset being evaluated (e.g. 'Treino'), used only in the printed line.

  Returns:
    The Gini coefficient as computed from ``roc_auc_score``.
  """
  # Score the data once and reuse it; column index 1 is the probability of the
  # second class in the classifier's class ordering (the positive class for a 0/1 target).
  prob_positive = classificador.predict_proba(X)[:, 1]
  gini = 2 * roc_auc_score(y, prob_positive) - 1
  print("GINI "+safra+" : "+str(gini))
  return gini

In [None]:
# Gini of the baseline model on each split (the validation value is only printed, not stored)
clf_1_gini_train = GINI(classificador=clf_1, y=y_train, X=X_train, safra='Treino')
GINI(classificador=clf_1, y=y_val, X=X_val, safra='Validação')
clf_1_gini_test = GINI(classificador=clf_1, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.608249888455352
GINI Validação : 0.541140859626587
GINI Teste : 0.5829130955344548


### As demais técnicas vão utilizar K-Fold, portanto não é mais necessário separar a base de treino em Treino e Validação

In [None]:
# Re-split from the full data: 80% train (cross-validated below) and 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=semente_thiago,
)

In [None]:
# Check the size of each split
resplit_shapes = [
    ('Train X', X_train),
    ('Test X', X_test),
    ('Train y', y_train),
    ('Test y', y_test),
]
for split_label, split_data in resplit_shapes:
    print(split_label+' : '+str(split_data.shape))

Train X : (24000, 23)
Test X : (6000, 23)
Train y : (24000,)
Test y : (6000,)


### Grid Search:

In [None]:
# Base estimator for the grid search; only the seed is set here, the searched
# hyperparameters are supplied by the parameter grid
clf_2 = xgb.XGBClassifier(random_state=semente_thiago)

In [None]:
# Candidate values for the exhaustive search:
# 3 x 5 x 3 x 3 x 3 = 405 combinations, each scored by ROC AUC with cv=5
tree_param = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)
clf_2_search = GridSearchCV(
    estimator=clf_2,
    param_grid=tree_param,
    scoring='roc_auc',
    cv=5,
)
clf_2_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=2020, reg_alpha=0,
                                     reg_lambda=1, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid='deprecated', n_jobs=None, param_grid={'max_depth': [3, 4, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='ro

In [None]:
# Gini of the grid-search result on train and test
clf_2_gini_train = GINI(classificador=clf_2_search, y=y_train, X=X_train, safra='Treino')
clf_2_gini_test = GINI(classificador=clf_2_search, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.6372592967185284
GINI Teste : 0.5806814688368087


### Random Search:

In [None]:
# Base estimator for the random search; only the seed is set here, the searched
# hyperparameters are supplied by the parameter distributions
clf_3 = xgb.XGBClassifier(random_state=semente_thiago)

In [None]:
# Same candidate values as the grid search; RandomizedSearchCV samples a subset
# of the combinations instead of trying all of them.
tree_param = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
# random_state fixes which combinations are sampled, so the search is reproducible
# across runs. n_iter is not set, so scikit-learn's default number of candidates applies.
clf_3_search = RandomizedSearchCV(clf_3, tree_param, scoring='roc_auc', cv=5,
                                  random_state=semente_thiago)
clf_3_search.fit(X_train, y_train)



RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=2020, reg_alpha=0,
                                           reg_lambda=1, scale_pos_weight=1,
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid=

In [None]:
# Gini of the random-search result on train and test
clf_3_gini_train = GINI(classificador=clf_3_search, y=y_train, X=X_train, safra='Treino')
clf_3_gini_test = GINI(classificador=clf_3_search, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.6023504182143053
GINI Teste : 0.5819976212209221


### Otimização Bayesiana:

In [None]:
# Objective function handed to hyperopt's fmin.
# fmin always MINIMIZES the returned loss, while we want to MAXIMIZE the
# cross-validated ROC AUC, so the loss is the negated AUC.
def objective(params):
    """Score one hyperparameter candidate with cross-validated ROC AUC.

    Parameters:
        params: dict sampled from the search space, with the keys
            'max_depth', 'gamma' and 'colsample_bytree'.

    Returns:
        dict with 'loss' (the negated mean ROC AUC, so that minimizing the loss
        maximizes the AUC) and 'status' (hyperopt's STATUS_OK).
    """
    params = {
        # hp.quniform yields floats (e.g. 8.0); the tree depth must be an integer
        'max_depth': int(params['max_depth']),
        # Pass numbers, not formatted strings, so the model is scored on exactly
        # the values that fmin later reports as the best candidate
        'gamma': float(params['gamma']),
        'colsample_bytree': float(params['colsample_bytree']),
    }

    clf = xgb.XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        n_jobs=4,
        random_state=semente_thiago,
        **params
    )

    # Uses the notebook-level X_train / y_train (the 80% training split)
    auc = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    # Report the Gini (2 * AUC - 1) so the printed label matches the printed value
    gini = 2 * auc - 1
    print("Gini {:.3f} params {}".format(gini, params))
    return {'loss': -auc, 'status': STATUS_OK}

# Search space: max_depth in steps of 1 between 2 and 8 (sampled as a float),
# colsample_bytree uniform in [0.3, 1.0], gamma uniform in [0.0, 0.5]
space = dict(
    max_depth=hp.quniform('max_depth', 2, 8, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    gamma=hp.uniform('gamma', 0.0, 0.5),
)

# Run 10 evaluations of the objective with the TPE algorithm
best = fmin(objective, space, algo=tpe.suggest, max_evals=10)

Gini 0.775 params {'max_depth': 8, 'gamma': '0.208', 'colsample_bytree': '0.772'}
Gini 0.776 params {'max_depth': 6, 'gamma': '0.344', 'colsample_bytree': '0.989'}
Gini 0.777 params {'max_depth': 8, 'gamma': '0.050', 'colsample_bytree': '0.436'}
Gini 0.778 params {'max_depth': 7, 'gamma': '0.052', 'colsample_bytree': '0.583'}
Gini 0.778 params {'max_depth': 6, 'gamma': '0.326', 'colsample_bytree': '0.759'}
Gini 0.779 params {'max_depth': 7, 'gamma': '0.237', 'colsample_bytree': '0.302'}
Gini 0.778 params {'max_depth': 8, 'gamma': '0.193', 'colsample_bytree': '0.447'}
Gini 0.780 params {'max_depth': 4, 'gamma': '0.256', 'colsample_bytree': '0.838'}
Gini 0.777 params {'max_depth': 6, 'gamma': '0.208', 'colsample_bytree': '0.885'}
Gini 0.780 params {'max_depth': 4, 'gamma': '0.494', 'colsample_bytree': '0.969'}
100%|██████████| 10/10 [06:01<00:00, 36.11s/it, best loss: 0.7753853183441054]


In [None]:
# space_eval takes the search space first and the fmin result second;
# it maps the raw assignment returned by fmin back to actual parameter values
space_eval(space, best)

{'colsample_bytree': 0.7716630112527664,
 'gamma': 0.20818387837017732,
 'max_depth': 8.0}

#### Ajusta o Modelo e Mede a Performance

In [None]:
# Refit on the training set with the hyperparameters selected by the Bayesian search.
# The values are read from the fmin result instead of being copied by hand, which
# also avoids the misspelled keyword (gamm) that left gamma at its default.
best_params = space_eval(space, best)
clf_4 = xgb.XGBClassifier(
    # Same fixed settings used inside the objective, so the final model is the
    # configuration that was actually tuned
    n_estimators=250,
    learning_rate=0.05,
    n_jobs=4,
    # hp.quniform yields a float (e.g. 8.0); the tree depth must be an integer
    max_depth=int(best_params['max_depth']),
    gamma=best_params['gamma'],
    colsample_bytree=best_params['colsample_bytree'],
    random_state=semente_thiago,
)
clf_4.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7716630112527664,
              gamm=0.20818387837017732, gamma=0, learning_rate=0.1,
              max_delta_step=0, max_depth=8, min_child_weight=1, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=2020, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
              subsample=1, verbosity=1)

In [None]:
# Gini of the Bayesian-optimization model on train and test
clf_4_gini_train = GINI(classificador=clf_4, y=y_train, X=X_train, safra='Treino')
clf_4_gini_test = GINI(classificador=clf_4, y=y_test, X=X_test, safra='Teste')

GINI Treino : 0.8443987032123674
GINI Teste : 0.5682046540298966


# Compara as técnicas

In [None]:
# Side-by-side report of train/test Gini for the four tuning strategies
border = "#-----#-----#-----#-----#"
divider = "      #-----#-----#"
results = [
    ("Conhecimento de Negócio", clf_1_gini_train, clf_1_gini_test),
    ("Grid Search", clf_2_gini_train, clf_2_gini_test),
    ("Random Search", clf_3_gini_train, clf_3_gini_test),
    ("Otimização Bayesiana", clf_4_gini_train, clf_4_gini_test),
]

print(border)
print("Comparação dos Modelos")
print(border)
for position, (technique, gini_train, gini_test) in enumerate(results):
    # The divider goes between techniques only, not before the first one
    if position > 0:
        print(divider)
    print(technique)
    print("Treino : "+str(gini_train))
    print("Teste : "+str(gini_test))
print(border)

#-----#-----#-----#-----#
Comparação dos Modelos
#-----#-----#-----#-----#
Conhecimento de Negócio
Treino : 0.608249888455352
Teste : 0.5829130955344548
      #-----#-----#
Grid Search
Treino : 0.6372592967185284
Teste : 0.5806814688368087
      #-----#-----#
Random Search
Treino : 0.6023504182143053
Teste : 0.5819976212209221
      #-----#-----#
Otimização Bayesiana
Treino : 0.8443987032123674
Teste : 0.5682046540298966
#-----#-----#-----#-----#
