<a href="https://colab.research.google.com/github/ThiagueraBarao/Otimizacao_de_Hiperparametros/blob/master/LGBM_Otimizacao_Hiperparametros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Otimização de Hiperparâmetros**

### Autor: Thiago do Carmo Nunes
### Acesse em: [thiagocarmonunes.com.br](https://www.thiagocarmonunes.com.br/)
#### Publicado em :  19-Jul-2020

# **Aplicando o conhecimento em uma LGBM**

In [None]:
#Pacotes Utilizados
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree, metrics
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_val_score,StratifiedKFold
from sklearn.metrics import roc_auc_score
from hyperopt import hp,tpe, fmin, STATUS_OK , space_eval
import lightgbm as lgbm

In [None]:
# Load the data set into a DataFrame and preview the first rows.
# NOTE(review): absolute Colab path — the CSV presumably has to be placed in
# /content/sample_data before this cell is run; confirm.
df = pd.read_csv("/content/sample_data/UCI_Credit_Card.csv")
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [None]:
# Note: no further data preparation is done here, because the only goal of the
# notebook is to demonstrate hyperparameter optimization.
# Missing values are replaced by the sentinel -99.
df = df.fillna(value=-99)

In [None]:
# Split the frame into features (X) and target (y) by position:
# X drops the first and the last column, y is the last column.
# NOTE(review): per the head() preview the first column is ID and the last is
# default.payment.next.month — confirm the CSV column order.
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

In [None]:
# Random seed reused by the splits and estimators below for reproducibility.
semente_thiago = 2020

In [None]:
# Three-way split of the data: train 60%, validation 20%, test 20%.
# Step 1: hold out 20% of the rows as the test set.
X_restante, X_test, y_restante, y_test = train_test_split(
    X, y, test_size=0.2, random_state=semente_thiago
)
# Step 2: 0.25 of the remaining 80% is 20% of the full data -> validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_restante, y_restante, test_size=0.25, random_state=semente_thiago
)

In [None]:
# Sanity check: print the shape of every split.
for prefixo, base in (
    ('Train X : ', X_train),
    ('Valid X : ', X_val),
    ('Test X : ', X_test),
    ('Train y : ', y_train),
    ('Valid y : ', y_val),
    ('Test y : ', y_test),
):
    print(prefixo + str(base.shape))

Train X : (18000, 23)
Valid X : (6000, 23)
Test X : (6000, 23)
Train y : (18000,)
Valid y : (6000,)
Test y : (6000,)


## LGBM

### Parâmetros

* class_weight
> {None, 'balanced'}
* boosting_type
> {gbdt, dart, goss}
* num_leaves
> {2:inf}
* learning_rate
> {0:1}
* subsample_for_bin
> {0:inf}
* feature_fraction
> {0:1}
* bagging_fraction
> {0:1}
* min_data_in_leaf
> {0:inf}
* lambda_l1
> {0:inf}
* lambda_l2
> {0:inf}
* min_child_weight
> {0:inf}


In [None]:
# Model template: LightGBM classifier (a boosted ensemble, not a single decision tree).
#clf = lgbm.LGBMClassifier(**params)

## Escolha dos Hiperparâmetros:

### Conhecimento de Negócio:

In [None]:
# Baseline model with hand-picked ("business knowledge") hyperparameters.
# random_state is passed so that DART's random tree dropping uses the notebook
# seed, like every other estimator in this notebook.
clf_1 = lgbm.LGBMClassifier(
                            boosting_type='dart',
                            num_leaves=8,
                            min_data_in_leaf=100,
                            learning_rate = 0.1,
                            random_state=semente_thiago)
clf_1 = clf_1.fit(X_train, y_train)

In [None]:
def GINI(classificador,y,X,safra):
  """Print and return the Gini coefficient (2 * ROC AUC - 1) of a classifier.

  Parameters
  ----------
  classificador : fitted estimator exposing ``predict_proba``.
  y : true binary labels for ``X``.
  X : feature matrix to be scored.
  safra : str
      Name of the data set; used only in the printed message.

  Returns
  -------
  float
      ``2 * AUC - 1``, with the AUC computed from the probabilities in
      column 1 of ``predict_proba``.
  """
  # Score X a single time and reuse the probabilities for the AUC.
  prob_positiva = classificador.predict_proba(X)[:,1]
  gini = 2*roc_auc_score(y, prob_positiva)-1
  print("GINI "+safra+" : "+str(gini))
  return gini

In [None]:
# Gini of the baseline model on each split. The train and test values are kept
# for the final comparison; the validation value is only printed.
clf_1_gini_train=GINI(clf_1,y_train,X_train,'Treino')
GINI(clf_1,y_val,X_val,'Validação')
clf_1_gini_test=GINI(clf_1,y_test,X_test,'Teste')

GINI Treino : 0.5971253021739229
GINI Validação : 0.543257395769746
GINI Teste : 0.5804962243797194


### As demais técnicas utilizam K-Fold; portanto, não é mais necessário separar a base de treino em Treino e Validação

In [None]:
# Redo the split as train (80%) / test (20%) only. The call uses the same
# test_size and seed as the first split above, so the validation rows simply
# return to the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=semente_thiago)

In [None]:
# Sanity check: print the shape of the train and test splits.
for prefixo, base in (
    ('Train X : ', X_train),
    ('Test X : ', X_test),
    ('Train y : ', y_train),
    ('Test y : ', y_test),
):
    print(prefixo + str(base.shape))

Train X : (24000, 23)
Test X : (6000, 23)
Train y : (24000,)
Test y : (6000,)


### Grid Search:

In [None]:
# Base estimator handed to the grid search, created with the notebook seed.
clf_2 = lgbm.LGBMClassifier(random_state=semente_thiago)

In [None]:
# Candidate values tried exhaustively: 2 x 3 x 4 = 24 combinations.
tree_param = dict(
    learning_rate=[0.005, 0.01],
    n_estimators=[8, 16, 24],
    num_leaves=[6, 8, 12, 16],
)
# 5-fold cross-validation, scoring each candidate by ROC AUC.
clf_2_search = GridSearchCV(
    estimator=clf_2,
    param_grid=tree_param,
    scoring='roc_auc',
    cv=5,
)
clf_2_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LGBMClassifier(random_state=2020),
             param_grid={'learning_rate': [0.005, 0.01],
                         'n_estimators': [8, 16, 24],
                         'num_leaves': [6, 8, 12, 16]},
             scoring='roc_auc')

In [None]:
# Gini of the grid-search result on train and test (GINI calls predict_proba
# on the fitted search object); both values feed the final comparison.
clf_2_gini_train=GINI(clf_2_search,y_train,X_train,'Treino')
clf_2_gini_test=GINI(clf_2_search,y_test,X_test,'Teste')

GINI Treino : 0.558199564084721
GINI Teste : 0.557025340714661


### Random Search:

In [None]:
# Base estimator handed to the random search, created with the notebook seed.
clf_3 = lgbm.LGBMClassifier(random_state=semente_thiago)

In [None]:
# Same candidate values as the grid search; RandomizedSearchCV only evaluates
# a random sample of the combinations (n_iter defaults to 10).
tree_param = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16,24],
    'num_leaves': [6,8,12,16],
    }
# random_state fixes which combinations are sampled, so re-running the cell
# reproduces the same search.
clf_3_search = RandomizedSearchCV(clf_3, tree_param,scoring='roc_auc', cv=5,
                                  random_state=semente_thiago)
clf_3_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=LGBMClassifier(random_state=2020),
                   param_distributions={'learning_rate': [0.005, 0.01],
                                        'n_estimators': [8, 16, 24],
                                        'num_leaves': [6, 8, 12, 16]},
                   scoring='roc_auc')

In [None]:
# Gini of the random-search result on train and test (GINI calls predict_proba
# on the fitted search object); both values feed the final comparison.
clf_3_gini_train=GINI(clf_3_search,y_train,X_train,'Treino')
clf_3_gini_test=GINI(clf_3_search,y_test,X_test,'Teste')

GINI Treino : 0.558199564084721
GINI Teste : 0.557025340714661


### Otimização Bayesiana:

In [None]:
# Objective function handed to hyperopt. fmin MINIMIZES the reported loss, so
# the cross-validated ROC AUC (which we want to maximize) is negated.
def objective(params):
    """Evaluate one sampled hyperparameter set and return its hyperopt loss.

    Parameters
    ----------
    params : dict
        Sample drawn from the search space; must contain the keys
        ``'num_leaves'`` and ``'colsample_bytree'``.

    Returns
    -------
    dict
        ``{'loss': -mean_auc, 'status': STATUS_OK}``, where ``mean_auc`` is the
        mean ROC AUC of ``cross_val_score`` over ``StratifiedKFold()`` on
        ``X_train`` / ``y_train``.
    """
    params = {
        # The sampled value arrives as a float; the leaf count must be an int.
        'num_leaves': int(params['num_leaves']),
        # Keep a float rounded to 3 decimals rather than a formatted string.
        'colsample_bytree': round(params['colsample_bytree'], 3),
    }
    # Model evaluated for this trial.
    clf = lgbm.LGBMClassifier(
        n_estimators=30,
        learning_rate=0.01,
        **params
    )
    # K-fold cross-validation; the mean ROC AUC is the quantity being optimized.
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    # The value is an AUC, so it is labelled as such (Gini would be 2*AUC - 1).
    print("AUC {:.3f} params {}".format(score, params))
    return {'loss': -score, 'status': STATUS_OK}

# Search space sampled by hyperopt:
# - num_leaves: quantized uniform between 8 and 128 with step 2
# - colsample_bytree: uniform between 0.3 and 1.0
space = {
    'num_leaves': hp.quniform('num_leaves', 8, 128, 2),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
}

# Run 10 evaluations of `objective` with the TPE algorithm; fmin minimizes the
# loss reported by `objective` and returns the selected trial in `best`.
# NOTE(review): no rstate is passed, so the sampled trials may differ between
# runs — confirm whether a seed is wanted here.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

Gini 0.777 params {'num_leaves': 30, 'colsample_bytree': '0.746'}
Gini 0.770 params {'num_leaves': 12, 'colsample_bytree': '0.477'}
Gini 0.774 params {'num_leaves': 34, 'colsample_bytree': '0.409'}
Gini 0.774 params {'num_leaves': 42, 'colsample_bytree': '0.426'}
Gini 0.773 params {'num_leaves': 122, 'colsample_bytree': '0.969'}
Gini 0.777 params {'num_leaves': 40, 'colsample_bytree': '0.840'}
Gini 0.778 params {'num_leaves': 62, 'colsample_bytree': '0.673'}
Gini 0.777 params {'num_leaves': 84, 'colsample_bytree': '0.588'}
Gini 0.777 params {'num_leaves': 60, 'colsample_bytree': '0.616'}
Gini 0.777 params {'num_leaves': 100, 'colsample_bytree': '0.767'}
100%|██████████| 10/10 [00:14<00:00,  1.41s/it, best loss: 0.7701586709141486]


In [None]:
# space_eval expects the search space first and the fmin assignment second.
space_eval(space, best)

{'colsample_bytree': 0.47680832086403413, 'num_leaves': 12.0}

#### Fita o Modelo e Mede a Performance

In [None]:
# Build the final model from the optimizer's result instead of values pasted
# from an earlier run, so the cell stays correct whenever the search is re-run.
# NOTE(review): n_estimators / learning_rate are left at the LightGBM defaults
# here, while the search evaluated n_estimators=30, learning_rate=0.01 —
# confirm this difference is intended.
melhores_params = space_eval(space, best)
clf_4 = lgbm.LGBMClassifier(
    colsample_bytree=melhores_params['colsample_bytree'],
    # The tuned value is a float (e.g. 12.0); the leaf count must be an int.
    num_leaves=int(melhores_params['num_leaves']),
    random_state=semente_thiago)
clf_4.fit(X_train, y_train)

LGBMClassifier(colsample_bytree=0.47680832086403413, num_leaves=12,
               random_state=2020)

In [None]:
# Gini of the Bayesian-optimization model on train and test; both values feed
# the final comparison.
clf_4_gini_train=GINI(clf_4,y_train,X_train,'Treino')
clf_4_gini_test=GINI(clf_4,y_test,X_test,'Teste')

GINI Treino : 0.6382610333476739
GINI Teste : 0.5829309561348395


# Compara as técnicas

In [None]:
# Side-by-side report of the train/test Gini obtained by each technique.
borda = "#-----#-----#-----#-----#"
separador = "      #-----#-----#"
resultados = [
    ("Conhecimento de Negócio", clf_1_gini_train, clf_1_gini_test),
    ("Grid Search", clf_2_gini_train, clf_2_gini_test),
    ("Random Search", clf_3_gini_train, clf_3_gini_test),
    ("Otimização Bayesiana", clf_4_gini_train, clf_4_gini_test),
]

print(borda)
print("Comparação dos Modelos")
print(borda)
for posicao, (tecnica, gini_treino, gini_teste) in enumerate(resultados):
    # A separator goes between techniques, never before the first one.
    if posicao > 0:
        print(separador)
    print(tecnica)
    print("Treino : "+str(gini_treino))
    print("Teste : "+str(gini_teste))
print(borda)

#-----#-----#-----#-----#
Comparação dos Modelos
#-----#-----#-----#-----#
Conhecimento de Negócio
Treino : 0.5971253021739229
Teste : 0.5804962243797194
      #-----#-----#
Grid Search
Treino : 0.558199564084721
Teste : 0.557025340714661
      #-----#-----#
Random Search
Treino : 0.558199564084721
Teste : 0.557025340714661
      #-----#-----#
Otimização Bayesiana
Treino : 0.6382610333476739
Teste : 0.5829309561348395
#-----#-----#-----#-----#
