Este notebook tem por finalidade realizar a coleta de hiperparâmetros de forma aleatória para produzir uma base de estudos, a fim de definir quais hiperparâmetros deverão ser utilizados de forma eficiente na otimização.






In [None]:
#bibliotecas padrão
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import files

#Importando modelos
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor 

#Importando bibliotecas de otimização
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
#Recolhendo dados do facebook e preparando dataset

import pandas_datareader.data as web
import datetime as dt

# Study period for the quotes requested from the 'yahoo' data source.
start = dt.datetime(2019, 1, 1)
end = dt.datetime(2020, 6, 1)

# Fetch the "FB" quotes, turn the index into a regular column, drop every
# column listed below and rename 'Close' to 'Close 0' (the lag-0 column that
# the windowing step builds on).
# NOTE(review): presumably only the closing price remains after the drop --
# confirm against the columns the reader actually returns.
df = (
    web.DataReader("FB", 'yahoo', start, end)
    .reset_index()
    .drop(columns=['Open', 'Date', 'High', 'Low', 'Volume', 'Adj Close'])
    .rename(columns={'Close': 'Close 0'})
)

def window(df, w):
    """Add lagged copies of the 'Close 0' column to *df*, in place.

    For every lag ``i`` in ``1 .. w-1`` a column ``'Close i'`` is created as
    the previous lag column (``'Close i-1'``) shifted down by one row, so
    row ``t`` of ``'Close i'`` holds the value ``'Close 0'`` had at row
    ``t - i``. The first ``i`` rows of each new column are NaN.

    Args:
        df: DataFrame that already contains a 'Close 0' column. It is
            modified in place.
        w: Window width; ``w - 1`` lag columns are added (none if ``w <= 1``).

    Returns:
        The same DataFrame object that was passed in.
    """
    previous = 'Close 0'
    for lag in range(1, w):
        current = f'Close {lag}'
        df[current] = df[previous].shift(1)
        previous = current
    return df
        
# Build the lag columns 'Close 1' .. 'Close 4', promote the lag-0 column to
# the regression target and discard the rows left incomplete by the shifts.
df = window(df, 5).rename(columns={'Close 0': 'Target'}).dropna()

# Split features / target into train and test sets
X = df[['Close 1', 'Close 2', 'Close 3', 'Close 4']]
y = df['Target'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

In [None]:
# Step sizes shared by the boosting search spaces: used as ``learning_rate``
# for AdaBoost, GradientBoosting and XGBoost, and as ``gamma`` for XGBoost.
_BOOSTING_STEPS = (1e-3, 5e-3, 9e-3, 1e-1, 5e-1, 9e-1, 1e-5, 5e-5, 9e-5)


def _svm_space():
    """Search space for the 'SVM' model: one sub-grid per kernel."""
    C = (np.arange(10, 100, 5) / 100).tolist()
    # 0.2 .. 0.8 followed by 0.002 .. 0.008, as plain Python floats so the
    # grid has the same element type as every other list in this module.
    epsilon = [j / 10 ** i for i in range(1, 5, 2) for j in range(2, 10, 2)]
    coef0 = (np.arange(1, 10) / 100).tolist()
    gamma = ['scale', 'auto']
    return [
        {'kernel': ['linear'], 'C': C, 'epsilon': epsilon},
        {'kernel': ['rbf'], 'gamma': gamma, 'C': C, 'epsilon': epsilon},
        # coef0 is only searched together with the sigmoid kernel.
        {'kernel': ['sigmoid'], 'gamma': gamma, 'C': C, 'epsilon': epsilon,
         'coef0': coef0},
    ]


def _knn_space():
    """Search space for the 'KNN' model."""
    n_neighbors = np.arange(1, 19, 2).tolist()
    weights = ['uniform', 'distance']
    p = [1, 2]
    return [
        {'n_neighbors': n_neighbors, 'weights': weights,
         'algorithm': ['auto', 'brute'], 'p': p},
        # leaf_size is only searched for the tree-based algorithms.
        {'n_neighbors': n_neighbors, 'weights': weights,
         'algorithm': ['ball_tree', 'kd_tree'], 'p': p,
         'leaf_size': np.arange(1, 150, 5).tolist()},
    ]


def _decision_tree_space():
    """Search space for the 'DecisionTree' model."""
    return [{
        'min_samples_split': np.arange(2, 21, 3).tolist(),
        'max_features': (np.arange(2, 10) / 10).tolist(),
        'max_depth': np.arange(10, 100, 10).tolist(),
        'criterion': ['mse', 'friedman_mse', 'mae'],
        'splitter': ['best', 'random'],
        'min_samples_leaf': (np.arange(1, 5) / 10).tolist(),
    }]


def _random_forest_space():
    """Search space for the 'RandomForest' model."""
    return [{
        'n_estimators': np.arange(50, 600, 75).tolist(),
        'min_samples_split': [2, 8, 14, 20],
        'max_features': [0.3, 0.5, 0.7],
        'max_depth': [10, 30, 50, 70, 90],
        'min_samples_leaf': [0.1, 0.3, 0.5],
        'max_samples': [0.1, 0.3, 0.5, 0.7, 0.9],
        'criterion': ['mse', 'mae'],
        'oob_score': [True, False],
    }]


def _ada_boost_space():
    """Search space for the 'AdaBoost' model (a single dict, not a list)."""
    return {
        'n_estimators': np.arange(50, 500, 50).tolist(),
        'learning_rate': list(_BOOSTING_STEPS),
        'loss': ['linear', 'exponential', 'square'],
    }


def _gradient_boosting_space():
    """Search space for the 'GradientBoosting' model."""
    shared = {
        'n_estimators': np.arange(50, 500, 50).tolist(),
        'learning_rate': list(_BOOSTING_STEPS),
        'min_samples_split': [2, 8, 14, 20],
        'criterion': ['mse', 'friedman_mse', 'mae'],
        'min_samples_leaf': [0.1, 0.3, 0.5],
        'max_depth': [10, 30, 50, 70, 90],
        'subsample': (np.arange(2, 10, 2) / 10).tolist(),
        'max_features': [0.3, 0.5, 0.7],
    }
    return [
        {**shared, 'loss': ['ls', 'lad']},
        # alpha is only searched together with the huber / quantile losses.
        {**shared, 'loss': ['huber', 'quantile'],
         'alpha': (np.arange(5, 100, 7) / 100).tolist()},
    ]


def _xgboost_space():
    """Search space for the 'XGBoost' model (a single dict, not a list)."""
    fractions = (np.arange(2, 10, 2) / 10).tolist()
    return {
        'n_estimators': np.arange(50, 500, 50).tolist(),
        'learning_rate': list(_BOOSTING_STEPS),
        'subsample': fractions,
        'gamma': list(_BOOSTING_STEPS),
        'colsample_bytree': list(fractions),
        'booster': ['gbtree', 'gblinear', 'dart'],
    }


# Model name -> function building a fresh search space for that model.
_SPACE_BUILDERS = {
    'SVM': _svm_space,
    'KNN': _knn_space,
    'DecisionTree': _decision_tree_space,
    'RandomForest': _random_forest_space,
    'AdaBoost': _ada_boost_space,
    'GradientBoosting': _gradient_boosting_space,
    'XGBoost': _xgboost_space,
}


def hp(modelo):
    """Return the hyperparameter search space for the model named *modelo*.

    Args:
        modelo: One of 'SVM', 'KNN', 'DecisionTree', 'RandomForest',
            'AdaBoost', 'GradientBoosting' or 'XGBoost'.

    Returns:
        A dict mapping parameter names to lists of candidate values
        ('AdaBoost', 'XGBoost'), or a list of such dicts when some
        parameters are only searched together with particular values of
        another one (all other models). An unrecognised name yields an
        empty dict. A new structure is built on every call.

    NOTE(review): the 'mse' / 'mae' criterion names and the 'ls' / 'lad'
    loss names look like the spellings used by older scikit-learn releases;
    confirm they are accepted by the installed version.
    """
    builder = _SPACE_BUILDERS.get(modelo)
    if builder is None:
        return {}
    return builder()
        

# Randomized hyperparameter search
def HP_Search(modelo, n_iter=1500):
    """Run a randomized hyperparameter search for the model named *modelo*.

    The search space comes from ``hp(modelo)`` and the search is fitted on
    the module-level ``X_train`` / ``y_train``. The search space, a short
    model label and the elapsed wall-clock time in seconds are printed.

    Args:
        modelo: One of 'SVM', 'KNN', 'DecisionTree', 'RandomForest',
            'AdaBoost', 'GradientBoosting' or 'XGBoost'.
        n_iter: Number of parameter settings requested from
            ``RandomizedSearchCV``. Defaults to 1500, the value that used
            to be hard-coded.

    Returns:
        The fitted ``RandomizedSearchCV`` object.

    Raises:
        ValueError: If *modelo* is not one of the supported names. Without
            this check an unknown name only failed later, with an
            ``UnboundLocalError`` at the ``fit`` call.
    """
    # model name -> (label printed to the log, estimator class)
    catalogue = {
        'SVM': ('SVM', SVR),
        'KNN': ('KNN', KNR),
        'DecisionTree': ('DT', DecisionTreeRegressor),
        'RandomForest': ('rf', RandomForestRegressor),
        'AdaBoost': ('adb', AdaBoostRegressor),
        'GradientBoosting': ('gdb', GradientBoostingRegressor),
        'XGBoost': ('xgb', XGBRegressor),
    }
    if modelo not in catalogue:
        raise ValueError(
            "Unknown model %r; expected one of %s"
            % (modelo, ", ".join(catalogue))
        )
    label, estimator_cls = catalogue[modelo]

    ini = time.time()

    # Fetch the hyperparameter search space for this model
    hip_space = hp(modelo)
    print(hip_space)
    print(label)

    model = RandomizedSearchCV(estimator_cls(), hip_space, n_iter=n_iter, verbose=3)
    model.fit(X_train, y_train)

    elapsed = time.time() - ini
    print(elapsed)
    return model
    
   

In [None]:
# Run the search for every model, write its cv_results_ table to
# "<model>_HP.csv" and pass that file to files.download.
modelos = [
    'SVM',
    'KNN',
    'DecisionTree',
    'RandomForest',
    'AdaBoost',
    'GradientBoosting',
    'XGBoost',
]

for m in modelos:
    mod = HP_Search(m)
    nome = f"{m}_HP.csv"
    df_md = pd.DataFrame(mod.cv_results_)
    df_md.to_csv(nome)
    files.download(nome)
    print(f"{nome} baixado com sucesso!!!")