# Hyper Opt Notebook
Esta notebook busca hiperparámetros del modelo LightGBM y genera un .csv con los valores de las iteraciones. Esto se hace no solo para probar la mejor configuración (posible overfitting), sino también otras configuraciones que arrojen accuracies similares.

## Import modulos necesarios

In [None]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Open a TensorFlow v1-compat interactive session with the GPU `allow_growth` option enabled.
# NOTE(review): nothing else in the visible cells uses TensorFlow -- confirm this setup is still needed.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

import pandas as pd
import numpy as np
import datetime
from pandas_summary import DataFrameSummary

## Carga de los datos procesados con las notebooks entregadas

In [19]:
# Read the preprocessed train and test tables from their Feather files.
df, df_test = (
    pd.read_feather(feather_path)
    for feather_path in ('train_normalized_data.fth', 'test_normalized_data.fth')
)

In [20]:
# Columns listed as categorical; also passed to LightGBM as `categorical_feature`.
cat_vars = [
    'Store',
    'DayOfWeek',
    'Year',
    'Month',
    'Day',
    'StateHoliday',
    'CompetitionMonthsOpen',
    'Promo2Weeks',
    'StoreType',
    'Assortment',
    'PromoInterval',
    'CompetitionOpenSinceYear',
    'Promo2SinceYear',
    'State',
    'Week',
    'Events',
    'Promo_fw',
    'Promo_bw',
    'StateHoliday_bool_fw',
    'StateHoliday_bool_bw',
    'SchoolHoliday_fw',
    'SchoolHoliday_bw',
]

# Remaining feature columns, listed as continuous.
contin_vars = [
    'CompetitionDistance',
    'Max_TemperatureC',
    'Mean_TemperatureC',
    'Min_TemperatureC',
    'Precipitationmm',
    'Max_Humidity',
    'Mean_Humidity',
    'Min_Humidity',
    'Max_Wind_SpeedKm_h',
    'Mean_Wind_SpeedKm_h',
    'CloudCover',
    'trend',
    'trend_DE',
    'AfterStateHoliday_bool',
    'BeforeStateHoliday_bool',
    'Promo',
    'SchoolHoliday',
    'StateHoliday_bool',
]

In [21]:
# Time-based hold-out: rows dated before 2015-07-01 go to training, the rest to validation.
split_date = datetime.datetime(2015, 7, 1)
df_train = df[df.Date < split_date]
df_val = df[df.Date >= split_date]
# NOTE: the ratio printed after "porcentaje" is the training fraction
# (len(df_train) / total), not the validation fraction.
print(f'Cantidad en val: {len(df_val)}, porcentaje: {len(df_train)/(len(df_train) + len(df_val))}')

# Target column(s) and the feature matrices shared by train, validation and test.
y_out_columns = ['Sales']
feature_cols = cat_vars + contin_vars
X_train = df_train[feature_cols]
X_val = df_val[feature_cols]
X_test = df_test[feature_cols]

Cantidad en val: 30188, porcentaje: 0.9642465458145908


In [22]:
X_train.shape, X_val.shape

((814150, 40), (30188, 40))

In [24]:
# Normalize the target and decide whether to train on the log-scaled output
log_output = True

if log_output:
    # Log scale: divide log(target) by its maximum over the full `df`.
    # The reduction is done on the raw ndarray because `np.max(DataFrame)`
    # returns a Series on older pandas but a scalar (no `.values`) on
    # pandas >= 2.0; this form yields a length-1 array on every version.
    max_log_y = np.log(df[y_out_columns].values).max(axis=0)
    y_train = np.log(df_train[y_out_columns].values)/max_log_y
    y_val = np.log(df_val[y_out_columns].values)/max_log_y
else:
    # Standardization using the training-set mean and standard deviation
    y_mean = df_train[y_out_columns].mean().values
    y_std = df_train[y_out_columns].std().values
    y_train = (df_train[y_out_columns].values - y_mean)/y_std
    y_val = (df_val[y_out_columns].values - y_mean)/y_std

## Hyper Opt Model

In [4]:
from sklearn.model_selection import cross_val_score
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt import Trials

ModuleNotFoundError: No module named 'hyperopt'

In [6]:
# Definimos una función que nos devuelva los y_pred desnormalizados (lo vamos a usar para calcular el score a optimizar)

In [7]:
def getValPred(model):
    """Predict on ``X_val`` and map the result back to the original target scale.

    Reads the module-level ``log_output`` flag to pick the inverse transform:
    ``exp(pred * max_log_y)`` when it is truthy, ``pred * y_std + y_mean``
    otherwise.

    Args:
        model: Fitted estimator exposing ``predict(X, verbose=...)``.

    Returns:
        The de-normalized predictions for the validation features.
    """
    scaled_pred = model.predict(X_val, verbose=1)
    if not log_output:
        # Undo the mean/std standardization
        return scaled_pred*y_std + y_mean
    # Undo the log scaling
    return np.exp(scaled_pred*max_log_y)

In [24]:
# Normalize the target and decide whether to train on the log-scaled output
log_output = True

if log_output:
    # Log scale: divide log(target) by its maximum over the full `df`.
    # The reduction is done on the raw ndarray because `np.max(DataFrame)`
    # returns a Series on older pandas but a scalar (no `.values`) on
    # pandas >= 2.0; this form yields a length-1 array on every version.
    max_log_y = np.log(df[y_out_columns].values).max(axis=0)
    y_train = np.log(df_train[y_out_columns].values)/max_log_y
    y_val = np.log(df_val[y_out_columns].values)/max_log_y
else:
    # Standardization using the training-set mean and standard deviation
    y_mean = df_train[y_out_columns].mean().values
    y_std = df_train[y_out_columns].std().values
    y_train = (df_train[y_out_columns].values - y_mean)/y_std
    y_val = (df_val[y_out_columns].values - y_mean)/y_std

In [None]:
# Objective function minimized by hyperopt
def objective(params):
    """Train a LightGBM regressor for one hyperopt trial and return its loss.

    Args:
        params: Mapping sampled from the search space with the keys
            ``reg_lambda``, ``num_leaves``, ``max_depth``, ``n_estimators``
            and ``learning_rate``.

    Returns:
        The root mean squared percentage error of the de-normalized
        validation predictions against ``df_val['Sales']`` (lower is better).
    """
    # NOTE(review): LGBMRegressor is not imported in any visible cell --
    # confirm `from lightgbm import LGBMRegressor` runs before this one.
    params = {
        # The integer-valued parameters are cast to int before being passed on.
        'reg_lambda': int(params['reg_lambda']),
        'num_leaves': int(params['num_leaves']),
        'max_depth': int(params['max_depth']),
        'n_estimators': int(params['n_estimators']),
        # Keep the 4-decimal rounding, but pass a float rather than a
        # formatted string.
        'learning_rate': round(float(params['learning_rate']), 4),
    }

    # Parameters that are held fixed across trials
    clf = LGBMRegressor(
        min_child_samples=200,
        reg_alpha=1.0,
        colsample_bytree=0.519264,
        min_child_weight=0.0,
        n_jobs=8,
        **params,
    )
    # NOTE(review): `early_stopping_rounds` and `verbose` as fit() keyword
    # arguments depend on the installed LightGBM version -- confirm.
    fit_params = {
        'early_stopping_rounds': 100,
        'eval_metric': 'l2',
        'eval_set': [(X_val, y_val.reshape(-1))],
        'eval_names': ['valid'],
        'verbose': 0,
        'feature_name': 'auto',  # that's actually the default
        'categorical_feature': cat_vars,
    }
    clf.fit(X_train, y_train.reshape(-1), **fit_params)

    # Score on the original sales scale
    y_pred = getValPred(clf)
    actual = df_val['Sales'].values
    return np.sqrt((((actual - y_pred)/actual)**2).sum()/len(y_pred))


In [None]:
# Search space explored by hyperopt. The quniform draws are cast to int
# inside `objective`.
space = {
    'max_depth': hp.quniform('max_depth',400,600,20),
    'reg_lambda': hp.quniform('reg_lambda',0,40,1),
    'num_leaves': hp.quniform('num_leaves',50,80,5),
    'n_estimators': hp.quniform('n_estimators',1000,1200,50),
    # Per hyperopt's loguniform definition the bounds are in log space,
    # i.e. learning rates between exp(-4) and exp(-2).
    'learning_rate': hp.loguniform('learning_rate', -4, -2)
}

# Keeps every evaluated trial so all iterations can be exported afterwards
tpe_trials = Trials()

# Run 100 TPE-guided evaluations of `objective`
best = fmin(fn=objective,
            space=space,
            trials = tpe_trials,
            verbose=2,
            algo=tpe.suggest,
            max_evals=100)

In [None]:
# Display the value returned by fmin for the search above
best

## Guardamos las iteraciones

In [None]:
# Build one row per hyperopt trial: its loss plus the sampled value of every
# hyperparameter, then write the table to disk.
trial_idxs, trial_vals = tpe_trials.idxs_vals
tpe_results = pd.DataFrame({'loss': [x['loss'] for x in tpe_trials.results],
                            'iteration': trial_idxs['max_depth'],
                            'max_depth': trial_vals['max_depth']})
# set_index returns a new frame; keep it so the .loc assignments below are
# matched on the trial id rather than on the default positional index.
tpe_results = tpe_results.set_index('iteration')
for param_name in ('learning_rate', 'reg_lambda', 'num_leaves', 'n_estimators'):
    tpe_results.loc[trial_idxs[param_name], param_name] = trial_vals[param_name]

# `iteration` is written as the index column of the CSV
tpe_results.to_csv('iterations.csv')