##### Training and Tuning

La principal razón del anterior notebook ha sido probar varios modelos de la forma más rápida posible, ver sus métricas y los impactos de diversos cambios. El principal problema (hasta ahora) con la versión de PyCaret es que al desplegar el modelo es un objeto de la misma librería, haciendo que se requiera instalar la PyCaret en producción lo cual es muy poco eficiente y complica mucho más las cosas. Por otro lado, PyCaret hace su hyperparameter tuning por RandomSearchCV, que no está mal pero sería más optimo hacerlo de manera Bayesiana. En ese sentido este notebook servirá para entrenar denuevo el(los) modelo(s) guardarlos y posteriormente desplegarlos de manera rápida y sencilla siendo prioridad el hacer el modelo lo más ligero posible.

In [1]:
import pandas as pd
import numpy as np
import warnings
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization

csv_path = (
    "../data/train_encoded.csv",
    "../data/test_encoded.csv"
)

train = pd.read_csv(csv_path[0]).drop(["latitud","longitud"], axis=1)
test = pd.read_csv(csv_path[1]).drop(["latitud","longitud"], axis=1)

##### Para LightGBM.

Como ya lo hemos tuneado con Pycaret, los parámetros son:

```
Sin Tunear:
LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=104, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

Tuneado:
LGBMRegressor(bagging_fraction=1.0, bagging_freq=6, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.9,
              importance_type='split', learning_rate=0.15, max_depth=-1,
              min_child_samples=46, min_child_weight=0.001, min_split_gain=0,
              n_estimators=150, n_jobs=-1, num_leaves=2, objective=None,
              random_state=104, reg_alpha=0.7, reg_lambda=5, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
```

In [74]:
import warnings
warnings.filterwarnings('ignore')
random_state = 104  #Para benchmark. 

def bayes_parameter_opt_lgb(X, y, init_points=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=10000, learning_rate=0.05, output_process=False):
    
    
    def lgb_eval(num_leaves, bagging_fraction, lambda_l1, lambda_l2, min_split_gain):
        """
        Defino los parametros que serán tuneados. Así como los parámetros fijos
        """
        params = {'application':'regression','num_iterations':5000, 'learning_rate':0.05, 'early_stopping_round':100, 'metric':'rmse',
                 'feature_fraction':0.9,'n_estimators':200,'feature_fraction':0.9, 'max_depth':-1,'min_child_weight':0.001,'verbose':-1}
    
    
        params["num_leaves"] = round(num_leaves)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = -1
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        
        train_data = lgb.Dataset(data=X, label=y)
        
        cv_result = lgb.cv(params, train_data, nfold=5, seed=random_state, verbose_eval =200, metrics=['mae'], shuffle=False, 
                           stratified=False)
        return -max(cv_result['l1-mean'])
    
    #Configuro el rango de cada parametro
    lgbm_optimization = BayesianOptimization(lgb_eval, {'num_leaves': (2, 25),
                                                   'bagging_fraction':(0.8,1),
                                                   'lambda_l1':(0.5,3),
                                                   'lambda_l2':(3,20),
                                                   'min_split_gain': (0.001, 0.1)
                                                   })
    
    lgbm_optimization.maximize(init_points=init_points, n_iter=opt_round) #CHECK
    
    if output_process == True:
        lgbm_optimization.points_to_csv('lgbm_bayers_opt_result.csv')
        
    return lgbm_optimization

In [75]:
X = train.select_dtypes(exclude='object').drop('Precio_m2_total',axis=1)
y = train['Precio_m2_total']

opt_params = bayes_parameter_opt_lgb(X=X,y=y, init_points= 30, opt_round=100)

|   iter    |  target   | baggin... | lambda_l1 | lambda_l2 | min_sp... | num_le... |
-------------------------------------------------------------------------------------
[200]	cv_agg's l1: 1551.4 + 36.3896
| [0m 1       [0m | [0m-2.35e+03[0m | [0m 0.8932  [0m | [0m 0.8695  [0m | [0m 3.635   [0m | [0m 0.09268 [0m | [0m 5.296   [0m |
[200]	cv_agg's l1: 1487.55 + 38.0394
| [95m 2       [0m | [95m-2.335e+0[0m | [95m 0.8504  [0m | [95m 1.381   [0m | [95m 4.344   [0m | [95m 0.03386 [0m | [95m 16.2    [0m |
[200]	cv_agg's l1: 1495.1 + 42.8983
| [0m 3       [0m | [0m-2.338e+0[0m | [0m 0.8858  [0m | [0m 0.8048  [0m | [0m 6.898   [0m | [0m 0.06496 [0m | [0m 12.67   [0m |
[200]	cv_agg's l1: 1494.99 + 42.1525
| [0m 4       [0m | [0m-2.337e+0[0m | [0m 0.9934  [0m | [0m 2.994   [0m | [0m 4.707   [0m | [0m 0.08822 [0m | [0m 12.79   [0m |
[200]	cv_agg's l1: 1494.93 + 46.5012
| [0m 5       [0m | [0m-2.339e+0[0m | [0m 0.9247  [0m | [0m 1.5

In [90]:
min_ = min([res['target'] for res in opt_params.res])
[res['params'] for res in opt_params.res if res['target'] == min_]

[{'bagging_fraction': 0.9164810602504456,
  'lambda_l1': 0.5005454948781294,
  'lambda_l2': 6.60276585681876,
  'min_split_gain': 0.07385271072078259,
  'num_leaves': 3.3092443242891614}]

In [98]:
#Fit model

train_data = lgb.Dataset(X,y)

params = {'application':'regression','num_iterations':5000, 'learning_rate':0.05, 'metric':'rmse',
         'feature_fraction':0.9,'n_estimators':200,'feature_fraction':0.9, 'max_depth':-1,'min_child_weight':0.001,'verbose':-1,
         'bagging_fraction': 0.9164810602504456,'lambda_l1': 0.5005454948781294,'lambda_l2': 6.60276585681876,
          'min_split_gain': 0.07385271072078259,'num_leaves': 3}

model = lgb.cv(params, train_data, nfold=5, seed=random_state, verbose_eval =200, metrics=['mae'], shuffle=False, 
                           stratified=False)
#l1_error = Mae

[200]	cv_agg's l1: 1629.38 + 39.4417


In [123]:
X_test = test.select_dtypes(exclude='object').drop('Precio_m2_total',axis=1)
y_test = test['Precio_m2_total']
model = lgb.train(params, train_data)
preds = model.predict(X_test)

In [124]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

r2 = r2_score(y_test, preds)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
print('r2:{}\nmae:{}\nmse:{}'.format(r2, mae, mse))

r2:0.3195114290174482
mae:1833.7454405484887
mse:13746071.920770092


Entrenando modelo final:

In [125]:
data_x = pd.concat([X,X_test])
data_y = pd.concat([y,y_test])
data = lgb.Dataset(data_x,data_y)

model_final = lgb.train(params, data)

In [129]:
import pickle

with open('../webapp/artifacts/models/lgbm_base.pkl','wb') as handle:
    pickle.dump(model_final, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Random Forest:

In [24]:
from sklearn.model_selection import cross_val_score

def rf_cv(min_impurity_decrease, min_samples_split, max_features,max_depth, data, target):
    """Random Forest Cross Validation
    
    Esta funcion instanciará un regressor de Random Forest con los parámetros a optimizar:
    min_samples_split, max_features, min_impurity_decrease.
    
    """
    model = RandomForestRegressor(
        n_estimators = 150,
        min_impurity_decrease=min_impurity_decrease,
        min_samples_split = min_samples_split,
        max_features = max_features,
        max_depth = max_depth, #No olvidar tenerlo en integer.
        random_state = 123,
        n_jobs=-1
    )
    
    cross_val = cross_val_score(model, data, target,
                               scoring='neg_mean_absolute_error', cv=4)
    
    return cross_val.mean()

def optimize_rf(data, target):
    """Aplicamos Optimización Bayesiana a los parámetros del Random Forest Regressor"""
    
    def inside_rf_cv(min_impurity_decrease, min_samples_split, max_features, max_depth):
        """Wrapper of RandomForest cross validation.
        
        Tenemos que evitar que los parametros que toman valores enteros no se repitan, además de tener que
        restringir aquellos parámetros que van de 0 a 1.
        """
        
        return rf_cv(
            min_samples_split = int(min_samples_split),
            min_impurity_decrease = max(min(min_impurity_decrease, 0.999), 1e-3),
            max_features = max(min(max_features, 0.999), 1e-3),
            max_depth = int(max_depth),
            data = data,
            target = target,
        )
    
    optimizer = BayesianOptimization(
        f = inside_rf_cv,
        pbounds={
            "min_samples_split":(2,25),
            "min_impurity_decrease":(0.1,0.999),
            "max_features":(0.1, 0.999),
            "max_depth":(5, 25),
        },
        random_state=123,
        verbose=2
    )
    optimizer.maximize(init_points = 30, n_iter=100)
    
    print("Resultado Final", optimizer.max)
    
    return optimizer

In [25]:
X_train = train.select_dtypes(exclude='object').drop('Precio_m2_total',axis=1)
y_train = train['Precio_m2_total']

from bayes_opt.util import Colours

print(Colours.yellow("----Random Forest Regressor Optimizer----"))
optimize_rf(X_train, y_train)

[93m----Random Forest Regressor Optimizer----[0m
|   iter    |  target   | max_depth | max_fe... | min_im... | min_sa... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.479e+0[0m | [0m 18.93   [0m | [0m 0.3572  [0m | [0m 0.3039  [0m | [0m 14.68   [0m |
| [0m 2       [0m | [0m-1.48e+03[0m | [0m 19.39   [0m | [0m 0.4804  [0m | [0m 0.9817  [0m | [0m 17.75   [0m |
| [0m 3       [0m | [0m-1.484e+0[0m | [0m 14.62   [0m | [0m 0.4525  [0m | [0m 0.4085  [0m | [0m 18.77   [0m |
| [0m 4       [0m | [0m-1.549e+0[0m | [0m 13.77   [0m | [0m 0.1537  [0m | [0m 0.4578  [0m | [0m 18.97   [0m |
| [0m 5       [0m | [0m-1.585e+0[0m | [0m 8.65    [0m | [0m 0.2577  [0m | [0m 0.5779  [0m | [0m 14.23   [0m |
| [0m 6       [0m | [0m-1.481e+0[0m | [0m 17.69   [0m | [0m 0.8636  [0m | [0m 0.7513  [0m | [0m 16.05   [0m |
| [95m 7       [0m | [95m-1.46e+03[0m | [95m 19.45   [0m | [95

<bayes_opt.bayesian_optimization.BayesianOptimization at 0x1e5d09cda60>

In [45]:
from sklearn.metrics import r2_score, mean_absolute_error
rf_reg = RandomForestRegressor(n_estimators = 300, n_jobs = -1, max_depth = 15, max_features = 0.67, min_impurity_decrease=0.1, min_samples_split=6)
rf_reg.fit(X_train, y_train)
preds = rf_reg.predict(X_test)

r2_score(y_test, preds) #0.38?

0.3861833313631359

In [46]:
final_model_rf = rf_reg.fit(pd.concat([X_train,X_test]), pd.concat([y_train, y_test]))

In [47]:
import pickle

with open('../webapp/artifacts/models/rf_base.pkl','wb') as handle:
    pickle.dump(final_model_rf, handle, protocol=pickle.HIGHEST_PROTOCOL)