# PLANTILLA MODELIZACIÓN PARA REGRESIÓN

**IMPORTANTE**: Recuerda hacer una copia de esta plantilla para no machacar la original.

**IMPORTANTE**: Esta plantilla está diseñada para una visión de máximos utilizando el framework de The Ultimate Algo Machine. Si tienes problemas de memoria o rendimiento recuerda reducir el problema mediante:

* Muestreo
* Balanceo undersampling
* Reducir el número de algoritmos a testar
* Reducir el número de parámetros a testar
* Usar random search y especificar un n_iter adecuado

## IMPORTAR PAQUETES

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_percentage_error

# Notebook-wide display and session settings.

# Enable greedy (faster) tab autocompletion in IPython
%config IPCompleter.greedy=True

# Disable scientific notation: pandas shows floats with two decimals
pd.options.display.float_format = '{:.2f}'.format

# Silence all warnings
# NOTE(review): this blanket filter also hides warnings emitted by the
# libraries while fitting — confirm that is intended.
import warnings
warnings.filterwarnings("ignore")

## IMPORTAR LOS DATOS

Sustituir la ruta del proyecto.

In [8]:
# Root folder of the project; the data files are loaded from paths built on it.
# NOTE(review): machine-specific absolute path — replace it before running.
ruta_proyecto = 'C:/Users/isaac/Google Drive/DS4B/CursoMachineLearningPython/03_MACHINE_LEARNING/08_CASOS/00_PROYECTO1'

Nombres de los ficheros de datos.

In [9]:
# File names of the pickled predictors (x) and target (y).
nombre_x = 'x_final.pickle'
nombre_y = 'y_final.pickle'

Cargar los datos.

In [10]:
# Load predictors and target from the project's working-data folder.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
x = pd.read_pickle(f'{ruta_proyecto}/02_Datos/03_Trabajo/{nombre_x}')
y = pd.read_pickle(f'{ruta_proyecto}/02_Datos/03_Trabajo/{nombre_y}')

## MODELIZAR

### Reservar el dataset de validación

In [26]:
# Hold out 30% of the rows as the validation set; the remaining 70% is used
# for the hyper-parameter search. A fixed random_state makes the split
# repeatable, so validation metrics are comparable between notebook runs.
train_x,val_x,train_y,val_y = train_test_split(x,y,test_size=0.3,random_state=42)

### Crear el pipe y el diccionario de algoritmos, parámetros y valores a testar

Modificar para dejar solo los algoritmos que se quieran testar.

Modificar los parámetros.

In [32]:
# Single-step pipeline. The 'algoritmo' step is only a placeholder: every
# entry of the search space below replaces it with a concrete regressor.
pipe = Pipeline(steps=[('algoritmo', RandomForestRegressor())])

# Search space: one dict per candidate algorithm. The 'algoritmo' key swaps
# the pipeline step, and each 'algoritmo__<param>' key lists the values to
# try for that estimator's parameter.
grid = [
    dict(algoritmo=[LinearRegression()],
         algoritmo__n_jobs=[-1]),

    dict(algoritmo=[RandomForestRegressor()],
         algoritmo__n_jobs=[-1],
         algoritmo__max_depth=[5, 10, 15],
         algoritmo__n_estimators=[50, 100, 200]),

    dict(algoritmo=[XGBRegressor()],
         algoritmo__n_jobs=[-1],
         algoritmo__learning_rate=[0.01, 0.025, 0.05, 0.1],
         algoritmo__max_depth=[5, 10, 20],
         algoritmo__reg_alpha=[0, 0.1, 0.5, 1],
         algoritmo__reg_lambda=[0.01, 0.1, 1],
         algoritmo__n_estimators=[100, 500, 1000]),

    # NOTE(review): 'scoring' here is the estimator's own parameter, not the
    # metric of the search — confirm it is meant to be part of the grid.
    dict(algoritmo=[HistGradientBoostingRegressor()],
         algoritmo__learning_rate=[0.01, 0.025, 0.05, 0.1],
         algoritmo__max_iter=[50, 100, 200],
         algoritmo__max_depth=[5, 10, 20],
         algoritmo__min_samples_leaf=[500],
         algoritmo__scoring=['neg_mean_absolute_percentage_error'],
         algoritmo__l2_regularization=[0, 0.25, 0.5, 0.75, 1]),
]

### Optimizar los hiperparámetros

Elegir si se quiere usar grid search o random search.

Comentar la opción que no se vaya a usar.

####  Con grid search

In [33]:
# grid_search = GridSearchCV(estimator= pipe, 
#                            param_grid = grid, 
#                            cv = 3, 
#                            scoring = 'neg_mean_absolute_percentage_error',
#                            verbose = 0,
#                            n_jobs = -1)

# modelo = grid_search.fit(train_x,train_y)

# pd.DataFrame(grid_search.cv_results_).sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algoritmo,param_algoritmo__n_jobs,param_algoritmo__max_depth,param_algoritmo__n_estimators,param_algoritmo__learning_rate,param_algoritmo__reg_alpha,...,param_algoritmo__max_iter,param_algoritmo__min_samples_leaf,param_algoritmo__scoring,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.02,0.00,0.00,0.00,LinearRegression(n_jobs=-1),-1,,,,,...,,,,"{'algoritmo': LinearRegression(n_jobs=-1), 'al...",-0.24,-0.14,-0.24,-0.21,0.05,1
352,3.97,0.16,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,5,500,0.10,0.50,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-0.52,-0.70,-1.70,-0.97,0.52,2
364,4.50,0.18,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,5,1000,0.10,0.50,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-0.52,-0.70,-1.70,-0.97,0.52,2
360,7.80,0.13,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,5,1000,0.10,0,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-0.57,-0.74,-1.61,-0.97,0.46,4
348,4.18,0.13,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,5,500,0.10,0,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-0.57,-0.74,-1.61,-0.97,0.46,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,4.57,0.11,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,20,100,0.05,0.10,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-3.72,-1.70,-2.71,-2.71,0.82,618
313,8.24,0.12,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,20,500,0.05,0.10,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-3.73,-1.71,-2.72,-2.72,0.82,619
325,11.82,0.59,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,20,1000,0.05,0.10,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-3.73,-1.71,-2.72,-2.72,0.82,619
217,17.26,0.41,0.01,0.00,"XGBRegressor(base_score=None, booster=None, co...",-1,20,1000,0.03,0.10,...,,,,"{'algoritmo': XGBRegressor(base_score=None, bo...",-3.85,-1.70,-2.72,-2.76,0.88,621


####  Con random search

In [37]:
# Randomized hyper-parameter search: draws n_iter candidate configurations
# from `grid` and scores each one with 3-fold cross-validation on the
# training data. The score is the negated MAPE, so values closer to 0 are
# better.
# random_state fixes which candidates are sampled, making reruns comparable.
# It does not seed the estimators themselves, which keep their own randomness.
random_search = RandomizedSearchCV(estimator = pipe,
                                   param_distributions = grid, 
                                   n_iter = 25, 
                                   cv = 3, 
                                   scoring = 'neg_mean_absolute_percentage_error', 
                                   verbose = 0,
                                   n_jobs = -1,
                                   random_state = 42)

# fit() returns the fitted search object, so `modelo` and `random_search`
# refer to the same object.
modelo = random_search.fit(train_x,train_y)

# One row per sampled candidate, best-ranked first.
pd.DataFrame(random_search.cv_results_).sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algoritmo__reg_lambda,param_algoritmo__reg_alpha,param_algoritmo__n_jobs,param_algoritmo__n_estimators,param_algoritmo__max_depth,param_algoritmo__learning_rate,...,param_algoritmo__min_samples_leaf,param_algoritmo__max_iter,param_algoritmo__l2_regularization,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
15,3.22,0.08,0.01,0.0,1.0,1.0,-1.0,1000.0,5,0.1,...,,,,"{'algoritmo__reg_lambda': 1, 'algoritmo__reg_a...",-0.61,-0.78,-1.61,-1.0,0.44,1
2,4.53,0.15,0.01,0.0,0.01,0.1,-1.0,1000.0,5,0.1,...,,,,"{'algoritmo__reg_lambda': 0.01, 'algoritmo__re...",-0.62,-0.75,-1.67,-1.01,0.47,2
24,2.75,0.18,0.01,0.0,1.0,1.0,-1.0,500.0,5,0.05,...,,,,"{'algoritmo__reg_lambda': 1, 'algoritmo__reg_a...",-0.57,-0.76,-1.75,-1.03,0.52,3
8,3.18,0.15,0.01,0.0,0.01,0.0,-1.0,500.0,5,0.03,...,,,,"{'algoritmo__reg_lambda': 0.01, 'algoritmo__re...",-0.77,-0.69,-1.68,-1.05,0.45,4
0,0.72,0.04,0.01,0.0,1.0,0.5,-1.0,100.0,5,0.05,...,,,,"{'algoritmo__reg_lambda': 1, 'algoritmo__reg_a...",-0.66,-0.79,-1.97,-1.14,0.59,5
6,0.57,0.0,0.01,0.0,1.0,0.0,-1.0,100.0,5,0.05,...,,,,"{'algoritmo__reg_lambda': 1, 'algoritmo__reg_a...",-0.72,-0.8,-1.98,-1.17,0.58,6
5,3.15,0.04,0.01,0.0,0.1,1.0,-1.0,500.0,5,0.01,...,,,,"{'algoritmo__reg_lambda': 0.1, 'algoritmo__reg...",-0.86,-0.74,-1.92,-1.18,0.53,7
17,0.63,0.02,0.01,0.0,0.01,0.1,-1.0,100.0,5,0.05,...,,,,"{'algoritmo__reg_lambda': 0.01, 'algoritmo__re...",-0.87,-0.77,-1.97,-1.2,0.55,8
22,0.27,0.03,0.07,0.0,,,,,5,0.03,...,500.0,200.0,0.25,{'algoritmo__scoring': 'neg_mean_absolute_perc...,-1.32,-1.0,-1.34,-1.22,0.16,9
19,0.24,0.03,0.05,0.0,,,,,10,0.05,...,500.0,200.0,0.25,{'algoritmo__scoring': 'neg_mean_absolute_perc...,-1.32,-1.0,-1.34,-1.22,0.16,9


## EVALUAR

### Predecir sobre validación

In [39]:
# Predict on the held-out validation set with the best pipeline found by the search.
pred = modelo.best_estimator_.predict(val_x)

### Evaluar sobre validación

In [40]:
# MAPE on validation, called as (true values, predictions).
# NOTE(review): scikit-learn reports MAPE as a fraction, not a percentage
# (1.0 means 100% error) — confirm when interpreting the value.
mean_absolute_percentage_error(val_y, pred)

0.9676047896698653

### Examinar el mejor modelo

In [41]:
# Display the winning pipeline (algorithm and hyper-parameters) selected by the search.
modelo.best_estimator_

Pipeline(steps=[('algoritmo',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, enable_categorical=False,
                              gamma=0, gpu_id=-1, importance_type=None,
                              interaction_constraints='', learning_rate=0.1,
                              max_delta_step=0, max_depth=5, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
                              predictor='auto', random_state=0, reg_alpha=1,
                              reg_lambda=1, scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1,
                              verbosity=None))])