### GridSearch & Pipelines
GridSearch is an optimization tool that we use when tuning hyperparameters. We define the grid of parameters that we want to search through, and we select the best combination of parameters for our data.

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

# 1 - One way
Itera un algoritmo sobre un conjunto de hiperparametros

In [6]:
import warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #para introducirlo como parte del proceso en Pipeline, y computa/Nan en el modelo...pero OJO!! todo el dataset
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest # Mediante test estadistico, busca la mejores features, por correlaciones... y seleec... parecido a 'features_importance_' de Modelos de 'DecisionTreeClassifier'
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [17]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV, train_test_split

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)

iris = datasets.load_iris()

X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.2,
                                                   random_state=42)

####### GridSearchCV#######
#Modelo
svc = SVC()

#definir los parametros para GridSearchCV
parameters ={'kernel': ['linear','poly','rbf','sigmoid'],
             'C': [0.001, 0.01, 0.01, 0.5, 1, 5, 10, 100],
             'gamma': ['scale','auto'],
             'coef0':[-1, 0, 1]}

grid = GridSearchCV(estimator= svc,
                    param_grid=parameters,
                    n_jobs=-1, #Usa todos los cores que puedas, como capacidad de computación
                    scoring='accuracy',
                    verbose=True,
                    cv=10)

grid.fit(X_train, y_train)

Fitting 10 folds for each of 192 candidates, totalling 1920 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 428 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:    0.9s finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.01, 0.5, 1, 5, 10, 100],
                         'coef0': [-1, 0, 1], 'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [18]:
print("Best estimator:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

Best estimator: SVC(C=0.5, cache_size=200, class_weight=None, coef0=-1,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Best params: {'C': 0.5, 'coef0': -1, 'gamma': 'scale', 'kernel': 'linear'}
Best score: 0.9666666666666667


In [None]:
best_estimator = grid.best_estimator_
best_estimator.score(X_test, y_test)

# 2: Almost-Pro way

La forma pro es la que hace esto mismo y va recogiendo los errores de entrenamiento, de validación y tiene la capacidad de parar el proceso cuando se requiera además de guardar el modelo en local una vez terminado si es mejor que el que había anteriormente y de cargar el modelo anterior y seguir reentrenando.

In [25]:
####### En el GridSearchCV, combinando procesos Pipeline #######
pipe = Pipeline(steps=[('classifier',RandomForestClassifier())]) #cada 'steps' es una tupla
                                    #¿Por qué se pone aquí 'RandomForestClassifier()'... porque hay no puede estar en 'null'/None... Luego en la iteración de 'hiperparametros' sustituirá por cada 'classifier' de cada 'parameters'

logistic_params = {
                    'classifier':[LogisticRegression()],
                    'classifier__penalty':['l1','l2'],
                    'classifier__C': np.arange(0.1,4,0.5)
}

random_forest_params = {
                        'classifier':[RandomForestClassifier()],
                        'classifier__n_estimators': [10,100,500,1000],
                        'classifier__max_features': [1,2,3]
}

svc_params = {
              'classifier':[SVC()],
              'classifier__kernel':['linear','rbf','sigmoid']
}

search_space = [logistic_params,random_forest_params,svc_params]

grid = GridSearchCV(pipe,
                    search_space,
                    cv=10,
                    n_jobs=-1)

grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('classifier',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                                               min_samples_leaf=1,
                                                               min_samples_split=2,
              

In [26]:
print("Best estimator:", grid.best_estimator_)
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

Best estimator: Pipeline(memory=None,
         steps=[('classifier',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)
Best params: {'classifier': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 'classifier__kernel': 'linear'}
Best score: 0.9666666666666667


In [21]:
best_estimator = grid.best_estimator_
best_estimator.score(X_test, y_test)

1.0

In [27]:
grid.score(X_test, y_test)

1.0

In [28]:
print(grid.predict(X_test))
print(y_test)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [29]:
grid.best_estimator_

Pipeline(memory=None,
         steps=[('classifier',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [30]:
grid.best_estimator_['classifier']

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [None]:
grid.best_estimator_['classifier']

In [31]:
grid.best_score_

0.9666666666666667

# 3 Another way

In [38]:
####### Declara un par de Pipeline, con 1 sólo modelo, pero incluido otros proceso #######
#### A DANIEL EL PROFESOR LE GUSTA MÁS ESTA TERCERA OPCIÓN ##############################

reg_log = Pipeline( steps=[
                            ('imputer',SimpleImputer()), #imputar missing values
                            ('scaler',StandardScaler()), #estandariza
                            ('reglog',LogisticRegression()) #Generar
]) # 'imputer','scaler','reglog'... son solo identificadores que indico 'yo', puedo poner el nombre que quiera

svc = Pipeline( steps=[
                        ('scaler',StandardScaler()),
                        ('selectkbest',SelectKBest()),
                        ('svc', SVC())
])

random_forest = RandomForestClassifier()




re_log_param = {
    "imputer__strategy": ['mean', 'median', 'most_frequent'],
    "reglog__penalty": ["l1", "l2"],
    "reglog__C": np.arange(0.1, 4, 0.5)
}
#Se pueden mezclar los hiperparámetros de las operaciones/procesos ... pero NO de modelos.

svc_param = {
    "selectkbest__k": [1, 2, 3],
    "svc__C": np.arange(0.1, 0.9, 0.1),
    "svc__kernel": ['linear', 'poly', 'rbf']
}

rand_forest_params = {
    'n_estimators': [10, 100, 500, 1000],
    'max_features': [1, 2, 3]
}

####### Hay que montar un GridSerchCV para cada modelo #######

gs_reg_log = GridSearchCV(reg_log,
                        re_log_param,
                        scoring= 'accuracy',
                        cv= 10,
                        n_jobs = -1,
                        verbose = 1)

gs_svc = GridSearchCV(svc,
                        svc_param,
                        scoring= 'accuracy',
                        cv= 10,
                        n_jobs = -1,
                        verbose = 1)

gs_rand_forest = GridSearchCV(random_forest,
                            rand_forest_params,
                            cv= 10,
                             n_jobs = -1,
                            verbose = 1)

grids = {'gs_reg_log':gs_reg_log,
        'gs_svc':gs_svc,
        'gs_rand_forest': gs_rand_forest
}

In [39]:
%%time
####### CODE #######
for nombre, grid_searche in  grids.items():
    grid_searche.fit(X_train, y_train)


Fitting 10 folds for each of 48 candidates, totalling 480 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    3.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 10 folds for each of 72 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Wall time: 24 s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   19.0s finished


In [40]:
grids.items()

dict_items([('gs_reg_log', GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('imputer',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='mean',
                                                      verbose=0)),
                                       ('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('reglog',
                                        LogisticRegression(C=1.0,
                          

In [43]:
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns = ['Grid', 'Best score'])
best_grids.sort_values(by='Best score', ascending=False)

Unnamed: 0,Grid,Best score
1,gs_svc,0.958333
0,gs_reg_log,0.941667
2,gs_rand_forest,0.941667


In [46]:
print("Best estimator:", gs_svc.best_estimator_)
print("Best params:", gs_svc.best_params_)
print("Best score:", gs_svc.best_score_)

Best estimator: Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=2,
                             score_func=<function f_classif at 0x0000023380A9CAF8>)),
                ('svc',
                 SVC(C=0.7000000000000001, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='poly', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)
Best params: {'selectkbest__k': 2, 'svc__C': 0.7000000000000001, 'svc__kernel': 'poly'}
Best score: 0.9583333333333334


In [48]:
####### GUARDAR MODELO PARA LLEGADO EL MOMENTO  ==> SUBIRLO A PRODUCCIÓN #######
estimator = gs_svc.best_estimator_

import pickle

with open('finished_model.model','wb') as archivo_salida:
    pickle.dump(estimator, archivo_salida)

In [49]:
####### CODE #######
with open('finished_model.model', 'rb') as archivo_entrada:
    pileline_importado = pickle.load(archivo_entrada)

In [50]:
pileline_importado


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('selectkbest',
                 SelectKBest(k=2,
                             score_func=<function f_classif at 0x0000023380A9CAF8>)),
                ('svc',
                 SVC(C=0.7000000000000001, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto_deprecated', kernel='poly', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)