In [None]:
# Working version, but the results (stored in .results_) do not have the same
# format as the .cv_results_ of a standard grid search

'''
Class to optimize a clustering score.
Instantiate with a clusterer (estimator), a parameter grid (param_grid_estim)
and a scoring function or a dictionary of scoring functions (scoring).
'''

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid

class GridSearchClust(BaseEstimator, TransformerMixin):
    """Exhaustive search over a parameter grid for a clustering estimator.

    Every parameter combination of ``param_grid_estim`` is applied to a fresh
    clone of ``estimator``, which is fitted on the data; the resulting
    ``labels_`` are then scored. The candidate with the best ``refit`` score
    is exposed through the ``best_*_`` attributes.

    Parameters
    ----------
    estimator : estimator object
        Clusterer exposing ``set_params``, ``fit`` and, once fitted,
        ``labels_``.
    param_grid_estim : dict or list of dicts
        Grid passed to ``ParameterGrid`` to enumerate candidate parameters.
    param_grid_preproc : object, default=None
        Stored on the instance; not used by ``fit`` or ``predict``.
    scoring : callable, dict of callables or None, default=None
        Scorer(s) called as ``scorer(X, labels)``. A single callable is
        reported under the key ``'score'``. When falsy, the estimator's own
        ``score(X)`` method is used instead.
    refit : callable, default=silhouette_score
        Scorer called as ``refit(X, labels)``; its value ranks candidates.
    greater_is_better : bool, default=True
        Whether a higher ``refit`` score is better.

    Attributes
    ----------
    results_ : dict of lists
        Keys ``"scores"``, ``"params"``, ``"models"``, ``"nb_clusters"`` and
        ``"refit_score"``; one entry per candidate, in grid order.
    best_index_ : int
        Position of the selected candidate in the ``results_`` lists.
    best_refit_score_, best_score_, best_params_, best_estimator_
        Entries of ``results_`` at ``best_index_``.
    """

    def __init__(self, estimator, param_grid_estim, param_grid_preproc=None,
                 scoring=None, refit=silhouette_score, greater_is_better=True):
        # Parameters are stored untouched so get_params/set_params round-trip.
        self.estimator = estimator
        self.param_grid_estim = param_grid_estim
        self.param_grid_preproc = param_grid_preproc
        self.scoring = scoring
        self.refit = refit
        self.greater_is_better = greater_is_better

    def fit(self, X, verbose=False):
        """Fit one clone of the estimator per grid point and keep the best.

        Parameters
        ----------
        X : array-like
            Data handed to the estimator's ``fit`` and to every scorer.
        verbose : bool, default=False
            When true, print the score dict and the number of clusters of
            each candidate.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the parameter grid yields no candidate.
        """
        # Normalise once, locally: the constructor parameter is not mutated
        # and a bare callable is scored from the very first candidate.
        scorers = self._resolve_scorers()

        self.results_ = {"scores": [],
                         "params": [],
                         "models": [],
                         "nb_clusters": [],
                         "refit_score": []}

        for param in ParameterGrid(self.param_grid_estim):
            # A fresh clone per candidate: reusing self.estimator would make
            # every stored model the same object carrying the last params.
            model = clone(self.estimator).set_params(**param)
            model.fit(X)
            labels = model.labels_

            refit_score = self._safe_score(self.refit, X, labels)

            if scorers is None:
                model_score = {'score': self._default_score(model, X)}
            else:
                model_score = {name: self._safe_score(scorer, X, labels)
                               for name, scorer in scorers.items()}
            if verbose:
                print(model_score)

            # Number of clusters, excluding the noise label (-1)
            unique_labels = set(labels)
            nb_clusters = int(len(unique_labels)
                              - (1 if -1 in unique_labels else 0))
            if verbose:
                print(nb_clusters)

            self.results_["refit_score"].append(refit_score)
            self.results_["scores"].append(model_score)
            self.results_["params"].append(param)
            self.results_["models"].append(model)
            self.results_["nb_clusters"].append(nb_clusters)

        best_index = self._select_best_index(self.results_["refit_score"],
                                             self.greater_is_better)

        self.best_index_ = best_index
        self.best_refit_score_ = self.results_["refit_score"][best_index]
        self.best_score_ = self.results_["scores"][best_index]
        self.best_params_ = self.results_["params"][best_index]
        # Already fitted on X with the best parameters, so no extra refit is
        # needed and the stored scores describe exactly this fitted model.
        self.best_estimator_ = self.results_["models"][best_index]

        return self

    def predict(self, X_test):
        """Return ``best_estimator_.predict(X_test)``.

        Requires a prior call to ``fit``; the selected estimator must itself
        implement ``predict``.
        """
        return self.best_estimator_.predict(X_test)

    def _resolve_scorers(self):
        """Return ``scoring`` as a dict of scorers, or None when it is falsy."""
        if not self.scoring:
            return None
        if isinstance(self.scoring, dict):
            return self.scoring
        return {'score': self.scoring}

    @staticmethod
    def _safe_score(scorer, X, labels):
        """Return ``scorer(X, labels)``, or NaN if the scorer raises."""
        # Scorers are user-supplied and may reject degenerate labelings, so
        # failures are recorded as NaN instead of aborting the whole search.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt through.
        try:
            return scorer(X, labels)
        except Exception:
            return np.nan

    @staticmethod
    def _default_score(model, X):
        """Return ``model.score(X)``, or NaN if the model has no ``score``."""
        score = getattr(model, "score", None)
        return score(X) if callable(score) else np.nan

    @staticmethod
    def _select_best_index(scores, greater_is_better):
        """Return the index of the best non-NaN score (first one on ties)."""
        if not scores:
            raise ValueError("The parameter grid produced no candidate; "
                             "cannot select a best model.")
        values = np.asarray(scores, dtype=float)
        if np.isnan(values).all():
            # No usable score at all: fall back to the first candidate.
            return 0
        if greater_is_better:
            return int(np.nanargmax(values))
        return int(np.nanargmin(values))