
<div class="jumbotron">
  <h1><i class="fa fa-bar-chart" aria-hidden="true"></i> Modelado de datos</h1>
  <p></p>
</div>

In [1]:
import mlflow
import mlflow.sklearn
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from pandas import DataFrame 
from datetime import datetime
import pickle
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
from os import chdir

# retroceder al directorio principal
chdir("..")

In [3]:
## Tenga cuidado con sklearn y pandas en el entorno de producción.

In [4]:
def get_timestamp():
    now = datetime.now()
    year = '{:02d}'.format(now.year)
    month = '{:02d}'.format(now.month)
    day = '{:02d}'.format(now.day)
    hour = '{:02d}'.format(now.hour)
    minute = '{:02d}'.format(now.minute)
    return '{}-{}-{}-{}-{}'.format(year, month, day,hour,minute)
    

In [5]:
mlflow.set_tracking_uri("http://localhost:8002")

# Cargar datos

In [6]:
# Load data (deserialize)
# Load data (deserialize)
with open('data/interim/transform.pickle','rb') as handle:
    df = pickle.load(handle)

In [7]:
df.head()

Unnamed: 0,Age,Income,Score,Gender_Male
0,0.019231,0.0,0.387755,False
1,0.057692,0.0,0.816327,False
2,0.038462,0.008197,0.05102,True
3,0.096154,0.008197,0.77551,True
4,0.25,0.016393,0.397959,True


# Resumen de datos

### Eliminar variables superfluas

In [8]:
# NA

# Modeling

Podemos ver los resultados de MLFlow si ejecutamos en la terminal:



 _mlflow ui -p 1234_
 

## Hiperparámetros

## set dataset train 

In [9]:
X = df.values

In [10]:
X

array([[0.019230769230769273, 0.0, 0.38775510204081626, False],
       [0.05769230769230771, 0.0, 0.8163265306122448, False],
       [0.03846153846153849, 0.008196721311475405, 0.05102040816326531,
        True],
       [0.0961538461538462, 0.008196721311475405, 0.7755102040816326,
        True],
       [0.25, 0.016393442622950824, 0.39795918367346933, True],
       [0.07692307692307698, 0.016393442622950824, 0.7653061224489796,
        True],
       [0.326923076923077, 0.024590163934426215, 0.05102040816326531,
        True],
       [0.0961538461538462, 0.024590163934426215, 0.9489795918367346,
        True],
       [0.8846153846153847, 0.032786885245901634, 0.020408163265306124,
        False],
       [0.23076923076923084, 0.032786885245901634, 0.7244897959183673,
        True],
       [0.9423076923076924, 0.032786885245901634, 0.13265306122448978,
        False],
       [0.326923076923077, 0.032786885245901634, 0.9999999999999999,
        True],
       [0.7692307692307693, 0.0409836

In [11]:
# Parámetros para MLFlow

In [15]:
# Función para entrenar el modelo con RandomizedSearchCV
def train_with_randomized_search(X):
    # Define los parámetros que se ajustarán
    param_dist = {
        'n_clusters': [2, 3, 4, 5,7,9,11,13,15],
        'init': ['k-means++', 'random'],
        'algorithm': ['lloyd', 'elkan'],
    }
    # scorer = make_scorer(silhouette_score)
    kmeans = KMeans()

    randomized_search = RandomizedSearchCV(estimator=kmeans, param_distributions=param_dist,  cv=5, n_iter=20)
    randomized_search.fit(X)
    return randomized_search

In [20]:
# subset de columnas  para almacenar info en los logs
columns = ['mean_fit_time',
'param_n_clusters',
'param_init',
'param_algorithm',
'params',
'mean_test_score',
'rank_test_score']

In [19]:

# Genera un nombre único para el experimento basado en la marca de tiempo
name_experiment = get_timestamp()
name_experiment = name_experiment.replace('-','')

# Define un nombre para el experimento en MLflow
run_name = "Randomized Search"

# Inicializa una variable para almacenar los resultados
results = None

# Nombre del proyecto y algoritmo de clustering
name_project = 'reto'
name_algo = 'kMeans'

# Concatena los nombres para formar el nombre final del experimento
name_experiment = name_project + '_' + name_algo + '_' + run_name + '_' + name_experiment

# Establece el experimento actual en MLflow
mlflow.set_experiment(name_experiment)

# Inicia un nuevo registro en MLflow
with mlflow.start_run():
    # Entrena el modelo con RandomizedSearchCV
    best_params_random = train_with_randomized_search(X)
    
    # Registra los hiperparámetros y métricas en MLflow
    mlflow.log_params(best_params_random.best_params_)
    mlflow.log_metrics({'silhouette_score': best_params_random.best_score_})
    
    # Guarda el modelo entrenado en MLflow
    mlflow.sklearn.log_model(best_params_random.best_estimator_, 'kmeans_model')

    # Almacena los resultados en un DataFrame y los ordena por puntaje de prueba promedio
    results = DataFrame(best_params_random.cv_results_)
    results.sort_values(by='mean_test_score', inplace=True, ascending=False)

    # Itera a través de los resultados y registra los mejores hiperparámetros en MLflow
    for index, row in results[columns].head().iterrows():
        run_name = "ranking " + str(index)
        with mlflow.start_run(run_name=run_name, nested=True):
            for column in columns:
                mlflow.log_param(column, row[column])

    # Muestra el mejor puntaje y los mejores hiperparámetros en la salida
    display(best_params_random.best_score_)
    display(best_params_random.best_params_)
    display(results[columns].head())


2023/09/11 14:54:03 INFO mlflow.tracking.fluent: Experiment with name 'reto_kMeans_Randomized Search_202309111454' does not exist. Creating a new experiment.


-3.1541272148892485

{'n_clusters': 15, 'init': 'k-means++', 'algorithm': 'lloyd'}

Unnamed: 0,mean_fit_time,param_n_clusters,param_init,param_algorithm,params,mean_test_score,rank_test_score
9,0.007268,15,k-means++,lloyd,"{'n_clusters': 15, 'init': 'k-means++', 'algor...",-3.154127,1
8,0.002724,15,random,lloyd,"{'n_clusters': 15, 'init': 'random', 'algorith...",-3.17671,2
6,0.004732,13,random,elkan,"{'n_clusters': 13, 'init': 'random', 'algorith...",-3.328392,3
1,0.007113,13,k-means++,lloyd,"{'n_clusters': 13, 'init': 'k-means++', 'algor...",-3.397685,4
3,0.002804,13,random,lloyd,"{'n_clusters': 13, 'init': 'random', 'algorith...",-3.470122,5


<img src="../images/kmeans.png" alt="Drawing" style="width:900px;"/>
