In [15]:
import os
import mlflow
import requests
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer

Se realiza la conexión con la instancia de MySQL creada, con el fin de poder realizar la carga de los datos iniciales y asi poder entrenar el respectivo modelo

In [8]:
import mysql.connector
import pandas as pd

# Conexión a la base de datos MySQL
conexion = mysql.connector.connect(
    host='mysql_model_data',         # Usamos el nombre del servicio en Docker
    user='admin',
    password='supersecret',
    database='mydatabase'
)

# Consulta SQL
query = "SELECT * FROM penguins"  # Reemplaza 'tu_tabla' por el nombre de la tabla que necesites

# Ejecutamos la consulta y obtenemos los resultados con sus nombres de columna
cursor = conexion.cursor()
cursor.execute(query)
datos = cursor.fetchall()
columnas = cursor.column_names

# Creamos un DataFrame de pandas con la información obtenida
df_view = pd.DataFrame(datos, columns=columnas)

# Cerramos cursor y conexión
cursor.close()
conexion.close()

# Mostramos el DataFrame
df_view


Unnamed: 0,id,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE\r
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE\r
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE\r
3,4,Adelie,Torgersen,,,,,NA\r
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE\r
...,...,...,...,...,...,...,...,...
339,340,Gentoo,Biscoe,,,,,NA\r
340,341,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE\r
341,342,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE\r
342,343,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE\r


Limpieza de datos innecesarios

In [9]:
df_view.dropna(inplace=True)
df_view = df_view[df_view['sex']!='.']
df_view

Unnamed: 0,id,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE\r
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE\r
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE\r
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE\r
5,6,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE\r
...,...,...,...,...,...,...,...,...
338,339,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,FEMALE\r
340,341,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE\r
341,342,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE\r
342,343,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE\r


Selección de muestra de entrenamiento y validación, para el flujo de entrenamiento de datos

In [37]:
X = df_view.drop(columns=["species","id"])
y = df_view["species"]
text_cols = ["island", "sex"]
num_cols = [
    "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g"
]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


Se define el pipeline que permitira la transformación de las variables categoricas con OneHotEncoder y es estandariza la escala de las variables númericas

In [38]:
preprocessor = ColumnTransformer(
    transformers=[
    ("text", OneHotEncoder(handle_unknown="ignore"), text_cols),
    ("num", StandardScaler(), num_cols)
])

modelRF = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

Ahora se procede a la definición de la Grilla la cual contendra todos los parametros sujetos al ajuste de los modelos, garantizando que se tengan 21 combinaciones de hiperparametros diferentes. 

In [65]:
# Definición de la grilla de parámetros para el RandomForestClassifier
param_grid = {
    "classifier__n_estimators": [10, 50, 100],
    "classifier__max_depth": [ 5, 8, 10],
    "classifier__min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(
    estimator=modelRF,
    param_grid=param_grid,
    cv=5,               # Número de folds para validación cruzada
    scoring="accuracy", # Métrica de evaluación
    n_jobs=-1,          # Utiliza todos los cores disponibles
    verbose=2           # Muestra información del progreso
)

In [66]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.197:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

# connect to mlflow
mlflow.set_tracking_uri("http://10.43.101.197:5000")
mlflow.set_experiment("mlflow_penguins_vf")

mlflow.sklearn.autolog(log_model_signatures=True, log_input_examples=True, registered_model_name="modelo_penguins", max_tuning_runs=27)

with mlflow.start_run(run_name="autolog_pipe_model_reg") as run:
    grid_search.fit(X_train, y_train)

2025/03/17 19:50:01 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_penguins_vf' does not exist. Creating a new experiment.
                 ColumnTransformer(transformers=[('text',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['island', 'sex']),
                                                 ('num', StandardScaler(),
                                                  ['culmen_length_mm',
                                                   'culmen_depth_mm',
                              ...`


Fitting 5 folds for each of 27 candidates, totalling 135 fits


Successfully registered model 'modelo_penguins'.
2025/03/17 19:50:13 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_penguins, version 1
Created version '1' of model 'modelo_penguins'.
2025/03/17 19:50:16 INFO mlflow.sklearn.utils: Logging the 27 best runs, no runs will be omitted.


[CV] END classifier__max_depth=5, classifier__min_samples_split=2, classifier__n_estimators=10; total time=   0.0s
[CV] END classifier__max_depth=5, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.4s
[CV] END classifier__max_depth=5, classifier__min_samples_split=5, classifier__n_estimators=50; total time=   0.1s
[CV] END classifier__max_depth=5, classifier__min_samples_split=5, classifier__n_estimators=100; total time=   0.2s
[CV] END classifier__max_depth=5, classifier__min_samples_split=10, classifier__n_estimators=10; total time=   0.0s
[CV] END classifier__max_depth=5, classifier__min_samples_split=10, classifier__n_estimators=10; total time=   0.0s
[CV] END classifier__max_depth=5, classifier__min_samples_split=10, classifier__n_estimators=50; total time=   0.1s
[CV] END classifier__max_depth=5, classifier__min_samples_split=10, classifier__n_estimators=50; total time=   0.1s
[CV] END classifier__max_depth=5, classifier__min_samples_split=10, classi