In [1]:
import os
import mlflow
import requests
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer

In [2]:
# Location of the covertype CSV. Set the COVERTYPE_CSV environment variable to
# point at the file on another machine; the fallback is the original local path.
DATA_PATH = os.environ.get(
    "COVERTYPE_CSV",
    "C:/Users/jssaa/OneDrive/Documentos/sebastian saavedra/Maestria/Semestre 2025 01/MLops/Grupo6/Proyectos/Proyecto2/dags/covertype.csv",
)
df = pd.read_csv(DATA_PATH)

# Split the frame into the feature matrix and the target column.
X = df.drop(columns=["Cover_Type"])
y = df["Cover_Type"]

# Column groups consumed by the preprocessing step: string-valued columns
# and numeric columns.
text_cols = ["Wilderness_Area", "Soil_Type"]
num_cols = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [3]:
# Per-column preprocessing: one-hot encode the text columns (with
# handle_unknown="ignore") and standardize the numeric columns.
categorical_step = ("text", OneHotEncoder(handle_unknown="ignore"), text_cols)
numeric_step = ("num", StandardScaler(), num_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

# Full pipeline: preprocessing followed by a seeded random-forest classifier.
rf_classifier = RandomForestClassifier(random_state=42)
modelRF = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", rf_classifier),
    ]
)

In [4]:
# Hyperparameter grid for the "classifier" step (RandomForestClassifier) of the pipeline.
param_grid = dict(
    classifier__n_estimators=[10],
    classifier__max_depth=[5],
    classifier__min_samples_split=[2, 5],
)

# Search settings:
#   cv      - number of cross-validation folds
#   scoring - evaluation metric
#   n_jobs  - -1 uses all available cores
#   verbose - print progress information
search_options = {
    "cv": 5,
    "scoring": "accuracy",
    "n_jobs": -1,
    "verbose": 2,
}

grid_search = GridSearchCV(modelRF, param_grid, **search_options)

In [5]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area', 'Soil_Type',
       'Cover_Type'],
      dtype='object')

In [6]:
# NOTE(review): these two names are not used in any visible cell; remove them
# if nothing further down the notebook needs them.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# Endpoint and credentials read from the environment for MLflow artifact storage.
# setdefault() keeps values that are already exported in the environment and only
# falls back to the local development defaults below. Prefer supplying real
# credentials from outside the notebook rather than editing these literals.
os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', "http://localhost:9000")
os.environ.setdefault('AWS_ACCESS_KEY_ID', 'admin')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'supersecret')

# Connect to the MLflow tracking server and select the experiment.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("mlflow_covertype_vf3")

# Enable scikit-learn autologging and register the logged model under the given name.
mlflow.sklearn.autolog(
    log_model_signatures=True,
    log_input_examples=True,
    registered_model_name="modelo_covertype_vf3",
    max_tuning_runs=27,
)

# Run the grid search inside a named MLflow run so autologging attaches to it.
with mlflow.start_run(run_name="autolog_pipe_model_reg") as run:
    grid_search.fit(X_train, y_train)


2025/03/24 22:48:42 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_covertype_vf3' does not exist. Creating a new experiment.
                 ColumnTransformer(transformers=[('text',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Wilderness_Area',
                                                   'Soil_Type']),
                                                 ('num', StandardScaler(),
                                                  ['Elevation', 'Aspect',
                               ...`


Fitting 5 folds for each of 2 candidates, totalling 10 fits


  _warn_prf(average, modifier, msg_start, len(result))
Successfully registered model 'modelo_covertype_vf3'.
2025/03/24 22:49:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: modelo_covertype_vf3, version 1
Created version '1' of model 'modelo_covertype_vf3'.
2025/03/24 22:49:31 INFO mlflow.sklearn.utils: Logging the 27 best runs, no runs will be omitted.
                                 ['Wilderness_Area', 'Soil_Type']),
                                ('num', StandardScaler(),
                                 ['Elevation', 'Aspect', 'Slope',
                                  'Horizontal_Distance_To_Hydrology',
                                  'Vertical_Distance_To_Hydrology',
                                  'Horizontal_Distance_To_Roadways',...`
                                 ['Wilderness_Area', 'Soil_Type']),
                                ('num', StandardScaler(),
                                 ['Ele

In [11]:
mlflow.set_tracking_uri("http://localhost:5000")

# Fetch the models registered in MLflow and keep only their names.
models = mlflow.search_registered_models()
res_model = [registered.name for registered in models]

# Report the available model names.
print(f"modelos_disponibles: {res_model}")

modelos_disponibles: ['modelo_covertype']


In [13]:
# Build a one-row DataFrame holding the record to score.
data = pd.DataFrame([{
  "Elevation": 2596,
  "Aspect": 51,
  "Slope": 3,
  "Horizontal_Distance_To_Hydrology": 258,
  "Vertical_Distance_To_Hydrology": 0,
  "Horizontal_Distance_To_Roadways": 510,
  "Hillshade_9am": 221,
  "Hillshade_Noon": 232,
  "Hillshade_3pm": 148,
  "Horizontal_Distance_To_Fire_Points": 6279,
  "Wilderness_Area": "Rawah",
  "Soil_Type": "C7745"
}])

# Endpoint and credentials read from the environment for MLflow artifact storage.
# setdefault() keeps values that are already exported in the environment and only
# falls back to the local development defaults below. Prefer supplying real
# credentials from outside the notebook rather than editing these literals.
os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', "http://localhost:9000")
os.environ.setdefault('AWS_ACCESS_KEY_ID', 'admin')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'supersecret')

mlflow.set_tracking_uri("http://localhost:5000")

# NOTE(review): the training cell in this notebook registers
# "modelo_covertype_vf3" and does not assign a stage, while this cell loads
# "modelo_covertype" at stage "production". Confirm that this model/stage
# exists on the tracking server, otherwise the load below will fail.
model_name = "modelo_covertype"
model_production_uri = f"models:/{model_name}/production"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_production_uri)

# Score the record and keep the first (only) prediction.
predict = loaded_model.predict(data)
resultado = predict[0]

In [14]:
# Print the first prediction returned by the loaded model.
print(resultado) 

1
