# 🧪 **Experiment Tracking con Hopsworks – Predicción COVID-19**

## 📚 **Importar librerías**

In [11]:
from pathlib import Path
from datetime import datetime
import os
import sys
import warnings

import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from joblib import dump
from mlflow.models import infer_signature
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

import hopsworks

from sklearn.model_selection import GridSearchCV

In [3]:
# Ruta al proyecto raíz (2 niveles arriba del notebook)
_PROJECT_ROOT = Path("/workspaces/COVID_DAILY_COUNTS")
SRC_PATH = _PROJECT_ROOT / "src"

# Agregar src al path si no está
if str(SRC_PATH) not in sys.path:
    sys.path.insert(0, str(SRC_PATH))

# Ahora sí puedes importar
from config import HopsworksSettings

# Cargar configuración desde .env
settings = HopsworksSettings(_env_file=_PROJECT_ROOT / ".env")

## 💾 **Conexión con Hopsworks y carga de datos**

In [5]:
# Configurar acceso
warnings.filterwarnings("ignore")

project = hopsworks.login(
    host=settings.HOPSWORKS_HOST,
    project=settings.HOPSWORKS_PROJECT,
    api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value()
)

fs = project.get_feature_store()

covid_fg = fs.get_feature_group(name="covid_daily_counts", version=1)

2025-06-03 17:08:46,281 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-06-03 17:08:46,287 INFO: Initializing external client
2025-06-03 17:08:46,288 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-06-03 17:08:46,920 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1234082


## 👷 **Preparación de datos**

In [6]:
query = covid_fg.select([
    "case_count",
    "probable_case_count",
    "hospitalized_count",
    "case_count_7day_avg",
    "all_case_count_7day_avg",
    "hosp_count_7day_avg",
    "death_count_7day_avg",
    "date_of_interest",
    "death_count"
])

feature_view = fs.get_or_create_feature_view(
    name="covid_death_fv",
    version=1,
    description="Predicción de muertes por COVID-19",
    labels=["death_count"],
    query=query
)

## 🧪 **Train / Test Split**

In [7]:
# Dividir por fecha
test_start = datetime.strptime("2022-02-01", "%Y-%m-%d")
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_start=test_start)

# Guardar fechas y limpiar columna
fechas = X_test["date_of_interest"].copy()
X_train = X_train.drop(columns=["date_of_interest"])
X_test = X_test.drop(columns=["date_of_interest"])

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.43s) 


## 👨‍🏭 **Feature Engineering**

In [8]:
# Solo columnas numéricas en este caso
numeric_cols = X_train.columns.tolist()

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_cols)
])

## 🧠 **Pipeline de Modelo**

In [9]:
model_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

## **🔍 Hiperparámetros**

In [10]:
param_grid = {
    "model__n_estimators": [100, 150],
    "model__max_depth": [5, 10, 15]
}

## 🔁 **Validación y entrenamiento**

In [13]:
grid_search = GridSearchCV(
    model_pipeline,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_pipeline = grid_search.best_estimator_
best_params = grid_search.best_params_

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

## 📈 **Evaluación del modelo**

In [15]:
y_pred = best_pipeline.predict(X_test)

rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

RMSE: 3.19
R²: 0.8643


## 🧪 **Seguimiento con MLflow**

In [16]:
mlflow.set_tracking_uri("mlruns")
mlflow.set_experiment("covid_death_prediction")

with mlflow.start_run():
    signature = infer_signature(X_train, best_pipeline.predict(X_train))

    mlflow.set_tag("model_type", "RandomForestRegressor")

    mlflow.sklearn.log_model(
        sk_model=best_pipeline,
        artifact_path="covid_death_model",
        signature=signature,
        input_example=X_train.head(),
        registered_model_name="covid_rf_model"
    )

    mlflow.log_metrics({
        "rmse": rmse,
        "r2": r2
    })

    mlflow.log_params(best_params)

2025/06/03 17:15:34 INFO mlflow.tracking.fluent: Experiment with name 'covid_death_prediction' does not exist. Creating a new experiment.
Successfully registered model 'covid_rf_model'.
Created version '1' of model 'covid_rf_model'.


## **Grabar modelo con MLflow**

In [None]:
DATA_MODEL = _PROJECT_ROOT / "models"
DATA_MODEL.mkdir(exist_ok=True)

PosixPath('/workspaces/COVID_DAILY_COUNTS/models')

In [19]:
mlflow.sklearn.save_model(
    sk_model=best_pipeline,
    path=DATA_MODEL / "covid_rf_model_v1",
    signature=signature,
    input_example=X_train.head()
)

## **Cargar Modelo con MLflow**

In [22]:
# Cargar modelo desde disco para validar que funciona
loaded_model = mlflow.sklearn.load_model(model_uri=DATA_MODEL / "covid_rf_model_v1")

# Validación rápida
loaded_predictions = loaded_model.predict(X_test).astype(int)

print("✅ Modelo cargado correctamente. Primeras predicciones:")
print(loaded_predictions[:10])

✅ Modelo cargado correctamente. Primeras predicciones:
[ 7  2  4  4 12  5  4  3  3  6]
