In [2]:
# Importar librerías necesarias
import os
import pandas as pd
import logging
import pickle
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


import sys
import os
from pathlib import Path

# Determinar la carpeta raíz del proyecto. 
#    Ajusta el número de ".parent" según tu estructura.
ROOT_DIR = Path(__file__).resolve().parent

# Agregar la carpeta raíz al sys.path si no está ya incluida
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))

# Ahora que tu proyecto está en sys.path, ya puedes importar con normalidad
from config.paths import DATA_DIR, MODELS_DIR, PENGUINS_LTER_PATH

# Configurar logging
logging.basicConfig(level=logging.INFO)

# 📌 1. Cargar Datos desde la Carpeta "data/"
def load_data():
    """
    Carga el dataset de la carpeta 'data/'.
    """
    df = pd.read_csv(PENGUINS_LTER_PATH)
    df.columns = df.columns.str.lower()  # Convertir nombres de columnas a minúsculas
    df.dropna(inplace=True)  # Eliminar filas con valores nulos
    return df

# 📌 2. Crear el Preprocesador de Datos
def create_preprocessor():
    numeric_feats = ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g"]
    cat_feats = ["island", "sex"]

    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ])

    cat_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_feats),
        ("cat", cat_transformer, cat_feats)
    ])
    return preprocessor

# 📌 3. Entrenar Modelos y Guardarlos en "models/"
def train_and_save_models():
    """
    Entrena modelos y los guarda en la carpeta 'models/'.
    """
    logging.info("📥 Cargando dataset de pingüinos...")
    df = load_data()
    logging.info(f"✔ Filas totales: {len(df)}")

    # Separar características (X) y etiqueta (y)
    X = df.drop(columns=["species"])
    y = df["species"]

    # Dividir en conjuntos de entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    preprocessor = create_preprocessor()

    classifiers = {
        "penguin_classifier_randomforest": RandomForestClassifier(n_estimators=100, random_state=42),
        "penguin_classifier_gradientboosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
    }

    # Asegurar que la carpeta "models/" existe
    MODELS_DIR.mkdir(parents=True, exist_ok=True)

    # Entrenar y guardar cada modelo
    for model_name, clf in classifiers.items():
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("classifier", clf)
        ])
        logging.info(f"🔄 Entrenando: {model_name}")
        pipeline.fit(X_train, y_train)

        # Guardar modelo en "models/"
        model_path = MODELS_DIR / f"{model_name}.pkl"
        with open(model_path, "wb") as f:
            pickle.dump(pipeline, f)

        logging.info(f"✅ Modelo guardado en: {model_path}")

#  4. Ejecutar el Entrenamiento
if __name__ == "__main__":
    train_and_save_models()


NameError: name '__file__' is not defined