In [None]:
import os
import traceback
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from IPython.display import display

from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# Pandas display limits for the wide tables shown in this notebook.
pd.set_option("display.width", 200)
pd.set_option("display.max_columns", 100)

# Fixed location of the competition dataset (adjust if the name differs).
INPUT_DIR = Path("/kaggle/input/csiro-biomass")

# When the input folder holds exactly one subfolder
# (e.g. csiro-biomass-public), the data is read from that subfolder;
# otherwise the input folder itself is used.
subdirs = [entry for entry in INPUT_DIR.iterdir() if entry.is_dir()]
if len(subdirs) == 1:
    (COMP_DIR,) = subdirs
else:
    COMP_DIR = INPUT_DIR

print("Usando carpeta de datos:", COMP_DIR)


In [None]:
# Read both competition tables from the resolved data folder.
train, test = (
    pd.read_csv(COMP_DIR / f"{split}.csv") for split in ("train", "test")
)

print("train shape:", train.shape)
print("test shape:", test.shape)

# Preview the first rows of each table (rich display, one after the other).
display(train.head(), test.head())


In [None]:
# Sorted list of the distinct target variables (values of `target_name`).
target_names = train["target_name"].unique().tolist()
target_names.sort()
print("Targets:", target_names, "| n_targets:", len(target_names))

# Wide target table: one row per image_path, one column per target_name,
# cell values taken from `target`; columns follow the order of target_names.
y_wide = train.pivot(
    index="image_path",
    columns="target_name",
    values="target",
)[target_names]
print("y_wide shape:", y_wide.shape)
display(y_wide.head())


In [None]:
def extract_image_features(rel_path: str, base_dir=None) -> dict:
    """
    Compute simple colour statistics for one image.

    Parameters
    ----------
    rel_path : str
        Path relative to the data folder, as stored in ``image_path``
        (e.g. 'train/IDxxxxx.jpg').
    base_dir : path-like, optional
        Folder that ``rel_path`` is resolved against. Defaults to the
        module-level ``COMP_DIR``.

    Returns
    -------
    dict
        Eleven float features: per-channel mean and standard deviation,
        mean excess green (2G - R - B) and its 90th percentile, mean gray
        level, mean green fraction G / (R + G + B), and the proportion of
        pixels where green is the strictly largest channel.
        If the image cannot be read or processed, every feature is 0.0 and
        a ``RuntimeWarning`` naming the file is emitted; no partially
        filled result is ever returned.
    """
    root = Path(COMP_DIR if base_dir is None else base_dir)
    img_path = root / rel_path

    # Default result, returned unchanged when anything goes wrong.
    feats = {
        "mean_R": 0.0, "mean_G": 0.0, "mean_B": 0.0,
        "std_R": 0.0,  "std_G": 0.0,  "std_B": 0.0,
        "excess_green": 0.0,
        "mean_gray": 0.0,
        "mean_g_fraction": 0.0,
        "prop_green_pixels": 0.0,
        "p90_excess_green": 0.0,
    }

    try:
        with Image.open(img_path) as img:
            arr = np.asarray(img.convert("RGB"), dtype=np.float32)

        R = arr[:, :, 0]
        G = arr[:, :, 1]
        B = arr[:, :, 2]

        eg = 2 * G - R - B                        # excess-green index per pixel
        gray = 0.299 * R + 0.587 * G + 0.114 * B  # weighted gray level
        g_frac = G / (R + G + B + 1e-6)           # epsilon avoids 0/0 on black pixels
        mask_green = (G > R) & (G > B)            # green strictly dominant

        # Compute everything into a separate dict so that a failure halfway
        # through cannot leave `feats` partially filled.
        computed = {
            "mean_R": float(R.mean()),
            "mean_G": float(G.mean()),
            "mean_B": float(B.mean()),
            "std_R": float(R.std()),
            "std_G": float(G.std()),
            "std_B": float(B.std()),
            "excess_green": float(eg.mean()),
            "mean_gray": float(gray.mean()),
            "mean_g_fraction": float(g_frac.mean()),
            "prop_green_pixels": float(mask_green.mean()),
            "p90_excess_green": float(np.percentile(eg, 90)),
        }
    except Exception as exc:
        # Best effort by design: keep the all-zero features, but make the
        # failure visible instead of swallowing it silently.
        warnings.warn(
            f"extract_image_features: could not process {img_path}: {exc!r}; "
            "returning zero features",
            RuntimeWarning,
            stacklevel=2,
        )
        return feats

    feats.update(computed)
    return feats


In [None]:
def safe_fill(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without inf/NaN values.

    Infinite values are first turned into NaN; each NaN is then replaced by
    its column mean, and anything still missing (e.g. a column with no valid
    value at all) becomes 0.
    """
    infinities = [np.inf, -np.inf]
    finite_only = df.replace(infinities, np.nan)
    column_means = finite_only.mean()
    mean_imputed = finite_only.fillna(column_means)
    return mean_imputed.fillna(0)


In [None]:
def _image_feature_frame(image_paths) -> pd.DataFrame:
    """Extract features for every path and return them indexed by image_path."""
    rows = []
    for rel in image_paths:
        row = extract_image_features(rel)
        row["image_path"] = rel
        rows.append(row)
    return safe_fill(pd.DataFrame(rows).set_index("image_path"))


def _new_multioutput_model() -> MultiOutputRegressor:
    """Create the unfitted model shared by the CV folds and the final fit."""
    return MultiOutputRegressor(
        HistGradientBoostingRegressor(
            max_depth=3,
            learning_rate=0.1,
            max_iter=300,
            random_state=42,
        )
    )


def build_model_and_submission(train: pd.DataFrame,
                               test: pd.DataFrame,
                               target_names,
                               y_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Train the image-feature model and build the submission table.

    Steps:
    - extract image features for the train images (index of ``y_wide``) and
      for the unique ``image_path`` values of ``test``;
    - fit MultiOutputRegressor(HistGradientBoostingRegressor) on
      log1p(target), after printing an informational 5-fold CV RMSE;
    - predict the test images, invert with expm1 and clip at 0;
    - reshape to long format and join onto ``test`` by
      (image_path, target_name).

    Parameters
    ----------
    train : pd.DataFrame
        Not read by this function; kept so existing callers keep working.
    test : pd.DataFrame
        Must contain 'sample_id', 'image_path' and 'target_name'.
    target_names : sequence of str
        Column labels of the predictions, in the column order of ``y_wide``.
    y_wide : pd.DataFrame
        One row per image_path, one column per target.

    Returns
    -------
    pd.DataFrame
        Columns ['sample_id', 'target'], one row per row of ``test``.
    """

    # ===== 1) TRAIN features =====
    X = _image_feature_frame(y_wide.index.tolist()).values
    y = y_wide.values
    y_log = np.log1p(y)  # the model is trained in log1p space

    print("X shape:", X.shape, "| y shape:", y.shape)

    # Rows with a missing/invalid target would make `fit` raise; drop them
    # instead of losing the whole pipeline.
    usable = np.isfinite(y_log).all(axis=1)
    if not usable.all():
        print(f"Aviso: se descartan {int((~usable).sum())} imágenes "
              "con targets faltantes o no válidos.")
        X, y, y_log = X[usable], y[usable], y_log[usable]

    # ===== 2) Quick CV in log1p space (informational only) =====
    n_splits = 5
    if len(X) >= n_splits:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        fold_rmse = []
        for fold, (tr, va) in enumerate(kf.split(X)):
            model = _new_multioutput_model()
            model.fit(X[tr], y_log[tr])

            p = model.predict(X[va])
            rmse = np.sqrt(((p - y_log[va]) ** 2).mean(axis=0))
            fold_rmse.append(rmse)
            print(f"Fold {fold} RMSE(log1p):",
                  {t: float(r) for t, r in zip(target_names, rmse)})

        print("\nRMSE promedio(log1p):",
              {t: float(m) for t, m in zip(target_names, np.mean(fold_rmse, axis=0))})
    else:
        # KFold needs at least n_splits samples; the CV is only a report,
        # so skip it rather than abort.
        print(f"CV omitido: solo hay {len(X)} imágenes (< {n_splits}).")

    # ===== 3) Final model on all usable training rows =====
    final_model = _new_multioutput_model()
    final_model.fit(X, y_log)

    # ===== 4) TEST features =====
    test_images = test["image_path"].unique().tolist()
    X_test = _image_feature_frame(test_images).values

    # ===== 5) Prediction (log1p -> original scale) =====
    preds = np.expm1(final_model.predict(X_test))
    preds = np.clip(preds, 0, None)  # biomass cannot be negative

    preds_df = pd.DataFrame(
        preds,
        index=pd.Index(test_images, name="image_path"),
        columns=target_names,
    )

    # ===== 6) Long format + join with test =====
    preds_long = preds_df.reset_index().melt(
        id_vars="image_path", var_name="target_name", value_name="pred"
    )

    test_pred = test.merge(preds_long,
                           on=["image_path", "target_name"],
                           how="left")

    # A test row whose target_name was not trained on gets no match in the
    # left join; fill it so the submission never contains NaN.
    unmatched = test_pred["pred"].isna()
    if unmatched.any():
        fallback_value = max(float(np.mean(y)), 0.0)
        print(f"Aviso: {int(unmatched.sum())} filas de test sin predicción; "
              "se rellenan con la media de train.")
        test_pred["pred"] = test_pred["pred"].fillna(fallback_value)

    # ===== 7) Submission built from test only =====
    submission = test_pred[["sample_id", "pred"]].rename(columns={"pred": "target"})

    # Sanity checks
    assert submission.shape[0] == test.shape[0], \
        "El submission no tiene el mismo número de filas que test"
    assert not submission["target"].isna().any(), \
        "El submission contiene valores NaN"

    return submission


In [None]:
# ===== FAIL-SAFE MAIN BLOCK =====
# Run the full pipeline; on any error fall back to a constant prediction per
# target so that a valid submission.csv is always written.
try:
    print("Ejecutando pipeline…")
    submission = build_model_and_submission(
        train=train,
        test=test,
        target_names=target_names,
        y_wide=y_wide
    )
    print("Pipeline OK.")
except Exception as e:
    print("ERROR en pipeline, usando fallback basado en test.")
    print("Detalle (solo visible en public run):", repr(e))
    traceback.print_exc()  # keep the full traceback, not just the message

    # Fallback: each test row gets the train mean of its own target_name
    # (targets differ in scale); the global train mean is the last resort.
    global_mean = float(train["target"].mean())
    submission = test[["sample_id"]].copy()
    if "target_name" in test.columns and "target_name" in train.columns:
        per_target_mean = train.groupby("target_name")["target"].mean()
        submission["target"] = (
            test["target_name"].map(per_target_mean).fillna(global_mean).to_numpy()
        )
    else:
        submission["target"] = global_mean

# ALWAYS write submission.csv
submission.to_csv("submission.csv", index=False)
print("submission.csv guardado. Filas:", len(submission))
display(submission.head())
