In [None]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

# Widen pandas' display limits so wide frames are not truncated in cell output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 200)

# Fixed competition path (adjust it if the dataset is mounted under another name).
INPUT_DIR = Path("/kaggle/input/csiro-biomass")

# Some mounts wrap every file in a single subfolder (e.g. csiro-biomass-public).
# Only descend into that subfolder when train.csv is NOT directly under INPUT_DIR;
# otherwise a lone image folder sitting next to the CSV files would be mistaken
# for the data root and the CSV reads in the next cell would fail.
subdirs = [p for p in INPUT_DIR.iterdir() if p.is_dir()]
if (INPUT_DIR / "train.csv").is_file() or len(subdirs) != 1:
    COMP_DIR = INPUT_DIR
else:
    COMP_DIR = subdirs[0]

print("Usando carpeta de datos:", COMP_DIR)


In [None]:
# Read the two competition tables from the resolved data folder.
train = pd.read_csv(COMP_DIR.joinpath("train.csv"))
test = pd.read_csv(COMP_DIR.joinpath("test.csv"))

# Report the table sizes before showing any rows.
print("train shape:", train.shape)
print("test shape:", test.shape)

# Preview the first rows of each table (train first, then test).
display(train.head(), test.head())


In [None]:
# Distinct target variables, sorted so the column order is deterministic.
target_names = sorted(train["target_name"].drop_duplicates())
print("Targets (target_name):", target_names)
print("Número de targets:", len(target_names))

# Long -> wide: one row per image_path, one column per target_name,
# cells hold the corresponding `target` value.
y_wide = train.pivot(index="image_path", columns="target_name", values="target")
# Fix the column order to match target_names.
y_wide = y_wide[target_names]

print("y_wide shape:", y_wide.shape)
display(y_wide.head())


In [None]:
def extract_image_features(rel_path: str) -> dict:
    """
    Compute simple per-image colour statistics.

    Parameters
    ----------
    rel_path : str
        Image path relative to COMP_DIR, exactly as it appears in the
        ``image_path`` column, e.g. 'train/ID1001187975.jpg'.

    Returns
    -------
    dict
        Keys ``mean_R``, ``mean_G``, ``mean_B``, ``std_R``, ``std_G``, ``std_B``
        (per-channel mean / standard deviation of the RGB-converted pixel
        values) and ``excess_green`` (mean of ``2*G - R - B``). Every value is
        a plain Python float.

    Notes
    -----
    Reading is best-effort: if the image cannot be opened or processed, all
    features keep their 0.0 default and a ``RuntimeWarning`` is emitted instead
    of raising, so a single bad file does not abort the whole pipeline but the
    failure is still visible in the cell output.
    """
    img_path = COMP_DIR / rel_path  # COMP_DIR/train/ID....jpg

    # Default values, returned unchanged if anything goes wrong with the image.
    feats = {
        "mean_R": 0.0,
        "mean_G": 0.0,
        "mean_B": 0.0,
        "std_R": 0.0,
        "std_G": 0.0,
        "std_B": 0.0,
        "excess_green": 0.0,
    }

    try:
        with Image.open(img_path) as img:
            # Force three channels so the slicing below is always valid.
            arr = np.array(img.convert("RGB")).astype(np.float32)

        R = arr[:, :, 0]
        G = arr[:, :, 1]
        B = arr[:, :, 2]

        # float(...) turns numpy scalars into Python floats so successful and
        # defaulted rows carry the same value type.
        computed = {
            "mean_R": float(R.mean()),
            "mean_G": float(G.mean()),
            "mean_B": float(B.mean()),
            "std_R": float(R.std()),
            "std_G": float(G.std()),
            "std_B": float(B.std()),
            "excess_green": float((2*G - R - B).mean()),
        }
    except Exception as exc:
        # Deliberately no raise: keep the defaults, but report the failure
        # instead of swallowing it silently.
        warnings.warn(
            f"Error leyendo {img_path}: {exc!r}; se usan features por defecto (0.0)",
            RuntimeWarning,
            stacklevel=2,
        )
        return feats

    # Only overwrite the defaults once every statistic was computed successfully.
    feats.update(computed)
    return feats


In [None]:
def safe_fill(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with inf/NaN replaced by safe values.

    +/-inf are first turned into NaN; every NaN is then filled with its
    column mean, and anything still missing (e.g. a column that is entirely
    NaN) is filled with 0. The input frame is not modified.
    """
    without_inf = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)
    column_means = without_inf.mean()
    mean_imputed = without_inf.fillna(value=column_means)
    return mean_imputed.fillna(value=0)


def _new_forest():
    """Return an unfitted RandomForestRegressor with the settings shared by CV and the final fit."""
    return RandomForestRegressor(
        n_estimators=400,
        random_state=42,
        n_jobs=-1
    )


def _image_feature_frame(image_paths, split: str) -> pd.DataFrame:
    """Extract, display and clean the image features of one split ('train' or 'test')."""
    records = []
    for rel_path in image_paths:
        feats = extract_image_features(rel_path)
        feats["image_path"] = rel_path
        records.append(feats)

    frame = pd.DataFrame(records).set_index("image_path")
    print(f"X_{split}_df shape (antes de safe_fill):", frame.shape)
    display(frame.head())

    frame = safe_fill(frame)
    print(f"NaN en X_{split}_df después de safe_fill:",
          frame.isna().sum().sum())
    return frame


def build_model_and_submission(train: pd.DataFrame,
                               test: pd.DataFrame,
                               target_names,
                               y_wide: pd.DataFrame) -> pd.DataFrame:
    """
    Fit a RandomForest on per-image features and build the submission frame.

    The function extracts features for every image in ``y_wide.index``, prints
    a 5-fold CV RMSE per target, refits on all training images, predicts every
    unique ``test["image_path"]`` and maps the wide predictions back onto the
    rows of ``test``. sample_submission.csv is NOT used.

    Parameters
    ----------
    train : pd.DataFrame
        Accepted for interface compatibility; it is not read here (the
        training targets come from ``y_wide``).
    test : pd.DataFrame
        Must contain the columns ``sample_id``, ``image_path`` and
        ``target_name``.
    target_names : sequence of str
        Target columns to model; they must all be columns of ``y_wide``.
    y_wide : pd.DataFrame
        Targets in wide format, indexed by ``image_path``.

    Returns
    -------
    pd.DataFrame
        Columns ``['sample_id', 'target']``, one row per row of ``test``.

    Raises
    ------
    AssertionError
        If the submission does not have as many rows as ``test``.
    ValueError
        If some row of ``test`` received no prediction (its ``target_name``
        is not one of ``target_names``), which would leave NaN targets.
    """
    target_names = list(target_names)

    # =========================
    # 1) TRAIN features
    # =========================
    train_images = y_wide.index.tolist()
    print("Número de imágenes de train:", len(train_images))

    X_train_df = _image_feature_frame(train_images, "train")

    X = X_train_df.values
    # Select the target columns by name so the column order of y is guaranteed
    # to match the labels later attached to the predictions.
    y = y_wide.loc[:, target_names].values
    print("X shape:", X.shape, "| y shape:", y.shape)

    # =========================
    # 2) Quick CV with RandomForest
    # =========================
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_rmse = []

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X)):
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        model = _new_forest()
        model.fit(X_tr, y_tr)

        # predict() returns a 1-D array when there is a single target; reshape
        # so the subtraction below never broadcasts (n,) against (n, 1).
        preds = model.predict(X_va).reshape(y_va.shape)
        rmse_per_target = np.sqrt(((preds - y_va) ** 2).mean(axis=0))
        fold_rmse.append(rmse_per_target)

        rmse_dict = dict(zip(target_names, rmse_per_target))
        print(f"Fold {fold} RMSE:", rmse_dict)

    print("\nRMSE promedio por target en CV:")
    mean_rmse = np.mean(fold_rmse, axis=0)
    for t, r in zip(target_names, mean_rmse):
        print(f"{t}: {r:.4f}")

    # =========================
    # 3) Train the final model on all training images
    # =========================
    final_model = _new_forest()
    final_model.fit(X, y)

    # =========================
    # 4) TEST features
    # =========================
    # test has one row per (image, target); features are needed once per image.
    test_images = test["image_path"].unique().tolist()
    print("Número de imágenes de test:", len(test_images))

    X_test_df = _image_feature_frame(test_images, "test")
    X_test = X_test_df.values

    test_preds = final_model.predict(X_test)
    preds_test_df = pd.DataFrame(
        test_preds,
        index=test_images,
        columns=target_names
    )
    print("Predicciones por imagen (head):")
    display(preds_test_df.head())

    # =========================
    # 5) Convert to long format and join with test
    # =========================
    preds_long = (
        preds_test_df
        .reset_index()
        .melt(id_vars="index", var_name="target_name", value_name="pred")
        .rename(columns={"index": "image_path"})
    )

    print("preds_long (head):")
    display(preds_long.head())

    # Left join keeps every row of test, in test's order.
    test_pred = test.merge(
        preds_long,
        on=["image_path", "target_name"],
        how="left"
    )

    print("test_pred shape:", test_pred.shape)
    display(test_pred.head())

    # =========================
    # 6) Build the submission ONLY from test
    # =========================
    # test_pred has the same rows as test plus the 'pred' column
    submission = (
        test_pred[["sample_id", "pred"]]
        .rename(columns={"pred": "target"})
    )

    # Sanity checks
    assert submission.shape[0] == test.shape[0], \
        "El submission no tiene el mismo número de filas que test"

    # Rows the left join could not match carry NaN; fail loudly rather than
    # returning a submission with missing targets.
    n_missing = int(submission["target"].isna().sum())
    if n_missing:
        raise ValueError(
            f"El submission tiene {n_missing} filas sin predicción (target NaN)"
        )

    print("submission shape:", submission.shape)
    display(submission.head())

    return submission


In [None]:
# =========================
# ROBUST MAIN BLOCK
# =========================
# Run the full pipeline; if anything in it raises, fall back to a constant
# prediction so that a submission file is produced either way.

try:
    print("Ejecutando pipeline completo…")
    submission = build_model_and_submission(train, test, target_names, y_wide)
    print("Pipeline completo OK.")
except Exception as pipeline_error:
    print("ERROR en pipeline, usando fallback basado en test.")
    print("Detalle del error (solo visible en el public run):", repr(pipeline_error))

    # Fallback: one row per test sample_id, every target set to the global
    # mean of the train targets.
    global_mean = train["target"].mean()
    submission = test[["sample_id"]].assign(target=global_mean)

# ALWAYS write submission.csv
submission.to_csv("submission.csv", index=False)
print("submission.csv guardado.")
