# CSIRO Biomass — HGB + Small CNN + DINOv2 Base (518px) Ensemble


In [None]:

import os, glob, random
from pathlib import Path

import numpy as np
import pandas as pd

from PIL import Image
from IPython.display import display

# sklearn
from sklearn.model_selection import KFold
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# torch
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T

# timm (DINOv2 Base extractor)
import timm

# pandas display limits used when frames are printed/displayed below.
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 220)

# Competition dataset location. If the input folder contains exactly one
# sub-directory, the data is read from it; otherwise from the folder itself.
INPUT_DIR = Path("/kaggle/input/csiro-biomass")
subdirs = list(filter(Path.is_dir, INPUT_DIR.iterdir()))
COMP_DIR = INPUT_DIR if len(subdirs) != 1 else subdirs[0]
print("Usando carpeta de datos:", COMP_DIR)

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds, in order: Python's ``random``, NumPy's global RNG, PyTorch's CPU
    generator and the generators of all CUDA devices.

    Args:
        seed: Integer seed handed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)


set_seed(42)

# Compute device shared by the models below: CUDA when available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression; presumably echoed as the notebook cell's output


In [None]:

# Read the competition tables from COMP_DIR, then report their shapes and
# show the first three rows of each.
train, test = (pd.read_csv(COMP_DIR / fname) for fname in ("train.csv", "test.csv"))

for label, frame in (("train", train), ("test", test)):
    print(f"{label} shape:", frame.shape)
for frame in (train, test):
    display(frame.head(3))


In [None]:
# Alphabetically sorted list of the distinct target names found in train.csv.
target_names = sorted(train["target_name"].unique())
print("Targets:", target_names, "| n_targets:", len(target_names))

# Long -> wide: one row per image, one column per target, with the columns
# ordered like ``target_names``.
y_wide = train.pivot(index="image_path", columns="target_name", values="target")[target_names]
print("y_wide shape:", y_wide.shape)

# Per-target weights (presumably the competition metric's weights - confirm)
# and the same weights as a float32 vector aligned with ``target_names``.
TARGET_WEIGHTS = dict(
    Dry_Green_g=0.1,
    Dry_Dead_g=0.1,
    Dry_Clover_g=0.1,
    GDM_g=0.2,
    Dry_Total_g=0.5,
)
W_VEC = np.asarray([TARGET_WEIGHTS[name] for name in target_names], dtype=np.float32)


In [None]:
# ImageCms RGB->LAB transform; built on first use and reused afterwards.
_RGB_TO_LAB_TRANSFORM = None


def _to_lab(rgb_img):
    """Return ``rgb_img`` (a PIL image in mode "RGB") converted to mode "LAB".

    The direct ``convert("LAB")`` call is tried first.
    NOTE(review): Pillow appears to reject a direct RGB->LAB conversion with
    ``ValueError``; in that case the conversion goes through an
    sRGB->LAB colour-management transform from ``PIL.ImageCms`` instead.
    """
    global _RGB_TO_LAB_TRANSFORM
    try:
        return rgb_img.convert("LAB")
    except ValueError:
        from PIL import ImageCms

        if _RGB_TO_LAB_TRANSFORM is None:
            _RGB_TO_LAB_TRANSFORM = ImageCms.buildTransformFromOpenProfiles(
                ImageCms.createProfile("sRGB"),
                ImageCms.createProfile("LAB"),
                "RGB",
                "LAB",
            )
        return ImageCms.applyTransform(rgb_img, _RGB_TO_LAB_TRANSFORM)


def _channel_stats(img, names) -> dict:
    """Return ``mean_<name>`` / ``std_<name>`` for each channel of ``img``.

    NOTE(review): the statistics are taken over the raw 8-bit values PIL
    stores for the image mode - confirm the scaling/signedness of the LAB
    a/b channels before interpreting them.
    """
    px = np.asarray(img).astype(np.float32)
    stats = {}
    for k, name in enumerate(names):
        channel = px[:, :, k]
        stats[f"mean_{name}"] = channel.mean()
        stats[f"std_{name}"] = channel.std()
    return stats


def extract_image_features(rel_path: str) -> dict:
    """Compute hand-crafted colour/texture statistics for one image.

    The image is opened from ``COMP_DIR / rel_path`` and converted to RGB.
    Features are computed in independent groups (RGB/greenness, HSV, LAB,
    edge/Laplacian texture, grey-level histogram entropy). Each group is
    guarded separately so that a failure in one group leaves only that
    group's features at their 0.0 default instead of zeroing every feature
    that follows it.

    Args:
        rel_path: Image path relative to ``COMP_DIR``.

    Returns:
        A dict with a fixed set of 26 feature names. Every feature defaults
        to 0.0; if the image cannot be read at all, all values stay 0.0.

    Warns:
        UserWarning: when the image cannot be read or a feature group fails.
    """
    import warnings

    img_path = COMP_DIR / rel_path
    feats = {"mean_R":0.0,"mean_G":0.0,"mean_B":0.0,"std_R":0.0,"std_G":0.0,"std_B":0.0,
             "excess_green":0.0,"mean_gray":0.0,"mean_g_fraction":0.0,"prop_green_pixels":0.0,"p90_excess_green":0.0,
             "mean_H":0.0,"mean_S":0.0,"mean_V":0.0,"std_H":0.0,"std_S":0.0,"std_V":0.0,
             "mean_L":0.0,"mean_A":0.0,"mean_Blab":0.0,"std_L":0.0,"std_A":0.0,"std_Blab":0.0,
             "edge_density":0.0,"lap_var":0.0,"entropy":0.0}

    # Reading is the only step that makes the whole feature vector unusable.
    try:
        with Image.open(img_path) as img:
            rgb_img = img.convert("RGB")
        arr = np.asarray(rgb_img).astype(np.float32)
        R, G, B = arr[:, :, 0], arr[:, :, 1], arr[:, :, 2]
        gray = 0.299*R + 0.587*G + 0.114*B
    except Exception as exc:
        warnings.warn(
            f"extract_image_features: could not read {img_path} ({exc!r}); "
            "all features left at 0.0"
        )
        return feats

    def _rgb_group():
        # Per-channel statistics plus greenness indices derived from RGB.
        eg = 2*G - R - B
        denom = R + G + B + 1e-6  # epsilon avoids division by zero on black pixels
        return {
            "mean_R": R.mean(), "mean_G": G.mean(), "mean_B": B.mean(),
            "std_R": R.std(), "std_G": G.std(), "std_B": B.std(),
            "excess_green": eg.mean(),
            "mean_gray": gray.mean(),
            "mean_g_fraction": (G/denom).mean(),
            "prop_green_pixels": ((G > R) & (G > B)).mean(),
            "p90_excess_green": float(np.percentile(eg, 90)),
        }

    def _texture_group():
        # scipy is imported lazily so its absence only disables this group.
        from scipy.ndimage import sobel, laplace
        sob = np.hypot(sobel(gray, 0), sobel(gray, 1))
        return {
            "edge_density": (sob > sob.mean()).mean(),
            "lap_var": laplace(gray).var(),
        }

    def _entropy_group():
        # Entropy-style score of the 64-bin grey-level density histogram.
        hist, _ = np.histogram(gray, bins=64, range=(0, 255), density=True)
        hist = hist + 1e-12  # keeps log() finite for empty bins
        return {"entropy": float(-(hist*np.log(hist)).sum())}

    groups = (
        ("rgb", _rgb_group),
        ("hsv", lambda: _channel_stats(rgb_img.convert("HSV"), ("H", "S", "V"))),
        ("lab", lambda: _channel_stats(_to_lab(rgb_img), ("L", "A", "Blab"))),
        ("texture", _texture_group),
        ("entropy", _entropy_group),
    )
    for label, compute in groups:
        try:
            feats.update(compute())
        except Exception as exc:
            # The message carries no path so repeated identical failures can
            # be collapsed by the warnings filter.
            warnings.warn(
                f"extract_image_features: '{label}' features failed ({exc!r}); "
                "left at 0.0"
            )
    return feats

def safe_fill(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with no infinite or missing values.

    Infinite entries are first treated as missing. Missing entries are then
    filled with the column mean, and with 0 where the column has no mean
    (for example an all-missing column). The input frame is not modified.
    """
    finite_only = df.replace([np.inf, -np.inf], np.nan)
    column_means = finite_only.mean()
    mean_filled = finite_only.fillna(column_means)
    return mean_filled.fillna(0)


In [None]:
def _tabular_feature_frame(rel_paths) -> pd.DataFrame:
    """Build the cleaned hand-crafted feature table, one row per image path."""
    rows = []
    for rel in rel_paths:
        row = extract_image_features(rel)
        row["image_path"] = rel
        rows.append(row)
    return safe_fill(pd.DataFrame(rows).set_index("image_path"))


def _new_hgb() -> HistGradientBoostingRegressor:
    """Return an unfitted HGB regressor with the hyper-parameters used here."""
    return HistGradientBoostingRegressor(max_depth=4, learning_rate=0.1, max_iter=400, random_state=42)


def predict_tabular(train_df: pd.DataFrame,
                    y_wide_df: pd.DataFrame,
                    test_df: pd.DataFrame,
                    target_names):
    """Predict every target for the test images from hand-crafted features.

    One ``HistGradientBoostingRegressor`` per target is fitted on
    ``log1p(target)``. 5-fold out-of-fold predictions are used to fit, per
    target, a linear calibration ``a * pred + b`` in the original scale,
    which is then applied to the test predictions of models refitted on all
    training images.

    Args:
        train_df: Unused; kept so existing callers keep working.
        y_wide_df: Wide target table indexed by image path; must contain the
            columns in ``target_names``.
        test_df: Frame with an ``image_path`` column.
        target_names: Target column names, in output order.

    Returns:
        DataFrame of non-negative predictions, indexed by the unique test
        image paths (first-appearance order), one column per target.
    """
    # TRAIN features
    X = _tabular_feature_frame(y_wide_df.index.tolist()).values
    Y_log = np.log1p(y_wide_df[target_names].values)

    # Out-of-fold predictions used for calibration
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_pred_log = np.zeros_like(Y_log)
    for tr, va in kf.split(X):
        for j, t in enumerate(target_names):
            # The weight is constant within each single-target model.
            w = np.full(len(tr), TARGET_WEIGHTS[t], dtype=np.float32)
            hgb = _new_hgb()
            hgb.fit(X[tr], Y_log[tr, j], sample_weight=w)
            oof_pred_log[va, j] = hgb.predict(X[va])

    # Per-target linear calibration (slope, intercept) in the original scale
    calib = {}
    for j, t in enumerate(target_names):
        ridge = Ridge(alpha=1e-6, fit_intercept=True)
        ridge.fit(np.expm1(oof_pred_log[:, [j]]), np.expm1(Y_log[:, j:j+1]))
        calib[t] = (float(ridge.coef_[0][0]), float(ridge.intercept_[0]))

    # Full models, refitted on every training image
    models = []
    for j, t in enumerate(target_names):
        w = np.full(len(X), TARGET_WEIGHTS[t], dtype=np.float32)
        hgb = _new_hgb()
        hgb.fit(X, Y_log[:, j], sample_weight=w)
        models.append(hgb)

    # TEST features
    test_images = test_df["image_path"].unique().tolist()
    X_test = _tabular_feature_frame(test_images).values

    preds_log = np.column_stack([m.predict(X_test) for m in models])
    preds = np.clip(np.expm1(preds_log), 0, None)

    # Apply the calibration
    for j, t in enumerate(target_names):
        a, b = calib[t]
        preds[:, j] = a * preds[:, j] + b

    # The calibration intercept/slope may push values below zero, so the
    # non-negativity constraint is enforced again on the final output.
    preds = np.clip(preds, 0, None)

    preds_tab = pd.DataFrame(preds, index=test_images, columns=target_names)
    return preds_tab


In [None]:
class CSIROTrainDataset(Dataset):
    """Training dataset yielding ``(image, target)`` pairs.

    Args:
        image_paths: Image paths relative to ``COMP_DIR``.
        targets_log: Per-image targets, indexable in the same order as
            ``image_paths``.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, image_paths, targets_log, transform=None):
        self.image_paths = image_paths
        self.targets_log = targets_log
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def _safe(self, p):
        """Load ``COMP_DIR / p`` as RGB; on any failure return a black 224x224 image."""
        pth = COMP_DIR / p
        try:
            with Image.open(pth) as opened:
                return opened.convert("RGB")
        except Exception:
            return Image.new("RGB", (224, 224), (0, 0, 0))

    def __getitem__(self, i):
        image = self._safe(self.image_paths[i])
        image = self.transform(image) if self.transform else image
        target = torch.tensor(self.targets_log[i], dtype=torch.float32)
        return image, target

class CSIROTestDataset(Dataset):
    """Inference dataset yielding images only.

    Args:
        image_paths: Image paths relative to ``COMP_DIR``.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def _safe(self, p):
        """Load ``COMP_DIR / p`` as RGB; on any failure return a black 224x224 image."""
        pth = COMP_DIR / p
        try:
            with Image.open(pth) as opened:
                return opened.convert("RGB")
        except Exception:
            return Image.new("RGB", (224, 224), (0, 0, 0))

    def __getitem__(self, i):
        image = self._safe(self.image_paths[i])
        return self.transform(image) if self.transform else image

class SmallCNN(nn.Module):
    """Four-stage convolutional regressor.

    ``features`` stacks four 3x3 convolutions (3->16->32->64->128 channels),
    each followed by ReLU; the first three stages end in 2x2 max pooling and
    the last one in global average pooling. ``head`` flattens the result and
    maps it through a 128-unit hidden layer to ``n_outputs`` values.

    Args:
        n_outputs: Number of regression outputs.
    """

    def __init__(self, n_outputs):
        super().__init__()
        widths = (3, 16, 32, 64, 128)
        last_stage = len(widths) - 2
        layers = []
        for stage, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Conv2d(c_in, c_out, 3, padding=1))
            layers.append(nn.ReLU())
            if stage < last_stage:
                layers.append(nn.MaxPool2d(2))
            else:
                layers.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.features = nn.Sequential(*layers)
        self.head = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, n_outputs),
        )

    def forward(self, x):
        pooled = self.features(x)
        return self.head(pooled)

def get_transforms():
    """Build the image pipelines for the small CNN.

    Returns:
        ``(train_tf, test_tf)``: both resize to 224x224 and convert to a
        tensor; the training pipeline additionally applies a random
        horizontal flip with probability 0.5 between those two steps.
    """
    size = (224, 224)
    train_steps = [T.Resize(size), T.RandomHorizontalFlip(0.5), T.ToTensor()]
    eval_steps = [T.Resize(size), T.ToTensor()]
    return T.Compose(train_steps), T.Compose(eval_steps)

def tta_predict(model, image_tensor, n=4):
    """Average the model output over up to four augmented views of one image.

    The views are, in order: the image itself, a flip along dim 3, a flip
    along dim 2, and a 90-degree rotation in the (2, 3) plane of the batched
    tensor. Only the first ``n`` views are evaluated, so no forward pass is
    spent on views that would be discarded.

    Args:
        model: Callable mapping a batched tensor to a ``(1, n_outputs)`` tensor.
        image_tensor: Single un-batched image tensor; a batch dimension is
            added and the tensor is moved to ``device``.
        n: Number of views to average; values above 4 behave like 4.

    Returns:
        NumPy array holding the mean prediction over the evaluated views.

    Raises:
        ValueError: if ``n`` is smaller than 1 (there would be nothing to
            average).
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    x = image_tensor.unsqueeze(0).to(device)
    views = (
        lambda t: t,
        lambda t: torch.flip(t, dims=[3]),
        lambda t: torch.flip(t, dims=[2]),
        lambda t: torch.rot90(t, k=1, dims=[2, 3]),
    )
    with torch.no_grad():
        outs = [model(view(x)).cpu().numpy()[0] for view in views[:n]]
    return np.mean(outs, axis=0)

def train_cnn_predict(y_wide_df, test_df, target_names,
                      epochs=8, batch_size=16, lr=1e-3, wd=1e-5, use_tta=True):
    """Train ``SmallCNN`` on all training images and predict the test images.

    The network regresses ``log1p(target)``. The loss is a per-target
    weighted squared error in log space plus ``0.3`` times a consistency
    term that penalises, in the original scale, the gap between the
    predicted total and the sum of the green, dead and clover predictions.

    Args:
        y_wide_df: Wide target table indexed by image path; must contain the
            columns in ``target_names``.
        test_df: Frame with an ``image_path`` column.
        target_names: Target names; must include ``Dry_Total_g``,
            ``Dry_Green_g``, ``Dry_Dead_g`` and ``Dry_Clover_g``.
        epochs: Number of passes over the training set.
        batch_size: Training batch size.
        lr: Adam learning rate.
        wd: Adam weight decay.
        use_tta: Average test predictions over augmented views when True.

    Returns:
        DataFrame of non-negative predictions, indexed by the unique test
        image paths, one column per target.
    """
    names = list(target_names)
    img_paths = y_wide_df.index.tolist()
    # Select the columns explicitly so the target order always matches
    # ``target_names`` (and therefore the weights and indices below).
    Y_log = np.log1p(y_wide_df[names].values)

    train_tf, test_tf = get_transforms()
    ds = CSIROTrainDataset(img_paths, Y_log, transform=train_tf)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)

    model = SmallCNN(n_outputs=len(names)).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # Per-target loss weights, aligned with the targets passed to this call.
    w = torch.tensor([TARGET_WEIGHTS[t] for t in names],
                     dtype=torch.float32, device=device).view(1, -1)
    iT = names.index("Dry_Total_g")
    iG = names.index("Dry_Green_g")
    iD = names.index("Dry_Dead_g")
    iC = names.index("Dry_Clover_g")
    lambda_cons = 0.3

    model.train()
    for ep in range(epochs):
        running = 0.0
        for xb, yb in dl:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            pred = model(xb)  # predictions in log1p space
            base = ((pred - yb)**2 * w).mean()
            # NOTE(review): expm1 of an unbounded output can overflow early
            # in training - consider clamping if the loss turns into NaN.
            pred_lin = torch.expm1(pred)
            parts = pred_lin[:, iG] + pred_lin[:, iD] + pred_lin[:, iC]
            cons = ((pred_lin[:, iT] - parts)**2).mean()
            loss = base + lambda_cons*cons
            loss.backward()
            opt.step()
            running += loss.item()*xb.size(0)
        print(f"[CNN] Epoch {ep+1}/{epochs} - loss_w(log1p + cons_orig): {running/len(ds):.4f}")

    # TEST: one image per batch so each can be augmented individually
    test_images = test_df["image_path"].unique().tolist()
    ds_test = CSIROTestDataset(test_images, transform=test_tf)
    dl_test = DataLoader(ds_test, batch_size=1, shuffle=False, num_workers=2, pin_memory=True)

    model.eval()
    preds_log = []
    with torch.no_grad():
        for xb in dl_test:
            x = xb[0]
            if use_tta:
                preds_log.append(tta_predict(model, x, n=4)[None, :])
            else:
                preds_log.append(model(x.unsqueeze(0).to(device)).cpu().numpy())
    preds_log = np.concatenate(preds_log, axis=0)
    preds = np.clip(np.expm1(preds_log), 0, None)
    preds_cnn = pd.DataFrame(preds, index=test_images, columns=target_names)
    return preds_cnn


In [None]:
DINO_DIR = "/kaggle/input/dinov2/pytorch/base/1"
assert os.path.exists(DINO_DIR), "No existe la carpeta de DINO en /kaggle/input/dinov2/pytorch/base/1. Añádela en Add data → Models."

# Input resolution shared by the model definition and the transform below.
IMG_SIZE_DINO = 518

# Build the DINOv2 ViT-B/14 with num_classes=0 so the model returns embeddings.
arch = "vit_base_patch14_dinov2"
model_dino = timm.create_model(arch, pretrained=False, num_classes=0, img_size=IMG_SIZE_DINO)
model_dino.eval().to(device)

# Locate local weights (safetensors/pt/bin anywhere under DINO_DIR). Each
# extension's matches are sorted so the chosen file is deterministic.
weights = []
for ext in ("*.safetensors","*.pt","*.bin"):
    weights += sorted(glob.glob(os.path.join(DINO_DIR, "**", ext), recursive=True))
if weights:
    wpath = weights[0]
    print("Cargando pesos DINO:", os.path.basename(wpath))
    try:
        # timm's helper picks the reader from the file extension (plain
        # torch.load cannot parse .safetensors files) and unwraps the usual
        # "state_dict"/"model" containers.
        sd = timm.models.load_state_dict(wpath)
        load_result = model_dino.load_state_dict(sd, strict=False)
        # strict=False skips unmatched names without complaint, so report
        # how many of the model's tensors were actually found.
        n_expected = len(model_dino.state_dict())
        n_loaded = n_expected - len(load_result.missing_keys)
        print(f"Pesos DINO cargados: {n_loaded}/{n_expected} tensores del modelo encontrados en el checkpoint.")
        if n_loaded == 0:
            print("Aviso: ningún nombre del checkpoint coincide con el modelo; el extractor usará pesos aleatorios.")
    except Exception as e:
        print("Aviso: no pude mapear todos los nombres; continuaré con pesos parciales/none.", e)
else:
    print("Aviso: no se encontraron pesos en la carpeta; el extractor rendirá menos.")

# 518px preprocessing: resize, tensor conversion, per-channel normalisation.
dino_tf = T.Compose([
    T.Resize((IMG_SIZE_DINO, IMG_SIZE_DINO)),
    T.ToTensor(),
    T.Normalize(mean=(0.485,0.456,0.406), std=(0.229,0.224,0.225)),
])

class ImgOnlyDS(Dataset):
    """Dataset yielding ``dino_tf``-preprocessed images for embedding extraction.

    Args:
        rel_paths: Image paths relative to ``COMP_DIR``.
    """

    def __init__(self, rel_paths):
        self.rel = rel_paths

    def __len__(self):
        return len(self.rel)

    def __getitem__(self, i):
        p = COMP_DIR / self.rel[i]
        try:
            # The context manager closes the file handle once the pixels
            # have been copied into the converted image.
            with Image.open(p) as src:
                im = src.convert("RGB")
        except Exception:
            # Best-effort fallback: an unreadable image becomes a black one.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            im = Image.new("RGB", (IMG_SIZE_DINO, IMG_SIZE_DINO), (0, 0, 0))
        return dino_tf(im)

def dino_embeddings(paths, bs=8):
    """Embed the images at ``paths`` with ``model_dino``.

    Images are loaded through ``ImgOnlyDS`` in their given order and pushed
    through the model in batches without gradient tracking. The default
    batch size of 8 was chosen for memory reasons at 518px input.

    Args:
        paths: Image paths relative to ``COMP_DIR``.
        bs: Batch size.

    Returns:
        NumPy array with the per-batch model outputs concatenated along
        axis 0 (one row per image).
    """
    loader = DataLoader(ImgOnlyDS(paths), batch_size=bs, shuffle=False,
                        num_workers=2, pin_memory=True)
    model_dino.eval()
    chunks = []
    with torch.no_grad():
        for batch in loader:
            out = model_dino(batch.to(device))
            # Some models return a sequence; the first element is used then.
            emb = out[0] if isinstance(out, (list, tuple)) else out
            chunks.append(emb.detach().cpu().numpy())
    return np.concatenate(chunks, axis=0)

# Train/test embeddings, one row per unique image.
train_imgs = y_wide.index.to_list()
test_imgs  = test["image_path"].drop_duplicates().to_list()
E_train, E_test = (dino_embeddings(paths, bs=8) for paths in (train_imgs, test_imgs))

# Standardise with statistics estimated on the training embeddings only.
scaler = StandardScaler().fit(E_train)
E_train_std = scaler.transform(E_train)
E_test_std  = scaler.transform(E_test)

# One ridge regression per target, fitted on the raw (non-log) targets.
Y = y_wide[target_names].values
preds_dino = np.zeros((len(test_imgs), len(target_names)), dtype=np.float32)
for j, t in enumerate(target_names):
    # NOTE(review): a constant sample_weight presumably rescales the data
    # term relative to alpha, i.e. changes the effective regularisation per
    # target - confirm this is intended.
    w = np.full(len(E_train_std), TARGET_WEIGHTS[t], dtype=np.float32)
    reg = Ridge(alpha=1.0, fit_intercept=True)
    reg.fit(E_train_std, Y[:, j], sample_weight=w)
    preds_dino[:, j] = reg.predict(E_test_std)

# Negative predictions are raised to zero.
preds_dino = preds_dino.clip(min=0)
preds_dino_df = pd.DataFrame(preds_dino, index=test_imgs, columns=target_names)
preds_dino_df.head(3)


In [None]:
print(">> Entrenando TABULAR (HGB) ...")
preds_tab = predict_tabular(train, y_wide, test, target_names)

print("\n>> Entrenando Small CNN ...")
preds_cnn = train_cnn_predict(
    y_wide, test, target_names,
    epochs=8, batch_size=16, lr=1e-3, wd=1e-5, use_tta=True,
)

# Simple per-target blend; each tuple is (tab, cnn, dino). Adjust if the
# leaderboard score changes.
BLEND = {
    "Dry_Green_g": (0.40, 0.30, 0.30),
    "Dry_Dead_g":  (0.40, 0.30, 0.30),
    "Dry_Clover_g":(0.40, 0.30, 0.30),
    "GDM_g":       (0.35, 0.35, 0.30),
    "Dry_Total_g": (0.30, 0.30, 0.40),  # the total leans slightly more on DINO
}

# Start from a copy of the DINO frame so index and columns are already set,
# then overwrite every target column with the weighted blend.
preds_ens = preds_dino_df.copy()
for t in target_names:
    a, b, c = BLEND[t]
    preds_ens[t] = (
        a * preds_tab.loc[test_imgs, t].values
        + b * preds_cnn.loc[test_imgs, t].values
        + c * preds_dino_df.loc[test_imgs, t].values
    )

# Soft consistency for the total: pull it 30% towards the sum of its parts.
total_parts = preds_ens["Dry_Green_g"] + preds_ens["Dry_Dead_g"] + preds_ens["Dry_Clover_g"]
preds_ens["Dry_Total_g"] = 0.7*preds_ens["Dry_Total_g"] + 0.3*total_parts
preds_ens = preds_ens.clip(lower=0)

preds_ens.head(3)


In [None]:
# Wide -> long: one row per (image_path, target_name) with its prediction.
preds_long = (
    preds_ens.rename_axis("image_path")
    .reset_index()
    .melt(id_vars="image_path", var_name="target_name", value_name="pred")
)

# Attach each prediction to its test row and keep only the submission columns.
submission = test[["sample_id","image_path","target_name"]].merge(
    preds_long, on=["image_path","target_name"], how="left"
)
submission = submission[["sample_id","pred"]].rename(columns={"pred":"target"})

# Sanity checks: one row per test row, and no missing/infinite predictions
# (a left-merge miss would show up as NaN here).
assert submission.shape[0] == test.shape[0], "Submission size mismatch"
assert np.isfinite(submission["target"]).all(), "NaN/Inf en submission"
submission["target"] = submission["target"].clip(lower=0)

submission.to_csv("submission.csv", index=False)
print("submission.csv escrito. Filas:", len(submission))
display(submission.head())
