In [None]:
import copy
import gc
import json
import math
import os
import random
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# Suppress every Python warning for the rest of the notebook run.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/torch — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Location of the competition data; adjust COMP_DIR if it is mounted elsewhere.
COMP_DIR = Path("/kaggle/input/csiro-biomass")
TRAIN_CSV = COMP_DIR / "train.csv"
TEST_CSV = COMP_DIR / "test.csv"
# `image_path` values from the CSVs are joined onto this directory by img_path().
IMG_ROOT = COMP_DIR

# Regression targets, in the column order used for `y` and the model output.
TARGETS = [
    "Dry_Green_g",
    "Dry_Dead_g",
    "Dry_Clover_g",
    "GDM_g",
    "Dry_Total_g",
]
# Per-target weight applied to each R2 score in weighted_r2().
WEIGHTS = dict(zip(TARGETS, (0.1, 0.1, 0.1, 0.2, 0.5)))

# Seed every RNG the notebook touches.
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Read the raw train and test tables from the competition directory.
trn, tst = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Reshape the long training table (a `target_name` / `target` pair per row)
# into one row per sample with one column per target.
# pivot_table aggregates duplicate (index, target_name) entries with its default aggfunc.
id_cols = [
    "sample_id",
    "image_path",
    "Sampling_Date",
    "State",
    "Species",
    "Pre_GSHH_NDVI",
    "Height_Ave_cm",
]
wide = trn.pivot_table(index=id_cols, columns="target_name", values="target").reset_index()

# Every expected target must have become a column.
missing_targets = set(TARGETS) - set(wide.columns)
assert not missing_targets

In [None]:
# Parse the sampling date once and derive calendar features from it.
sampling_dt = pd.to_datetime(wide["Sampling_Date"])
wide["Sampling_Date"] = sampling_dt
wide["month"] = sampling_dt.dt.month
wide["year"] = sampling_dt.dt.year
# Coarse 3-month season code: Dec-Feb -> 0, Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3.
wide["season"] = wide["month"].mod(12).floordiv(3).astype(int)

In [None]:
# Tokenize species (take first 3 tokens to limit sparsity)
def species_tokens(s):
    """Split an underscore-separated species string into at most three tokens.

    Args:
        s: Species label such as ``"Ryegrass_Clover"``. Anything that is not a
            string (``None``, NaN) is treated as missing.

    Returns:
        A list with the first three non-empty tokens, or ``["UNK"]`` when the
        input is missing, empty, or made up only of underscores.
    """
    if not isinstance(s, str):
        return ["UNK"]
    # "".split("_") yields [""], which is truthy, so empty tokens must be
    # filtered out before the fallback check or "UNK" can never be returned.
    toks = [tok for tok in s.split("_") if tok]
    return toks[:3] if toks else ["UNK"]
# Missing species become the literal "UNK" before tokenising.
species_filled = wide["Species"].fillna("UNK")
wide["Species_tokens"] = species_filled.apply(species_tokens)

In [None]:
# Columns that make up the tabular model input.
# Numeric columns are used as-is.
num_cols = [
    "Pre_GSHH_NDVI",
    "Height_Ave_cm",
    "month",
]
# Low-cardinality categoricals that get one-hot encoded; species is handled
# separately as a bag of tokens.
cat_cols = [
    "State",
    "season",
]

In [None]:
# One-hot encode the small categorical columns into a dense frame that shares
# the row index of `wide`. Categories unseen at fit time encode to all zeros.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
cat_frame = wide[cat_cols].fillna("UNK")
ohe_mat = ohe.fit_transform(cat_frame)
ohe_cols = ohe.get_feature_names_out(cat_cols)
ohe_df = pd.DataFrame(data=ohe_mat, index=wide.index, columns=ohe_cols)

In [None]:
# Species bag-of-words: one 0.0/1.0 indicator column per token seen in training.
sp_vocab = sorted({tok for toks in wide["Species_tokens"] for tok in toks})
for tok in sp_vocab:
    wide[f"sp_{tok}"] = [float(tok in toks) for toks in wide["Species_tokens"]]

# Final tabular matrix: numeric columns, one-hot categoricals, then species indicators.
species_indicator_cols = [col for col in wide.columns if col.startswith("sp_")]
tab_parts = [wide[num_cols], ohe_df, wide[species_indicator_cols]]
tab_df = pd.concat(tab_parts, axis=1)
tab_cols = list(tab_df.columns)

In [None]:
# TARGETS comes from the configuration cell; it is not redefined here.

# Keep only samples that have at least one observed target value.
has_target = wide[TARGETS].notna().any(axis=1).to_numpy()

# tab_df was built row-for-row from `wide` before this filter, and the datasets
# index wide / tab_df / y by position. Drop the same rows from tab_df so the
# tabular features stay attached to the right sample.
tab_df = tab_df.loc[has_target].reset_index(drop=True)
wide = wide.loc[has_target].reset_index(drop=True)
assert len(tab_df) == len(wide), "tab_df and wide must stay row-aligned"

# Remaining gaps (samples with only some targets recorded) are filled with 0.
wide[TARGETS] = wide[TARGETS].fillna(0)

# The model is trained on log1p-transformed targets.
y = np.log1p(wide[TARGETS].values.astype("float32"))

# Quantile bins of Dry_Total_g (up to 10), used as the stratification labels for CV.
bins = pd.qcut(wide["Dry_Total_g"], q=10, duplicates="drop").cat.codes.values


In [None]:
# Preview the first few log1p-transformed target rows rather than dumping the whole array.
y[:5]

In [None]:
def img_path(rel):
    """Resolve an `image_path` value from the CSVs against IMG_ROOT."""
    return IMG_ROOT.joinpath(rel)

In [None]:
IMG_SIZE = 384

# Channel statistics used by torchvision's ImageNet-pretrained weights. The
# pretrained EfficientNet backbone expects inputs normalised this way; feeding
# raw [0, 1] tensors shifts its input distribution.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

# Training-time augmentation: mild random crop, flips and colour jitter.
train_tfms = transforms.Compose([
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.05),
    transforms.ToTensor(),
    normalize,
])

# Deterministic pipeline for validation and test: resize slightly larger, then centre crop.
valid_tfms = transforms.Compose([
    transforms.Resize(int(IMG_SIZE * 1.14)),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    normalize,
])

In [None]:
class PastureDataset(Dataset):
    """Image + tabular dataset over a positional subset of the training rows.

    Reads the module-level `wide`, `tab_df` and `y`, which must be row-aligned
    because all three are indexed with the same positions.

    Args:
        df_idx: Integer positions of the rows to include.
        is_train: When True, items are ``(img, tab, tgt)`` and the training
            augmentations are used; otherwise items are ``(img, tab)`` and the
            deterministic validation transforms are used.

    Raises:
        ValueError: If `wide`, `tab_df` and `y` do not have the same length.
    """

    def __init__(self, df_idx, is_train):
        # Fail loudly rather than silently pairing images with another row's features.
        if not (len(wide) == len(tab_df) == len(y)):
            raise ValueError(
                f"wide ({len(wide)}), tab_df ({len(tab_df)}) and y ({len(y)}) are not row-aligned"
            )
        self.df = wide.iloc[df_idx].reset_index(drop=True)
        self.tab = tab_df.iloc[df_idx].values.astype("float32")
        self.targets = y[df_idx]
        self.is_train = is_train
        self.tfms = train_tfms if is_train else valid_tfms
        # Cache the paths so __getitem__ does not build a pandas row per item.
        self.paths = self.df["image_path"].tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        # The context manager closes the file handle even if decoding fails.
        with Image.open(img_path(self.paths[i])) as raw:
            img = raw.convert("RGB")
        img = self.tfms(img)
        tab = torch.from_numpy(self.tab[i])
        if self.is_train:
            tgt = torch.from_numpy(self.targets[i])  # one value per target, in log1p space
            return img, tab, tgt
        return img, tab

In [None]:
# Set to False when the pretrained weights cannot be downloaded (e.g. offline).
USE_PRETRAINED = True
backbone_weights = models.EfficientNet_B0_Weights.DEFAULT if USE_PRETRAINED else None
cnn = models.efficientnet_b0(weights=backbone_weights)

# Remember the classifier's input width, then replace the classifier with a
# pass-through so the network returns the features that fed it.
in_feats = cnn.classifier[1].in_features
cnn.classifier = nn.Identity()

In [None]:
tab_in = len(tab_cols)
class FusionModel(nn.Module):
    """Late-fusion regressor over an image and a tabular feature vector.

    CNN image features are concatenated with an MLP embedding of the tabular
    features and passed through an MLP head that outputs 5 values, one per
    target (trained against log1p targets).

    Each instance owns a deep copy of the module-level `cnn` template. Sharing
    the single global module would let backbone weights trained in one CV fold
    carry over into the next fold (whose validation rows it has already seen)
    and into the final full-data fit.
    """

    def __init__(self):
        super().__init__()
        # Private copy: training this model must not modify the template backbone.
        self.cnn = copy.deepcopy(cnn)
        self.tab = nn.Sequential(
            nn.Linear(tab_in, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True)
        )
        self.head = nn.Sequential(
            nn.Linear(in_feats + 128, 256), nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256, 5)  # 5 targets (log space)
        )

    def forward(self, x_img, x_tab):
        """Return the 5 per-target predictions for a batch of images and tabular rows."""
        f_img = self.cnn(x_img)
        f_tab = self.tab(x_tab)
        f = torch.cat([f_img, f_tab], dim=1)
        return self.head(f)

In [None]:
def weighted_r2(y_true, y_pred):
    """Weighted sum of per-target R2 scores.

    Both arrays hold real-space (not log) values, with one column per entry of
    TARGETS in that order.

    Returns:
        ``(total, scores)``: the WEIGHTS-weighted sum of the per-target R2
        values, and a dict mapping each target name to its own R2.
    """
    scores = {
        name: r2_score(y_true[:, col], y_pred[:, col])
        for col, name in enumerate(TARGETS)
    }
    total = 0.0
    for name in TARGETS:
        total += WEIGHTS[name] * scores[name]
    return total, scores

In [None]:
def train_one_fold(tr_idx, va_idx, epochs=6, bs=16, lr=2e-4):
    """Train a fresh FusionModel on one split and keep the best validation epoch.

    Args:
        tr_idx: Row positions used for training.
        va_idx: Row positions used for validation.
        epochs: Number of training epochs (must be >= 1).
        bs: Training batch size; validation uses twice this.
        lr: AdamW learning rate.

    Returns:
        ``(best, pred_real)`` where ``best`` is ``(score, info)`` with ``info``
        holding ``"state_dict"`` (a CPU snapshot of the weights), ``"per"``
        (per-target R2) and ``"ep"`` (epoch number) for the best epoch, and
        ``pred_real`` is that same epoch's validation predictions after expm1.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be >= 1")

    tr_ds = PastureDataset(tr_idx, True)
    va_ds = PastureDataset(va_idx, False)
    tr_ld = DataLoader(tr_ds, batch_size=bs, shuffle=True, num_workers=2, pin_memory=True)
    va_ld = DataLoader(va_ds, batch_size=bs*2, shuffle=False, num_workers=2, pin_memory=True)

    model = FusionModel().to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

    # Ground truth in real space does not change between epochs.
    gt_real = np.expm1(y[va_idx])
    best = (-1e9, None)
    best_pred = None

    for ep in range(1, epochs+1):
        model.train()
        for img, tab, tgt in tr_ld:
            img, tab, tgt = img.to(device), tab.to(device), tgt.to(device)
            opt.zero_grad()
            out = model(img, tab)
            loss = F.mse_loss(out, tgt)  # log-space MSE
            loss.backward()
            opt.step()

        # validate
        model.eval()
        preds = []
        with torch.no_grad():
            for img, tab in va_ld:
                img, tab = img.to(device), tab.to(device)
                preds.append(model(img, tab).cpu().numpy())  # log preds
        # back-transform from log1p space
        pred_real = np.expm1(np.vstack(preds))
        wscore, per = weighted_r2(gt_real, pred_real)

        if best[1] is None or wscore > best[0]:
            # state_dict() returns references to the live parameters, which later
            # epochs keep updating in place; clone them to freeze this epoch.
            snapshot = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            best = (wscore, {"state_dict": snapshot, "per": per, "ep": ep})
            # Keep the predictions that produced the reported score.
            best_pred = pred_real
    return best, best_pred

In [None]:
# 5-fold cross-validation stratified on the Dry_Total_g bins.
# fold_preds collects the real-space validation predictions of every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
fold_preds = np.zeros_like(y)
fold_scores = []

fold_splits = skf.split(wide, bins)
for fold, (tr_idx, va_idx) in enumerate(fold_splits, start=1):
    best, va_pred = train_one_fold(tr_idx, va_idx, epochs=6, bs=16, lr=2e-4)
    fold_preds[va_idx] = va_pred
    fold_scores.append(best[0])
    print(f"Fold {fold} weighted R2: {best[0]:.4f}  per-target: {best[1]['per']}")
    # Free memory before the next fold builds its own model.
    gc.collect()
    torch.cuda.empty_cache()

# Overall score across all out-of-fold predictions.
cv_score, _ = weighted_r2(np.expm1(y), fold_preds)
print("CV weighted R2:", cv_score)

In [None]:
# Refit on every training row with the same batch size, learning rate and
# epoch count that the CV folds used.
full_idx = np.arange(len(wide))
full_ds = PastureDataset(full_idx, True)
full_ld = DataLoader(full_ds, batch_size=16, shuffle=True, num_workers=2, pin_memory=True)

model = FusionModel().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
for ep in range(6):
    model.train()
    for batch in full_ld:
        img, tab, tgt = (part.to(device) for part in batch)
        opt.zero_grad()
        pred = model(img, tab)
        loss = F.mse_loss(pred, tgt)  # log-space MSE
        loss.backward()
        opt.step()

In [None]:
# test.csv is long (one row per image/target_name pair), so each image is
# predicted once: collect the distinct image paths in first-seen order.
tst_long = pd.read_csv(TEST_CSV)
test_image_paths = tst_long["image_path"]
uniq = test_image_paths.drop_duplicates().reset_index(drop=True)

In [None]:
def lookup_row(rel_path):
    """Return the tabular feature vector for a test image.

    This baseline assumes test.csv carries no tabular columns, so `rel_path`
    is ignored and every image gets an all-zero float32 vector of length
    ``len(tab_cols)``. If NDVI/height become available at test time, build the
    real features here.
    """
    n_features = len(tab_cols)
    return np.zeros(n_features, dtype="float32")

In [None]:
class TestDataset(Dataset):
    """Inference dataset yielding ``(img, tab)`` for each test image path.

    Args:
        image_paths: Sequence of `image_path` values (resolved with img_path()).
        is_train: Accepted for signature compatibility; not used. The
            deterministic validation transforms are always applied.
    """

    def __init__(self, image_paths, is_train=False):
        self.paths = image_paths
        self.tfms  = valid_tfms

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, i):
        p = self.paths[i]
        # The context manager closes the file handle even if decoding fails.
        with Image.open(img_path(p)) as raw:
            img = raw.convert("RGB")
        img = self.tfms(img)
        # Go through lookup_row so that plugging real test features in there
        # takes effect here (it currently returns an all-zero vector).
        tab = torch.from_numpy(lookup_row(p))
        return img, tab

# Unshuffled loader so prediction rows come out in the same order as `uniq`.
test_paths_list = uniq.values.tolist()
tds = TestDataset(test_paths_list)
tld = DataLoader(tds, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

In [None]:
# Predict every unique test image once, in loader order.
model.eval()
test_batches = []
with torch.no_grad():
    for img, tab in tld:
        o = model(img.to(device), tab.to(device))        # log space
        test_batches.append(o.cpu().numpy())
test_pred = np.vstack(test_batches)                      # (N_images, 5)
# Back-transform from log1p space. A negative log-space output maps to a
# negative mass, which cannot occur because the targets are >= 0, so clip at 0.
test_pred = np.clip(np.expm1(test_pred), 0.0, None)

In [None]:
# Map image_path -> {target_name: prediction} for every predicted image.
pred_map = {p: dict(zip(TARGETS, row)) for p, row in zip(uniq, test_pred)}

# One submission row per test.csv row, looked up by (image_path, target_name).
sub = pd.DataFrame({
    "sample_id": tst_long["sample_id"].to_numpy(),
    "target": [
        float(pred_map[path][target])
        for path, target in zip(tst_long["image_path"], tst_long["target_name"])
    ],
})

# index=False: without it the DataFrame index is written as an extra unnamed
# first column, so the file no longer has just sample_id,target.
sub.to_csv("submission.csv", index=False)
