In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**SECTION 1 – SETUP**

In [None]:
# ====================================================
# 1. SETUP & LIBRARIES
# ====================================================
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
from torchvision import models, transforms

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

# Run on the GPU when CUDA reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


**SECTION 2 – LOAD & CLEAN DATA**

In [None]:
# ====================================================
# 2. LOAD & PREPARE DATA
# ====================================================
train = pd.read_csv("/kaggle/input/csiro-biomass/train.csv")
print("Original rows:", len(train))
display(train.head())

# Reshape long → wide: one row per image_path, one column per target_name.
# pivot_table aggregates duplicate (image_path, target_name) pairs with its
# default aggregation (mean).
train_wide = train.pivot_table(
    values="target",
    index=["image_path"],
    columns="target_name",
)
train_wide = train_wide.reset_index()
train_wide.columns.name = None  # drop the leftover "target_name" axis label

target_cols = [
    'Dry_Green_g',
    'Dry_Dead_g',
    'Dry_Clover_g',
    'GDM_g',
    'Dry_Total_g',
]

# Force every target column to float32; values that cannot be parsed as
# numbers (and missing cells from the pivot) become 0.
for target_name in target_cols:
    numeric = pd.to_numeric(train_wide[target_name], errors="coerce")
    train_wide[target_name] = numeric.fillna(0).astype("float32")

print("Unique images:", len(train_wide))
display(train_wide.head())


**SECTION 3 – IMAGE TRANSFORMS & FROZEN MODEL**

In [None]:
# ====================================================
# 3. DEFINE IMAGE TRANSFORMS & FROZEN RESNET
# ====================================================
# Deterministic preprocessing applied to every image before it reaches the
# network: resize to 224x224, convert to a tensor, then normalise each channel
# with the mean/std constants below (the ImageNet statistics that torchvision
# documents for its IMAGENET1K_V1 weights).
# NOTE: this pipeline used to be defined twice in this cell with identical
# contents; it is now defined once.
val_tfms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])

# `torch`, `os` and `models` are already imported by the setup cell (section 1),
# so they are not re-imported here.

# Ensure torch uses cache instead of internet
torch.hub.set_dir("/root/.cache/torch/hub")
os.environ["TORCH_HOME"] = "/root/.cache/torch"

try:
    resnet = models.resnet50(weights='IMAGENET1K_V1')
    print("✅ Loaded pretrained ResNet50 weights (from cache)")
except Exception as e:
    # Deliberate best-effort fallback so the notebook still runs end to end.
    # A randomly initialised backbone yields uninformative embeddings, so the
    # underlying error is reported instead of being silently discarded.
    print("⚠️ Could not download weights. Using untrained ResNet50 instead.")
    print(f"   Reason: {type(e).__name__}: {e}")
    resnet = models.resnet50(weights=None)

# Remove the classification head (keep feature extractor): every child module
# except the last one is kept.
resnet = torch.nn.Sequential(*list(resnet.children())[:-1])
# Switch to eval mode and move to the selected device. The result is assigned
# so the cell does not echo the full module repr as its output.
resnet = resnet.eval().to(device)


**SECTION 4 – EXTRACT IMAGE EMBEDDINGS**

In [None]:
# ====================================================
# 4. EXTRACT IMAGE EMBEDDINGS
# ====================================================
def get_embedding(img_path, root_dir="/kaggle/input/csiro-biomass"):
    """Compute the frozen-backbone feature vector for a single image.

    Parameters
    ----------
    img_path : str
        Path of the image relative to ``root_dir``.
    root_dir : str, optional
        Directory that ``img_path`` is joined onto. Defaults to the
        competition input folder used throughout this notebook, so existing
        one-argument calls behave as before.

    Returns
    -------
    numpy.ndarray
        Output of the notebook-level ``resnet`` for the image, squeezed of
        singleton dimensions and copied to CPU memory (2048 values according
        to the original inline comment).

    Notes
    -----
    Relies on the notebook-level names ``val_tfms``, ``resnet`` and
    ``device`` defined in earlier cells.
    """
    full_path = os.path.join(root_dir, img_path)
    # The context manager releases the file handle deterministically;
    # convert() returns a separate in-memory image that stays valid afterwards.
    with Image.open(full_path) as raw_img:
        rgb_img = raw_img.convert("RGB")
    # Add a leading batch dimension of size 1 before the forward pass.
    batch = val_tfms(rgb_img).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only, no autograd bookkeeping
        emb = resnet(batch).squeeze().cpu().numpy()  # 2048-d vector
    return emb

# One embedding per training image, in the same row order as train_wide so
# features and targets stay aligned.
embeddings = [
    get_embedding(image_path)
    for image_path in tqdm(train_wide["image_path"], desc="Extracting train embeddings")
]

# Stack the per-image vectors into a single feature matrix and pull the
# target columns out as a plain array.
X_emb = np.stack(embeddings)
y = train_wide[target_cols].values
print("Embeddings shape:", X_emb.shape, " | Targets shape:", y.shape)


**SECTION 5 – TRAIN REGRESSOR ON EMBEDDINGS**

In [None]:
# ====================================================
# 5. TRAIN REGRESSOR ON EMBEDDINGS
# ====================================================
# Hold out 15% of the images for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_emb,
    y,
    test_size=0.15,
    random_state=42,
)

# Hyper-parameters shared by the per-target XGBoost models.
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1,
)
# MultiOutputRegressor fits one independent regressor per target column.
reg = MultiOutputRegressor(XGBRegressor(**xgb_params))

print("Training XGBoost on frozen embeddings...")
reg.fit(X_train, y_train)

# Score each target separately, then combine the scores with fixed weights
# listed in the same order as target_cols.
y_pred = reg.predict(X_val)
r2s = r2_score(y_val, y_pred, multioutput='raw_values')
weights = np.array([0.1,0.1,0.1,0.2,0.5])
weighted_r2 = (weights * r2s).sum()

for target_name, score in zip(target_cols, r2s):
    print(f"{target_name:15s}: R² = {score:.3f}")
print(f"Weighted R² = {weighted_r2:.3f}")


**SECTION 6 – PREDICT TEST & BUILD SUBMISSION**

In [None]:
# ====================================================
# 6. PREDICT TEST & SAVE SUBMISSION
# ====================================================
test = pd.read_csv("/kaggle/input/csiro-biomass/test.csv")
# Each image appears once here even if test.csv lists it on several rows.
unique_imgs = test["image_path"].unique()

# Extract embeddings for test set (same order as unique_imgs)
test_emb = []
for path in tqdm(unique_imgs, desc="Extracting test embeddings"):
    test_emb.append(get_embedding(path))
X_test_emb = np.stack(test_emb)

# Predict: one row per image, one column per entry of target_cols
y_test_pred = reg.predict(X_test_emb)

# Convert to submission format: one row per (image, target) pair with
# sample_id "<image id>__<target name>".
rows = []
for img, preds in zip(unique_imgs, y_test_pred):
    # Image id = file name without directory or extension. splitext removes
    # only the final extension, whatever it is (.jpg, .JPG, .jpeg, .png, ...),
    # whereas the previous str.replace(".jpg", "") removed ".jpg" anywhere in
    # the name and left any other extension in the id.
    file_name = img.rsplit("/", 1)[-1]
    img_id = os.path.splitext(file_name)[0]
    for t,val in zip(target_cols, preds):
        rows.append({"sample_id": f"{img_id}__{t}", "target": float(val)})

sub = pd.DataFrame(rows)
sub.to_csv("/kaggle/working/submission.csv", index=False)
print("✅ submission.csv saved!")
sub.head()
