In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input mount,
# in os.walk order, then echo them one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import timm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as T
import pytorch_lightning as pl
import random
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

# ======== Seed ========
def seed_everything(seed: int = 42):
    """Seed the RNGs used in this script and request deterministic cuDNN.

    Args:
        seed: Value handed to every seeding call below.

    The function returns nothing; it only mutates global RNG state, the
    ``PYTHONHASHSEED`` environment variable and the cuDNN backend flags.
    """
    # Lightning's own helper goes first (workers=True is forwarded to it).
    pl.seed_everything(seed, workers=True)

    # Then seed each library explicitly with the same value.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Disable cuDNN autotuning and ask for deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs once, before any model or DataLoader below is created.
seed_everything(42)

# ======== Weighted R2 Metric (added missing function) ========
from sklearn.metrics import r2_score

def weighted_r2_score(y_true, y_pred, weights=None):
    """Compute per-target R² scores and their weighted average.

    Args:
        y_true: 2-D array indexed as ``[sample, target]`` with ground truth.
        y_pred: Array with the same layout holding the predictions.
        weights: Optional per-target weights forwarded to ``np.average``.
            When omitted, the placeholder weighting ``(1, 1, 1, 2, 2)`` is
            used, which only fits inputs with exactly five target columns.

    Returns:
        A tuple ``(weighted_r2, r2s)``: the weighted mean of the per-target
        scores, and the list of per-column ``r2_score`` values in column
        order.

    Raises:
        Whatever ``np.average`` raises when the number of weights does not
        match the number of target columns.
    """
    if weights is None:
        # Placeholder weighting kept as the default for backward
        # compatibility; pass the real metric weights explicitly when known.
        weights = (1, 1, 1, 2, 2)

    n_targets = y_true.shape[1]
    r2s = [r2_score(y_true[:, col], y_pred[:, col]) for col in range(n_targets)]
    weighted_r2 = np.average(r2s, weights=np.asarray(weights))
    return weighted_r2, r2s

# ======== Dataset ========
class InferenceDataset(Dataset):
    """Image-only dataset for inference, driven by a table of image paths.

    Args:
        df: Table supporting ``len()`` and ``.iloc[idx]`` whose rows expose
            an ``"image_path"`` entry, interpreted relative to ``root``.
        transform: Optional callable applied to the loaded RGB image.
        root: Directory that each ``image_path`` is joined onto. Defaults to
            the Kaggle mount that was previously hard-coded.
    """

    def __init__(self, df, transform=None, root="/kaggle/input/csiro-biomass"):
        self.df = df
        self.transform = transform
        self.root = root

    def __len__(self):
        """Return the number of rows (one image per row)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return it (transformed if set)."""
        row = self.df.iloc[idx]
        img_path = os.path.join(self.root, row["image_path"])
        # convert() returns a fully decoded copy, so the underlying file can
        # be closed as soon as the with-block exits.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image

# ======== Model ========
class MultiRegressionModel(pl.LightningModule):
    """LightningModule wrapping a timm model used as a multi-output regressor.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
        pretrained: Forwarded to ``timm.create_model``.
        lr: Learning rate for the AdamW optimizer.
        output_dim: Number of regression outputs (passed as ``num_classes``).

    Validation predictions are buffered per batch and scored with
    ``weighted_r2_score`` at the end of each validation epoch.
    """

    # Name used when logging the R² of output column i. Kept in one place so
    # the empty and non-empty validation paths always log the same keys.
    TARGET_NAMES = ("Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g")

    def __init__(self, model_name="efficientnet_b0", pretrained=False, lr=1e-4, output_dim=5):
        super().__init__()
        self.save_hyperparameters()
        self.model = timm.create_model(model_name, pretrained=pretrained, num_classes=output_dim)
        self.criterion = nn.MSELoss()
        # (prediction, target) CPU tensors collected during validation.
        self.val_outputs = []

    def forward(self, x):
        """Run the wrapped timm model on ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one training batch."""
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("train_loss", loss, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the MSE loss and buffer outputs for epoch-level R²."""
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.val_outputs.append((y_hat.detach().cpu(), y.detach().cpu()))
        self.log("val_loss", loss, on_step=False, on_epoch=True)
        return loss

    def on_validation_epoch_end(self):
        """Log weighted and per-target R² over all buffered validation batches."""
        if not self.val_outputs:
            # Nothing was validated: log zeros so the metric keys still exist.
            self.log("val_weighted_r2", 0.0, prog_bar=True)
            for name in self.TARGET_NAMES:
                self.log(f"val_r2_{name}", 0.0)
            return

        preds, trues = zip(*self.val_outputs)
        preds = torch.cat(preds).numpy()
        trues = torch.cat(trues).numpy()
        weighted_r2, r2s = weighted_r2_score(trues, preds)
        self.log("val_weighted_r2", weighted_r2, prog_bar=True)
        for name, score in zip(self.TARGET_NAMES, r2s):
            self.log(f"val_r2_{name}", score)
        # Reset the buffer so the next epoch starts empty.
        self.val_outputs.clear()

    def configure_optimizers(self):
        """Return AdamW plus a cosine-annealing schedule with ``T_max=10``."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
        return {"optimizer": optimizer, "lr_scheduler": scheduler}

# ======== TTA ========
def tta_inference(model, images):
    """Average ``model`` outputs over the input and three flipped copies.

    Args:
        model: Callable mapping an image batch to a prediction tensor.
        images: Batch tensor with at least four dimensions; axes 2 and 3 are
            flipped (assumed to be height and width of an NCHW batch).

    Returns:
        The element-wise mean of the four predictions: unflipped, flipped on
        axis 3, flipped on axis 2, and flipped on both.
    """
    # Start from the unflipped prediction and add the flipped views in the
    # same order as before, so the floating-point sum is unchanged.
    total = model(images)
    for flip_dims in ([3], [2], [2, 3]):
        total = total + model(torch.flip(images, dims=flip_dims))
    return total / 4.0

# ======== Utils ========
def get_id(x):
    """Return the part of ``x`` before its first underscore (all of ``x`` if none)."""
    prefix, _sep, _rest = x.partition('_')
    return prefix

# ======== Transform ========
# Side length, in pixels, that every image is resized to (square).
img_size = 1000

# Per-channel mean/std handed to T.Normalize.
# NOTE(review): these look like the usual ImageNet statistics — confirm they
# match what the training notebook used.
_norm_mean = [0.485, 0.456, 0.406]
_norm_std = [0.229, 0.224, 0.225]

_infer_steps = [
    T.Resize((img_size, img_size)),
    T.ToTensor(),
    T.Normalize(_norm_mean, _norm_std),
]
infer_transform = T.Compose(_infer_steps)

# ======== DataLoader ========
test_df = pd.read_csv('/kaggle/input/csiro-biomass/test.csv')
# Keep only the first row for each distinct image_path, and only the two
# columns needed downstream.
test_df = (
    test_df.drop_duplicates(subset='image_path')
    .loc[:, ['sample_id', 'image_path']]
    .reset_index(drop=True)
)
# Reduce each sample_id to the part before its first underscore.
test_df['sample_id'] = test_df['sample_id'].map(get_id)

dataset = InferenceDataset(test_df, transform=infer_transform)
# shuffle=False keeps batch order aligned with the rows of test_df.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

# ======== Inference ========
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# fold index -> array of TTA-averaged predictions for the whole test set.
results_dict = {}

for fold in range(3):
    # Build the architecture, then load this fold's saved weights into it.
    model = MultiRegressionModel(model_name="efficientnet_b2", pretrained=False)
    checkpoint = f"/kaggle/input/csiro-img2bio-training-notebook/model_fold{fold}.pth"
    model.load_state_dict(torch.load(checkpoint, map_location=device))
    model.to(device)
    model.eval()

    # Gradients are not needed for prediction.
    with torch.no_grad():
        fold_batches = [
            tta_inference(model, batch.to(device)).cpu().numpy()
            for batch in dataloader
        ]
    results_dict[fold] = np.concatenate(fold_batches)

# ======== Submission ========
# Column order of the model outputs; also the suffix appended to sample_id.
target_cols = ["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"]

# Average the three folds element-wise.
fold_preds = [results_dict[fold] for fold in range(3)]
mean_preds = np.mean(fold_preds, axis=0)

result_df = pd.DataFrame(mean_preds, columns=target_cols)
result_df['sample_id'] = test_df['sample_id']

# Wide -> long: one row per (sample_id, target name).
result_df = result_df.melt(
    id_vars='sample_id',
    value_vars=target_cols,
    value_name='target',
)
result_df['sample_id'] = result_df['sample_id'].str.cat(result_df['variable'], sep='__')
# Bound the predictions to the [0, 200] range before writing.
result_df['target'] = result_df['target'].clip(lower=0, upper=200)
result_df.to_csv('submission.csv', columns=['sample_id', 'target'], index=False)
print("✅ submission.csv saved.")
