# In [None]:
# Import required libraries
import os
import timm
import torch
import pandas as pd
from PIL import Image
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from sklearn.model_selection import train_test_split

# Training configuration: optimisation hyperparameters and the architecture
# name handed to timm.create_model
BATCH_SIZE = 8
NUM_EPOCHS = 20
LEARNING_RATE = 2.5e-5
MODEL_NAME = "efficientnet_b0"
# Every image is resized to (IMAGE_HEIGHT, IMAGE_WIDTH) by the transforms
IMAGE_WIDTH = 500
IMAGE_HEIGHT = 250
# Use CUDA when torch reports it as available, otherwise run on the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fixed target order: index i in this list is element i of every target
# vector built by the dataset and output i read from the model at inference
TARGET_NAMES = [
    "Dry_Clover_g",
    "Dry_Dead_g",
    "Dry_Green_g",
    "GDM_g",
    "Dry_Total_g"
]

# Define data augmentation and preprocessing transforms
def get_transforms():
    """Build the image preprocessing pipelines for training and validation.

    Both pipelines resize to (IMAGE_HEIGHT, IMAGE_WIDTH), convert to a tensor
    and normalise each channel with the same mean/std (the values match the
    commonly used ImageNet statistics). The training pipeline additionally
    applies random horizontal/vertical flips and a random rotation of up to
    10 degrees between the resize and the tensor conversion.

    Returns:
        A ``(train_transform, val_transform)`` tuple of
        ``transforms.Compose`` objects.
    """
    target_size = (IMAGE_HEIGHT, IMAGE_WIDTH)
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    def tensor_steps():
        # Called once per pipeline so each Compose owns its transform objects
        return [
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]

    # Augmentations are applied to the training pipeline only
    augmentations = [
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(10),
    ]

    train_transform = transforms.Compose(
        [transforms.Resize(target_size), *augmentations, *tensor_steps()]
    )
    val_transform = transforms.Compose(
        [transforms.Resize(target_size), *tensor_steps()]
    )

    return train_transform, val_transform


# Define custom dataset for multi-target regression
class BiomassDataset(Dataset):
    """Multi-target regression dataset with one sample per unique image.

    The dataframe is expected in long format with the columns
    ``image_path``, ``target_name`` and ``target``: one row per
    (image, target) pair. All rows of an image are folded into a single
    float32 vector ordered like ``target_names``.

    Args:
        df: Long-format dataframe described above.
        base_dir: Directory that the ``image_path`` values are relative to.
        transform: Optional callable applied to the loaded RGB PIL image.
        target_names: Order of the targets in each vector. Defaults to the
            module-level ``TARGET_NAMES``.
    """

    def __init__(self, df, base_dir, transform=None, target_names=None):
        # Save dataframe, base directory and transform
        self.df = df
        self.base_dir = base_dir
        self.transform = transform

        # Copy the target order so later changes to the caller's sequence
        # cannot desynchronise it from the vectors built below
        self.target_names = list(
            TARGET_NAMES if target_names is None else target_names
        )

        # One dataset item per unique image, in order of first appearance
        self.image_ids = df["image_path"].unique().tolist()

        # Precompute the target vector of every image once
        self.img_to_targets = self._build_img_to_targets(df)

    def _build_img_to_targets(self, df):
        """Return a dict mapping each image path to its float32 target vector.

        NOTE(review): a target that is missing for an image, or that appears
        more than once, is stored as 0.0 without any warning. Confirm this
        fallback is intended for the training data.
        """
        mapping = {}

        for img_path, group in df.groupby("image_path"):
            names = group["target_name"]
            values = group["target"]

            target_vec = []
            for name in self.target_names:
                matches = values[names == name]
                # Only an unambiguous single row provides a value
                if len(matches) == 1:
                    target_vec.append(float(matches.iloc[0]))
                else:
                    target_vec.append(0.0)

            mapping[img_path] = torch.tensor(target_vec, dtype=torch.float32)

        return mapping

    def __len__(self):
        # Return number of unique images
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return ``(image, targets)`` for the ``idx``-th unique image."""
        # Image paths in the dataframe are relative to base_dir
        rel_path = self.image_ids[idx]
        img_path = os.path.join(self.base_dir, rel_path)

        # convert() returns a fully loaded copy, so the file can be closed
        # as soon as the with-block ends
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        # Apply transform
        if self.transform:
            image = self.transform(image)

        # Get target vector
        targets = self.img_to_targets[rel_path]

        return image, targets

# Define model creation function
def create_model(num_outputs, pretrained=False):
    """Create the MODEL_NAME backbone with a regression head on DEVICE.

    Args:
        num_outputs: Number of values the final layer must produce.
        pretrained: Forwarded to ``timm.create_model``. Defaults to False,
            so the network starts from random weights unless the caller
            opts in.

    Returns:
        The model, moved to ``DEVICE``.
    """
    model = timm.create_model(MODEL_NAME, pretrained=pretrained)

    # Replace the head through timm's own API instead of assigning
    # `model.classifier`: the head attribute name differs between
    # architectures, and assigning the wrong one would silently leave the
    # original classification head in place.
    model.reset_classifier(num_outputs)

    # Move model to device
    return model.to(DEVICE)

# Define training function
def train_one_epoch(model, dataloader, criterion, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Each batch is moved to DEVICE, scored with ``criterion`` and used for one
    optimizer step. The returned value is the sum of per-batch losses, each
    weighted by its batch size, divided by the number of samples seen.

    Args:
        model: Network to train; switched to train mode.
        dataloader: Iterable yielding ``(images, targets)`` batches.
        criterion: Callable returning a scalar loss for (outputs, targets).
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The sample-weighted average loss as a float.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.train()

    loss_sum = 0.0
    sample_count = 0

    for batch_images, batch_targets in dataloader:
        batch_images = batch_images.to(DEVICE)
        batch_targets = batch_targets.to(DEVICE)

        # Clear gradients left over from the previous step, then do one update
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean
        n_samples = batch_images.size(0)
        loss_sum += batch_loss.item() * n_samples
        sample_count += n_samples

    return loss_sum / sample_count

# Define validation function
def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` and return the mean loss.

    Runs in eval mode with gradient tracking disabled. The returned value is
    the sum of per-batch losses, each weighted by its batch size, divided by
    the number of samples seen.

    Args:
        model: Network to evaluate; switched to eval mode.
        dataloader: Iterable yielding ``(images, targets)`` batches.
        criterion: Callable returning a scalar loss for (outputs, targets).

    Returns:
        The sample-weighted average loss as a float.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.eval()

    loss_sum = 0.0
    sample_count = 0

    with torch.no_grad():
        for batch_images, batch_targets in dataloader:
            batch_images = batch_images.to(DEVICE)
            batch_targets = batch_targets.to(DEVICE)

            batch_loss = criterion(model(batch_images), batch_targets)

            # Weight by batch size so a smaller final batch does not skew the mean
            n_samples = batch_images.size(0)
            loss_sum += batch_loss.item() * n_samples
            sample_count += n_samples

    return loss_sum / sample_count

# Define inference function
def run_inference(model, test_df, base_dir, transform, output_path):
    """Predict all test images and write a ``sample_id,target`` CSV.

    ``test_df`` is read in long format through its ``sample_id``,
    ``image_path`` and ``target_name`` columns. Each unique image is passed
    through the model once; every row then picks the output whose position
    matches its ``target_name`` in ``TARGET_NAMES``.

    Args:
        model: Trained network; switched to eval mode.
        test_df: Dataframe with the three columns named above.
        base_dir: Directory that the ``image_path`` values are relative to.
        transform: Callable turning an RGB PIL image into the tensor fed to
            the model (a batch dimension is added here).
        output_path: Destination of the submission CSV.

    Raises:
        ValueError: If a row's ``target_name`` is not in ``TARGET_NAMES``.
    """
    model.eval()

    # One forward pass per unique image; rows sharing an image reuse it
    img_to_pred = {}
    with torch.no_grad():
        for img_rel in test_df["image_path"].unique().tolist():
            img_path = os.path.join(base_dir, img_rel)

            # convert() returns a fully loaded copy, so the file can be
            # closed as soon as the with-block ends
            with Image.open(img_path) as img:
                image = img.convert("RGB")

            # Add batch dim and move to the model's device
            batch = transform(image).unsqueeze(0).to(DEVICE)

            # Drop the batch dim again and keep the result as a numpy array
            img_to_pred[img_rel] = model(batch).squeeze(0).cpu().numpy()

    # Iterate only the needed columns: cheaper than iterrows() and keeps
    # each column's own dtype for the values written out
    rows = [
        {
            "sample_id": sample_id,
            "target": float(
                img_to_pred[img_rel][TARGET_NAMES.index(target_name)]
            ),
        }
        for sample_id, img_rel, target_name in zip(
            test_df["sample_id"], test_df["image_path"], test_df["target_name"]
        )
    ]

    # Explicit columns keep the header row even when there are no rows
    submission = pd.DataFrame(rows, columns=["sample_id", "target"])

    # Save
    submission.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")

# Define main
def main():
    """Train on the competition data, then write the test submission.

    Steps: read ``train.csv``, split the unique images 80/20 into train and
    validation sets, train for NUM_EPOCHS epochs with MSE loss and AdamW
    while printing both losses per epoch, then read ``test.csv`` and save
    predictions to ``/kaggle/working/submission.csv``.
    """
    data_dir = "/kaggle/input/csiro-biomass"
    train_csv_path = os.path.join(data_dir, "train.csv")
    test_csv_path = os.path.join(data_dir, "test.csv")

    train_df = pd.read_csv(train_csv_path)
    train_transform, val_transform = get_transforms()

    # Split on images rather than rows so all targets of one image land in
    # the same subset
    train_imgs, val_imgs = train_test_split(
        train_df["image_path"].unique(),
        test_size=0.2,
        random_state=42
    )

    def rows_for(image_paths):
        # Select the dataframe rows that belong to the given images
        selected = train_df["image_path"].isin(image_paths)
        return train_df[selected].reset_index(drop=True)

    train_loader = DataLoader(
        BiomassDataset(rows_for(train_imgs), data_dir, transform=train_transform),
        batch_size=BATCH_SIZE,
        shuffle=True,
    )
    val_loader = DataLoader(
        BiomassDataset(rows_for(val_imgs), data_dir, transform=val_transform),
        batch_size=BATCH_SIZE,
        shuffle=False,
    )

    model = create_model(num_outputs=len(TARGET_NAMES))
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(NUM_EPOCHS):
        train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
        val_loss = validate(model, val_loader, criterion)

        print(f"Epoch [{epoch + 1}/{NUM_EPOCHS}]")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss:   {val_loss:.4f}\n")

    # Inference uses the deterministic validation preprocessing
    test_df = pd.read_csv(test_csv_path)
    run_inference(
        model,
        test_df,
        data_dir,
        val_transform,
        "/kaggle/working/submission.csv",
    )

# Run the pipeline only when this file is executed as a script, so that
# importing it does not start training
if __name__ == "__main__":
    main()