<div style="border: 2px solid #4CAF50; border-radius: 10px; padding: 15px; background-color: #f9f9f9; text-align: left; font-family: Arial, sans-serif; width: 90%; max-width: 600px; margin: auto;">
  <h1 style="color: #2E7D32; text-align: center;">🌱 CSIRO Image2Biomass</h1>
  
  <h4 style="color: #2E7D32;">🎯 Competition Goal</h4>
  <p>Predict 5 biomass components from pasture images to help farmers optimize grazing decisions.</p>

  <h4 style="color: #2E7D32;">🛠 Solution</h4>
  <ul>
    <li><strong>Model:</strong> Custom CNN</li>
    <li><strong>Training:</strong> 3-fold cross-validation</li>
    <li><strong>Targets:</strong> Dry_Green_g, Dry_Dead_g, Dry_Clover_g, GDM_g, Dry_Total_g</li>
    <li><strong>Evaluation:</strong> Weighted R² score</li>
  </ul>

  <h4 style="color: #2E7D32;">✅ Key Features</h4>
  <ul>
    <li>Multi-target regression</li>
    <li>Competition-weighted loss</li>
    <li>Ensemble predictions</li>
    <li>Proper submission format</li>
  </ul>

  <div style="text-align: center; margin-top: 15px; padding: 10px; background: #E8F5E8; border-radius: 5px;">
    <strong></strong>
  </div>
</div>

In [None]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
from torch.utils.data import Dataset, DataLoader

In [None]:
# Seed every random number generator the notebook relies on
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: integer seed applied to every generator (default 42).
    """
    # Exported as a string for the Python hash seed environment variable.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices), in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

set_seed(42)

# Hyper-parameters shared by the cells below
IMAGE_SIZE = 224   # images are resized to IMAGE_SIZE x IMAGE_SIZE
BATCH_SIZE = 16
EPOCHS = 5
LEARNING_RATE = 1e-4
FOLDS = 3

# Each regression target paired with the weight the loss applies to it;
# the two lists stay index-aligned.
_TARGET_WEIGHT_PAIRS = (
    ('Dry_Green_g', 0.1),
    ('Dry_Dead_g', 0.1),
    ('Dry_Clover_g', 0.1),
    ('GDM_g', 0.2),
    ('Dry_Total_g', 0.5),
)
TARGETS = [target_name for target_name, _ in _TARGET_WEIGHT_PAIRS]
WEIGHTS = [target_weight for _, target_weight in _TARGET_WEIGHT_PAIRS]

# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Read the three competition tables: training rows, test rows and the sample submission
train_data, test_data, submission_format = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/csiro-biomass/train.csv',
        '/kaggle/input/csiro-biomass/test.csv',
        '/kaggle/input/csiro-biomass/sample_submission.csv',
    )
)


In [None]:
# Show (rows, columns) for the training table, then the test table
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# Clean file paths: drop only the leading split directory so that a later
# occurrence of 'train/' or 'test/' inside a path is left untouched.
# `regex` is passed explicitly because the default of Series.str.replace
# differs between pandas versions.
train_data['image_path'] = train_data['image_path'].str.replace(r'^train/', '', regex=True)
test_data['image_path'] = test_data['image_path'].str.replace(r'^test/', '', regex=True)

def extract_image_id(sample_id):
    """Return the part of ``sample_id`` that precedes the first ``'__'``.

    The value is converted with ``str()`` first; when no ``'__'`` is present
    the whole string is returned.
    """
    image_id, _separator, _remainder = str(sample_id).partition('__')
    return image_id

# Attach the image identifier parsed from sample_id to both tables
for frame in (train_data, test_data):
    frame['image_id'] = frame['sample_id'].map(extract_image_id)

n_train_images = train_data['image_id'].nunique()
n_test_images = test_data['image_id'].nunique()
print(f"   Unique training images: {n_train_images}")
print(f"   Unique test images: {n_test_images}")


In [None]:
class BiomassTrainDataset(Dataset):
    """Training dataset yielding ``(image, targets)`` with one item per image.

    The input frame is in long format (one row per image/target pair, with
    ``image_id``, ``image_path``, ``target_name`` and ``target`` columns).
    It is regrouped so each item carries one value per entry of ``TARGETS``.

    Args:
        dataframe: long-format training rows.
        transform: optional callable applied to the loaded PIL image.
        image_dir: directory that ``image_path`` values are relative to.
    """

    def __init__(self, dataframe, transform=None,
                 image_dir='/kaggle/input/csiro-biomass/train/'):
        self.dataframe = dataframe
        self.transform = transform
        self.image_dir = image_dir

        # One row per image; the first row of each group supplies image_path.
        self.image_data = dataframe.groupby('image_id').first().reset_index()

        # Group once instead of filtering the whole frame for every image
        # (the per-image boolean mask was quadratic in the number of rows).
        targets_by_image = {
            img_id: rows.set_index('target_name')['target']
            for img_id, rows in dataframe.groupby('image_id')
        }

        # A target that is absent for an image defaults to 0.
        self.target_values = np.array([
            [targets_by_image[img_id].get(target, 0) for target in TARGETS]
            for img_id in self.image_data['image_id']
        ])

    def __len__(self):
        """Number of distinct images."""
        return len(self.image_data)

    def __getitem__(self, index):
        """Return the (optionally transformed) image and its float32 target tensor."""
        row = self.image_data.iloc[index]
        image_path = os.path.join(self.image_dir, row['image_path'])

        try:
            image = Image.open(image_path).convert('RGB')
        except OSError as error:
            # Missing or unreadable file: keep the best-effort placeholder but
            # make it visible, and let unrelated errors propagate instead of
            # being swallowed by a bare except.
            warnings.warn(
                f"Could not read image {image_path!r} ({error}); using a placeholder image",
                RuntimeWarning,
            )
            image = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE), color=(100, 150, 100))

        if self.transform:
            image = self.transform(image)

        targets = torch.tensor(self.target_values[index], dtype=torch.float32)
        return image, targets

class BiomassTestDataset(Dataset):
    """Inference dataset yielding one image per row of ``dataframe``.

    Args:
        dataframe: frame with an ``image_path`` column.
        transform: optional callable applied to the loaded PIL image.
        image_dir: directory that ``image_path`` values are relative to.
    """

    def __init__(self, dataframe, transform=None,
                 image_dir='/kaggle/input/csiro-biomass/test/'):
        self.dataframe = dataframe
        self.transform = transform
        self.image_dir = image_dir

    def __len__(self):
        """Number of rows (images) to predict."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the (optionally transformed) image for row ``index``."""
        row = self.dataframe.iloc[index]
        # Built with os.path.join, matching the training dataset.
        image_path = os.path.join(self.image_dir, row["image_path"])

        try:
            image = Image.open(image_path).convert("RGB")
        except OSError as error:
            # Missing or unreadable file: keep the best-effort placeholder but
            # report it, and let unrelated errors propagate.
            warnings.warn(
                f"Could not read image {image_path!r} ({error}); using a placeholder image",
                RuntimeWarning,
            )
            image = Image.new('RGB', (IMAGE_SIZE, IMAGE_SIZE), color=(100, 150, 100))

        if self.transform:
            image = self.transform(image)
        return image


# **Model Architecture**

In [None]:

class CustomBiomassModel(nn.Module):
    """Small CNN regressor: four conv stages followed by a three-layer MLP head.

    Args:
        outputs: number of values predicted per image (default 5).
    """

    def __init__(self, outputs=5):
        super().__init__()

        # Feature extractor: Conv -> BatchNorm -> ReLU -> pool for each stage.
        # Stages 1-3 halve the resolution with max pooling; the final stage
        # averages the whole map down to 1x1.
        stage_channels = (3, 32, 64, 128, 256)
        final_stage = len(stage_channels) - 2
        feature_layers = []
        for stage, (c_in, c_out) in enumerate(zip(stage_channels, stage_channels[1:])):
            feature_layers += [
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
                nn.AdaptiveAvgPool2d((1, 1)) if stage == final_stage else nn.MaxPool2d(2),
            ]
        self.features = nn.Sequential(*feature_layers)

        # Regressor head: 256 -> 128 -> 64 -> outputs
        head_layers = [
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, outputs),
        ]
        self.regressor = nn.Sequential(*head_layers)

        self._initialize_weights()

    def _initialize_weights(self):
        """Re-initialise conv, batch-norm and linear layers; all biases become 0."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
            else:
                continue
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Map an image batch to one row of ``outputs`` predictions per image."""
        pooled = self.features(x)
        flat = pooled.flatten(start_dim=1)
        return self.regressor(flat)

class WeightedLoss(nn.Module):
    """Mean squared error in which each output column is scaled by a fixed weight.

    Args:
        weights: one weight per output column, broadcast against the batch.
    """

    def __init__(self, weights):
        super().__init__()
        self.weights = torch.tensor(weights)

    def forward(self, predictions, targets):
        """Return the mean over all elements of ``weight * (prediction - target)^2``."""
        column_weights = self.weights.to(predictions.device)
        residuals = predictions - targets
        return torch.mean(residuals.pow(2) * column_weights)


# **Data Transforms**

In [None]:
# Image transformations
_resize = (IMAGE_SIZE, IMAGE_SIZE)
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]


def _deterministic_pipeline():
    """Build a fresh resize -> tensor -> normalise pipeline with no augmentation."""
    return T.Compose([
        T.Resize(_resize),
        T.ToTensor(),
        T.Normalize(_channel_mean, _channel_std),
    ])


# Training adds random flip, rotation and colour jitter before tensor conversion
train_transforms = T.Compose([
    T.Resize(_resize),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(degrees=10),
    T.ColorJitter(brightness=0.2, contrast=0.2),
    T.ToTensor(),
    T.Normalize(_channel_mean, _channel_std),
])

# Validation and test share the same augmentation-free steps (separate objects)
validation_transforms = _deterministic_pipeline()
test_transforms = _deterministic_pipeline()


# **Cross-Validation Setup**

In [None]:
# Prepare cross-validation folds.
# Folds are assigned per image (not per row), so all target rows of one image
# land in the same fold. The shuffle uses its own seeded generator instead of
# the global NumPy state, so re-running this cell always reproduces the same
# folds (and therefore stays consistent with already-saved fold checkpoints).
fold_rng = np.random.RandomState(42)
unique_image_ids = np.asarray(train_data['image_id'].unique())
fold_rng.shuffle(unique_image_ids)

# Round-robin over the shuffled ids gives folds whose sizes differ by at most one
fold_mapping = {img_id: position % FOLDS for position, img_id in enumerate(unique_image_ids)}
train_data['fold'] = train_data['image_id'].map(fold_mapping)

print(f"   Created {FOLDS} folds for cross-validation")


# **Training Function**

In [None]:
def train_cv_fold(fold_number):
    """Train a fresh model with fold ``fold_number`` held out for validation.

    Reads the module-level ``train_data`` (including its ``fold`` column), the
    transform pipelines and the hyper-parameter constants. After every epoch
    whose validation loss improves on the best so far, the model weights are
    written to ``model_fold_{fold_number}.pth``.

    Args:
        fold_number: zero-based index of the validation fold.

    Returns:
        The lowest validation loss observed, averaged per image.

    Raises:
        ValueError: if the training or the validation split contains no images.
    """
    print(f"  Training fold {fold_number + 1}/{FOLDS}")

    # Split data
    train_fold = train_data[train_data['fold'] != fold_number]
    valid_fold = train_data[train_data['fold'] == fold_number]

    print(f"    Training images: {train_fold['image_id'].nunique()}")
    print(f"    Validation images: {valid_fold['image_id'].nunique()}")

    # Create datasets
    train_dataset = BiomassTrainDataset(train_fold, transform=train_transforms)
    valid_dataset = BiomassTrainDataset(valid_fold, transform=validation_transforms)

    # Fail early with a clear message instead of dividing by zero later on.
    if len(train_dataset) == 0 or len(valid_dataset) == 0:
        raise ValueError(
            f"Fold {fold_number} has an empty split "
            f"(train={len(train_dataset)}, validation={len(valid_dataset)} images)"
        )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    # Initialize model
    model = CustomBiomassModel(outputs=len(TARGETS))
    model.to(device)

    # Loss and optimizer
    loss_function = WeightedLoss(WEIGHTS)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    best_loss = float('inf')

    # Training loop
    for epoch in range(EPOCHS):
        # Training phase
        model.train()
        train_loss_sum = 0.0
        for images, targets in train_loader:
            images, targets = images.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_function(outputs, targets)
            loss.backward()
            optimizer.step()

            # The loss is a batch mean; weight it by the batch size so a short
            # final batch does not count as much as a full one.
            train_loss_sum += loss.item() * images.size(0)

        # Validation phase
        model.eval()
        val_loss_sum = 0.0
        with torch.no_grad():
            for images, targets in valid_loader:
                images, targets = images.to(device), targets.to(device)
                outputs = model(images)
                val_loss_sum += loss_function(outputs, targets).item() * images.size(0)

        # Per-image averages over the whole split
        epoch_train_loss = train_loss_sum / len(train_dataset)
        epoch_val_loss = val_loss_sum / len(valid_dataset)

        scheduler.step()

        print(f'    Epoch {epoch+1:02d}: Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')

        # Save best model
        if epoch_val_loss < best_loss:
            best_loss = epoch_val_loss
            torch.save(model.state_dict(), f'model_fold_{fold_number}.pth')

    return best_loss


# **Model Training**

In [None]:
# Train every fold in turn, keeping each fold's best validation loss
fold_performance = []

for fold_idx in range(FOLDS):
    fold_performance.append(train_cv_fold(fold_idx))
    fold_loss = fold_performance[-1]
    print(f' Fold {fold_idx + 1} completed. Best validation loss: {fold_loss:.4f}')

mean_fold_loss = np.mean(fold_performance)
print(f" Average validation loss: {mean_fold_loss:.4f}")


# **Prediction & Inference**

In [None]:
# Prepare test data: keep the first row for each distinct image path and
# replace its sample_id with the bare image identifier
test_data_unique = (
    test_data
    .drop_duplicates(subset='image_path', keep='first')
    [['sample_id', 'image_path']]
    .reset_index(drop=True)
)
test_data_unique['sample_id'] = test_data_unique['sample_id'].map(extract_image_id)

print(f"   Test images for prediction: {len(test_data_unique)}")

# Create test dataset and loader (order preserved so predictions line up with rows)
test_dataset = BiomassTestDataset(test_data_unique, transform=test_transforms)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

# Gradient-free forward pass
@torch.no_grad()
def predict_batch(model, images):
    """Run ``model`` on ``images`` with gradient tracking disabled and return its output."""
    return model(images)

# Ensemble predictions from all folds: one prediction matrix per fold checkpoint
predictions_by_fold = {}

for fold_idx in range(FOLDS):
    print(f"   Predicting with fold {fold_idx + 1}...")

    # Rebuild the architecture and restore this fold's saved weights
    model = CustomBiomassModel(outputs=len(TARGETS))
    fold_state = torch.load(f'model_fold_{fold_idx}.pth', map_location=device)
    model.load_state_dict(fold_state)
    model.eval()
    model.to(device)

    with torch.no_grad():
        fold_predictions = [
            predict_batch(model, batch.to(device)).cpu().numpy()
            for batch in test_loader
        ]

    predictions_by_fold[fold_idx] = np.concatenate(fold_predictions)

# Combine predictions from all folds with an element-wise average
stacked_predictions = np.stack([predictions_by_fold[fold_idx] for fold_idx in range(FOLDS)])
ensemble_predictions = stacked_predictions.mean(axis=0)

print(f"   Final predictions shape: {ensemble_predictions.shape}")


# **Submission File Creation**

In [None]:
# Mean training value of each target, printed for reference
target_means = {}
for target in TARGETS:
    is_target_row = train_data['target_name'] == target
    target_means[target] = train_data.loc[is_target_row, 'target'].mean()
    print(f"   Training mean for {target}: {target_means[target]:.3f}")

# Wide table: one row per test image, one column per target
results_df = pd.DataFrame(ensemble_predictions, columns=TARGETS)
results_df['sample_id'] = test_data_unique['sample_id']

# Long table: melt to one row per (image, target), rebuild the
# '<image_id>__<target_name>' identifier, floor predictions at zero and keep
# only the two submission columns
submission_df = (
    results_df
    .melt(
        id_vars='sample_id',
        value_vars=TARGETS,
        var_name='target_name',
        value_name='target',
    )
    .assign(
        sample_id=lambda long_df: long_df['sample_id'] + '__' + long_df['target_name'],
        target=lambda long_df: long_df['target'].clip(lower=0),
    )
    [['sample_id', 'target']]
    .copy()
)

# Summarise the submission before writing it
lowest_prediction = submission_df['target'].min()
highest_prediction = submission_df['target'].max()
first_sample_id = submission_df['sample_id'].iloc[0]

print("\n  Submission Validation:")
print(f"   Rows: {submission_df.shape[0]}")
print(f"   Target range: {lowest_prediction:.3f} to {highest_prediction:.3f}")
print(f"   Sample format: {first_sample_id}")

# The submission should contain exactly one row per row of the test table
expected_rows, actual_rows = len(test_data), len(submission_df)
print(f"   Expected rows: {expected_rows}, Generated rows: {actual_rows}")
print("   Row count correct" if expected_rows == actual_rows else "   Row count mismatch")

# Write the file without the index column
submission_df.to_csv('submission.csv', index=False)


In [None]:
# Preview the first five submission rows
submission_df.head(n=5)

<div style="border: 2px solid #FFA500; border-radius: 10px; padding: 10px; background-color: #FFF5E6; text-align: center; font-family: Arial, sans-serif; width: 80%; max-width: 600px; margin: auto;">
  <h3 style="color: #FFA500;">👍 <strong>Enjoyed this guide?</strong></h3>
  <p style="color: #333333;">If you found this guide helpful, please consider giving it an upvote! Your support helps us continue to create valuable content and improve our resources.</p>
  <p style="font-size: 16px; color: #FF8C00;">Thank you! 😊</p>
  <p style="color: #333333; margin-top: 10px;">
    <strong>Connect with me on LinkedIn:</strong><br>
    <a href="https://www.linkedin.com/in/abdullah0a7" style="color: #0077B5; text-decoration: none; display: inline-flex; align-items: center; gap: 5px;">
      <svg width="20" height="20" viewBox="0 0 24 24" fill="#0077B5">
        <path d="M20.447 20.452h-3.554v-5.569c0-1.328-.027-3.037-1.852-3.037-1.853 0-2.136 1.445-2.136 2.939v5.667H9.351V9h3.414v1.561h.046c.477-.9 1.637-1.85 3.37-1.85 3.601 0 4.267 2.37 4.267 5.455v6.286zM5.337 7.433c-1.144 0-2.063-.926-2.063-2.065 0-1.138.92-2.063 2.063-2.063 1.14 0 2.064.925 2.064 2.063 0 1.139-.925 2.065-2.064 2.065zm1.782 13.019H3.555V9h3.564v11.452zM22.225 0H1.771C.792 0 0 .774 0 1.729v20.542C0 23.227.792 24 1.771 24h20.451C23.2 24 24 23.227 24 22.271V1.729C24 .774 23.2 0 22.222 0h.003z"/>
      </svg>
      M Abdullah
    </a>
  </p>
</div>