# CSIRO - Image2Biomass Prediction
- Predict biomass using the provided pasture images
### https://www.kaggle.com/competitions/csiro-biomass

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

# Pick the compute device once: CUDA when torch reports it as available,
# otherwise CPU. The model and every batch are moved to this device later on.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [None]:
# ==================== CONFIGURATION ====================
# Every tunable value of the notebook is defined in this cell; the summary
# printed at the bottom echoes the values that were actually set.

# --- Data locations ---
DATA_DIR = '/kaggle/input/csiro-biomass'
TRAIN_CSV = f'{DATA_DIR}/train.csv'
TEST_CSV = f'{DATA_DIR}/test.csv'

# --- Image preprocessing ---
IMG_SIZE = (256, 256)  # size handed to transforms.Resize

# --- Optimisation ---
BATCH_SIZE = 16
NUM_EPOCHS = 50
LEARNING_RATE = 1e-4
RANDOM_SEED = 21
VAL_SPLIT = 0.2  # fraction of images held out for validation
NUM_WORKERS = 2

# --- Network shape (simple CNN) ---
CNN_CHANNELS = [32, 64, 128, 256]  # output channels of the conv layers
CLASSIFIER_HIDDEN = [128, 64]  # widths of the hidden linear layers
DROPOUT_RATES = [0.3, 0.1]  # dropout applied after each hidden linear layer

# --- ReduceLROnPlateau settings ---
SCHEDULER_FACTOR = 0.5  # multiply the LR by this factor on a plateau
SCHEDULER_PATIENCE = 3  # epochs without improvement before reducing

# --- Per-target weights (used by both the loss and the score) ---
TARGET_WEIGHTS = {
    'Dry_Clover_g': 0.1,
    'Dry_Dead_g': 0.1,
    'Dry_Green_g': 0.1,
    'Dry_Total_g': 0.5,
    'GDM_g': 0.2,
}
# Fixed column order for every target array in the notebook
TARGET_NAMES = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
WEIGHTS_ARRAY = [TARGET_WEIGHTS[target] for target in TARGET_NAMES]

# --- Summary ---
_rule = "=" * 60
_summary_rows = [
    ("Data Directory", DATA_DIR),
    ("Image Size", IMG_SIZE),
    ("Batch Size", BATCH_SIZE),
    ("Learning Rate", LEARNING_RATE),
    ("Epochs", NUM_EPOCHS),
    ("Validation Split", f"{VAL_SPLIT * 100}%"),
    ("Random Seed", RANDOM_SEED),
]

print(_rule)
print("CONFIGURATION SUMMARY")
print(_rule)
for _label, _value in _summary_rows:
    print(f"{_label}: {_value}")

print("\nModel Architecture:")
print(f"  CNN Channels: {CNN_CHANNELS}")
_hidden_chain = ' -> '.join(str(width) for width in CLASSIFIER_HIDDEN)
print(f"  Classifier: {CNN_CHANNELS[-1]} -> {_hidden_chain} -> 5")

print("\nTarget Weights:")
for name, weight in TARGET_WEIGHTS.items():
    print(f"  {name}: {weight}")
print(_rule)

In [None]:
# Read the training table from TRAIN_CSV
df = pd.read_csv(TRAIN_CSV)

# Reshape to one row per image_path with one column per target_name.
# rename_axis(None, axis=1) removes the leftover "target_name" label that the
# pivot leaves on the column index.
df_pivot = (
    df.pivot_table(values='target', index=['image_path'], columns='target_name')
    .reset_index()
    .rename_axis(None, axis=1)
)

# Target columns are always addressed in the configured order
target_cols = TARGET_NAMES

print(f"Total images: {len(df_pivot)}")
print(f"\nTarget columns: {target_cols}")
print("\nTarget statistics:")
print(df_pivot[target_cols].describe())
print("\nFirst few rows:")
print(df_pivot.head())

In [None]:
# Prepare data splits
from sklearn.preprocessing import StandardScaler

# Hold out VAL_SPLIT of the images for validation. Explicit copies are taken
# because the frames returned by train_test_split are derived from df_pivot;
# assigning columns on them directly raises pandas' SettingWithCopyWarning.
train_df, val_df = train_test_split(df_pivot, test_size=VAL_SPLIT, random_state=RANDOM_SEED)
train_df = train_df.copy()
val_df = val_df.copy()

# Standardize the targets. The scaler is fitted on the training rows only and
# then applied unchanged to the validation rows, so no validation statistics
# leak into training. It is kept around to map predictions back later.
target_scaler = StandardScaler()
train_df[target_cols] = target_scaler.fit_transform(train_df[target_cols])
val_df[target_cols] = target_scaler.transform(val_df[target_cols])

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"\nTarget normalization applied:")
print(f"  Mean: {target_scaler.mean_}")
print(f"  Std: {target_scaler.scale_}")

In [None]:
# Dataset class - Image only
class BiomassDataset(Dataset):
    """Image-only dataset yielding ``(image, targets)`` pairs.

    Args:
        df: Table with an ``image_path`` column and one column per entry of
            ``TARGET_NAMES``. Its index is reset, so any input index works.
        data_dir: Directory that each ``image_path`` value is joined onto.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, df, data_dir=DATA_DIR, transform=None):
        self.df = df.reset_index(drop=True)
        self.data_dir = data_dir
        self.transform = transform
        self.target_cols = TARGET_NAMES

    def __len__(self):
        """Return the number of rows (images) in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load image ``idx`` and return it with its float32 target vector."""
        img_path = os.path.join(self.data_dir, self.df.loc[idx, 'image_path'])

        # Open inside a context manager so the file handle is released
        # deterministically (also when decoding fails). convert() returns an
        # independent in-memory image, so it stays valid after the file closes.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        # One value per target column, in TARGET_NAMES order
        targets = torch.tensor(
            self.df.loc[idx, self.target_cols].values.astype(np.float32),
            dtype=torch.float32,
        )

        return image, targets

# Preprocessing shared by all splits: resize to IMG_SIZE, then ToTensor
# (which converts to the [0, 1] range)
transform = transforms.Compose([transforms.Resize(IMG_SIZE), transforms.ToTensor()])

# One dataset per split
train_dataset = BiomassDataset(train_df, transform=transform)
val_dataset = BiomassDataset(val_df, transform=transform)

# Loaders share batch size and worker count; only the training loader shuffles
_loader_kwargs = {'batch_size': BATCH_SIZE, 'num_workers': NUM_WORKERS}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Number of targets: 5 ({', '.join(TARGET_NAMES)})")

In [None]:
# Simple CNN model - Image only
class SimpleCNN(nn.Module):
    """Small image-only CNN regressor configured by the module-level lists.

    The feature extractor stacks one ``Conv3x3 -> BatchNorm -> ReLU ->
    MaxPool(2)`` block per entry of ``CNN_CHANNELS`` and ends with global
    average pooling and a flatten. The head is an MLP with one
    ``Linear -> ReLU -> Dropout`` group per entry of ``CLASSIFIER_HIDDEN`` /
    ``DROPOUT_RATES`` followed by the output layer.

    Layers are created in the same order as a hand-written four-block network,
    so with a four-entry ``CNN_CHANNELS`` and two hidden layers the
    ``state_dict`` keys are ``features.0 ... features.17`` and
    ``classifier.0 ... classifier.6``.

    Each conv block halves the spatial size (a 256x256 input becomes
    128 -> 64 -> 32 -> 16 after four blocks); the adaptive pooling makes the
    head independent of the input resolution.

    Args:
        num_outputs: Number of regression outputs.

    Raises:
        ValueError: If ``CNN_CHANNELS`` is empty, or if ``CLASSIFIER_HIDDEN``
            and ``DROPOUT_RATES`` have different lengths.
    """

    def __init__(self, num_outputs=5):
        super(SimpleCNN, self).__init__()

        if not CNN_CHANNELS:
            raise ValueError("CNN_CHANNELS must contain at least one channel size")
        if len(CLASSIFIER_HIDDEN) != len(DROPOUT_RATES):
            raise ValueError("CLASSIFIER_HIDDEN and DROPOUT_RATES must have the same length")

        # CNN for image features: one block per configured channel size, so
        # the width fed to the classifier always matches CNN_CHANNELS[-1].
        feature_layers = []
        in_channels = 3  # RGB input
        for out_channels in CNN_CHANNELS:
            feature_layers += [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
                nn.MaxPool2d(2, 2),  # halves height and width
            ]
            in_channels = out_channels
        feature_layers += [nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten()]
        self.features = nn.Sequential(*feature_layers)

        # Classifier: hidden layers from the config, then the output layer
        head_layers = []
        in_features = CNN_CHANNELS[-1]
        for hidden_units, dropout_rate in zip(CLASSIFIER_HIDDEN, DROPOUT_RATES):
            head_layers += [
                nn.Linear(in_features, hidden_units),
                nn.ReLU(inplace=True),
                nn.Dropout(dropout_rate),
            ]
            in_features = hidden_units
        head_layers.append(nn.Linear(in_features, num_outputs))
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, image):
        """Map a batch of images to a ``(batch, num_outputs)`` prediction."""
        # Extract image features
        feat = self.features(image)

        # Final prediction
        output = self.classifier(feat)

        return output

# Build the network, move it to the selected device, and report its total
# parameter count followed by the full layer listing.
model = SimpleCNN(num_outputs=5).to(device)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"\nModel outputs 5 predictions: {', '.join(TARGET_NAMES)}")
print(model)

In [None]:
# Competition metric: Weighted R² score
def competition_score(y_true, y_pred):
    """Return the weighted sum of per-target R² scores plus the raw scores.

    Column ``i`` of both arrays is scored with ``r2_score`` and weighted by
    ``WEIGHTS_ARRAY[i]`` (derived from ``TARGET_WEIGHTS``).

    NOTE(review): this combines one R² per column; confirm against the
    competition's evaluation page that the official metric is defined the
    same way rather than as a single R² over all rows.

    Args:
        y_true: 2-D array of ground-truth values, one column per target.
        y_pred: 2-D array of predictions with the same layout.

    Returns:
        ``(weighted_score, r2_scores)`` where ``r2_scores`` is a 1-D array
        holding the R² of each column.
    """
    per_target_r2 = np.array(
        [r2_score(y_true[:, col], y_pred[:, col]) for col in range(y_true.shape[1])]
    )
    combined = np.sum(np.array(WEIGHTS_ARRAY) * per_target_r2)
    return combined, per_target_r2

# Weighted MSE Loss
class WeightedMSELoss(nn.Module):
    """Mean squared error per target column, combined with ``WEIGHTS_ARRAY``.

    The weights are held as a non-persistent buffer: they follow the module
    through ``.to(...)`` / ``.cuda()`` like any other module tensor, but they
    are not written to ``state_dict()`` (which stays empty, as before).
    """

    def __init__(self):
        super(WeightedMSELoss, self).__init__()
        # Weights based on competition metric importance, created on the
        # notebook's global device so the loss is usable right away.
        self.register_buffer(
            'weights',
            torch.tensor(WEIGHTS_ARRAY).to(device),
            persistent=False,
        )

    def forward(self, predictions, targets):
        """Return the scalar weighted loss.

        Args:
            predictions: Tensor with one column per target.
            targets: Tensor with the same layout as ``predictions``.
        """
        # Average the squared error over the batch dimension only, keeping
        # one value per target so each can receive its own weight.
        mse_per_target = torch.mean((predictions - targets) ** 2, dim=0)
        weighted_mse = torch.sum(self.weights * mse_per_target)
        return weighted_mse

# Training setup: loss, optimizer and learning-rate schedule
criterion = WeightedMSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# mode='max': the value passed to scheduler.step() is a score, so a plateau
# means the score has stopped increasing.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=SCHEDULER_FACTOR, patience=SCHEDULER_PATIENCE
)

# Per-epoch history: losses and competition scores for both splits
train_losses, val_losses = [], []
train_scores, val_scores = [], []

print("Using Weighted MSE Loss with competition weights:")
for name, weight in TARGET_WEIGHTS.items():
    print(f"  {name}: {weight}")

In [None]:
# Training loop with competition metric tracking.
# Each epoch runs one optimisation pass over train_loader and one evaluation
# pass over val_loader, appends the epoch's loss and competition score for both
# splits to the history lists, and finally steps the LR scheduler.
# NOTE(review): no checkpoint is kept in this loop, so after it finishes
# `model` holds the weights of the last epoch, not of the best-scoring one.
for epoch in range(NUM_EPOCHS):
    # Training phase
    model.train()
    train_loss = 0.0
    train_preds_epoch = []
    train_targets_epoch = []

    for images, targets in tqdm(train_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS} [Train]'):
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # The criterion averages over the batch, so scale by the batch size
        # to accumulate a per-sample sum (the last batch may be smaller).
        train_loss += loss.item() * images.size(0)
        # These outputs come from the forward pass made in train mode.
        train_preds_epoch.append(outputs.detach().cpu().numpy())
        train_targets_epoch.append(targets.cpu().numpy())

    # Per-sample average over the whole training set
    train_loss = train_loss / len(train_dataset)
    train_losses.append(train_loss)

    # Calculate training competition score
    train_preds_epoch = np.vstack(train_preds_epoch)
    train_targets_epoch = np.vstack(train_targets_epoch)
    train_comp_score, _ = competition_score(train_targets_epoch, train_preds_epoch)
    train_scores.append(train_comp_score)

    # Validation phase (eval mode, no gradient tracking)
    model.eval()
    val_loss = 0.0
    val_preds_epoch = []
    val_targets_epoch = []

    with torch.no_grad():
        for images, targets in tqdm(val_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS} [Val]'):
            images = images.to(device)
            targets = targets.to(device)

            outputs = model(images)
            loss = criterion(outputs, targets)

            # Same batch-size weighting as in the training phase
            val_loss += loss.item() * images.size(0)
            val_preds_epoch.append(outputs.cpu().numpy())
            val_targets_epoch.append(targets.cpu().numpy())

    val_loss = val_loss / len(val_dataset)
    val_losses.append(val_loss)

    # Calculate validation competition score
    val_preds_epoch = np.vstack(val_preds_epoch)
    val_targets_epoch = np.vstack(val_targets_epoch)
    val_comp_score, _ = competition_score(val_targets_epoch, val_preds_epoch)
    val_scores.append(val_comp_score)

    print(f'Epoch {epoch+1}/{NUM_EPOCHS} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f} | Train Score: {train_comp_score:.4f}, Val Score: {val_comp_score:.4f}')

    # Scheduler based on validation competition score (it was created with
    # mode='max', so the LR drops when this score stops improving)
    scheduler.step(val_comp_score)

print("\nTraining completed!")

In [None]:
# Plot training curves: loss on the left panel, competition score on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# (axis, train series, train label, val series, val label, y label, title)
_panels = (
    (ax1, train_losses, 'Train Loss', val_losses, 'Validation Loss',
     'Weighted MSE Loss', 'Training and Validation Loss'),
    (ax2, train_scores, 'Train Score', val_scores, 'Validation Score',
     'Competition Score (Weighted R²)', 'Competition Metric Over Epochs'),
)

for _axis, _train, _train_label, _val, _val_label, _ylabel, _title in _panels:
    _axis.plot(_train, label=_train_label, marker='o', markersize=3)
    _axis.plot(_val, label=_val_label, marker='o', markersize=3)
    _axis.set_xlabel('Epoch')
    _axis.set_ylabel(_ylabel)
    _axis.set_title(_title)
    _axis.legend()
    _axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Epochs are reported 1-based, hence the +1 on the argmax position
_best_epoch = np.argmax(val_scores) + 1
print(f"\nBest Validation Score: {max(val_scores):.4f} at epoch {_best_epoch}")

In [None]:
# Evaluate on train and validation sets with competition metrics
def evaluate_model(model, dataloader, dataset_name):
    """Run ``model`` over ``dataloader`` and print a metrics report.

    Predictions and targets are mapped back through
    ``target_scaler.inverse_transform`` before any metric is computed, so the
    printed MAE/RMSE values and the returned arrays are on the original scale.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        dataloader: Iterable of ``(images, targets)`` batches.
        dataset_name: Label used in the report header.

    Returns:
        ``(all_preds, all_targets)`` as denormalized 2-D arrays with one
        column per entry of ``TARGET_NAMES``.
    """
    model.eval()
    pred_batches, target_batches = [], []

    with torch.no_grad():
        for batch_images, batch_targets in dataloader:
            batch_out = model(batch_images.to(device))
            pred_batches.append(batch_out.cpu().numpy())
            # Targets never leave the CPU here, so they convert directly
            target_batches.append(batch_targets.numpy())

    # Stack the batches and undo the target standardization
    all_preds = target_scaler.inverse_transform(np.vstack(pred_batches))
    all_targets = target_scaler.inverse_transform(np.vstack(target_batches))

    # Headline number: the competition score
    comp_score, r2_per_target = competition_score(all_targets, all_preds)

    rule = "=" * 80
    print(f"\n{dataset_name} Set Metrics:")
    print(rule)
    print(f"{'COMPETITION SCORE (Weighted R²):':<40} {comp_score:>10.4f}")
    print(rule)

    # One line per target
    for col, (name, weight) in enumerate(zip(TARGET_NAMES, WEIGHTS_ARRAY)):
        truth_col = all_targets[:, col]
        pred_col = all_preds[:, col]
        mae = mean_absolute_error(truth_col, pred_col)
        rmse = np.sqrt(mean_squared_error(truth_col, pred_col))
        r2 = r2_per_target[col]

        print(f"{name:15s} (w={weight:.1f}) - MAE: {mae:8.4f}, RMSE: {rmse:8.4f}, R²: {r2:7.4f}")

    # Pooled metrics over every value, ignoring the target weights
    flat_targets = all_targets.flatten()
    flat_preds = all_preds.flatten()
    overall_mae = mean_absolute_error(flat_targets, flat_preds)
    overall_rmse = np.sqrt(mean_squared_error(flat_targets, flat_preds))
    overall_r2 = r2_score(flat_targets, flat_preds)

    print("-" * 80)
    print(f"{'Overall (unweighted)':15s}     - MAE: {overall_mae:8.4f}, RMSE: {overall_rmse:8.4f}, R²: {overall_r2:7.4f}")
    print(rule)

    return all_preds, all_targets

# Evaluate on both sets. Each call prints its report and returns the
# denormalized predictions and targets; the validation pair is reused by the
# scatter plots that follow.
train_preds, train_targets = evaluate_model(model, train_loader, "Training")
val_preds, val_targets = evaluate_model(model, val_loader, "Validation")

In [None]:
# Visualize predictions vs actual values on the validation set, one panel per
# target, laid out on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for i, (ax, name) in enumerate(zip(axes, TARGET_NAMES)):
    actual = val_targets[:, i]
    predicted = val_preds[:, i]

    # Validation points
    ax.scatter(actual, predicted, alpha=0.5, s=20, label='Validation')

    # Diagonal y = x across the joint range of actual and predicted values
    min_val = min(actual.min(), predicted.min())
    max_val = max(actual.max(), predicted.max())
    diagonal = [min_val, max_val]
    ax.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')

    ax.set_xlabel('Actual Value (g)')
    ax.set_ylabel('Predicted Value (g)')
    ax.set_title(f'{name} - Predictions vs Actual')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Five targets on six axes: hide the unused last panel
axes[5].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Generate submission for test set.
# This part loads TEST_CSV, prints a short overview of it, and reduces it to
# one row per distinct image_path so that each image is predicted only once.
print("="*80)
print("GENERATING TEST PREDICTIONS FOR SUBMISSION")
print("="*80)

# Load test data
test_df = pd.read_csv(TEST_CSV)
print(f"\nTest samples: {len(test_df)}")
print(f"Test data columns: {list(test_df.columns)}")
print(f"\nSample test data:")
print(test_df.head())

# Get unique test images (since each image has 5 rows in test.csv).
# reset_index(drop=True) renumbers the rows 0..n-1, so a row's index label is
# also its position in the prediction array built from this frame.
test_images = test_df[['image_path']].drop_duplicates().reset_index(drop=True)
print(f"\nUnique test images: {len(test_images)}")

# Create test dataset (without targets, image only)
class TestDataset(Dataset):
    """Image-only dataset for inference; items are images without targets.

    Args:
        df: Table with an ``image_path`` column. Its index is reset, so any
            input index works.
        data_dir: Directory that each ``image_path`` value is joined onto.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, df, data_dir=DATA_DIR, transform=None):
        self.df = df.reset_index(drop=True)
        self.data_dir = data_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load image ``idx``, apply the transform if set, and return it."""
        img_path = os.path.join(self.data_dir, self.df.loc[idx, 'image_path'])

        # Open inside a context manager so the file handle is released
        # deterministically (also when decoding fails). convert() returns an
        # independent in-memory image, so it stays valid after the file closes.
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        return image

# Create test dataset and loader. shuffle=False keeps the batches in the row
# order of test_images, so row i of the stacked predictions belongs to row i
# of test_images.
test_dataset = TestDataset(test_images, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

# Make predictions
print(f"\n" + "="*80)
print("MAKING PREDICTIONS...")
print("="*80)

# Eval mode and no gradient tracking for inference
model.eval()
test_preds = []

with torch.no_grad():
    for images in test_loader:
        images = images.to(device)
        outputs = model(images).cpu().numpy()
        test_preds.append(outputs)

# One row per test image, one column per target in TARGET_NAMES order
test_preds = np.vstack(test_preds)

# Denormalize predictions to original scale
test_preds = target_scaler.inverse_transform(test_preds)

# Print every prediction, grouped by image
print(f"\nPredictions shape: {test_preds.shape}")
print(f"Predictions for test images:")
for i, img_path in enumerate(test_images['image_path']):
    print(f"\n{img_path}:")
    for j, target_name in enumerate(TARGET_NAMES):
        print(f"  {target_name}: {test_preds[i, j]:.4f}")

# Create submission dataframe matching the required format:
# one row per test.csv row, with columns sample_id and target.
print(f"\n" + "="*80)
print("CREATING SUBMISSION FILE...")
print("="*80)

# Build both lookups once. Scanning test_images with a boolean mask for every
# test row is quadratic in the number of rows; a dict lookup is constant time.
# setdefault keeps the first index seen for a path, matching `.index[0]`.
image_row_by_path = {}
for row_label, path in zip(test_images.index, test_images['image_path']):
    image_row_by_path.setdefault(path, row_label)
target_col_by_name = {target: col for col, target in enumerate(TARGET_NAMES)}

# Pick, for each (image, target) row of test.csv, the matching cell of the
# prediction matrix. An unknown image path or target name raises KeyError.
submission_data = [
    {
        'sample_id': sample_id,
        'target': test_preds[image_row_by_path[image_path], target_col_by_name[target_name]],
    }
    for sample_id, image_path, target_name in zip(
        test_df['sample_id'], test_df['image_path'], test_df['target_name']
    )
]

submission_df = pd.DataFrame(submission_data)

# Save submission
submission_path = 'submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"\nSUBMISSION SAVED: {submission_path}")
print("="*80)
print(f"\nSubmission file preview:")
print(submission_df.head())