# 🌿 CSIRO Biomass Prediction — Deep Learning Baseline (ResNet34)
In this notebook, we build a deep learning model using a ResNet34 backbone to predict various biomass targets from RGB images.  
The pipeline covers:
1. Exploratory Data Analysis (EDA)
2. Custom PyTorch Dataset
3. Model Architecture (StableResNet)
4. Training & Validation Loops
5. Model Evaluation & Visualization
6. Test Prediction Generation


#  Imports

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from PIL import Image
from collections import defaultdict
from torchvision.models import resnet34
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# --- Reproducibility ---
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic kernel selection.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Local import: the notebook's import cell does not pull in `random`,
    # and neither NumPy nor PyTorch seeds Python's built-in generator.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current one; it is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can select different kernels from run to run, which would
    # undo the deterministic flag above.
    torch.backends.cudnn.benchmark = False


set_seed(42)

# 1️⃣ Data Loading and Exploration

In [None]:
print("üì• Loading dataset...")
train_df = pd.read_csv('/kaggle/input/csiro-biomass/train.csv')
print(f"Training data shape: {train_df.shape}")
print(train_df.head())

# --- Target Statistics ---
print("\nüìä Target Value Statistics:")
print(train_df['target'].describe())

# --- EDA: Distributions ---
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
train_df['target_name'].value_counts().plot(kind='bar', color='teal')
plt.title('Target Name Distribution')
plt.xticks(rotation=45)

plt.subplot(1, 3, 2)
sns.histplot(train_df['target'], bins=50, kde=True, color='green')
plt.title('Target Value Distribution')

plt.subplot(1, 3, 3)
sns.boxplot(x='target_name', y='target', data=train_df, palette='Set2')
plt.title('Target Values by Type')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

#  Outlier Analysis 
outlier_threshold = train_df['target'].quantile(0.99)
outliers = train_df[train_df['target'] > outlier_threshold]
print(f"99th percentile threshold: {outlier_threshold:.2f}")
print(f"Number of outliers: {len(outliers)}")

#  Custom Dataset Class

In [None]:
class BiomassDataset(Dataset):
    """Image/target dataset backed by a long-format DataFrame.

    Every row of ``df`` names an image through ``image_path`` (relative to
    ``images_dir``). In training mode a row must also provide
    ``target_name`` and ``target``; in test mode it must provide
    ``sample_id``.

    Args:
        df: Table describing the samples; its index is reset on construction.
        images_dir: Directory that ``image_path`` values are joined onto.
        transform: Optional callable applied to the loaded RGB PIL image.
        is_test: When True, items are ``(image, sample_id)``; otherwise
            ``(image, target, target_type)``.
    """

    # Integer code for each target name. The training loop uses this code to
    # pick the matching column of the model output.
    TARGET_MAPPING = {
        'Dry_Green_g': 0, 'Dry_Dead_g': 1, 'Dry_Clover_g': 2,
        'GDM_g': 3, 'Dry_Total_g': 4
    }

    def __init__(self, df, images_dir, transform=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.images_dir = images_dir
        self.transform = transform
        self.is_test = is_test
        # Decoded images keyed by full path, so a file referenced by several
        # rows is only read from disk once per dataset instance.
        self.image_cache = {}
        # Available in both modes (a per-instance copy) so inference code can
        # reuse the mapping instead of redefining it.
        self.target_mapping = dict(self.TARGET_MAPPING)

    def __len__(self):
        return len(self.df)

    def _load_image(self, image_path):
        """Return the RGB image at ``image_path``, using the cache when possible.

        If the file cannot be read, the error is printed and a white
        2000x1000 placeholder is returned (and not cached), so one bad file
        does not abort a whole run.
        """
        cached = self.image_cache.get(image_path)
        if cached is not None:
            return cached
        try:
            # The context manager releases the file handle deterministically;
            # convert() loads the pixels and returns an independent image.
            with Image.open(image_path) as raw:
                image = raw.convert('RGB')
        except Exception as e:  # deliberate best-effort fallback
            print(f"Error loading {image_path}: {e}")
            return Image.new('RGB', (2000, 1000), color='white')
        self.image_cache[image_path] = image
        return image

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            ``(image, sample_id)`` in test mode, otherwise
            ``(image, target, target_type)`` where ``target`` is a float32
            scalar tensor and ``target_type`` is the integer code of the
            row's ``target_name``.

        Raises:
            KeyError: If a training row has a ``target_name`` that is not in
                ``target_mapping``.
        """
        row = self.df.iloc[idx]
        image_path = os.path.join(self.images_dir, row['image_path'])
        image = self._load_image(image_path)
        if self.transform:
            image = self.transform(image)
        if self.is_test:
            return image, row['sample_id']
        target_value = row['target']
        target_type = self.target_mapping[row['target_name']]
        return image, torch.tensor(target_value, dtype=torch.float32), target_type

#  Data Augmentation 
# torchvision's Resize takes (height, width). BiomassDataset's placeholder
# image is 2000 wide x 1000 tall, i.e. the photos are landscape, so the
# network input is 200 x 400 (H x W) to keep the 2:1 aspect ratio rather
# than squeezing the image into a portrait frame.
# NOTE(review): confirm the real image dimensions match the placeholder.
IMAGE_SIZE = (200, 400)

# Per-channel statistics passed to Normalize; shared by both pipelines so
# training and evaluation inputs are scaled identically.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Training pipeline: light augmentation (occasional horizontal flip, mild
# brightness/contrast jitter) before tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD)
])

# Validation/test pipeline: deterministic, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD)
])


#  Model Definition — StableResNet

In [None]:

class StableResNet(nn.Module):
    """ResNet34 backbone whose classifier is replaced by an MLP regression head.

    The head maps the backbone features through 256 and 128 hidden units to
    ``num_targets`` outputs, one per biomass target.

    Args:
        num_targets: Number of regression outputs.
        pretrained: When True, the backbone starts from the
            ``IMAGENET1K_V1`` weights; otherwise it is randomly initialised.
    """

    def __init__(self, num_targets=5, pretrained=False):
        super().__init__()
        self.backbone = resnet34(weights='IMAGENET1K_V1' if pretrained else None)
        in_features = self.backbone.fc.in_features
        self.backbone.fc = nn.Sequential(
            nn.Linear(in_features, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_targets)
        )
        # Re-initialising the backbone would throw away the ImageNet weights
        # that were just loaded, so only the new head is initialised then.
        self._initialize_weights(include_backbone=not pretrained)

    def _initialize_weights(self, include_backbone=True):
        """Initialise convolution and linear layers.

        Conv2d weights get Kaiming-normal initialisation; Linear weights are
        drawn from N(0, 0.01) and their biases are zeroed.

        Args:
            include_backbone: When True, every submodule is initialised.
                When False, only the regression head (``backbone.fc``) is
                touched and the backbone keeps its current weights.
        """
        modules = self.modules() if include_backbone else self.backbone.fc.modules()
        for m in modules:
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x, target_type=None):
        """Return the backbone output for ``x``.

        ``target_type`` is accepted for interface compatibility and ignored;
        callers select the relevant output column themselves.
        """
        return self.backbone(x)


#  Train | Validate Functions


In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device, max_grad_norm=1.0):
    """Run one optimisation pass over ``dataloader``.

    Each batch is ``(images, targets, types)``. The model produces one output
    per target type for every sample; only the column named by ``types`` is
    compared against ``targets``.

    Args:
        model: Network mapping an image batch to a 2-D output tensor.
        dataloader: Iterable of ``(images, targets, types)`` batches.
        criterion: Loss called as ``criterion(preds, targets)``.
        optimizer: Optimizer over ``model.parameters()``.
        device: Device the batch tensors are moved to.
        max_grad_norm: Gradient-norm clipping threshold; ``None`` disables
            clipping.

    Returns:
        Mean of the per-batch loss values.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for images, targets, types in dataloader:
        images, targets, types = images.to(device), targets.to(device), types.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        # Build the row index on the same device as the outputs so the
        # indexing does not mix CPU and accelerator tensors.
        rows = torch.arange(outputs.size(0), device=outputs.device)
        preds = outputs[rows, types]
        loss = criterion(preds, targets)
        loss.backward()
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        raise ValueError("train_epoch received an empty dataloader")
    return total_loss / num_batches

def validate_model(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating it.

    Each batch is ``(images, targets, types)``; for every sample only the
    output column named by ``types`` is scored.

    Args:
        model: Network mapping an image batch to a 2-D output tensor.
        dataloader: Iterable of ``(images, targets, types)`` batches.
        criterion: Loss called as ``criterion(preds, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(mean_batch_loss, mse, r2, preds_all, targs_all)`` where the last
        two are flat lists of per-sample predictions and targets.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss, num_batches = 0.0, 0
    preds_all, targs_all = [], []
    with torch.no_grad():
        for images, targets, types in dataloader:
            images, targets, types = images.to(device), targets.to(device), types.to(device)
            outputs = model(images)
            # Row index lives on the outputs' device to avoid mixing CPU and
            # accelerator tensors in one indexing operation.
            rows = torch.arange(outputs.size(0), device=outputs.device)
            preds = outputs[rows, types]
            loss = criterion(preds, targets)
            total_loss += loss.item()
            num_batches += 1
            preds_all.extend(preds.cpu().numpy())
            targs_all.extend(targets.cpu().numpy())
    if num_batches == 0:
        raise ValueError("validate_model received an empty dataloader")
    mse = mean_squared_error(targs_all, preds_all)
    r2 = r2_score(targs_all, preds_all)
    return total_loss / num_batches, mse, r2, preds_all, targs_all
    
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device,
                num_epochs=25, patience=7, checkpoint_path="best_model.pth"):
    """Train with early stopping and restore the best checkpoint.

    After every epoch the validation loss is passed to ``scheduler.step``.
    Whenever it improves, the model weights are written to
    ``checkpoint_path``; training stops once ``patience`` consecutive epochs
    bring no improvement.

    Args:
        model: Network to train (modified in place).
        train_loader: Batches for ``train_epoch``.
        val_loader: Batches for ``validate_model``.
        criterion: Loss function shared by training and validation.
        optimizer: Optimizer over ``model.parameters()``.
        scheduler: Scheduler whose ``step`` accepts the validation loss.
        device: Device used for training and for reloading the checkpoint.
        num_epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
        checkpoint_path: File the best weights are saved to.

    Returns:
        ``(train_hist, val_hist)``: per-epoch training and validation losses.
    """
    best_val_loss, counter = float('inf'), 0
    # Tracks whether THIS run wrote the checkpoint, so a stale file left by
    # an earlier run is never loaded by mistake.
    checkpoint_saved = False
    train_hist, val_hist = [], []
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, mse, r2, preds, targs = validate_model(model, val_loader, criterion, device)
        scheduler.step(val_loss)
        train_hist.append(train_loss)
        val_hist.append(val_loss)
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | MSE: {mse:.4f} | R²: {r2:.4f}")
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            counter = 0
            print("✅ New best model saved!")
        else:
            counter += 1
            if counter >= patience:
                print("⏹ Early stopping triggered.")
                break
    if checkpoint_saved:
        # map_location keeps the reload valid whatever device the tensors
        # were on when they were saved.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    else:
        # Happens when no epoch ran or the validation loss never improved
        # (for example it was NaN throughout); keep the current weights.
        print("No checkpoint was saved during this run; keeping the final weights.")
    return train_hist, val_hist


#  Data Split & Loader Setup

In [None]:
# Split by image rather than by row. Rows that share an image_path show the
# same photo, so a row-level split would place the same picture in both the
# training and validation sets and inflate the validation scores.
# NOTE(review): assumes train.csv holds several rows (one per target_name)
# per image; if every image_path is unique this is a plain random split.
unique_images = train_df['image_path'].unique()
train_images, val_images = train_test_split(
    unique_images,
    test_size=0.2,
    random_state=42
)

# Positional row indices for each side, kept under their original names.
val_mask = train_df['image_path'].isin(val_images).to_numpy()
train_idx = np.flatnonzero(~val_mask)
val_idx = np.flatnonzero(val_mask)
train_data = train_df.iloc[train_idx].reset_index(drop=True)
val_data = train_df.iloc[val_idx].reset_index(drop=True)

# Training rows get augmentation; validation rows use the deterministic pipeline.
train_ds = BiomassDataset(train_data, '/kaggle/input/csiro-biomass', train_transform)
val_ds = BiomassDataset(val_data, '/kaggle/input/csiro-biomass', val_transform)

# Only the training loader shuffles, so validation order is stable across epochs.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


#  Model Training

In [None]:

# Five outputs, one per biomass target; the backbone is trained from scratch
# (pretrained defaults to False).
model = StableResNet(num_targets=5).to(device)
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
# Halve the learning rate after 3 epochs without validation improvement.
# The `verbose` argument is deprecated since PyTorch 2.2 and may be rejected
# by newer releases, so it is no longer passed; read the current rate from
# optimizer.param_groups[0]['lr'] when needed.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)

print("🚀 Training started...")
train_losses, val_losses = train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device)



# Evaluation

In [None]:
# Score the restored model once more on the validation set and keep the
# per-sample predictions/targets for the diagnostic plots below.
model.eval()
val_loss, mse, r2, preds, targs = validate_model(model, val_loader, criterion, device)
print(f"\nFinal Validation Results → Loss: {val_loss:.4f} | MSE: {mse:.4f} | R²: {r2:.4f}")

plt.figure(figsize=(15,5))

# Panel 1: training vs validation loss per epoch.
plt.subplot(1,3,1)
plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.legend(); plt.title("Loss Curve")

# Panel 2: predictions against ground truth; the dashed red diagonal marks
# perfect agreement.
plt.subplot(1,3,2)
sns.scatterplot(x=targs, y=preds, alpha=0.6)
plt.plot([0, max(targs)], [0, max(targs)], 'r--')
plt.xlabel("Actual"); plt.ylabel("Predicted"); plt.title("Predicted vs Actual")

# Panel 3: histogram of signed errors (prediction minus target).
plt.subplot(1,3,3)
sns.histplot(np.array(preds)-np.array(targs), bins=30, color='purple')
plt.title("Error Distribution")

plt.tight_layout()
plt.show()



# Test Predictions & Submission


In [None]:
print("üì§ Generating test predictions...")
test_df = pd.read_csv('/kaggle/input/csiro-biomass/test.csv')
test_ds = BiomassDataset(test_df, '/kaggle/input/csiro-biomass', val_transform, is_test=True)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=2)

model.eval()
predictions, sample_ids = [], []
target_mapping = {'Dry_Green_g': 0, 'Dry_Dead_g': 1, 'Dry_Clover_g': 2, 'GDM_g': 3, 'Dry_Total_g': 4}

with torch.no_grad():
    for images, batch_ids in test_loader:
        images = images.to(device)
        outputs = model(images)
        for i, sid in enumerate(batch_ids):
            row = test_df[test_df['sample_id'] == sid].iloc[0]
            idx = target_mapping[row['target_name']]
            pred = max(0, outputs[i, idx].item())
            predictions.append(pred)
            sample_ids.append(sid)

submission = pd.DataFrame({'sample_id': sample_ids, 'target': predictions})
submission.to_csv('submission.csv', index=False)
print("‚úÖ Submission saved as 'submission.csv'")
print(submission.head())
