### 🚀 Inference & Submission Notebook Link

**This training process led to the final model, which is used for predictions in a separate inference notebook.**

[**Go to the Inference Notebook here**](https://www.kaggle.com/code/tasmim/lb-0-54-csiro-image2biomass-prediction-infer?scriptVersionId=272147088)


In [None]:
# ============================================================================
# CSIRO Image2Biomass Prediction - Complete End-to-End Pipeline
# ============================================================================
# This pipeline predicts 5 biomass components from pasture images:
# - Dry_Green_g, Dry_Dead_g, Dry_Clover_g, GDM_g, Dry_Total_g
# ============================================================================

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import timm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import cv2
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================
class CFG:
    """Class-level configuration shared by every stage of the pipeline."""

    # --- Input locations (Kaggle dataset mount) ---
    train_csv = '/kaggle/input/csiro-biomass/train.csv'
    test_csv = '/kaggle/input/csiro-biomass/test.csv'
    train_dir = '/kaggle/input/csiro-biomass/train'
    test_dir = '/kaggle/input/csiro-biomass/test/'

    # --- Backbone ---
    model_name = 'tf_efficientnetv2_m'  # timm model identifier
    img_size = 512                      # images are resized to img_size x img_size
    pretrained = True                   # forwarded to timm.create_model

    # --- Optimisation / cross-validation ---
    n_folds = 5
    seed = 42
    epochs = 50
    batch_size = 16
    num_workers = 4
    lr = 3e-4            # base learning rate handed to the optimizer
    weight_decay = 1e-5
    warmup_epochs = 2    # number of epochs covered by the warm-up rule

    # --- Test-time augmentation settings (not consumed in this part of the file) ---
    use_tta = True
    tta_steps = 5

    # --- Regression targets and their per-target weights (same order) ---
    # The original author notes that the weights come from the evaluation criteria.
    targets = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']
    target_weights = [0.1, 0.1, 0.1, 0.2, 0.5]

    # --- Standardise targets (zero mean / unit variance) before training ---
    use_target_scaling = True

    # --- Compute device: GPU when one is visible, otherwise CPU ---
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Set random seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator this pipeline can draw from.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the file's top-level import block does not provide `random`.
    import random

    # Third-party code that uses the stdlib generator would otherwise stay
    # unseeded and differ from run to run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade some speed for run-to-run reproducible convolution algorithms.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed once at import time, before any fold split, dataset or model is created.
set_seed(CFG.seed)

# ============================================================================
# DATA PREPROCESSING
# ============================================================================
def prepare_data(train_csv_path):
    """Load the long-format training CSV and return one row per image.

    The input has one row per (image, target) pair with the target name in
    ``target_name`` and its value in ``target``. The result has one row per
    image with the metadata columns, one column per target and an integer
    ``biomass_bin`` column used for stratified fold assignment.

    Args:
        train_csv_path: Path of the long-format training CSV.

    Returns:
        A DataFrame with a fresh 0..n-1 index, sorted by ``image_id``.

    Raises:
        ValueError: If the CSV contains no rows.
    """
    df = pd.read_csv(train_csv_path)
    if df.empty:
        raise ValueError(f"No rows found in {train_csv_path}")

    # The image id is the part of sample_id before the first '__'. Splitting is
    # a no-op for ids without the separator, so every row is handled the same
    # way instead of deciding from the first row only.
    df['image_id'] = df['sample_id'].astype(str).str.split('__').str[0]

    # Per-image metadata (expected to be identical across an image's rows).
    metadata_cols = ['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']

    # Pivot on image_id alone. Using the metadata columns as pivot keys would
    # silently drop every image that has a NaN in any of them, because the
    # underlying groupby discards NaN keys.
    targets_wide = df.pivot_table(
        index='image_id',
        columns='target_name',
        values='target',
        aggfunc='first'  # Use first value if duplicates exist
    ).reset_index()

    # Take the metadata from the first row of each image and attach the targets.
    metadata = df.drop_duplicates(subset='image_id')[['image_id'] + metadata_cols]
    df_pivot = (
        metadata.merge(targets_wide, on='image_id', how='inner')
        .sort_values('image_id')
        .reset_index(drop=True)
    )

    # Ensure all target columns exist and replace missing values with 0
    for target in CFG.targets:
        if target not in df_pivot.columns:
            df_pivot[target] = 0.0
        else:
            df_pivot[target] = df_pivot[target].fillna(0.0)

    # Stratification bins over total biomass: quantile bins when possible,
    # equal-width bins as a fallback when qcut cannot build them.
    try:
        df_pivot['biomass_bin'] = pd.qcut(
            df_pivot['Dry_Total_g'],
            q=10,
            labels=False,
            duplicates='drop'
        )
    except ValueError:
        df_pivot['biomass_bin'] = pd.cut(
            df_pivot['Dry_Total_g'],
            bins=10,
            labels=False
        )

    # Any value that could not be binned falls into bin 0
    df_pivot['biomass_bin'] = df_pivot['biomass_bin'].fillna(0).astype(int)

    print(f"Prepared {len(df_pivot)} unique images")
    print(f"Target columns: {CFG.targets}")
    print(f"Sample biomass statistics:")
    for target in CFG.targets:
        print(f"  {target}: mean={df_pivot[target].mean():.2f}, std={df_pivot[target].std():.2f}")

    return df_pivot

# ============================================================================
# DATASET CLASS
# ============================================================================
class BiomassDataset(Dataset):
    """Dataset yielding a pasture image, its tabular features and (optionally) targets.

    Each item is ``(image, tabular)`` when ``is_test`` is true and
    ``(image, tabular, targets)`` otherwise. ``tabular`` holds the NDVI and
    height columns; ``targets`` holds the five biomass columns in the order of
    ``_TARGET_COLUMNS``.

    Args:
        df: DataFrame with an ``image_path`` column, the two tabular columns
            and, unless ``is_test`` is true, the five target columns.
        img_dir: Directory containing the image files.
        transform: Optional callable invoked as ``transform(image=...)`` whose
            result is indexed with ``'image'``.
        is_test: When true, no targets are read or returned.
        scaler: Optional fitted scaler exposing ``transform``. When omitted, a
            ``StandardScaler`` is fitted on this dataset's tabular features in
            training mode, and the raw features are used in test mode.

    Attributes:
        scaler: The scaler applied to the tabular features, or ``None`` when
            the features were left unscaled.
        tabular_features: Array of per-row tabular features after scaling.
    """

    _TABULAR_COLUMNS = ['Pre_GSHH_NDVI', 'Height_Ave_cm']
    # Order of the returned target vector.
    _TARGET_COLUMNS = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']

    def __init__(self, df, img_dir, transform=None, is_test=False, scaler=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test
        # Always defined, so callers can read `.scaler` in every mode.
        self.scaler = scaler

        # Missing NDVI / height values are replaced with 0 before scaling.
        tabular_data = self.df[self._TABULAR_COLUMNS].fillna(0).values

        if scaler is not None:
            # Reuse statistics fitted elsewhere (e.g. on the training split).
            self.tabular_features = scaler.transform(tabular_data)
        elif is_test:
            # No scaler available at test time: keep the raw values.
            self.tabular_features = tabular_data
        else:
            self.scaler = StandardScaler()
            self.tabular_features = self.scaler.fit_transform(tabular_data)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Only the file name of image_path is used; the directory comes from img_dir.
        img_path = f"{self.img_dir}/{row['image_path'].split('/')[-1]}"
        image = cv2.imread(img_path)

        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise ValueError(f"Failed to load image: {img_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform:
            image = self.transform(image=image)['image']

        tabular = torch.tensor(self.tabular_features[idx], dtype=torch.float32)

        if self.is_test:
            return image, tabular

        targets = torch.tensor(
            [row[column] for column in self._TARGET_COLUMNS],
            dtype=torch.float32,
        )
        return image, tabular, targets

# ============================================================================
# AUGMENTATION STRATEGIES
# ============================================================================
def get_train_transforms():
    """Build the training augmentation pipeline.

    Order of stages: resize, geometric transforms, one optional colour
    transform, one optional quality degradation, coarse dropout, normalisation
    with ImageNet mean/std, and conversion to a tensor.
    """
    side = CFG.img_size

    # Flips, 90-degree rotations and a mild affine jitter.
    geometric = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=15, p=0.5),
    ]

    # Exactly one colour transform is applied with probability 0.7.
    colour = A.OneOf([
        A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=1),
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=1),
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=1),
    ], p=0.7)

    # Exactly one noise/blur degradation is applied with probability 0.3.
    degradation = A.OneOf([
        A.GaussNoise(var_limit=(10.0, 50.0), p=1),
        A.GaussianBlur(blur_limit=(3, 7), p=1),
        A.MotionBlur(blur_limit=5, p=1),
    ], p=0.3)

    finishing = [
        A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.3),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]

    return A.Compose([A.Resize(side, side), *geometric, colour, degradation, *finishing])

def get_valid_transforms():
    """Build the deterministic validation pipeline: resize, normalise, to tensor."""
    side = CFG.img_size
    resize = A.Resize(side, side)
    normalise = A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    return A.Compose([resize, normalise, ToTensorV2()])

# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================
class BiomassModel(nn.Module):
    """Two-branch regression model for the five biomass targets.

    Components:
    1. A timm backbone (classifier removed, average pooling) for image features.
    2. An MLP that embeds the two tabular features into 128 dimensions.
    3. A fusion MLP over the concatenated image and tabular embeddings.
    4. Five single-output linear heads, one per target.

    Args:
        model_name: timm model identifier for the image backbone.
        pretrained: Whether timm should load pretrained backbone weights.
    """

    def __init__(self, model_name, pretrained=True):
        super(BiomassModel, self).__init__()

        # Image encoder
        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=0,  # Remove classification head
            global_pool='avg'
        )

        # Probe the backbone's output width with a dummy batch. The probe runs
        # in eval mode: a training-mode forward pass would update the BatchNorm
        # running statistics with statistics of random noise, even under no_grad.
        was_training = self.backbone.training
        self.backbone.eval()
        with torch.no_grad():
            dummy_input = torch.randn(1, 3, CFG.img_size, CFG.img_size)
            img_features = self.backbone(dummy_input).shape[1]
        self.backbone.train(was_training)

        # Tabular feature encoder: 2 -> 64 -> 128
        self.tabular_encoder = nn.Sequential(
            nn.Linear(2, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        # Fusion layer: (image features + 128) -> 512 -> 256
        fusion_dim = img_features + 128
        self.fusion = nn.Sequential(
            nn.Linear(fusion_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        # One linear head per target. Attribute names are part of the
        # state_dict layout, so they must stay stable for saved checkpoints.
        self.head_green = nn.Linear(256, 1)
        self.head_dead = nn.Linear(256, 1)
        self.head_clover = nn.Linear(256, 1)
        self.head_gdm = nn.Linear(256, 1)
        self.head_total = nn.Linear(256, 1)

    def forward(self, image, tabular):
        """Return predictions of shape [batch_size, 5].

        Output columns are ordered green, dead, clover, GDM, total.

        Args:
            image: Image batch accepted by the backbone.
            tabular: Batch of tabular features with 2 values per sample.
        """
        img_features = self.backbone(image)
        tab_features = self.tabular_encoder(tabular)

        # Join both modalities along the feature axis, then fuse.
        combined = torch.cat([img_features, tab_features], dim=1)
        fused = self.fusion(combined)

        out_green = self.head_green(fused)
        out_dead = self.head_dead(fused)
        out_clover = self.head_clover(fused)
        out_gdm = self.head_gdm(fused)
        out_total = self.head_total(fused)

        # Each head yields [batch_size, 1]; concatenate to [batch_size, 5]
        outputs = torch.cat([out_green, out_dead, out_clover, out_gdm, out_total], dim=1)

        return outputs

# ============================================================================
# LOSS FUNCTION
# ============================================================================
class WeightedMSELoss(nn.Module):
    """Mean squared error with a fixed weight per target column.

    The loss is the mean, over every element of the batch, of
    ``weight[j] * (prediction[i, j] - target[i, j]) ** 2``.

    Args:
        weights: Sequence with one weight per target column.
    """

    def __init__(self, weights):
        super(WeightedMSELoss, self).__init__()
        # Registered as a non-persistent buffer: it follows the module on
        # `.to(device)` but adds nothing to the state_dict.
        self.register_buffer(
            'weights',
            torch.tensor(weights, dtype=torch.float32),
            persistent=False,
        )

    def forward(self, predictions, targets):
        """Return the scalar weighted MSE for a batch.

        Args:
            predictions: Tensor whose last dimension matches ``weights``.
            targets: Tensor broadcastable against ``predictions``.
        """
        # Callers may not have moved the criterion itself, so align the weights
        # with the predictions locally instead of mutating module state.
        weights = self.weights.to(predictions.device)

        squared_error = (predictions - targets) ** 2
        weighted = squared_error * weights.unsqueeze(0)
        return weighted.mean()

# ============================================================================
# METRIC CALCULATION (R² Score)
# ============================================================================
def calculate_r2_score(y_true, y_pred):
    """Return R² (coefficient of determination) of ``y_pred`` against ``y_true``.

    R² = 1 - SS_res / SS_tot, computed in float64 over the flattened inputs.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values with the same number of elements.

    Returns:
        The score as a Python float; 0.0 when ``y_true`` has no variance
        (which includes empty input).

    Raises:
        ValueError: If the two inputs differ in number of elements.
    """
    # Flatten so that (n,) vs (n, 1) inputs are compared element-wise rather
    # than broadcast to an (n, n) matrix; float64 avoids float32 accumulation.
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()

    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        return 0.0

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)

    # R² is undefined for a constant target; report 0.0 in that case.
    if ss_tot == 0:
        return 0.0

    return float(1.0 - ss_res / ss_tot)

def calculate_weighted_r2(y_true, y_pred, weights):
    """Return the weighted sum of per-target R² scores.

    One R² is computed per column, for as many leading columns as there are
    weights. The original author describes this as matching the competition
    evaluation metric.

    Args:
        y_true: 2-D array-like of observed values, one column per target.
        y_pred: 2-D array-like of predictions with the same layout.
        weights: Sequence with one weight per target column.

    Returns:
        A tuple ``(weighted_score, scores)`` where ``scores`` is the list of
        per-target R² values in column order.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # The number of targets follows the weights instead of a hard-coded 5.
    scores = [
        calculate_r2_score(y_true[:, column], y_pred[:, column])
        for column in range(len(weights))
    ]

    weighted_score = sum(s * w for s, w in zip(scores, weights))
    return weighted_score, scores

# ============================================================================
# TRAINING FUNCTION
# ============================================================================
def train_epoch(model, loader, optimizer, criterion, device, scaler):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(images, tabular)``.
        loader: Iterable of ``(images, tabular, targets)`` batches.
        optimizer: Optimizer stepped once per batch through ``scaler``.
        criterion: Loss callable taking ``(outputs, targets)``.
        device: Device the batches are moved to.
        scaler: Gradient scaler used for the mixed-precision backward pass.

    Returns:
        Sum of per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0.0

    # CUDA autocast only makes sense when the batches live on a CUDA device.
    use_amp = torch.device(device).type == 'cuda'

    pbar = tqdm(loader, desc='Training')
    for batch_idx, (images, tabular, targets) in enumerate(pbar):
        images = images.to(device)
        tabular = tabular.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()

        # Mixed precision forward pass
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(images, tabular)
            loss = criterion(outputs, targets)

        # Scale the loss before backward; the scaler unscales before stepping.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        # Average over batches processed so far. `pbar.n` is only refreshed
        # when the bar redraws, so it is not a reliable batch counter.
        pbar.set_postfix({'loss': running_loss / (batch_idx + 1)})

        # Debug: Print first batch predictions
        if batch_idx == 0:
            print(f"\n  Sample predictions: {outputs[0].detach().cpu().numpy()}")
            print(f"  Sample targets:     {targets[0].cpu().numpy()}")

    return running_loss / len(loader)

# ============================================================================
# VALIDATION FUNCTION
# ============================================================================
def validate_epoch(model, loader, criterion, device):
    """Evaluate the model on a loader and score it with weighted R².

    Returns:
        A tuple ``(mean_loss, weighted_r2, individual_r2, all_preds,
        all_targets)`` where the last two are the stacked predictions and
        targets for the whole loader.
    """
    model.eval()
    loss_sum = 0.0
    pred_chunks, target_chunks = [], []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for images, tabular, targets in tqdm(loader, desc='Validation'):
            images, tabular, targets = (
                images.to(device),
                tabular.to(device),
                targets.to(device),
            )

            outputs = model(images, tabular)
            loss_sum += criterion(outputs, targets).item()

            pred_chunks.append(outputs.cpu().numpy())
            target_chunks.append(targets.cpu().numpy())

    # Stack the per-batch arrays row-wise
    all_preds = np.vstack(pred_chunks)
    all_targets = np.vstack(target_chunks)

    weighted_r2, individual_r2 = calculate_weighted_r2(
        all_targets, all_preds, CFG.target_weights
    )

    mean_loss = loss_sum / len(loader)
    return mean_loss, weighted_r2, individual_r2, all_preds, all_targets

# ============================================================================
# TRAINING LOOP (K-FOLD CROSS-VALIDATION)
# ============================================================================
def _warmup_cosine_factor(epoch, warmup_epochs, base_lr, t_0=10, t_mult=2, eta_min=1e-6):
    """Return the LR multiplier for `epoch`: linear warm-up, then cosine warm restarts."""
    import math

    if epoch < warmup_epochs:
        return (epoch + 1) / warmup_epochs

    # Locate the position inside the current restart cycle.
    t_cur = epoch - warmup_epochs
    t_i = t_0
    while t_cur >= t_i:
        t_cur -= t_i
        t_i *= t_mult

    # Same curve as cosine annealing between base_lr and eta_min, expressed
    # as a fraction of base_lr.
    floor = eta_min / base_lr
    return floor + (1.0 - floor) * 0.5 * (1.0 + math.cos(math.pi * t_cur / t_i))


def train_kfold(df, fold):
    """Train and validate the model on one cross-validation fold.

    Rows whose ``fold`` column equals ``fold`` form the validation split; all
    other rows form the training split. The best checkpoint (by weighted R² on
    the original target scale) is written to ``best_model_fold{fold}.pth``
    together with the tabular and target scalers.

    Args:
        df: Wide-format DataFrame with a ``fold`` column, the target columns
            and the columns read by ``BiomassDataset``.
        fold: Index of the fold held out for validation.

    Returns:
        The best validation score reached on this fold.
    """
    print(f"\n{'='*50}")
    print(f"Training Fold {fold + 1}/{CFG.n_folds}")
    print(f"{'='*50}")

    # Split data
    train_df = df[df['fold'] != fold].copy()
    valid_df = df[df['fold'] == fold].copy()

    print(f"Train size: {len(train_df)}, Valid size: {len(valid_df)}")

    # Report the target distribution of the training split
    print(f"\nTarget statistics (training set):")
    for target in CFG.targets:
        print(f"  {target}: mean={train_df[target].mean():.2f}, std={train_df[target].std():.2f}, "
              f"min={train_df[target].min():.2f}, max={train_df[target].max():.2f}")

    # Standardise targets with statistics from the training split only
    target_scaler = None
    if CFG.use_target_scaling:
        target_scaler = StandardScaler()
        target_values = train_df[CFG.targets].values
        target_scaler.fit(target_values)

        # Scale targets in dataframes
        train_df[CFG.targets] = target_scaler.transform(train_df[CFG.targets].values)
        valid_df[CFG.targets] = target_scaler.transform(valid_df[CFG.targets].values)
        print("\nâœ“ Targets scaled to zero mean and unit variance")

    # The validation set reuses the tabular scaler fitted on the training set
    train_dataset = BiomassDataset(
        train_df, CFG.train_dir, transform=get_train_transforms()
    )
    valid_dataset = BiomassDataset(
        valid_df, CFG.train_dir, transform=get_valid_transforms(),
        scaler=train_dataset.scaler
    )

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset, batch_size=CFG.batch_size,
        shuffle=True, num_workers=CFG.num_workers, pin_memory=True
    )
    valid_loader = DataLoader(
        valid_dataset, batch_size=CFG.batch_size * 2,
        shuffle=False, num_workers=CFG.num_workers, pin_memory=True
    )

    # Initialize model, loss, optimizer
    model = BiomassModel(CFG.model_name, CFG.pretrained).to(CFG.device)
    criterion = WeightedMSELoss(CFG.target_weights)
    optimizer = optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

    # A single scheduler covers warm-up and cosine warm restarts. Attaching two
    # schedulers to one optimizer does not work here: constructing the cosine
    # scheduler resets the LR to its base value and cancels the warm-up.
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda epoch: _warmup_cosine_factor(epoch, CFG.warmup_epochs, CFG.lr),
    )

    scaler = torch.cuda.amp.GradScaler()

    best_score = -np.inf
    patience_counter = 0
    patience = 13  # epochs without improvement before stopping early

    for epoch in range(CFG.epochs):
        print(f"\nEpoch {epoch + 1}/{CFG.epochs}")
        print(f"Learning rate: {optimizer.param_groups[0]['lr']:.6f}")

        # Train
        train_loss = train_epoch(model, train_loader, optimizer, criterion, CFG.device, scaler)

        # Validate
        valid_loss, weighted_r2, individual_r2, all_preds, all_targets = validate_epoch(
            model, valid_loader, criterion, CFG.device
        )

        # Scale predictions back if needed
        if target_scaler is not None:
            all_preds_original = target_scaler.inverse_transform(all_preds)
            all_targets_original = target_scaler.inverse_transform(all_targets)

            # Recalculate the score on the original target scale
            weighted_r2_original, individual_r2_original = calculate_weighted_r2(
                all_targets_original, all_preds_original, CFG.target_weights
            )

            print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")
            print(f"Weighted RÂ² (scaled): {weighted_r2:.4f}")
            print(f"Weighted RÂ² (original): {weighted_r2_original:.4f}")
            print(f"Individual RÂ² (original): {individual_r2_original}")

            # Use original scale for model selection
            score_to_use = weighted_r2_original
        else:
            print(f"Train Loss: {train_loss:.4f} | Valid Loss: {valid_loss:.4f}")
            print(f"Weighted RÂ²: {weighted_r2:.4f}")
            print(f"Individual RÂ²: {individual_r2}")
            score_to_use = weighted_r2

        # Advance the learning-rate schedule by one epoch
        scheduler.step()

        # Save best model
        if score_to_use > best_score:
            best_score = score_to_use
            # The scalers are stored with the weights so that inference can
            # reproduce the same input and target scaling.
            checkpoint = {
                'model_state_dict': model.state_dict(),
                'tabular_scaler': train_dataset.scaler,
                'target_scaler': target_scaler
            }
            torch.save(checkpoint, f'best_model_fold{fold}.pth')
            print(f"âœ“ Saved best model (RÂ²: {best_score:.4f})")
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    return best_score


# ============================================================================
# MAIN EXECUTION
# ============================================================================
def main():
    """Run the whole pipeline: load data, assign folds, train every fold, report CV.

    Raises:
        ValueError: If ``biomass_bin`` still contains NaN before the fold split.
    """
    # 1. Prepare data
    print("Loading and preparing data...")
    print(f"Reading from: {CFG.train_csv}")

    # Inspect the raw CSV first to decide whether it needs pivoting
    df_raw = pd.read_csv(CFG.train_csv)
    print(f"\nRaw data shape: {df_raw.shape}")
    print(f"Columns: {df_raw.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(df_raw.head(10))

    # Long format is recognised by the target_name / target column pair
    if 'target_name' in df_raw.columns and 'target' in df_raw.columns:
        print("\nâœ“ Data is in long format, will pivot...")
        df = prepare_data(CFG.train_csv)
    else:
        print("\nâœ“ Data appears to be in wide format already")
        df = df_raw.copy()
        # Quantile bins when possible, equal-width bins when qcut cannot build them
        try:
            df['biomass_bin'] = pd.qcut(df['Dry_Total_g'], q=10, labels=False, duplicates='drop')
        except ValueError:
            df['biomass_bin'] = pd.cut(df['Dry_Total_g'], bins=10, labels=False)
        df['biomass_bin'] = df['biomass_bin'].fillna(0).astype(int)

    print(f"\nâœ“ Final processed data shape: {df.shape}")
    print(f"âœ“ Checking for NaN values in targets:")
    for target in CFG.targets:
        nan_count = df[target].isna().sum()
        print(f"  {target}: {nan_count} NaN values")

    # 2. Create folds
    print("\nCreating cross-validation folds...")
    skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    df['fold'] = -1

    # Explicit check instead of `assert`, which is stripped under `python -O`
    if df['biomass_bin'].isna().sum() != 0:
        raise ValueError("NaN found in biomass_bin!")

    # The split yields positional indices, so assign by position; label-based
    # assignment would only be correct for a 0..n-1 index.
    fold_col = df.columns.get_loc('fold')
    for fold, (_, val_idx) in enumerate(skf.split(df, df['biomass_bin'])):
        df.iloc[val_idx, fold_col] = fold

    print("âœ“ Fold distribution:")
    print(df['fold'].value_counts().sort_index())

    # 3. Train all folds
    fold_scores = []
    for fold in range(CFG.n_folds):
        score = train_kfold(df, fold)
        fold_scores.append(score)

    print(f"\n{'='*50}")
    print(f"Cross-Validation Results:")
    print(f"{'='*50}")
    for i, score in enumerate(fold_scores):
        print(f"Fold {i+1}: {score:.4f}")
    print(f"Mean CV Score: {np.mean(fold_scores):.4f} Â± {np.std(fold_scores):.4f}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

