# **Biomass Test Inference**

### Strategy for this competition

The test tabular data lacks several columns that are present in the training tabular data (`Species`, `Pre_GSHH_NDVI`, and `Height_Ave_cm`).

First, in the training phase, we verify that we can predict targets using only the tabular data (#1).

We also verify that image data can predict items that are present in the train tabular data but not in the test tabular data (#2).

**In the testing phase, we first predict the columns that are missing from the test tabular data using the images. We then use the model already trained on the training data to predict the targets from the completed test tabular data (#3, this notebook).**

1. https://www.kaggle.com/code/stpeteishii/biomass-train-data-visualize-importance<br>
2. https://www.kaggle.com/code/stpeteishii/pre-gshh-ndvi-pytorch-lightning-cnn-regressor<br>
https://www.kaggle.com/code/stpeteishii/height-ave-cm-pytorch-lightning-cnn-regressor<br>
https://www.kaggle.com/code/stpeteishii/species-pytorch-lightning-cnn-classifier<br>
3. https://www.kaggle.com/code/stpeteishii/biomass-test-inference<br>

In [None]:
#!pip install lightning

In [None]:
!grep -v "nvidia-" /kaggle/input/download-lightning2/requirements.txt > requirements_no_cuda.txt
!pip install --no-index --find-links /kaggle/input/download-lightning2/offline_packages/ -r requirements_no_cuda.txt --no-deps


import pytorch_lightning as L
from pytorch_lightning import LightningDataModule
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger

In [None]:
import os
import random
import time
from contextlib import contextmanager

import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from torch.utils.data import random_split, SubsetRandomSampler

#import lightning.pytorch as L
#from lightning.pytorch import LightningDataModule
#from lightning.pytorch import LightningModule
#from lightning.pytorch import Trainer

from torchvision import datasets, transforms, models
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
from torchvision.utils import make_grid

import joblib
import lightgbm as lgbm
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    log_loss,
    mean_squared_error,
)
from tensorflow.keras.utils import to_categorical # Keras utility

import category_encoders as ce

# Data preparation

In [None]:
# Load the training table, preview its first three rows (transposed) and list its columns.
data0 = pd.read_csv("/kaggle/input/csiro-biomass/train.csv")
display(data0.head(3).T)
print(list(data0.columns))

# Same preview for the test table.
test0 = pd.read_csv('/kaggle/input/csiro-biomass/test.csv')
display(test0.head(3).T)
print(list(test0.columns))

# Remove these three columns from the training table and preview the result.
delete_cols = ['image_path', 'Sampling_Date', 'State']
data0 = data0.drop(columns=delete_cols)
display(data0.head(3).T)

# Columns that remain in the training table but do not exist in the test table.
print(set(data0.columns) - set(test0.columns))

# In the test data, 'Species', 'Pre_GSHH_NDVI', and 'Height_Ave_cm' will be
# predicted from the test image data.
# ['Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'target_name', 'target']

In [None]:
# Integer-encode the species labels: each distinct name gets its position in
# the sorted list of names.
names = sorted(data0['Species'].unique().tolist())
name_mapping = {species: index for index, species in enumerate(names)}
print(name_mapping)

# Model

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset over a sequence of ``(image_path, label)`` pairs.

    Each item is read from disk on access and converted to RGB. When a
    transform was supplied it is applied to the image before it is returned.
    """

    def __init__(self, path_label, transform=None):
        # path_label: indexable collection of (path, label) pairs.
        self.path_label = path_label
        # transform: optional callable applied to the loaded PIL image.
        self.transform = transform

    def __len__(self):
        """Return the number of (path, label) pairs."""
        return len(self.path_label)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the pair stored at ``idx``."""
        image_path, target = self.path_label[idx]
        rgb_image = Image.open(image_path).convert('RGB')

        # Without a transform the RGB PIL image is handed back as-is.
        if self.transform is None:
            return rgb_image, target
        return self.transform(rgb_image), target

In [None]:
        
class DataModule(LightningDataModule):
    """Lightning data module serving a sequential 80/20 split of an image dataset.

    The dataset is a ``CustomDataset`` when ``path_label`` is given, otherwise
    an ``ImageFolder`` rooted at ``root_dir``. The first 80% of the samples (in
    dataset order, no shuffling) form the training split; the remainder is used
    for both the test and the predict dataloaders.
    """

    def __init__(self, path_label=None, root_dir=None, batch_size=32):
        super().__init__()
        self.path_label = path_label
        self.root_dir = root_dir
        self.batch_size = batch_size

        # Resize the shortest side to 224, center-crop to 224x224, convert to a
        # tensor and normalize each channel with fixed mean/std values.
        preprocessing_steps = [
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],
                                 [0.229, 0.224, 0.225]),
        ]
        self.transform = transforms.Compose(preprocessing_steps)

        # Both splits stay None until setup() has run.
        self.train_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        """Build the dataset and split it 80/20 by position.

        Raises:
            ValueError: If neither ``path_label`` nor ``root_dir`` was provided.
        """
        if self.path_label is None and self.root_dir is None:
            raise ValueError("Either path_label or root_dir must be provided")

        # path_label takes precedence over root_dir when both are set.
        if self.path_label is not None:
            full_dataset = CustomDataset(self.path_label, self.transform)
        else:
            full_dataset = datasets.ImageFolder(root=self.root_dir, transform=self.transform)

        total = len(full_dataset)
        split_at = int(0.8 * total)

        self.train_dataset = Subset(full_dataset, range(split_at))
        self.test_dataset = Subset(full_dataset, range(split_at, total))

    def _held_out_loader(self):
        """Create a fresh, unshuffled loader over the held-out split."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

    def train_dataloader(self):
        """Return a shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def test_dataloader(self):
        """Return an unshuffled loader over the held-out split."""
        return self._held_out_loader()

    def predict_dataloader(self):
        """Return an unshuffled loader over the held-out split."""
        return self._held_out_loader()

    def __len__(self):
        """Return the size of the first split that exists (train, then test), else 0."""
        for split in (self.train_dataset, self.test_dataset):
            if split is not None:
                return len(split)
        return 0

In [None]:
class ConvolutionalRegressor(LightningModule):
    """Small CNN (two conv layers, four linear layers) that emits one value per image.

    Training, validation and test steps all use mean-squared error between the
    prediction and the float-cast target, logged under ``train_loss``,
    ``val_loss`` and ``test_loss`` respectively.
    """

    # Per-sample feature count that forward() flattens to before fc1. Two
    # 3x3 convolutions, each followed by 2x2 pooling, give 54x54 maps with
    # 16 channels for a 224x224 input.
    _FLAT_FEATURES = 16 * 54 * 54

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept stable so that existing
        # checkpoints keep matching this module.
        self.conv1 = nn.Conv2d(3, 6, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 20)
        self.fc4 = nn.Linear(20, 1)  # Regression head: a single output value

    def forward(self, X):
        """Return a tensor of shape ``[batch_size]`` with one prediction per sample."""
        # conv -> ReLU -> 2x2 max-pool, twice.
        for conv in (self.conv1, self.conv2):
            X = F.max_pool2d(F.relu(conv(X)), 2, 2)

        X = X.view(-1, self._FLAT_FEATURES)

        # Three ReLU-activated hidden layers, then the linear output layer.
        for hidden in (self.fc1, self.fc2, self.fc3):
            X = F.relu(hidden(X))
        return self.fc4(X).squeeze(1)

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 0.001."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def _mse_step(self, batch, metric_name):
        """Compute the MSE loss for ``batch`` and log it under ``metric_name``."""
        inputs, targets = batch
        loss = F.mse_loss(self(inputs), targets.float())
        self.log(metric_name, loss)
        return loss

    def training_step(self, batch, batch_idx):
        """Log and return the training MSE loss."""
        return self._mse_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Log the validation MSE loss."""
        self._mse_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Log the test MSE loss."""
        self._mse_step(batch, "test_loss")

In [None]:
class ConvolutionalClassifier(LightningModule):
    """Small CNN classifier with the same backbone layout as the regressor.

    The final linear layer has ``num_classes`` outputs. Every step uses
    cross-entropy loss and additionally logs the batch accuracy.
    """

    # Per-sample feature count that forward() flattens to before fc1.
    _FLAT_FEATURES = 16 * 54 * 54

    # NOTE: the default for num_classes is evaluated once, when the class is
    # defined, from the module-level ``names`` list.
    def __init__(self, num_classes=len(names)):
        super().__init__()
        self.save_hyperparameters()

        # Attribute names and creation order are kept stable so that existing
        # checkpoints keep matching this module.
        self.conv1 = nn.Conv2d(3, 6, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 20)
        # Classification head: one logit per class.
        self.fc4 = nn.Linear(20, num_classes)

    def forward(self, X):
        """Return logits of shape ``[batch_size, num_classes]``."""
        # conv -> ReLU -> 2x2 max-pool, twice.
        for conv in (self.conv1, self.conv2):
            X = F.max_pool2d(F.relu(conv(X)), 2, 2)

        X = X.view(-1, self._FLAT_FEATURES)

        # Three ReLU-activated hidden layers, then the linear output layer.
        for hidden in (self.fc1, self.fc2, self.fc3):
            X = F.relu(hidden(X))
        return self.fc4(X)

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 0.001."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

    def _classification_step(self, batch, loss_name, acc_name):
        """Log cross-entropy loss and accuracy for ``batch``; return the loss."""
        inputs, labels = batch
        logits = self(inputs)

        loss = F.cross_entropy(logits, labels)
        self.log(loss_name, loss)

        # Accuracy: fraction of samples whose arg-max logit equals the label.
        predicted = torch.argmax(logits, dim=1)
        accuracy = (predicted == labels).float().mean()
        self.log(acc_name, accuracy)

        return loss

    def training_step(self, batch, batch_idx):
        """Log training loss/accuracy and return the loss."""
        return self._classification_step(batch, "train_loss", "train_acc")

    def validation_step(self, batch, batch_idx):
        """Log validation loss/accuracy."""
        self._classification_step(batch, "val_loss", "val_acc")

    def test_step(self, batch, batch_idx):
        """Log test loss/accuracy."""
        self._classification_step(batch, "test_loss", "test_acc")

In [None]:
# Inference-time preprocessing, kept at 224x224 (not 128x128) to match training:
# resize the shortest side, center-crop, convert to a tensor, then normalize
# each channel with the fixed mean/std values below.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)

def predict_single_image(image_path, model):
    """Run ``model`` on a single image file and return its prediction.

    The image is opened, converted to RGB, preprocessed with the module-level
    ``transform`` and given a leading batch dimension of size 1. Inference runs
    under ``torch.no_grad()``.

    Args:
        image_path: Path of the image file to load.
        model: Callable applied to the batched image tensor. When it exposes
            ``parameters()`` (as ``nn.Module`` does), the input is first moved
            to the device of its first parameter.

    Returns:
        A Python scalar when the model output is 0-dimensional, otherwise the
        output copied to the CPU as a NumPy array.
    """
    # The context manager releases the file handle as soon as the pixel data
    # has been copied by convert().
    with Image.open(image_path) as handle:
        image = handle.convert('RGB')
    image_tensor = transform(image).unsqueeze(0)

    # A model restored onto a GPU would otherwise fail with a device-mismatch
    # error, because the freshly built tensor lives on the CPU.
    get_params = getattr(model, "parameters", None)
    first_param = next(iter(get_params()), None) if callable(get_params) else None
    if first_param is not None:
        image_tensor = image_tensor.to(first_param.device)

    with torch.no_grad():
        prediction = model(image_tensor)

    if prediction.dim() == 0:
        return prediction.item()
    return prediction.cpu().numpy()

In [None]:
# Fixed version with better error handling

import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import joblib

# ===== Fix 1: Ensure all images are processed =====
def _first_scalar(pred):
    """Return the first element of a prediction as a Python float.

    Accepts tensors, NumPy arrays, lists and plain numbers.

    Raises:
        ValueError: If the prediction contains no elements.
    """
    if isinstance(pred, torch.Tensor):
        pred = pred.detach().cpu().numpy()
    values = np.asarray(pred, dtype=float).reshape(-1)
    if values.size == 0:
        raise ValueError("model returned an empty prediction")
    return float(values[0])


def predict_batch(image_folder, model):
    """Predict one scalar per image in ``image_folder``.

    Files whose (lower-cased) name ends in ``.png``, ``.jpg`` or ``.jpeg`` are
    passed one by one to ``predict_single_image``; the first element of each
    prediction is stored as a float. Images are processed in sorted path order
    so runs are reproducible. A failure on one image is reported (message plus
    traceback) and that image is skipped; the remaining images are still
    processed.

    Args:
        image_folder: Directory containing the images.
        model: Model forwarded to ``predict_single_image``.

    Returns:
        Dict mapping image id (file name up to its first ``.``) to the
        predicted float. Empty when the folder does not exist or holds no
        matching files.
    """
    import traceback

    if not os.path.exists(image_folder):
        print(f"Warning: Folder {image_folder} does not exist")
        return {}

    image_paths = sorted(
        os.path.join(image_folder, f)
        for f in os.listdir(image_folder)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    print(f"Found {len(image_paths)} images to process")

    predictions = {}

    for image_path in image_paths:
        file_name = os.path.basename(image_path)
        try:
            pred = predict_single_image(image_path, model)
            # Extract image ID from filename (e.g., ID1001187975 from ID1001187975.jpg)
            image_id = file_name.split('.')[0]
            value = _first_scalar(pred)
            predictions[image_id] = value
            # Print the stored float rather than indexing the raw prediction,
            # which fails for scalar outputs.
            print(f"{file_name}: {value}")
        except Exception as e:
            # Deliberate best-effort: one unreadable image must not abort the batch.
            print(f"Error processing {image_path}: {e}")
            traceback.print_exc()

    return predictions

def predict_batch_classification(image_folder, model):
    """Predict a class index for every image in ``image_folder``.

    Files whose (lower-cased) name ends in ``.png``, ``.jpg`` or ``.jpeg`` are
    loaded as RGB, preprocessed (resize to 224, center-crop 224, tensor,
    normalize) and passed through ``model`` under ``torch.no_grad()``. The
    arg-max over dimension 1 of the model output is taken as the class.

    Args:
        image_folder: Directory containing the images.
        model: Callable returning per-class scores for a batch of one image.

    Returns:
        Dict mapping image id (file name up to its first ``.``) to the predicted
        class index. Empty when the folder does not exist. Images that raise
        during processing are reported and left out.
    """
    if not os.path.exists(image_folder):
        print(f"Warning: Folder {image_folder} does not exist")
        return {}

    allowed_suffixes = ('.png', '.jpg', '.jpeg')
    image_paths = []
    for entry in os.listdir(image_folder):
        if entry.lower().endswith(allowed_suffixes):
            image_paths.append(os.path.join(image_folder, entry))

    print(f"Found {len(image_paths)} images to process")

    preprocess = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    class_by_id = {}
    for path in image_paths:
        file_name = os.path.basename(path)
        try:
            rgb_image = Image.open(path).convert('RGB')
            single_batch = preprocess(rgb_image).unsqueeze(0)

            with torch.no_grad():
                scores = model(single_batch)
                class_index = torch.argmax(scores, dim=1).item()

            class_by_id[file_name.split('.')[0]] = class_index
        except Exception as e:
            # Best-effort: report the failure and continue with the next image.
            print(f"Error processing {file_name}: {e}")

    return class_by_id

# ===== Fix 2: Handle missing target_names properly =====
def _encoded_species_fallback(train_df):
    """Return the integer class used when an image has no species prediction.

    The most frequent training species is chosen. When the ``Species`` column
    holds names, the name is converted to its position in the sorted list of
    distinct names, which mirrors the sorted-name label encoding used elsewhere
    in this notebook. Returns 0 when the column is missing or empty.
    """
    if 'Species' not in train_df.columns:
        return 0
    species = train_df['Species'].dropna()
    if species.empty:
        return 0
    most_common = species.mode()[0]
    if pd.api.types.is_numeric_dtype(species):
        # Already encoded: use the value directly.
        return int(most_common)
    return sorted(species.unique().tolist()).index(most_common)


def prepare_test_data(test_df, species_preds, ndvi_preds, height_preds, train_df):
    """Build the tabular feature frame for the test set.

    Args:
        test_df: Test table with ``image_path`` and ``target_name`` columns.
            It is not modified; the function works on a copy.
        species_preds: Dict mapping image id to a predicted species class index.
        ndvi_preds: Dict mapping image id to a predicted ``Pre_GSHH_NDVI`` value.
        height_preds: Dict mapping image id to a predicted ``Height_Ave_cm`` value.
        train_df: Training table. ``target_name`` defines the target encoding;
            ``Species``, ``Pre_GSHH_NDVI`` and ``Height_Ave_cm`` (when present)
            supply fallback values for images without a prediction.

    Returns:
        Tuple ``(X_test, test_df)``: ``X_test`` has the columns ``target_name``
        (integer-encoded), ``Species``, ``Pre_GSHH_NDVI`` and ``Height_Ave_cm``;
        ``test_df`` is the enriched copy of the input table.
    """
    # Work on a copy so the caller's frame is left untouched.
    test_df = test_df.copy()

    # Extract image ID (file name up to its first '.')
    test_df['image_id'] = test_df['image_path'].apply(lambda x: os.path.basename(x).split('.')[0])

    # Create target_name mapping from training data (sorted names -> 0..n-1)
    target_names = sorted(train_df['target_name'].unique().tolist())
    name_mapping = {name: idx for idx, name in enumerate(target_names)}
    test_df['target_name_encoded'] = test_df['target_name'].map(name_mapping)

    # Target names unseen in training get fresh indices after the known ones.
    unmapped = test_df.loc[test_df['target_name_encoded'].isna(), 'target_name'].unique()
    if len(unmapped) > 0:
        print(f"Warning: Found unmapped target names: {unmapped}")
        # default=-1 keeps this working when the training mapping is empty.
        next_idx = max(name_mapping.values(), default=-1) + 1
        for offset, name in enumerate(unmapped):
            name_mapping[name] = next_idx + offset
        test_df['target_name_encoded'] = test_df['target_name'].map(name_mapping)

    # Map predictions onto rows via the image id
    test_df['Species'] = test_df['image_id'].map(species_preds)
    test_df['Pre_GSHH_NDVI'] = test_df['image_id'].map(ndvi_preds)
    test_df['Height_Ave_cm'] = test_df['image_id'].map(height_preds)

    # Fallback values from training data. The species fallback must be an
    # encoded class index: filling with the raw species name would make the
    # astype(int) below raise.
    species_fallback = _encoded_species_fallback(train_df)
    ndvi_fallback = train_df['Pre_GSHH_NDVI'].median() if 'Pre_GSHH_NDVI' in train_df.columns else 0.5
    height_fallback = train_df['Height_Ave_cm'].median() if 'Height_Ave_cm' in train_df.columns else 5.0

    # Fill missing values
    test_df['Species'] = test_df['Species'].fillna(species_fallback).astype(int)
    test_df['Pre_GSHH_NDVI'] = test_df['Pre_GSHH_NDVI'].fillna(ndvi_fallback).astype(float)
    test_df['Height_Ave_cm'] = test_df['Height_Ave_cm'].fillna(height_fallback).astype(float)

    # Prepare features; the encoded column is renamed to 'target_name' for
    # compatibility with the downstream model's feature names.
    X_test = test_df[['target_name_encoded', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']].copy()
    X_test.columns = ['target_name', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']

    return X_test, test_df

# ===== Fix 3: Add model loading error handling =====
def safe_load_model(model_path, model_class, **kwargs):
    """Load a checkpoint into ``model_class`` and switch it to eval mode.

    Args:
        model_path: Path of the checkpoint file.
        model_class: Class exposing ``load_from_checkpoint(path, **kwargs)``.
        **kwargs: Forwarded unchanged to ``load_from_checkpoint``.

    Returns:
        The loaded model after ``eval()`` has been called on it, or ``None``
        when the file does not exist or loading raises. Failures are printed
        (with a traceback for exceptions) instead of being propagated.
    """
    try:
        if os.path.exists(model_path):
            loaded = model_class.load_from_checkpoint(model_path, **kwargs)
            loaded.eval()
            return loaded

        print(f"Error: Model file not found: {model_path}")
        return None
    except Exception as e:
        print(f"Error loading model from {model_path}: {e}")
        import traceback
        traceback.print_exc()
        return None

# ===== Main execution with better error handling =====
# Pipeline: load the CSVs, predict the three image-derived features with the
# CNN checkpoints, build the LightGBM feature table, average the predictions of
# the loaded models and write submission.csv.
import traceback

print("=" * 50)
print("Starting prediction pipeline...")
print("=" * 50)

# Load data; nothing downstream can work without it, so failures are re-raised.
try:
    train_df = pd.read_csv("/kaggle/input/csiro-biomass/train.csv")
    test_df = pd.read_csv('/kaggle/input/csiro-biomass/test.csv')
    print(f"Loaded train data: {len(train_df)} rows")
    print(f"Loaded test data: {len(test_df)} rows")
except Exception as e:
    print(f"Error loading data: {e}")
    raise

test_folder = "/kaggle/input/csiro-biomass/test"

# Each dict stays empty when its model cannot be loaded; prepare_test_data
# then fills the corresponding column with a fallback value.
ndvi_preds = {}
height_preds = {}
species_preds = {}

# NDVI Model
print("\n" + "=" * 50)
print("Predicting Pre_GSHH_NDVI...")
print("=" * 50)
ndvi_model = safe_load_model(
    "/kaggle/input/pre-gshh-ndvi-pytorch-lightning-cnn-regressor/lightning_logs/version_0/checkpoints/epoch=999-step=9000.ckpt",
    ConvolutionalRegressor
)
# Explicit None check: safe_load_model signals failure with None.
if ndvi_model is not None:
    ndvi_preds = predict_batch(test_folder, ndvi_model)
    print(f"NDVI predictions: {len(ndvi_preds)} images")

# Height Model
print("\n" + "=" * 50)
print("Predicting Height_Ave_cm...")
print("=" * 50)
height_model = safe_load_model(
    "/kaggle/input/height-ave-cm-pytorch-lightning-cnn-regressor/lightning_logs/version_0/checkpoints/epoch=999-step=9000.ckpt",
    ConvolutionalRegressor
)
if height_model is not None:
    height_preds = predict_batch(test_folder, height_model)
    print(f"Height predictions: {len(height_preds)} images")

# Species Model
print("\n" + "=" * 50)
print("Predicting Species...")
print("=" * 50)
# NOTE(review): the dataset directory below ends in "classifieri"; confirm it
# matches the attached input. num_classes is passed explicitly instead of
# relying on the class default.
species_model = safe_load_model(
    "/kaggle/input/species-pytorch-lightning-cnn-classifieri/lightning_logs/version_0/checkpoints/epoch=999-step=9000.ckpt",
    ConvolutionalClassifier,
    num_classes=15
)
if species_model is not None:
    species_preds = predict_batch_classification(test_folder, species_model)
    print(f"Species predictions: {len(species_preds)} images")

# Prepare test data
print("\n" + "=" * 50)
print("Preparing test data...")
print("=" * 50)
X_test, test_df_processed = prepare_test_data(test_df, species_preds, ndvi_preds, height_preds, train_df)
print("Test data prepared successfully")
print(X_test.head())

# LightGBM Predictions
print("\n" + "=" * 50)
print("Predicting with LightGBM ensemble...")
print("=" * 50)

try:
    # The joblib file is iterated below and .predict() is called on each
    # element, so it is presumably a list of fitted models.
    loaded_models = joblib.load('/kaggle/input/biomass-train-data-visualize-importance/models/all_models_target0.joblib')
    print(f"Loaded {len(loaded_models)} models")

    if not loaded_models:
        # Averaging an empty list would silently yield NaN.
        raise ValueError("model file contains no models")

    # Average the per-model predictions element-wise.
    all_predictions = [fold_model.predict(X_test) for fold_model in loaded_models]
    final_predictions = np.mean(all_predictions, axis=0)
    print(f"Generated {len(final_predictions)} predictions")
    print(f"Prediction range: [{final_predictions.min():.2f}, {final_predictions.max():.2f}]")

except Exception as e:
    print(f"Error in LightGBM prediction: {e}")
    traceback.print_exc()
    # Create dummy predictions as fallback so a submission file is still written.
    final_predictions = np.zeros(len(test_df))
    print("Using fallback predictions (zeros)")

# Create submission. Take an explicit copy of the id column so that adding
# 'target' does not write into a slice of another frame, and use the test
# table loaded in this cell rather than one from an earlier cell.
submit = test_df[['sample_id']].copy()
submit['target'] = final_predictions
submit.to_csv('submission.csv', index=False)
print("\n" + "=" * 50)
print("Submission file created successfully!")
print("=" * 50)
print(submit.head(10))

### **This approach ran to completion but produced a low score. We believe the cause is inaccurate species prediction from the images, so a change in strategy is needed.**