### Imports

In [1]:
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sklearn
# import fastai
# from fastai.vision.all import *
import time
from PIL import Image
from torch.utils.data import DataLoader
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Load a dataset into a Pandas Dataframe

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
kaggle_path = './kaggle/input/csiro-biomass/train.csv'
local_path = '/Users/guytabennett-jones/code/me/ai/kaggle/biomass/input/train.csv'
dataset_df = pd.read_csv(local_path)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (1785, 9)


In [3]:
train_pivot = dataset_df.pivot(
    index='image_path',
    columns='target_name',
    values='target'
).reset_index()
metadata_cols = [
    'Sampling_Date','Species', 'image_path',
    'Pre_GSHH_NDVI','Height_Ave_cm','State',
]
metadata = dataset_df[metadata_cols].drop_duplicates(subset='image_path')
dataset_df = pd.merge(metadata, train_pivot, on='image_path')
dataset_df.head()

Unnamed: 0,Sampling_Date,Species,image_path,Pre_GSHH_NDVI,Height_Ave_cm,State,Dry_Clover_g,Dry_Dead_g,Dry_Green_g,Dry_Total_g,GDM_g
0,2015/9/4,Ryegrass_Clover,train/ID1011485656.jpg,0.62,4.6667,Tas,0.0,31.9984,16.2751,48.2735,16.275
1,2015/4/1,Lucerne,train/ID1012260530.jpg,0.55,16.0,NSW,0.0,0.0,7.6,7.6,7.6
2,2015/9/1,SubcloverDalkeith,train/ID1025234388.jpg,0.38,1.0,WA,6.05,0.0,0.0,6.05,6.05
3,2015/5/18,Ryegrass,train/ID1028611175.jpg,0.66,5.0,Tas,0.0,30.9703,24.2376,55.2079,24.2376
4,2015/9/11,Ryegrass,train/ID1035947949.jpg,0.54,3.5,Tas,0.4343,23.2239,10.5261,34.1844,10.9605


### Data investigation

In [4]:
# df = dataset_df
# df['ndvi_height'] = df['Pre_GSHH_NDVI'] * df['Height_Ave_cm']
# # Check correlation between NDVI and each target type
# for target_name in df['target_name'].unique():
#     subset = df[df['target_name'] == target_name]
#     corr = subset['ndvi_height'].corr(subset['target'])
#     print(f"{target_name}: {corr:.3f}")

# # Visualize NDVI vs target for each type
# fig, axes = plt.subplots(2, 3, figsize=(15, 8))
# axes = axes.flatten()

# for i, target_name in enumerate(df['target_name'].unique()):
#     subset = df[df['target_name'] == target_name]
#     axes[i].scatter(subset['Height_Ave_cm'], subset['target'], alpha=0.5)
#     axes[i].set_xlabel('NDVI')
#     axes[i].set_ylabel('Biomass (g)')
#     axes[i].set_title(target_name)
    
# plt.tight_layout()
# plt.show()


In [5]:
dataset_df.Species.unique()

array(['Ryegrass_Clover', 'Lucerne', 'SubcloverDalkeith', 'Ryegrass',
       'Phalaris_Clover', 'SubcloverLosa', 'Clover', 'Fescue_CrumbWeed',
       'Phalaris_Ryegrass_Clover', 'Phalaris', 'WhiteClover', 'Fescue',
       'Phalaris_BarleyGrass_SilverGrass_SpearGrass_Clover_Capeweed',
       'Phalaris_Clover_Ryegrass_Barleygrass_Bromegrass', 'Mixed'],
      dtype=object)

### Pytorch

In [6]:
# Standardize: Calculate mean and std for each target column (save these!)
# target_cols = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
# target_mean = train_pivot[target_cols].mean().values
# target_std = train_pivot[target_cols].std().values


In [7]:
local_path = '/Users/guytabennett-jones/code/me/ai/kaggle/biomass/input/'
kaggle_path = '/kaggle/input/csiro-biomass/'
class BiomassDataset(torch.utils.data.Dataset):
    def __init__(self, df, base_path, transform=None):
        self.df = df
        self.base_path = base_path
        self.transform = transform
        self.target_cols = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        img_path = local_path + self.df.iloc[idx]['image_path']
        targets = self.df.iloc[idx][self.target_cols].values.astype('float32')
        
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)

            metadata = {
        'ndvi': torch.tensor([self.df.iloc[idx]['Pre_GSHH_NDVI']], dtype=torch.float32),
        'height': torch.tensor([self.df.iloc[idx]['Height_Ave_cm']], dtype=torch.float32)
    }
        
        return image, torch.tensor(targets, dtype=torch.float32), metadata

from torchvision import transforms

train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                       std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                       std=[0.229, 0.224, 0.225])
])
# Create dataset and dataloader
# dataset = BiomassDataset(train_pivot, '/kaggle/input/csiro-biomass/', transform=transform)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2)


In [8]:
class MultiTaskModel(nn.Module):
    def __init__(self, num_states=4):
        super().__init__()
        # Load pretrained backbone
        self.backbone = timm.create_model('tf_efficientnetv2_m', pretrained=True, num_classes=0)  # num_classes=0 removes head
        backbone_features = self.backbone.num_features  # Get feature dimension
        
        # Main head for biomass (5 outputs)
        self.biomass_head = nn.Linear(backbone_features, 5)
        # Auxiliary heads
        self.ndvi_head = nn.Linear(backbone_features, 1)
        self.height_head = nn.Linear(backbone_features, 1)
    
    def forward(self, x, training=True):
        features = self.backbone(x)
        biomass = self.biomass_head(features)
        
        if training:
            ndvi = self.ndvi_head(features)
            height = self.height_head(features)
            return biomass, ndvi, height
        else:
            return biomass

NameError: name 'nn' is not defined

In [None]:
import torch.nn as nn
import timm
from torchvision import models

# Load pretrained ResNet50
# model = models.resnet50(weights=True)
# model = timm.tf_efficientnetv2_m(weights=True)
model = MultiTaskModel(num_states=4).to(device)

# Freeze the pretrained layers (transfer learning)
# for param in model.parameters():
#     param.requires_grad = False

# # Replace the final layer for regression
# # ResNet50's final layer is called 'fc' and has 2048 input features
# model.fc = nn.Linear(model.fc.in_features, 4)  # Output 1 value (biomass)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0007)

### Feature Engineering

In [None]:
# dataset_df['month'] = pd.to_datetime(dataset_df['Sampling_Date']).dt.month
# dataset_df.drop(['Sampling_Date'], axis=1, inplace=True)

# # One-hot encode State
# dataset_df = pd.get_dummies(dataset_df, columns=['State'], prefix='state')


### Create datasets

In [None]:
# Split sets
train_df, val_df = sklearn.model_selection.train_test_split(dataset_df, train_size=0.8, random_state=42, shuffle=True)
# Create datasets with different transforms
kaggle_path = '/kaggle/input/csiro-biomass/'
local_path =  '/Users/guytabennett-jones/code/me/ai/kaggle/biomass/input/'
train_dataset = BiomassDataset(train_df, local_path, transform=train_transform)
val_dataset = BiomassDataset(val_df, local_path, transform=val_transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)

### Support Functions

In [None]:
def train_model(images, targets, metadata, validation=False):
    images = images.to(device)
    targets = targets.to(device)
    metadata = {k: v.to(device) for k, v in metadata.items()}
    
    if not validation: 
        optimizer.zero_grad()
    
    outputs = model(images, training=True)
    biomass_pred, ndvi_pred, height_pred = outputs
    
    loss_biomass = criterion(biomass_pred, targets)
    loss_ndvi = criterion(ndvi_pred, metadata['ndvi'])
    loss_height = criterion(height_pred, metadata['height'])
    
    total_loss = 1.0 * loss_biomass + 0.3 * loss_ndvi + 0.3 * loss_height
    
    if not validation:
        total_loss.backward()
        optimizer.step()
    
    return total_loss.item(), loss_biomass.item()

# **CHANGED: print_result tracks both total and biomass losses**
def print_result(train_loss, val_loss, train_biomass_loss, val_biomass_loss, 
                 train_loader, val_loader, epoch_start, epoch, num_epochs):
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    train_biomass_loss /= len(train_loader)
    val_biomass_loss /= len(val_loader)
    
    epoch_time = time.time() - epoch_start
    mins, secs = divmod(epoch_time, 60)
    
    print(f'Epoch {epoch+1}/{num_epochs} - '
          f'Total Loss: Train={train_loss:.4f}, Val={val_loss:.4f} | '
          f'Biomass Loss: Train={train_biomass_loss:.4f}, Val={val_biomass_loss:.4f} | '
          f'Time: {int(mins)}m {int(secs)}s')


### Learning rate

In [None]:
# def find_lr(model, train_loader, start_lr=1e-7, end_lr=10, num_iter=100):
#     optimizer = torch.optim.Adam(model.parameters(), lr=start_lr)
#     lr_mult = (end_lr / start_lr) ** (1/num_iter)
#     lrs,losses = [],[]
#     model.train()
#     for i,(images,targets) in enumerate(train_loader):
#         print('.')
#         if i >= num_iter: break
#         optimizer.zero_grad()
#         outputs = model(images.to(device))
#         loss = criterion(outputs, targets.to(device))
#         loss.backward()
#         optimizer.step()
#         lrs.append(optimizer.param_groups[0]['lr'])
#         losses.append(loss.item())
#         for g in optimizer.param_groups: g['lr'] *= lr_mult
#     return lrs,losses

In [None]:
# lrs, losses = find_lr(model, train_loader, start_lr=1e-7, end_lr=2, num_iter=36)

In [None]:
# plt.plot(lrs, losses)
# plt.xscale('log')
# plt.xlabel('Learning Rate')
# plt.ylabel('Loss')
# plt.grid(True)
# lrs, losses
# # print(f"Total batches in train_loader: {len(train_loader)}")
# # print(f"Batch size: {train_loader.batch_size}")


### Train Model

In [None]:
def compute_loss(outputs, targets, metadata, weights={'biomass': 1.0, 'ndvi': 0.3, 'height': 0.3}):
    biomass_pred, ndvi_pred, height_pred = outputs
    biomass_true, ndvi_true, height_true = targets, metadata['ndvi'], metadata['height']
    
    loss_biomass = criterion(biomass_pred, biomass_true)
    loss_ndvi = criterion(ndvi_pred, ndvi_true)
    loss_height = criterion(height_pred, height_true)
    
    total_loss = (weights['biomass'] * loss_biomass + 
                  weights['ndvi'] * loss_ndvi + 
                  weights['height'] * loss_height)
    return total_loss

In [None]:
num_epochs = 25

for epoch in range(num_epochs):
    epoch_start = time.time()
    model.train()
    train_loss = 0
    train_biomass_loss = 0
    
    for images, targets, metadata in train_loader:
        total_loss, biomass_loss = train_model(images, targets, metadata)
        train_loss += total_loss
        train_biomass_loss += biomass_loss
    
    model.eval()
    val_loss = 0
    val_biomass_loss = 0
    with torch.no_grad():
        for images, targets, metadata in val_loader:
            total_loss, biomass_loss = train_model(images, targets, metadata, validation=True)
            val_loss += total_loss
            val_biomass_loss += biomass_loss
    
    print_result(train_loss, val_loss, train_biomass_loss, val_biomass_loss,
                 train_loader, val_loader, epoch_start, epoch, num_epochs)
    
print(f'Training complete!')

resnet50 322
With aug Val Loss: 404.3574  
Epoch 10 / 10  →  Train Loss: 404.0312   |   Val Loss: 526.2535  

✅ Training complete!  
Final batch loss: **730.4459**

In [None]:
# Get the predictions for testdata
with torch.no_grad():
        for images, _ in test_loader:
            images = images.to(device)
            outputs = model(images)
            preds.extend(outputs.cpu().numpy())
predictions = np.array(preds).flatten()

In [None]:
# Submit
sample_submission_df = pd.read_csv('/kaggle/input/csiro-biomass/sample_submission.csv')
sample_submission_df['target'] = predictions
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()