In [1]:
import argparse
from pathlib import Path
import sys
from tqdm.auto import tqdm
import json
from copy import deepcopy
import polars as pl
import numpy as np
import os
import torch
from PIL import Image
from transformers import AutoProcessor, AutoImageProcessor, AutoModel, Siglip2Model, Siglip2ImageProcessor, SiglipModel, SiglipImageProcessor
from sklearn.model_selection import KFold, GroupKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.decomposition import PCA
import catboost

2025-11-07 19:10:00.120711: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762542600.556693      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762542600.686121      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

model_name_siglip = "/kaggle/input/siglip/tensorflow2/default/1"
model_siglip = AutoModel.from_pretrained(model_name_siglip)
model_siglip = model_siglip.to(device)
model_siglip.eval()
processor_siglip = AutoImageProcessor.from_pretrained(model_name_siglip)

model_name_vit = "/kaggle/input/efficient-t5-1/pytorch/default/1"
model_vit = AutoModel.from_pretrained(model_name_vit)
model_vit = model_vit.to(device)
model_vit.eval()
processor_vit = AutoImageProcessor.from_pretrained(model_name_vit)

cuda


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [3]:
data_path = Path('/kaggle/input/csiro-biomass')

labels = [
  "Dry_Clover_g",
  "Dry_Dead_g",
  "Dry_Green_g",
  "Dry_Total_g",
  "GDM_g"
]

train = pl.read_csv(data_path / 'train.csv')

df = (
    train
    .with_columns([
        pl.when(pl.col('target_name') == label).then(pl.col('target')).alias(label)
        for label in labels
    ])
    .group_by('image_path')
    .agg([
        pl.col(label).mean()
        for label in labels
    ] + [
        pl.concat_str(["Sampling_Date", "State"], separator=" ")
        .alias("group")
        .first()
    ])
    .sort('image_path')
)

test = pl.read_csv(data_path / 'test.csv')

df_test = (
    test
    .group_by('image_path')
    .len()
    .sort('image_path')
)

In [4]:
def compute_features_siglip(images, save_path):
    batch_size = 20
    with torch.no_grad(), open(save_path, 'w') as f:
        for i in tqdm(range(0, len(images), batch_size)):
            batch_paths = images[i:i + batch_size]
            batch = [Image.open(data_path / p) for p in batch_paths]
            
            inputs = processor_siglip(images=batch, return_tensors="pt").to(model_siglip.device)
            features = model_siglip.get_image_features(**inputs)
            
            for line in features:
                data = {f'siglip_{j}': line[j].item() for j in range(len(line))}
                f.write(json.dumps(data) + '\n')

def compute_features_vit(images, save_path):
    batch_size = 20
    with torch.no_grad(), open(save_path, 'w') as f:
        for i in tqdm(range(0, len(images), batch_size)):
            batch_paths = images[i:i + batch_size]
            batch = [Image.open(data_path / p) for p in batch_paths]
            
            inputs = processor_vit(images=batch, return_tensors="pt").to(model_vit.device)
            outputs = model_vit(**inputs)
            features = outputs.last_hidden_state[:, 0, :] 
            
            for line in features:
                # Add prefix 'vit_'
                data = {f'vit_{j}': line[j].item() for j in range(len(line))}
                f.write(json.dumps(data) + '\n')

In [5]:
compute_features_siglip(df['image_path'], 'features_siglip.ndjson')
compute_features_siglip(df_test['image_path'], 'features_test_siglip.ndjson')
responses_siglip = pl.read_ndjson('features_siglip.ndjson')
responses_test_siglip = pl.read_ndjson('features_test_siglip.ndjson')

compute_features_vit(df['image_path'], 'features_vit.ndjson')
compute_features_vit(df_test['image_path'], 'features_test_vit.ndjson')
responses_vit = pl.read_ndjson('features_vit.ndjson')
responses_test_vit = pl.read_ndjson('features_test_vit.ndjson')

df_aug = pl.concat(
    [df, responses_siglip, responses_vit], how='horizontal'
)
df_test_aug = pl.concat(
    [df_test, responses_test_siglip, responses_test_vit], how='horizontal'
)

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
weights = {
    'Dry_Green_g': 0.1,
    'Dry_Dead_g': 0.1,
    'Dry_Clover_g': 0.1,
    'GDM_g': 0.2,
    'Dry_Total_g': 0.5,
}

def competition_metric(y_true, y_pred) -> float:
    y_weighted = 0
    for l, label in enumerate(labels):
        y_weighted = y_weighted + y_true[:, l].mean() * weights[label]

    ss_res = 0
    ss_tot = 0
    for l, label in enumerate(labels):
        ss_res = ss_res + ((y_true[:, l] - y_pred[:, l])**2).mean() * weights[label]
        ss_tot = ss_tot + ((y_true[:, l] - y_weighted)**2).mean() * weights[label]

    return 1 - ss_res / ss_tot

In [7]:
def cross_validate(model, data, data_test, x_columns, random_state=42) -> float:
    assert not any((col in labels for col in x_columns))
    X = data.select(x_columns).to_numpy()
    X_test = data_test.select(x_columns).to_numpy()
    y_true = data.select(labels).to_numpy()
    y_pred = np.zeros([len(X), len(labels)])
    y_pred_test = np.zeros([len(X_test), len(labels)])

    n_splits = 5
    kf = GroupKFold(n_splits=5)
    groups = data.select('group')

    for i, (train_index, test_index) in enumerate(kf.split(X, groups=groups)):
        pca = PCA(3).fit(y_true[train_index])
        for l in range(len(labels)):
            m = deepcopy(model)
            m.fit(X[train_index], y_true[train_index, l])
            y_pred[test_index, l] = m.predict(X[test_index]).clip(0)
            y_pred_test[:, l] += m.predict(X_test).clip(0) / n_splits

        # Apply the PCA trick
        y_pred[test_index] = pca.inverse_transform(pca.transform(y_pred[test_index])).clip(0)
        y_pred_test = pca.inverse_transform(pca.transform(y_pred_test)).clip(0)

        print(f'Fold {i}:', competition_metric(y_true[test_index], y_pred[test_index]))

    print('Full CV:', competition_metric(y_true, y_pred))

    return y_pred, y_pred_test

In [8]:
feature_cols = sorted(responses_siglip.columns + responses_vit.columns)
gb_model = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_iter_no_change=50,
    validation_fraction=0.1,
    random_state=42
)

cb_model = catboost.CatBoostRegressor(
    verbose=False,
    iterations=2000,
    learning_rate=0.05,
    early_stopping_rounds=100,
    random_seed=42
)

In [9]:
_, pred_test_gb = cross_validate(gb_model, df_aug, df_test_aug, feature_cols)

Fold 0: 0.5393715001224832
Fold 1: 0.5930444095994549
Fold 2: 0.6217779736806043
Fold 3: 0.5401076258042282
Fold 4: 0.6102696159597446
Full CV: 0.6100171501172806


In [10]:
_, pred_test_cb = cross_validate(cb_model, df_aug, df_test_aug, feature_cols)

Fold 0: 0.5804548212671232
Fold 1: 0.6010047455426617
Fold 2: 0.6044079327607796
Fold 3: 0.6075696310420455
Fold 4: 0.5370428520868066
Full CV: 0.5999113805245442


In [11]:
pred_test = (
    pred_test_gb * 0.3
    + pred_test_cb * 0.7
)

In [12]:
pred_with_id = pl.concat([
    df_test,
    pl.DataFrame(pred_test, schema=labels),
], how='horizontal')

In [13]:
pred_save = (
    test
    .join(pred_with_id, on='image_path')
    .with_columns(
        pl.coalesce(*[
            pl.when(pl.col('target_name') == col).then(pl.col(col))
            for col in labels
        ]).alias('target')
    )
    .select('sample_id', 'target')
)
pred_save

sample_id,target
str,f64
"""ID1001187975__Dry_Clover_g""",1.526501
"""ID1001187975__Dry_Dead_g""",24.328676
"""ID1001187975__Dry_Green_g""",35.748391
"""ID1001187975__Dry_Total_g""",61.60333
"""ID1001187975__GDM_g""",37.274891


In [14]:
pred_save.write_csv('submission.csv')