# About
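- Feature preparation using SigLIP image embeddings
- Predictions using gradient boosting (scikit-learn `GradientBoostingRegressor` and CatBoost)
- No deep-learning training: SigLIP is used only as a frozen feature extractor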

## Update version 8:
- Added a PCA trick based on the idea that the predicted targets are linearly dependent (see [this notebook](https://www.kaggle.com/code/none00000/lb-0-57-infer-model-code))
- This trick differs from previous approaches, which predict only 3 targets: here all 5 targets are predicted and then projected onto a 3-component PCA subspace
- It improves the leaderboard (LB) score by about 0.01
- The PCA trick is implemented in `cross_validate`

In [None]:
import argparse
from pathlib import Path
import sys
from tqdm.auto import tqdm
import json
from copy import deepcopy
import polars as pl
import numpy as np
import os

import torch
from PIL import Image
from transformers import AutoProcessor, AutoImageProcessor, AutoModel, Siglip2Model, Siglip2ImageProcessor, SiglipModel, SiglipImageProcessor

In [None]:
from sklearn.model_selection import KFold, GroupKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.decomposition import PCA

import catboost

# Prepare features

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Load the pretrained checkpoint from the local Kaggle dataset, move it to the
# selected device and switch it to inference mode.
model_name = "/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1/"
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

# Image preprocessor loaded from the same checkpoint directory.
processor = AutoImageProcessor.from_pretrained(model_name)

In [None]:
# Root directory of the competition data.
data_path = Path('/kaggle/input/csiro-biomass')

# Target names. Order matters: the metric and the CV code address target
# columns by their position in this list.
labels = [
    "Dry_Clover_g",
    "Dry_Dead_g",
    "Dry_Green_g",
    "Dry_Total_g",
    "GDM_g",
]

train = pl.read_csv(data_path / 'train.csv')

# One column per target: carries `target` on rows of that target, null elsewhere.
target_columns = [
    pl.when(pl.col('target_name') == label).then(pl.col('target')).alias(label)
    for label in labels
]

# Per-image aggregation: the mean of each target column ...
per_image_targets = [pl.col(label).mean() for label in labels]
# ... plus a "<Sampling_Date> <State>" key, later used to group CV folds.
group_key = (
    pl.concat_str(["Sampling_Date", "State"], separator=" ")
    .alias("group")
    .first()
)

# Reshape the long table (one row per image/target pair) into one row per image.
long_with_targets = train.with_columns(target_columns)
per_image = long_with_targets.group_by('image_path').agg(per_image_targets + [group_key])
df = per_image.sort('image_path')

df

In [None]:
test = pl.read_csv(data_path / 'test.csv')
# Debug toggle: uncomment to run the inference path on the training rows instead.
# test = train.select('sample_id', 'image_path', 'target_name')

# One row per test image (`len` = number of submission rows for that image),
# ordered by path.
rows_per_image = test.group_by('image_path').len()
df_test = rows_per_image.sort('image_path')

df_test

In [None]:
def compute_features(images, save_path, batch_size=20):
    """Embed images with the global `model` and write the features as NDJSON.

    Images are read relative to the global `data_path`, preprocessed with the
    global `processor`, and passed through `model.get_image_features` in
    batches. One JSON object per image is written to `save_path`, in input
    order, with keys `x_0`, `x_1`, ... holding the feature values.

    Args:
        images: Sliceable sequence of image paths relative to `data_path`.
        save_path: Destination file; overwritten if it already exists.
        batch_size: Number of images per forward pass.
    """
    with torch.no_grad(), open(save_path, 'w') as f:
        for start in tqdm(range(0, len(images), batch_size)):
            batch_paths = images[start:start + batch_size]

            # Image.open is lazy and keeps the file handle open; decode inside
            # a context manager so each handle is released right away.
            # convert() returns a fully loaded copy that outlives the handle.
            batch = []
            for p in batch_paths:
                with Image.open(data_path / p) as img:
                    batch.append(img.convert('RGB'))

            inputs = processor(images=batch, return_tensors="pt").to(model.device)
            features = model.get_image_features(**inputs)

            # One device-to-host transfer per batch instead of one per scalar.
            rows = features.cpu().tolist()
            f.writelines(
                json.dumps({f'x_{j}': value for j, value in enumerate(row)}) + '\n'
                for row in rows
            )

In [None]:
# Embed every training image (in `df` row order) and store the features on disk.
compute_features(images=df['image_path'], save_path='features.ndjson')

In [None]:
# Embed every test image (in `df_test` row order) and store the features on disk.
compute_features(images=df_test['image_path'], save_path='features_test.ndjson')

In [None]:
# Reload the training features: one row per image, one `x_<j>` column per feature.
responses = pl.read_ndjson(source='features.ndjson')

In [None]:
# Reload the test features: one row per image, one `x_<j>` column per feature.
responses_test = pl.read_ndjson(source='features_test.ndjson')
responses_test

In [None]:
# Features were written in the same row order as `df`, so a positional
# (horizontal) concat pairs each image with its feature row.
train_parts = [df, responses]
df_aug = pl.concat(train_parts, how='horizontal')
df_aug

In [None]:
# Features were written in the same row order as `df_test`, so a positional
# (horizontal) concat pairs each test image with its feature row.
test_parts = [df_test, responses_test]
df_test_aug = pl.concat(test_parts, how='horizontal')
df_test_aug

# Set up validation

In [None]:
# Per-target weights used by the competition metric.
weights = dict(
    Dry_Green_g=0.1,
    Dry_Dead_g=0.1,
    Dry_Clover_g=0.1,
    GDM_g=0.2,
    Dry_Total_g=0.5,
)


def competition_metric(y_true, y_pred) -> float:
    """Return a weighted R^2-style score computed jointly over all targets.

    Both arrays are 2-D with one column per entry of the global `labels`, in
    that order. Residual and total sums of squares are per-column means,
    combined using `weights`. The total sum of squares is measured around one
    shared baseline: the weighted average of the per-column means of `y_true`.

    Returns:
        1 - ss_res / ss_tot.
    """
    per_label_weight = [weights[name] for name in labels]

    # Shared baseline: weighted average of the per-target means.
    baseline = 0
    for col, w in enumerate(per_label_weight):
        baseline = baseline + y_true[:, col].mean() * w

    ss_res = 0
    ss_tot = 0
    for col, w in enumerate(per_label_weight):
        actual = y_true[:, col]
        ss_res = ss_res + ((actual - y_pred[:, col])**2).mean() * w
        ss_tot = ss_tot + ((actual - baseline)**2).mean() * w

    return 1 - ss_res / ss_tot

In [None]:
def cross_validate(model, data, data_test, x_columns, random_state=42, n_splits=5) -> tuple[np.ndarray, np.ndarray]:
    """Group-wise cross-validate `model` and build fold-averaged test predictions.

    For every fold, an independent copy of `model` is fitted per target (the
    global `labels`). Predictions are clipped at zero and then passed through
    the "PCA trick": a 3-component PCA is fitted on the fold's training
    targets, and predictions are projected onto that subspace and back, then
    clipped at zero again. Per-fold and overall scores from
    `competition_metric` are printed.

    Args:
        model: Unfitted estimator with `fit`/`predict`; deep-copied per fit.
        data: Training frame with `x_columns`, the `labels` columns and a
            `group` column used to keep groups within a single fold.
        data_test: Test frame containing `x_columns`.
        x_columns: Feature column names; must not include any target.
        random_state: Currently unused; kept for interface compatibility.
        n_splits: Number of GroupKFold folds.

    Returns:
        `(y_pred, y_pred_test)`: out-of-fold predictions for `data` and the
        average over folds of the projected test predictions, each with one
        column per entry of `labels`.
    """
    assert not any(col in labels for col in x_columns), \
        'x_columns must not contain target columns'
    X = data.select(x_columns).to_numpy()
    X_test = data_test.select(x_columns).to_numpy()
    y_true = data.select(labels).to_numpy()
    y_pred = np.zeros([len(X), len(labels)])
    y_pred_test = np.zeros([len(X_test), len(labels)])

    kf = GroupKFold(n_splits=n_splits)
    # Pass groups as a 1-D array rather than a one-column frame.
    groups = data['group'].to_numpy()

    for i, (train_index, test_index) in enumerate(kf.split(X, groups=groups)):
        pca = PCA(3).fit(y_true[train_index])

        # This fold's raw test predictions, kept apart from the running
        # average so the projection below only touches this fold's output.
        fold_pred_test = np.zeros_like(y_pred_test)
        for l in range(len(labels)):
            m = deepcopy(model)
            m.fit(X[train_index], y_true[train_index, l])
            y_pred[test_index, l] = m.predict(X[test_index]).clip(0)
            fold_pred_test[:, l] = m.predict(X_test).clip(0)

        # Apply the PCA trick
        y_pred[test_index] = pca.inverse_transform(pca.transform(y_pred[test_index])).clip(0)
        # Project at full scale with this fold's PCA, then average. Projecting
        # the partially accumulated sum instead would re-project earlier folds
        # and apply the mean-centred projection to values at 1/n_splits scale.
        y_pred_test += pca.inverse_transform(pca.transform(fold_pred_test)).clip(0) / n_splits

        print(f'Fold {i}:', competition_metric(y_true[test_index], y_pred[test_index]))

    print('Full CV:', competition_metric(y_true, y_pred))

    return y_pred, y_pred_test

# Choose a model

In [None]:
# Baseline: DummyRegressor with default settings.
cross_validate(model=DummyRegressor(), data=df_aug, data_test=df_test_aug, x_columns=sorted(responses.columns));

In [None]:
# Linear model with L2 regularisation (default settings).
cross_validate(model=Ridge(), data=df_aug, data_test=df_test_aug, x_columns=sorted(responses.columns));

In [None]:
# Linear model with L1 regularisation (default settings).
cross_validate(model=Lasso(), data=df_aug, data_test=df_test_aug, x_columns=sorted(responses.columns));

In [None]:
# scikit-learn gradient boosting; keep only the fold-averaged test predictions.
_, pred_test_gb = cross_validate(
    model=GradientBoostingRegressor(),
    data=df_aug,
    data_test=df_test_aug,
    x_columns=sorted(responses.columns),
)

In [None]:
# CatBoost with 100 iterations and logging silenced; keep only the fold-averaged
# test predictions.
_, pred_test_cb = cross_validate(
    model=catboost.CatBoostRegressor(verbose=False, iterations=100),
    data=df_aug,
    data_test=df_test_aug,
    x_columns=sorted(responses.columns),
)

I chose the last two models (scikit-learn `GradientBoostingRegressor` and CatBoost) and average their test predictions.

In [None]:
# Equal-weight ensemble: element-wise mean of the two models' test predictions.
pred_test = np.mean([pred_test_gb, pred_test_cb], axis=0)

# Save predictions

In [None]:
# Name the prediction columns after the targets, then attach them positionally
# to the per-image test table.
pred_frame = pl.DataFrame(pred_test, schema=labels)
pred_with_id = pl.concat([df_test, pred_frame], how='horizontal')
pred_with_id

In [None]:
# One expression per target: yields that target's prediction on rows whose
# `target_name` matches it and null elsewhere. Coalescing them picks the single
# non-null value for each row.
target_pickers = [
    pl.when(pl.col('target_name') == col).then(pl.col(col))
    for col in labels
]

# Bring the per-image predictions back to one row per (image, target) sample.
joined = test.join(pred_with_id, on='image_path')
with_target = joined.with_columns(pl.coalesce(*target_pickers).alias('target'))
pred_save = with_target.select('sample_id', 'target')
pred_save

In [None]:
# Write the submission file (columns: sample_id, target).
pred_save.write_csv(file='submission.csv')