# Baseline: DINOv2 Embeddings + LightGBM

This baseline extracts DINOv2 embeddings from the images, combines them with the tabular features, and trains one LightGBM regressor per target.

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Verify GPU: always report CUDA availability, but only query device details
# when a device exists (the device queries raise if CUDA is unavailable).
cuda_available = torch.cuda.is_available()
print(f'CUDA available: {cuda_available}')
if cuda_available:
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
else:
    print('GPU: none detected - the cells that move the model to CUDA will fail')

CUDA available: True
GPU: NVIDIA H100 80GB HBM3
Memory: 85.0 GB


In [2]:
# Read the competition tables from DATA_DIR
DATA_DIR = '/home/data'
train_df, test_df = (pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test'))

for split_label, frame in (('Train', train_df), ('Test', test_df)):
    print(f'{split_label} shape: {frame.shape}')
print(f'\nTrain columns: {list(train_df.columns)}')
print(f'\nTarget names: {train_df["target_name"].unique()}')

# Distinct image paths per split (the same image appears on several rows)
train_images, test_images = (df['image_path'].unique() for df in (train_df, test_df))
print(f'\nUnique train images: {train_images.size}')
print(f'Unique test images: {test_images.size}')

Train shape: (1785, 9)
Test shape: (5, 3)

Train columns: ['sample_id', 'image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm', 'target_name', 'target']

Target names: ['Dry_Clover_g' 'Dry_Dead_g' 'Dry_Green_g' 'Dry_Total_g' 'GDM_g']

Unique train images: 357
Unique test images: 1


In [3]:
# Pivot train data to have one row per image with all targets as columns.
# The per-image metadata columns form the pivot index so they are carried along.
pivot_index = ['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
train_pivot = train_df.pivot_table(
    index=pivot_index,
    columns='target_name',
    values='target',
).reset_index()

# pivot_table drops rows whose index keys contain NaN and averages duplicate
# (index, target_name) entries without warning, so check explicitly that every
# image survived as exactly one row before later cells rely on that.
n_images = train_df['image_path'].nunique()
assert train_pivot['image_path'].is_unique, 'an image maps to more than one metadata combination'
assert len(train_pivot) == n_images, f'pivot kept {len(train_pivot)} of {n_images} images'

print(f'Pivoted train shape: {train_pivot.shape}')
print(train_pivot.head())

Pivoted train shape: (357, 11)
target_name              image_path Sampling_Date State            Species  \
0            train/ID1011485656.jpg      2015/9/4   Tas    Ryegrass_Clover   
1            train/ID1012260530.jpg      2015/4/1   NSW            Lucerne   
2            train/ID1025234388.jpg      2015/9/1    WA  SubcloverDalkeith   
3            train/ID1028611175.jpg     2015/5/18   Tas           Ryegrass   
4            train/ID1035947949.jpg     2015/9/11   Tas           Ryegrass   

target_name  Pre_GSHH_NDVI  Height_Ave_cm  Dry_Clover_g  Dry_Dead_g  \
0                     0.62         4.6667        0.0000     31.9984   
1                     0.55        16.0000        0.0000      0.0000   
2                     0.38         1.0000        6.0500      0.0000   
3                     0.66         5.0000        0.0000     30.9703   
4                     0.54         3.5000        0.4343     23.2239   

target_name  Dry_Green_g  Dry_Total_g    GDM_g  
0                16.2751

In [4]:
# Load the pretrained DINOv2 backbone together with its image preprocessor
from transformers import AutoImageProcessor, AutoModel

model_name = 'facebook/dinov2-base'
processor = AutoImageProcessor.from_pretrained(model_name)

# Load the weights, move them to the GPU, then switch the module to eval mode
model = AutoModel.from_pretrained(model_name)
model = model.cuda()
model.eval()

print(f'Model loaded: {model_name}')
print(f'Hidden size: {model.config.hidden_size}')

2026-01-15 01:52:27.152606: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-15 01:52:27.168411: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-15 01:52:27.172879: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Model loaded: facebook/dinov2-base
Hidden size: 768


In [5]:
# Extract embeddings for all images
def extract_embeddings(image_paths, data_dir, batch_size=16):
    """Embed images with the notebook-level DINOv2 ``model`` and ``processor``.

    Args:
        image_paths: Sequence of image paths relative to ``data_dir``.
        data_dir: Directory the relative paths are resolved against.
        batch_size: Number of images sent through the model per forward pass.

    Returns:
        ``np.ndarray`` with one row per input path, in input order, holding the
        CLS-token embedding (token 0 of ``last_hidden_state``). An empty input
        yields an array of shape ``(0, model.config.hidden_size)``.
    """
    if len(image_paths) == 0:
        # np.vstack raises on an empty list, so return a correctly shaped empty result.
        return np.empty((0, model.config.hidden_size))

    # Follow the model's own device instead of hard-coding 'cuda'.
    device = model.device
    embeddings = []

    with torch.no_grad():
        for start in tqdm(range(0, len(image_paths), batch_size)):
            images = []
            for path in image_paths[start:start + batch_size]:
                # convert() returns a separate, fully loaded image, so the
                # underlying file can be closed as soon as the block exits.
                with Image.open(f'{data_dir}/{path}') as img:
                    images.append(img.convert('RGB'))

            inputs = processor(images=images, return_tensors='pt').to(device)
            outputs = model(**inputs)

            # Use CLS token embedding
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.vstack(embeddings)

# Embed every training image, in train_pivot row order
print('Extracting train embeddings...')
train_embeddings = extract_embeddings(
    image_paths=train_pivot['image_path'].values,
    data_dir=DATA_DIR,
)
print(f'Train embeddings shape: {train_embeddings.shape}')

Extracting train embeddings...


  0%|          | 0/23 [00:00<?, ?it/s]

  4%|▍         | 1/23 [00:01<00:25,  1.15s/it]

  9%|▊         | 2/23 [00:02<00:22,  1.05s/it]

 13%|█▎        | 3/23 [00:03<00:20,  1.02s/it]

 17%|█▋        | 4/23 [00:04<00:18,  1.00it/s]

 22%|██▏       | 5/23 [00:05<00:17,  1.02it/s]

 26%|██▌       | 6/23 [00:05<00:16,  1.04it/s]

 30%|███       | 7/23 [00:06<00:15,  1.04it/s]

 35%|███▍      | 8/23 [00:07<00:14,  1.05it/s]

 39%|███▉      | 9/23 [00:08<00:13,  1.05it/s]

 43%|████▎     | 10/23 [00:09<00:12,  1.05it/s]

 48%|████▊     | 11/23 [00:10<00:11,  1.05it/s]

 52%|█████▏    | 12/23 [00:11<00:10,  1.05it/s]

 57%|█████▋    | 13/23 [00:12<00:09,  1.05it/s]

 61%|██████    | 14/23 [00:13<00:08,  1.04it/s]

 65%|██████▌   | 15/23 [00:14<00:07,  1.04it/s]

 70%|██████▉   | 16/23 [00:15<00:06,  1.05it/s]

 74%|███████▍  | 17/23 [00:16<00:05,  1.05it/s]

 78%|███████▊  | 18/23 [00:17<00:04,  1.05it/s]

 83%|████████▎ | 19/23 [00:18<00:03,  1.05it/s]

 87%|████████▋ | 20/23 [00:19<00:02,  1.05it/s]

 91%|█████████▏| 21/23 [00:20<00:01,  1.05it/s]

 96%|█████████▌| 22/23 [00:21<00:00,  1.04it/s]

100%|██████████| 23/23 [00:21<00:00,  1.32it/s]

100%|██████████| 23/23 [00:21<00:00,  1.07it/s]

Train embeddings shape: (357, 768)





In [None]:
# Embed each distinct test image once (test.csv lists the same image on several rows)
print('Extracting test embeddings...')
test_images_unique = pd.unique(test_df['image_path'])
test_embeddings = extract_embeddings(
    image_paths=test_images_unique,
    data_dir=DATA_DIR,
)
print(f'Test embeddings shape: {test_embeddings.shape}')

In [None]:
# Wrap the embedding matrices in DataFrames keyed by image_path
embedding_dim = train_embeddings.shape[1]
emb_cols = [f'emb_{i}' for i in range(embedding_dim)]

# Rows follow the order of the path arrays the embeddings were computed from
train_emb_df = pd.DataFrame(train_embeddings, columns=emb_cols).assign(
    image_path=train_pivot['image_path'].values
)
test_emb_df = pd.DataFrame(test_embeddings, columns=emb_cols).assign(
    image_path=test_images_unique
)

for split_label, frame in (('Train', train_emb_df), ('Test', test_emb_df)):
    print(f'{split_label} embeddings df shape: {frame.shape}')

In [None]:
# Integer-encode the categorical metadata columns
from sklearn.preprocessing import LabelEncoder

# The encoders are fitted on the training categories plus an extra 'Unknown'
# label (no test values are involved here).
all_states = pd.concat([train_pivot['State'], pd.Series(['Unknown'])])
all_species = pd.concat([train_pivot['Species'], pd.Series(['Unknown'])])

le_state = LabelEncoder().fit(all_states)
le_species = LabelEncoder().fit(all_species)

# Store the encoded values next to the raw columns
train_pivot['State_enc'] = le_state.transform(train_pivot['State'])
train_pivot['Species_enc'] = le_species.transform(train_pivot['Species'])

print(f'States: {le_state.classes_}')
print(f'Species count: {len(le_species.classes_)}')

In [None]:
# Merge embeddings with tabular features.
# Both frames are expected to hold exactly one row per image; validate makes
# pandas raise a MergeError instead of silently multiplying rows if a key repeats.
train_full = train_pivot.merge(train_emb_df, on='image_path', validate='one_to_one')
print(f'Train full shape: {train_full.shape}')

# Define target columns and the per-target weights used by the weighted R2 metric
target_cols = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']
target_weights = {'Dry_Green_g': 0.1, 'Dry_Dead_g': 0.1, 'Dry_Clover_g': 0.1, 'GDM_g': 0.2, 'Dry_Total_g': 0.5}

# Define feature columns: tabular metadata first, then the embedding dimensions
feature_cols = ['Pre_GSHH_NDVI', 'Height_Ave_cm', 'State_enc', 'Species_enc'] + emb_cols
print(f'Number of features: {len(feature_cols)}')

In [None]:
# Define weighted R2 metric
def weighted_r2(y_true_dict, y_pred_dict, weights):
    """Calculate globally weighted R2 across all targets.

    All targets are pooled into one long vector. Every sample carries the
    weight of its target, and a single R2 is computed around the weighted
    mean of the pooled true values.

    Args:
        y_true_dict: Mapping target name -> 1-D sequence of true values.
        y_pred_dict: Mapping target name -> 1-D sequence of predictions,
            aligned element-wise with ``y_true_dict[target]``.
        weights: Mapping target name -> scalar weight applied to every sample
            of that target.

    Returns:
        ``1 - SS_res / SS_tot`` as a float. If ``SS_tot`` is zero (all pooled
        true values are identical) the ratio is undefined, so 1.0 is returned
        for a perfect fit and 0.0 otherwise. ``nan`` is returned when there
        are no samples at all.

    Raises:
        KeyError: If a target of ``y_true_dict`` is missing from
            ``y_pred_dict`` or ``weights``.
        ValueError: If true and predicted values of a target differ in length.
    """
    true_parts, pred_parts, weight_parts = [], [], []

    for target, target_true in y_true_dict.items():
        target_true = np.asarray(target_true, dtype=float).ravel()
        target_pred = np.asarray(y_pred_dict[target], dtype=float).ravel()
        # Without this check a length-1 prediction would be silently broadcast
        # against the true values and produce a meaningless score.
        if target_true.shape != target_pred.shape:
            raise ValueError(
                f'{target}: got {target_true.size} true values but '
                f'{target_pred.size} predictions'
            )
        true_parts.append(target_true)
        pred_parts.append(target_pred)
        weight_parts.append(np.full(target_true.shape, weights[target], dtype=float))

    if not true_parts:
        return float('nan')

    all_y_true = np.concatenate(true_parts)
    all_y_pred = np.concatenate(pred_parts)
    all_weights = np.concatenate(weight_parts)
    if all_y_true.size == 0:
        return float('nan')

    # Weighted mean
    y_mean = np.sum(all_weights * all_y_true) / np.sum(all_weights)

    # SS_res and SS_tot
    ss_res = np.sum(all_weights * (all_y_true - all_y_pred) ** 2)
    ss_tot = np.sum(all_weights * (all_y_true - y_mean) ** 2)

    if ss_tot == 0:
        # Constant targets: avoid 0/0 or x/0 and report a finite score instead.
        return 1.0 if ss_res == 0 else 0.0

    return 1 - ss_res / ss_tot

In [None]:
# 5-Fold Cross Validation with LightGBM (one regressor per target in every fold)
import lightgbm as lgb
from sklearn.model_selection import KFold

N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# LightGBM parameters, shared by every fold and every target
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    seed=42,
)

X = train_full[feature_cols].values

# Out-of-fold predictions, filled in one validation fold at a time
oof_preds = {target: np.zeros(len(train_full)) for target in target_cols}
fold_scores = []

# NOTE(review): early stopping monitors the same fold that is scored afterwards,
# so the reported R2 is probably somewhat optimistic - confirm this is acceptable.
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'\n=== Fold {fold + 1} ===')

    X_train, X_val = X[train_idx], X[val_idx]
    fold_y_true, fold_y_pred = {}, {}

    for target in target_cols:
        y = train_full[target].values
        y_train, y_val = y[train_idx], y[val_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
        model_lgb = lgb.train(
            params,
            train_data,
            num_boost_round=500,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)],
        )

        # Biomass can't be negative, so floor the predictions at zero
        preds = np.clip(model_lgb.predict(X_val), 0, None)

        oof_preds[target][val_idx] = preds
        fold_y_true[target], fold_y_pred[target] = y_val, preds

    # Score this fold across all targets at once
    fold_r2 = weighted_r2(fold_y_true, fold_y_pred, target_weights)
    fold_scores.append(fold_r2)
    print(f'Fold {fold + 1} Weighted R2: {fold_r2:.4f}')

print(f'\n=== Overall CV Results ===')
print(f'Mean Weighted R2: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})')

In [None]:
# Score all out-of-fold predictions together (one pooled R2 rather than a mean over folds)
oof_y_true = {name: train_full[name].values for name in target_cols}
overall_r2 = weighted_r2(
    y_true_dict=oof_y_true,
    y_pred_dict=oof_preds,
    weights=target_weights,
)
print(f'Overall OOF Weighted R2: {overall_r2:.4f}')

In [None]:
# Fit one final LightGBM model per target on the complete training set
X_full = train_full[feature_cols].values

# Same hyper-parameters for every target
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    seed=42,
)

# NOTE(review): these fits always run 500 rounds while the CV models were
# early-stopped, so they may not match what the CV score measured - confirm.
final_models = {}
for target in target_cols:
    print(f'Training final model for {target}...')
    train_data = lgb.Dataset(X_full, label=train_full[target].values)
    final_models[target] = lgb.train(params, train_data, num_boost_round=500)

print('All final models trained!')

In [None]:
# Build the test feature matrix.
# No tabular metadata is used for the test images, so every test row receives the
# same stand-ins: training means for the numeric columns and training modes for
# the encoded categorical ones.
test_features = test_emb_df.assign(
    Pre_GSHH_NDVI=train_pivot['Pre_GSHH_NDVI'].mean(),
    Height_Ave_cm=train_pivot['Height_Ave_cm'].mean(),
    State_enc=train_pivot['State_enc'].mode().iloc[0],
    Species_enc=train_pivot['Species_enc'].mode().iloc[0],
)

# Select columns in the exact order the models were trained on
X_test = test_features[feature_cols].values
print(f'Test features shape: {X_test.shape}')

In [None]:
# Predict every target for the test images with its dedicated final model
test_preds = {}
for target in target_cols:
    # Biomass can't be negative, so floor the raw predictions at zero
    preds = np.clip(final_models[target].predict(X_test), 0, None)
    test_preds[target] = preds
    print(f'{target}: mean={preds.mean():.2f}, min={preds.min():.2f}, max={preds.max():.2f}')

In [None]:
# Create submission file: one row per (test image, target) pair.
# NOTE(review): sample_id is assumed to follow '<image id>__<target>' - verify
# against the sample_id values in test.csv / the sample submission.
submission_rows = []

for i, img_path in enumerate(test_images_unique):
    # Strip directory and extension via os.path; str.replace('.jpg', '') would
    # also cut a '.jpg' occurring mid-name and leaves other extensions in place.
    img_id = os.path.splitext(os.path.basename(img_path))[0]

    for target in target_cols:
        submission_rows.append({
            'sample_id': f'{img_id}__{target}',
            'target': test_preds[target][i],
        })

# Naming the columns keeps the schema intact even when there are no rows.
submission_df = pd.DataFrame(submission_rows, columns=['sample_id', 'target'])
print(f'Submission shape: {submission_df.shape}')
print(submission_df.head(10))

In [None]:
# Save submission
SUBMISSION_PATH = '/home/submission/submission.csv'
# to_csv does not create missing parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
submission_df.to_csv(SUBMISSION_PATH, index=False)
print(f'Submission saved to {SUBMISSION_PATH}')

# Verify format matches sample (read via DATA_DIR instead of repeating the literal path)
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
print(f'\nSample submission columns: {sample_sub.columns.tolist()}')
print(f'Our submission columns: {submission_df.columns.tolist()}')
print(f'\nSample submission shape: {sample_sub.shape}')
print(f'Our submission shape: {submission_df.shape}')

In [None]:
# Final summary
# Derive the feature breakdown from the actual column lists so the report stays
# correct if the backbone or the tabular feature set changes.
n_embedding_features = len(emb_cols)
n_tabular_features = len(feature_cols) - n_embedding_features

print('='*50)
print('BASELINE RESULTS SUMMARY')
print('='*50)
print('Model: DINOv2-base embeddings + LightGBM')
print(f'Features: {len(feature_cols)} ({n_embedding_features} DINOv2 + {n_tabular_features} tabular)')
print(f'CV Folds: {N_FOLDS}')
print(f'Mean CV Weighted R2: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})')
print(f'Overall OOF Weighted R2: {overall_r2:.4f}')
print('='*50)