# Experiment 002: DINOv2-Large Patch Features + Post-Processing

Following the evolver strategy:
1. Use DINOv2-large (1024 dims) instead of DINOv2-base (768 dims)
2. Mean-pool the patch tokens (`last_hidden_state[:, 1:, :]`) instead of using only the CLS token
3. Apply post-processing to enforce the biomass constraints (GDM = Dry_Green + Dry_Clover, Dry_Total = GDM + Dry_Dead)

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Verify GPU (the model-loading and embedding cells below place tensors on CUDA)
cuda_available = torch.cuda.is_available()
print(f'CUDA available: {cuda_available}')
if cuda_available:
    # Device 0 is only queried when a CUDA device exists; these calls raise otherwise.
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
else:
    print('No CUDA device detected; the cells below that call .cuda() will fail.')

CUDA available: True
GPU: NVIDIA H100 80GB HBM3
Memory: 85.0 GB


In [2]:
# Read the train / test CSVs from the data directory
DATA_DIR = '/home/data'
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}/{split}.csv') for split in ('train', 'test')
)

# Reshape train so each image is a single row with one column per target_name.
# pivot_table aggregates with the mean if an index/column combination repeats.
train_pivot = pd.pivot_table(
    train_df,
    values='target',
    index=['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm'],
    columns='target_name',
).reset_index()

print(f'Pivoted train shape: {train_pivot.shape}')
print(f'Test shape: {test_df.shape}')

Pivoted train shape: (357, 11)
Test shape: (5, 3)


In [3]:
# DINOv2-large backbone (hidden size 1024, versus 768 for the base model)
from transformers import AutoImageProcessor, AutoModel

model_name = 'facebook/dinov2-large'
processor = AutoImageProcessor.from_pretrained(model_name)

# Load the weights, move them to the GPU, then switch the module to eval mode
model = AutoModel.from_pretrained(model_name)
model = model.to('cuda')
model.eval()

print(f'Model loaded: {model_name}')
print(f'Hidden size: {model.config.hidden_size}')

2026-01-15 02:03:06.933586: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-15 02:03:06.949476: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-15 02:03:06.953975: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/549 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Model loaded: facebook/dinov2-large
Hidden size: 1024


In [4]:
# Extract PATCH-BASED embeddings (mean of patch tokens, not CLS)
def extract_patch_embeddings(image_paths, data_dir, batch_size=8):
    """Embed images as the mean of their DINOv2 patch tokens (CLS token excluded).

    Uses the module-level ``processor`` and ``model`` loaded earlier in the notebook.

    Args:
        image_paths: Sized, sliceable sequence (list or NumPy array) of image
            paths relative to ``data_dir``.
        data_dir: Directory the relative paths are resolved against.
        batch_size: Number of images per forward pass.

    Returns:
        2-D ``np.ndarray`` with one row per image and ``hidden_size`` columns.
        An empty ``(0, hidden_size)`` array is returned for empty input.
    """
    if len(image_paths) == 0:
        # np.vstack([]) raises, so hand back a correctly shaped empty array instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(image_paths), batch_size)):
            batch_paths = image_paths[i:i+batch_size]
            images = []

            for path in batch_paths:
                # The context manager closes the underlying file; convert() returns
                # a separate in-memory image that stays valid afterwards.
                with Image.open(f'{data_dir}/{path}') as img:
                    images.append(img.convert('RGB'))

            # Put the inputs on whatever device the model lives on.
            inputs = processor(images=images, return_tensors='pt').to(model.device)
            outputs = model(**inputs)

            # last_hidden_state: (batch, num_patches+1, hidden_size); index 0 is CLS.
            patch_tokens = outputs.last_hidden_state[:, 1:, :]
            patch_mean = patch_tokens.mean(dim=1)  # average over the patch axis

            embeddings.append(patch_mean.cpu().numpy())

    return np.vstack(embeddings)

# Embed every training image, in the row order of train_pivot
print('Extracting train patch embeddings with DINOv2-large...')
train_embeddings = extract_patch_embeddings(
    image_paths=train_pivot['image_path'].values,
    data_dir=DATA_DIR,
)
print(f'Train embeddings shape: {train_embeddings.shape}')

Extracting train patch embeddings with DINOv2-large...


  0%|          | 0/45 [00:00<?, ?it/s]

  2%|▏         | 1/45 [00:00<00:31,  1.42it/s]

  4%|▍         | 2/45 [00:01<00:25,  1.68it/s]

  7%|▋         | 3/45 [00:01<00:23,  1.80it/s]

  9%|▉         | 4/45 [00:02<00:21,  1.87it/s]

 11%|█         | 5/45 [00:02<00:20,  1.91it/s]

 13%|█▎        | 6/45 [00:03<00:20,  1.91it/s]

 16%|█▌        | 7/45 [00:03<00:19,  1.94it/s]

 18%|█▊        | 8/45 [00:04<00:18,  1.96it/s]

 20%|██        | 9/45 [00:04<00:18,  1.97it/s]

 22%|██▏       | 10/45 [00:05<00:17,  1.98it/s]

 24%|██▍       | 11/45 [00:05<00:16,  2.01it/s]

 27%|██▋       | 12/45 [00:06<00:16,  2.01it/s]

 29%|██▉       | 13/45 [00:06<00:16,  2.00it/s]

 31%|███       | 14/45 [00:07<00:15,  2.01it/s]

 33%|███▎      | 15/45 [00:07<00:14,  2.01it/s]

 36%|███▌      | 16/45 [00:08<00:14,  2.01it/s]

 38%|███▊      | 17/45 [00:08<00:13,  2.00it/s]

 40%|████      | 18/45 [00:09<00:13,  2.02it/s]

 42%|████▏     | 19/45 [00:09<00:12,  2.03it/s]

 44%|████▍     | 20/45 [00:10<00:12,  2.03it/s]

 47%|████▋     | 21/45 [00:10<00:11,  2.01it/s]

 49%|████▉     | 22/45 [00:11<00:11,  2.02it/s]

 51%|█████     | 23/45 [00:11<00:10,  2.02it/s]

 53%|█████▎    | 24/45 [00:12<00:10,  2.01it/s]

 56%|█████▌    | 25/45 [00:12<00:09,  2.03it/s]

 58%|█████▊    | 26/45 [00:13<00:09,  2.02it/s]

 60%|██████    | 27/45 [00:13<00:08,  2.01it/s]

 62%|██████▏   | 28/45 [00:14<00:08,  2.00it/s]

 64%|██████▍   | 29/45 [00:14<00:08,  2.00it/s]

 67%|██████▋   | 30/45 [00:15<00:07,  2.00it/s]

 69%|██████▉   | 31/45 [00:15<00:06,  2.01it/s]

 71%|███████   | 32/45 [00:16<00:06,  2.02it/s]

 73%|███████▎  | 33/45 [00:16<00:06,  2.00it/s]

 76%|███████▌  | 34/45 [00:17<00:05,  2.01it/s]

 78%|███████▊  | 35/45 [00:17<00:04,  2.01it/s]

 80%|████████  | 36/45 [00:18<00:04,  2.02it/s]

 82%|████████▏ | 37/45 [00:18<00:03,  2.03it/s]

 84%|████████▍ | 38/45 [00:19<00:03,  2.01it/s]

 87%|████████▋ | 39/45 [00:19<00:02,  2.01it/s]

 89%|████████▉ | 40/45 [00:20<00:02,  2.03it/s]

 91%|█████████ | 41/45 [00:20<00:01,  2.03it/s]

 93%|█████████▎| 42/45 [00:21<00:01,  2.04it/s]

 96%|█████████▌| 43/45 [00:21<00:00,  2.01it/s]

 98%|█████████▊| 44/45 [00:22<00:00,  2.02it/s]

100%|██████████| 45/45 [00:22<00:00,  2.29it/s]

100%|██████████| 45/45 [00:22<00:00,  2.01it/s]

Train embeddings shape: (357, 1024)





In [5]:
# Embed each distinct test image once (test.csv can list an image on several rows)
print('Extracting test patch embeddings...')
test_images_unique = pd.unique(test_df['image_path'])
test_embeddings = extract_patch_embeddings(
    image_paths=test_images_unique,
    data_dir=DATA_DIR,
)
print(f'Test embeddings shape: {test_embeddings.shape}')

Extracting test patch embeddings...


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 13.29it/s]

Test embeddings shape: (1, 1024)





In [None]:
# Wrap the embedding matrices in DataFrames keyed by image_path
emb_cols = [f'emb_{i}' for i in range(train_embeddings.shape[1])]

# Rows of train_embeddings follow train_pivot's row order, so the paths line up positionally
train_emb_df = pd.DataFrame(train_embeddings, columns=emb_cols).assign(
    image_path=train_pivot['image_path'].values
)

# Likewise, rows of test_embeddings follow test_images_unique
test_emb_df = pd.DataFrame(test_embeddings, columns=emb_cols).assign(
    image_path=test_images_unique
)

print(f'Train embeddings df shape: {train_emb_df.shape}')
print(f'Test embeddings df shape: {test_emb_df.shape}')

In [None]:
# Integer-encode the categorical tabular columns
from sklearn.preprocessing import LabelEncoder

# Each vocabulary is the train values plus an extra 'Unknown' class
all_states = pd.concat([train_pivot['State'], pd.Series(['Unknown'])])
all_species = pd.concat([train_pivot['Species'], pd.Series(['Unknown'])])

# fit() returns the encoder itself, so construction and fitting can be chained
le_state = LabelEncoder().fit(all_states)
le_species = LabelEncoder().fit(all_species)

# Adds the encoded columns to train_pivot in place
train_pivot['State_enc'] = le_state.transform(train_pivot['State'])
train_pivot['Species_enc'] = le_species.transform(train_pivot['Species'])

print(f'States: {le_state.classes_}')
print(f'Species count: {len(le_species.classes_)}')

In [None]:
# Merge embeddings with tabular features.
# validate='one_to_one' makes pandas raise MergeError if image_path is duplicated on
# either side, instead of silently multiplying rows and misaligning features/targets.
train_full = train_pivot.merge(train_emb_df, on='image_path', validate='one_to_one')
print(f'Train full shape: {train_full.shape}')

# Define target columns and the per-target weights used by the weighted R2 metric
target_cols = ['Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g', 'Dry_Total_g']
target_weights = {'Dry_Green_g': 0.1, 'Dry_Dead_g': 0.1, 'Dry_Clover_g': 0.1, 'GDM_g': 0.2, 'Dry_Total_g': 0.5}

# Define feature columns (1024 DINOv2-large + 4 tabular)
feature_cols = ['Pre_GSHH_NDVI', 'Height_Ave_cm', 'State_enc', 'Species_enc'] + emb_cols
print(f'Number of features: {len(feature_cols)}')

In [None]:
# Define weighted R2 metric
def weighted_r2(y_true_dict, y_pred_dict, weights):
    """Weighted R2 computed over all targets pooled into one vector.

    Every observation of a target carries that target's weight; the reference
    mean is the weighted mean of the pooled true values.

    Args:
        y_true_dict: Mapping target name -> sequence of true values.
        y_pred_dict: Mapping target name -> sequence of predictions (same lengths).
        weights: Mapping target name -> scalar weight.

    Returns:
        ``1 - SS_res / SS_tot`` using the weighted sums of squares.
    """
    truth_pool, pred_pool, weight_pool = [], [], []

    # Flatten the per-target sequences, repeating each target's weight per sample
    for name, truth in y_true_dict.items():
        truth_pool += list(truth)
        pred_pool += list(y_pred_dict[name])
        weight_pool += [weights[name]] * len(truth)

    t = np.array(truth_pool)
    p = np.array(pred_pool)
    w = np.array(weight_pool)

    weighted_mean = np.sum(w * t) / np.sum(w)
    residual_ss = np.sum(w * (t - p) ** 2)
    total_ss = np.sum(w * (t - weighted_mean) ** 2)

    return 1 - residual_ss / total_ss

In [None]:
# Post-processing function to enforce biomass constraints
# GDM = Dry_Green + Dry_Clover
# Dry_Total = GDM + Dry_Dead
def post_process_biomass(preds_dict, enforce_after_clip=False):
    """Reconcile the five biomass predictions with the additive constraints.

    The predictions are orthogonally projected onto the subspace where
    ``Dry_Green + Dry_Clover = GDM`` and ``Dry_Dead + GDM = Dry_Total``, then
    clipped at zero.

    Clipping happens after the projection, so whenever a projected value is
    negative the clipped result no longer satisfies the constraints exactly.
    Pass ``enforce_after_clip=True`` to re-derive ``GDM_g`` and ``Dry_Total_g``
    from the clipped components, which makes the output both non-negative and
    exactly additive. The default keeps the original behaviour.

    Args:
        preds_dict: Mapping with keys 'Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g',
            'GDM_g' and 'Dry_Total_g', each an equal-length 1-D array of predictions.
        enforce_after_clip: If True, recompute the two aggregate targets from the
            clipped components.

    Returns:
        Dict with the same five keys (in the order listed above) mapping to the
        reconciled 1-D arrays.

    Raises:
        KeyError: If one of the five targets is missing from ``preds_dict``.
    """
    # Order: Dry_Green, Dry_Clover, Dry_Dead, GDM, Dry_Total
    ordered_cols = ['Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g', 'GDM_g', 'Dry_Total_g']

    # Stack predictions into matrix (5 x n_samples)
    Y = np.vstack([preds_dict[col] for col in ordered_cols])

    # Constraint matrix C such that C @ Y = 0 for valid predictions
    C = np.array([
        [1, 1, 0, -1, 0],   # Dry_Green + Dry_Clover = GDM
        [0, 0, 1, 1, -1]    # Dry_Dead + GDM = Dry_Total
    ], dtype=float)

    # Projection matrix: P = I - C^T @ (C @ C^T)^-1 @ C
    # (solve() applies the inverse without forming it explicitly)
    P = np.eye(5) - C.T @ np.linalg.solve(C @ C.T, C)

    # Apply projection and clip to non-negative
    Y_reconciled = (P @ Y).clip(min=0)

    if enforce_after_clip:
        # Rebuild the aggregates from the clipped components so both constraints
        # hold exactly; sums of non-negative values stay non-negative.
        Y_reconciled[3] = Y_reconciled[0] + Y_reconciled[1]
        Y_reconciled[4] = Y_reconciled[3] + Y_reconciled[2]

    # Return as dictionary
    return {col: Y_reconciled[i] for i, col in enumerate(ordered_cols)}

print('Post-processing function defined')

In [None]:
# 5-Fold Cross Validation with LightGBM: one regressor per target per fold
import lightgbm as lgb
from sklearn.model_selection import KFold

N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Hyper-parameters are identical for every fold and target, so they are defined
# once here instead of being rebuilt inside the innermost loop.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 42
}

# Store OOF predictions
oof_preds = {target: np.zeros(len(train_full)) for target in target_cols}
oof_preds_pp = {target: np.zeros(len(train_full)) for target in target_cols}  # Post-processed
fold_scores = []
fold_scores_pp = []

X = train_full[feature_cols].values

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'\n=== Fold {fold + 1} ===')

    X_train, X_val = X[train_idx], X[val_idx]

    fold_y_true = {}
    fold_y_pred = {}

    for target in target_cols:
        y = train_full[target].values
        y_train, y_val = y[train_idx], y[val_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # NOTE(review): the validation fold drives early stopping and is also the
        # fold scored below, so the reported fold R2 is likely somewhat optimistic.
        model_lgb = lgb.train(
            dict(params),  # fresh copy per model so one fit cannot affect the next
            train_data,
            num_boost_round=500,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )

        # Raw predictions, clipped at zero
        preds = model_lgb.predict(X_val)
        preds = np.clip(preds, 0, None)

        oof_preds[target][val_idx] = preds
        fold_y_true[target] = y_val
        fold_y_pred[target] = preds

    # Calculate fold R2 before post-processing
    fold_r2 = weighted_r2(fold_y_true, fold_y_pred, target_weights)
    fold_scores.append(fold_r2)

    # Apply post-processing
    fold_y_pred_pp = post_process_biomass(fold_y_pred)
    fold_r2_pp = weighted_r2(fold_y_true, fold_y_pred_pp, target_weights)
    fold_scores_pp.append(fold_r2_pp)

    # Store post-processed predictions
    for target in target_cols:
        oof_preds_pp[target][val_idx] = fold_y_pred_pp[target]

    print(f'Fold {fold + 1} Weighted R2: {fold_r2:.4f} -> {fold_r2_pp:.4f} (post-processed)')

print(f'\n=== Overall CV Results ===')
print(f'Mean Weighted R2 (raw): {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})')
print(f'Mean Weighted R2 (post-processed): {np.mean(fold_scores_pp):.4f} (+/- {np.std(fold_scores_pp):.4f})')

In [None]:
# Score the pooled out-of-fold predictions against the true targets
oof_y_true = {}
for target in target_cols:
    oof_y_true[target] = train_full[target].values

# Same metric, once for the raw and once for the post-processed OOF predictions
overall_r2, overall_r2_pp = (
    weighted_r2(oof_y_true, candidate, target_weights)
    for candidate in (oof_preds, oof_preds_pp)
)

print(f'Overall OOF Weighted R2 (raw): {overall_r2:.4f}')
print(f'Overall OOF Weighted R2 (post-processed): {overall_r2_pp:.4f}')

In [None]:
# Fit one LightGBM regressor per target on the complete training set
X_full = train_full[feature_cols].values

# Hyper-parameters shared by every target
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    seed=42,
)

final_models = {}
for target in target_cols:
    print(f'Training final model for {target}...')
    y_full = train_full[target].values
    train_data = lgb.Dataset(X_full, label=y_full)

    # Always runs the full 500 rounds: there is no validation set or early stopping
    # here, unlike the cross-validation models.
    model_lgb = lgb.train(dict(params), train_data, num_boost_round=500)
    final_models[target] = model_lgb

print('All final models trained!')

In [None]:
# Build the test feature matrix.
# The tabular columns are not read from test.csv here: every test row gets the
# train mean (numeric) or the train mode (encoded categorical), so these four
# features are constant across the test set.
test_features = test_emb_df.assign(
    Pre_GSHH_NDVI=train_pivot['Pre_GSHH_NDVI'].mean(),
    Height_Ave_cm=train_pivot['Height_Ave_cm'].mean(),
    State_enc=train_pivot['State_enc'].mode()[0],
    Species_enc=train_pivot['Species_enc'].mode()[0],
)

# Select columns in the exact order the models were trained on
X_test = test_features[feature_cols].values
print(f'Test features shape: {X_test.shape}')

In [None]:
# Predict each target for the test images, flooring predictions at zero
test_preds = {}
for target in target_cols:
    preds = np.clip(final_models[target].predict(X_test), a_min=0, a_max=None)
    test_preds[target] = preds
    print(f'{target}: mean={preds.mean():.2f}')

# Reconcile the five targets with the biomass constraints
test_preds_pp = post_process_biomass(test_preds)
print('\nAfter post-processing:')
for target in target_cols:
    reconciled = test_preds_pp[target]
    print(f'{target}: mean={reconciled.mean():.2f}')

In [None]:
# Create submission file with post-processed predictions:
# one row per (test image, target) with sample_id '<image id>__<target>'
submission_rows = []

for i, img_path in enumerate(test_images_unique):
    # Image id = file name without directory or extension. splitext() removes only
    # the final extension, whatever it is; str.replace('.jpg', '') would miss other
    # extensions or casing and would also strip '.jpg' from the middle of a name.
    img_id = os.path.splitext(os.path.basename(img_path))[0]

    for target in target_cols:
        sample_id = f'{img_id}__{target}'
        pred_value = test_preds_pp[target][i]
        submission_rows.append({'sample_id': sample_id, 'target': pred_value})

submission_df = pd.DataFrame(submission_rows)
print(f'Submission shape: {submission_df.shape}')
print(submission_df)

In [None]:
# Save submission
submission_path = '/home/submission/submission.csv'
# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df.to_csv(submission_path, index=False)
print(f'Submission saved to {submission_path}')

# Verify format: compare our columns with the sample submission shipped in DATA_DIR
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
print(f'\nSample submission columns: {sample_sub.columns.tolist()}')
print(f'Our submission columns: {submission_df.columns.tolist()}')

In [None]:
# Final summary: gather the report lines first, then emit them in one print
summary_lines = [
    '='*60,
    'EXPERIMENT 002 RESULTS SUMMARY',
    '='*60,
    f'Model: DINOv2-large PATCH embeddings + LightGBM + Post-processing',
    f'Features: {len(feature_cols)} (1024 DINOv2-large patch + 4 tabular)',
    f'CV Folds: {N_FOLDS}',
    f'\nRaw predictions:',
    f'  Mean CV Weighted R2: {np.mean(fold_scores):.4f} (+/- {np.std(fold_scores):.4f})',
    f'  Overall OOF Weighted R2: {overall_r2:.4f}',
    f'\nPost-processed predictions:',
    f'  Mean CV Weighted R2: {np.mean(fold_scores_pp):.4f} (+/- {np.std(fold_scores_pp):.4f})',
    f'  Overall OOF Weighted R2: {overall_r2_pp:.4f}',
    # 0.7584 is the hard-coded baseline score this experiment is compared against
    f'\nBaseline comparison: 0.7584 -> {overall_r2_pp:.4f} ({overall_r2_pp - 0.7584:+.4f})',
    '='*60,
]
print('\n'.join(summary_lines))