In [None]:
import pandas as pd
import numpy as np
import torch
import torchvision
import os
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Subset, Dataset
from PIL import Image
# Copy a cached DINOv2 torch.hub checkout from the attached input dataset into the working directory.
# NOTE(review): no cell visible in this notebook reads /kaggle/working/dinov2 (the model is loaded
# through `transformers` from /kaggle/input/dinov2) — confirm whether this copy is still needed.
!cp -r "/kaggle/input/rsna-models/facebookresearch_dinov2_main (1)/root/.cache/torch/hub/facebookresearch_dinov2_main" /kaggle/working/dinov2
# Per-channel normalisation statistics, consumed by the torchvision `transform` pipeline built later.
# presumably the usual ImageNet RGB mean/std — TODO confirm
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

In [None]:
from transformers import AutoImageProcessor, AutoModel

# The image preprocessor and the backbone are both restored from the same local checkpoint
# directory, so the path is spelled out once.
_dinov2_checkpoint = '/kaggle/input/dinov2/pytorch/giant/1'
processor = AutoImageProcessor.from_pretrained(_dinov2_checkpoint)
# Move the weights onto the GPU straight away; later cells send their inputs to CUDA as well.
model = AutoModel.from_pretrained(_dinov2_checkpoint).cuda()

In [None]:
import torchvision.transforms as transforms
import pandas as pd
from PIL import Image

# Build one DINOv2 embedding per training image and collect the regression targets.
#
# Outputs of this cell:
#   embeds  - list of CPU tensors, one pooled embedding per image
#   targets - N_TARGETS lists; targets[k][j] is the k-th target value of the j-th image
#
# train.csv is consumed in "long" layout: N_TARGETS consecutive rows per image, and only
# the first row of each group triggers a forward pass.
# NOTE(review): this assumes every image's rows appear in the same target order as the
# indices in `mapping` — confirm against train.csv.
N_TARGETS = 5

root = "/kaggle/input/csiro-biomass/"
train_df = pd.read_csv(root + "train.csv")

# Fail loudly instead of silently mis-bucketing targets when the layout assumption breaks.
assert len(train_df) % N_TARGETS == 0, (
    f"expected {N_TARGETS} rows per image, got {len(train_df)} rows in total"
)

embeds = []
targets = [[] for _ in range(N_TARGETS)]
counter = 0

for i in range(len(train_df)):
    entry = train_df.iloc[i]
    # Shape (1, 1) so the per-target lists can be concatenated into (n_images, 1) later.
    targets[i % N_TARGETS].append(torch.tensor([[entry['target']]]))

    if i % N_TARGETS != 0:
        continue  # this image was already embedded on the first row of its group

    # The context manager releases the file handle as soon as preprocessing is done.
    with Image.open(root + entry['image_path']) as img:
        # Ask the processor for a batched torch tensor directly instead of wrapping a
        # list of numpy arrays in torch.tensor (slow, and triggers a UserWarning).
        x = processor(images=img, return_tensors="pt").pixel_values

    with torch.no_grad():
        embeds.append(model(x.cuda()).pooler_output.cpu())

    counter += 1
    if counter % 100 == 0:
        print(f"{counter} batches processed.")

In [None]:
import random
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

# Fit one Lasso regressor per (target, split) on the image embeddings and report R².
#
# Outputs of this cell:
#   splits     - list of (train_idxs, val_idxs) pairs shared by every target
#   regressors - regressors[target_index][split_index] is the fitted Lasso model

# Create indices and shuffle once. The per-split shuffles below start from this order,
# so it has to stay in place for the splits to be reproducible.
lst = list(range(len(embeds)))
random.seed(42)
random.shuffle(lst)

# Create multiple random 80/20 splits (repeated hold-out, not disjoint k-fold).
n_splits = 5
splits = []
for i in range(n_splits):
    temp_lst = lst.copy()
    random.seed(42 + i)  # different seed per split, identical across targets
    random.shuffle(temp_lst)

    split_point = int(len(temp_lst) * 0.8)
    splits.append((temp_lst[:split_point], temp_lst[split_point:]))

# Stack the embeddings into a single array once; every target reuses it.
embeds_np = torch.cat(embeds).numpy()

# Size the model grid from the data and from n_splits rather than a literal 5, so that
# changing n_splits (or the number of targets) cannot index past the grid.
n_targets = len(targets)
regressors = [[None] * n_splits for _ in range(n_targets)]

for i in range(n_targets):
    print(f"\n=== Target {i+1} ===")
    targets_np = torch.cat(targets[i]).numpy()

    split_scores = []

    for split_idx, (train_idxs, val_idxs) in enumerate(splits):
        print(f"Fold {split_idx+1}:")
        X_train, y_train = embeds_np[train_idxs], targets_np[train_idxs]
        X_val, y_val = embeds_np[val_idxs], targets_np[val_idxs]

        reg = Lasso()
        reg.fit(X_train, y_train)

        # Predictions are floored at zero before scoring.
        train_preds = np.clip(reg.predict(X_train), 0.0, None)
        val_preds = np.clip(reg.predict(X_val), 0.0, None)
        train_r2 = r2_score(y_train, train_preds)
        val_r2 = r2_score(y_val, val_preds)

        print(f"  Train R²: {train_r2:.4f}")
        print(f"  Val R²: {val_r2:.4f}")
        split_scores.append((train_r2, val_r2))
        regressors[i][split_idx] = reg

    # Print summary for this target
    avg_train_r2 = np.mean([score[0] for score in split_scores])
    avg_val_r2 = np.mean([score[1] for score in split_scores])
    print(f"\nTarget {i+1} Average:")
    print(f"  Avg Train R²: {avg_train_r2:.4f}")
    print(f"  Avg Val R²: {avg_val_r2:.4f}")

In [None]:
# Target name (the part of a sample_id after "__") -> first-axis index into `regressors`.
mapping = {
    name: index
    for index, name in enumerate(
        ("Dry_Clover_g", "Dry_Dead_g", "Dry_Green_g", "Dry_Total_g", "GDM_g")
    )
}

In [None]:
import torchvision.transforms as transforms
import pandas as pd
from PIL import Image

# Embed every distinct test image exactly once.
#
# Outputs of this cell:
#   test_embeds - dict: image key (sample_id prefix before "__") -> pooled embedding (CPU tensor)
#   sample_ids  - every sample_id of test.csv in file order

# Not used by this cell (the HF `processor` does the preprocessing); kept so the name
# `transform` stays defined for any code that still refers to it.
transform = transforms.Compose([transforms.ToTensor(), transforms.Resize((224, 224)), transforms.Normalize(mean, std)])

root = "/kaggle/input/csiro-biomass/"
test_df = pd.read_csv(root + "test.csv")

test_embeds = {}
sample_ids = []
counter = 0

for i in range(len(test_df)):
    entry = test_df.iloc[i]
    sample_id = entry['sample_id']
    sample_ids.append(sample_id)

    # Key on the image part of the id, split on "__" exactly as the prediction cell does
    # when it looks embeddings up. Keying on the full sample_id (unique per row) would
    # re-run the model once per target row of the same image.
    # NOTE(review): assumes all rows sharing this prefix point at the same image_path — confirm.
    image_key = sample_id.split("__")[0]
    if image_key in test_embeds:
        continue

    # The context manager releases the file handle as soon as preprocessing is done.
    with Image.open(root + entry['image_path']) as img:
        # Request a batched torch tensor directly rather than wrapping numpy arrays.
        x = processor(images=img, return_tensors="pt").pixel_values

    with torch.no_grad():
        test_embeds[image_key] = model(x.cuda()).pooler_output.cpu()

    counter += 1
    # Report only when a new image was actually embedded.
    if counter % 100 == 0:
        print(f"{counter} batches processed.")

In [None]:
# Turn the cached test embeddings into one prediction per test.csv row.
#
# Outputs of this cell (always the same length, in file order):
#   sample_ids  - sample_id of every row
#   predictions - mean of the per-split regressors' zero-floored predictions; 0.0 if a row fails
predictions = []
sample_ids = []
test_df = pd.read_csv("/kaggle/input/csiro-biomass/test.csv")

for sample_id in test_df['sample_id']:
    # Record the id before anything can fail so ids and predictions never drift apart.
    sample_ids.append(sample_id)
    try:
        id_parts = sample_id.split("__")
        image_key, target_name = id_parts[0], id_parts[1]

        X = np.asarray(test_embeds[image_key])
        models = regressors[mapping[target_name]]

        # Each model predicts a single row; floor at zero before averaging.
        fold_preds = [max(float(np.ravel(reg.predict(X))[0]), 0.0) for reg in models]
        # Average over however many split models exist rather than a literal 5.
        predictions.append(sum(fold_preds) / len(fold_preds))
    except Exception as exc:
        # Best-effort fallback is kept, but the failure is reported instead of hidden.
        print(f"WARNING: could not predict {sample_id!r} ({type(exc).__name__}: {exc}); using 0.0")
        predictions.append(0.0)

assert len(sample_ids) == len(predictions)

In [None]:
%cd /kaggle/working
# Assemble the two-column submission table (ids first, then predicted values), write it
# next to the notebook without the index column, and display it as the cell output.
submission = pd.DataFrame({'sample_id': sample_ids}).assign(target=predictions)
submission.to_csv('submission.csv', index=False)

submission

##### Acknowledgement: https://www.kaggle.com/code/carsoncheng/dinov2-lasso-baseline-lb-0-54