# In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor

# ===========================
# CONFIG
# ===========================
# Local directory the CLIP model and processor are loaded from.
MODEL_ID = "/kaggle/input/clippp/transformers/default/1"

# Root folder holding train.csv / test.csv; image paths are joined onto it.
# Adjust this path when running outside the original environment.
BASE = "/kaggle/input/csiro-biomass"

# Number of images pushed through CLIP per forward pass.
BATCH_SIZE = 16

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# ===========================
# LOAD MODEL
# ===========================
print("Loading CLIP model...")

# local_files_only=True makes from_pretrained read MODEL_ID from disk only.
model = CLIPModel.from_pretrained(MODEL_ID, local_files_only=True)
# Move the weights to the selected device and switch to evaluation mode;
# both calls return the module itself.
model = model.to(DEVICE).eval()

processor = CLIPProcessor.from_pretrained(MODEL_ID, local_files_only=True)

# ===========================
# FUNCTION: CLIP image embeddings
# ===========================
def get_clip_embeds(image_paths, base_dir):
    """Compute L2-normalized CLIP image embeddings for a list of image files.

    Images are read in chunks of ``BATCH_SIZE``, preprocessed with the
    module-level ``processor``, run through the module-level ``model`` on
    ``DEVICE``, and each feature vector is divided by its L2 norm.

    Args:
        image_paths: Sequence of image paths relative to ``base_dir``.
        base_dir: Directory each entry of ``image_paths`` is joined onto.

    Returns:
        A 2-D NumPy array with one row per input path, in input order. For an
        empty ``image_paths`` a zero-row array is returned.

    Notes:
        An image that cannot be opened is reported on stdout and replaced by
        a blank 224x224 RGB image so row alignment with ``image_paths`` is
        preserved.
    """
    if len(image_paths) == 0:
        # np.vstack([]) raises ValueError, so short-circuit with an empty
        # matrix. The width comes from the model config so the result can
        # still be stacked with other features.
        # NOTE(review): float32 assumes the model runs in its default precision.
        return np.empty((0, model.config.projection_dim), dtype=np.float32)

    feats = []
    for start in tqdm(range(0, len(image_paths), BATCH_SIZE), desc="Extracting CLIP image embeddings"):
        images = []
        for rel_path in image_paths[start:start + BATCH_SIZE]:
            try:
                # Image.open is lazy and keeps the file handle open; the
                # context manager releases it once convert() has produced a
                # fully loaded copy.
                with Image.open(os.path.join(base_dir, rel_path)) as raw:
                    images.append(raw.convert("RGB"))
            except Exception as e:
                # Deliberate best-effort: keep going with a placeholder image.
                print(f"Error loading {rel_path}: {e}")
                # Use a blank image as fallback
                images.append(Image.new('RGB', (224, 224)))

        inputs = processor(images=images, return_tensors="pt", padding=True).to(DEVICE)
        with torch.no_grad():
            emb = model.get_image_features(**inputs)
            # Scale every row to unit L2 norm.
            emb = emb / emb.norm(dim=-1, keepdim=True)
        feats.append(emb.cpu().numpy())
    return np.vstack(feats)

# ===========================
# LOAD CSVs
# ===========================
print("Loading data...")
# Read train.csv first, then test.csv, both located directly under BASE.
dftr, dfts = (
    pd.read_csv(os.path.join(BASE, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

# Expected cols: sample_id, image_path, target_name, target, ...
train_image_paths, test_image_paths = (
    frame["image_path"].tolist() for frame in (dftr, dfts)
)

print(f"Training samples: {len(train_image_paths)}")
print(f"Test samples: {len(test_image_paths)}")

# ===========================
# EXTRACT CLIP EMBEDS
# ===========================
# Image paths from the CSVs are resolved relative to BASE.
print("\nExtracting training embeddings...")
train_embeds = get_clip_embeds(train_image_paths, base_dir=BASE)

print("\nExtracting test embeddings...")
test_embeds = get_clip_embeds(test_image_paths, base_dir=BASE)

# ===========================
# ENCODE target_name
# ===========================
print("\nEncoding target names...")
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both frames.
le = LabelEncoder()
le.fit(dftr["target_name"])
dftr["target_name_enc"] = le.transform(dftr["target_name"])
dfts["target_name_enc"] = le.transform(dfts["target_name"])

# ===========================
# BUILD TRAIN / TEST MATRICES
# ===========================
print("\nBuilding feature matrices...")
# Feature layout: all CLIP embedding columns followed by one column holding
# the integer-encoded target name.
X_train = np.column_stack((train_embeds, dftr["target_name_enc"].values))
X_test = np.column_stack((test_embeds, dfts["target_name_enc"].values))
y_train = dftr["target"].values

# Save arrays
np.save('Xtr.npy', X_train)
np.save('ytr.npy', y_train)
np.save('Xts.npy', X_test)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# ===========================
# TRAIN CATBOOST REGRESSOR
# ===========================
print("\nTraining CatBoost model...")
# Hyper-parameters gathered in one place; training runs on the GPU whenever
# torch reports one, otherwise on the CPU.
# NOTE(review): no cat_features are passed, so the encoded target_name column
# is presumably treated as an ordinary numeric feature — confirm this is intended.
_cb_params = {
    "iterations": 10000,
    "learning_rate": 0.001,
    "depth": 8,
    "loss_function": "RMSE",
    "verbose": 200,  # log progress every 200 iterations
    "task_type": "GPU" if torch.cuda.is_available() else "CPU",
}
model_cb = CatBoostRegressor(**_cb_params)
model_cb.fit(X_train, y_train)

# ===========================
# PREDICT & SAVE SUBMISSION
# ===========================
print("\nGenerating predictions...")
preds = model_cb.predict(X_test)

# One row per test sample: its id paired with the predicted target value.
sub = pd.DataFrame({
    "sample_id": dfts["sample_id"],
    "target": preds
})
sub.to_csv("submission.csv", index=False)
# The original literal was mojibake (UTF-8 bytes of the check-mark emoji
# decoded as cp1252); the intended character is restored here.
print("✅ Saved submission.csv")
print(f"Predictions range: [{preds.min():.4f}, {preds.max():.4f}]")