# In [None]:
# =========================
# Setup & Imports
# =========================
from pathlib import Path
import os
import pandas as pd
import numpy as np
import torch
import torchvision
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from PIL import Image

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

# =========================
# Constants (edit here)
# =========================
CONFIG = {
    # --- Data locations ---
    "DATA_DIR": "/kaggle/input/csiro-biomass",  # root for the CSVs and for the relative image paths
    "TRAIN_CSV": "train.csv",                   # long-format training table, read relative to DATA_DIR
    "TEST_CSV": "test.csv",                     # long-format test table, read relative to DATA_DIR
    "OUT_DIR": "./",                            # where features, the PCA export and the submission are written

    # --- Backbone choice + local weights path (uploaded as a Kaggle Dataset) ---
    "BACKBONE": "resnet18",  # "resnet18" or "resnet50"
    "WEIGHTS_DIR": "/kaggle/input/resnet-weight/pytorch/default/1",  # <- your uploaded dataset path
    "RESNET18_WEIGHTS": "resnet18_imagenet1k_v1_state_dict.pth",
    "RESNET50_WEIGHTS": "resnet50_imagenet1k_v2_state_dict.pth",

    # --- Feature extraction ---
    "IMAGE_SIZE": 384,   # side length of the center crop fed to the backbone
    "BATCH_SIZE": 8,     # DataLoader batch size
    "NUM_WORKERS": 0,    # DataLoader worker processes

    # --- Modelling ---
    "VAL_SIZE": 0.2,          # fraction of training rows held out for the quick validation split
    "RANDOM_STATE": 42,       # seed shared by PCA, the split and XGBoost
    "USE_GPU_FOR_XGB": True,  # only takes effect when CUDA is actually available
    "EXPORT_PCA_2D": True,    # write a 2-D PCA projection of the train features to CSV
}

# Regression targets, in the column order used for the prediction matrix.
TARGETS = [
    'Dry_Green_g',
    'Dry_Dead_g',
    'Dry_Clover_g',
    'GDM_g',
    'Dry_Total_g',
]

# =========================
# Pivot long → wide helper
# =========================
def pivot_train_long_to_wide(train_long: pd.DataFrame, targets: list) -> pd.DataFrame:
    """Reshape the long training table into one row per image.

    The input has one row per (image, target_name) pair. The output has one
    row per image, the metadata columns used as the pivot index, and one
    column per target name. Rows missing any requested target are dropped.

    Args:
        train_long: Long-format frame with at least ``image_path``,
            ``Sampling_Date``, ``State``, ``Species``, ``Pre_GSHH_NDVI``,
            ``Height_Ave_cm``, ``target_name`` and ``target`` columns.
            The caller's frame is not modified.
        targets: Target names that must be present (and non-null) as columns
            in the result.

    Returns:
        The wide frame with a fresh 0..n-1 index.
    """
    print("[DEBUG] Raw train_long shape:", train_long.shape)

    # assign() works on a new frame, so the caller's table stays untouched.
    long_df = train_long.assign(
        image_id=train_long['image_path'].map(lambda p: Path(p).stem)
    )

    targets_per_image = long_df.groupby('image_id')['target_name'].nunique()
    print("[DEBUG] target_name nunique per image (value_counts):\n", targets_per_image.value_counts())

    meta_cols = [
        'image_id', 'image_path', 'Sampling_Date', 'State', 'Species',
        'Pre_GSHH_NDVI', 'Height_Ave_cm',
    ]
    wide = long_df.pivot_table(
        index=meta_cols,
        columns='target_name',
        values='target',
        aggfunc='first',  # if a (image, target) pair repeats, the first value wins
    )
    wide = wide.reset_index()

    # Flatten the header: plain strings are kept, tuple labels reduce to their second element.
    wide.columns = [col if isinstance(col, str) else col[1] for col in wide.columns]

    # Make sure every requested target exists as a column so dropna() below can see it.
    for name in targets:
        if name not in wide.columns:
            wide[name] = np.nan

    n_before = len(wide)
    wide = wide.dropna(subset=targets, how='any').reset_index(drop=True)
    n_dropped = n_before - len(wide)
    print(f"[DEBUG] After pivot shape: {wide.shape} (dropped {n_dropped} rows missing targets)")
    return wide

# =========================
# Dataset + Transforms
# =========================
class ImageTable(Dataset):
    """Dataset that serves images listed in a DataFrame's ``image_path`` column.

    Each item is a pair ``(image, image_path)``, where ``image`` is the RGB
    image after the optional transform and ``image_path`` is the relative
    path string taken from the table.
    """

    def __init__(self, df, root, transform=None):
        """Store the table (re-indexed 0..n-1), the image root and the transform.

        Args:
            df: Frame with an ``image_path`` column holding paths relative to ``root``.
            root: Directory the relative image paths are resolved against.
            transform: Optional callable applied to the loaded PIL image.
        """
        self.transform = transform
        self.root = Path(root)
        self.df = df.reset_index(drop=True)

    def __len__(self):
        """Return the number of rows in the table."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return ``(image, image_path)``."""
        rel_path = self.df.iloc[idx]['image_path']
        full_path = self.root / rel_path
        # Only a warning: Image.open below still raises for a missing file.
        if not full_path.exists():
            print(f"[WARN] Image not found: {full_path}")
        with Image.open(full_path) as handle:
            # convert() returns a new image object that is used after the file is closed.
            image = handle.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, str(rel_path)

def build_transform(size):
    """Build the preprocessing pipeline applied to every image.

    The image is resized to 1.15x ``size``, center-cropped to ``size`` x
    ``size``, converted to a tensor and normalized per channel.

    Args:
        size: Side length of the final square crop.

    Returns:
        A ``T.Compose`` of the four steps above.
    """
    # Per-channel statistics; these look like the usual ImageNet values,
    # in line with the imagenet1k weight files named in CONFIG.
    channel_mean = [0.485,0.456,0.406]
    channel_std = [0.229,0.224,0.225]

    # Resize slightly larger than the crop so the crop keeps some margin.
    resize_to = int(size*1.15)

    steps = [
        T.Resize(resize_to),
        T.CenterCrop(size),
        T.ToTensor(),
        T.Normalize(channel_mean, channel_std),
    ]
    return T.Compose(steps)

# =========================
# Offline weight loading
# =========================
def load_backbone_from_local(backbone:str, weights_dir:Path):
    """Build a ResNet feature extractor from a locally stored state dict.

    No weights are downloaded: the architecture is created with
    ``weights=None`` and the parameters are read from a file inside
    ``weights_dir`` whose name comes from ``CONFIG``.

    Args:
        backbone: ``"resnet18"`` or ``"resnet50"`` (case-insensitive).
        weights_dir: Directory holding the state-dict file; a ``str`` is
            accepted as well as a ``Path``.

    Returns:
        A ``torch.nn.Sequential`` of all ResNet children except the last one
        (the final FC layer).

    Raises:
        ValueError: If ``backbone`` is not one of the supported names.
        FileNotFoundError: If the expected weight file does not exist.
    """
    backbone = backbone.lower()
    weights_dir = Path(weights_dir)  # tolerate plain string paths
    if backbone == "resnet18":
        model = torchvision.models.resnet18(weights=None)
        weight_file = weights_dir / CONFIG["RESNET18_WEIGHTS"]
    elif backbone == "resnet50":
        model = torchvision.models.resnet50(weights=None)
        weight_file = weights_dir / CONFIG["RESNET50_WEIGHTS"]
    else:
        raise ValueError(f"Unsupported BACKBONE: {backbone}")

    if not weight_file.exists():
        raise FileNotFoundError(
            f"Weight file not found: {weight_file}\n"
            f"Make sure your Kaggle Dataset is added to the notebook and paths are correct."
        )

    print(f"[INFO] Loading local weights: {weight_file}")
    # NOTE(review): torch.load unpickles the file; only point this at weight
    # files you trust. Where the installed torch supports it, consider
    # passing weights_only=True.
    state = torch.load(weight_file, map_location="cpu")
    # Try strict first, then fall back (avoids minor version mismatches)
    try:
        model.load_state_dict(state, strict=True)
    except RuntimeError as e:
        print(f"[WARN] Strict load failed: {e}\nTrying strict=False...")
        incompatible = model.load_state_dict(state, strict=False)
        # Report what the non-strict load skipped, so a checkpoint that barely
        # matches (leaving the backbone mostly random) is visible to the user.
        missing = list(incompatible.missing_keys)
        unexpected = list(incompatible.unexpected_keys)
        if missing:
            print(f"[WARN] {len(missing)} model parameter(s) were NOT loaded: {missing}")
        if unexpected:
            print(f"[WARN] {len(unexpected)} checkpoint key(s) were ignored: {unexpected}")

    # Convert to feature-extractor (remove final FC)
    feat = torch.nn.Sequential(*list(model.children())[:-1])
    return feat

# =========================
# Feature Extraction
# =========================
@torch.no_grad()
def extract_features(model, loader, device):
    """Run ``model`` over every batch of ``loader`` and collect flat features.

    Args:
        model: Callable mapping an image batch tensor to an output tensor.
        loader: Iterable yielding ``(imgs, img_paths)`` batches, where ``imgs``
            is a tensor and ``img_paths`` is a sequence with one entry per image.
        device: Device the image batch is moved to before the forward pass.

    Returns:
        ``(feats, paths)``: a NumPy array with one row per image, and the list
        of image paths in the same order.

    Raises:
        ValueError: If the loader yields no batches.
    """
    feats = []
    paths = []
    for i, (imgs, img_paths) in enumerate(loader):
        print(f"[DEBUG] batch {i}, imgs shape {imgs.shape}")
        out = model(imgs.to(device))
        if out.ndim > 2:
            # flatten() also handles non-contiguous outputs, where .view() raises.
            out = torch.flatten(out, start_dim=1)
        feats.append(out.cpu().numpy())
        paths.extend(img_paths)
    if not feats:
        raise ValueError("[ERROR] No features extracted; check images/paths.")
    return np.concatenate(feats, axis=0), paths

# =========================
# Main Notebook Flow
# =========================

# --- I/O & dirs ---
DATA_DIR, OUT_DIR = Path(CONFIG['DATA_DIR']), Path(CONFIG['OUT_DIR'])
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Read + pivot train ---
# train.csv is long-format (one row per image/target); pivot to one row per image.
train_path = DATA_DIR / CONFIG['TRAIN_CSV']
print("[INFO] Reading:", train_path)
train_long = pd.read_csv(train_path)
train_wide = pivot_train_long_to_wide(train_long, TARGETS)
print("[INFO] Train images count:", len(train_wide))

# --- Backbone & transforms (OFFLINE) ---
transform = build_transform(CONFIG['IMAGE_SIZE'])

weights_dir = Path(CONFIG["WEIGHTS_DIR"])
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# eval() and to() both return the module itself, so the calls can be chained.
feature_extractor = load_backbone_from_local(CONFIG["BACKBONE"], weights_dir).eval().to(device)
print("[INFO] Using device:", device)

# --- Train feature extraction ---
# shuffle=False keeps the feature rows in the same order as train_wide.
train_ds = ImageTable(train_wide, root=DATA_DIR, transform=transform)
train_loader = DataLoader(
    train_ds,
    batch_size=CONFIG['BATCH_SIZE'],
    shuffle=False,
    num_workers=CONFIG['NUM_WORKERS'],
)
feats_train, train_paths = extract_features(feature_extractor, train_loader, device)
features_train_file = OUT_DIR/'features_train.npy'
np.save(features_train_file, feats_train)
print("[INFO] feats_train:", feats_train.shape)  # (N, 512) for resnet18, (N, 2048) for resnet50

# --- Scale features ---
# The scaler is fitted on all training features and reused for the test features later.
scaler = StandardScaler(with_mean=True, with_std=True)
Xs = scaler.fit_transform(feats_train)

# Optional: PCA export for visualization
if CONFIG['EXPORT_PCA_2D']:
    pca = PCA(n_components=2, random_state=CONFIG['RANDOM_STATE'])
    X2 = pca.fit_transform(Xs)
    pca_csv = OUT_DIR/'pca_2d_train.csv'
    # Rows of X2 follow train_wide's order, so the paths line up positionally.
    pca_frame = pd.DataFrame({
        'x': X2[:,0],
        'y': X2[:,1],
        'image_path': train_wide['image_path']
    })
    pca_frame.to_csv(pca_csv, index=False)
    print("[INFO] Exported 2D PCA to:", pca_csv.resolve())

# --- Targets ---
# One column per entry of TARGETS, in that order.
y = train_wide[TARGETS].values

# --- Quick validation split ---
X_tr, X_va, y_tr, y_va = train_test_split(
    Xs,
    y,
    test_size=CONFIG['VAL_SIZE'],
    random_state=CONFIG['RANDOM_STATE'],
)

# --- XGBoost Regressor (multi-output) ---
import xgboost  # needed only here, to pick GPU parameters that match the installed version

use_gpu = (CONFIG['USE_GPU_FOR_XGB'] and device == 'cuda')

# XGBoost 2.0 replaced tree_method='gpu_hist' with tree_method='hist' plus
# device='cuda'. Choose the spelling the installed version understands.
xgb_major = int(xgboost.__version__.split('.')[0])
if use_gpu and xgb_major >= 2:
    tree_method = 'hist'
    xgb_device_kwargs = {'device': 'cuda'}
elif use_gpu:
    tree_method = 'gpu_hist'
    xgb_device_kwargs = {}
else:
    tree_method = 'hist'
    xgb_device_kwargs = {}

xgb_base = XGBRegressor(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective='reg:squarederror',
    tree_method=tree_method,
    n_jobs=-1,
    random_state=CONFIG['RANDOM_STATE'],
    **xgb_device_kwargs
)

# One independent XGBoost regressor is fitted per target column.
# NOTE(review): the outer n_jobs=-1 fits targets in parallel while each
# regressor also uses n_jobs=-1 (and possibly the same GPU) — confirm this
# does not oversubscribe the machine.
model = MultiOutputRegressor(xgb_base, n_jobs=-1)

print("[INFO] Fitting XGBoost on training split...")
model.fit(X_tr, y_tr)

# --- Validation metrics ---
pred_va = model.predict(X_va)
for j, t in enumerate(TARGETS):
    # The `squared=False` argument was removed from mean_squared_error in
    # scikit-learn 1.6; taking the square root works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_va[:, j], pred_va[:, j])))
    r2 = r2_score(y_va[:, j], pred_va[:, j])
    print(f"[VAL] {t}: RMSE={rmse:.4f}  R2={r2:.4f}")

# --- Retrain on ALL training data ---
# The validation split was only for reporting; the final model sees every row.
print("[INFO] Refitting XGBoost on ALL training data...")
model.fit(Xs, y)

# =========================
# Test Inference & Submission
# =========================
# test.csv is long-format: one row per (image, target_name) with its own sample_id.
test_long = pd.read_csv(DATA_DIR / CONFIG['TEST_CSV'])
test_images_unique = test_long.drop_duplicates('image_path')['image_path'].tolist()
test_df_images = pd.DataFrame({'image_path': test_images_unique})
print("[INFO] Unique test images:", len(test_df_images))

# shuffle=False keeps feature rows in the same order as test_images_unique.
test_ds = ImageTable(test_df_images, root=DATA_DIR, transform=transform)
test_loader = DataLoader(test_ds, batch_size=CONFIG['BATCH_SIZE'], shuffle=False,
                         num_workers=CONFIG['NUM_WORKERS'])

feats_test, test_paths = extract_features(feature_extractor, test_loader, device)
np.save(OUT_DIR/'features_test.npy', feats_test)
print("[INFO] feats_test:", feats_test.shape)

# Reuse the scaler fitted on the training features.
Xs_test = scaler.transform(feats_test)

print("[INFO] Predicting on test...")
test_pred = model.predict(Xs_test)
test_pred = np.clip(test_pred, 0.0, None)  # enforce non-negativity

# Map predictions back to long-form rows
img_to_idx = {img: i for i, img in enumerate(test_images_unique)}
target_to_idx = {name: j for j, name in enumerate(TARGETS)}  # O(1) column lookup per row
rows = []
miss = 0
for sample_id, img, tname in zip(test_long['sample_id'],
                                 test_long['image_path'],
                                 test_long['target_name']):
    if img not in img_to_idx:
        miss += 1
        continue
    if tname not in target_to_idx:
        # Same exception type TARGETS.index() raised, with a clearer message.
        raise ValueError(f"Unknown target_name in test.csv: {tname!r}")
    rows.append({
        'sample_id': sample_id,
        'target': float(test_pred[img_to_idx[img], target_to_idx[tname]]),
    })
if miss > 0:
    print(f"[WARN] {miss} rows in test.csv had image_path not found in dedup list.")

# Explicit columns keep the CSV header even if no rows were produced.
sub = pd.DataFrame(rows, columns=['sample_id', 'target'])
out_file = OUT_DIR / 'submission.csv'
sub.to_csv(out_file, index=False)
print("[OK] Wrote submission:", out_file.resolve())
print(sub.head())
