In [None]:
from IPython.display import Image, display

# Render the dl.png graphic from the attached Kaggle input dataset at 800 px width.
display(Image(width=800, filename='/kaggle/input/biomassds/dl.png'))

In [None]:
from IPython.display import Image, display

# Render the Summary.png graphic from the attached Kaggle input dataset at 800 px width.
display(Image(width=800, filename='/kaggle/input/summary12/Summary.png'))

In [None]:
# ============================================================
# CSIRO Image2Biomass — End-to-End Notebook (fixed & ready)
# - Uses /kaggle/input/csiro-biomass/
# - Handles long-format train.csv (sample_id, image_path, target_name, target)
# - Pivots to one-row-per-image with 5 target columns
# - Trains EfficientNet-B3 (torchvision weights API)
# - Weighted MSE Loss using competition weights
# - Outputs submission.csv in long format (sample_id,target)
# ============================================================

import os
import gc
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

# sklearn / torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights

# -------------------------
# Config
# -------------------------
# --- Paths -------------------------------------------------------------
# Everything lives under the competition input directory; image paths in
# the CSVs are relative to it (e.g. "train/ID....jpg").
DATA_DIR = "/kaggle/input/csiro-biomass"
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
TEST_CSV = os.path.join(DATA_DIR, "test.csv")
SAMPLE_SUB = os.path.join(DATA_DIR, "sample_submission.csv")
TRAIN_IMG_ROOT = DATA_DIR
TEST_IMG_ROOT = DATA_DIR

# --- Runtime / hyper-parameters ---------------------------------------
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
SEED = 42
BATCH_SIZE = 16
EPOCHS = 6           # adjust to fit runtime
IMG_SIZE = 384
LR = 1e-4
NUM_WORKERS = 2

# --- Targets -----------------------------------------------------------
# Per-target weights used both by the training loss and the validation
# score. Insertion order defines the model's output column order.
COMP_WEIGHTS = {
    "Dry_Green_g": 0.1,
    "Dry_Dead_g": 0.1,
    "Dry_Clover_g": 0.1,
    "GDM_g": 0.2,
    "Dry_Total_g": 0.5,
}
TARGET_NAMES = list(COMP_WEIGHTS)

# --- Reproducibility ---------------------------------------------------
torch.manual_seed(SEED)
np.random.seed(SEED)

# -------------------------
# Load CSVs
# -------------------------
train_long = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
sample_sub = pd.read_csv(SAMPLE_SUB)

print("Train (long) shape:", train_long.shape)
print("Test shape:", test_df.shape)
print("Sample sub shape:", sample_sub.shape)
display(train_long.head())

# -------------------------
# Pivot train (long -> wide)
# Each image should become a single row with 5 target columns.
# train_long columns include: sample_id, image_path, Sampling_Date, State,
# Species, Pre_GSHH_NDVI, Height_Ave_cm, target_name, target
# -------------------------
# The dataset already has image_path like "train/IDxxxx.jpg" — it is used
# directly as the image-level key, together with the per-image metadata.

# Keep metadata columns (we'll use Pre_GSHH_NDVI and Height_Ave_cm optionally)
meta_cols = ["image_path", "Sampling_Date", "State", "Species", "Pre_GSHH_NDVI", "Height_Ave_cm"]

# pivot: one row per distinct combination of meta_cols, one column per target_name
pivot = train_long.pivot_table(index=meta_cols, columns="target_name", values="target").reset_index()
# After pivot, columns are meta_cols + target names
print("Pivoted train shape (one row per image):", pivot.shape)
display(pivot.head())

# Some safety: ensure all TARGET_NAMES present
missing = [t for t in TARGET_NAMES if t not in pivot.columns]
if missing:
    raise RuntimeError(f"Missing target columns after pivot: {missing}")

# pivot_table groups on every column in meta_cols: rows with a missing value in
# any of them can be left out of the result, repeated (image, target) rows are
# averaged, and an image whose metadata differs between rows is split into
# several rows. None of that raises, so verify the one-row-per-image assumption.
n_source_images = train_long["image_path"].nunique()
if len(pivot) != n_source_images or not pivot["image_path"].is_unique:
    print(
        f"Warning: pivot has {len(pivot)} rows for {n_source_images} unique image_path values "
        "in train.csv; check the metadata columns for missing or inconsistent values."
    )

# A target absent for an image becomes NaN here and would turn the loss into NaN.
n_incomplete = int(pivot[TARGET_NAMES].isna().any(axis=1).sum())
if n_incomplete:
    print(f"Warning: {n_incomplete} pivoted rows have at least one missing target value.")

# Create an 'image_full_path' column that is absolute path (Kaggle)
def make_full_path(p):
    """Return `p` unchanged when it is absolute, otherwise resolve it under DATA_DIR."""
    return p if os.path.isabs(p) else os.path.join(DATA_DIR, p)

# Derive the on-disk location and a short id (file name without extension)
# for every pivoted row.
pivot["image_full_path"] = [make_full_path(rel_path) for rel_path in pivot["image_path"]]
pivot["image_id"] = [os.path.basename(rel_path).split(".")[0] for rel_path in pivot["image_path"]]

# Robustness: keep only rows whose image file is actually present.
exists_mask = pivot["image_full_path"].apply(os.path.exists)
if not exists_mask.all():
    missing_files = pivot.loc[~exists_mask, "image_full_path"].tolist()
    print(f"Warning: {len(missing_files)} missing image files (they will be dropped).")
    pivot = pivot.loc[exists_mask].reset_index(drop=True)

print("Final pivot shape:", pivot.shape)
display(pivot.head())

# -------------------------
# Train / Validation split
# The split is done on pivot rows (expected to be one row per image):
# 80% train / 20% validation, shuffled with the fixed SEED.
# -------------------------
train_imgs, val_imgs = (
    part.reset_index(drop=True)
    for part in train_test_split(pivot, test_size=0.20, random_state=SEED, shuffle=True)
)

print("Train images:", train_imgs.shape, "Val images:", val_imgs.shape)

# -------------------------
# Dataset & Transforms
# -------------------------
# Channel statistics used for normalisation in both pipelines
# (presumably chosen to match the pretrained backbone's preprocessing — confirm).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]


def _resize_step():
    """Fresh resize step: force every image to IMG_SIZE x IMG_SIZE."""
    return transforms.Resize((IMG_SIZE, IMG_SIZE))


def _tensor_steps():
    """Fresh tail shared by both pipelines: PIL image -> tensor -> normalised tensor."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]


# Training: resize, random flips and mild colour jitter, then tensor + normalise.
train_tfm = transforms.Compose(
    [
        _resize_step(),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.1, hue=0.02),
    ]
    + _tensor_steps()
)

# Validation / test: deterministic resize, tensor + normalise only.
val_tfm = transforms.Compose([_resize_step()] + _tensor_steps())

class BiomassImageDataset(Dataset):
    """
    Image dataset backed by a dataframe with one row per image.

    The dataframe must provide:
      - image_full_path: path of the image file to open
      - image_id: returned with the image when is_test=True
      - every column listed in target_cols when is_test=False
        (Dry_Green_g, Dry_Dead_g, Dry_Clover_g, GDM_g, Dry_Total_g in this notebook)

    Args:
        df: source dataframe; its index is reset so rows are addressed 0..len-1.
        target_cols: names of the target columns; None is treated as an empty list.
        transform: optional callable applied to the RGB PIL image.
        is_test: pass True for inference; items are then (image, image_id)
            instead of (image, float32 target tensor).
    """
    def __init__(self, df, target_cols=None, transform=None, is_test=False):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.is_test = is_test
        self.target_cols = target_cols if target_cols is not None else []

    def __len__(self):
        """Number of rows (images) in the backing dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load image `idx` as RGB, apply the transform, and pair it with its id or targets."""
        # Imported here under an explicit alias: other cells of this notebook run
        # `from IPython.display import Image`, which rebinds the global name `Image`
        # to a class without `.open()`.
        from PIL import Image as PILImage

        row = self.df.iloc[idx]
        img_path = row["image_full_path"]
        # convert() returns a separate in-memory image, so the file can be
        # closed right away instead of waiting for garbage collection.
        with PILImage.open(img_path) as handle:
            img = handle.convert("RGB")
        if self.transform:
            img = self.transform(img)
        if self.is_test:
            # return image and image_id
            return img, row["image_id"]
        targets = row[self.target_cols].values.astype(np.float32)
        return img, torch.tensor(targets, dtype=torch.float32)

# Instantiate datasets & loaders
# Datasets: augmentation for training, deterministic preprocessing for validation.
train_ds = BiomassImageDataset(df=train_imgs, target_cols=TARGET_NAMES, transform=train_tfm, is_test=False)
val_ds = BiomassImageDataset(df=val_imgs, target_cols=TARGET_NAMES, transform=val_tfm, is_test=False)

# Loader options shared by both splits; only the training loader shuffles.
_loader_opts = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_opts)

# -------------------------
# Model definition (EfficientNet-B3) using new weights API
# -------------------------
class BiomassModel(nn.Module):
    """
    EfficientNet-B3 feature extractor followed by a small regression MLP.

    Args:
        num_targets: number of regression outputs produced per image.
        pretrained_weights: when truthy, the backbone is created with
            EfficientNet_B3_Weights.DEFAULT; otherwise with weights=None.
    """
    def __init__(self, num_targets=len(TARGET_NAMES), pretrained_weights=True):
        super().__init__()
        self.backbone = efficientnet_b3(
            weights=EfficientNet_B3_Weights.DEFAULT if pretrained_weights else None
        )
        # The stock classifier is (Dropout, Linear); read the feature width from
        # its Linear layer, then replace the classifier so the backbone emits
        # the pooled features directly.
        feature_dim = self.backbone.classifier[1].in_features
        self.backbone.classifier = nn.Identity()
        # Regression head: feature_dim -> 512 -> num_targets.
        head_layers = [
            nn.Linear(feature_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_targets),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map a batch of images to a (batch, num_targets) prediction tensor."""
        return self.head(self.backbone(x))

model = BiomassModel(num_targets=len(TARGET_NAMES))
model = model.to(DEVICE)

# -------------------------
# Loss - Weighted MSE using competition weights
# -------------------------
# One weight per output column, in TARGET_NAMES order, placed on the training device.
weights_tensor = torch.tensor(
    [COMP_WEIGHTS[name] for name in TARGET_NAMES],
    dtype=torch.float32,
    device=DEVICE,
)
# Element-wise squared error; the reduction is done by weighted_mse_loss.
mse_loss = nn.MSELoss(reduction="none")

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# Gradient scaler for mixed precision; None disables the AMP path in training.
if torch.cuda.is_available():
    scaler = torch.cuda.amp.GradScaler()
else:
    scaler = None

# -------------------------
# Training & Validation functions
# -------------------------
def weighted_mse_loss(preds, targets, weights):
    """
    Weighted sum of per-target mean squared errors.

    preds and targets are (batch, n_targets); weights is (n_targets,).
    The squared error is averaged over the batch for each target, scaled by
    that target's weight, and the results are summed into a scalar tensor.
    """
    per_target_mse = mse_loss(preds, targets).mean(dim=0)  # (n_targets,)
    return (per_target_mse * weights).sum()

def train_one_epoch(model, loader, optimizer, scaler=None):
    """
    Run one optimisation pass over `loader` and return the sample-weighted mean loss.

    When `scaler` is truthy the forward pass runs under CUDA autocast and the
    backward/step go through the gradient scaler; otherwise a plain
    full-precision step is taken. Uses the notebook-level DEVICE and
    weights_tensor.
    """
    model.train()
    loss_sum = 0.0
    samples_seen = 0
    for batch_imgs, batch_targets in tqdm(loader, desc="Train", leave=False):
        batch_imgs = batch_imgs.to(DEVICE, non_blocking=True)
        batch_targets = batch_targets.to(DEVICE, non_blocking=True)
        optimizer.zero_grad()
        if not scaler:
            # Full-precision path.
            batch_preds = model(batch_imgs)
            loss = weighted_mse_loss(batch_preds, batch_targets, weights_tensor)
            loss.backward()
            optimizer.step()
        else:
            # Mixed-precision path: scale the loss before backward, then let the
            # scaler perform the optimizer step and adjust its scale factor.
            with torch.cuda.amp.autocast():
                batch_preds = model(batch_imgs)
                loss = weighted_mse_loss(batch_preds, batch_targets, weights_tensor)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        batch_len = batch_imgs.size(0)
        # Weight each batch loss by its size so a short final batch is not over-counted.
        loss_sum += loss.item() * batch_len
        samples_seen += batch_len
    return loss_sum / samples_seen

def validate(model, loader):
    """
    Predict over `loader` without gradients and score each target with R2.

    Returns (per_target_r2, weighted_r2, preds, trues): a list with one R2 per
    entry of TARGET_NAMES, their COMP_WEIGHTS-weighted sum, and the stacked
    prediction / ground-truth arrays. A target whose R2 computation raises is
    scored NaN, which also makes the weighted sum NaN.
    """
    model.eval()
    pred_chunks, true_chunks = [], []
    with torch.no_grad():
        for imgs, targets in tqdm(loader, desc="Val", leave=False):
            outputs = model(imgs.to(DEVICE, non_blocking=True))
            pred_chunks.append(outputs.cpu().numpy())
            true_chunks.append(targets.numpy())
    preds = np.vstack(pred_chunks)
    trues = np.vstack(true_chunks)

    def _r2_or_nan(col):
        # Score one output column; any failure is reported as NaN.
        try:
            return r2_score(trues[:, col], preds[:, col])
        except Exception:
            return float("nan")

    per_target_r2 = [_r2_or_nan(col) for col in range(len(TARGET_NAMES))]
    # weighted R2
    weighted_r2 = sum(
        score * COMP_WEIGHTS[name] for score, name in zip(per_target_r2, TARGET_NAMES)
    )
    return per_target_r2, weighted_r2, preds, trues

# -------------------------
# Training loop
# -------------------------
best_w_r2 = -1e9
save_path = "best_model.pth"
# Tracks whether THIS run has written save_path. Inference reloads that file,
# so it must exist and must not be a leftover from an earlier run.
checkpoint_saved = False

for epoch in range(1, EPOCHS + 1):
    train_loss = train_one_epoch(model, train_loader, optimizer, scaler=scaler)
    per_r2, w_r2, _, _ = validate(model, val_loader)
    print(f"Epoch {epoch}/{EPOCHS} | Train Loss: {train_loss:.6f} | Weighted R2: {w_r2:.6f}")
    print("Per-target R2:", dict(zip(TARGET_NAMES, [round(x,4) if not np.isnan(x) else x for x in per_r2])))
    # save best
    # A NaN score (validate() reports NaN when a target's R2 cannot be computed)
    # never compares greater than anything, so it must be handled explicitly.
    improved = (not np.isnan(w_r2)) and w_r2 > best_w_r2
    if improved:
        best_w_r2 = w_r2
        torch.save(model.state_dict(), save_path)
        checkpoint_saved = True
        print("  -> New best model saved.")
    elif not checkpoint_saved:
        # No usable score yet: still write the current weights so the inference
        # step has a checkpoint from this run. Any later epoch with a valid,
        # better score overwrites it.
        torch.save(model.state_dict(), save_path)
        checkpoint_saved = True
        print("  -> Weighted R2 not usable for model selection; saved current weights as fallback.")
    # free memory
    gc.collect()

print("Training finished. Best weighted R2:", best_w_r2)

# -------------------------
# Inference on test set
# Build test dataframe: test.csv includes sample_id and image_path for each target row in long format.
# We need to predict per image and output one row per (image, target) pair in long format.
# -------------------------
# Prepare test image-level dataframe:
# Re-read the long-format test file (one row per image/target pair).
test_long = pd.read_csv(TEST_CSV)  # long-format similar to train

# Collapse to one row per image and attach the absolute path and the short id
# (file name without extension).
test_imgs = test_long[["image_path"]].drop_duplicates().reset_index(drop=True)
test_imgs["image_full_path"] = [make_full_path(rel_path) for rel_path in test_imgs["image_path"]]
test_imgs["image_id"] = [os.path.basename(rel_path).split(".")[0] for rel_path in test_imgs["image_path"]]

# Skip images that are not on disk.
exists_mask = test_imgs["image_full_path"].apply(os.path.exists)
if not exists_mask.all():
    missing_files = test_imgs.loc[~exists_mask, "image_full_path"].tolist()
    print(f"Warning: {len(missing_files)} missing test images (they will be skipped):", missing_files)
    test_imgs = test_imgs.loc[exists_mask].reset_index(drop=True)

print("Test image count:", len(test_imgs))
display(test_imgs.head())

# Test dataset / loader: deterministic preprocessing, original order.
test_ds = BiomassImageDataset(df=test_imgs, target_cols=None, transform=val_tfm, is_test=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

# Restore the checkpoint written by the training loop and switch to eval mode.
best_state = torch.load(save_path, map_location=DEVICE)
model.load_state_dict(best_state)
model.to(DEVICE)
model.eval()

# image_id -> prediction vector (one value per entry of TARGET_NAMES)
preds_by_image = {}
with torch.no_grad():
    for imgs, image_ids in tqdm(test_loader, desc="Test Predict"):
        batch_out = model(imgs.to(DEVICE)).cpu().numpy()
        preds_by_image.update(zip(image_ids, batch_out))

# Build submission (long format: sample_id,target)
# Every row of test_long names one (image, target) pair; look up the matching
# component of that image's prediction vector.
rows = []
n_zero_fallback = 0
for sample_id, image_path, tname in zip(
    test_long["sample_id"], test_long["image_path"], test_long["target_name"]
):
    # sample_id looks like ID1001187975__Dry_Green_g; predictions are keyed by
    # the image file name without extension.
    img_id = os.path.basename(image_path).split(".")[0]
    image_preds = preds_by_image.get(img_id)
    if image_preds is None:
        # fallback if image missing: predict zero
        pred_val = 0.0
        n_zero_fallback += 1
    else:
        pred_val = float(image_preds[TARGET_NAMES.index(tname)])
    rows.append({"sample_id": sample_id, "target": pred_val})

if n_zero_fallback:
    print(f"Warning: {n_zero_fallback} submission rows had no image prediction and were set to 0.0.")

# Explicit columns keep the frame well-formed even when test_long has no rows.
sub_df = pd.DataFrame(rows, columns=["sample_id", "target"])
# Ensure correct ordering as sample_submission
sub_df = sub_df.set_index("sample_id").reindex(sample_sub["sample_id"]).reset_index()

# reindex() inserts NaN for any sample_id of sample_submission that was not
# produced above, and the model itself can emit NaN. Apply the same zero
# fallback so submission.csv never contains empty targets.
unfilled = sub_df["target"].isna()
if unfilled.any():
    print(f"Warning: {int(unfilled.sum())} submission rows had a missing or NaN target; filled with 0.0.")
    sub_df.loc[unfilled, "target"] = 0.0

sub_df.to_csv("submission.csv", index=False)
print("submission.csv written, shape:", sub_df.shape)
display(sub_df.head(10))


In [None]:
from IPython.display import Image, display

# Render the dv.png graphic from the attached Kaggle input dataset at 800 px width.
display(Image(width=800, filename='/kaggle/input/visualization/dv.png'))


In [None]:
 # ===============================
# 📊 CSIRO Biomass Data Visualization (Full & Fixed)
# ===============================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import random
from IPython.display import display

# -------------------------
# 0️⃣ Load data
# -------------------------
# Trailing slash is required: file names are appended by plain string formatting.
DATA_PATH = "/kaggle/input/csiro-biomass/"

train = pd.read_csv(f"{DATA_PATH}train.csv")
test = pd.read_csv(f"{DATA_PATH}test.csv")

# The status message previously printed a mis-encoded byte sequence instead of
# the check-mark emoji.
print("✅ Data Loaded Successfully!")
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}\n")

# -------------------------
# 1️⃣ Dataset overview
# -------------------------
print("Unique target types:", train["target_name"].unique())
print("Unique species:", train["Species"].unique())
print(f"Date range: {train['Sampling_Date'].min()} to {train['Sampling_Date'].max()}\n")

# -------------------------
# 2️⃣ Basic statistics
# -------------------------
# Summary statistics of the numeric columns of the long-format training table.
display(train.describe())

# -------------------------
# 3️⃣ Count of samples per target
# -------------------------
sns.set(style="whitegrid", palette="muted", font_scale=1.1)

# Bars are ordered by frequency (value_counts sorts descending).
target_order = train["target_name"].value_counts().index

plt.figure(figsize=(8, 4))
count_ax = sns.countplot(data=train, x="target_name", order=target_order)
count_ax.set_title("Count of Samples per Target", fontsize=14)
count_ax.set_xlabel("Target Type")
count_ax.set_ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# -------------------------
# 4️⃣ Average target value per species (grouped by target_name)
# -------------------------
plt.figure(figsize=(10, 6))
# barplot aggregates "target" per (Species, target_name); error bars are disabled.
species_ax = sns.barplot(data=train, x="Species", y="target", hue="target_name", errorbar=None)
species_ax.set_title("Average Target Value per Species", fontsize=14)
plt.xticks(rotation=90)
species_ax.set_ylabel("Mean Target")
plt.tight_layout()
plt.show()

# -------------------------
# 5️⃣ Relationship between NDVI, Height, and Target
# -------------------------
# One scatter plot per predictor column, coloured by target type.
for x_column, plot_title in (
    ("Pre_GSHH_NDVI", "NDVI vs Target by Target Type"),
    ("Height_Ave_cm", "Height vs Target by Target Type"),
):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=train, x=x_column, y="target", hue="target_name", alpha=0.7)
    plt.title(plot_title, fontsize=14)
    plt.tight_layout()
    plt.show()

# -------------------------
# 6️⃣ Correlation heatmap (per target type)
# -------------------------
# For every target type, correlate NDVI and height with the target value.
corr_list = []
for tname, group in train.groupby("target_name"):
    # Only the "target" column of the 3x3 correlation matrix is needed.
    corr_with_target = group[["Pre_GSHH_NDVI", "Height_Ave_cm", "target"]].corr()["target"]
    corr_list.append(
        {
            "target_name": tname,
            "Corr(NDVI, target)": corr_with_target["Pre_GSHH_NDVI"],
            "Corr(Height, target)": corr_with_target["Height_Ave_cm"],
        }
    )

corr_df = pd.DataFrame(corr_list).set_index("target_name")
display(corr_df)

plt.figure(figsize=(6, 4))
# center=0 puts the colour map's midpoint at zero correlation.
sns.heatmap(corr_df, annot=True, cmap="coolwarm", center=0)
plt.title("Correlation of NDVI & Height with Targets", fontsize=14)
plt.tight_layout()
plt.show()

# -------------------------
# 7️⃣ Show random training images
# -------------------------
# A dedicated, seeded generator makes the selection identical on every run
# (Restart & Run All reproduces the same figure) without touching the global
# `random` state.
SAMPLE_IMAGE_SEED = 42
sample_images = train["image_path"].unique()
image_picker = random.Random(SAMPLE_IMAGE_SEED)
chosen_images = image_picker.sample(list(sample_images), min(9, len(sample_images)))

plt.figure(figsize=(12, 8))
for i, img_path in enumerate(chosen_images):
    plt.subplot(3, 3, i+1)
    sid = img_path.split("/")[-1].split(".")[0]
    # cv2.imread returns None (no exception) when the file cannot be read.
    img = cv2.imread(f"{DATA_PATH}{img_path}")
    if img is not None:
        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        species = train.loc[train["image_path"] == img_path, "Species"].values[0]
        plt.title(f"{sid}\n{species}", fontsize=8)
    else:
        # Label the empty panel so a blank tile is not mistaken for a rendering issue.
        plt.title(f"{sid}\n(image not readable)", fontsize=8)
    plt.axis("off")
plt.suptitle("Random Sample Training Images", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
from IPython.display import Image, display

# Render the Conclusion.png graphic from the attached Kaggle input dataset at 800 px width.
display(Image(width=800, filename='/kaggle/input/conclusion344/Conclusion.png'))