In [4]:
import os

# Root folder of the filtered dataset; every later cell builds its paths from it.
# Set the CV_DATASET_DIR environment variable to run the notebook on another machine
# without editing this cell; when it is unset or empty the original location is used.
base_dir = os.environ.get("CV_DATASET_DIR") or "/Users/bin/Desktop/CV_Assignment/Dataset_filtered"

# Crop Operation

In [5]:
import os
import random
from PIL import Image
from torchvision import transforms

# Random-crop augmentation of the training split.
# Reads train/{color,label}, and writes every image/mask pair to train_crop/{color,label}:
# a random subset is cropped to `crop_size` (and renamed with a "_crop" suffix), the rest
# is written uncropped under its original name.
input_image_dir = os.path.join(base_dir, "train", "color")
input_mask_dir = os.path.join(base_dir, "train", "label")

output_image_dir = os.path.join(base_dir, "train_crop", "color")
output_mask_dir = os.path.join(base_dir, "train_crop", "label")

os.makedirs(output_image_dir, exist_ok=True)
os.makedirs(output_mask_dir, exist_ok=True)

# Fixed crop parameters
crop_size = (324, 324)  # (height, width) of the cropped patch
crop_prob = 0.35  # each image is cropped with probability 0.35
crop_seed = 42  # fixed seed so Restart & Run All reproduces the same train_crop contents

crop_h, crop_w = crop_size

# A private, seeded generator drives both the crop decision and the crop position, so the
# result does not depend on the state of the global `random` / torch generators.
crop_rng = random.Random(crop_seed)

# Match the extension case-insensitively, like the resize step does.
image_filenames = sorted(f for f in os.listdir(input_image_dir) if f.lower().endswith(".jpg"))

for img_name in image_filenames:
    # Derive the mask name from the stem: str.replace would also rewrite a ".jpg"
    # that happens to occur in the middle of the file name.
    stem = os.path.splitext(img_name)[0]
    mask_name = stem + ".png"

    img_path = os.path.join(input_image_dir, img_name)
    mask_path = os.path.join(input_mask_dir, mask_name)

    if not os.path.isfile(mask_path):
        # An image without a mask is useless for training; skip it instead of crashing.
        print("Skipping (mask not found):", mask_path)
        continue

    # convert() returns a fully loaded copy, so the file handles can be closed right away.
    with Image.open(img_path) as raw_image, Image.open(mask_path) as raw_mask:
        image = raw_image.convert("RGB")
        mask = raw_mask.convert("L")

    img_width, img_height = image.size

    # Draw the random number for every image (even ones too small to crop) so the
    # sequence of decisions only depends on the sorted file list and the seed.
    do_crop = crop_rng.random() < crop_prob
    use_crop = do_crop and img_width >= crop_w and img_height >= crop_h

    if use_crop:
        # Uniformly pick the top-left corner of the crop window; the same window is
        # applied to image and mask so they stay pixel-aligned.
        top = crop_rng.randint(0, img_height - crop_h)
        left = crop_rng.randint(0, img_width - crop_w)
        image = transforms.functional.crop(image, top, left, crop_h, crop_w)
        mask = transforms.functional.crop(mask, top, left, crop_h, crop_w)
        # Mark the file names as the cropped variant
        new_img_name = stem + "_crop.jpg"
        new_mask_name = stem + "_crop.png"
        stale_img_name, stale_mask_name = img_name, mask_name
    else:
        new_img_name = img_name
        new_mask_name = mask_name
        stale_img_name, stale_mask_name = stem + "_crop.jpg", stem + "_crop.png"

    # A previous run may have written the other variant of this sample; drop it so the
    # output folder never contains both "X.jpg" and "X_crop.jpg" for the same source.
    for out_dir, stale_name in ((output_image_dir, stale_img_name), (output_mask_dir, stale_mask_name)):
        stale_path = os.path.join(out_dir, stale_name)
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    out_img_path = os.path.join(output_image_dir, new_img_name)
    out_mask_path = os.path.join(output_mask_dir, new_mask_name)

    image.save(out_img_path, format="JPEG")
    mask.save(out_mask_path, format="PNG")

print("✅ 所有图像和掩码已处理完成并保存到 train_crop 文件夹。")

✅ 所有图像和掩码已处理完成并保存到 train_crop 文件夹。


# Resize images

initialize一个dictionary，对每个image resize之前，记录它对应的mask的height和width并写入字典，最后将字典写入一个叫original_sizes.json的file里

In [6]:
import os
import cv2
import shutil
import json

# Set target dimensions (adjust as needed)
# Output resolution of every resized color image; the tuple is handed to cv2.resize,
# which takes its size as (width, height).
TARGET_WIDTH = 256
TARGET_HEIGHT = 256
target_size = (TARGET_WIDTH, TARGET_HEIGHT)


# Destination folders. In both splits the color images are resized while the
# label masks are copied over unchanged.
resized_train_color_dir = os.path.join(base_dir, "train_resized", "color")
resized_train_label_dir = os.path.join(base_dir, "train_resized", "label")

resized_val_color_dir = os.path.join(base_dir, "val_resized", "color")
resized_val_label_dir = os.path.join(base_dir, "val_resized", "label")

for out_dir in (
    resized_train_color_dir,
    resized_train_label_dir,
    resized_val_color_dir,
    resized_val_label_dir,
):
    os.makedirs(out_dir, exist_ok=True)

# Maps a file stem to the [height, width] of its mask; filled in by process_data.
original_sizes = dict()

def resize_and_save(src_path, dst_path, target_size, interpolation=None):
    """
    Resize one image with OpenCV and write the result to disk.

    Args:
        src_path: Path of the image to read (loaded as a 3-channel color image).
        dst_path: Path the resized image is written to; the format follows the extension.
        target_size: Output size passed to cv2.resize, i.e. (width, height).
        interpolation: OpenCV interpolation flag; defaults to cv2.INTER_LINEAR.

    Returns:
        True when the resized image was written, False when the source could not be
        read or the destination could not be written (a message is printed in both
        failure cases).
    """
    if interpolation is None:
        # Resolved here instead of in the signature so the default stays bilinear
        # without evaluating cv2 at definition time.
        interpolation = cv2.INTER_LINEAR

    img = cv2.imread(src_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print("Error reading:", src_path)
        return False

    resized = cv2.resize(img, target_size, interpolation=interpolation)

    # cv2.imwrite reports failure through its return value, not an exception;
    # ignoring it would silently leave a hole in the resized dataset.
    if not cv2.imwrite(dst_path, resized):
        print("Error writing:", dst_path)
        return False
    return True

def process_data(color_source, label_source, resized_color_dest, resized_label_dest, original_sizes_dict, size=None):
    """
    Process one dataset split.

    - Every ``.jpg`` in ``color_source`` is resized and written to ``resized_color_dest``.
    - Every ``.png`` mask in ``label_source`` is copied unchanged to ``resized_label_dest``
      and its size is recorded in ``original_sizes_dict``.

    Args:
        color_source: Folder holding the color images.
        label_source: Folder holding the label masks.
        resized_color_dest: Existing folder that receives the resized color images.
        resized_label_dest: Existing folder that receives the copied masks.
        original_sizes_dict: Dict updated in place; maps the mask file name without
            extension (e.g. "Abyssinian_1") to ``[height, width]``.
        size: Output size handed to ``resize_and_save``. Defaults to the module-level
            ``target_size`` so existing calls keep their behavior.
    """
    if size is None:
        size = target_size

    # Resize color images
    for filename in sorted(os.listdir(color_source)):
        if filename.lower().endswith(".jpg"):
            src_path = os.path.join(color_source, filename)
            dst_path = os.path.join(resized_color_dest, filename)
            resize_and_save(src_path, dst_path, size)

    # Copy masks and record their original sizes
    for filename in sorted(os.listdir(label_source)):
        if not filename.lower().endswith(".png"):
            continue

        src_path = os.path.join(label_source, filename)
        dst_path = os.path.join(resized_label_dest, filename)

        # The mask is only decoded to learn its dimensions; the file itself is copied as-is.
        mask = cv2.imread(src_path, cv2.IMREAD_GRAYSCALE)
        if mask is None:
            print("Error reading mask:", src_path)
            continue

        # Key without extension: "Abyssinian_1.png" -> "Abyssinian_1"
        img_key = os.path.splitext(filename)[0]
        mask_size = [int(dim) for dim in mask.shape[:2]]  # plain ints -> JSON serializable [H, W]

        # The same dict may be shared by several splits; make a clash visible instead of
        # silently replacing an entry with a different size.
        previous_size = original_sizes_dict.get(img_key)
        if previous_size is not None and list(previous_size) != mask_size:
            print("Warning: size of", img_key, "overwritten:", previous_size, "->", mask_size)
        original_sizes_dict[img_key] = mask_size

        # Copy mask as-is
        shutil.copy2(src_path, dst_path)

# -----------------------------------------
# Process Training Data
# -----------------------------------------
# Training split: the inputs are the (partly cropped) files in train_crop.
train_color_source = os.path.join(base_dir, "train_crop", "color")
train_label_source = os.path.join(base_dir, "train_crop", "label")

process_data(
    color_source=train_color_source,
    label_source=train_label_source,
    resized_color_dest=resized_train_color_dir,
    resized_label_dest=resized_train_label_dir,
    original_sizes_dict=original_sizes,
)

# Validation split: read straight from val. Its mask sizes go into the same
# dictionary as the training split.
val_dir = os.path.join(base_dir, "val")
val_color_source = os.path.join(val_dir, "color")
val_label_source = os.path.join(val_dir, "label")

process_data(
    color_source=val_color_source,
    label_source=val_label_source,
    resized_color_dest=resized_val_color_dir,
    resized_label_dest=resized_val_label_dir,
    original_sizes_dict=original_sizes,
)

# Persist the recorded mask sizes (train + val) as JSON next to the dataset folders.
original_size_json_path = os.path.join(base_dir, "original_sizes.json")
with open(original_size_json_path, "w") as f:
    f.write(json.dumps(original_sizes, indent=4))

print(f"✅ Processing complete. Original sizes saved to {original_size_json_path}.")

Corrupt JPEG data: premature end of data segment


✅ Processing complete. Original sizes saved to /Users/bin/Desktop/CV_Assignment/Dataset_filtered/original_sizes.json.


将images和masks都写入DataLoader里，但同时也要记录每张image的name以便后面查询mask的original size
除此以外，对于mask tensor化的操作还要更改。我们目前有5个class:[cat, dog, background, boundary, unknown] 分别对应class 0,1,2,3,4,我们通过默认的transforms.ToTensor()使得mask的每个tensor的每个像素都变为该像素的值后，我们还要进一步操作:对于每个tensor值，如果是0，就把该值变为2（意为background),如果是1，就把这个值变为3（意为boundary),如果是介于0-1之间的值，我们看这个mask的filename，如果filename的第一个字母是大写，将该值变为0(意为猫),反之如果是小写，就变成1（意为狗),如果是图片变小了，有些区域不属于原始图片，我们标为4（意为unknown)

Treat the boundary as its own class for now. We currently have 5 classes: [cat, dog, background, boundary, unknown], i.e. class ids 0–4 in that order.

# Resize the Test Set

In [None]:
# Test split: resize the color images, copy the masks unchanged and store the
# mask sizes in a separate JSON file (original_sizes_test.json).

# Where the test data lives and where the processed copies go
test_dir = os.path.join(base_dir, "Test")
resized_test_color_dir = os.path.join(base_dir, "test_resized", "color")
resized_test_label_dir = os.path.join(base_dir, "test_resized", "label")

for test_out_dir in (resized_test_color_dir, resized_test_label_dir):
    os.makedirs(test_out_dir, exist_ok=True)

# The test sizes get their own dictionary, separate from the train/val one
original_sizes_test = dict()

test_color_source = os.path.join(test_dir, "color")
test_label_source = os.path.join(test_dir, "label")

process_data(
    test_color_source,
    test_label_source,
    resized_test_color_dir,
    resized_test_label_dir,
    original_sizes_test,
)

# Write the collected test mask sizes to disk
test_size_json_path = os.path.join(base_dir, "original_sizes_test.json")
with open(test_size_json_path, "w") as f:
    f.write(json.dumps(original_sizes_test, indent=4))

print(f"✅ Test data processed. Test mask sizes saved to {test_size_json_path}")

# Evaluate on Test Set

In [None]:
import torch
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader
from torchmetrics import JaccardIndex, Dice
import os, json
from tqdm import tqdm
from unet import UNet
from custom_dataset import CustomDataset
from enhanced_unet import EnhancedUNet

# Evaluate a trained model on the test split and report per-class IoU and Dice
# over all four classes.

# === Config ===
NUM_CLASSES = 4
CLASS_NAMES = ["Cat", "Dog", "Background", "Boundary"]

# NOTE(review): the checkpoint name says "unet ... baseline", yet the weights are loaded
# into EnhancedUNet below (UNet is imported by this cell but never used) — confirm which
# architecture this checkpoint belongs to.
MODEL_PATH = '/Users/bin/Desktop/CV_Assignment/Model/best_unet_100_epochs_baseline.pth'
original_sizes_path = os.path.join(base_dir, "original_sizes_test.json")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load model ===
model = EnhancedUNet(in_channels=3, out_channels=NUM_CLASSES).to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

# === Load original sizes ===
# Maps an image name to the [height, width] of its mask, as written by the resize step.
with open(original_sizes_path, "r") as f:
    original_sizes = json.load(f)

# === Load test dataset ===
test_dataset = CustomDataset(
    image_dir=os.path.join(base_dir, "test_resized", "color"),
    mask_dir=os.path.join(base_dir, "test_resized", "label"),
    transform=transforms.ToTensor()
)
# The loop below handles exactly one sample per step: it looks up the original size of
# img_name[0] only and resizes the whole batch to that single size. A larger batch would
# evaluate every other sample at the wrong resolution, so the batch size must stay 1.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# === Metrics ===
# average="none" keeps one score per class instead of a single averaged number.
iou_metric = JaccardIndex(task="multiclass", num_classes=NUM_CLASSES, average="none").to(device)
dice_metric = Dice(num_classes=NUM_CLASSES, average="none").to(device)

# === Evaluation loop ===
with torch.no_grad():
    for image, mask, img_name, _ in tqdm(test_loader):
        image = image.to(device)
        # Drop the batch dimension; assumes the dataset yields an (H, W) mask — TODO confirm
        mask = mask.squeeze(0).to(device).long()

        # Forward pass
        logits = model(image)  # (1, NUM_CLASSES, h, w) at the network's input resolution

        # Resize the logits (not the hard labels) to the original size, then take the argmax
        orig_h, orig_w = original_sizes[img_name[0]]
        logits_resized = F.interpolate(logits, size=(orig_h, orig_w), mode="bilinear", align_corners=False)
        pred_mask = torch.argmax(logits_resized, dim=1).long()  # (1, orig_h, orig_w)

        # Bring the ground truth to the same size; nearest-neighbour keeps the labels integral
        gt_resized = F.interpolate(
            mask.unsqueeze(0).unsqueeze(0).float(), size=(orig_h, orig_w), mode="nearest"
        ).squeeze(0).long()

        # Accumulate the statistics; both tensors already live on `device`
        iou_metric.update(pred_mask, gt_resized)
        dice_metric.update(pred_mask, gt_resized)

# === Compute final results ===
iou_scores = iou_metric.compute()
dice_scores = dice_metric.compute()

# === Print Results ===
print("\n📊 Per-Class Evaluation on Test Set:")
for i in range(NUM_CLASSES):
    print(f"Class {i} ({CLASS_NAMES[i]}): IoU = {iou_scores[i]:.4f}, Dice = {dice_scores[i]:.4f}")

mean_iou = iou_scores.mean()
mean_dice = dice_scores.mean()
print(f"\n➡️ Mean IoU of all classes: {mean_iou:.4f}")
print(f"➡️ Mean Dice of all classes: {mean_dice:.4f}")


In [None]:
import torch
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader
from torchmetrics import JaccardIndex#, Dice
import os, json
from tqdm import tqdm
from unet import UNet
from custom_dataset import CustomDataset
from enhanced_unet import EnhancedUNet

# Evaluate the enhanced model on the test split, reporting IoU for Cat, Dog and
# Background only; ground-truth boundary pixels (label 3) are excluded via ignore_index.

# === Config ===
NUM_CLASSES = 4
# Only 0:Cat, 1:Dog and 2:Background are reported; 3:Boundary is left out of the metrics.
CLASS_NAMES = ["Cat", "Dog", "Background", "Boundary"]

MODEL_PATH = '/Users/bin/Desktop/CV_Assignment/Model/best_enhanced_unet_100_epochs.pth'
original_sizes_path = os.path.join(base_dir, "original_sizes_test.json")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load model ===
model = EnhancedUNet(in_channels=3, out_channels=NUM_CLASSES).to(device)
state_dict = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(state_dict)
model.eval()

# === Load original sizes ===
with open(original_sizes_path, "r") as f:
    original_sizes = json.load(f)

# === Load test dataset ===
test_image_dir = os.path.join(base_dir, "test_resized", "color")
test_mask_dir = os.path.join(base_dir, "test_resized", "label")
test_dataset = CustomDataset(
    image_dir=test_image_dir,
    mask_dir=test_mask_dir,
    transform=transforms.ToTensor(),
)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# === Metrics ===
# ignore_index=3 tells the metric to skip pixels whose ground-truth label is 3.
# Dice is intentionally not computed in this cell.
iou_metric = JaccardIndex(
    task="multiclass",
    num_classes=NUM_CLASSES,
    average="none",
    ignore_index=3,
).to(device)

# === Evaluation loop ===
with torch.no_grad():
    for image, mask, img_name, _ in tqdm(test_loader):
        image = image.to(device)
        # Strip the batch dimension and cast to long; assumes an (H, W) label map — TODO confirm
        mask = mask.squeeze(0).to(device).long()

        # Forward pass
        logits = model(image)

        # Upsample the logits to the mask's original size first ...
        orig_h, orig_w = original_sizes[img_name[0]]
        full_size = (orig_h, orig_w)
        logits_resized = F.interpolate(logits, size=full_size, mode="bilinear", align_corners=False)

        # ... and only then take the per-pixel argmax
        pred_mask = logits_resized.argmax(dim=1).long()

        # Match the ground truth to the same size (nearest keeps the labels integral)
        gt_as_batch = mask[None, None].float()
        gt_resized = F.interpolate(gt_as_batch, size=full_size, mode="nearest").squeeze(0).long()

        # Accumulate; pixels with gt == 3 are skipped by the metric itself
        iou_metric.update(pred_mask.to(device), gt_resized.to(device))

# === Compute final results ===
# Per-class tensor; only the first three entries are reported below.
iou_scores = iou_metric.compute()

# Report classes 0, 1 and 2 only
print("\n📊 Per-Class Evaluation on Test Set (excluding class 3):")
for i in range(3):
    print(f"Class {i} ({CLASS_NAMES[i]}): IoU = {iou_scores[i]:.4f}")

# Mean IoU over classes 0-2
mean_iou = iou_scores[:3].mean()
print(f"\n➡️ Mean IoU of classes 0-2: {mean_iou:.4f}")