In [60]:
import glob as glob_lib
import os
from utils import *
import numpy as np
from PIL import Image
from pycocotools import mask as maskUtils

## code starts here

In [61]:
# Reference frame (number 000001) and its ground-truth mask for every object
# category. All files live in data/data_2D and follow the naming scheme
#   <category>_000001.jpg         -> reference image
#   <category>_000001_1_gt.png    -> reference mask
CATEGORIES = {
    name: {
        "ref_img": f"data/data_2D/{name}_000001.jpg",
        "ref_mask": f"data/data_2D/{name}_000001_1_gt.png",
    }
    for name in (
        "can_chowder",
        "can_soymilk",
        "can_tomatosoup",
        "carton_oj",
        "carton_soymilk",
        "diet_coke",
        "hc_potroastsoup",
        "juicebox",
        "rice_tuscan",
        "ricepilaf",
    )
}

In [62]:


# File extensions accepted for both test images and ground-truth masks.
IMG_EXTS = (".jpg", ".jpeg", ".png")


def _find_gt_mask(folder, base_name):
    """Return the ground-truth mask path for the image stem ``base_name``, or None."""
    # Preferred: the "<stem>_1_gt<ext>" mask.
    for ext in IMG_EXTS:
        candidate = os.path.join(folder, f"{base_name}_1_gt{ext}")
        if os.path.exists(candidate):
            return candidate

    # Fallback: any "<stem>_*_gt<ext>"; take the first match in sorted order.
    # The literal part is escaped so glob metacharacters in a folder or file
    # name are not interpreted as wildcards.
    stem = glob_lib.escape(os.path.join(folder, base_name))
    for ext in IMG_EXTS:
        matches = glob_lib.glob(f"{stem}_*_gt{ext}")
        if matches:
            return sorted(matches)[0]

    return None


def build_categories_with_tests(categories):
    """
    Given initial dict with ref_img/ref_mask, fill in test = [[test_img, test_mask], ...]

    For each category the folder and filename prefix are derived from
    ``ref_img`` (everything before the last "_" of its basename, e.g.
    ``can_chowder`` for ``can_chowder_000001.jpg``). Every file in that folder
    named ``<prefix>_*`` with an extension in ``IMG_EXTS`` becomes a test image,
    except files with ``_gt`` in their name (masks) and the reference image
    itself. Each test image is paired with ``<image stem>_1_gt<ext>`` when that
    file exists, otherwise with the first (sorted) ``<image stem>_*_gt<ext>``.

    Note: the notebook imports the glob module as ``glob_lib``; that alias is
    used here on purpose so the function does not depend on a bare ``glob``
    name leaking in through ``from utils import *``.

    Args:
        categories: {
            "cat_key": {
                "ref_img": <path>,
                "ref_mask": <path>
            }, ...
        }

    Returns:
        The same ``categories`` dict, mutated in place: every entry gains
        ``"test": [[test_img, test_mask], ...]`` ordered by image path.

    Raises:
        FileNotFoundError: if a test image has no matching ``*_gt`` mask.
    """
    for data in categories.values():
        ref_img = data["ref_img"]
        ref_abs = os.path.abspath(ref_img)  # loop-invariant, compute once

        folder = os.path.dirname(ref_img)
        prefix = "_".join(os.path.basename(ref_img).split("_")[:-1])  # e.g., can_chowder

        # Find all candidate images (exclude ref and masks)
        stem = glob_lib.escape(os.path.join(folder, prefix))
        candidates = []
        for ext in IMG_EXTS:
            candidates.extend(glob_lib.glob(f"{stem}_*{ext}"))
        test_imgs = sorted(
            p for p in candidates
            if "_gt" not in os.path.basename(p).lower()
            and os.path.abspath(p) != ref_abs
        )

        test_pairs = []
        for img_path in test_imgs:
            base_name = os.path.splitext(os.path.basename(img_path))[0]
            mask_path = _find_gt_mask(folder, base_name)
            if not mask_path:
                raise FileNotFoundError(f"No mask found for test image {img_path}")
            test_pairs.append([img_path, mask_path])

        data["test"] = test_pairs

    return categories


In [63]:

def process_categories(categories_dict, visualize=False):
    """
    Track each category's reference object into all of its test images.

    Per category, ``process_img_png_mask`` is called once on the reference
    image/mask to obtain four box values; these are handed to
    ``track_item_boxes`` (paired with the id 1) for every test image.
    Entries of ``cfg["test"]`` that are not a 2-item list/tuple, or whose image
    resolves to the reference image itself, are skipped.

    Args:
        categories_dict: mapping of category key -> dict with "ref_img",
            "ref_mask" and, optionally, "test" (list of [test_img, test_mask]).
        visualize: forwarded unchanged to both helper calls.

    Returns:
    {
      category_key: [
        {
          "ref_img": <ref image path>,
          "ref_mask": <ref mask path>,
          "test_img": <test image path>,
          "test_mask": <test mask path>,
          "relevant_mask": <predicted mask array>
        },
        ...
      ]
    }
    """
    results = {}

    for category_key, cfg in categories_dict.items():
        ref_img, ref_mask = cfg["ref_img"], cfg["ref_mask"]

        # Box of the reference object, computed once per category.
        xmin, xmax, ymin, ymax = process_img_png_mask(
            ref_img, ref_mask, visualize=visualize
        )

        cat_results = []
        for candidate in cfg.get("test", []):
            well_formed = isinstance(candidate, (list, tuple)) and len(candidate) == 2
            if not well_formed:
                continue

            test_img, test_mask = candidate

            # Never track the reference image onto itself.
            if os.path.abspath(test_img) == os.path.abspath(ref_img):
                continue

            tracked = track_item_boxes(
                ref_img,
                test_img,
                [([xmin, xmax, ymin, ymax], 1)],
                visualize=visualize
            )
            # Second element of the tracker output, entry 1 -- presumably the
            # masks keyed by the object id passed above; confirm against utils.
            relevant_mask = tracked[1][1]

            cat_results.append({
                "ref_img": ref_img,
                "ref_mask": ref_mask,
                "test_img": test_img,
                "test_mask": test_mask,
                "relevant_mask": relevant_mask,
            })

        results[category_key] = cat_results

    return results


In [64]:
def _normalize_mask(mask, target_shape=None):
    m = np.asarray(mask)

    # remove singleton dims like (1,H,W) or (H,W,1)
    m = np.squeeze(m)
    if m.ndim == 3 and m.shape[2] == 1:
        m = m[:, :, 0]
    if m.ndim == 3 and m.shape[0] == 1:
        m = m[0]

    # binarize (works for bool, uint8 0/1, or float logits/probs)
    if m.dtype != bool:
        if m.dtype.kind in ("f", "c"):
            m = m > 0.5
        else:
            m = m > 0

    # resize if requested and shapes differ (nearest to preserve labels)
    if target_shape and m.shape != target_shape:
        im = Image.fromarray(m.astype(np.uint8) * 255)
        im = im.resize((target_shape[1], target_shape[0]), Image.NEAREST)
        m = np.array(im) > 0

    return m.astype(np.uint8)  # pycocotools likes 0/1 uint8

def eval_masks_with_coco(gt_mask_path, pred_mask, auto_resize=True):
    """
    Score a predicted mask against a ground-truth mask image using pycocotools.

    Args:
        gt_mask_path: path to the ground-truth mask image; it is read as
            greyscale and every non-zero pixel counts as foreground.
        pred_mask: predicted mask, normalized via ``_normalize_mask``.
        auto_resize: when True the prediction is resized to the GT shape;
            when False a shape difference is an error.

    Returns:
        dict with "gt_area_px", "pred_area_px" and "iou" (all floats).

    Raises:
        ValueError: if GT and prediction shapes differ after normalization.
    """
    def _to_rle(binary):
        # pycocotools expects a Fortran-ordered uint8 array
        return maskUtils.encode(np.asfortranarray(binary.astype(np.uint8)))

    # Ground truth: greyscale image -> boolean foreground map
    gt_bool = np.array(Image.open(gt_mask_path).convert("L")) > 0

    # Prediction: binary mask, optionally aligned to the GT shape
    wanted_shape = gt_bool.shape if auto_resize else None
    pred_bin = _normalize_mask(pred_mask, target_shape=wanted_shape)

    if gt_bool.shape != pred_bin.shape:
        raise ValueError(f"Shape mismatch after normalization: GT {gt_bool.shape}, Pred {pred_bin.shape}")

    gt_rle = _to_rle(gt_bool)
    pred_rle = _to_rle(pred_bin)

    gt_area = float(maskUtils.area(gt_rle))
    pred_area = float(maskUtils.area(pred_rle))

    # Single prediction vs single GT, iscrowd=0 -> 1x1 IoU matrix
    iou_matrix = maskUtils.iou([pred_rle], [gt_rle], [0])

    return {
        "gt_area_px": gt_area,
        "pred_area_px": pred_area,
        "iou": float(iou_matrix[0][0]),
    }

In [65]:
# Discover the test images and their ground-truth masks on disk and attach them
# to every category under the "test" key. build_categories_with_tests mutates
# the dict in place and returns that same dict.
CATEGORIES = build_categories_with_tests(CATEGORIES)

# Run the tracker from each reference image to every test image. `results` maps
# each category key to a list of dicts holding the ref/test paths and the
# predicted mask ("relevant_mask").
results = process_categories(CATEGORIES)

frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 53.36it/s]

Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).
  pred_masks_gpu = fill_holes_in_mask_scores(
propagate in video: 100%|██████████| 2/2 [00:00<00:00, 24.77it/s]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 60.62it/s]
propagate in video: 100%|██████████| 2/2 [00:00<00:00, 25.24it/s]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 67.80it/s]
propagate in video: 100%|██████████| 2/2 [00:00<00:00, 26.44it/s]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 54.09it/s]
propagate in video: 100%|██████████| 2/2 [00:00<00:00, 24.28it/s]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 65.46it/s]
propagate in video: 100%|██████████| 2/2 [00:00<00:0

In [67]:
# Inspect the raw tracker output. NOTE: this echoes every predicted mask array;
# index into a single entry (e.g. results['can_chowder'][0]) for a shorter view.
results

{'can_chowder': [{'ref_img': 'data/data_2D/can_chowder_000001.jpg',
   'ref_mask': 'data/data_2D/can_chowder_000001_1_gt.png',
   'test_img': 'data/data_2D/can_chowder_000002.jpg',
   'test_mask': 'data/data_2D/can_chowder_000002_1_gt.png',
   'relevant_mask': array([[[False, False, False, ..., False, False, False],
           [False, False, False, ..., False, False, False],
           [False, False, False, ..., False, False, False],
           ...,
           [False, False, False, ..., False, False, False],
           [False, False, False, ..., False, False, False],
           [False, False, False, ..., False, False, False]]])},
  {'ref_img': 'data/data_2D/can_chowder_000001.jpg',
   'ref_mask': 'data/data_2D/can_chowder_000001_1_gt.png',
   'test_img': 'data/data_2D/can_chowder_000003.jpg',
   'test_mask': 'data/data_2D/can_chowder_000003_1_gt.png',
   'relevant_mask': array([[[False, False, False, ..., False, False, False],
           [False, False, False, ..., False, False, False],

In [66]:
# Spot-check a single prediction: score entry 6 of 'can_chowder' against its
# ground-truth mask. auto_resize=False disables resizing, so a prediction whose
# shape differs from the GT raises ValueError instead of being rescaled.
metrics = eval_masks_with_coco(results['can_chowder'][6]['test_mask'], results['can_chowder'][6]['relevant_mask'], auto_resize=False)
print(metrics)

{'gt_area_px': 9210.0, 'pred_area_px': 11121.0, 'iou': 0.6851222544550353}


In [None]:
# # pip install pycocotools   # on Windows you may need: pip install pycocotools-windows
# import numpy as np
# from pycocotools import mask as maskUtils
# from math import floor, ceil

# def bbox_to_rle(bbox, H=None, W=None):
#     """
#     bbox: [x, y, w, h] (floats ok)
#     H, W: image height/width; if None, use the minimum canvas that fits both boxes later
#     returns: RLE dict
#     """
#     x, y, w, h = bbox
#     x0, y0 = floor(x), floor(y)
#     x1, y1 = ceil(x + w), ceil(y + h)

#     if H is None or W is None:
#         H = max(H or 0, y1 + 1)
#         W = max(W or 0, x1 + 1)

#     mask = np.zeros((H, W), dtype=np.uint8)
#     # clip to canvas just in case
#     x0c, y0c = max(0, x0), max(0, y0)
#     x1c, y1c = min(W, x1), min(H, y1)
#     if x1c > x0c and y1c > y0c:
#         mask[y0c:y1c, x0c:x1c] = 1

#     rle = maskUtils.encode(np.asfortranarray(mask))
#     return rle, (H, W)

# def iou_from_boxes(gt_bbox, pred_bbox):
#     # build a canvas big enough for both
#     def box_extent(b):
#         x, y, w, h = b
#         return ceil(x + w), ceil(y + h)
#     W = max(box_extent(gt_bbox)[0], box_extent(pred_bbox)[0]) + 1
#     H = max(box_extent(gt_bbox)[1], box_extent(pred_bbox)[1]) + 1

#     gt_rle, _   = bbox_to_rle(gt_bbox, H=H, W=W)
#     pred_rle, _ = bbox_to_rle(pred_bbox, H=H, W=W)

#     # areas
#     gt_area   = float(maskUtils.area(gt_rle))
#     pred_area = float(maskUtils.area(pred_rle))

#     # IoU (mask-based)
#     # maskUtils.iou expects lists of RLEs and a matching iscrowd list
#     iou = maskUtils.iou([pred_rle], [gt_rle], [0])[0][0]

#     return {
#         "gt_area_px": gt_area,
#         "pred_area_px": pred_area,
#         "iou": float(iou),
#         "canvas_HW": (H, W),
#     }




In [None]:
# # ---- Your rectangles ----
# gt = results['can_chowder'][8]['bbox_test_mask_xywh']  # actual
# pd = results['can_chowder'][8]['bbox_relevant_mask_xywh']  # predicted

# metrics = iou_from_boxes(gt, pd)
# print(metrics)

{'gt_area_px': 13184.0, 'pred_area_px': 12096.0, 'iou': 0.9174757281553398, 'canvas_HW': (194, 577)}
