In [1]:
import glob as glob_lib
import os
from utils import *
import numpy as np
from PIL import Image
from pycocotools import mask as maskUtils
import pandas as pd

## 1. Defining Dataset Categories and Reference Files

We set up a structured mapping of all the product categories we will process.  
The `CATEGORIES` dictionary links each product to:
- **`ref_img`**: A reference image for the product.
- **`ref_mask`**: The corresponding ground-truth segmentation mask.

For example:
- `"can_chowder"` → Image: `data/data_2D/can_chowder_000001.jpg`, Mask: `data/data_2D/can_chowder_000001_1_gt.png`



In [2]:
# Every category follows the same file-naming scheme
# (`<name>_000001.jpg` for the reference image, `<name>_000001_1_gt.png` for its mask),
# so the mapping is generated from the list of category names instead of being
# spelled out entry by entry. Insertion order matches the order of the names below.
CATEGORIES = {
    name: {
        "ref_img": f"data/data_2D/{name}_000001.jpg",
        "ref_mask": f"data/data_2D/{name}_000001_1_gt.png",
    }
    for name in (
        "can_chowder",
        "can_soymilk",
        "can_tomatosoup",
        "carton_oj",
        "carton_soymilk",
        "diet_coke",
        "hc_potroastsoup",
        "juicebox",
        "rice_tuscan",
        "ricepilaf",
    )
}

## 2. Auto-discovering Test Images and Masks for Each Category

Building on the category map defined above, we now **programmatically discover** all test images and their corresponding ground-truth masks for every product. This removes the need to manually list test files and keeps the pipeline scalable when new images are added.

### What this cell does
- Defines valid image extensions: `(".jpg", ".jpeg", ".png")`.
- Implements `build_categories_with_tests(categories)` which:
  1. **Derives the folder and filename prefix** from each category’s `ref_img` (e.g., `can_chowder_000001.jpg` → prefix `can_chowder`).
  2. **Finds all candidate images** in the same folder that match `{prefix}_*` and filters out:
     - the reference image itself, and
     - any files that are masks (contain `_gt`).
  3. **Locates each test image’s mask** using a two-stage strategy:
     - Prefer `*_1_gt` (e.g., `can_chowder_000002_1_gt.png`) to stay consistent with common annotations.
     - Fallback to **any** `*_gt` match if `*_1_gt` is not present.
  4. **Attaches discovered pairs** as `data["test"] = [[test_img, test_mask], ...]` inside the original `categories` dict.


- The **`CATEGORIES`** dictionary gave us a single `ref_img` and `ref_mask` per class. Using those anchors, this function **expands each category** by discovering all other images and their ground truths in the same directory.

### Output
- Returns the original `categories` dict **augmented** with a `test` key for each category:
  ```python
  {
    "can_chowder": {
      "ref_img": "...",
      "ref_mask": "...",
      "test": [
        [".../can_chowder_000002.jpg", ".../can_chowder_000002_1_gt.png"],
        ...
      ]
    },
    ...
  }
  ```


In [3]:
IMG_EXTS = (".jpg", ".jpeg", ".png")


def _find_mask_for(folder, base_name):
    """Return the ground-truth mask path for image stem `base_name` in `folder`, or None."""
    # Stage 1: prefer the conventional "<image>_1_gt" mask.
    for ext in IMG_EXTS:
        candidate = os.path.join(folder, f"{base_name}_1_gt{ext}")
        if os.path.exists(candidate):
            return candidate

    # Stage 2: fall back to any other "<image>_gt" / "<image>_<suffix>_gt" mask.
    # The stem is escaped so glob metacharacters in the path are matched literally.
    stem = os.path.join(glob_lib.escape(folder), glob_lib.escape(base_name))
    fallbacks = []
    for ext in IMG_EXTS:
        for pattern in (f"{stem}_gt{ext}", f"{stem}_*_gt{ext}"):
            fallbacks.extend(glob_lib.glob(pattern))

    # Pick the lexicographically first match so the choice is deterministic.
    return min(fallbacks) if fallbacks else None


def build_categories_with_tests(categories):
    """
    Given initial dict with ref_img/ref_mask, fill in test = [[test_img, test_mask], ...]
    categories: {
        "cat_key": {
            "ref_img": <path>,
            "ref_mask": <path>
        }, ...
    }

    For every category, test images are all files in the reference image's folder
    named `{prefix}_*` with an extension in IMG_EXTS, excluding the reference image
    itself and any file whose name contains `_gt` (masks). `prefix` is the reference
    file name with its last `_`-separated component removed.

    Each test image is paired with its mask: `{image}_1_gt` is preferred, otherwise
    any `{image}_gt` / `{image}_*_gt` file is used.

    Returns:
        The same `categories` dict (mutated in place) with a "test" list per category,
        sorted by test image path.

    Raises:
        FileNotFoundError: if a test image has no matching mask.
    """
    for data in categories.values():
        ref_img = data["ref_img"]
        ref_abs = os.path.abspath(ref_img)

        folder = os.path.dirname(ref_img)
        prefix = "_".join(os.path.basename(ref_img).split("_")[:-1])

        # Find all candidate images (exclude ref and masks).
        # `glob_lib` is the alias the notebook imports the glob module under.
        stem = os.path.join(glob_lib.escape(folder), glob_lib.escape(prefix))
        all_imgs = []
        for ext in IMG_EXTS:
            all_imgs.extend(glob_lib.glob(f"{stem}_*{ext}"))

        test_imgs = sorted(
            p for p in all_imgs
            if "_gt" not in os.path.basename(p).lower()
            and os.path.abspath(p) != ref_abs
        )

        test_pairs = []
        for img_path in test_imgs:
            base_name = os.path.splitext(os.path.basename(img_path))[0]
            mask_path = _find_mask_for(folder, base_name)
            if not mask_path:
                raise FileNotFoundError(f"No mask found for test image {img_path}")
            test_pairs.append([img_path, mask_path])

        data["test"] = test_pairs

    return categories


## 3. Processing All Categories: From Reference Box to Predicted Masks

Having populated each category with its discovered **test image–mask pairs**, we now iterate through the dataset to **generate predicted masks** on test images using the reference object’s bounding box.

### What this cell does
- Defines `process_categories(categories_dict, visualize=False)` which:
  1. Loops over each category and reads the **reference image** and **reference mask**.
  2. Calls `process_img_png_mask(ref_img, ref_mask, visualize)` to extract the **reference object’s bounding box**  
     → returns `(xmin, xmax, ymin, ymax)`.
  3. Iterates over every **test image** discovered earlier.
  4. Invokes `track_item_boxes(ref_img, test_img, [([xmin, xmax, ymin, ymax], 1)], visualize)` to **track/locate** the reference object in the test image and produce a **predicted mask**.
  5. Collects a structured record per test, including `ref_img`, `ref_mask`, `test_img`, `test_mask`, and the **`relevant_mask`** (predicted).
- Returns a dictionary keyed by category:
  ```python
  {
    "can_chowder": [
      {
        "ref_img": "...",
        "ref_mask": "...",
        "test_img": "...",
        "test_mask": "...",
        "relevant_mask": <ndarray bool HxWx1 or HxW>
      },
      ...
    ],
    ...
  }
  ```


In [None]:

def process_categories(categories_dict, visualize=False):
    """
    Produce one predicted mask per test image of every category.

    For each category the reference image and mask are handed to
    `process_img_png_mask`, whose four return values are unpacked as
    (xmin, xmax, ymin, ymax). That box is then passed, tagged with object id 1,
    to `track_item_boxes` together with the reference image and each test image.

    categories_dict: {category_key: {"ref_img": <path>, "ref_mask": <path>,
                                     "test": [[test_img, test_mask], ...]}}
        A missing "test" entry is treated as an empty list.
    visualize: forwarded unchanged to both helper calls.

    Returns:
    {
      category_key: [
        {
          "ref_img": <ref image path>,
          "ref_mask": <ref mask path>,
          "test_img": <test image path>,
          "test_mask": <test mask path>,
          "relevant_mask": <predicted mask array>
        },
        ...
      ]
    }

    NOTE(review): the box is passed as [xmin, xmax, ymin, ymax]; confirm that
    `track_item_boxes` expects this ordering.
    NOTE(review): the prediction is read as `track_item_boxes(...)[1][1]`; presumably
    element 1 holds the output masks and the inner index 1 is the object id given
    above — verify against `utils`.
    """
    results = {}

    for category_key, cfg in categories_dict.items():
        reference_image, reference_mask = cfg["ref_img"], cfg["ref_mask"]

        # Bounding box of the reference object.
        xmin, xmax, ymin, ymax = process_img_png_mask(
            reference_image, reference_mask, visualize=visualize
        )

        # One record per test pair; a fresh box list is built for every tracker call.
        results[category_key] = [
            {
                "ref_img": reference_image,
                "ref_mask": reference_mask,
                "test_img": test_img,
                "test_mask": test_mask,
                "relevant_mask": track_item_boxes(
                    reference_image,
                    test_img,
                    [([xmin, xmax, ymin, ymax], 1)],
                    visualize=visualize,
                )[1][1],
            }
            for test_img, test_mask in cfg.get("test", [])
        ]

    return results


## 4. Normalizing Masks and Computing COCO Metrics (Area & IoU)

With predicted masks generated for each test image in the previous step, we now standardize mask shapes/types and compute **quantitative evaluation** using COCO utilities.

### What this cell does
- **`_normalize_mask(mask, target_shape=None)`**
  - Converts any incoming mask (bool, 0/1 uint8, logits/probabilities) into a **binary** `(H, W)` array.
  - **Squeezes** singleton dimensions (e.g., `(1, H, W)`, `(H, W, 1)`).
  - **Binarizes**:
    - Floats/complex → threshold at `0.5`
    - Integers → threshold at `> 0`
  - **Optionally resizes** to a requested `target_shape` using **nearest-neighbor** (to preserve labels).
  - Returns **uint8 (0/1)** masks, which `pycocotools` prefers for encoding.
- **`eval_masks_with_coco(gt_mask_path, pred_mask, auto_resize=True)`**
  1. Loads the **ground-truth** mask from path and binarizes it.
  2. Normalizes the **predicted** mask and, if `auto_resize=True`, aligns it to the GT shape.
  3. Encodes both masks into **COCO RLE** (using **Fortran order** as required).
  4. Computes **areas** (in pixels) for GT and prediction via `maskUtils.area(...)`.
  5. Computes **Intersection-over-Union (IoU)** via `maskUtils.iou(...)` with `iscrowd=0`.
  6. Returns a compact dict: `{"gt_area_px": ..., "pred_area_px": ..., "iou": ...}`.

- In **Step 3**, we produced a **predicted mask** (`relevant_mask`) for each test image.
- This step converts those predictions into a **standard format** and evaluates them against the **ground-truth masks** discovered in **Step 2**.
- The result provides objective metrics to compare performance **across categories and images**.

### Expected output
A dictionary with:
```python
{
  "gt_area_px": <float>,      # Ground-truth mask area in pixels
  "pred_area_px": <float>,    # Predicted mask area in pixels
  "iou": <float in [0, 1]>    # Overlap quality between prediction and ground truth
}
```


In [5]:
def _normalize_mask(mask, target_shape=None):
    m = np.asarray(mask)

    # remove singleton dims like (1,H,W) or (H,W,1)
    m = np.squeeze(m)
    if m.ndim == 3 and m.shape[2] == 1:
        m = m[:, :, 0]
    if m.ndim == 3 and m.shape[0] == 1:
        m = m[0]

    # binarize (works for bool, uint8 0/1, or float logits/probs)
    if m.dtype != bool:
        if m.dtype.kind in ("f", "c"):
            m = m > 0.5
        else:
            m = m > 0

    # resize if requested and shapes differ (nearest to preserve labels)
    if target_shape and m.shape != target_shape:
        im = Image.fromarray(m.astype(np.uint8) * 255)
        im = im.resize((target_shape[1], target_shape[0]), Image.NEAREST)
        m = np.array(im) > 0

    return m.astype(np.uint8)

def eval_masks_with_coco(gt_mask_path, pred_mask, auto_resize=True):
    """
    Compare a predicted mask with a ground-truth mask image using pycocotools.

    gt_mask_path: path to the ground-truth mask image; it is converted to
        greyscale and every non-zero pixel counts as foreground.
    pred_mask: predicted mask array, normalised through `_normalize_mask`.
    auto_resize: if True, the prediction is resized to the ground-truth shape
        when the two differ; if False, a differing shape raises.

    Returns:
        {"gt_area_px": float, "pred_area_px": float, "iou": float}

    Raises:
        ValueError: if the two masks still differ in shape after normalisation.
    """
    # GT mask from image path -> boolean. The context manager closes the file
    # handle as soon as the pixel data has been copied into the array.
    with Image.open(gt_mask_path) as gt_img:
        gt = np.array(gt_img.convert("L")) > 0

    # normalize predicted mask to (H,W) uint8 0/1, align shape
    pred = _normalize_mask(pred_mask, target_shape=gt.shape if auto_resize else None)

    # sanity check
    if gt.shape != pred.shape:
        raise ValueError(f"Shape mismatch after normalization: GT {gt.shape}, Pred {pred.shape}")

    # RLE encode (Fortran order)
    gt_rle   = maskUtils.encode(np.asfortranarray(gt.astype(np.uint8)))
    pred_rle = maskUtils.encode(np.asfortranarray(pred.astype(np.uint8)))

    # areas
    gt_area   = float(maskUtils.area(gt_rle))
    pred_area = float(maskUtils.area(pred_rle))

    # IoU (iscrowd=0); one prediction vs. one GT gives a 1x1 matrix
    iou = float(maskUtils.iou([pred_rle], [gt_rle], [0])[0][0])

    return {"gt_area_px": gt_area, "pred_area_px": pred_area, "iou": iou}

## 5. Computing Average IoU per Category

After normalizing masks and computing **IoU metrics** for individual test images in the previous step, we now summarize results **per category** to get a high-level view of performance.

### What this cell does
- **Function:** `average_iou_per_category_from_results(...)`
- **Inputs:**
  - `results`: The output dictionary from **Step 3** (`process_categories`) containing test mask paths and predicted masks per category.
  - `gt_key`: Field name in each test record that points to the ground-truth mask path (default: `"test_mask"`).
  - `pred_key`: Field name for the predicted mask array (default: `"relevant_mask"`).
  - `auto_resize`: Passed through to `eval_masks_with_coco`; aligns shapes if they differ.
  - `return_details`: If `True`, also returns the list of individual IoUs per category.
- **Process:**
  1. Iterates over each **category** and its **test items**.
  2. For each item, calls `eval_masks_with_coco(...)` (from Step 4) to compute the IoU.
  3. Collects IoUs into a list for that category.
  4. Calculates the **average IoU** using `numpy.mean`.
  5. Optionally stores the full IoU list if `return_details=True`.
- **Output:**
  - If `return_details=False`:
    ```python
    {"can_chowder": 0.82, "rice_tuscan": 0.76, ...}
    ```
  - If `return_details=True`:
    ```python
    (
      {"can_chowder": 0.82, "rice_tuscan": 0.76, ...}, 
      {"can_chowder": [0.81, 0.83, ...], "rice_tuscan": [0.75, 0.77, ...]}
    )
    ```


In [6]:


def average_iou_per_category_from_results(
    results,
    gt_key="test_mask",
    pred_key="relevant_mask",
    auto_resize=False,
    return_details=False,
):
    """
    Average the per-item IoU of every category.

    results: {
      "catA": [
        {"test_mask": <path>, "relevant_mask": <mask array>, ...},
        ...
      ],
      "catB": [...]
    }
    gt_key:   item key holding the ground-truth mask path (default: 'test_mask')
    pred_key: item key holding the predicted mask array (default: 'relevant_mask')
    auto_resize: forwarded to eval_masks_with_coco
    return_details: when True, the per-item IoUs are returned as well

    Items whose evaluation raises are reported with a "[WARN]" line on stdout and
    left out of the average. A category without any successfully evaluated item
    gets an average of 0.0.

    Returns:
      avg_ious: {category: avg_iou_float}
      (avg_ious, details) when return_details is True, with
      details = {category: [iou1, iou2, ...]}
    """
    mean_by_category = {}
    scores_by_category = {}

    for category, entries in results.items():
        scores = []
        for position, entry in enumerate(entries):
            try:
                outcome = eval_masks_with_coco(
                    gt_mask_path=entry[gt_key],
                    pred_mask=entry[pred_key],
                    auto_resize=auto_resize,
                )
                scores.append(float(outcome["iou"]))
            except Exception as err:
                # Best effort: report the broken entry and carry on with the rest.
                print(f"[WARN] {category} item {position}: {err}")

        mean_by_category[category] = float(np.mean(scores)) if scores else 0.0
        scores_by_category[category] = scores

    if return_details:
        return mean_by_category, scores_by_category
    return mean_by_category


In [7]:
# Step 2: extend every category entry with its discovered [test image, test mask] pairs.
CATEGORIES = build_categories_with_tests(categories=CATEGORIES)

# Step 3: run the tracker from each reference box to get one predicted mask per test image.
results = process_categories(categories_dict=CATEGORIES)


frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 54.89it/s]

Skipping the post-processing step due to the error above. You can still use SAM 2 and it's OK to ignore the error above, although some post-processing functionality may be limited (which doesn't affect the results in most cases; see https://github.com/facebookresearch/sam2/blob/main/INSTALL.md).
  pred_masks_gpu = fill_holes_in_mask_scores(
propagate in video: 100%|██████████| 2/2 [00:00<00:00, 14.02it/s]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 49.81it/s]
propagate in video: 100%|██████████| 2/2 [00:03<00:00,  1.57s/it]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 59.12it/s]
propagate in video: 100%|██████████| 2/2 [00:00<00:00, 13.67it/s]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 64.69it/s]
propagate in video: 100%|██████████| 2/2 [00:00<00:00, 14.44it/s]
frame loading (JPEG): 100%|██████████| 2/2 [00:00<00:00, 53.95it/s]
propagate in video: 100%|██████████| 2/2 [00:00<00:0

> **Insight:** Average IoU per category is a strong indicator of how consistently the model or tracking method performs on different product classes, helping identify classes that may require additional data or tuning.

In [8]:
## Average IoU per category, plus the individual IoUs behind each average
avg_ious, per_item = average_iou_per_category_from_results(
    results, auto_resize=True, return_details=True
)

# One row per category: the dict keys become the "Category" column.
summary_df = (
    pd.DataFrame.from_dict(avg_ious, orient="index", columns=["Average IoU"])
    .rename_axis("Category")
    .reset_index()
)
summary_df["Average IoU"] = summary_df["Average IoU"].round(3)

# Show the per-category summary
print("### Average IoU per Category")
display(summary_df)

# Drill down into a single category; "Image Index" is the 1-based position of the
# item in that category's result list.
category_name = "carton_oj"
if category_name in per_item:
    details_df = pd.DataFrame(
        {
            "Image Index": range(1, len(per_item[category_name]) + 1),
            "IoU": [round(value, 3) for value in per_item[category_name]],
        }
    )
    print(f"\n### Per-item IoUs for category: {category_name}")
    display(details_df)


### Average IoU per Category


Unnamed: 0,Category,Average IoU
0,can_chowder,0.218
1,can_soymilk,0.254
2,can_tomatosoup,0.15
3,carton_oj,0.401
4,carton_soymilk,0.373
5,diet_coke,0.37
6,hc_potroastsoup,0.297
7,juicebox,0.111
8,rice_tuscan,0.291
9,ricepilaf,0.035



### Per-item IoUs for category: carton_oj


Unnamed: 0,Image Index,IoU
0,1,0.0
1,2,0.938
2,3,0.0
3,4,0.0
4,5,0.0
5,6,0.0
6,7,0.0
7,8,0.001
8,9,0.473
9,10,0.623
