In [6]:

import json
from pathlib import Path
import cv2
import numpy as np
from PIL import Image 
import traceback
from tqdm import tqdm 

def calculate_bounding_box(image_width, image_height, landmarks, padding=10):
    """Return a padded pixel bounding box enclosing a set of landmarks.

    Landmark coordinates are multiplied by the image dimensions, so they are
    expected to be normalized to the image size. Each landmark may be either
    an indexable pair (``lm[0]``, ``lm[1]``) or an object exposing ``.x`` and
    ``.y`` attributes.

    Args:
        image_width: Image width in pixels.
        image_height: Image height in pixels.
        landmarks: Iterable of landmarks (list, tuple, NumPy array, ...).
        padding: Margin in pixels added on every side before clamping.

    Returns:
        ``(x1, y1, x2, y2)`` as ints clamped to the image bounds, or ``None``
        when there are no landmarks, the landmark format is not recognised,
        a coordinate is not a finite number, or the clamped box has no area.
    """
    import math  # local import: only needed for the finiteness check below

    if landmarks is None:
        return None

    # Materialise once. `not landmarks` is ambiguous for NumPy arrays (raises
    # ValueError), whereas an explicit list works for any iterable.
    try:
        points = list(landmarks)
    except TypeError:
        print("Error: Unexpected landmark format. Cannot extract coordinates.")
        return None
    if not points:
        return None

    try:
        x_coords = [pt[0] * image_width for pt in points]
        y_coords = [pt[1] * image_height for pt in points]
    except (TypeError, IndexError, KeyError):
        # Not indexable by position (KeyError covers dict-like landmarks):
        # fall back to attribute-style landmarks.
        try:
            x_coords = [pt.x * image_width for pt in points]
            y_coords = [pt.y * image_height for pt in points]
        except (AttributeError, TypeError):
            print("Error: Unexpected landmark format. Cannot extract coordinates.")
            return None

    # int() raises on NaN/inf and min()/max() treat NaN inconsistently, so
    # reject non-finite or non-numeric coordinates up front.
    try:
        if not all(math.isfinite(value) for value in x_coords + y_coords):
            return None
    except TypeError:
        print("Error: Unexpected landmark format. Cannot extract coordinates.")
        return None

    x_min = int(min(x_coords) - padding)
    y_min = int(min(y_coords) - padding)
    x_max = int(max(x_coords) + padding)
    y_max = int(max(y_coords) + padding)

    # Clamp the padded box to the image rectangle.
    x1 = max(0, x_min)
    y1 = max(0, y_min)
    x2 = min(image_width, x_max)
    y2 = min(image_height, y_max)

    # A box with no area cannot be cropped.
    if x1 >= x2 or y1 >= y2:
        return None
    return x1, y1, x2, y2


# Directory containing the JSON annotation files. Each file is merged into a
# single dict that the image loop later indexes by image file stem.
annotation_dir = Path("/Users/soumyadeepchatterjee/Desktop/WayneState/Winter2025/FinalProject/hagrid/ann_subsample")

all_annotations = {}
# Sorted so that, should two files share a key, the entry that wins does not
# depend on filesystem listing order.
json_files = sorted(annotation_dir.glob("*.json"))

print(f"Found {len(json_files)} JSON annotation files in {annotation_dir}.")

if not json_files:
    raise FileNotFoundError(f"No JSON files found in '{annotation_dir}'. Please check the path.")

for json_file in tqdm(json_files, desc="Loading JSON annotations"):
    # Best effort per file: report the problem and keep loading the rest.
    try:
        # JSON is UTF-8; do not rely on the platform's default encoding.
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: unreadable file. ValueError: covers json.JSONDecodeError
        # and UnicodeDecodeError.
        print(f"Error loading {json_file}: {e}")
        continue

    # dict.update() needs a mapping; anything else would fail obscurely.
    if not isinstance(data, dict):
        print(f"Error loading {json_file}: expected a JSON object at the top level, got {type(data).__name__}")
        continue

    all_annotations.update(data)  # Add annotations from this file

print(f"Loaded annotations for {len(all_annotations)} images.")

if not all_annotations:
    raise ValueError("Failed to load any annotations.")

# Root of the image tree; *.jpg files are discovered recursively below it.
base_image_dir = Path("/Users/soumyadeepchatterjee/Desktop/WayneState/Winter2025/FinalProject/hagrid-sample-500k-384p/hagrid_500k")

# Destination directory for the resized hand crops.
output_dir = Path("hand_crops_json_landmarks")
output_dir.mkdir(parents=True, exist_ok=True)

# Maximum number of images that may contribute crops; None disables the cap.
PROCESS_LIMIT = 150000
# Output size (width, height) of every saved crop.
CROP_SIZE = (224, 224)
# Pixel margin added around the landmarks before cropping.
BBOX_PADDING = 5
processed_count = 0

print(f"Starting processing images from: {base_image_dir}")

image_files = list(base_image_dir.rglob("*.jpg"))
print(f"Found {len(image_files)} total image files to potentially process.")

if not image_files:
    raise FileNotFoundError(f"No image files found in '{base_image_dir}' subdirectories. Please check the path.")

for image_path in tqdm(image_files, desc="Processing Images"):
    if PROCESS_LIMIT is not None and processed_count >= PROCESS_LIMIT:
        print(f"Reached processing limit ({PROCESS_LIMIT}). Stopping.")
        break

    try:
        # Annotations are keyed by the image file name without its extension.
        image_id = image_path.stem
        if image_id not in all_annotations:
            continue

        annotation = all_annotations[image_id]
        landmarks_list = annotation.get("landmarks")
        labels_list = annotation.get("labels")

        # Skip before touching the disk when there is nothing to crop.
        if not landmarks_list:
            continue

        img = cv2.imread(str(image_path))
        if img is None:
            print(f"Warning: Could not load image {image_path}. Skipping.")
            continue

        # Cropping and resizing act on each channel independently, so the
        # image stays in OpenCV's native BGR order from imread to imwrite;
        # no colour-space round trip is needed.
        h_img, w_img = img.shape[:2]

        num_hands_processed_in_img = 0
        for hand_idx, hand_landmarks in enumerate(landmarks_list):
            # Each hand must be a non-empty list of [x, y, ...] lists.
            is_valid_hand = (
                isinstance(hand_landmarks, list)
                and hand_landmarks
                and all(isinstance(p, list) and len(p) >= 2 for p in hand_landmarks)
            )
            if not is_valid_hand:
                continue

            bbox = calculate_bounding_box(w_img, h_img, hand_landmarks, padding=BBOX_PADDING)
            if bbox is None:
                continue

            x1, y1, x2, y2 = bbox
            hand_crop = img[y1:y2, x1:x2]
            if hand_crop.size == 0:
                continue

            hand_crop_resized = cv2.resize(hand_crop, CROP_SIZE)

            # Labels are matched to hands by position; hands without a
            # corresponding label are saved as "unknown".
            label = "unknown"
            if labels_list and hand_idx < len(labels_list):
                label = labels_list[hand_idx]

            out_path = output_dir / f"{image_id}_hand{hand_idx}_{label}_lm.jpg"
            # cv2.imwrite signals failure through its return value rather
            # than an exception, so check it before counting the crop.
            if not cv2.imwrite(str(out_path), hand_crop_resized):
                print(f"Warning: Could not write crop {out_path}. Skipping.")
                continue
            num_hands_processed_in_img += 1

        # An image counts as processed only if at least one crop was saved.
        if num_hands_processed_in_img > 0:
            processed_count += 1

    except Exception as e:
        # Best effort: report the failure and carry on with the next image.
        print(f"❌ Error processing image {image_path}: {e}")
        traceback.print_exc()

print(f"Processing finished. Processed annotations for {processed_count} images.")

Found 18 JSON annotation files in /Users/soumyadeepchatterjee/Desktop/WayneState/Winter2025/FinalProject/hagrid/ann_subsample.


Loading JSON annotations: 100%|████████████████| 18/18 [00:00<00:00, 160.80it/s]

Loaded annotations for 1800 images.
Starting processing images from: /Users/soumyadeepchatterjee/Desktop/WayneState/Winter2025/FinalProject/hagrid-sample-500k-384p/hagrid_500k





Found 509323 total image files to potentially process.


Processing Images: 100%|████████████| 509323/509323 [00:02<00:00, 185931.26it/s]

Processing finished. Processed annotations for 1646 images.



