In [1]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO as UltralyticsYOLO
from transformers import pipeline
from typing import List, Dict

# Initialize YOLO model
class YOLO:
    def __init__(self, model_path, device="cpu", confidence_threshold=0.25, nms_threshold=0.45):
        self.model_path = model_path
        self.device = device
        self.confidence_threshold = confidence_threshold
        self.nms_threshold = nms_threshold
        self.model = None

    def load_model(self):
        """Load YOLO model."""
        self.model = UltralyticsYOLO(self.model_path)
        self.model.to(self.device)

    def detect(self, image: np.ndarray) -> List[Dict]:
        """Run detection on image."""
        results = self.model(image, conf=self.confidence_threshold, iou=self.nms_threshold)
        return self.postprocess(results)
    
    def postprocess(self, output) -> List[Dict]:
        """Convert YOLO output to list of detections."""
        detections = []
        for result in output:
            boxes = result.boxes
            for box in boxes:
                detection = {
                    'bbox': box.xyxy[0].cpu().numpy().astype(int),
                    'confidence': box.conf.item(),
                    'class_id': int(box.cls.item()),
                }
                detections.append(detection)
        return detections

# Initialize BLIP model
blip_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=0)

def crop_and_caption(image_path: str, yolo_model: YOLO, min_width=30, min_height=30) -> List[Dict]:
    """Detect bounding boxes, crop them, and generate captions using BLIP."""
    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Image at {image_path} could not be loaded.")
    
    # Run YOLO detection
    detections = yolo_model.detect(image)
    
    captions = []
    for i, det in enumerate(detections):
        x1, y1, x2, y2 = det['bbox']
        # add 10% padding to the bounding box
        padding = 0.2
        x1 = max(0, x1 - int(padding * (x2 - x1)))
        x2 = min(image.shape[1], x2 + int(padding * (x2 - x1)))
        y1 = max(0, y1 - int(padding * (y2 - y1)))
        y2 = min(image.shape[0], y2 + int(padding * (y2 - y1)))


        cropped_width = x2 - x1
        cropped_height = y2 - y1

  

        # Crop the detected region
        cropped_image = image[y1:y2, x1:x2]
        # Convert cropped image to RGB and save as a temporary file for BLIP
        cropped_image_rgb = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)


        if cropped_width < min_width or cropped_height < min_height:
            print(f"Skipping region {i + 1} due to small size: {cropped_width}x{cropped_height}")
            continue
        
        # Convert the image to PIL format (BLIP expects a PIL Image)
        from PIL import Image
        pil_image = Image.fromarray(cropped_image_rgb)

        # Generate caption
        caption = blip_pipeline(pil_image)[0]['generated_text']
        captions.append({'bbox': det['bbox'], 'caption': caption})
        
        # For debugging, show cropped regions and captions
        print(f"Region {i + 1}: {caption}")
        cv2.imshow(f"Cropped Region {i + 1}", cropped_image)
    
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return captions

# Usage Example
if __name__ == "__main__":
    # Path to YOLO model
    yolo_model_path = "yolov8s-world.pt"  # Replace with your YOLO model path
    image_path = "image.png"  # Replace with your image path

    # Initialize and load YOLO model
    yolo = YOLO(model_path=yolo_model_path, device="cuda", confidence_threshold=0.25)
    yolo.load_model()

    # Perform detection and BLIP captioning
    results = crop_and_caption(image_path, yolo)
    print("Captions for detected regions:", results)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(



0: 448x640 3 bottles, 1 cup, 10 knifes, 1 spoon, 1 bowl, 1 mouse, 6 scissorss, 88.0ms
Speed: 1.6ms preprocess, 88.0ms inference, 513.5ms postprocess per image at shape (1, 3, 448, 640)




Region 1: there are many different items on the wall in the store
Region 2: there is a small dog that is sitting on a spoon
Skipping region 3 due to small size: 22x241
Region 4: there is a bottle of water sitting on a table in a room
Region 5: there is a large stack of food on a counter in a kitchen
Skipping region 6 due to small size: 38x26
Skipping region 7 due to small size: 24x82
Skipping region 8 due to small size: 19x91
Region 9: there are many pairs of scissors on the wall in the store
Skipping region 10 due to small size: 26x79
Region 11: there are many tools on the shelf in the store
Skipping region 12 due to small size: 25x147
Skipping region 13 due to small size: 24x130
Skipping region 14 due to small size: 15x72
Skipping region 15 due to small size: 21x91
Region 16: there are many different types of tools on the wall
Skipping region 17 due to small size: 14x88
Skipping region 18 due to small size: 14x73
Region 19: there are several different items on the wall in the store
R

In [None]:
    import cv2
    import numpy as np
    import torch
    from ultralytics import YOLO as UltralyticsYOLO
    from transformers import pipeline
    from typing import List, Dict
    from PIL import Image

    # Initialize YOLO model
    class YOLO:
        def __init__(self, model_path, device="cpu", confidence_threshold=0.25, nms_threshold=0.45):
            self.model_path = model_path
            self.device = device
            self.confidence_threshold = confidence_threshold
            self.nms_threshold = nms_threshold
            self.model = None

        def load_model(self):
            """Load YOLO model."""
            self.model = UltralyticsYOLO(self.model_path)
            self.model.to(self.device)

        def detect(self, image: np.ndarray) -> List[Dict]:
            """Run detection on image."""
            results = self.model(image, conf=self.confidence_threshold, iou=self.nms_threshold)
            return self.postprocess(results)
        
        def postprocess(self, output) -> List[Dict]:
            """Convert YOLO output to list of detections."""
            detections = []
            for result in output:
                boxes = result.boxes
                for box in boxes:
                    detection = {
                        'bbox': box.xyxy[0].cpu().numpy().astype(int),
                        'confidence': box.conf.item(),
                        'class_id': int(box.cls.item()),
                    }
                    detections.append(detection)
            return detections

    # Initialize BLIP model
    blip_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=0)

    def fade_and_caption(image_path: str, yolo_model: YOLO):
        """Detect bounding boxes, create images with faded backgrounds, and generate captions using BLIP."""
        # Load the image
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image at {image_path} could not be loaded.")
        
        # Run YOLO detection
        detections = yolo_model.detect(image)
        
        captions = []
        for i, det in enumerate(detections):
            x1, y1, x2, y2 = det['bbox']
            # Add 10% padding to the bounding box
            padding = 0.1
            x1 = max(0, x1 - int(padding * (x2 - x1)))
            x2 = min(image.shape[1], x2 + int(padding * (x2 - x1)))
            y1 = max(0, y1 - int(padding * (y2 - y1)))
            y2 = min(image.shape[0], y2 + int(padding * (y2 - y1)))

            # Create a faded version of the image
            faded_image = cv2.addWeighted(image, 0.2, np.zeros_like(image), 0.7, 0)

            # Overlay the original bounding box region on the faded image
            faded_image[y1:y2, x1:x2] = image[y1:y2, x1:x2]

            # Convert the modified image to RGB for BLIP
            faded_image_rgb = cv2.cvtColor(faded_image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(faded_image_rgb)

            # Generate caption using BLIP
            caption = blip_pipeline(pil_image)[0]['generated_text']
            captions.append({'bbox': det['bbox'], 'caption': caption})
            
            # Display or save the result for each detection
            cv2.imshow(f"Faded Image for Region {i + 1}", faded_image)
            cv2.waitKey(0)

            # Optional: Save the image for reference
            cv2.imwrite(f"faded_region_{i + 1}.png", faded_image)

            print(f"Region {i + 1}: {caption}")

        cv2.destroyAllWindows()
        return captions


    # Usage Example
    if __name__ == "__main__":
        # Path to YOLO model
        yolo_model_path = "yolov8s-world.pt"  # Replace with your YOLO model path
        image_path = "image.png"  # Replace with your image path

        # Initialize and load YOLO model
        yolo = YOLO(model_path=yolo_model_path, device="cuda", confidence_threshold=0.35)
        yolo.load_model()

        # Perform detection and BLIP captioning
        results = fade_and_caption(image_path, yolo)
        print("Captions for detected regions:", results)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(



0: 448x640 3 bottles, 2 knifes, 1 bowl, 2 scissorss, 162.0ms
Speed: 2.4ms preprocess, 162.0ms inference, 599.7ms postprocess per image at shape (1, 3, 448, 640)




Region 1: a workbench with tools and tools on it
Region 2: a workbench with tools and tools hanging on a wall
Region 3: a workbench with tools and tools hanging on a wall
Region 4: a workbench with tools and a lamp
Region 5: a workbench with tools and a lamp
Region 6: a workbench with tools and tools hanging on a wall
Region 7: a workbench with tools and tools hanging on a wall
Region 8: a workbench with tools and tools hanging on a wall
Captions for detected regions: [{'bbox': array([441, 167, 467, 324]), 'caption': 'a workbench with tools and tools on it'}, {'bbox': array([474,  79, 507, 114]), 'caption': 'a workbench with tools and tools hanging on a wall'}, {'bbox': array([470, 163, 486, 331]), 'caption': 'a workbench with tools and tools hanging on a wall'}, {'bbox': array([ 38, 347,  59, 387]), 'caption': 'a workbench with tools and a lamp'}, {'bbox': array([108, 333, 129, 382]), 'caption': 'a workbench with tools and a lamp'}, {'bbox': array([ 12, 362,  39, 381]), 'caption': 'a 