# Project - Building an Image Narration Tool

---

## Approach 1: Using Hugging Face Transformers pipelines

### Step 1: Object Detection from Image

In [None]:
! pip install transformers torch pillow requests tqdm iprogress

In [1]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Fail fast on a hung connection or an HTTP error page instead of handing PIL a broken stream.
image_response = requests.get(url, stream=True, timeout=30)
image_response.raise_for_status()
image = Image.open(image_response.raw)

# you can specify the revision tag if you don't want the timm dependency
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")

inputs = processor(images=image, return_tensors="pt")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

# Convert the raw outputs (bounding boxes and class logits) into per-image detections,
# rescaled to the original image size. PIL reports (width, height); the post-processor
# is given (height, width), hence the reversal.
# Only detections with score > 0.9 are kept.
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
    )


  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|██████████████████████████████████████| 530/530 [00:00<00:00, 858.79it/s, Materializing param=model.query_position_embeddings.weight]


Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]


Exception in thread Thread-auto_conversion:
Traceback (most recent call last):
  File [35m"C:\Users\smadduri\AppData\Roaming\uv\python\cpython-3.14.2-windows-x86_64-none\Lib\threading.py"[0m, line [35m1082[0m, in [35m_bootstrap_inner[0m
    [31mself._context.run[0m[1;31m(self.run)[0m
    [31m~~~~~~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^[0m
  File [35m"C:\Users\smadduri\AppData\Roaming\uv\python\cpython-3.14.2-windows-x86_64-none\Lib\threading.py"[0m, line [35m1024[0m, in [35mrun[0m
    [31mself._target[0m[1;31m(*self._args, **self._kwargs)[0m
    [31m~~~~~~~~~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
  File [35m"D:\ai\project\.venv\Lib\site-packages\transformers\safetensors_conversion.py"[0m, line [35m117[0m, in [35mauto_conversion[0m
    raise e
  File [35m"D:\ai\project\.venv\Lib\site-packages\transformers\safetensors_conversion.py"[0m, line [35m96[0m, in [35mauto_conversion[0m
    sha = get_conversion_pr_reference(api, pretrained_model_name_or_path,

### Step 2: Extract Labels and Convert to Text

In [2]:
from collections import Counter

# Plurals that the generic suffix rules in _pluralize_label would get wrong.
_IRREGULAR_PLURALS = {
    "person": "people",
    "knife": "knives",
    "mouse": "mice",
    "sheep": "sheep",
    "broccoli": "broccoli",
    "skis": "skis",
    "scissors": "scissors",
}


def _pluralize_label(name):
    """Return the English plural of a (possibly multi-word) label; only the last word is inflected."""
    head, _, last = name.rpartition(" ")
    if last in _IRREGULAR_PLURALS:
        plural = _IRREGULAR_PLURALS[last]
    elif last.endswith(("s", "x", "z", "ch", "sh")):
        plural = f"{last}es"
    elif len(last) > 1 and last.endswith("y") and last[-2] not in "aeiou":
        plural = f"{last[:-1]}ies"
    else:
        plural = f"{last}s"
    return f"{head} {plural}" if head else plural


def generate_natural_language(results, model):
    """Turn object-detection results into a one-sentence English summary.

    Args:
        results: Mapping with a "labels" entry. Each element must expose
            ``.item()`` returning a class id.
        model: Object whose ``config.id2label`` maps class ids to label names.

    Returns:
        A sentence listing each detected label with its count, in order of
        first appearance (e.g. "In this image, I can see 2 remotes, 1 couch
        and 2 cats."), or a fallback message when no labels are present.
    """
    # 1. Map every detected class id to its human-readable label
    detected_labels = [model.config.id2label[label.item()] for label in results["labels"]]

    if not detected_labels:
        return "I scanned the image but didn't find any objects with high confidence."

    # 2. Count the occurrences of each object (Counter keeps first-seen order)
    counts = Counter(detected_labels)

    # 3. Format into a list of strings (e.g., "2 cats", "2 couches", "1 couch")
    formatted_items = [
        f"{count} {_pluralize_label(obj) if count > 1 else obj}"
        for obj, count in counts.items()
    ]

    # 4. Join them into a natural sentence
    if len(formatted_items) == 1:
        return f"In this image, I can see {formatted_items[0]}."

    # Commas between the leading items and an 'and' before the last one
    return f"In this image, I can see {', '.join(formatted_items[:-1])} and {formatted_items[-1]}."

# Build the sentence from the `results` and `model` objects created in Step 1, then show it.
text_summary = generate_natural_language(results=results, model=model)
print(text_summary)

In this image, I can see 2 remotes, 1 couch and 2 cats.


### Step 3: Text-to-Speech (TTS)

In [None]:
! pip install transformers scipy

In [3]:
from transformers import AutoProcessor, AutoModel

# Load the Bark checkpoint and its matching processor.
# NOTE: this rebinds `processor` and `model`, replacing the DETR objects created in Step 1.
BARK_CHECKPOINT = "suno/bark-small"
processor = AutoProcessor.from_pretrained(BARK_CHECKPOINT)
model = AutoModel.from_pretrained(BARK_CHECKPOINT)

# Encode the Step 2 sentence as a one-item batch of PyTorch tensors.
inputs = processor(text=[text_summary], return_tensors="pt")

# Sampling is enabled and no seed is set in this cell.
speech_values = model.generate(do_sample=True, **inputs)


Loading weights: 100%|███████████████████████████████████████| 542/542 [00:00<00:00, 880.11it/s, Materializing param=semantic.position_embeds_layer.weight]
Passing `generation_config` together with generation-related arguments=({'do_sample', 'min_eos_p'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=768) and `max_length`(=20) seem to have been set. `max_new_tokens` will take preced

In [4]:
from IPython.display import Audio

# Playback rate comes from the TTS model's generation config.
sampling_rate = model.generation_config.sample_rate

# Move the generated audio to CPU, convert to NumPy and drop singleton dimensions for the player.
waveform = speech_values.cpu().numpy().squeeze()
Audio(waveform, rate=sampling_rate)


---

## Approach 2: Using the Google GenAI library and Hugging Face Transformers

### Step 1: Image Captioning using Google GenAI SDK

In [None]:
! pip install tiktoken google-genai

In [5]:
import os

from google import genai
from google.genai import types

# Read the key from the environment so it is never stored in (or committed with) the notebook.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable before running this cell."
    )

client = genai.Client(api_key=api_key)


system_prompt = """
"You are a helpful AI Assistant. Given an image perform object detection and provide a text output which contains the information about the labels detected and their counts."
Always start your response with the exact phrase: "Here is what I see in the image:".
Make the output look a little more in english
"""

response = client.models.generate_content(
    model="gemini-3-flash-preview",
    config=types.GenerateContentConfig(
        system_instruction=system_prompt
    ),
    # The system prompt says "Given an image", so send the image itself together with
    # the detector's summary; with text alone the model can only guess at the scene.
    contents=[image, f"Describe these objects: {text_summary}"],
)

print(response.text)

Here is what I see in the image:

- There are **two cats** resting comfortably.
- I also see **two remotes** lying on the surface.
- They are all on **one couch**.


### Step 2: Text-to-Speech (TTS)

In [None]:
! pip install scipy

In [6]:
import re

from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModel.from_pretrained("suno/bark-small")

# The Gemini reply is Markdown (list bullets, **bold**). Strip the markup and collapse
# line breaks so only the words to be spoken reach the TTS model.
narration_text = response.text or ""
narration_text = re.sub(r"^\s*[-*+]\s+", "", narration_text, flags=re.MULTILINE)
narration_text = re.sub(r"[*`#]+", "", narration_text)
narration_text = " ".join(narration_text.split())

if not narration_text:
    raise ValueError("The Gemini response contained no text to narrate.")

inputs = processor(
    text=[narration_text],
    return_tensors="pt",
)

speech_values = model.generate(**inputs, do_sample=True)


Loading weights: 100%|███████████████████████████████████████| 542/542 [00:00<00:00, 801.22it/s, Materializing param=semantic.position_embeds_layer.weight]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
Both `max_new_tokens` (=768) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=60) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=60) and `max_length`(=20) seem to have been set. `max_new_tokens` will t

In [7]:
from IPython.display import Audio

# Playback rate comes from the TTS model's generation config.
sampling_rate = model.generation_config.sample_rate

# Move the generated audio to CPU, convert to NumPy and drop singleton dimensions for the player.
waveform = speech_values.cpu().numpy().squeeze()
Audio(waveform, rate=sampling_rate)
