In [1]:
import os

# Ensure transformers avoids TensorFlow
os.environ["TRANSFORMERS_NO_TF"] = "1"

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import pytesseract


pytesseract.pytesseract.tesseract_cmd = r"C:/Program Files/Tesseract-OCR"


def run_ocr(image_path):
    """
    Perform OCR on the given image and return extracted text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text reported by Tesseract with leading/trailing whitespace
        stripped. If opening the image or running Tesseract raises, the
        exception is not propagated; a string starting with
        "Error reading image for OCR: " is returned instead.
    """
    try:
        # The context manager releases the image's file handle as soon as OCR
        # is done, including when Tesseract raises.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        return f"Error reading image for OCR: {str(e)}"

# Loaded BLIP (processor, model) pairs keyed by device, so repeated captions
# do not reload the checkpoint on every call.
_BLIP_CACHE = {}


def _load_blip(device):
    """Return the (processor, model) pair for ``device``, loading it on first use."""
    if device not in _BLIP_CACHE:
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        # Only store the pair once both loads and the device move succeeded.
        _BLIP_CACHE[device] = (processor, model.to(device))
    return _BLIP_CACHE[device]


def generate_caption(image_path, device="cpu"):
    """
    Generate an image caption using the BLIP model.

    Args:
        image_path: Path of the image file to caption.
        device: Device the model and inputs are moved to (default "cpu").

    Returns:
        The decoded caption (special tokens skipped, at most 50 newly
        generated tokens). If anything raises along the way, the exception
        is not propagated; a string starting with "Error generating caption: "
        is returned instead.
    """
    try:
        # Reuse an already-loaded processor/model for this device if there is one.
        processor, model = _load_blip(device)

        # Open and process image; the handle is released once the RGB copy exists.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)

        out = model.generate(**inputs, max_new_tokens=50)
        caption = processor.decode(out[0], skip_special_tokens=True)
        return caption

    except Exception as e:
        return f"Error generating caption: {str(e)}"

def main():
    """
    Interactive entry point: prompt for an image path and a device, then
    print the BLIP caption, the OCR text and a combined description.

    Returns early, after printing an error line, when the entered path is
    not an existing regular file.
    """
    print("="*60)
    print("🔍 OCR + Image Captioning Tool")
    print("="*60)

    # Get image path from user. Paths pasted from a file manager are often
    # wrapped in quotes, so drop surrounding quotes as well as whitespace.
    image_path = input("Enter the path to your image file:\n> ").strip().strip("\"'").strip()

    # isfile (not exists): a directory would pass an existence check and then
    # fail in both the OCR and the captioning step.
    if not os.path.isfile(image_path):
        print(f"ERROR: File not found → {image_path}")
        return

    # Ask for device choice; anything other than 'cpu'/'cuda' (including an
    # empty answer) falls back to CPU.
    device = input("Run on CPU or CUDA? (type 'cpu' or 'cuda') [cpu]:\n> ").strip().lower()
    if device not in ["cpu", "cuda"]:
        device = "cpu"

    print("\nProcessing your image... Please wait.\n")

    # Run OCR
    ocr_text = run_ocr(image_path)

    # Run captioning
    caption = generate_caption(image_path, device=device)

    # Display results
    print("="*70)
    print("IMAGE CAPTION:\n", caption)
    print("="*70)
    print("OCR TEXT:\n", ocr_text if ocr_text else "[No text detected]")
    print("="*70)

    # Combine into a single description; avoid a dangling "reads:" sentence
    # when OCR found nothing.
    if ocr_text:
        combined = f"The image shows: {caption}. The text in the image reads: {ocr_text}"
    else:
        combined = f"The image shows: {caption}. No text was detected in the image."
    print("\nCOMBINED DESCRIPTION:\n", combined)
    print("="*70)

# Start the interactive tool when this code is executed as the main module.
if __name__ == "__main__":
    main()

🔍 OCR + Image Captioning Tool


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



Processing your image... Please wait.

IMAGE CAPTION:
 a quote that says if you don ' t like you ' re, you ' re
OCR TEXT:
 Error reading image for OCR: [WinError 5] Access is denied

COMBINED DESCRIPTION:
 The image shows: a quote that says if you don ' t like you ' re, you ' re. The text in the image reads: Error reading image for OCR: [WinError 5] Access is denied


In [2]:
# 📦 Import Libraries
from PIL import Image
import pytesseract
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import os

# ✅ Load Image
image_path = "sample1.jpg"
assert os.path.exists(image_path), f"❌ Image not found: {image_path}"
# Convert to RGB once; the OCR and captioning calls below both use this image.
image = Image.open(image_path).convert("RGB")

# ✅ Device setup: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🚀 Using device: {device}")

# ✅ Load processor and model ONCE globally
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT, use_fast=True)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(device)

# --- Part 1: OCR using Tesseract ---
def perform_ocr(img: Image.Image) -> str:
    """Run Tesseract on ``img`` and return its text without surrounding whitespace."""
    raw_text = pytesseract.image_to_string(img)
    cleaned = raw_text.strip()
    return cleaned

# --- Part 2: Image Captioning using BLIP ---
def generate_caption(img: Image.Image) -> str:
    """
    Caption ``img`` with the module-level BLIP ``processor`` and ``model``.

    The processor output is moved to the module-level ``device``, generation
    runs with gradient tracking disabled, and the first generated sequence is
    decoded with special tokens skipped.
    """
    encoded = processor(images=img, return_tensors="pt")
    encoded = encoded.to(device)

    with torch.no_grad():
        token_ids = model.generate(**encoded)

    return processor.decode(token_ids[0], skip_special_tokens=True)

# --- Run both tasks ---
ocr_text = perform_ocr(image)
caption_text = generate_caption(image)

# --- Display Results ---
# Each print emits the section header and its value on consecutive lines;
# an empty OCR result is shown as a placeholder.
print("\n--- 📝 OCR Result ---", ocr_text or "[No text detected]", sep="\n")

print("\n--- 🖼️ Image Caption ---", caption_text, sep="\n")

🚀 Using device: cpu

--- 📝 OCR Result ---
If yOu

youre walking,

—Dolly Parton

Prevention

--- 🖼️ Image Caption ---
a quote that says if you don ' t like you ' re, you ' re
