In [3]:
import fitz  # PyMuPDF
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import io

# Function to generate captions for images using BLIP
def generate_caption(image):
    """Generate a caption for an image with the BLIP captioning model.

    The processor and model are loaded once on first use and reused on
    later calls, so captioning many images does not reload the checkpoint
    for every image.

    Args:
        image: The image to describe; it is passed to the BLIP processor
            unchanged.

    Returns:
        The decoded caption string, with special tokens removed.
    """
    processor, model = _load_blip_captioner()

    # Preprocess the image into PyTorch tensors
    inputs = processor(image, return_tensors="pt")

    # Inference only: no gradients are needed for generation
    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Decode the first (only) generated sequence
    return processor.decode(outputs[0], skip_special_tokens=True)


# Checkpoint used for both the processor and the model
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# (processor, model) pair, filled in by the first call to _load_blip_captioner
_blip_captioner = None


def _load_blip_captioner():
    """Return the shared (processor, model) pair, loading it on first use."""
    global _blip_captioner
    if _blip_captioner is None:
        processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
        model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
        _blip_captioner = (processor, model)
    return _blip_captioner

# Function to extract images from PDF and generate captions
def extract_images_from_pdf(pdf_path):
    """Caption every embedded image in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A dict mapping labels of the form "Image <page>.<index>" (both
        1-based) to the caption returned by generate_caption. Pages
        without images contribute no entries.
    """
    descriptions = {}
    # The same embedded image (xref) can be referenced from several pages;
    # caption each distinct image only once and reuse the result.
    captions_by_xref = {}

    # The context manager closes the document even if captioning fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            image_list = page.get_images(full=True)

            for img_index, image_info in enumerate(image_list, start=1):
                xref = image_info[0]  # first field of each entry is the xref

                if xref not in captions_by_xref:
                    extracted = pdf_document.extract_image(xref)
                    image_bytes = extracted["image"]

                    # Decode the raw bytes with PIL and caption with BLIP
                    with Image.open(io.BytesIO(image_bytes)) as pil_image:
                        captions_by_xref[xref] = generate_caption(pil_image)

                label = f"Image {page_number}.{img_index}"
                descriptions[label] = captions_by_xref[xref]

    return descriptions

# Example usage: caption every image in the sample PDF
pdf_path = "data/Finance_Project.pdf"  # Replace with your PDF file path
captions = extract_images_from_pdf(pdf_path)

# Report one "<label>: <caption>" line per image, in insertion order
for image_context in captions:
    caption = captions[image_context]
    print(f"{image_context}: {caption}")



Image 29.1: a line graph showing the number of different types of the different species
Image 30.1: a chart showing the trend in the price of commodities
Image 31.1: a line graph shows the number of people who have been in the past
Image 33.1: a graph with the number of people in each region
