In [24]:
# üì¶ Install dependencies (run once per environment)
!pip install pdf2image pillow requests tqdm
#Installs the tools to extract images (pdf2image), manipulate them (Pillow), and make HTTP requests to Ollama.



In [1]:
#Imports and configuration
import base64
import json
from pathlib import Path

import requests
from pdf2image import convert_from_path
from tqdm import tqdm

# ---- CONFIG ----
# Ollama settings: the model tag to run and the HTTP endpoint requests are posted to.
MODEL_NAME = "llava-phi3:latest"
OLLAMA_URL = "http://localhost:11434/api/generate"

# Filesystem settings: the local PDF to read and the directory meant for page images.
PDF_PATH = Path("../data/raw/test_pages25-28.pdf")
OUTPUT_DIR = Path("data/test")



In [2]:
# Short prompt asking for a concise, academic-style summary of a JASP screenshot or output.
# NOTE: a later cell assigns PROMPT_TEMPLATE again, so this value is replaced
# whenever the notebook is run top to bottom.
PROMPT_TEMPLATE = (
    "\n\n"
    "Summarize this image in a concise, academic tone relevant to a statistical software(JASP) interface screenshots or outputs."
    "\n\n"
)

In [None]:
# Detailed prompt asking the model for a structured description of a manual page.
# NOTE: this rebinds PROMPT_TEMPLATE, replacing the shorter summary prompt assigned
# in the earlier cell when the notebook is run top to bottom.
# The braces in the "Output format" section are literal text shown to the model;
# the string is passed to the model as-is, not run through str.format().
PROMPT_TEMPLATE = """
You are an assistant specialized in describing technical documentation pages.

Task:
Generate a detailed *description* of the given PDF page from the "Statistical Analysis in JASP" manual (2025).
Do NOT summarize or paraphrase — instead, capture and preserve all textual information, headings, interface labels, and relevant visual elements.

Focus on factual description for later instruction-based reasoning.

Include:
1. **Section title(s)** or visible headings.
2. **Verbatim textual content** if it’s short and informative (menus, options, captions, settings).
3. **Descriptions of tables, figures, screenshots, or UI workflows** — explain what they depict, their purpose, and notable visual elements (buttons, charts, plots, dialog boxes, menus, etc.).
4. **Contextual notes** — what the page teaches (e.g., “illustrates how to import CSV files into JASP”, “shows ANOVA options menu”).
5. **Any numbered steps, menu paths, or parameter settings** — retain them as text.
6. **Keywords** for retrieval and later task grounding.

Output format:
{
  "page_number": <page number>,
  "section_title": "...",
  "description": "...",
  "key_elements": ["tables", "figures", "menus", "commands"],
  "keywords": ["JASP", "data import", "menu", ...]
}

Style:
- Write in neutral, factual tone.
- Keep length flexible (prefer completeness over brevity).
- If visuals are present, describe their content and relation to the text.
- Do NOT paraphrase or shorten; preserve meaning and instructional details.
"""


In [37]:
# OCR-style prompt: asks the model to transcribe the page text verbatim.
# NOTE(review): the page-summary cell in this notebook passes PROMPT_TEMPLATE, not
# PROMPT — confirm whether this prompt is still used anywhere.
PROMPT = (
    "You are an OCR assistant. Carefully read every visible word, line, and paragraph in this document page image. "
    "Return all text content exactly as it appears, preserving line breaks and punctuation. "
    "If the page contains tables or lists, reproduce their text content in a readable format. "
    "Do not summarize or rephrase — output the extracted text only. "
    "If a page number or header/footer is visible, include it as well. "
    "Output the result in plain text under a single key named 'page_text'."
)


In [3]:
#These functions separate image conversion and summarization, making the pipeline modular.

def pdf_page_to_image(pdf_path, page_number, dpi=200, output_dir=Path("data/tmp_pages")):
    """
    Render one page of a PDF to a PNG file and return the path of that file.

    Args:
        pdf_path: Location of the PDF (str or Path); passed to ``convert_from_path``.
        page_number: 1-based number of the page to render (the first page is 1).
        dpi: Rendering resolution handed to ``convert_from_path``.
        output_dir: Directory (str or Path) that receives the PNG; created if missing.

    Returns:
        Path of the written image: ``<output_dir>/page_<page_number>.png``.

    Raises:
        ValueError: If ``page_number`` is below 1, or the conversion yields no page
            for that number.
    """
    # Reject impossible page numbers up front, before creating directories or
    # running the (comparatively slow) PDF conversion.
    if page_number < 1:
        raise ValueError(
            f"page_number must be >= 1 (pages are 1-based), got {page_number}"
        )

    # Accept plain strings as well as Path objects.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Restrict the conversion to the single requested page.
    pages = convert_from_path(pdf_path, dpi=dpi, first_page=page_number, last_page=page_number)
    if not pages:
        raise ValueError(f"No page {page_number} found in {pdf_path}")

    image_path = output_dir / f"page_{page_number}.png"
    pages[0].save(image_path, "PNG")
    return image_path


def summarize_image_with_llava(image_path, model, prompt, ollama_url=OLLAMA_URL, timeout=180):
    """
    Send an image and a prompt to a model served by Ollama and return its reply.

    Args:
        image_path: Path of the image file to send; its bytes are base64-encoded.
        model: Model name placed in the request payload.
        prompt: Instruction text placed in the request payload.
        ollama_url: URL the request is POSTed to.
        timeout: Seconds to wait for the HTTP response before ``requests`` gives up.

    Returns:
        The ``"response"`` field of the JSON reply with surrounding whitespace
        removed, or an empty string when that field is missing or null.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    with open(image_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode("utf-8")

    payload = {
        "model": model,
        "prompt": prompt,
        "images": [img_b64],
        # Ask for one complete JSON reply instead of a stream of chunks.
        "stream": False
    }

    response = requests.post(ollama_url, json=payload, timeout=timeout)
    response.raise_for_status()
    # `or ""` guards against a null "response" value, which has no .strip().
    return (response.json().get("response") or "").strip()


In [4]:
# Choose a PDF page to summarize (1-based: the first page of the PDF is 1)
page_number = 2

# Convert to image
img_path = pdf_page_to_image(PDF_PATH, page_number)

# Generate summary
summary = summarize_image_with_llava(img_path, MODEL_NAME, PROMPT_TEMPLATE)

print(f"🖼️ Page {page_number} summarized by {MODEL_NAME}:")
print("=" * 80)
print(summary)


🖼️ Page 2 summarized by llava-phi3:latest:
The image you've sent is a screenshot of an output from the JASP statistical software. The main feature of this graph is a bar chart that compares two groups over time, specifically "Age at First Sex" and "Incidence of HIV". The x-axis represents time in years, with 1990 being on the left side of the chart and 2015 on the right. There are four bars in total, representing data from 1990, 1995, 2000, and 2015.

The bar for "Age at First Sex" is taller than the one for "Incidence of HIV", indicating a difference between these two variables over time. The text on the right side of the image explains this: "There was no significant change in AFS over the 20 years but a significant increase in IHV". This suggests that while there wasn't a notable shift in age at first sexual activity, there was a significant rise in HIV incidence over the same period.

It's important to note that this is just an interpretation based on the visible elements of t