In [1]:
from janus.models import MultiModalityCausalLM, VLChatProcessor
from transformers import AutoTokenizer
from PIL import Image
from pathlib import Path
import fitz
import torch
import time

# Configuração do dispositivo
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "deepseek-ai/Janus-Pro-7B"

# Carrega processor e modelo multimodal
processor = VLChatProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = MultiModalityCausalLM.from_pretrained(
    model_id, trust_remote_code=True,
    torch_dtype=torch.float16 if device=="cuda" else torch.float32
).to(device)

# Garante template de chat padrão
if getattr(processor, "chat_template", None) is None:
    processor.chat_template = [
        {"role":"system","content":"You are a helpful assistant that can read document pages and answer questions based on them."}
    ]


Python version is above 3.10, patching the collections module.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Some kwargs in processor config are unused and will not have any effect: num_image_tokens, add_special_token, mask_p

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def pdf_to_images(pdf_path: str, output_folder: str) -> None:
    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)
    pdf = fitz.open(pdf_path)
    for i, page in enumerate(pdf, start=1):
        out_path = output_dir / f"page_{i}.png"
        if out_path.exists():
            continue
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        img = Image.open(pix.tobytes("png")).convert("RGB")
        img = img.resize((384,384), resample=Image.LANCZOS)
        img.save(out_path, format="PNG")
        time.sleep(0.1)

In [3]:
def get_image_paths(folder: str) -> list[str]:

    pngs = [p for p in Path(folder).iterdir() if p.suffix.lower() == ".png"]
    pngs.sort(key=lambda p: int(p.stem.split("_")[1]))
    return [str(p) for p in pngs]

In [4]:
def resposta_textual(image_path: str, question: str, max_new_tokens=100) -> str:
    img = Image.open(image_path).convert("RGB")
    conversation = [
        {"role":"user","content":f"<image_placeholder>\n{question}"}
    ]
    processed = processor(
        conversations=conversation,
        images=[img],
        force_batchify=True,
        return_tensors="pt"
    ).to(device, dtype=torch.float16)
    
    inputs_embeds = model.prepare_inputs_embeds(
        input_ids=processed.input_ids,
        pixel_values=processed.pixel_values,
        images_seq_mask=processed.images_seq_mask,
        images_emb_mask=processed.images_emb_mask
    )
    with torch.no_grad():
        generated = model.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=processed.attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0].strip()


In [None]:

pdf_to_images("JCRIBEIRO.R02.TOBE.V5.92.pdf", "Documents_Image_Janus")
paths = get_image_paths("Documents_Image_Janus")

# modificar para a página que vc quer a resposta da pergunta feita abaixo
page_number = 106

image_path = paths[page_number - 1]  

# definir a pergunta que vc quer fazer da página (de preferencia em inglês)
pergunta = "Pode me dizer a principal ideia desta pagina?"

resposta = resposta_textual(image_path, pergunta)
print(f"Resposta para {Path(image_path).name}:\n{resposta}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Resposta para page_106.png:
Este é um exemplo de uma rede de trabalho em que os trabalhadores compartilham informaç�es e recursos.Este é um exemplo de uma rede de trabalho em que os trabalhadores compartilham informaç�es e recursos.
Este é um exemplo de uma rede de trabalho em que os trabalhadores compartilham informaç�es e recursos.
