In [3]:


from transformers import AutoProcessor, AutoModelForCausalLM  
from PIL import Image
import requests
import copy
import torch
%matplotlib inline  



In [4]:
# Checkpoint shared by the processor and the model below.
model_id = 'microsoft/Florence-2-large'

# Both loaders are called with trust_remote_code=True.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Load the weights (dtype left to 'auto'), switch to eval mode, then move to the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype='auto',
    trust_remote_code=True,
)
model = model.eval().cuda()

In [5]:


def run_example(task_prompt, text_input=None):
    """Run a single task prompt against the notebook-level `image`.

    Relies on the globals `model`, `processor` and `image` being defined
    by earlier cells.

    Args:
        task_prompt: Task token string (e.g. '<OCR>'). It is also passed as
            the `task` argument to `processor.post_process_generation`.
        text_input: Optional extra text; when given it is concatenated
            directly after `task_prompt` to form the prompt.

    Returns:
        The value returned by `processor.post_process_generation` for the
        first decoded sequence.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Follow the model's real device/dtype instead of hard-coding
    # 'cuda' / float16: the model is loaded with torch_dtype='auto', so its
    # precision is not guaranteed to be float16.
    reference_param = next(model.parameters())
    # Assumes `.to(device, dtype)` leaves the integer input_ids un-cast, which
    # the previous hard-coded `.to('cuda', torch.float16)` call already relied on.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(
        reference_param.device, reference_param.dtype
    )

    # Inference only: make sure no autograd state is recorded while generating.
    with torch.no_grad():
        generated_ids = model.generate(
            # Tensors are already on the model's device; no further moves needed.
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )

    # Special tokens are kept in the decoded text that is handed to post-processing.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )

    return parsed_answer



In [19]:
# Open inside a context manager so the underlying file handle is released as
# soon as the RGB copy has been made; `image` is the converted copy that
# run_example / natural_vqa read as a global.
with Image.open('image.png') as _source_image:
    image = _source_image.convert("RGB")

In [30]:
# Free-form question sent through run_example with the '<VQA>' task token.
# NOTE(review): in the output recorded below this cell, the model echoes the
# question text and still appends <loc_*> tokens, so the in-prompt instruction
# did not suppress them.
run_example('<VQA>', text_input='what Q number is this? Please answer in ONE short word and DO NOT output any <loc_*> tokens or bounding boxes. Answer:')

{'<VQA>': 'VQA>what Q number is this? Please answer in ONE short word and DO NOT output any <loc_*> tokens or bounding boxes. Answer:<loc_0><loc_0><loc_999><loc_999>'}

{'<VQA>': 'VQA>what section are we on on<loc_0><loc_0><loc_999><loc_999>'}

In [17]:
# Task tokens to sweep over the current `image`, in the order they are run.
# NOTE(review): the grounding / region / open-vocabulary entries look like they
# expect extra text or region input, but every task is run with the bare
# token here — confirm that is intended.
tasks = [
    # text reading
    '<OCR>',
    '<OCR_WITH_REGION>',
    # captioning
    '<CAPTION>',
    '<DETAILED_CAPTION>',
    '<MORE_DETAILED_CAPTION>',
    # detection and dense captioning
    '<OD>',
    '<DENSE_REGION_CAPTION>',
    # grounding and segmentation
    '<CAPTION_TO_PHRASE_GROUNDING>',
    '<REFERRING_EXPRESSION_SEGMENTATION>',
    '<REGION_TO_SEGMENTATION>',
    '<OPEN_VOCABULARY_DETECTION>',
    # region-level queries
    '<REGION_TO_CATEGORY>',
    '<REGION_TO_DESCRIPTION>',
    '<REGION_TO_OCR>',
    '<REGION_PROPOSAL>',
]

# Print the parsed result of each task, one per line.
for task in tasks:
    print(run_example(task))

{'<OCR>': "Digital signature Flow:\nNot writing, much since\nA\nRSA key generation: CreaSida: Not writing, such since\nD Generate P.g.ofcompute m.e.d. j.xegistex le (d) with tp(withTPE\n8) Digital signature = E (PRA(X)\nDigital signature h=SHA256(x) (X) x xis maisDitsecret)\nand convertit\n(a) Compute hayh = hd mod n = using signet's private d)\n(b) Signi: sig(sig(s)\nto inteigneHt\n(c) Transmit (x sig) = valu(s))\nVerification (c)\nVerification (x Sig(s)(x) = values) and H=int. from bytes(ch), 'big)\n1) Compule n = siga(s(x)) and using: signals public key (e(n)\n3) Recover V = sig(modn = n= H=1modn) = signals publick key (c(n))\nAccept signature if n=H-modn\nattention to be practicn\n3\nAuthenticated Handshake (SS1-H=1-Style) = seccently than aspertoably\nC)\n- sender(CA):\n. Generate diffe tellman secret S & hashit hs = SHA256(s).\nD Sign with A's private key : sig+Sigma(hs) = hs2 modn = E(PRA)\nForm anenuelofofa payload (sig+s) = sig+sigma(sis) = decimals(g)\n2) Encrypt payload with

In [44]:
!pip freeze > requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [37]:
import re
from transformers import AutoTokenizer

# Cache the tokenizer and the bad_words_ids list once for natural_vqa.
# Prefer the tokenizer attached to the already-loaded `processor`; fall back to
# loading one only when `processor` is missing or has no `tokenizer` attribute.
try:
    _tokenizer = processor.tokenizer
except (NameError, AttributeError):
    _tokenizer = AutoTokenizer.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

# One single-token entry per vocabulary item whose text starts with "<".
# get_vocab() is called exactly once here; previously it was re-invoked for
# every matching token inside the comprehension.
_bad_words_ids = [
    [token_id]
    for token, token_id in _tokenizer.get_vocab().items()
    if token.startswith("<")
]

def natural_vqa(question, max_new_tokens=64, num_beams=4, block_angle_brackets=True):
    """Ask a free-form question about the notebook-level `image`.

    Relies on the globals `processor`, `model`, `image` and `_bad_words_ids`.

    Args:
        question: Text inserted into the "Question: ... Answer:" prompt.
        max_new_tokens: Forwarded to `model.generate`.
        num_beams: Forwarded to `model.generate`.
        block_angle_brackets: When true and `_bad_words_ids` is non-empty,
            that list is passed to `model.generate` as `bad_words_ids`.

    Returns:
        A dict with "raw" (first decoded sequence, special tokens kept) and
        "clean" (the same text after regex clean-up).
    """
    encoded = processor(
        text=f"Question: {question} Answer:",
        images=image,
        return_tensors="pt",
    )

    # Take device and dtype from the model's first parameter.
    reference_param = next(model.parameters())
    target_device = reference_param.device
    target_dtype = reference_param.dtype

    generation_args = {
        # Token ids are only moved to the device; their dtype is left alone.
        "input_ids": encoded["input_ids"].to(target_device),
        # Pixel values are moved and cast to the model's parameter dtype.
        "pixel_values": encoded["pixel_values"].to(device=target_device, dtype=target_dtype),
        "max_new_tokens": max_new_tokens,
        "num_beams": num_beams,
        "do_sample": False,
        "early_stopping": True,
    }
    if block_angle_brackets and _bad_words_ids:
        generation_args["bad_words_ids"] = _bad_words_ids

    sequences = model.generate(**generation_args)
    raw = processor.batch_decode(sequences, skip_special_tokens=False)[0]

    # Drop every <...> span, then trim surrounding whitespace.
    without_tags = re.sub(r"<[^>]+>", "", raw).strip()
    # Remove a leading "VQA>" / "QA>" fragment if one is left at the start.
    cleaned = re.sub(r"^(VQA>|QA>|<VQA>)\s*", "", without_tags, flags=re.IGNORECASE)
    return {"raw": raw, "clean": cleaned}



In [43]:
# Smoke-test natural_vqa with its default generation settings.
# NOTE(review): the output recorded below this cell is a run of repeated
# "question:" / "answer:" fragments rather than a short answer.
natural_vqa('this is Q. number')

{'raw': '</s><s>Q. number Answer (i) question: "This is Q. number answer: "this is Q." question: \'this is q. question: This is Q.\' question: this is Q \'s question:</s>',
 'clean': 'Q. number Answer (i) question: "This is Q. number answer: "this is Q." question: \'this is q. question: This is Q.\' question: this is Q \'s question:'}