In [None]:
!pip install einops torchvision


In [6]:
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import requests

# Run Molmo-7B-D on a single local image with a pointing prompt and print
# the text the model generates.

MODEL_ID = 'allenai/Molmo-7B-D-0924'
IMAGE_PATH = "/home/ttyh/myhot3d/dataset/images/coffecup.jpg"
PROMPT = "Point to the centre of each coffee pod."

# NOTE(review): trust_remote_code=True executes Python files downloaded from
# the Hub repo. Consider pinning `revision=` to a reviewed commit so a later
# upstream change cannot silently alter the code that runs here.

# load the processor
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

# load the model
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

# Read the image inside a context manager so the file handle is released,
# and normalise it to RGB: convert() loads the pixel data and returns a new
# image, so `image` stays usable after the file is closed.
with Image.open(IMAGE_PATH) as image_file:
    image = image_file.convert("RGB")

# process the image and text
inputs = processor.process(
    images=[image],
    text=PROMPT
)

# move inputs to the correct device and make a batch of size 1
inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

# generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
output = model.generate_from_batch(
    inputs,
    GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
    tokenizer=processor.tokenizer
)

# only get generated tokens (drop the prompt prefix); decode them to text
prompt_length = inputs['input_ids'].size(1)
generated_tokens = output[0, prompt_length:]
generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

# print the generated text
print(generated_text)

# For this pointing prompt the reply is a <points ...> tag, e.g.
# >>>  <points x1="38.8" y1="43.5" x2="41.2" y2="50.0" ...
#      alt="centre of each coffee pod.">centre of each coffee pod.</points>
# NOTE(review): the x/y values are presumably percentages of the image width
# and height -- confirm against the Molmo model card before using them.


processor_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

preprocessing_molmo.py:   0%|          | 0.00/6.56k [00:00<?, ?B/s]

image_preprocessing_molmo.py:   0%|          | 0.00/22.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/allenai/Molmo-7B-D-0924:
- image_preprocessing_molmo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/allenai/Molmo-7B-D-0924:
- preprocessing_molmo.py
- image_preprocessing_molmo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


preprocessor_config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/14.3k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

config_molmo.py:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/allenai/Molmo-7B-D-0924:
- config_molmo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_molmo.py:   0%|          | 0.00/95.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/allenai/Molmo-7B-D-0924:
- modeling_molmo.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/4.27G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/3.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

 <points x1="38.8" y1="43.5" x2="41.2" y2="50.0" x3="50.2" y3="52.5" x4="51.2" y4="44.2" x5="61.2" y5="49.2" alt="centre of each coffee pod.">centre of each coffee pod.</points>
