In [1]:
!pip install torch torchvision transformers accelerate pillow sentencepiece

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [12]:
!pip install -q torch torchvision transformers pillow requests gradio accelerate

import os
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import gradio as gr
import requests
from io import BytesIO
import numpy as np

# Disable tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Initialize device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model configuration
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
MAX_LENGTH = 64
NUM_BEAMS = 4

# Load model components
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load model with appropriate precision (half precision only on GPU)
model = VisionEncoderDecoderModel.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# Configure model
# The decoder is GPT-2, whose tokenizer normally defines no CLS token, so
# `cls_token_id` may be None. Prefer CLS when present, fall back to BOS, and
# never overwrite the checkpoint's own start token with None.
decoder_start_id = tokenizer.cls_token_id
if decoder_start_id is None:
    decoder_start_id = tokenizer.bos_token_id
if decoder_start_id is not None:
    model.config.decoder_start_token_id = decoder_start_id

# Same guard for padding: fall back to EOS if the tokenizer has no pad token.
pad_id = tokenizer.pad_token_id
if pad_id is None:
    pad_id = tokenizer.eos_token_id
if pad_id is not None:
    model.config.pad_token_id = pad_id

# Stored on the config because process_image() reads them back from there.
model.config.max_length = MAX_LENGTH
model.config.num_beams = NUM_BEAMS

def process_image(image, max_length=None, num_beams=None):
    """Generate a caption for an image.

    Args:
        image: An http(s) URL or local file path (str), a NumPy array, or an
            object exposing ``convert`` (e.g. a PIL image). Any other object
            is handed to the image processor unchanged.
        max_length: Optional cap on generated length. Defaults to
            ``model.config.max_length``.
        num_beams: Optional beam-search width. Defaults to
            ``model.config.num_beams``.

    Returns:
        The decoded caption. If any ``Exception`` occurs (or ``image`` is
        ``None``), a string starting with ``"Error: "`` is returned instead
        of raising.
    """
    try:
        if image is None:
            return "Error: no image provided"

        # Convert input to PIL Image
        if isinstance(image, str):
            if image.startswith(("http://", "https://")):
                response = requests.get(image, timeout=10)
                # Fail on 4xx/5xx rather than trying to decode an error page as an image.
                response.raise_for_status()
                img = Image.open(BytesIO(response.content)).convert("RGB")
            else:
                # Context manager closes the file handle once pixels are loaded.
                with Image.open(image) as source:
                    img = source.convert("RGB")
        elif isinstance(image, np.ndarray):
            img = Image.fromarray(image).convert("RGB")
        elif hasattr(image, "convert"):
            img = image.convert("RGB")
        else:
            img = image

        # Resolve generation settings; explicit arguments win over the model config.
        gen_max_length = model.config.max_length if max_length is None else int(max_length)
        gen_num_beams = model.config.num_beams if num_beams is None else int(num_beams)

        # Process image; match the model's dtype so inputs agree with the loaded weights.
        pixel_values = feature_extractor(
            images=img,
            return_tensors="pt"
        ).pixel_values.to(device=device, dtype=model.dtype)

        with torch.no_grad():
            output_ids = model.generate(
                pixel_values,
                max_length=gen_max_length,
                num_beams=gen_num_beams,
                early_stopping=True
            )

        return tokenizer.decode(output_ids[0], skip_special_tokens=True)

    except Exception as e:
        return f"Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Image Caption Generator") as demo:
    gr.Markdown(f"""
    # 🖼️ Image Captioning
    **Model:** `{MODEL_NAME}`  
    **Device:** `{device.upper()}`  
    **Max Length:** `{MAX_LENGTH}` | **Beams:** `{NUM_BEAMS}`
    """)

    with gr.Row():
        image_input = gr.Image(label="Upload Image", type="pil")
        output_text = gr.Textbox(label="Generated Caption", lines=4)

    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            max_length = gr.Slider(16, 128, value=MAX_LENGTH, step=1, label="Max Length")
            num_beams = gr.Slider(1, 8, value=NUM_BEAMS, step=1, label="Number of Beams")

    def generate_caption(img, ml, beams):
        """Caption the uploaded image using the current slider settings.

        Args:
            img: Value of the image component, or ``None`` when it was cleared.
            ml: Maximum generation length from the "Max Length" slider.
            beams: Beam-search width from the "Number of Beams" slider.

        Returns:
            The decoded caption, an empty string when there is no image, or a
            string starting with ``"Generation error: "`` if generation fails.
        """
        # Clearing the image also fires `change`; show an empty caption, not an error.
        if img is None:
            return ""

        try:
            # Defensive: the image processor expects 3-channel input.
            if hasattr(img, "convert"):
                img = img.convert("RGB")

            # Match the model's dtype so inputs agree with the loaded weights.
            pixel_values = feature_extractor(
                images=img,
                return_tensors="pt"
            ).pixel_values.to(device=device, dtype=model.dtype)

            with torch.no_grad():
                output_ids = model.generate(
                    pixel_values,
                    # Slider values may arrive as floats; generate() needs ints.
                    max_length=int(ml),
                    num_beams=int(beams),
                    early_stopping=True
                )

            return tokenizer.decode(output_ids[0], skip_special_tokens=True)
        except Exception as e:
            return f"Generation error: {str(e)}"

    # Re-caption whenever the image is uploaded, replaced, or cleared.
    image_input.change(
        fn=generate_caption,
        inputs=[image_input, max_length, num_beams],
        outputs=output_text
    )

    gr.Examples(
        examples=[
            ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png"],
            ["https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"]
        ],
        inputs=[image_input],
        outputs=[output_text],
        fn=lambda img: generate_caption(img, MAX_LENGTH, NUM_BEAMS),
        cache_examples=False
    )

# Launch the interface
try:
    demo.launch(debug=False)
except Exception as e:
    print(f"Interface error: {e}")
    # Release whatever the failed attempt may have started before retrying,
    # so the second launch does not collide with a half-started server.
    demo.close()
    # Retry in debug mode to get more diagnostics for the failure.
    demo.launch(debug=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using device: cuda


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float16",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

* Running on local URL:  http://127.0.0.1:7864
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://09cf85aa05a949d1cd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
