In [1]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Checkpoint identifier handed to every from_pretrained() call below.
# Keeping it in one constant ensures the model, image processor and tokenizer
# are always loaded from the same checkpoint.
MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
feature_extractor = ViTImageProcessor.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Keyword arguments that predict_step() forwards to model.generate().
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
    """Generate captions for a batch of images.

    Args:
        image_paths: Iterable of image sources, each passed to
            ``PIL.Image.open`` (the notebook passes file-name strings).

    Returns:
        list: Caption strings decoded from the generated token ids, with
        special tokens skipped and surrounding whitespace stripped.
        An empty list when ``image_paths`` yields nothing.
    """
    images = []
    for image_path in image_paths:
        # Open inside a context manager so a file handle Pillow opened itself
        # is released deterministically instead of lingering until GC.
        with Image.open(image_path) as i_image:
            # convert() returns a copy, so the pixel data stays usable after
            # the source image is closed; it also normalises the mode to RGB.
            images.append(i_image.convert(mode="RGB"))

    if not images:
        # Nothing to caption: skip the processor and model call entirely.
        return []

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]


# Demo call: caption one sample image; the returned list is the cell's output.
# NOTE(review): the Streamlit cell imports predict_step from `image_to_text`;
# if this cell is what gets exported as that module, this call would also run
# at import time -- confirm and guard it if so.
predict_step(["girl2.jpg"])


  from .autonotebook import tqdm as notebook_tqdm
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop":

['a woman in a bikini standing on a beach']

In [None]:
import html

import streamlit as st
from PIL import Image

from image_to_text import predict_step

# --- Page configuration ---
# Sets the browser-tab title and icon and selects the centered layout.
# NOTE(review): kept ahead of every other st.* call; Streamlit's docs have
# historically required set_page_config to come first -- confirm for the
# installed version before reordering.
st.set_page_config(
    page_title="What on the Image ?",
    page_icon="🖼️",
    layout="centered"
)

# --- CSS styles ---
# Injects a <style> block through st.markdown; unsafe_allow_html=True is passed
# so the markup is emitted as HTML. The .title, .subtitle, .upload-box and
# .caption-box classes defined here are the ones used by the <div> snippets
# rendered elsewhere in this script.
st.markdown(
    """
    <style>
        .stApp {
            background-color: #f5f7fa;
        }
        .title {
            text-align: center;
            font-size: 2.5em;
            font-weight: bold;
            color: #4f8bf9;
            margin-bottom: 10px;
        }
        .subtitle {
            text-align: center;
            font-size: 1.2em;
            color: #6c757d;
            margin-bottom: 30px;
        }
       .upload-box {
            border: 2px dashed #6c757d;
            border-radius: 10px;
            padding: 50px;
            text-align: center;
            background-color: #f8f9fa;
            font-size: 18px;
            color: #6c757d;
        }
        .caption-box {
            text-align: center;
            font-size: 1.2em;
            margin-top: 20px;
            color: #4f8bf9;
        }
    </style>
    """,
    unsafe_allow_html=True
)

# --- Header ---
# Title and subtitle are rendered as raw HTML so the CSS classes above apply.
st.markdown('<div class="title">🖼️ Image Captioning App</div>', unsafe_allow_html=True)
st.markdown('<div class="subtitle">Upload an image to generate a caption</div>', unsafe_allow_html=True)

# --- Image upload ---
# File picker limited to the listed extensions; `uploaded_file` is compared
# against None further down to decide whether a file has been provided.
uploaded_file = st.file_uploader(
    "Upload an image",
    type=["jpg", "png", "jpeg"],
    key="file_uploader"
)

# --- Main application block ---
if uploaded_file is not None:
    # A file has been provided: show a preview, then offer caption generation.
    st.markdown('<div class="upload-box">Image Preview</div>', unsafe_allow_html=True)
    image = Image.open(uploaded_file)
    # NOTE(review): newer Streamlit releases deprecate `use_column_width` in
    # favour of other sizing parameters -- confirm against the installed version.
    st.image(image, caption="Uploaded Image", use_column_width=True)

    if st.button("Generate Caption 📝"):
        with st.spinner("Generating caption..."):
            # predict_step takes a list of image sources and returns a list of
            # captions; the uploaded file object is passed as the only item.
            captions = predict_step([uploaded_file])
            # The caption is model output derived from a user-supplied image
            # and is injected into raw HTML below, so escape it first to keep
            # characters such as "<" or "&" from being interpreted as markup.
            caption_text = html.escape(captions[0])
            st.markdown(
                f'<div class="caption-box">📝 Caption: <b>{caption_text}</b></div>',
                unsafe_allow_html=True,
            )
else:
    # No file yet: show the placeholder drop-zone hint.
    st.markdown('<div class="upload-box">Drop your image file here or click to upload</div>', unsafe_allow_html=True)

