In [24]:
from PIL import Image
import torch
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    CLIPProcessor,  # If you're using the updated processor instead of feature extractor
    CLIPModel
)


# Define the model names for encoder and decoder
encoder_model_name = "flaviagiammarino/pubmed-clip-vit-base-patch32"  # Example for your encoder
decoder_model_name = "gpt2"  # Define the decoder model name

# Initialize the decoder with GPT2LMHeadModel
decoder = GPT2LMHeadModel.from_pretrained(decoder_model_name)
tokenizer = AutoTokenizer.from_pretrained(decoder_model_name, clean_up_tokenization_spaces=True)

# Load the feature extractor for the encoder (PubMed CLIP ViT)
feature_extractor = CLIPImageProcessor.from_pretrained(encoder_model_name)

def preprocess_image(image_path):
    """Preprocess the image for the encoder."""
    from PIL import Image
    image = Image.open(image_path)
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    return pixel_values

def extract_image_embeddings(encoder, pixel_values):
    """Extract embeddings from the vision part of the PubMed CLIP ViT encoder."""
    with torch.no_grad():
        # Explicitly call only the vision model to extract image embeddings
        vision_outputs = encoder.vision_model(pixel_values=pixel_values)
        image_embeddings = vision_outputs.last_hidden_state  # Use the last hidden state
    return image_embeddings


def generate_caption(encoder, decoder, image_path, max_new_tokens=50, num_beams=4):
    # Load and preprocess the image
    image = Image.open(image_path)
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    
    # Extract image embeddings
    image_embeddings = extract_image_embeddings(encoder, pixel_values)

    # Project image embeddings to match the decoder input dimension
    input_ids = image_embeddings.view(1, -1)  # Adjust shape if necessary
    input_ids = input_ids.to(torch.long)  # Convert to LongTensor
    
    # Generate captions using the GPT-2 decoder model
    generated_ids = decoder.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        pad_token_id=tokenizer.eos_token_id  # Specify pad_token_id if needed
    )
    
    # Decode the generated captions to text
    generated_caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    return generated_caption


In [35]:
# Example usage:
image_path = "sample1.jpg"  # Replace with your image path
image = Image.open(image_path)
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    
# Extract image embeddings
image_embeddings = extract_image_embeddings(encoder, pixel_values)

# Project image embeddings to match the decoder input dimension
input_ids = image_embeddings.view(1, -1)  # Adjust shape if necessary
input_ids = input_ids.to(torch.long)  # Convert to LongTensor
print("Input IDs:", input_ids.max())

Input IDs: tensor(10)


In [32]:
example_text = "Your example input text here."
input_ids = tokenizer.encode(example_text, return_tensors='pt')
print("Example Input IDs:", input_ids)

Example Input IDs: tensor([[7120, 1672, 5128, 2420,  994,   13]])


In [26]:
vocab_size = tokenizer.vocab_size
print("Vocabulary Size:", vocab_size)


Vocabulary Size: 50257


In [25]:

caption = generate_caption(encoder, decoder, image_path)
print("Generated Caption:", caption)


IndexError: index out of range in self