In [2]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModelForCausalLM

In [None]:
# Mount Google Drive at /content/drive. This cell requires the `google.colab`
# package to be importable.
# NOTE(review): the image path used at the end of this notebook is under
# /content, not /content/drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# image loading and preprocessing
def load_and_preprocess_image(image_path, model_name="openai/clip-vit-base-patch32"):
    """Load an image from disk and convert it into CLIP processor inputs.

    Args:
        image_path: Path of the image file to open with PIL.
        model_name: Checkpoint identifier passed to
            ``CLIPProcessor.from_pretrained``. Defaults to the checkpoint that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        An ``(inputs, processor)`` tuple: the processor output for the image
        (requested with ``return_tensors="pt"``) and the processor instance
        itself, so the caller can reuse it for the caption text.
    """
    # Open inside a context manager so the underlying file handle is released
    # deterministically; ``convert`` yields a separate image object that stays
    # usable after the source file has been closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    processor = CLIPProcessor.from_pretrained(model_name)
    inputs = processor(images=image, return_tensors="pt")
    return inputs, processor

In [None]:
# image understanding with CLIP
def generate_image_embeddings(inputs, model_name="openai/clip-vit-base-patch32"):
    """Compute CLIP image features for already-preprocessed inputs.

    Args:
        inputs: Mapping of keyword arguments that is unpacked into
            ``model.get_image_features`` (e.g. the output of a CLIP processor).
        model_name: Checkpoint identifier passed to
            ``CLIPModel.from_pretrained``. Defaults to the checkpoint that was
            previously hard-coded; it should match the checkpoint used to build
            ``inputs``.

    Returns:
        An ``(image_features, model)`` tuple: the value returned by
        ``get_image_features`` and the loaded model, so the caller can reuse
        the same model for text features.
    """
    model = CLIPModel.from_pretrained(model_name)

    # Feature extraction only: gradient tracking is disabled for this call.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)

    return image_features, model

In [None]:
# caption matching (using CLIP text embeddings)
def match_captions(image_features, captions, clip_model, processor):
    """Rank candidate captions by cosine similarity to an image embedding.

    The similarity is computed directly with ``torch`` so this function does
    not depend on an import executed in a different notebook cell.

    Args:
        image_features: Tensor of image embeddings. Only the first row is
            ranked against the captions; a 1-D tensor is treated as one row.
        captions: Sequence of caption strings to score.
        clip_model: Object exposing ``get_text_features(**text_inputs)``.
        processor: Callable that tokenizes ``captions`` when invoked as
            ``processor(text=..., return_tensors="pt", padding=True)``.

    Returns:
        A ``(best_captions, similarities)`` tuple: the captions ordered from
        most to least similar, and the matching similarity scores as Python
        floats. Captions with equal scores keep their input order. Both lists
        are empty when ``captions`` is empty.
    """
    # Nothing to rank: avoid handing an empty batch to the tokenizer.
    if len(captions) == 0:
        return [], []

    # 1. get text embeddings for the captions
    text_inputs = processor(text=captions, return_tensors="pt", padding=True)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs)

    # 2. cosine similarity == dot product of L2-normalised vectors
    image_vectors = image_features.detach().cpu().float()
    if image_vectors.dim() == 1:
        image_vectors = image_vectors.unsqueeze(0)
    text_vectors = text_features.detach().cpu().float()

    image_vectors = torch.nn.functional.normalize(image_vectors, dim=-1)
    text_vectors = torch.nn.functional.normalize(text_vectors, dim=-1)
    scores = (image_vectors @ text_vectors.T)[0].tolist()

    # 3. order caption indices from highest to lowest score (stable for ties)
    ranking = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    best_captions = [captions[i] for i in ranking]

    return best_captions, [scores[i] for i in ranking]

In [None]:
# main function
def image_captioning(image_path, candidate_captions):
    """Run the full pipeline: preprocess the image, embed it, rank captions.

    Args:
        image_path: Path of the image handed to ``load_and_preprocess_image``.
        candidate_captions: Captions forwarded to ``match_captions``.

    Returns:
        The ``(best_captions, similarities)`` pair produced by
        ``match_captions``.
    """
    # Stage 1: image -> processor inputs (the processor is reused for text).
    clip_inputs, clip_processor = load_and_preprocess_image(image_path)

    # Stage 2: processor inputs -> image embedding (the model is reused for text).
    embedding, model = generate_image_embeddings(clip_inputs)

    # Stage 3: score every candidate caption against the embedding.
    ranked_captions, scores = match_captions(
        embedding, candidate_captions, model, clip_processor
    )
    return ranked_captions, scores

In [None]:
# Candidate captions scored against the image; order is preserved as authored.
candidate_captions = [
    # --- mirror / selfie themed ---
    "Mirror, mirror on the wall...",
    "Serving looks from every angle.",
    "Reflection game strong.",
    "Catching my own vibe.",
    "Just me, myself, and I.",
    "Proof that I exist.",
    "Serving face and confidence.",
    "Mirror reflects, but the vibe is all me.",
    "Looking this good should be illegal.",
    "Stealing glances at myself.",
    "My own paparazzi moment.",
    "Confidence level: mirror selfie.",
    "No bad vibes, only good angles.",
    "Self-love, one glance at a time.",
    "The view is unbeatable.",
    "Main character energy unlocked.",
    "Reflecting on how great I look.",
    "Dressed for the selfie I deserve.",
    "My only competition is my reflection.",
    "Mirror, meet your new obsession.",
    "Just me, stealing my own spotlight.",
    "The reflection never lies.",
    "Checking out the competition-it's me.",
    "Angles on point, mood unmatched.",
    "Mirror magic in progress.",
    "Let the reflection do the talking.",
    "Caught myself staring again.",
    "The glow is real, the mirror agrees.",
    "Making mirrors proud, one selfie at a time.",
    "A reflection worth capturing.",
    "Double the beauty, one snap.",
    "This reflection deserves a standing ovation.",
    "Dressed up, just for the mirror.",
    "When the mirror loves you back.",
    "Front row seat to my own show.",
    "This mirror knows all my secrets.",
    "No filter, just vibes.",
    "Staring contest with myself-guess who wins?",
    "In my reflection era.",
    "Mirror, I think we're soulmates.",
    # --- quiet / introspective themed ---
    "Where words fail, my introspection speaks.",
    "In the hush, I bloom.",
    "Dreaming awake in a room full of asleep.",
    "Books, coffee, silence: The introvert's symphony.",
    "Eyes that see beyond the noise.",
    "Every pause, a story waiting to unfold.",
    "Fluent in silence, with a minor in deep thoughts.",
    "Beyond the chaos, lies my tranquil cosmos.",
    "Savoring the sweet symphony of stillness.",
    "Not shy, just selectively social.",
    "Decoding the world, one quiet moment at a time.",
    "Some souls are painted in the hues of quiet.",
    "Deep dives in the ocean of thoughts.",
    "Finding beauty in the pauses.",
    "Sip in solitude, exhale serenity.",
    "Crafting wonders from whispers.",
    # --- coffee themed (mostly) ---
    "But first, coffee.",
    "I'm sorry for what I said before my morning coffee.",
    "Another day, another cup of coffee.",
    "Dear coffee, you were in my dreams last night.",
    "Too much morning, not enough coffee.",
    "Mornings are better with delicious breakfast food and coffee.",
    "I've bean thinking about you a latte.",
    "This is my resting coffee face.",
    "Relationship status: Looking for a tall, dark, rich cup of coffee.",
    "When life gives you lemons, trade them for coffee.",
    "Coffee before talkie, please.",
    "Guess what? It's coffee o'clock.",
    "May your coffee be hot and your eyeliner be slayin'.",
    "My birthstone is a coffee bean.",
    "I like big cups of coffee and I cannot lie.",
    "Today's good mood is sponsored by coffee.",
    "Make some coffee and own the day.",
    "Inhale the caffeine, exhale the negativity.",
    "Hit me with your best espresso shot.",
    "I don't need an inspirational quote. I need a coffee.",
    "Change the world. Start with coffee.",
    "I'll take a coffee with my sunshine.",
    "A caffeine queen in her natural habitat.",
    "All you need is love and good coffee.",
    "It's a brew-tiful day.",
    "Life can be tough, coffee helps.",
    "Wake up and smell the coffee.",
    "come out and play",
    "Do you get deja brew?",
    "You mocha me so happy.",
]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Tunable inputs for this run, named so they are easy to spot and change.
IMAGE_PATH = "/content/Testing jpg file.jpg"
MAX_RESULTS = 5

best_captions, similarities = image_captioning(IMAGE_PATH, candidate_captions)

# Keep at most MAX_RESULTS entries (fewer when there are fewer candidates).
top_n = min(MAX_RESULTS, len(best_captions))
top_best_captions = best_captions[:top_n]
top_similarities = similarities[:top_n]

# Report the number of captions actually shown rather than a fixed "5".
print(f"Top {top_n} Best Captions:")
for rank, (caption, similarity) in enumerate(zip(top_best_captions, top_similarities), start=1):
    print(f"{rank}. {caption} (Similarity: {similarity:.4f})")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Top 5 Best Captions:
1. Mirror reflects, but the vibe is all me. (Similarity: 0.2550)
2. Making mirrors proud, one selfie at a time. (Similarity: 0.2446)
3. Confidence level: mirror selfie. (Similarity: 0.2435)
4. Dressed for the selfie I deserve. (Similarity: 0.2425)
5. My only competition is my reflection. (Similarity: 0.2404)
