In [None]:
!pip install gradio transformers datasets torchvision

In [None]:
import os
import pandas as pd
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import gradio as gr
from tqdm import tqdm

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files referenced below are reachable.
drive.mount('/content/drive')

# -------------------- Paths --------------------
# Directory handed to CLIPModel.from_pretrained / CLIPProcessor.from_pretrained.
MODEL_PATH = "/content/drive/My Drive/CLIP_Project/Model_Files"
# Pipe-separated captions file read with pd.read_csv.
CSV_PATH = "/content/drive/My Drive/CLIP_Project/Data/results.csv"
# Directory that each image_name is joined with to build the image file path.
IMG_DIR = "/content/drive/My Drive/CLIP_Project/Data/Images"
# Upper bound on how many images are sampled into the retrieval index.
NUM_IMAGES = 1000

Mounted at /content/drive


In [None]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# -------------------- Load Model --------------------
# Read the CLIP weights stored under MODEL_PATH, then move them to the
# selected device as a separate step.
model = CLIPModel.from_pretrained(MODEL_PATH)
model = model.to(device)

# The matching processor is loaded from the same directory.
processor = CLIPProcessor.from_pretrained(MODEL_PATH)

# Put the module in evaluation mode. Kept as the cell's last expression so
# the notebook still echoes the model architecture.
model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [None]:
# -------------------- Load and Prepare Dataset --------------------
# Read the pipe-delimited captions file.
df = pd.read_csv(CSV_PATH, sep='|', engine='python', skip_blank_lines=True)

# Header names carry stray whitespace around the separator; trim them.
df = df.rename(columns=str.strip)

# Normalise the three columns used downstream to whitespace-trimmed strings.
df = df.assign(**{
    name: df[name].astype(str).str.strip()
    for name in ('image_name', 'comment', 'comment_number')
})

# Function to safely parse comment_number integer from messy strings
def parse_comment_number(x):
    """Return the integer at the start of ``x``, or -1 if it cannot be parsed.

    Args:
        x: Raw ``comment_number`` cell, expected to be a string whose first
            whitespace-separated token is an integer (e.g. ``"0"`` or
            ``"3 trailing junk"``).

    Returns:
        int: The parsed number, or ``-1`` for empty strings, non-numeric
        tokens, or non-string inputs.
    """
    try:
        # Only the first whitespace-separated token is considered.
        return int(x.split()[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: x has no .split (e.g. None); IndexError: blank string;
        # ValueError/TypeError: token is not an integer literal.
        # Deliberately NOT a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate while the column is being parsed.
        return -1  # invalid values become -1

# Convert the raw comment_number strings to ints (-1 marks unparseable rows).
df['comment_number'] = df['comment_number'].apply(parse_comment_number)

# Keep only rows where comment_number == 0 (first comment per image).
# .copy() detaches the filtered rows from the original frame; without it the
# column assignment below writes into a slice and pandas emits
# SettingWithCopyWarning.
df = df[df['comment_number'] == 0].copy()

# Add full image file path
df['filepath'] = df['image_name'].apply(lambda x: os.path.join(IMG_DIR, x))

# Filter only rows where image file exists
df = df[df['filepath'].apply(os.path.exists)]

# Drop duplicate images just in case
df = df.drop_duplicates(subset='image_name')

# Sample N images or all if less than N; the fixed random_state keeps the
# selection (and therefore the row order) reproducible between runs.
df = df.sample(n=min(NUM_IMAGES, len(df)), random_state=42).reset_index(drop=True)

# Final lists: index i in image_paths and captions refers to the same row.
image_paths = df['filepath'].tolist()
captions = df['comment'].tolist()

print(f"Prepared {len(image_paths)} images and captions.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['filepath'] = df['image_name'].apply(lambda x: os.path.join(IMG_DIR, x))


Prepared 1000 images and captions.


In [None]:
# -------------------- Precompute Embeddings --------------------
# Number of image/caption pairs pushed through the model per forward pass.
# Defined once so the image slice and the caption slice cannot drift apart.
BATCH_SIZE = 16


def _load_rgb(path):
    """Open the image at ``path``, return an RGB copy, and close the file."""
    with Image.open(path) as img:
        # convert() returns a new, fully loaded image, so it stays valid
        # after the context manager closes the underlying file handle.
        return img.convert("RGB")


if not image_paths:
    # torch.cat() on an empty list fails with an opaque error; fail early
    # with a message that points at the real cause instead.
    raise ValueError("image_paths is empty: there is nothing to embed.")

image_embeddings = []
text_embeddings = []

print("Computing embeddings...")

with torch.no_grad():
    for start in tqdm(range(0, len(image_paths), BATCH_SIZE)):
        stop = start + BATCH_SIZE
        batch_imgs = [_load_rgb(p) for p in image_paths[start:stop]]
        batch_texts = captions[start:stop]
        inputs = processor(text=batch_texts, images=batch_imgs, return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model(**inputs)
        # L2-normalise so a plain dot product later equals cosine similarity.
        img_embed = torch.nn.functional.normalize(outputs.image_embeds, p=2, dim=1)
        txt_embed = torch.nn.functional.normalize(outputs.text_embeds, p=2, dim=1)
        # Collect on the CPU; the retrieval functions compare against CPU tensors.
        image_embeddings.append(img_embed.cpu())
        text_embeddings.append(txt_embed.cpu())

# Stack the per-batch results into one matrix per modality (row i <-> image_paths[i] / captions[i]).
image_embeddings = torch.cat(image_embeddings)
text_embeddings = torch.cat(text_embeddings)

print("Embeddings ready!")

Computing embeddings...


100%|██████████| 63/63 [09:07<00:00,  8.70s/it]

Embeddings ready!





In [None]:
# Persist both embedding matrices to Drive so later sessions can reload
# them instead of recomputing.
for _tensor, _destination in (
    (image_embeddings, '/content/drive/My Drive/CLIP_Project/Data/Image_Embeddings.pt'),
    (text_embeddings, '/content/drive/My Drive/CLIP_Project/Data/Text_Embeddings.pt'),
):
    torch.save(_tensor, _destination)

In [None]:
# Load embeddings without recomputing:
# - weights_only=True restricts unpickling to tensor data, so a tampered file
#   on Drive cannot execute arbitrary code through pickle.
# - map_location="cpu" keeps the matrices on the CPU, where the retrieval
#   functions compute similarities, regardless of the device they were saved from.
image_embeddings = torch.load(
    '/content/drive/My Drive/CLIP_Project/Data/Image_Embeddings.pt',
    map_location="cpu",
    weights_only=True,
)
text_embeddings = torch.load(
    '/content/drive/My Drive/CLIP_Project/Data/Text_Embeddings.pt',
    map_location="cpu",
    weights_only=True,
)

# Row i of each matrix must correspond to image_paths[i] / captions[i]. A cache
# built from a different sample would silently return wrong (or out-of-range)
# matches, so refuse to continue on a size mismatch.
if image_embeddings.shape[0] != len(image_paths) or text_embeddings.shape[0] != len(captions):
    raise ValueError(
        f"Cached embeddings ({image_embeddings.shape[0]} image rows, "
        f"{text_embeddings.shape[0]} text rows) do not match the prepared dataset "
        f"({len(image_paths)} images, {len(captions)} captions); recompute them."
    )

In [None]:
# -------------------- Retrieval Functions --------------------

def image_to_text(img):
    """Return the indexed caption whose embedding is most similar to ``img``.

    Args:
        img: Query image in a form accepted by the CLIP processor (the Gradio
            input is configured to deliver a PIL image), or ``None`` when
            nothing was uploaded.

    Returns:
        str: The best-matching entry of ``captions``, or an empty string when
        ``img`` is ``None``.
    """
    # Gradio calls the handler with None when the user submits without
    # uploading; the processor would raise on that, so answer with no caption.
    if img is None:
        return ""

    with torch.no_grad():
        inputs = processor(images=img, return_tensors="pt").to(device)
        query_embed = model.get_image_features(**inputs)
        # Normalise and move to the CPU so the dot product with the stored
        # (normalised, CPU) text embeddings is a cosine similarity.
        query_embed = torch.nn.functional.normalize(query_embed, p=2, dim=1).cpu()
        # (1, N) -> (N,): drop only the query dimension.
        similarities = (query_embed @ text_embeddings.T).squeeze(0)
        best_idx = similarities.argmax().item()
        return captions[best_idx]

def text_to_image(text):
    """Return the path of the indexed image that best matches ``text``.

    The query is encoded with the CLIP text tower, L2-normalised, and scored
    against every row of ``image_embeddings`` by dot product; the path at the
    highest-scoring row of ``image_paths`` is returned.
    """
    with torch.no_grad():
        encoded = processor(text=text, return_tensors="pt", padding=True, truncation=True).to(device)
        features = model.get_text_features(**encoded)
        features = torch.nn.functional.normalize(features, p=2, dim=1).cpu()
        scores = torch.matmul(features, image_embeddings.T).squeeze()
        return image_paths[scores.argmax().item()]

In [None]:
# -------------------- Gradio Interface --------------------

from PIL import Image

# Input widgets: an image uploader that delivers PIL images, and a caption box.
image_input = gr.Image(label="Upload an Image", type="pil")
text_input = gr.Textbox(label="Enter Caption")

# Image → Text tab: the uploaded image is passed to image_to_text and the
# returned caption is shown in a textbox.
iface_img2txt = gr.Interface(
    title="Image to Text Retrieval",
    fn=image_to_text,
    inputs=image_input,
    outputs=gr.Textbox(label="Matched Caption"),
)

# Text → Image handler
def text_to_image_output(text):
    """Look up the best-matching image for ``text`` and return it opened with PIL."""
    return Image.open(text_to_image(text))

# Text → Image tab: the typed caption is passed to text_to_image_output and
# the returned picture is rendered in an image component.
iface_txt2img = gr.Interface(
    title="Text to Image Retrieval",
    fn=text_to_image_output,
    inputs=text_input,
    outputs=gr.Image(label="Matched Image"),
)

# Expose both retrieval directions as tabs of a single app.
interface = gr.TabbedInterface(
    interface_list=[iface_img2txt, iface_txt2img],
    tab_names=["Image to Text", "Text to Image"],
)

In [None]:
# Start the tabbed Gradio app. No arguments are passed, so Gradio's defaults
# apply; the recorded output shows that on Colab it enabled share=True by itself.
interface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://aa1ce5e21a61c1ae5f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


