# Image Retrieval with CLIP

In [1]:
# Import packages
import numpy as np
import torch
from PIL import Image
import pandas as pd
import os
import open_clip
from tqdm.notebook import tqdm
import random
import matplotlib.pyplot as plt

## Download Image Dataset
This notebook uses the image data and annotations provided by the FWF-funded project [Ottoman Nature in Travelogues, 1501–1850: A Digital Analysis (ONiT)](https://onit.oeaw.ac.at/). The images that we will be using were extracted from travelogues about the Ottoman Empire in English, French, German and Latin language that were printed between 1501 and 1850 and survived in the holdings of the [Austrian National Library (ÖNB) in Vienna](https://www.onb.ac.at/). The images have been scanned in the course of the [Austrian Books Online](https://www.onb.ac.at/digitale-angebote/austrian-books-online) project – a public private partnership between the ÖNB and Google Books.

1. Download the ZIP and CSV files from the following URL: https://1drv.ms/f/c/869f28ab041d44d9/ErTeV8fMekFCvtgexDOCxZoBCxCoxLJYUBOjam6rrwBSdw?e=IiKiVF
2. Unzip the image data folder and save it, together with the CSV file, in your Downloads folder (`~/Downloads`), which is where the code below expects to find them
3. Inspect the image data. Which types of images do you see?

## Data Preprocessing
### Load Data

This cell loads the data from the downloaded CSV file. Filenames, paths, and other metadata of the 786 images in the downloaded subset are stored in lists. These will be used later to preview and load the images with the data loader and compute the image embeddings with CLIP.

In [2]:
# ONiT data paths
onit_data_dir = '~/Downloads/curated_images_2024-11_hackathon_subset'
onit_data = pd.read_csv('~/Downloads/ONiT_singleEd-images-curated_annotated_subset_2024-11.csv', sep=",", na_filter=False)

# Expand "~" once instead of once per row.
onit_image_root = os.path.expanduser(onit_data_dir)

# Pull the metadata columns straight out of the dataframe. All four lists
# share the dataframe's row order, so position i refers to the same image
# in each of them.
# The barcode is not re-derived from the filename here: it is already
# available as the "barcode" column, and slicing the filename by position
# fails on filenames shorter than ten characters.
onit_filenames = onit_data["filename"].tolist()
onit_paths = [os.path.join(onit_image_root, name) for name in onit_filenames]
lang_year = onit_data["lang_year"].tolist()
labels = onit_data["label_filled"].tolist()

print(len(onit_filenames))

786


### Inspect data
In the Pandas dataframe below, inspect the loaded metadata for each image file.

- **filename**: Filename of the corresponding image file.
- **barcode**: Code of the physical book at the Austrian National Library (ÖNB) from which the image was extracted.
- **iiif**: Link to the extracted image on the ÖNB IIIF server.
- **lang_year**: Alphanumeric code describing language and century of the printed book (e.g. D16 means German, 16th century).
- **label_filled**: Groundtruth with annotations of nature classes per image done in context of the ONiT project.

In [3]:
# Inspect loaded data - the CSV contains the metadata for each image file.
onit_data

Unnamed: 0,filename,barcode,iiif,lang_year,label_filled
0,Z156577207_00009_page9_01.jpg,Z156577207,https://iiif.onb.ac.at/images/ABO/Z156577207/0...,D16,"plants; vegetation, 25G*|animals, 25F*|birds, ..."
1,Z15958790X_00126_page126_01.jpg,Z15958790X,https://iiif.onb.ac.at/images/ABO/Z15958790X/0...,D16,"landscapes, 25H"
2,Z160260500_00003_page3_01.jpg,Z160260500,https://iiif.onb.ac.at/images/ABO/Z160260500/0...,D16,"animals, 25F|plants; vegetation, 25G*|fabulous..."
3,Z160260500_00033_page33_01.jpg,Z160260500,https://iiif.onb.ac.at/images/ABO/Z160260500/0...,D16,"plants; vegetation, 25G"
4,Z160260500_00229_page229_01.jpg,Z160260500,https://iiif.onb.ac.at/images/ABO/Z160260500/0...,D16,"plants; vegetation, 25G*"
...,...,...,...,...,...
781,Z184478408_00111_page111_01.jpg,Z184478408,https://iiif.onb.ac.at/images/ABO/Z184478408/0...,L18,"plants; vegetation, 25G"
782,Z184478408_00131_page131_01.jpg,Z184478408,https://iiif.onb.ac.at/images/ABO/Z184478408/0...,L18,"plants; vegetation, 25G"
783,Z184478500_00075_page75_01.jpg,Z184478500,https://iiif.onb.ac.at/images/ABO/Z184478500/0...,L18,"plants; vegetation, 25G"
784,Z184478500_00105_page105_01.jpg,Z184478500,https://iiif.onb.ac.at/images/ABO/Z184478500/0...,L18,"plants; vegetation, 25G"


### Load image preprocessing parameters and data loader

Image preprocessing parameters are necessary to load the images in the correct size and ratio into the CLIP model. The pad function keeps the aspect ratio of the original images.

In [4]:
# Import image preprocessing packages
import torchvision.transforms as transforms
import torchvision.transforms.functional as F

# Define custom image preprocessing parameters
BATCH_SIZE = 128   # number of images per forward pass
IMAGE_SIZE = 224   # square side length (pixels) the images are resized to
# Per-channel normalization statistics used by CLIP / open_clip checkpoints.
# These differ from the ImageNet statistics; the image encoder should be fed
# inputs normalized the same way as during its pre-training.
MEAN = [0.48145466, 0.4578275, 0.40821073]
STD = [0.26862954, 0.26130258, 0.27577711]

# Define pad function (add black borders to keep aspect ratio)
class SquarePad:
    """Pad a PIL image with black borders so that it becomes exactly square.

    The image content is centered. When the difference between width and
    height is odd, the extra pixel goes to the right/bottom border so the
    result is always `max(w, h)` on both sides.
    """

    def __call__(self, image):
        """Return `image` padded to a square of side `max(width, height)`."""
        w, h = image.size
        side = max(w, h)
        left = (side - w) // 2
        top = (side - h) // 2
        # Give the remainder to the opposite border; halving both sides
        # independently would lose one pixel for odd differences.
        right = side - w - left
        bottom = side - h - top
        # Padding order expected by F.pad: (left, top, right, bottom).
        return F.pad(image, (left, top, right, bottom), 0, 'constant')

# Image pipeline fed to the CLIP image encoder:
#   1. pad to a square (keeps the aspect ratio),
#   2. resize to IMAGE_SIZE x IMAGE_SIZE,
#   3. convert to a tensor,
#   4. normalize each channel with MEAN / STD.
preprocess = transforms.Compose(
    [
        SquarePad(),
        transforms.Resize(size=(IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=MEAN, std=STD),
    ]
)

Dataset class and data loader are defined below to correctly load the images into the CLIP model.

In [5]:
from torch.utils.data import Dataset, DataLoader

# Dataset class & Data Loader
class OnitFull(Dataset):
    """Map-style dataset that yields preprocessed ONiT images by index.

    Args:
        onit_paths: Sequence of image file paths; item `i` of the dataset is
            the image stored at `onit_paths[i]`.
        transform: Callable applied to the RGB PIL image. If omitted, the
            notebook-level `preprocess` pipeline is used.
    """

    def __init__(self, onit_paths, transform=None):
        self.onit_paths = onit_paths
        # Honor the caller's transform; fall back to the global pipeline so
        # that `OnitFull(onit_paths)` keeps working as before.
        self.preprocess = transform if transform is not None else preprocess

    def __len__(self):
        """Return the number of image paths."""
        return len(self.onit_paths)

    def __getitem__(self, idx):
        """Load the image at position `idx`, convert to RGB and preprocess it.

        Raises:
            FileNotFoundError: If the image file does not exist. Returning
                None here would make batch collation fail with an unrelated
                error, and dropping the item would shift every later
                embedding relative to `onit_paths`.
        """
        image_path = self.onit_paths[idx]
        try:
            with open(image_path, 'rb') as f:
                image = Image.open(f).convert("RGB")
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"Image file {image_path} not found. "
                "Check that the image folder was unzipped to the expected location."
            ) from err
        return self.preprocess(image)

# Build the dataset over all image paths and wrap it in a loader.
# shuffle=False keeps the batch order aligned with `onit_paths`.
onit_full_dataset = OnitFull(onit_paths=onit_paths)
onitdata_loader = DataLoader(
    dataset=onit_full_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

## Load model and get features

1. Go to Huggingface and select the model that you want to use. In this example, we are going to use the CLIP-ViT-B-32 model pretrained on the [LAION-2B English subset of LAION-5B](https://laion.ai/blog/laion-5b/) dataset. You can find the model here: https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K
2. Click on the button "Use this model" to copy the code for loading the model via Huggingface in your Python script. You can also directly open it in a Google Colab or Kaggle notebook.
3. In the cell below, the code to load the CLIP model is already inserted. Before loading the model, make sure to set the device (i.e. if the model is run on GPU or CPU). The code below automatically checks if a GPU is available. If a GPU is available, the device is automatically set to CUDA. Otherwise the CPU is used.

In [6]:
## https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K?library=open_clip

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: ", device)

# Load model
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:laion/CLIP-ViT-B-32-laion2B-s34B-b79K', device=device)

# The notebook only runs inference: switch to evaluation mode so that any
# train-time layers behave deterministically.
model.eval()

Device:  cpu


4. The get features function below loads the images with the data loader defined above in batches and embeds all images. Depending on the device used, this can take several minutes.
5. Once the embeddings are calculated, store them in a numpy file so that you can load them later without having to calculate the features again.

In [7]:
# Get features of ONiT dataset with loaded image encoder of fine-tuned/pre-trained model

# Get Features function
def get_features(onit_full_dataset):
    """Embed every image of a dataset with the loaded CLIP image encoder.

    Args:
        onit_full_dataset: Dataset yielding preprocessed image tensors. An
            already constructed `DataLoader` is accepted as well and is
            iterated as-is.

    Returns:
        numpy.ndarray with one row of image features per dataset item, in
        dataset order.
    """
    # Iterate the data that was actually passed in rather than a global loader.
    if isinstance(onit_full_dataset, DataLoader):
        loader = onit_full_dataset
    else:
        # shuffle=False keeps row i of the result aligned with dataset item i.
        loader = DataLoader(onit_full_dataset, batch_size=BATCH_SIZE, shuffle=False)

    image_features = []

    with torch.no_grad():
        for image_prep in tqdm(loader):
            imfeat = model.encode_image(image_prep.to(device))  ## Pre-trained model
            # Move each batch off the device right away so the accelerator
            # does not have to hold the features of the whole dataset.
            image_features.append(imfeat.cpu())

    return torch.cat(image_features).numpy()

# Calculate image features for the full dataset; the result is a NumPy array
# (get_features converts the concatenated tensor with .numpy()).
image_features = get_features(onit_full_dataset)

  0%|          | 0/7 [00:00<?, ?it/s]

In [9]:
# Persist the computed image embeddings so later runs can load them from disk
np.save(
    file="onit_images_CURATED_embeddings_openCLIP.npy",
    arr=image_features,
)

# Image retrieval with natural language prompt
## Load tokenizer and image vectors

Once the image embeddings are calculated, we can start with the image retrieval using vectors. Before that, we have to set the correct tokenizer for the natural language prompt and load our embeddings (or image features) from the numpy file.

To find similar vectors, we will use cosine similarity. We will import it from the Scikit Learn library.

1. Load the tokenizer to preprocess the natural language prompts before passing them to the language transformer.
2. Load the image embeddings and scikit learn cosine similarity.

In [7]:
# Load the tokenizer that matches the pretrained CLIP checkpoint.
# `open_clip` is already imported at the top of the notebook. Importing the
# `open_clip.tokenizer` module under the same name is not needed, because the
# name `tokenizer` is bound to the tokenizer callable below.
tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-B-32-laion2B-s34B-b79K')

In [8]:
# Load image features
imagesCurated = "onit_images_CURATED_embeddings_openCLIP.npy"

loaded_image_features = np.load(imagesCurated)
image_embeddings = torch.tensor(loaded_image_features, device=device)

print(len(image_embeddings))
print(len(onit_paths))

# Row i of the embeddings is later looked up as onit_paths[i]. A stale or
# foreign .npy file would silently attach results to the wrong images.
assert len(image_embeddings) == len(onit_paths), (
    "Number of stored embeddings does not match the number of image paths; "
    "recompute the embeddings for the current dataset."
)

# Import image similarity
from sklearn.metrics.pairwise import cosine_similarity

786
786


## Retrieval based on cosine similarity

Now that we have set our prompt, we need to pass it to CLIP's language transformer.

1. First, the text prompt is tokenized and encoded with the language transformer. Since it is just one text vector, this is really fast.
2. In the next step, the cosine similarity is calculated between the text embedding and all image embeddings previously computed/loaded.
3. To find the similar indices, the indices are sorted by the resulting similarities. A ranking with the highest similarity scores on top is created.

In [9]:
# Example query; ground-truth class: "horses and kindred animals, 46C1314"
text_prompt = "An image of a horse"

# Tokenize first, then run the text encoder without tracking gradients
prompt_tokens = tokenizer(text_prompt).to(device)
with torch.no_grad():
    text_embedding = model.encode_text(prompt_tokens)

# Show the shapes of the query vector and of the image matrix
print("Text embedding shape [number of vectors, dimensions]: ", text_embedding.shape)
print("Image embeddings shape [number of vectors, dimensions]: ", image_embeddings.shape)

Text embedding shape [number of vectors, dimensions]:  torch.Size([1, 512])
Image embeddings shape [number of vectors, dimensions]:  torch.Size([786, 512])


In [10]:
# Cosine similarity of the single text vector against every image vector;
# the result has one row (the prompt) and one column per image.
similarities = cosine_similarity(text_embedding.cpu(), image_embeddings.cpu())
similarities_tensor = torch.tensor(similarities[0], device=device)

# Image indices ordered from most to least similar
similar_images_indices = torch.argsort(similarities_tensor, descending=True)

# (rank, image index) pairs, rank 0 being the best match
similar_ranks = [
    (rank, image_index)
    for rank, image_index in enumerate(similar_images_indices.tolist())
]

## Preview retrieval results

Finally, we can try our own vector similarity search!

**The following cell creates a simple interface to inspect the first N retrieved images. You can inspect the rank, similarity score, filename, and language/century of each image retrieved.** In the search bar below, enter a text prompt describing the type of images that you want to look for and press the search button. The model works best in English, but also German prompts are able to yield good results.

Like in the code above, the string is passed to the language transformer and vectorized before calculating the cosine similarity. The resulting ranking sorts the image indices by similarity, as displayed in the code above. We can now inspect the results to check which images are the most similar to our text prompt according to the CLIP model used.

In [11]:
## Run to start interactive CLIP image search & preview of top N results ##

import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, clear_output

# --- UI widgets ---
# Free-text prompt field
text_box = widgets.Text(
    description="Prompt:",
    value="An image of a dog",
    placeholder="Enter text prompt",
    layout=widgets.Layout(width='70%'),
)
# Number of top-ranked images to display
top_n_slider = widgets.IntSlider(
    description='Top N:',
    value=36,
    min=4,
    max=100,
    step=4,
    continuous_update=False,
    style={'description_width': '70px'},
)
# Button that triggers the search
search_button = widgets.Button(
    description="Search",
    icon='search',
    button_style='success',
    tooltip="Search for similar images",
)
# Area that receives the printed text and the figure
output = widgets.Output()

# --- main search function ---
def on_search_clicked(b, _text_box=text_box, _slider=top_n_slider, _out=output):
    """Encode the typed prompt, rank all images by cosine similarity, plot the top N.

    Args:
        b: The button instance passed in by ipywidgets (unused).
        _text_box, _slider, _out: Widgets of this search UI, captured when the
            function is defined. Capturing them here keeps the callback tied
            to its own widgets even if the global names `top_n_slider` or
            `output` are rebound later in the notebook.
    """
    with _out:
        clear_output(wait=True)
        text_prompt = _text_box.value.strip()
        if not text_prompt:
            print("⚠️ Please enter a text prompt.")
            return

        print(f"Searching for images similar to: '{text_prompt}'")

        with torch.no_grad():
            text_embedding = model.encode_text(tokenizer(text_prompt).to(device))

        similarities = cosine_similarity(text_embedding.cpu(), image_embeddings.cpu())
        similarities_tensor = torch.tensor(similarities[0], device=device)

        similar_images_indices = torch.argsort(similarities_tensor, descending=True)
        # Never ask for more images than exist in the collection.
        top_n = min(_slider.value, len(similar_images_indices))

        # --- visualization ---
        cols = 6
        rows = (top_n + cols - 1) // cols  # ceiling division
        # squeeze=False always yields a 2-D axes array, so flatten() is safe.
        fig, axes = plt.subplots(rows, cols, figsize=(12, 2 * rows), squeeze=False)
        axes = axes.flatten()

        for i in range(top_n):
            index = similar_images_indices[i].item()
            with Image.open(onit_paths[index]) as img:
                axes[i].imshow(img)
            axes[i].axis("off")
            axes[i].set_title(
                f"Rank: {i+1}\nSim: {similarities_tensor[index].item():.4f}\n{onit_filenames[index][:25]}\n{lang_year[index]}",
                fontsize=8
            )

        # hide unused axes
        for j in range(top_n, len(axes)):
            axes[j].axis("off")

        plt.tight_layout()
        plt.show()

# --- bind button ---
# Run the search whenever the button is pressed
search_button.on_click(on_search_clicked)

# --- layout ---
# Controls in one row on top, results area underneath
search_controls_row = widgets.HBox(children=[text_box, top_n_slider, search_button])
ui = widgets.VBox(children=[search_controls_row, output])
display(ui)

VBox(children=(HBox(children=(Text(value='An image of a dog', description='Prompt:', layout=Layout(width='70%'…

# Check against Ground Truth

How accurate are the results that we achieved? To get a better idea about the accuracy and performance of CLIP, we can compare the retrieval results with the annotated ground truth. Remember that the text vector is only compared to the image vectors for the similarity search. The manually annotated ground truth is used to compare how well the retrieval results match the human classification.

**Run the cell below to compare the annotated ground truth (GT) with the search results.**

## How well did the model perform?

**The cell below calculates the ratio of true positives (TP) and false positives (FP) among the first k results that the model retrieved.**

For example, of the first 30 retrieved images for the text prompt "image of a camel", only 2 are actually camels (=TP), i.e. 6.67% of the retrieved images.

Let's calculate some more metrics. **To analyse the relationships of true positives, false positives, and false negatives, we calculate recall at k and precision at k.**

- **Recall at k** measures the percentage of relevant items found in the top K retrieved results out of all relevant items (i.e. total examples per class).
- **Precision at k** measures the percentage of relevant items within the top K retrieved results (which corresponds to the percentage of TP in the first k retrieved results that we calculated before).

An additional measure to examine the model’s capability to retrieve images relevant to the query is **R-precision**. It can be useful to assess how well the model performs for a large number of relevant documents. For example, we know that we have 24 camels in our dataset. Therefore, a perfect system would retrieve 100% camels in the top 24 ranked results.

- **R-precision** shows the percentage of images from our Gold Standard that the model retrieved for the first R-similarities, where R is the total number of examples per class in our image dataset.

In our case, R-precision for dogs in our dataset is only 5.5%. What does this mean?

In [12]:
## Run to start interactive inspection of TP, FP and metrics ##
import matplotlib.patches as patches

# --- Map keywords to default prompts ---
# Key: ground-truth annotation keyword; value: text prompt sent to CLIP.
keyword_prompt_map = dict([
    ("animals, 25F", "image of an animal"),
    ("plants; vegetation, 25G", "image of a plant"),
    ("landscapes, 25H", "image of a landscape"),
    ("maps; atlases, 25A", "image of a map"),
    ("horses and kindred animals, 46C1314", "image of a horse"),
    ("birds, 25F3", "image of a bird"),
    ("camel, 25F24", "image of a camel"),
    ("hoofed animals (GIRAFFE), 25F24", "image of a giraffe"),
])

# --- UI widgets ---
# Keyword selector (options keep the insertion order of the map)
dropdown = widgets.Dropdown(
    description="Select Keyword:",
    options=list(keyword_prompt_map),
    value="animals, 25F",
    layout=widgets.Layout(width="60%"),
)
# Cut-off k for the evaluated ranking
top_n_slider = widgets.IntSlider(
    description="Top-N:",
    value=30,
    min=10,
    max=200,
    step=10,
    continuous_update=False,
)
# Button that triggers search and evaluation
run_button = widgets.Button(
    description="Search",
    icon="search",
    button_style="info",
)
# Area that receives the printed metrics and the figure
output = widgets.Output()

# --- Main search and evaluation function ---
def _label_matches(label, keyword):
    """Return True if any '|'-separated annotation in `label` contains `keyword`."""
    return any(keyword in item for item in label.split("|"))


def on_search_clicked(b, _dropdown=dropdown, _slider=top_n_slider, _out=output):
    """Run the prompt of the selected keyword and score the ranking against the labels.

    Prints TP/FP/FN/TN counts, Precision@k, Recall@k and R-precision, then
    plots the top-k images framed in green (TP) or red (FP).

    Args:
        b: The button instance passed in by ipywidgets (unused).
        _dropdown, _slider, _out: Widgets of this evaluation UI, captured when
            the function is defined so the callback stays tied to them even
            if the global names are rebound by another cell.
    """
    with _out:
        clear_output(wait=True)

        selected_keyword = _dropdown.value
        text_prompt = keyword_prompt_map[selected_keyword]
        k = _slider.value

        print(f"🔍 Searching for images similar to: '{text_prompt}'")
        print(f"✅ Using keyword for TP/FP: '{selected_keyword}'")
        print(f"Top-{k} images\n")

        # --- Encode text prompt ---
        with torch.no_grad():
            text_embedding = model.encode_text(tokenizer(text_prompt).to(device))

        # --- Compute similarity ---
        similarities = cosine_similarity(text_embedding.cpu(), image_embeddings.cpu())
        similarities_tensor = torch.tensor(similarities[0], device=device)
        similar_images_indices = torch.argsort(similarities_tensor, descending=True)
        similar_images_indices_full = similar_images_indices.cpu().numpy().tolist()
        similar_images_indices_list = similar_images_indices_full[:k]

        # --- Relevant images according to the ground truth ---
        # Kept in ranking order, with a set alongside for O(1) membership tests.
        indices_labelled = [
            idx for idx in similar_images_indices_full
            if _label_matches(labels[idx], selected_keyword)
        ]
        relevant = set(indices_labelled)

        # --- TP / FP analysis (within the top k) ---
        indices_TP = [idx for idx in similar_images_indices_list if idx in relevant]
        indices_FP = [idx for idx in similar_images_indices_list if idx not in relevant]
        tp_set = set(indices_TP)

        retrieved = len(similar_images_indices_list)
        percentage_TP = (len(indices_TP) / retrieved * 100) if retrieved else 0
        print(f"✅ True Positives: {len(indices_TP)} | ❌ False Positives: {len(indices_FP)}")
        print(f"📊 Percentage TP: {percentage_TP:.2f}%\n")

        # --- FN / TN / Precision / Recall ---
        # False Negatives (FN): relevant images that did not make the top k
        indices_FN = [idx for idx in indices_labelled if idx not in tp_set]

        # True Negatives (TN): non-relevant images that were not retrieved,
        # i.e. everything that is neither TP, FP nor FN.
        TN = len(similar_images_indices_full) - len(indices_TP) - len(indices_FP) - len(indices_FN)

        # Recall@K
        recall_at_k = len(indices_TP) / len(indices_labelled) if indices_labelled else 0

        # Precision@K
        precision_at_k = len(indices_TP) / retrieved if retrieved else 0

        print(f"🖼️ Total occurrences of '{selected_keyword}': {len(indices_labelled)}")
        print(f"⚠️ False Negatives: {len(indices_FN)}")
        print(f"True Negatives: {TN}")
        print(f"Recall@{k}: {recall_at_k:.3f}")
        print(f"Precision@{k}: {precision_at_k:.3f}")

        # --- R-precision ---
        # Evaluated on the first R positions of the FULL ranking. Slicing the
        # top-k list would cap the window at k and understate the score
        # whenever R > k.
        R = len(indices_labelled)
        indices_rel = [idx for idx in similar_images_indices_full[:R] if idx in relevant]

        R_precision = len(indices_rel) / R if R > 0 else 0
        print(f"R-precision at {R}: {R_precision:.3f}\n")

        # --- Visualization with colored rectangles ---
        cols = 10
        rows = (k + cols - 1) // cols  # ceiling division
        # squeeze=False always yields a 2-D axes array, so flatten() is safe.
        fig, axes = plt.subplots(rows, cols, figsize=(18, 2.5*rows), squeeze=False)
        axes = axes.flatten()

        for i, idx in enumerate(similar_images_indices_list):
            with Image.open(onit_paths[idx]) as img:
                axes[i].imshow(img)
                width, height = img.size
            axes[i].axis("off")

            # Frame the image: green for a true positive, red for a false positive
            rect = patches.Rectangle(
                (0, 0), width, height,
                linewidth=4,
                edgecolor='green' if idx in tp_set else 'red',
                facecolor='none'
            )
            axes[i].add_patch(rect)

            sim_score = similarities_tensor[idx].item()
            axes[i].set_title(
                f"{i+1}. {onit_filenames[idx][:15]}\n{lang_year[idx]}\n{sim_score:.3f}",
                fontsize=8
            )

        # hide unused axes
        for j in range(len(similar_images_indices_list), len(axes)):
            axes[j].axis("off")

        plt.tight_layout()
        plt.show()

# --- Bind button ---
# Run search and evaluation whenever the button is pressed
run_button.on_click(on_search_clicked)

# --- Display UI ---
# Controls in one row on top, results area underneath
evaluation_controls_row = widgets.HBox(children=[dropdown, top_n_slider, run_button])
ui = widgets.VBox(children=[evaluation_controls_row, output])
display(ui)

VBox(children=(HBox(children=(Dropdown(description='Select Keyword:', layout=Layout(width='60%'), options=('an…

**Done!** You have now learned how to load a CLIP model from Huggingface, how to prepare and load your data into the model, how to embed the images and text prompt, and how to use cosine similarity to rank the images according to their similarity with the vectorized text prompt.
You have also learned how to inspect the results, assess how well the model performs by calculating precision/recall metrics, and understand the limitations of the pretrained model.