In [None]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchvision.datasets import CIFAR10
from torchvision import datasets
from torchvision import transforms
import torchvision

from PIL import Image, ImageFilter
import matplotlib.pyplot as plt
from PIL import Image
from clip import clip
import pickle
import torchvision.transforms.functional as Func

In [None]:
# Batch size. NOTE(review): not referenced by any cell visible in this notebook —
# presumably intended for batched encoding; confirm or remove.
BATCH_SIZE = 80

# Name of the CLIP checkpoint handed to clip.load below.
VISUAL_BACKBONE = 'RN50' # alternatives noted by the author: RN50, ViT-B/32, ViT-B/16

# Prefix joined to image file names by plain string concatenation, so the
# trailing slash is required.
IMAGE_PATH = "Flickr8k/Images/"

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the CLIP model selected by VISUAL_BACKBONE together with `preprocess`,
# the callable applied to PIL images before they are fed to the model.
# `download_root` is presumably where the checkpoint is cached — confirm the path exists.
model, preprocess = clip.load(name=VISUAL_BACKBONE, device=device, download_root='/shareddata/clip/')
# NOTE(review): clip.load was already given `device`, so this move looks redundant — confirm.
# The trailing semicolon suppresses the notebook's repr of the returned model.
model.to(device);

## Import the Dataset

In [None]:
# Read the pickled image/caption records from the working directory.
# NOTE(security): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
with open("images_captions.pkl", 'rb') as file:
    images_captions = pickle.load(file)

In [None]:
# Wrap the loaded records in a DataFrame with `file_name` and `caption` columns.
# Presumably each `caption` entry is a sequence of captions for the image, since
# to_right_form only uses element [0] of every entry — confirm.
images_captions = pd.DataFrame(images_captions, columns=['file_name', 'caption'])
# Bare last expression: displays the frame in the notebook.
images_captions

In [None]:
def to_right_form(captions):
    """Tokenize the first element of every caption entry for CLIP.

    Args:
        captions: Iterable of indexable entries. Only element ``[0]`` of each
            entry is used (presumably the first of several captions per
            image — confirm against the pickle's layout).

    Returns:
        The result of ``clip.tokenize`` on those first elements, moved to the
        global ``device``.
    """
    first_captions = [entry[0] for entry in list(captions)]
    return clip.tokenize(first_captions).to(device)

## Compute the top-k accuracy 

In [None]:
def select_largest_k_numbers(A, k):
    """Return the ``k`` largest values of ``A`` in descending order.

    Args:
        A: Iterable of mutually comparable values; it is not modified.
        k: How many values to keep. It is used as a slice bound, so fewer
            values are returned when ``A`` holds fewer than ``k`` items.

    Returns:
        A new list with the selected values, largest first.
    """
    descending = list(A)
    descending.sort(reverse=True)
    return descending[:k]

In [None]:
# Tokenize one caption per DataFrame row, in row order. top_k_acc relies on
# position i of `text` corresponding to row i of `images_captions`.
text = to_right_form(images_captions["caption"])

In [None]:
def top_k_acc(images_captions, k):
    """Score how highly each image ranks its own caption among all captions.

    For the image in row ``i`` the similarity to every caption in the global
    ``text`` tensor is computed, and caption ``i`` is treated as the match.
    With ``rank`` being the number of the ``k`` largest similarities that are
    ``>=`` the matching caption's similarity, the per-image score is
    ``1 - rank / k``; the mean over all images is returned.

    NOTE(review): ``rank`` counts the matching caption itself, so a top-1 hit
    scores ``1 - 1/k`` and a caption in k-th place scores 0, the same as a
    miss. That differs from conventional top-k accuracy (1 for a hit, 0 for a
    miss) — confirm this weighting is intended before reporting the number.

    Args:
        images_captions: DataFrame with a ``file_name`` column. Its rows must
            be in the same order as the captions tokenized into ``text``.
        k: Positive number of top similarities to consider.

    Returns:
        The mean per-image score (``nan`` when the frame has no rows).

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    scores = []
    with torch.no_grad():
        # The caption embeddings do not depend on the image, so encode them
        # once instead of once (or twice) per image.
        text_features = model.encode_text(text)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        logit_scale = model.logit_scale.exp()

        for i, img_name in enumerate(images_captions["file_name"]):
            # Close the file handle as soon as the image has been preprocessed.
            with Image.open(IMAGE_PATH + img_name) as img:
                image = preprocess(img).unsqueeze(0).to(device)

            image_features = model.encode_image(image)
            image_features = image_features / image_features.norm(dim=1, keepdim=True)
            # Same formula CLIP's forward pass uses for its per-image logits,
            # but reusing the features computed above.
            logits = logit_scale * image_features @ text_features.t()
            similarities = logits.softmax(dim=-1).cpu().numpy()[0]

            top_k_similarities = np.asarray(select_largest_k_numbers(similarities, k))
            rank = np.sum(top_k_similarities >= similarities[i])
            scores.append(1 - rank / k)

    return np.mean(scores)

In [None]:
# Evaluate every row of the dataset with k=5; the returned mean score is the cell output.
top_k_acc(images_captions, 5)

## Example

### An Image

In [None]:
# Pick one image file to inspect and open it with PIL.
sample_file_name = "1000268201_693b08cb0e.jpg"
sample_image = Image.open(IMAGE_PATH + sample_file_name)
# Bare last expression: renders the image in the notebook.
sample_image

### Retrieve the top-5 captions

In [None]:
# Re-open the file instead of transforming `sample_image` in place: the cell
# then gives the same result however many times it is run, and `sample_image`
# still ends up holding the preprocessed batch-of-one tensor.
with Image.open(IMAGE_PATH + sample_file_name) as sample_pil:
    sample_image = preprocess(sample_pil).unsqueeze(0).to(device)

with torch.no_grad():
    image_features = model.encode_image(sample_image)
    text_features = model.encode_text(text)
    # Same formula CLIP's forward pass uses for its per-image logits, but
    # reusing the features above instead of encoding image and captions again.
    image_embed = image_features / image_features.norm(dim=1, keepdim=True)
    text_embed = text_features / text_features.norm(dim=1, keepdim=True)
    logits = model.logit_scale.exp() * image_embed @ text_embed.t()
    # Softmax over all captions; [0] drops the batch dimension of size one.
    similarities = (logits.softmax(dim=-1).cpu().numpy())[0]

In [None]:
# Build a new frame with the similarity scores attached. A plain
# `candidates = images_captions` would only alias the original frame, so adding
# the column would silently modify `images_captions` as well.
candidates = images_captions.assign(similarities=similarities)

In [None]:
# Row labels of the five captions most similar to the sample image, best match first.
top_5_indices = candidates["similarities"].nlargest(5).index

# Look up the `caption` entries stored in those rows.
top_5_captions = candidates["caption"].loc[top_5_indices].tolist()
top_5_captions