In [1]:
import torchvision
import clip
import torch
from PIL import Image
import matplotlib.pyplot as plt
from sklearn import neighbors
from glob import glob
import os
from datasets import load_dataset
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import io
import requests
import urllib
import numpy as np

from datasets.utils.file_utils import get_datasets_user_agent

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
USER_AGENT = get_datasets_user_agent()

DOWNLOAD_DIR = '/home/ubuntu/data/conceptual_captions/labeled'

def download_single_image(image_url, ind, download_dir=DOWNLOAD_DIR):
        try:
            filename = os.path.join(download_dir, f"{ind}.jpg")#os.path.basename(image_url))
            if os.path.exists(filename):
                return
            img_data = requests.get(image_url).content
            with open(filename, 'wb') as handler:
                handler.write(img_data)
        except Exception as e:
            print(e)


def fetch_images(batch, num_threads, timeout=None, retries=0):
    fetch_single_image_with_args = partial(fetch_single_image, timeout=timeout, retries=retries)
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        batch["image"] = list(executor.map(fetch_single_image_with_args, batch["image_url"]))
    return batch

In [3]:
dset_labeled = load_dataset("conceptual_captions", name='labeled')

Found cached dataset conceptual_captions (/home/ubuntu/.cache/huggingface/datasets/conceptual_captions/labeled/1.0.0/05266784888422e36944016874c44639bccb39069c2227435168ad8b02d600d8)
100%|██████████| 1/1 [00:00<00:00,  7.27it/s]


In [4]:
num_to_download = 100000
subset = dset_labeled['train'][:num_to_download]

In [7]:
with ThreadPoolExecutor(max_workers=10) as executor:
    list(executor.map(download_single_image, subset['image_url'], range(len(subset['image_url']))))#, range(len(subset['image_url'])))

HTTPConnectionPool(host='www.coolranchero.com', port=80): Max retries exceeded with url: /wp-content/uploads/red-oak-cut.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f97aaf2f340>: Failed to establish a new connection: [Errno -2] Name or service not known'))
HTTPConnectionPool(host='www.lornabj.co.uk', port=80): Max retries exceeded with url: /gallery/00068989.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f97aae61880>: Failed to establish a new connection: [Errno -2] Name or service not known'))
HTTPSConnectionPool(host='2qibqm39xjt6q46gf1rwo2g1-wpengine.netdna-ssl.com', port=443): Max retries exceeded with url: /wp-content/uploads/2017/10/8663117_web1_L1-JulcolKind-edh-171001.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f97aaf28d60>: Failed to establish a new connection: [Errno -2] Name or service not known'))
HTTPSConnectionPool(host='sermonquotes-eszuskq0bptlfh8awbb.stack

In [5]:
# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

In [6]:
# Helper function to get CLIP features of an image
def get_image_features(image_path):
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
    return image_features.cpu().numpy()

In [7]:
# Helper function to get CLIP features of text
def get_text_features(text):
    text = clip.tokenize(text).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text)
    return text_features.cpu().numpy()

In [61]:
torch.from_numpy(downloaded_dset['image_features'].astype('float32'))

tensor([[-0.3389,  0.1622,  0.1849,  ...,  0.3142, -0.2822, -0.1782],
        [-0.1791,  0.4587,  0.0641,  ...,  0.6108, -0.2385, -0.2062],
        [-0.1298,  0.4797, -0.1777,  ...,  0.3574,  0.0505,  0.1039],
        ...,
        [ 0.1338,  0.4475, -0.2644,  ...,  0.4087, -0.5146,  0.1832],
        [ 0.1134, -0.0311,  0.4126,  ...,  0.2025, -0.3774, -0.6797],
        [ 0.2874, -0.5991, -0.0970,  ...,  0.9458, -0.2844, -0.1941]])

In [62]:
# Gets similarity
def get_cosine_similarity(image_features, text_features):
    norm_image_features = torch.from_numpy(image_features.astype('float32'))
    norm_text_features = torch.from_numpy(text_features.astype('float32'))
    # normalize
    norm_image_features = norm_image_features / np.expand_dims(np.linalg.norm(image_features, axis=-1),1)
    norm_text_features = norm_text_features / np.expand_dims(np.linalg.norm(text_features, axis=-1),1)
    #similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    return (100.0 * norm_image_features @ norm_text_features.T)

In [8]:
def get_order(index, query):
    nbrs = neighbors.NearestNeighbors(n_neighbors=index.shape[0]).fit(index)
    distances, indices = nbrs.kneighbors(query)
    return indices[:,1:] # assumes first element is the same as query

In [9]:
downloaded_files = glob(os.path.join(DOWNLOAD_DIR, '*'))
len(downloaded_files)

6948

In [10]:
downloaded_dset = {'image_url': [], 'image_path': [], 'caption': [], 'labels': [],'confidence_scores' : []}
for filename in downloaded_files:
    ind = int(os.path.splitext(os.path.basename(filename))[0])
    for k in downloaded_dset:
        if k != 'image_path':
            downloaded_dset[k].append(subset[k][ind])
    downloaded_dset['image_path'].append(filename)

In [42]:
# get image features
features = []
for f in tqdm(downloaded_dset['image_path']):
    features.append(get_image_features(f))
downloaded_dset['image_features'] = np.array(features).squeeze()

100%|██████████| 6948/6948 [02:43<00:00, 42.52it/s]


In [43]:
# get text features
text_features = []
for c in tqdm(downloaded_dset['caption']):
    text_features.append(get_text_features(c))
downloaded_dset['text_features'] = np.array(text_features).squeeze()

100%|██████████| 6948/6948 [01:11<00:00, 97.09it/s]


In [13]:
# split dataset into query and index set
num_to_query = 100
query_inds = np.random.randint(0, len(downloaded_files), num_to_query)

In [14]:
# get image feature ranking
image_ranking = get_order(downloaded_dset['image_features'],downloaded_dset['image_features'][query_inds] )
image_ranking.shape

(100, 6947)

In [15]:
# get text feature ranking
text_ranking = get_order(downloaded_dset['text_features'],downloaded_dset['text_features'][query_inds] )
text_ranking.shape

(100, 6947)

In [16]:
# do evaluation with rank biased overlap
from rbo_code import rbo
rbos = []
cutoff = 100
for i in tqdm(range(len(text_ranking))):
    rbos.append( rbo(image_ranking[i][:cutoff], text_ranking[i][:cutoff], p=0.9).min)
rbos = np.array(rbos)

100%|██████████| 100/100 [00:00<00:00, 174.93it/s]


In [17]:
np.min(rbos), np.mean(rbos), np.max(rbos)

(0.0, 0.09100514201600252, 0.4208914740827287)

In [82]:
# simpler to interpet : how often do, in the first K results, they share the same result
def share_result(rankingA, rankingB, K=3):
    share_k = 0
    for i in range(len(rankingA)):
        a_rank = rankingA[i][:K]
        b_rank = rankingB[i][:K]
        if len(np.intersect1d(a_rank, b_rank)) > 0:
            share_k += 1
    return share_k / len(rankingA)
share_result(image_ranking, text_ranking)

0.2

In [63]:
cosine_similarity = get_cosine_similarity(downloaded_dset['image_features'], downloaded_dset['text_features'])

In [78]:
similarity_ranking = cosine_similarity.numpy().argsort(axis=-1)

In [79]:
similarity_to_image_rbos = []
cutoff = 100
for i in tqdm(range(len(text_ranking))):
    similarity_to_image_rbos.append( rbo(image_ranking[i][:cutoff], similarity_ranking[i][:cutoff], p=0.9).min)
similarity_to_image_rbos = np.array(similarity_to_image_rbos)
np.min(similarity_to_image_rbos), np.mean(similarity_to_image_rbos), np.max(similarity_to_image_rbos)

100%|██████████| 100/100 [00:00<00:00, 177.09it/s]


(0.0, 0.0010945623003236892, 0.03292587008470111)

In [80]:
similarity_to_txt_rbos = []
cutoff = 100
for i in tqdm(range(len(text_ranking))):
    similarity_to_txt_rbos.append( rbo(text_ranking[i][:cutoff], similarity_ranking[i][:cutoff], p=0.9).min)
similarity_to_txt_rbos = np.array(similarity_to_txt_rbos)
np.min(similarity_to_txt_rbos), np.mean(similarity_to_txt_rbos), np.max(similarity_to_txt_rbos)

100%|██████████| 100/100 [00:00<00:00, 176.29it/s]


(0.0, 0.0015039955457385337, 0.05324187847561122)

In [92]:
share_result(image_ranking, similarity_ranking, K=10)

0.0

In [93]:
share_result(text_ranking, similarity_ranking, K=10)

0.03