# Imagenet Evaluation Script
Modified from [the evaluation script by OpenAI](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Prompt_Engineering_for_ImageNet.ipynb).

In [1]:
import os 
import sys
import json

import numpy as np
import pandas as pd

os.environ['TOKENIZERS_PARALLELISM'] = "false"

import transformers
from transformers import AutoTokenizer

import torch
import torchvision
from torchvision import transforms
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize, ToTensor
from torchvision.transforms.functional import InterpolationMode
from tqdm.notebook import tqdm

# !wget -q -N https://github.com/huggingface/transformers/raw/master/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
# !wget -q -N https://github.com/huggingface/transformers/raw/master/examples/research_projects/jax-projects/hybrid_clip/configuration_hybrid_clip.py

sys.path.append('.')

# Choosing the model to evaluate

In [2]:
# Model selection: leave exactly one MODEL_TYPE assignment uncommented.
# The "Loading the model" cell branches on this value to pick the text and
# image encoders.

# MODEL_TYPE = 'mClip'
# MODEL_TYPE = 'clip_italian'
# MODEL_TYPE = 'clip_arabic'
MODEL_TYPE = 'altclip'

In [3]:
# Smoke test: load AltCLIP-m9 and score one image against two English captions.
from PIL import Image
import requests

# Requires transformers version >= 4.21.0.

# Local modules; they must be importable from the working directory.
from modeling_altclip import AltCLIP
from processing_altclip import AltCLIPProcessor

# NOTE(review): an earlier comment said the repo was private and required
# `use_auth_token=True`, but no token argument is passed below. Confirm whether
# authentication is still needed.
model = AltCLIP.from_pretrained("BAAI/AltCLIP-m9")
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP-m9")

# Sample image fetched over HTTP from the COCO val2017 set.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# The processor prepares both the text prompts and the image as PyTorch tensors.
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image # image-text similarity scores
probs = logits_per_image.softmax(dim=1) # softmax over the captions gives label probabilities


You are using a model of type altclip to instantiate a model of type clip. This is not supported for all configurations of models and can yield errors.


In [None]:
model

In [4]:
# Tokenizer id; the same Hub id the AltCLIP model above is loaded from.
TOKENIZER_NAME = "BAAI/AltCLIP-m9"
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_NAME,
    use_fast=True,
    cache_dir=None,
)


In [5]:
def tokenize(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    Every sequence is padded to, and truncated at, 128 tokens and the result
    is requested as NumPy arrays.

    Returns:
        A tuple `(input_ids, attention_mask)`.
    """
    encoded = tokenizer(
        texts,
        return_tensors="np",
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

In [None]:
def language_model(queries):
    """Embed text queries with the loaded model and return a NumPy array.

    Args:
        queries: A string or list of strings accepted by `tokenize`.

    Returns:
        The output of `model.get_text_features` as a NumPy array.
    """
    input_ids, attention_mask = tokenize(queries)
    # `tokenize` may hand back NumPy arrays; the PyTorch model needs tensors.
    # `torch.as_tensor` converts arrays and passes existing tensors through.
    with torch.no_grad():
        features = model.get_text_features(
            torch.as_tensor(input_ids),
            torch.as_tensor(attention_mask),
        )
    # Inference ran under no_grad, so the tensor converts to NumPy directly.
    return features.cpu().numpy()

# Loading the model

In [4]:
from PIL import Image

# Build `language_model` (texts -> embeddings) and `image_model`
# (image batch -> embeddings) for the backend selected by MODEL_TYPE.
# Both callables are defined inside the matching branch so that one backend
# never silently overrides another.
if MODEL_TYPE == 'mClip':
    from sentence_transformers import SentenceTransformer
    # The multilingual CLIP model can only encode text; image embeddings
    # come from the separate 'clip-ViT-B-32' model.
    se_language_model = SentenceTransformer('clip-ViT-B-32-multilingual-v1')
    se_image_model = SentenceTransformer("clip-ViT-B-32")
    language_model = lambda queries: se_language_model.encode(queries, convert_to_tensor=True, show_progress_bar=False).cpu().detach().numpy()
    image_model = lambda images: se_image_model.encode(images, batch_size=128, convert_to_tensor=True, show_progress_bar=False).cpu().detach().numpy()
elif MODEL_TYPE == 'clip_italian':
    import jax
    from jax import numpy as jnp
    # modeling_hybrid_clip.py is the file fetched by the (commented-out) wget
    # command in the imports cell; it must be present in the working directory.
    from modeling_hybrid_clip import FlaxHybridCLIP
    TOKENIZER_NAME = "dbmdz/bert-base-italian-xxl-uncased"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, cache_dir=None, use_fast=True)
    model = FlaxHybridCLIP.from_pretrained("clip-italian/clip-italian")

    def tokenize(texts):
        """Tokenize `texts` to fixed-length (96) NumPy arrays."""
        inputs = tokenizer(texts, max_length=96, padding="max_length", return_tensors="np")
        return inputs['input_ids'], inputs['attention_mask']

    language_model = lambda queries: np.asarray(model.get_text_features(*tokenize(queries)))
    # The permute moves axis 1 of the torch batch to the last position before
    # the batch is handed to the Flax model as a NumPy array.
    image_model = lambda images: np.asarray(model.get_image_features(images.permute(0, 2, 3, 1).numpy(),))
elif MODEL_TYPE == 'clip_arabic':
    import jax
    from jax import numpy as jnp
    from modeling_hybrid_clip import FlaxHybridCLIP
    TOKENIZER_NAME = "aubmindlab/bert-large-arabertv2"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, cache_dir=None, use_fast=True)
    # NOTE(review): MODEL_FILE and config are not defined in the visible part
    # of this notebook; they must be set before selecting 'clip_arabic'.
    model = FlaxHybridCLIP.from_pretrained(MODEL_FILE, config=config)

    def tokenize(texts):
        """Tokenize `texts` to fixed-length (128) NumPy arrays, truncating longer inputs."""
        inputs = tokenizer(texts, max_length=128, padding="max_length", return_tensors="np", truncation=True)
        return inputs['input_ids'], inputs['attention_mask']

    language_model = lambda queries: np.asarray(model.get_text_features(*tokenize(queries)))
    image_model = lambda images: np.asarray(model.get_image_features(images.permute(0, 2, 3, 1).numpy(),))
elif MODEL_TYPE == 'altclip':
    from modeling_altclip import AltCLIP
    from processing_altclip import AltCLIPProcessor
    # A single AltCLIP checkpoint provides both the text and the image encoder.
    model = AltCLIP.from_pretrained("BAAI/AltCLIP-m9")
    processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP-m9")

    TOKENIZER_NAME = "BAAI/AltCLIP-m9"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, cache_dir=None, use_fast=True)

    def tokenize(texts):
        """Tokenize `texts` to PyTorch tensors, padded to the longest item and truncated at 77 tokens."""
        inputs = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=77,
            return_tensors='pt',
        )
        return inputs['input_ids'], inputs['attention_mask']

    def language_model(queries):
        """Return text embeddings for `queries` as a NumPy array."""
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            return model.get_text_features(*tokenize(queries)).cpu().numpy()

    def image_model(images):
        """Return image embeddings for a batch of preprocessed images as a torch tensor."""
        with torch.no_grad():
            return model.get_image_features(images)
else:
    raise ValueError(f"Unknown MODEL_TYPE: {MODEL_TYPE!r}")

You are using a model of type altclip to instantiate a model of type clip. This is not supported for all configurations of models and can yield errors.


# Preparing the translated ImageNet labels

In [6]:
# !wget -N -q https://raw.githubusercontent.com/clip-italian/clip-italian/imagenet_templates/evaluation/imagenet_labels_IT.tsv
# classes_df = pd.read_csv('./imagenet_labels_IT.tsv', sep='\t', header=0)
# Translated ImageNet labels: a tab-separated file whose first row is the header.
classes_df = pd.read_csv(
    "/home/think3/Desktop/AltClip/arabic_templates_ar.tsv",
    header=0,
    sep='\t',
)
# Short Arabic translations are used; 'query_long_translated' was the alternative column.
imagenet_classes = [label for label in classes_df['query_short_translated_ar']]
# A single pass-through template: the class name is used as the prompt verbatim.
imagenet_templates = ['{}']

print(f"{len(imagenet_classes)} classes, {len(imagenet_templates)} templates")

1000 classes, 1 templates


In [7]:
# classes_df

# Set up Validation Set

In [8]:
# Validation preprocessing: bicubic resize, 224x224 centre crop, tensor
# conversion, then per-channel normalisation.
# NOTE(review): the mean/std values are presumably the CLIP preprocessing
# statistics; confirm against the model's processor config.
val_preprocess = transforms.Compose(
    [
        transforms.Resize([224], interpolation=InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        ),
    ]
)

In [9]:
# Root directory passed to torchvision.datasets.ImageNet below.
# NOTE(review): machine-specific absolute path; adjust for your environment.
IMAGENET_ROOT = "/home/think3/Desktop/AltClip/imagenet_root"

In [10]:
# !pip install natsort

In [11]:
from PIL import Image as PilImage
import natsort


class CustomDataSet(torch.utils.data.Dataset):
    """Unlabelled dataset over a flat directory of image files.

    Every entry of `main_dir` is treated as one image; entries are ordered
    with a natural sort so that indices are stable across runs.

    Args:
        main_dir: Directory containing the image files.
        transform: Callable applied to each RGB PIL image, or None to return
            the PIL image unchanged.
    """

    def __init__(self, main_dir, transform):
        self.main_dir = main_dir
        self.transform = transform
        self.total_imgs = natsort.natsorted(os.listdir(main_dir))

    def __len__(self):
        """Return the number of files found in `main_dir`."""
        return len(self.total_imgs)

    def __getitem__(self, idx):
        """Load the `idx`-th image as RGB and apply `transform` if one was given."""
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        # `convert` returns a fully loaded copy, so the file handle can be
        # released as soon as the context manager exits.
        with PilImage.open(img_loc) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image

In [12]:
# images = CustomDataSet(IMAGENET_ROOT, transform=val_preprocess)
# images = CustomDataSet(IMAGENET_ROOT, split='val')


In [28]:
# print('Downloading Imagenet validation set...')
# !wget -N -q --show-progress https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar
# print('Downloading Imagenet devkit...')
# !wget -N -q --show-progress https://image-net.org/data/ILSVRC/2012/ILSVRC2012_devkit_t12.tar.gz
# print('Done.')

# ImageNet validation split, preprocessed with `val_preprocess`.
images = torchvision.datasets.ImageNet(IMAGENET_ROOT, split='val', transform=val_preprocess)

# Deterministic, sequential evaluation loader.
loader = torch.utils.data.DataLoader(
    images,
    batch_size=16,
    shuffle=False,
    num_workers=2,
    persistent_workers=True,
    # Keep the final partial batch: dropping it would silently exclude
    # samples from the accuracy whenever batch_size does not divide the
    # dataset size.
    drop_last=False,
)

In [29]:
# next(iter(loader))

In [30]:
# from PIL import Image
# import requests

# # transformers version >= 4.21.0

# from modeling_altclip import AltCLIP
# from processing_altclip import AltCLIPProcessor

# # now our repo's in private, so we need `use_auth_token=True`
# model = AltCLIP.from_pretrained("BAAI/AltCLIP-m9")
# processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP-m9")

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)


In [31]:
# inputs = processor(text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True)

In [32]:
# inputs["input_ids"]

In [33]:
# outputs = model(**inputs)
# outputs
# # logits_per_image = outputs.logits_per_image # this is the image-text similarity score
# # probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities

In [34]:
# probs

In [35]:
type(language_model)

function

In [36]:
# from modeling_altclip import AltCLIP
# from processing_altclip import AltCLIPProcessor
# # Here we load the multilingual CLIP model. Note, this model can only encode text.
# # If you need embeddings for images, you must load the 'clip-ViT-B-32' model
# # now our repo's in private, so we need `use_auth_token=True`
# model = AltCLIP.from_pretrained("BAAI/AltCLIP-m9")
# processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP-m9")

# TOKENIZER_NAME = "BAAI/AltCLIP-m9"
# tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, cache_dir=None, use_fast=True)

# def tokenize(texts):
#     inputs = tokenizer(texts,padding=True,
#                                 truncation=True,
#                                 max_length=77,
#                                 return_tensors='pt')
    
#     return inputs['input_ids'], inputs['attention_mask']
                                
# # print(model.get_text_features(*tokenize(queries)))

# # def language_model(queries):

# #     output = model.get_text_features(*tokenize(queries))
# #     return output

# language_model = lambda queries: model.get_text_features(*tokenize(queries))
# image_model = lambda images: np.asarray(model.get_image_features(images.permute(0, 2, 3, 1).numpy(),))

In [37]:
language_model("Hello")

array([[-1.69459283e-01,  3.63053888e-01,  2.16432109e-01,
        -5.92770576e-01, -1.66359365e-01,  7.38082826e-03,
        -1.43948406e-01,  5.30790538e-04,  6.73699737e-01,
         3.38250935e-01, -1.33275781e-02,  1.79065332e-01,
         7.16237843e-01, -3.00831497e-01,  5.53340137e-01,
        -1.84166171e-02, -3.72724682e-01, -2.36073539e-01,
        -2.10589468e-01, -7.84269720e-02, -1.60549521e-01,
         1.67538822e-01, -7.29247555e-03,  1.73323080e-01,
         1.14745945e-01, -2.54839808e-01, -3.18433702e-01,
        -3.50466184e-02, -3.21821235e-02,  9.32539850e-02,
         2.32405216e-01, -2.39703149e-01, -5.54765500e-02,
         5.97656906e-01,  4.25791070e-02,  2.65929282e-01,
        -9.96464416e-02,  5.94151676e-01,  1.93083867e-01,
        -2.41590306e-01,  3.12625505e-02, -1.21946000e-01,
        -1.62103578e-01, -1.97920278e-02,  3.46367449e-01,
        -1.12141460e-01,  3.99102449e-01,  2.13723630e-01,
         3.85334432e-01, -3.95973325e-01, -3.64907920e-0

# Creating zero-shot classifier weights

In [23]:
def zeroshot_classifier(classnames, templates):
    """Build zero-shot classifier weights from class names and prompt templates.

    Each class name is formatted into every template and embedded with
    `language_model`. The per-template embeddings are L2-normalised and
    averaged, and the average is L2-normalised again.

    Returns:
        A NumPy array with one column per class (the vectors are stacked
        along axis 1).
    """
    per_class_vectors = []
    for name in tqdm(classnames):
        prompts = [pattern.format(name) for pattern in templates]
        embedded = language_model(prompts)
        # Normalise every prompt embedding to unit length.
        embedded = embedded / np.linalg.norm(embedded, axis=-1, keepdims=True)
        # Average over templates, then renormalise the averaged vector.
        centroid = np.mean(embedded, axis=0)
        centroid /= np.linalg.norm(centroid, axis=-1)
        per_class_vectors.append(centroid)
    return np.stack(per_class_vectors, axis=1)

zeroshot_weights = zeroshot_classifier(imagenet_classes, imagenet_templates)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
zeroshot_weights

array([[-0.03438196, -0.02537012, -0.02752385, ...,  0.00154732,
         0.01728298, -0.01343255],
       [-0.01028563, -0.00829885, -0.02152286, ...,  0.02257943,
         0.0135249 ,  0.00599771],
       [-0.04859017, -0.02701858, -0.03748238, ..., -0.02183821,
         0.02223223,  0.00255312],
       ...,
       [-0.03631806, -0.04637718, -0.04345605, ..., -0.01641936,
        -0.02627807, -0.03985934],
       [-0.01188057,  0.01669974, -0.00439028, ...,  0.01583908,
        -0.01018465,  0.0128259 ],
       [ 0.00941693, -0.00656846,  0.01349543, ...,  0.04371212,
         0.00028141, -0.01159007]], dtype=float32)

# Zero-shot prediction

In [38]:
def accuracy(output, target, topk=(1,)):
    """Count top-k hits for a batch of classification scores.

    Args:
        output: Array-like of shape (batch, num_classes) with per-class scores.
        target: Array-like of shape (batch,) with the true class indices.
        topk: Iterable of k values to evaluate.

    Returns:
        A list of floats, one per k in `topk`, giving the NUMBER (not the
        fraction) of samples whose true class is among the k highest scores.
    """
    scores = torch.from_numpy(np.asarray(output))
    labels = torch.from_numpy(np.asarray(target))
    # Never request more candidates than there are classes; `topk` would raise.
    max_k = min(max(topk), scores.shape[1])
    # Shape (max_k, batch): row r holds each sample's (r+1)-th best class index.
    ranked = scores.topk(max_k, dim=1, largest=True, sorted=True)[1].t()
    hits = ranked.eq(labels.view(1, -1).expand_as(ranked))
    # `.item()` yields a Python scalar directly, avoiding the deprecated
    # float() conversion of a one-element ndarray.
    return [float(hits[:k].sum().item()) for k in topk]

In [39]:
len(images.samples)

50000

In [40]:
# Pull the first batch from the loader for a quick sanity check.
# NOTE(review): this rebinds `images`, which previously referred to the
# ImageNet dataset object, to a batch of image data. Cells that use `images`
# as the dataset (e.g. `len(images.samples)`) fail if re-run after this one.
images, target = next(iter(loader))

In [41]:
images

tensor([[[[ 1.2442,  0.9230,  1.5654,  ...,  0.8063,  0.3683,  0.4559],
          [ 1.5654,  0.4267,  0.9376,  ...,  0.1493,  0.5143,  0.7771],
          [ 1.1712,  0.1347,  0.3391,  ...,  0.3975,  0.8063,  0.7625],
          ...,
          [-0.2010, -0.2156, -0.2448,  ...,  0.2515,  0.4559,  0.4559],
          [-0.3032, -0.2594, -0.1718,  ...,  1.0252,  0.3391, -0.2448],
          [-0.3178, -0.3032, -0.1134,  ...,  0.1639, -0.1718, -0.0988]],

         [[ 1.0093,  0.6642,  1.0393,  ...,  1.0694,  0.6041,  0.7092],
          [ 1.2645,  0.1089,  0.5141,  ...,  0.4691,  0.7092,  1.0093],
          [ 0.6642, -0.2363,  0.0338,  ...,  0.6942,  1.0393,  1.0393],
          ...,
          [ 0.0038, -0.0712, -0.1313,  ...,  0.6792,  1.1444,  0.7242],
          [ 0.1239,  0.1239,  0.3190,  ...,  1.4596,  0.6792, -0.2213],
          [ 0.0939,  0.0188,  0.6041,  ...,  0.2289,  0.0038,  0.1389]],

         [[ 0.4395,  0.3115,  0.1266,  ...,  0.8234,  0.5106,  0.7523],
          [ 0.4537,  0.0129, -

In [42]:
# A tensor batch is used as-is; any other batch has its 'pixel_values'
# entries concatenated into a single tensor.
if not isinstance(images, torch.Tensor):
    images['pixel_values'] = torch.cat(images['pixel_values'])

In [43]:
image_model(images)

tensor([[ 0.6033,  0.2574,  0.1195,  ..., -0.4617, -0.0521,  0.2634],
        [ 0.4286,  0.0148,  0.8539,  ..., -0.6012, -0.4711,  0.3360],
        [ 0.9212, -0.1406, -0.0337,  ..., -0.7697, -0.0154,  0.6181],
        ...,
        [ 0.5848,  0.3772,  0.2961,  ...,  0.2487,  0.5082,  0.0399],
        [ 1.3886,  0.9096, -0.0191,  ..., -1.1461, -0.5643,  0.0538],
        [-0.1566,  0.2814,  0.2736,  ..., -0.2751, -0.3831,  0.2298]],
       grad_fn=<MmBackward0>)

In [47]:
# Values of k for which top-k accuracy is tracked.
top_ns = [1, 5, 10, 100]
# Running count of correct predictions for each k in `top_ns`.
acc_counters = [0. for _ in top_ns]
# Running count of evaluated samples.
n = 0.

# Inference only: disable autograd so no graph is kept for each batch.
with torch.no_grad():
    for data, target in tqdm(loader):
        target = target.numpy()

        # A tensor batch is used as-is; any other batch has its
        # 'pixel_values' entries concatenated into one tensor.
        if not isinstance(data, torch.Tensor):
            data['pixel_values'] = torch.cat(data['pixel_values'])

        image_features = image_model(data)
        # Some backends return NumPy arrays already; only tensors need converting.
        if isinstance(image_features, torch.Tensor):
            image_features = image_features.detach().cpu().numpy()
        image_features = image_features / np.linalg.norm(image_features, axis=-1, keepdims=True)
        logits = 100. * image_features @ zeroshot_weights

        # Accumulate the number of top-k hits for every tracked k.
        accs = accuracy(logits, target, topk=top_ns)
        for j, hits in enumerate(accs):
            acc_counters[j] += hits
        # Count samples via the labels, which works for tensor and dict batches alike.
        n += len(target)


  0%|          | 0/3125 [00:00<?, ?it/s]

In [48]:
# Arabic results: convert the hit counts into top-k accuracy percentages.
tops = {
    f'top{k}': hits / n * 100
    for k, hits in zip(top_ns, acc_counters)
}

print(tops)

{'top1': 44.278, 'top5': 67.964, 'top10': 75.202, 'top100': 92.32000000000001}


In [49]:
acc_counters

[22139.0, 33982.0, 37601.0, 46160.0]

In [50]:
# Top-k accuracy in percent: hits for each k divided by the number of samples.
# NOTE(review): this repeats the computation of the "Arabic Results" cell above.
tops = {f'top{top_ns[i]}': acc_counters[i] / n * 100 for i in range(len(top_ns))}

print(tops)

{'top1': 44.278, 'top5': 67.964, 'top10': 75.202, 'top100': 92.32000000000001}


OpenAI:  
    prompt engineering: {'top1': 55.73, 'top5': 83.45}
  
mClip - multilanguage clip:  
    short translation:                      {'top1': 20.146, 'top5': 36.57, 'top10': 42.912, 'top100': 67.106}  
  
clip-italian:  
    short translation:                      {'top1': 22.122, 'top5': 43.672, 'top10': 52.59, 'top100': 81.084}  
    short translation + prompt engineering: {'top1': 21.886, 'top5': 43.086, 'top10': 51.739999999999995, 'top100': 82.06599999999999}  
    long translation:                       {'top1': 21.12, 'top5': 42.472, 'top10': 51.086, 'top100': 81.44}

In [None]:
# results:
# clip_arabic: {'top1': 5.444, 'top5': 14.934, 'top10': 21.248, 'top100': 56.808}
