# Imagenet Evaluation Script
modified from [the evluation script by OpenAI](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Prompt_Engineering_for_ImageNet.ipynb).

In [95]:
# # !pip install -q -U jax jaxlib
# # !pip install -q pandas
# !pip install -q ipywidgets
# !pip install -q -U flax
# !pip install -q sentence-transformers
# #!pip install -q git+https://github.com/huggingface/transformers.git
# !pip install -q transformers
# !pip install -q torch torchvision

In [96]:
import os 
import sys
import json

import numpy as np
import pandas as pd

os.environ['TOKENIZERS_PARALLELISM'] = "false"

import transformers
from transformers import AutoTokenizer

import torch
import torchvision
from torchvision import transforms
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize, ToTensor
from torchvision.transforms.functional import InterpolationMode
from tqdm.notebook import tqdm

# !wget -q -N https://github.com/huggingface/transformers/raw/master/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py
# !wget -q -N https://github.com/huggingface/transformers/raw/master/examples/research_projects/jax-projects/hybrid_clip/configuration_hybrid_clip.py

sys.path.append('.')

from modeling_hybrid_clip import FlaxHybridCLIP

# Choosing the model to evaluate

In [97]:
# Model selection

MODEL_TYPE = 'mClip'
#MODEL_TYPE = 'clip_italian'
# MODEL_TYPE = 'clip_arabic'

In [98]:
CONFIG_FILE = '/home/think3/Desktop/training_CLIP/model_allcaptions/39/config.json'
MODEL_FILE = '/home/think3/Desktop/training_CLIP/model_allcaptions/39/flax_model.msgpack'

In [99]:
from configuration_hybrid_clip import HybridCLIPConfig

with open(CONFIG_FILE, 'r') as f:
    config_dict = json.load(f)
# config_dict['text_config']
config_dict['vision_config']['model_type'] = 'clip'
config = HybridCLIPConfig(text_config=config_dict['text_config'], vision_config=config_dict['vision_config'])

{'text_config': {'_name_or_path': 'aubmindlab/bert-large-arabertv2', 'add_cross_attention': False, 'architectures': ['BertForMaskedLM'], 'attention_probs_dropout_prob': 0.1, 'bad_words_ids': None, 'begin_suppress_tokens': None, 'bos_token_id': None, 'chunk_size_feed_forward': 0, 'classifier_dropout': None, 'cross_attention_hidden_size': None, 'decoder_start_token_id': None, 'diversity_penalty': 0.0, 'do_sample': False, 'early_stopping': False, 'encoder_no_repeat_ngram_size': 0, 'eos_token_id': None, 'exponential_decay_length_penalty': None, 'finetuning_task': None, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'hidden_size': 1024, 'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'}, 'initializer_range': 0.02, 'intermediate_size': 4096, 'is_decoder': False, 'is_encoder_decoder': False, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'layer_norm_eps': 1e-12, 'length_penalty': 1.0, 'max_length': 20, 'max_position_embeddings': 512, 'min_leng

In [100]:
# !pip install -U sentence-transformers

# Loading the model

In [101]:
from PIL import Image

if MODEL_TYPE == 'mClip':
    from sentence_transformers import SentenceTransformer
    # Here we load the multilingual CLIP model. Note, this model can only encode text.
    # If you need embeddings for images, you must load the 'clip-ViT-B-32' model
    se_language_model = SentenceTransformer('clip-ViT-B-32-multilingual-v1')
    se_image_model = SentenceTransformer("clip-ViT-B-32")
    language_model = lambda queries: se_language_model.encode(queries, convert_to_tensor=True, show_progress_bar=False).cpu().detach().numpy()
    image_model = lambda images: se_image_model.encode(images, batch_size=128, convert_to_tensor=True, show_progress_bar=False).cpu().detach().numpy()
elif MODEL_TYPE == 'clip_italian':
    import jax
    from jax import numpy as jnp
    TOKENIZER_NAME = "dbmdz/bert-base-italian-xxl-uncased"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, cache_dir=None, use_fast=True)
    model = FlaxHybridCLIP.from_pretrained("clip-italian/clip-italian")
    def tokenize(texts):
        inputs = tokenizer(texts, max_length=96, padding="max_length", return_tensors="np")
        return inputs['input_ids'], inputs['attention_mask']

    language_model = lambda queries: np.asarray(model.get_text_features(*tokenize(queries)))
    image_model = lambda images: np.asarray(model.get_image_features(images.permute(0, 2, 3, 1).numpy(),))
elif MODEL_TYPE == 'clip_arabic':
    import jax
    from jax import numpy as jnp
    TOKENIZER_NAME = "aubmindlab/bert-large-arabertv2"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, cache_dir=None, use_fast=True)
    model = FlaxHybridCLIP.from_pretrained(MODEL_FILE, config=config)
    def tokenize(texts):
        inputs = tokenizer(texts, max_length=128, padding="max_length", return_tensors="np",truncation=True)
        return inputs['input_ids'], inputs['attention_mask']

    language_model = lambda queries: np.asarray(model.get_text_features(*tokenize(queries)))
    image_model = lambda images: np.asarray(model.get_image_features(images.permute(0, 2, 3, 1).numpy(),))

# Preparing the translated ImageNet labels

In [102]:
# !wget -N -q https://raw.githubusercontent.com/clip-italian/clip-italian/imagenet_templates/evaluation/imagenet_labels_IT.tsv
# classes_df = pd.read_csv('./imagenet_labels_IT.tsv', sep='\t', header=0)
classes_df = pd.read_csv("arabic_templates_ar.tsv", sep='\t', header=0)
imagenet_classes = list(classes_df['query_short_translated_ar'])  # list(classes_df['query_long_translated'])
imagenet_templates = ['{}']

print(f"{len(imagenet_classes)} classes, {len(imagenet_templates)} templates")

1000 classes, 1 templates


In [103]:
# classes_df

# Set up Validation Set

In [104]:
val_preprocess = transforms.Compose([
    Resize([224], interpolation=InterpolationMode.BICUBIC),
    CenterCrop(224),
    ToTensor(),
    Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])

In [105]:
IMAGENET_ROOT = "/home/think3/Desktop/training_CLIP/imagenet_root"


In [144]:
from PIL import Image as PilImage
import natsort

class CustomDataSet(torchvision.datasets.ImageNet):
    def __init__(self, main_dir, transform):
        self.main_dir = main_dir
        self.transform = transform
        all_imgs = os.listdir(main_dir)
        self.total_imgs = natsort.natsorted(all_imgs)

    def __getitem__(self, idx):
        img_loc = os.path.join(self.main_dir, self.total_imgs[idx])
        image = PilImage.open(img_loc).convert("RGB")
        # tensor_image = self.transform(image)
        # print(type(image))
        return str(image)

In [107]:
# images = CustomDataSet(IMAGENET_ROOT, transform=val_preprocess)
# images = CustomDataSet(IMAGENET_ROOT, split='val')


In [157]:
# print('Downloading Imagenet validation set...')
# !wget -N -q --show-progress https://image-net.org/data/ILSVRC/2012/ILSVRC2012_img_val.tar
# print('Downloading Imagenet devkit...')
# !wget -N -q --show-progress https://image-net.org/data/ILSVRC/2012/ILSVRC2012_devkit_t12.tar.gz
# print('Done.')

images = torchvision.datasets.ImageNet(IMAGENET_ROOT, split='val', transform=val_preprocess)
# images = CustomDataSet(IMAGENET_ROOT, transform=val_preprocess)

loader = torch.utils.data.DataLoader(
    images,
    batch_size=1024,
    shuffle=False,
    num_workers=2,
    persistent_workers=True,
    drop_last=True
)

In [155]:
# next(iter(loader))

# Creating zero-shot classifier weights

In [123]:
def zeroshot_classifier(classnames, templates):
    zeroshot_weights = []
    for classname in tqdm(classnames):
        texts = [template.format(classname) for template in templates]
        class_embeddings = language_model(texts)
        class_embeddings = class_embeddings / np.linalg.norm(class_embeddings, axis=-1, keepdims=True)
        class_embedding = np.mean(class_embeddings, axis=0)
        class_embedding /= np.linalg.norm(class_embedding, axis=-1)
        zeroshot_weights.append(class_embedding)
    zeroshot_weights = np.stack(zeroshot_weights, axis=1)
    return zeroshot_weights


zeroshot_weights = zeroshot_classifier(imagenet_classes, imagenet_templates)

  0%|          | 0/1000 [00:00<?, ?it/s]

tokenize
['سمك التنش']
tokenize
['السمكة الذهبية']
tokenize
['القرش الأبيض الكبير']
tokenize
['القرش الببري']
tokenize
['القرش المطرقة']
tokenize
['سمك الرعاد']
tokenize
['سمك الرقيطة']
tokenize
['ديك']
tokenize
['دجاجة']
tokenize
['نعامة']
tokenize
['الشرشور الجبلي']
tokenize
['طائر الحسون']
tokenize
['طائر التفاحي الاوروبي']
tokenize
['طائر الجنك داكن العيون']
tokenize
['طائر الدرسة السماوي']
tokenize
['طائر ابو الحناء']
tokenize
['بلبل']
tokenize
['طائر القيق']
tokenize
['عقعق طائر الذيل الطويل']
tokenize
['طائر القرقف']
tokenize
['طائر الغطاس']
tokenize
['طائر الحدأة']
tokenize
['طائر العقاب الرخمة']
tokenize
['نسر']
tokenize
['البومة الرمادية']
tokenize
['السمندر الناري']
tokenize
['ليسوتريتون فولجاريس']
tokenize
['السمندر المائي']
tokenize
['السمندر المرقط']
tokenize
['السمندر المكسيكي']
tokenize
['ضفدع الثور الامريكي']
tokenize
['ضفدع الشجر']
tokenize
['الضفادع ذات الذيل']
tokenize
['السلحفاة البحرية ضخمة الرأس']
tokenize
['سلحفاة المحيط جلدية الظهر']
tokenize
['سلحفاة الطين']
t

# Zero-shot prediction

In [149]:
def accuracy(output, target, topk=(1,)):
    output = torch.from_numpy(np.asarray(output))
    target = torch.from_numpy(np.asarray(target))
    pred = output.topk(max(topk), dim=1, largest=True, sorted=True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]

In [166]:
len(images.samples)

50000

In [171]:
top_ns = [1, 5, 10, 100]
acc_counters = [0. for _ in top_ns]
n = 0.

for i, (data, target) in enumerate(tqdm(loader)):
    data = data
    target = target.numpy()
    # predict
    # print(images)
    
    if ((i+1)*1024) > len(images.samples):
        batch_paths = [images.samples[idx][0] for idx in range(i*1024, len(images.samples))]
    else:
        batch_paths = [images.samples[idx][0] for idx in range(i*1024, (i+1)*1024)]
    image_features = image_model(batch_paths)
    image_features = image_features / np.linalg.norm(image_features, axis=-1, keepdims=True)
    logits = 100. * image_features @ zeroshot_weights

    # measure accuracy
    accs = accuracy(logits, target, topk=top_ns)
    for j in range(len(top_ns)):
        acc_counters[j] += accs[j]
    n += data.shape[0]



  0%|          | 0/49 [00:00<?, ?it/s]

tokenize
['/home/think3/Desktop/training_CLIP/imagenet_root/val/n01440764/ILSVRC2012_val_00000293.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00015416.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00016677.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00017384.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00017712.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00017970.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00021650.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00023125.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00023850.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n01534433/ILSVRC2012_val_00024884.JPEG', '/home/think3/Desktop/training_CLIP/imag

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/queues.py", line 239, in _feed
    reader_close()
  File "/usr/lib/python3.9/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.9/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.9/multiprocessing/queues.py", line 271, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/queues.py", line 239, in _fe

tokenize
['/home/think3/Desktop/training_CLIP/imagenet_root/val/n09193705/ILSVRC2012_val_00008461.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00022056.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00026174.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00026197.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00027368.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00028134.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00028410.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00028512.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00028858.JPEG', '/home/think3/Desktop/training_CLIP/imagenet_root/val/n07930864/ILSVRC2012_val_00028868.JPEG', '/home/think3/Desktop/training_CLIP/imag

In [169]:
acc_counters

[43.0, 250.0, 530.0, 4835.0]

In [172]:
tops = {f'top{top_ns[i]}': acc_counters[i] / n * 100 for i in range(len(top_ns))}

print(tops)

{'top1': 0.086, 'top5': 0.5, 'top10': 1.06, 'top100': 10.068000000000001}


OpenAI:  
    prompt engineering: {top1: 55.73, 'top5': 83.45}
  
mClip - multilanguage clip:  
    short translation:                      {'top1': 20.146, 'top5': 36.57, 'top10': 42.912, 'top100': 67.106}  
  
clip-italian:  
    short translation:                      {'top1': 22.122, 'top5': 43.672, 'top10': 52.59, 'top100': 81.084}  
    short translation + prompt engineering: {'top1': 21.886, 'top5': 43.086, 'top10': 51.739999999999995, 'top100': 82.06599999999999}  
    long tanslation:                        {'top1': 21.12, 'top5': 42.472, 'top10': 51.086, 'top100': 81.44}

In [None]:
# results:
# clip_arabic: {'top1': 5.444, 'top5': 14.934, 'top10': 21.248, 'top100': 56.808}
