In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [27]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

[0mCollecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-xarog4ui
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-xarog4ui
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25ldone
[0m

In [28]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import clip
import os
import shutil
import random
from tqdm.notebook import tqdm
from pkg_resources import packaging

# Define basic and modified prompts
basic_prompt = "a photo of a {}"
modified_prompt = "a photo of a {}, a type of pet"

templates1 = [basic_prompt] 
templates2 = [modified_prompt]

In [29]:
def split_images(data_dir):
    images = "/kaggle/working/images"
    if(not os.path.exists(images)):
        os.makedirs(images)
    for filename in os.listdir(data_dir):
        class_name = "_".join(list(filename.split("_"))[:-1])
        class_dir = os.path.join(images, class_name)
        src_dir = os.path.join(data_dir,filename)
        if(not os.path.exists(class_dir)):
            os.makedirs(class_dir)
        shutil.copy(src_dir,class_dir)
    
    shutil.make_archive(os.path.join("/kaggle/working/","oxford_dataset_iiit-pet_splitted"), 'zip', images)


# Create zeroshot weights
def zeroshot_classifier(classnames, templates):
    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [template.format(classname) for template in templates] #format with class
            texts = clip.tokenize(texts).cuda() #tokenize
            class_embeddings = model.encode_text(texts) #embed with text encoder
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).cuda()
    return zeroshot_weights

def accuracy(output, target, topk=(1,)):
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]



In [30]:
# Load pre-trained CLIP model
clip.available_models()
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32")
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [None]:
# split_images(data_dir)

In [31]:
#smaller dataset
data_dir = "/kaggle/working/"
data_set = datasets.OxfordIIITPet(data_dir, transform=preprocess,download = True)
loader = torch.utils.data.DataLoader(data_set, batch_size=32, num_workers=2)
classes = data_set.classes
zeroshot_weights = zeroshot_classifier(classes, templates1)
modified_zeroshot_weights = zeroshot_classifier(classes, templates2)


with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    modified_top1,modified_top5 = 0. , 0.
    for i, (images, target) in enumerate(tqdm(loader)):
        images = images.cuda()
        target = target.cuda()
        
        # predict
        image_features = model.encode_image(images)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights
        modified_logits = 100. * image_features @ modified_zeroshot_weights

        # measure accuracy
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        
        modified_acc1, modified_acc5 = accuracy(modified_logits, target, topk=(1, 5))
        modified_top1 += modified_acc1
        modified_top5 += modified_acc5
        
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100 

modified_top1 = (modified_top1 / n) * 100
modified_top5 = (modified_top5 / n) * 100 

print(f"Top-1 basic-accuracy: {top1:.2f}")
print(f"Top-5 basic-accuracy: {top5:.2f}")

print(f"Top-1 modified-accuracy: {modified_top1:.2f}")
print(f"Top-5 modified-accuracy: {modified_top5:.2f}")

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

Top-1 basic-accuracy: 80.57
Top-5 basic-accuracy: 96.60
Top-1 modified-accuracy: 85.49
Top-5 modified-accuracy: 99.13


In [32]:
#bigger dataset
data_dir = "/kaggle/input/oxford-dataset-iiit-pet-splitted"
data_set = datasets.ImageFolder(data_dir, transform=preprocess)
loader = torch.utils.data.DataLoader(data_set, batch_size=32, num_workers=2)
classes = data_set.classes
zeroshot_weights = zeroshot_classifier(classes, templates1)
modified_zeroshot_weights = zeroshot_classifier(classes, templates2)


with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    modified_top1,modified_top5 = 0. , 0.
    for i, (images, target) in enumerate(tqdm(loader)):
        images = images.cuda()
        target = target.cuda()
        
        # predict
        image_features = model.encode_image(images)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights
        modified_logits = 100. * image_features @ modified_zeroshot_weights

        # measure accuracy
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        
        modified_acc1, modified_acc5 = accuracy(modified_logits, target, topk=(1, 5))
        modified_top1 += modified_acc1
        modified_top5 += modified_acc5
        
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100 

modified_top1 = (modified_top1 / n) * 100
modified_top5 = (modified_top5 / n) * 100 

print(f"Top-1 basic-accuracy: {top1:.2f}")
print(f"Top-5 basic-accuracy: {top5:.2f}")

print(f"Top-1 modified-accuracy: {modified_top1:.2f}")
print(f"Top-5 modified-accuracy: {modified_top5:.2f}")

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/231 [00:00<?, ?it/s]

Top-1 basic-accuracy: 77.93
Top-5 basic-accuracy: 95.01
Top-1 modified-accuracy: 82.21
Top-5 modified-accuracy: 97.24
