In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [18]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

[0m

In [19]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import clip
import os
import shutil
import random
from tqdm.notebook import tqdm
from pkg_resources import packaging

# Prompt templates; `{}` is filled with the class name via str.format.
# The modified prompt is the basic prompt plus a trailing context hint.
basic_prompt = "a photo of a {}"
modified_prompt = basic_prompt + ", a type of pet"

# Each template list is handed to zeroshot_classifier as one prompt "ensemble"
# (here each ensemble holds a single template).
templates1 = [basic_prompt]
templates2 = [modified_prompt]

In [21]:
def split_images(data_dir, output_dir="/kaggle/working/images",
                 archive_base="/kaggle/working/oxford_dataset_iiit-pet_splitted"):
    """Copy a flat folder of files into one sub-folder per class and zip the result.

    The class name of a file is everything before the LAST underscore in its
    file name (``american_bulldog_10.jpg`` -> ``american_bulldog``). A file
    name without any underscore yields an empty class name and is therefore
    copied directly into ``output_dir``.

    Args:
        data_dir: Directory containing the flat list of files to split.
        output_dir: Root directory under which the per-class folders are created.
        archive_base: Path of the zip archive to write, WITHOUT the ``.zip``
            suffix (``shutil.make_archive`` appends it).

    Returns:
        None. The function only has file-system side effects.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(data_dir):
        src_path = os.path.join(data_dir, filename)
        # Only regular files can be copied; shutil.copy raises on directories.
        if not os.path.isfile(src_path):
            continue

        # rpartition keeps everything left of the last "_" ("" if there is none).
        class_name = filename.rpartition("_")[0]
        class_dir = os.path.join(output_dir, class_name)
        os.makedirs(class_dir, exist_ok=True)
        shutil.copy(src_path, class_dir)

    shutil.make_archive(archive_base, 'zip', output_dir)


# Create zeroshot weights
def zeroshot_classifier(classnames, templates):
    """Build zero-shot classifier weights from text prompts.

    For each class name, every template is formatted with that name, tokenized
    with ``clip.tokenize`` and encoded by the module-level ``model``'s text
    encoder. The per-template embeddings are L2-normalized, averaged, and the
    average is normalized again to give one embedding per class.

    Args:
        classnames: Iterable of class-name strings.
        templates: Iterable of format strings with a single ``{}`` placeholder.

    Returns:
        A tensor holding one class embedding per column (the embeddings are
        stacked along ``dim=1``), located on the same device as ``model``.
    """
    # Follow the model's own device instead of hard-coding CUDA, so the
    # function also works when the model was loaded on the CPU.
    device = next(model.parameters()).device

    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [template.format(classname) for template in templates]  # format with class
            tokens = clip.tokenize(texts).to(device)  # tokenize
            class_embeddings = model.encode_text(tokens)  # embed with text encoder
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            # Average over the templates, then re-normalize the mean.
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
    return zeroshot_weights

def accuracy(output, target, boxer_label, topk=(1,)):
    """Count one-vs-rest confusion-matrix entries for the class ``boxer_label``.

    A sample is predicted positive when ``boxer_label`` is among its
    ``max(topk)`` highest-scoring classes, and is actually positive when its
    target equals ``boxer_label``. With the default ``topk=(1,)`` this is the
    plain arg-max prediction.

    Args:
        output: 2-D score tensor; top-k is taken along dimension 1.
        target: Tensor of ground-truth class indices, one per row of ``output``.
        boxer_label: Index of the class treated as the positive class.
        topk: Tuple of k values; only ``max(topk)`` is used.

    Returns:
        Tuple ``(true_positive, false_positive, true_negative, false_negative)``
        of Python ints.
    """
    top_indices = output.topk(max(topk), 1, True, True)[1]
    # Reduce over the k candidates so there is exactly one prediction per sample,
    # keeping the mask aligned with ``target`` for any k.
    predicted_positive = (top_indices == boxer_label).any(dim=1)
    actual_positive = (target == boxer_label).flatten()

    true_positive = int((predicted_positive & actual_positive).sum())
    false_positive = int((predicted_positive & ~actual_positive).sum())
    true_negative = int((~predicted_positive & ~actual_positive).sum())
    false_negative = int((~predicted_positive & actual_positive).sum())

    return (true_positive, false_positive, true_negative, false_negative)



In [22]:
# Load the pre-trained CLIP ViT-B/32 model together with its image preprocessing transform.
clip.available_models()
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32")

# Read a few facts off the loaded model for the summary printed below.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# numel() is the product of a parameter's shape, so the sum is the total parameter count.
print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [None]:
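# One-off preprocessing step, left disabled. Before enabling it, set `data_dir` to the
# flat folder of raw images; `data_dir` is not yet defined at this point in the notebook.
# split_images(data_dir)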

In [24]:
# Small dataset: torchvision's built-in Oxford-IIIT Pet dataset (no split is passed,
# so torchvision's default split is used), downloaded into the working directory.
data_dir = "/kaggle/working/"
data_set = datasets.OxfordIIITPet(data_dir, transform=preprocess, download=True)
loader = torch.utils.data.DataLoader(data_set, batch_size=32, num_workers=2)
classes = data_set.classes
boxer_label = classes.index('Boxer')

# One weight matrix per prompt style, so both can be scored on the same image features.
zeroshot_weights = zeroshot_classifier(classes, templates1)
modified_zeroshot_weights = zeroshot_classifier(classes, templates2)

# Running "Boxer vs. rest" confusion counts: plain names for the basic prompt,
# "m"-prefixed names for the modified prompt.
tp, fp, tn, fn = 0, 0, 0, 0
mtp, mfp, mtn, mfn = 0, 0, 0, 0

with torch.no_grad():
    for images, target in tqdm(loader):
        # Use the notebook-wide `device` instead of hard-coding CUDA.
        images = images.to(device)
        target = target.to(device)

        # predict: cosine similarity between image and class-text embeddings, scaled by 100
        image_features = model.encode_image(images)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights
        modified_logits = 100. * image_features @ modified_zeroshot_weights

        # measure accuracy: accumulate the per-batch confusion counts
        tp1, fp1, tn1, fn1 = accuracy(logits, target, boxer_label)
        tp += tp1
        fp += fp1
        tn += tn1
        fn += fn1

        mtp1, mfp1, mtn1, mfn1 = accuracy(modified_logits, target, boxer_label)
        mtp += mtp1
        mfp += mfp1
        mtn += mtn1
        mfn += mfn1

# Metrics for the basic prompt. Every ratio falls back to 0.0 when its denominator
# is zero (e.g. no positive predictions) instead of raising ZeroDivisionError.
bp = tp / (tp + fp) if (tp + fp) else 0.0
br = tp / (tp + fn) if (tp + fn) else 0.0
bacc = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) else 0.0
bf1 = (2 * bp * br) / (bp + br) if (bp + br) else 0.0
print(f"Basic_prompt: \n  precision-{bp} \n  recall-{br} \n  accuracy-{bacc} \n  f1score-{bf1} \n \n")

# Same metrics for the modified prompt.
mp = mtp / (mtp + mfp) if (mtp + mfp) else 0.0
mr = mtp / (mtp + mfn) if (mtp + mfn) else 0.0
macc = (mtp + mtn) / (mtp + mtn + mfp + mfn) if (mtp + mtn + mfp + mfn) else 0.0
mf1 = (2 * mp * mr) / (mp + mr) if (mp + mr) else 0.0
print(f"Modified_prompt: \n  precision-{mp} \n  recall-{mr} \n  accuracy-{macc}  \n  f1score-{mf1}")

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

Basic_prompt: 
  precision-0.9047619047619048 
  recall-0.38 
  accuracy-0.9820652173913044 
  f1score-0.5352112676056339 
 

Modified_prompt: 
  precision-0.8333333333333334 
  recall-0.85 
  accuracy-0.991304347826087  
  f1score-0.8415841584158417


In [25]:
# Big dataset: images arranged as one folder per class, read through ImageFolder.
data_dir = "/kaggle/input/oxford-dataset-iiit-pet-splitted"
data_set = datasets.ImageFolder(data_dir, transform=preprocess)

loader = torch.utils.data.DataLoader(data_set, batch_size=32, num_workers=2)
# ImageFolder takes its class names from the sub-directory names.
# NOTE(review): those names are inserted into the prompts verbatim; if they contain
# underscores, consider replacing them with spaces before prompting — confirm.
classes = data_set.classes
boxer_label = classes.index('boxer')

# One weight matrix per prompt style, so both can be scored on the same image features.
zeroshot_weights = zeroshot_classifier(classes, templates1)
modified_zeroshot_weights = zeroshot_classifier(classes, templates2)

# Running "boxer vs. rest" confusion counts: plain names for the basic prompt,
# "m"-prefixed names for the modified prompt.
tp, fp, tn, fn = 0, 0, 0, 0
mtp, mfp, mtn, mfn = 0, 0, 0, 0

with torch.no_grad():
    for images, target in tqdm(loader):
        # Use the notebook-wide `device` instead of hard-coding CUDA.
        images = images.to(device)
        target = target.to(device)

        # predict: cosine similarity between image and class-text embeddings, scaled by 100
        image_features = model.encode_image(images)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights
        modified_logits = 100. * image_features @ modified_zeroshot_weights

        # measure accuracy: accumulate the per-batch confusion counts
        tp1, fp1, tn1, fn1 = accuracy(logits, target, boxer_label)
        tp += tp1
        fp += fp1
        tn += tn1
        fn += fn1

        mtp1, mfp1, mtn1, mfn1 = accuracy(modified_logits, target, boxer_label)
        mtp += mtp1
        mfp += mfp1
        mtn += mtn1
        mfn += mfn1

# Metrics for the basic prompt. Every ratio falls back to 0.0 when its denominator
# is zero (e.g. no positive predictions) instead of raising ZeroDivisionError.
bp = tp / (tp + fp) if (tp + fp) else 0.0
br = tp / (tp + fn) if (tp + fn) else 0.0
bacc = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) else 0.0
bf1 = (2 * bp * br) / (bp + br) if (bp + br) else 0.0
print(f"Basic_prompt: \n  precision-{bp} \n  recall-{br} \n  accuracy-{bacc} \n  f1score-{bf1} \n \n")

# Same metrics for the modified prompt.
mp = mtp / (mtp + mfp) if (mtp + mfp) else 0.0
mr = mtp / (mtp + mfn) if (mtp + mfn) else 0.0
macc = (mtp + mtn) / (mtp + mtn + mfp + mfn) if (mtp + mtn + mfp + mfn) else 0.0
mf1 = (2 * mp * mr) / (mp + mr) if (mp + mr) else 0.0
print(f"Modified_prompt: \n  precision-{mp} \n  recall-{mr} \n  accuracy-{macc}  \n  f1score-{mf1}")

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/231 [00:00<?, ?it/s]

Basic_prompt: 
  precision-0.8880597014925373 
  recall-0.595 
  accuracy-0.9870094722598105 
  f1score-0.7125748502994012 
 

Modified_prompt: 
  precision-0.8505154639175257 
  recall-0.825 
  accuracy-0.9913396481732071  
  f1score-0.8375634517766498
