In [1]:
import torch
import clip
import os
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from nltk.corpus import wordnet as wn
from torchvision.datasets import CIFAR100, ImageNet
from google_images_search import GoogleImagesSearch
import PIL
import cv2

Comments:

    1. Try to use polysemous words, ideally with one sense more popular than the other
    
    2. Dataset analysis, underrepresented classes/combination of classes and performance on them
    
    3. Object detection, answer present/absent
    
    4. Understand loss(if present) of robustness when fine tuned on datasets as compared to zero shot evaluation
    
    5. Ability of CLIP to detect actions


In [2]:
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

In [3]:
def get_similarities(images, texts):
    scores = []
    for image in images:
#         print(type(image))
        with torch.no_grad():
            image = preprocess(image).unsqueeze(0).to(device)
            image_features = model.encode_image(image)
            text_features = model.encode_text(texts)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        scores.append(similarity)
    return scores

In [4]:
def load_images(dir_path):
    dir_path = dir_path.replace(' ', '_')
    print(dir_path)
    images = []
    for file in os.listdir(dir_path):
        try:
            images.append(Image.open(dir_path+'/'+file, 'r'))
        except PIL.UnidentifiedImageError as e:
            print(dir_path+'/'+file)
        except IsADirectoryError as e:
            pass
    return images

In [5]:
def blue_to_yellow(a):
    a = np.asarray(a, dtype="int32" ).astype(np.float)
    new_a = (a[:,:,0]+a[:,:,1], a[:,:,0]-a[:,:,1], a[:,:,2])
    new_a = (new_a[2], new_a[1], new_a[0])
    #print(np.abs(a[:,:,0] - (new_a[0]+new_a[1])/2).max())
    new_a = np.stack( ((new_a[0]+new_a[1])/2, (new_a[0]-new_a[1])/2, new_a[2]), axis=-1)
    return Image.fromarray(new_a.astype(np.uint8), "RGB" )

def invert(image):
    return PIL.ImageOps.invert(image)

def alter_brightness(image, factor):
    enhancer = PIL.ImageEnhance.Brightness(image)
    return enhancer.enhance(factor)

def rotate(image, angle):
    return image.rotate(angle)

def add_salt_and_pepper(image, amount):
    output = np.copy(np.array(image))
    # add pepper
    nb_pepper = np.ceil(amount* output.size * 0.5)
    coords = [np.random.randint(0, i - 1, int(nb_pepper)) for i in output.shape[:2]]
    for i, j in zip(coords[0], coords[1]):
        output[i][j] = 0
    
    # add salt
    nb_salt = np.ceil(amount * output.size * 0.5)
    coords = [np.random.randint(0, i - 1, int(nb_salt)) for i in output.shape[:2]]
    for i, j in zip(coords[0], coords[1]):
        output[i][j] = 256
#     output[coords] = 1    
    return Image.fromarray(output)

def add_gaussian_noise(image, sigma):
    output = np.copy(np.array(image))
    noise = np.random.normal(size=output.shape[:2])*sigma
    output += np.stack((noise, noise, noise), axis=-1).astype(np.uint8)
    return Image.fromarray(output)

def transformation(dir_path, transform):
    dir_path = dir_path.replace(' ', '_')
    print(dir_path)
    images = []
    for file in os.listdir(dir_path):
        image = Image.open(dir_path+'/'+file, 'r')
        image = transform(image)
        images.append(image)
    return images

images = transformation(image_dir+'/'+"yellow sun in blue sky", lambda x: rotate(x, -45))

NameError: name 'image_dir' is not defined

In [6]:
def get_scores(queries, text_probes=None):
    images = dict([(q, load_images(image_dir+'/'+q)) for q in queries])
    if text_probes is None:
        text_probes = ["a photo of an "+q for q in queries]
    text_probes = torch.cat([clip.tokenize(text) for text in text_probes]).to(device)
    scores= dict([])
    for q, imgs in images.items():
        scores[q] = get_similarities(imgs, text_probes) 
    return scores

In [7]:
image_dir = './images_google_search'

# Polysemous words

## Apple (fruit vs logo)

In [42]:
queries = ["apple fruit cartoon", "apple fruit"]
scores = get_scores(queries, text_probes=["a drawing of an apple", "a photo of an apple"])
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])

./images_google_search/apple_fruit_cartoon
./images_google_search/apple_fruit


In [43]:
predictions

array([[34, 10],
       [ 4, 40]])

In [41]:
predictions

array([[35,  9],
       [ 3, 41]])

Probes tried:

    1. ["a photo of the Apple logo", "a photo of an apple fruit"]
        [[50,  0],
        [17, 33]]
    2. ["a photo of a tree", "a photo of an apple fruit"] (contrast with tree)
        [[50,  0],
       [ 0, 50]]
    

## Tree (Graph vs normal)

In [254]:
queries = ["tree", "tree graph"]
scores = get_scores(queries, text_probes=["a photo of a real tree", "a drawing of a tree diagram"])
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])

./images_google_search/tree
./images_google_search/tree_graph


In [255]:
predictions

array([[43,  7],
       [ 0, 49]], dtype=int64)

Probes tried: 

    1. ["a photo of a tree", "a photo of a tree graph"] 
        [[37, 13],
       [ 0, 49]]
       
    2. ["a photo of a natural tree", "a photo of a tree graph"]  
        [(40, 10), 
        (0, 49)]
        
    3. ["a photo of a planted tree", "a photo of a tree graph"]
        [(35, 15), 
        (0, 49)]
        
    4.  ["a photo of a natural tree", "a photo of a tree diagram"] 
        [[42,  8],
        [ 0, 49]]


## Basket (wicker vs basketball)

In [77]:
queries = ["wicker basket", "basketball basket"]
scores = get_scores(queries, text_probes=["wooden basket", "basketball basket"])
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])

./images_google_search/wicker_basket
./images_google_search/basketball_basket


In [79]:
predictions

array([[50,  0],
       [ 0, 51]], dtype=int64)

In [249]:
queries = ["finger nail", "iron nail"]
scores = get_scores(queries, text_probes=["a photo of a human nail", "a photo of a tool nail"])
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/finger_nail
./images_google_search/iron_nail


array([[49,  0],
       [ 0, 51]], dtype=int64)

In [85]:
predictions

array([[49,  0],
       [ 3, 48]], dtype=int64)

In [91]:
images = dict([(q, load_images(image_dir+'/'+q)) for q in ["iron nail"]])

./images_google_search/iron_nail


In [97]:
queries = ["file tool", "file stationery"]
scores = get_scores(queries)
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])

./images_google_search/file_tool
./images_google_search/file_stationery


In [98]:
predictions

array([[49,  0],
       [ 0, 51]], dtype=int64)

In [24]:
queries = ["man", "cartoon man"]
scores = get_scores(queries, text_probes=["a photo of a real man", "a picture of a cartoon man"])
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])

./images_google_search/man
./images_google_search/cartoon_man


In [25]:
predictions

array([[41,  9],
       [ 0, 49]])

In [118]:
queries = ["fat man", "thin man"]
scores = get_scores(queries, text_probes=None)
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/fat_man
./images_google_search/thin_man


array([[44,  6],
       [ 3, 47]], dtype=int64)

In [117]:
queries = ["tall man", "short man"]
scores = get_scores(queries, text_probes=None)
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/tall_man
./images_google_search/short_man


array([[34, 16],
       [26, 23]], dtype=int64)

In [119]:
queries = ["fat cow", "thin cow"]
scores = get_scores(queries, text_probes=None)
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/fat_cow
./images_google_search/thin_cow


array([[23, 27],
       [ 1, 49]], dtype=int64)

In [120]:
queries = ["tall cow", "short cow"]
scores = get_scores(queries, text_probes=None)
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/tall_cow
./images_google_search/short_cow


array([[30, 19],
       [ 4, 46]], dtype=int64)

In [125]:
queries = ["sitting man", "standing man", "running man", "walking man", "smiling man", "frowning man"]
scores = get_scores(queries, text_probes=None)
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/sitting_man
./images_google_search/standing_man
./images_google_search/running_man
./images_google_search/walking_man
./images_google_search/smiling_man
./images_google_search/frowning_man


array([[49,  0,  0,  0,  1,  0],
       [ 0, 48,  0,  1,  0,  0],
       [ 0,  0, 48,  0,  1,  0],
       [ 0, 12,  6, 33,  0,  0],
       [ 1,  3,  0,  0, 46,  0],
       [ 1,  0,  0,  0,  1, 47]], dtype=int64)

In [235]:
queries = ["green cat", "green frog", "yellow cat"]
scores = get_scores(queries, text_probes=None)
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/green_cat
./images_google_search/green_frog
./images_google_search/yellow_cat


array([[48,  0,  2],
       [ 0, 50,  0],
       [ 1,  0, 50]], dtype=int64)

In [246]:
queries = ["green grass", "red grass", "red ball"]
scores = get_scores(queries, text_probes=["a photo of green grass", "a photo of red grass", "a photo of red ball"])
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])
predictions

./images_google_search/green_grass
./images_google_search/red_grass
./images_google_search/red_ball


array([[52,  0,  0],
       [ 4, 46,  0],
       [ 0,  0, 49]], dtype=int64)

In [14]:
classes = ["cat","dog", "deer", "cow", "pig"]
queries = [c+" on grass during day" for c in classes]
scores = get_scores(queries, text_probes=["a cartoon of a "+c+" on grass" for c in classes])
predictions = np.array([np.bincount([int(s.argmax()) for s in scores[q]], minlength=len(queries)) for q in queries])

./images_google_search/cat_on_grass_during_day
./images_google_search/dog_on_grass_during_day
./images_google_search/deer_on_grass_during_day
./images_google_search/cow_on_grass_during_day
./images_google_search/pig_on_grass_during_day


In [13]:
predictions

array([[64,  0,  0,  0,  0],
       [ 0, 91,  0,  1,  1],
       [ 0,  1, 43,  0,  0],
       [ 0,  0,  1, 42,  1],
       [ 0,  0,  0,  0, 48]])

In [15]:
predictions

array([[64,  0,  0,  0,  0],
       [ 1, 91,  0,  1,  0],
       [ 0,  1, 43,  0,  0],
       [ 0,  1,  0, 43,  0],
       [ 0,  0,  0,  0, 48]])

In [11]:
predictions

array([[55,  2,  0,  0,  7],
       [ 1, 72,  1,  7, 12],
       [ 1,  0, 37,  4,  2],
       [ 0,  1,  1, 38,  4],
       [ 0,  1,  1,  7, 39]])