In [1]:
import torch
import clip
from PIL import Image
import json
from tqdm import tqdm
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset, Dataset
import numpy as np
import os 
import imageio
import matplotlib.pyplot as plt
import json
from scipy import spatial
import language_evaluation
from tokenizers import Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [100]:
a = np.array([[[1,2],[3,4]], [[11,22],[33,44]], [[10,20],[30,40]]])
b = -np.array([[[1,2],[3,4]], [[11,22],[33,44]], [[10,20],[30,40]]])
c = [a,b]
np.vstack(c).shape

(6, 2, 2)

In [7]:
a[:,0,:]

array([[ 1,  2],
       [11, 22],
       [10, 20]])

In [101]:
np.argmax(a[:,1,:], axis=-1)

array([1, 1, 1])

## Read annotaion, predict json file

In [21]:
import json
with open('./pred_v1.json') as json_file:
    pred = json.load(json_file)
    
with open('../hw3_data/p2_data/val.json') as val_file:
    truth = json.load(val_file)

In [25]:
pred

{'5769934076': 'A man in a white hat and hat is cutting a cake.',
 '000000304355': 'A bathroom with a toilet, sink, and shower.',
 '3191169746': 'A man is walking through a forest with trees.',
 '000000295642': 'A train that is sitting on the tracks.',
 '000000336937': 'A bunch of toilets are lined up in a row',
 '000000122597': 'A bunch of stuffed animals sitting on a table',
 '000000182416': 'A man sitting on a curb next to a red fire hydrant.',
 '000000480313': 'A man holding a hot dog in a bun.',
 '229059021': 'A man is laying on the ground with his feet up.',
 '3677954655': 'A man is jumping over a pole on the sidewalk.',
 '4725077313': 'A man in a blue shirt and blue jeans sits at a table.',
 '000000543042': 'A black and white photo of a boy and a cat.',
 '2444070322': 'A woman in a blue shirt and blue pants is holding a real frog.',
 '000000257301': 'Two women and two dogs are walking down a trail.',
 '4854738791': 'Two men working on a boat in the woods.',
 '000000312282': 'A c

In [22]:
class HW3_2_eval(Dataset):
    def __init__(self, ann):
        super().__init__()
        self.annot = [(val['image_id'], val['caption']) for val in ann['annotations']]
        
        #file dict: id2filename
        self.file_dict = {}
        for file in ann['images']:
            self.file_dict[file['id']] = file['file_name'] 
    
    def __len__(self):
        return len(self.annot)

    def __getitem__(self, idx):        
        image_id, caption = self.annot[idx]
        filename = self.file_dict[image_id].split('.')[0]
        return filename, caption

## annotation dictionary

In [23]:
ann = HW3_2_eval(truth)
ann_dict = {}
for i in range(len(ann)):
    if ann[i][0] not in ann_dict:
        ann_dict[ann[i][0]] = [ann[i][1]]
    ann_dict[ann[i][0]] = ann_dict[ann[i][0]] + [ann[i][1]]

## stored sorted results

In [24]:
sort_caps = []
sort_preds = []
for keys, values in pred.items():
    caps = ann_dict[keys]
    sort_caps.append(caps)
    sort_preds.append(values)

## compute cider

In [25]:
evaluator = language_evaluation.CocoEvaluator()
results = evaluator.run_evaluation(sort_preds, sort_caps)
cider = results['CIDEr']
print('cider:', cider)

PTBTokenizer tokenized 134385 tokens at 1002181.43 tokens per second.
PTBTokenizer tokenized 21385 tokens at 319786.13 tokens per second.
Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.dist

SPICE evaluation took: 47.17 s
cider: 0.9945605886662228


## Clip score

In [26]:
def get_clipscore(img_embed, text_embed):
    w = 2.5
    cos_sim = 1 - spatial.distance.cosine(img_embed, text_embed)
    return w*max(cos_sim, 0)

In [27]:
# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

#data path
val_path = '../hw3_data/p2_data/images/val/'
path_list = [os.path.join(val_path,x) for x in os.listdir(val_path) if x.endswith(".jpg")]
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

im_feat_full = []
txt_feat_full = []
for i, path in tqdm(enumerate(path_list), total=len(path_list)):
    image = Image.open(path)
    filename = path.split('/')[-1].split('.')[0]
    caption = pred[filename]
    image_input = preprocess(image).unsqueeze(0).to(device)
    text_input = clip.tokenize(caption).to(device)
    
    # Calculate features
    with torch.no_grad():
        image_features = model.encode_image(image_input) #(1,512)
        text_features = model.encode_text(text_input)
        im_feat_full.append(image_features.cpu().numpy().reshape(-1))
        txt_feat_full.append(text_features.cpu().numpy().reshape(-1))

im_embed = np.hstack(im_feat_full)
txt_embed = np.hstack(txt_feat_full)
clipscore = get_clipscore(im_embed, txt_embed)
print('clipscore:', clipscore)

100%|██████████| 1789/1789 [00:49<00:00, 35.81it/s]

clipscore: 0.7177734375



