In [1]:
import sys

In [2]:
%load_ext autoreload

In [3]:
%autoreload 1

In [4]:
import argparse
import time
import torch
import torchvision.transforms as transforms
import numpy as np
import os

from misc.dataset import CocoCaptionsRV, Multi30k
from misc.evaluation import eval_recall, eval_recall5
from misc.model import joint_embedding
from misc.utils import collate_fn_padded
from torch.utils.data import DataLoader
import torch.utils.data as data

os.environ["CUDA_VISIBLE_DEVICES"]="3"

## Parameters

In [5]:
normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

prepro_val = transforms.Compose([
        transforms.Resize((400, 400)),
        transforms.ToTensor(),
        normalize,
    ])

In [6]:
class arguments:
    def __init__(self, dict):
        self.dict = '/data/m.portaz/'+dict

In [7]:
batch_size = 128

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Test Evaluation

In [10]:
def cosine_sim(A, B):
    """
        Return similarity of each image with each caption
        One line of the output matrix correspond to one image
        Each row correspond to one caption
    """
    img_norm = np.linalg.norm(A, axis=1)
    caps_norm = np.linalg.norm(B, axis=1)
    scores = np.dot(A, B.T)
    norms = np.dot(np.expand_dims(img_norm, 1),np.expand_dims(caps_norm.T, 1).T)
    scores = (scores / norms)
    return scores

In [11]:
def multilingual_recall(imgs, caps, indices, ks=[1,5,10]):
    """
        Compute multingual recall
    """
    imgs = np.vstack(imgs)
    caps = np.vstack(caps)

    scores = -cosine_sim(imgs, caps)
    ranks = np.argsort(np.argsort(scores))
    # scores represent all the similarity between each images and each captions
    recall = {k:0 for k in ks}
    nb_imgs, nb_caps = ranks.shape
    for i in range(nb_imgs):
        for k in ks:
            for j in range(nb_caps):
                if indices[j] == i and ranks[i][j] < k: #if the caption correspond to the image and is ranked less than k
                    recall[k] += 1
                
    scores = np.transpose(scores)
    ranks_caps = np.argsort(np.argsort(scores))
    recall_caps = {k:0 for k in ks}
    print("Caption ranks :", ranks_caps)
    nb_caps, nb_imgs = ranks.shape
    for i in range(nb_caps):
        for k in ks:
            if ranks[indices[i]][i] < k:
                recall_caps[k] += 1
    
    return [recall[k] / imgs.shape[0]*100 for k in ks], [recall_caps[k] / ranks_caps.shape[0]*100 for k in ks]

# Models to evaluate
## with their corresponding dictionnaries

In [9]:
models = []

#english only models
models.append(["best_sota_coco.pth.tar", 
                {'en':"wiki.en.bin"}
              ]) # state of the art model

models.append(["best_w2vec.pth.tar", 
               {'en':"w2vec_model_vec.en.vec"}
              ]) # word2vec model
               
models.append(["best_bivec_coco.pth.tar", 
               {'en':"bivec_model_vec.en-fr.en.vec"}
               ]) # bivec on coco only
               
models.append(["best_correct_en.pth.tar", 
               {'en':"wiki.multi.en.vec"}
              ]) # muse on english


#multilingual models
models.append(["best_bivec_enfr.pth.tar",
               {'en':"bivec_model_vec.en-fr.en.vec",
                'fr':"bivec_model_vec.en-fr.fr.vec"}
            ])

models.append(["best_bivec_de.pth.tar", 
               {'en':"bivec_model_vec.en-de.en.vec",
                'de':"bivec_model_vec.en-de.de.vec"}
              ])
                

models.append(["best_correct_enfr.pth.tar", 
               {'en':"wiki.multi.en.vec",
                'fr':"wiki.multi.fr.vec",
                'fr':"wiki.multi.de.vec",
                'fr':"wiki.multi.cs.vec"}
              ])
                

models.append(["best_correct_full_enfrde.pth.tar",
               {'en':"wiki.multi.en.vec",
                'fr':"wiki.multi.fr.vec",
                'fr':"wiki.multi.de.vec",
                'fr':"wiki.multi.cs.vec"}
              ])
                
models.append(["best_correct_full_enfrde.pth.tar", 
               {'en':"wiki.multi.en.vec",
                'fr':"wiki.multi.fr.vec",
                'fr':"wiki.multi.de.vec",
                'fr':"wiki.multi.cs.vec"}
              ])

# Evaluation on english

In [12]:
np.sum([1,2,3])

6

In [10]:
for model, dic in models:
    aa = arguments(dic['en'])
    coco_dataset = CocoCaptionsRV(aa, sset="val", transform=prepro_val)

    coco_dataset_loader = DataLoader(coco_dataset, batch_size=batch_size, shuffle=False,
                                num_workers=6, collate_fn=collate_fn_padded, pin_memory=True)
    
    # load model
    checkpoint = torch.load("weights/"+model, map_location=lambda storage, loc: storage)
    join_emb = joint_embedding(checkpoint['args_dict']).cuda()
    join_emb.load_state_dict(checkpoint["state_dict"])
    join_emb = torch.nn.DataParallel(join_emb.cuda().eval())
    
    
    imgs_enc = list()
    caps_enc = list()

    for i, (imgs, caps, lengths) in enumerate(coco_dataset_loader, 0):
        print("%2.2f" % (i*batch_size/len(coco_dataset)*100), "\%", end="\r")
        input_imgs, input_caps = imgs.cuda(), caps.cuda()
        with torch.no_grad():
            output_imgs, output_caps = join_emb(input_imgs, input_caps, lengths)

        imgs_enc.append(output_imgs.cpu().data.numpy())
        caps_enc.append(output_caps.cpu().data.numpy())
    

    print(model, eval_recall5(imgs_enc, caps_enc))

best_sota_coco.pth.tar [array([66.08, 90.7 , 96.2 ]), array([54.124, 85.748, 92.928]), 0.0, 0.0]


FileNotFoundError: [Errno 2] No such file or directory: '/data/m.portaz/w2vec_model_vec.en.vec'

# Multilang evaluation

In [9]:
%aimport misc.dataset

In [10]:
dataset_multi = misc.dataset.MultiLingualDataset(filename="data/image_splits/test_2016_flickr.txt",
                              image_dir="/data/datasets/flickr30k_images", 
                              captionsFileList=
                                  [("data/tok/test_2016_flickr.lc.norm.tok.en",'en'),
                                    ("data/tok/test_2016_flickr.lc.norm.tok.fr",'fr'),
                                   ("data/tok/test_2016_flickr.lc.norm.tok.de",'de'),
                                   ("data/tok/test_2016_flickr.lc.norm.tok.cs",'cs')
                                  ],
                              dictDict={'en':'/data/m.portaz/wiki.multi.en.vec',
                                        'fr':"/data/m.portaz/wiki.multi.fr.vec",
                                        'de':"/data/m.portaz/wiki.multi.de.vec",
                                        'cs':"/data/m.portaz/wiki.multi.cs.vec"
                                       },
                              transform=prepro_val
                             )

In [12]:
image_dataset = misc.dataset.ImageDataset("data/image_splits/test_2016_flickr.txt",
                            "/data/datasets/flickr30k_images",
                            transform=prepro_val
                            )

In [13]:
english_dataset = misc.dataset.CaptionDataset("data/tok/test_2016_flickr.lc.norm.tok.en",
                                "/data/m.portaz/wiki.multi.en.vec")
french_dataset  = misc.dataset.CaptionDataset("data/tok/test_2016_flickr.lc.norm.tok.fr",
                                "/data/m.portaz/wiki.multi.fr.vec")


In [None]:
image_dataset[0]

In [61]:
for model, dic in models:
    
    
    
    for lang in dic:
        
    aa = arguments(dic['en'])
    coco_dataset = CocoCaptionsRV(aa, sset="val", transform=prepro_val)

    coco_dataset_loader = DataLoader(coco_dataset, batch_size=batch_size, shuffle=False,
                                num_workers=6, collate_fn=collate_fn_padded, pin_memory=True)
    
    # load model
    checkpoint = torch.load("weights/"+model, map_location=lambda storage, loc: storage)
    join_emb = joint_embedding(checkpoint['args_dict']).cuda()
    join_emb.load_state_dict(checkpoint["state_dict"])
    join_emb = torch.nn.DataParallel(join_emb.cuda().eval())
    
    
    imgs_enc = list()
    caps_enc = list()

    for i, (imgs, caps, lengths) in enumerate(coco_dataset_loader, 0):
        print("%2.2f" % (i*batch_size/len(coco_dataset)*100), "\%", end="\r")
        input_imgs, input_caps = imgs.cuda(), caps.cuda()
        with torch.no_grad():
            output_imgs, output_caps = join_emb(input_imgs, input_caps, lengths)

        imgs_enc.append(output_imgs.cpu().data.numpy())
        caps_enc.append(output_caps.cpu().data.numpy())
    

    print(model, eval_recall5(imgs_enc, caps_enc))

Opening image : /data/datasets/flickr30k_images/1007129816.jpg


(tensor([[[-1.5870, -1.5699, -1.5699,  ..., -1.6555, -1.6727, -1.6727],
          [-1.5870, -1.5699, -1.5699,  ..., -1.6555, -1.6727, -1.6727],
          [-1.5870, -1.5699, -1.5699,  ..., -1.6384, -1.6555, -1.6555],
          ...,
          [-1.5528, -1.5528, -1.5528,  ..., -1.8097, -1.8097, -1.7754],
          [-1.5699, -1.5528, -1.5699,  ..., -1.8268, -1.8268, -1.7925],
          [-1.5699, -1.5528, -1.5870,  ..., -1.9295, -1.9638, -1.9467]],
 
         [[-1.5105, -1.4930, -1.4930,  ..., -1.5630, -1.5805, -1.5805],
          [-1.5105, -1.4930, -1.4930,  ..., -1.5630, -1.5805, -1.5805],
          [-1.5105, -1.4930, -1.4930,  ..., -1.5455, -1.5630, -1.5630],
          ...,
          [-1.4405, -1.4405, -1.4405,  ..., -1.9307, -1.9132, -1.8782],
          [-1.4580, -1.4405, -1.4580,  ..., -1.9132, -1.9132, -1.8782],
          [-1.4580, -1.4405, -1.4755,  ..., -1.9482, -1.9832, -1.9657]],
 
         [[-1.3164, -1.2990, -1.2990,  ..., -1.2990, -1.3164, -1.3164],
          [-1.3164, -1.2990,

In [62]:
dataset[len(dataset.captions['en'])]

Opening image : /data/datasets/flickr30k_images/1007129816.jpg


(tensor([[[-1.5870, -1.5699, -1.5699,  ..., -1.6555, -1.6727, -1.6727],
          [-1.5870, -1.5699, -1.5699,  ..., -1.6555, -1.6727, -1.6727],
          [-1.5870, -1.5699, -1.5699,  ..., -1.6384, -1.6555, -1.6555],
          ...,
          [-1.5528, -1.5528, -1.5528,  ..., -1.8097, -1.8097, -1.7754],
          [-1.5699, -1.5528, -1.5699,  ..., -1.8268, -1.8268, -1.7925],
          [-1.5699, -1.5528, -1.5870,  ..., -1.9295, -1.9638, -1.9467]],
 
         [[-1.5105, -1.4930, -1.4930,  ..., -1.5630, -1.5805, -1.5805],
          [-1.5105, -1.4930, -1.4930,  ..., -1.5630, -1.5805, -1.5805],
          [-1.5105, -1.4930, -1.4930,  ..., -1.5455, -1.5630, -1.5630],
          ...,
          [-1.4405, -1.4405, -1.4405,  ..., -1.9307, -1.9132, -1.8782],
          [-1.4580, -1.4405, -1.4580,  ..., -1.9132, -1.9132, -1.8782],
          [-1.4580, -1.4405, -1.4755,  ..., -1.9482, -1.9832, -1.9657]],
 
         [[-1.3164, -1.2990, -1.2990,  ..., -1.2990, -1.3164, -1.3164],
          [-1.3164, -1.2990,