In [0]:
import json
import time
import datetime
import numpy as np
import code
import os
import cPickle as pickle
import math
import scipy.io
import matplotlib.pyplot  as plt
import matplotlib.image as mpimg
from IPython import get_ipython
ipy = get_ipython()
if ipy is not None:
    ipy.run_line_magic('matplotlib', 'inline')
from IPython.display import display,Image
#from PIL import Image



from imagernn.solver import Solver
#from imagernn.imagernn_utils import eval_split
from imagernn.generic_batch_generator import GenericBatchGenerator

## Metrics

In [0]:
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider

class COCOEvalCap:
    def __init__(self,images,gts,res):
        self.evalImgs = []
        self.eval = {}
        self.imgToEval = {}
        self.params = {'image_id': images}
        self.gts = gts
        self.res = res

    def evaluate(self):
        imgIds = self.params['image_id']
        #print imgIds
        gts = self.gts
        res = self.res

        # =================================================
        # Set up scorers
        # =================================================
        print 'tokenization...'
        tokenizer = PTBTokenizer()
        gts  = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)

        # =================================================
        # Set up scorers
        # =================================================
        print 'setting up scorers...'
        scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            (Meteor(),"METEOR"),
            (Rouge(), "ROUGE_L"),
            (Cider(), "CIDEr")
        ]

        # =================================================
        # Compute scores
        # =================================================
        eval = {}
        for scorer, method in scorers:
            print 'computing %s score...'%(scorer.method())
            score, scores = scorer.compute_score(gts, res)
            if type(method) == list:
                for sc, scs, m in zip(score, scores, method):
                    self.setEval(sc, m)
                    self.setImgToEvalImgs(scs, imgIds, m)
                    print "%s: %0.3f"%(m, sc)
            else:
                self.setEval(score, method)
                self.setImgToEvalImgs(scores, imgIds, method)
                print "%s: %0.3f"%(method, score)
        self.setEvalImgs()

    def setEval(self, score, method):
        self.eval[method] = score

    def setImgToEvalImgs(self, scores, imgIds, method):
        for imgId, score in zip(imgIds, scores):
            if not imgId in self.imgToEval:
                self.imgToEval[imgId] = {}
                self.imgToEval[imgId]["image_id"] = imgId
            self.imgToEval[imgId][method] = score

    def setEvalImgs(self):
        self.evalImgs = [eval for imgId, eval in self.imgToEval.items()]

def calculate_metrics(rng,datasetGTS,datasetRES,train_set,test_set):
    imgIds = rng
    gts = {}
    res = {}

    imgToAnnsGTS = {ann['image_id']: [] for ann in datasetGTS['annotations']}
    for ann in datasetGTS['annotations']:
        imgToAnnsGTS[ann['image_id']] += [ann]

    imgToAnnsRES = {ann['image_id']: [] for ann in datasetRES['annotations']}
    for ann in datasetRES['annotations']:
        imgToAnnsRES[ann['image_id']] += [ann]
        
        
    if train_set == 0 and test_set == 1:
        for imgId in range(0,1590):
            gts[imgId] = imgToAnnsGTS[imgId]
            res[imgId] = imgToAnnsRES[imgId]
    else:
        for imgId in range(0,5999):
            gts[imgId] = imgToAnnsGTS[imgId]
            res[imgId] = imgToAnnsRES[imgId]

    evalObj = COCOEvalCap(imgIds,gts,res)
    evalObj.evaluate()
    return evalObj.eval

In [0]:
train_set = 0
test_set = 0
parent = os.getcwd()
dataset_path = parent + "/data/flickr8k/dataset.json"
img_path = parent +"/data/flickr8k/Flicker8k_Dataset"

data = json.load(open(dataset_path,'r'))
# load the features for all images
features_path = parent + "/data/flickr8k/vgg_feats.mat"
features_struct = scipy.io.loadmat(features_path)
features = features_struct['feats'] # this is a 4096 x N numpy array of features
features = features.T
D,N = features.shape
print 'features.shape:',features.shape
print 'image id:',features_struct.keys()
BatchGenerator = GenericBatchGenerator()

datasetGTS = {}
datasetGTS['annotations'] = []
datasetRES = {}
datasetRES['annotations'] = []


features.shape: (4096, 8091)
image id: ['__version__', '__header__', 'feats', '__globals__']


## Evaluation metric on model trained on 6000 images

### Metric on unweighted

### For training data (0-6000 images)

In [0]:
checkpoint_path = parent + "/cv/model_checkpoint_flickr8k_7966c3c38f83_baseline_7395.00.p"
print 'loading checkpoint %s' % (checkpoint_path, )
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
checkpoint_params = checkpoint['params']
dataset = checkpoint_params['dataset']
model = checkpoint['model']
misc = {}
misc['wordtoix'] = checkpoint['wordtoix']
ixtoword = checkpoint['ixtoword']

clearValues()

for i in range(0,6000):
    img = {}
    img['feat'] = features[:, i]
    kwparams = { 'beam_size' : 1 }
    Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams)
    img_blob = {}
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = ' '.join([str(ixtoword[ix]) for ix in top_prediction[1] if ix > 0]) 
    gtsobj = {}
    gtsobj['image_id'] = i
    gtsobj['caption'] = data['images'][i]['sentences'][0]['raw']
    datasetGTS['annotations'].append(gtsobj)
    
    resobj = {}
    resobj['image_id'] = i
    resobj['caption'] = candidate
    datasetRES['annotations'].append(resobj)
    
rng = range(6000)
train_set = 1
test_set = 0
print calculate_metrics(rng,datasetGTS,datasetRES,train_set,test_set)
    

loading checkpoint /home/sakthi/image-cap/cv/model_checkpoint_flickr8k_7966c3c38f83_baseline_7395.00.p
tokenization...
setting up scorers...
computing Bleu score...
{'reflen': 66638, 'guess': [67839, 61840, 55841, 49842], 'testlen': 67839, 'correct': [20081, 4986, 1466, 545]}
ratio: 1.01802274978
Bleu_1: 0.296
Bleu_2: 0.154
Bleu_3: 0.086
Bleu_4: 0.051
computing METEOR score...
METEOR: 0.109
computing Rouge score...
ROUGE_L: 0.286
computing CIDEr score...
CIDEr: 0.285
{'CIDEr': 0.2848574737088937, 'Bleu_4': 0.051161424620386864, 'Bleu_3': 0.08557032045771909, 'Bleu_2': 0.1544878559802474, 'Bleu_1': 0.29600966995385697, 'ROUGE_L': 0.28550485692732097, 'METEOR': 0.10916282499921855}


### For test data 1591 images (6500 - 8091)

In [0]:
checkpoint_path = parent+ "/cv/model_checkpoint_flickr8k_7966c3c38f83_baseline_7395.00.p"
print 'loading checkpoint %s' % (checkpoint_path, )
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
checkpoint_params = checkpoint['params']
dataset = checkpoint_params['dataset']
model = checkpoint['model']
misc = {}
misc['wordtoix'] = checkpoint['wordtoix']
ixtoword = checkpoint['ixtoword']


datasetGTS = {}
datasetGTS['annotations'] = []
datasetRES = {}
datasetRES['annotations'] = []

for i in range(6500,8091):
    img = {}
    img['feat'] = features[:, i]
    kwparams = { 'beam_size' : 1 }
    Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams)
    img_blob = {}
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = ' '.join([str(ixtoword[ix]) for ix in top_prediction[1] if ix > 0]) 
    gtsobj = {}
    gtsobj['image_id'] = i-6500
    gtsobj['caption'] = data['images'][i]['sentences'][0]['raw']
    datasetGTS['annotations'].append(gtsobj)
    
    resobj = {}
    resobj['image_id'] = i-6500
    resobj['caption'] = candidate
    datasetRES['annotations'].append(resobj)
    
rng = range(1591)
train_set = 0
test_set = 1
print calculate_metrics(rng,datasetGTS,datasetRES,train_set,test_set)
    

loading checkpoint /home/sakthi/image-cap/cv/model_checkpoint_flickr8k_7966c3c38f83_baseline_7395.00.p
tokenization...
setting up scorers...
computing Bleu score...
{'reflen': 17717, 'guess': [17758, 16168, 14578, 12988], 'testlen': 17758, 'correct': [5310, 1282, 376, 140]}
ratio: 1.00231416154
Bleu_1: 0.299
Bleu_2: 0.154
Bleu_3: 0.085
Bleu_4: 0.051
computing METEOR score...
METEOR: 0.108
computing Rouge score...
ROUGE_L: 0.287
computing CIDEr score...
CIDEr: 0.300
{'CIDEr': 0.3003102906039707, 'Bleu_4': 0.050670123430397895, 'Bleu_3': 0.0848803896147402, 'Bleu_2': 0.15398063172774473, 'Bleu_1': 0.299020159927903, 'ROUGE_L': 0.28745549917612734, 'METEOR': 0.1084317376274938}


### Metrics for weighted

### For training data (0-6000 images)

In [0]:
checkpoint_path = parent + "/cv/model_checkpoint_flickr8k_ba0af9c36d3f_baseline_7395.00.p"
print 'loading checkpoint %s' % (checkpoint_path, )
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
checkpoint_params = checkpoint['params']
dataset = checkpoint_params['dataset']
model = checkpoint['model']
misc = {}
misc['wordtoix'] = checkpoint['wordtoix']
ixtoword = checkpoint['ixtoword']


datasetGTS = {}
datasetGTS['annotations'] = []
datasetRES = {}
datasetRES['annotations'] = []

for i in range(0,6000):
    img = {}
    img['feat'] = features[:, i]
    kwparams = { 'beam_size' : 1 }
    Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams)
    img_blob = {}
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = ' '.join([str(ixtoword[ix]) for ix in top_prediction[1] if ix > 0]) 
    gtsobj = {}
    gtsobj['image_id'] = i
    gtsobj['caption'] = data['images'][i]['sentences'][0]['raw']
    datasetGTS['annotations'].append(gtsobj)
    
    resobj = {}
    resobj['image_id'] = i
    resobj['caption'] = candidate
    datasetRES['annotations'].append(resobj)
    
rng = range(6000)
train_set = 1
test_set = 0
print calculate_metrics(rng,datasetGTS,datasetRES,train_set,test_set)
    

loading checkpoint /home/sakthi/image-cap/cv/model_checkpoint_flickr8k_ba0af9c36d3f_baseline_7395.00.p
features.shape: (4096, 8091)
image id: ['__version__', '__header__', 'feats', '__globals__']
tokenization...
setting up scorers...
computing Bleu score...
{'reflen': 66647, 'guess': [71689, 65689, 59689, 53689], 'testlen': 71689, 'correct': [21061, 5741, 1864, 653]}
ratio: 1.07565231743
Bleu_1: 0.294
Bleu_2: 0.160
Bleu_3: 0.093
Bleu_4: 0.056
computing METEOR score...
METEOR: 0.117
computing Rouge score...
ROUGE_L: 0.296
computing CIDEr score...
CIDEr: 0.333
{'CIDEr': 0.3325606807259991, 'Bleu_4': 0.055882420422674786, 'Bleu_3': 0.09290184143523018, 'Bleu_2': 0.1602362162015724, 'Bleu_1': 0.29378286766449113, 'ROUGE_L': 0.2957046114358279, 'METEOR': 0.11692933739288391}


### For test data 1591 images (6500 - 8091)

In [0]:
checkpoint_path = parent + "/cv/model_checkpoint_flickr8k_ba0af9c36d3f_baseline_7395.00.p"
print 'loading checkpoint %s' % (checkpoint_path, )
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
checkpoint_params = checkpoint['params']
dataset = checkpoint_params['dataset']
model = checkpoint['model']
misc = {}
misc['wordtoix'] = checkpoint['wordtoix']
ixtoword = checkpoint['ixtoword']

datasetGTS = {}
datasetGTS['annotations'] = []
datasetRES = {}
datasetRES['annotations'] = []

for i in range(6500,8091):
    img = {}
    img['feat'] = features[:, i]
    kwparams = { 'beam_size' : 1 }
    Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams)
    img_blob = {}
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = ' '.join([str(ixtoword[ix]) for ix in top_prediction[1] if ix > 0]) 
    gtsobj = {}
    gtsobj['image_id'] = i-6500
    gtsobj['caption'] = data['images'][i]['sentences'][0]['raw']
    datasetGTS['annotations'].append(gtsobj)
    
    resobj = {}
    resobj['image_id'] = i-6500
    resobj['caption'] = candidate
    datasetRES['annotations'].append(resobj)
    
rng = range(1591)
train_set = 0
test_set = 1
print calculate_metrics(rng,datasetGTS,datasetRES,train_set,test_set)

loading checkpoint /home/sakthi/image-cap/cv/model_checkpoint_flickr8k_ba0af9c36d3f_baseline_7395.00.p
features.shape: (4096, 8091)
image id: ['__version__', '__header__', 'feats', '__globals__']


  IFOGf[t,:3*d] = 1.0/(1.0+np.exp(-IFOG[t,:3*d]))


tokenization...
setting up scorers...
computing Bleu score...
{'reflen': 17717, 'guess': [18700, 17110, 15520, 13930], 'testlen': 18700, 'correct': [5488, 1439, 464, 160]}
ratio: 1.05548343399
Bleu_1: 0.293
Bleu_2: 0.157
Bleu_3: 0.090
Bleu_4: 0.054
computing METEOR score...
METEOR: 0.115
computing Rouge score...
ROUGE_L: 0.292
computing CIDEr score...
CIDEr: 0.340
{'CIDEr': 0.3399695427417475, 'Bleu_4': 0.05395660337999134, 'Bleu_3': 0.09036560933736541, 'Bleu_2': 0.15710559081758674, 'Bleu_1': 0.2934759358288613, 'ROUGE_L': 0.292373978229879, 'METEOR': 0.11460105656257771}


## Metric for weighting strategy 2 trained on 6000 images

### For training data (0-6000 images)

In [0]:
checkpoint_path = parent + "/cv/model_checkpoint_flickr8k_2e3bd92f6ccb_baseline_7395.00.p"
print 'loading checkpoint %s' % (checkpoint_path, )
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
checkpoint_params = checkpoint['params']
dataset = checkpoint_params['dataset']
model = checkpoint['model']
misc = {}
misc['wordtoix'] = checkpoint['wordtoix']
ixtoword = checkpoint['ixtoword']

datasetGTS = {}
datasetGTS['annotations'] = []
datasetRES = {}
datasetRES['annotations'] = []

for i in range(0,6000):
    img = {}
    img['feat'] = features[:, i]
    kwparams = { 'beam_size' : 1 }
    Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams)
    img_blob = {}
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = ' '.join([str(ixtoword[ix]) for ix in top_prediction[1] if ix > 0]) 
    gtsobj = {}
    gtsobj['image_id'] = i
    gtsobj['caption'] = data['images'][i]['sentences'][0]['raw']
    datasetGTS['annotations'].append(gtsobj)
    
    resobj = {}
    resobj['image_id'] = i
    resobj['caption'] = candidate
    datasetRES['annotations'].append(resobj)
    
rng = range(6000)
train_set = 1
test_set = 0
print calculate_metrics(rng,datasetGTS,datasetRES,train_set,test_set)
    

loading checkpoint /home/sakthi/image-cap/cv/model_checkpoint_flickr8k_2e3bd92f6ccb_baseline_7395.00.p
features.shape: (4096, 8091)
image id: ['__version__', '__header__', 'feats', '__globals__']
tokenization...
setting up scorers...
computing Bleu score...
{'reflen': 66647, 'guess': [69538, 63538, 57538, 51538], 'testlen': 69538, 'correct': [21380, 5811, 1884, 744]}
ratio: 1.04337779645
Bleu_1: 0.307
Bleu_2: 0.168
Bleu_3: 0.097
Bleu_4: 0.060
computing METEOR score...
METEOR: 0.119
computing Rouge score...
ROUGE_L: 0.298
computing CIDEr score...


### For test data 1591 images (6500 - 8091)

In [0]:
checkpoint_path = parent + "/cv/model_checkpoint_flickr8k_2e3bd92f6ccb_baseline_7395.00.p"
print 'loading checkpoint %s' % (checkpoint_path, )
checkpoint = pickle.load(open(checkpoint_path, 'rb'))
checkpoint_params = checkpoint['params']
dataset = checkpoint_params['dataset']
model = checkpoint['model']
misc = {}
misc['wordtoix'] = checkpoint['wordtoix']
ixtoword = checkpoint['ixtoword']


datasetGTS = {}
datasetGTS['annotations'] = []
datasetRES = {}
datasetRES['annotations'] = []

for i in range(6500,8091):
    img = {}
    img['feat'] = features[:, i]
    kwparams = { 'beam_size' : 1 }
    Ys = BatchGenerator.predict([{'image':img}], model, checkpoint_params, **kwparams)
    img_blob = {}
    top_predictions = Ys[0] # take predictions for the first (and only) image we passed in
    top_prediction = top_predictions[0] # these are sorted with highest on top
    candidate = ' '.join([str(ixtoword[ix]) for ix in top_prediction[1] if ix > 0]) 
    gtsobj = {}
    gtsobj['image_id'] = i-6500
    gtsobj['caption'] = data['images'][i]['sentences'][0]['raw']
    datasetGTS['annotations'].append(gtsobj)
    
    resobj = {}
    resobj['image_id'] = i-6500
    resobj['caption'] = candidate
    datasetRES['annotations'].append(resobj)
    
rng = range(1591)
train_set = 0
test_set = 1
print calculate_metrics(rng,datasetGTS,datasetRES,train_set,test_set)

loading checkpoint /home/sakthi/image-cap/cv/model_checkpoint_flickr8k_2e3bd92f6ccb_baseline_7395.00.p
features.shape: (4096, 8091)
image id: ['__version__', '__header__', 'feats', '__globals__']
tokenization...
setting up scorers...
computing Bleu score...
{'reflen': 17717, 'guess': [18229, 16639, 15049, 13459], 'testlen': 18229, 'correct': [5584, 1422, 452, 167]}
ratio: 1.02889879776
Bleu_1: 0.306
Bleu_2: 0.162
Bleu_3: 0.092
Bleu_4: 0.056
computing METEOR score...
METEOR: 0.115
computing Rouge score...
ROUGE_L: 0.296
computing CIDEr score...
CIDEr: 0.323
{'CIDEr': 0.3225420725729645, 'Bleu_4': 0.05588848697879914, 'Bleu_3': 0.09229862770371737, 'Bleu_2': 0.16179960970427323, 'Bleu_1': 0.30632508640077316, 'ROUGE_L': 0.2957793647161788, 'METEOR': 0.11507930399416517}


In [0]:
import pandas as pd

In [0]:
train_table = pd.DataFrame({"Metrics":["BLEU-1","BLEU-2","BLEU-3","BLEU-4","METEOR","ROGUE_L","CIDEr"],
                         "Vanilla Model":[29.6,15.4,8.6,5.1,10.9,28.6,28.5],
                         "Weighting strategy 1":[29.4,16,9.3,5.6,11.7,29.6,33.3],
                         "Weighting strategy 2":[30.7,16.8,9.7,6,11.9,29.8,34]})

In [0]:
test_table = pd.DataFrame({"Metrics":["BLEU-1","BLEU-2","BLEU-3","BLEU-4","METEOR","ROGUE_L","CIDEr"],
                         "Vanilla Model":[23.5,17.8,11.9,1.82e-3,14.9,26.8,47.1],
                         "Weighting strategy 1":[30.4,24.1,18.3,2.44e-3,17.5,28.4,62.1],
                         "Weighting strategy 2":[27.8,18.6,13.5,2.13e-3,14.2,34.4,144.6]})

In [0]:
print "training result"
train_table

training result


Unnamed: 0,Metrics,Vanilla Model,Weighting strategy 1,Weighting strategy 2
0,BLEU-1,29.6,29.4,30.7
1,BLEU-2,15.4,16.0,16.8
2,BLEU-3,8.6,9.3,9.7
3,BLEU-4,5.1,5.6,6.0
4,METEOR,10.9,11.7,11.9
5,ROGUE_L,28.6,29.6,29.8
6,CIDEr,28.5,33.3,34.0


In [0]:
print "testing result"
test_table

testing result


Unnamed: 0,Metrics,Vanilla Model,Weighting strategy 1,Weighting strategy 2
0,BLEU-1,23.5,30.4,27.8
1,BLEU-2,17.8,24.1,18.6
2,BLEU-3,11.9,18.3,13.5
3,BLEU-4,0.00182,0.00244,0.00213
4,METEOR,14.9,17.5,14.2
5,ROGUE_L,26.8,28.4,34.4
6,CIDEr,47.1,62.1,144.6
