### Deep Reinforcement Learning-based Image Captioning with Embedding Reward
Pranshu Gupta, Deep Learning @ Georgia Institute of Technology

In [154]:
# As usual, a bit of setup
from __future__ import print_function
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
import nltk

import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn

from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url

from torchsummary import summary

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Working on: ", device)

def rel_error(x, y):
    """ returns relative error """
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Working on:  cuda:0


### Load MS-COCO data
We will use the Microsoft COCO dataset for captioning.

In [140]:
# Load COCO data from disk; this returns a dictionary
# We'll work with dimensionality-reduced features for this notebook, but feel
# free to experiment with the original features by changing the flag below.
data = load_coco_data(pca_features=True)

data["train_captions_lens"] = np.zeros(data["train_captions"].shape[0])
data["val_captions_lens"] = np.zeros(data["val_captions"].shape[0])
for i in range(data["train_captions"].shape[0]):
    data["train_captions_lens"][i] = np.nonzero(data["train_captions"][i] == 2)[0][0] + 1
for i in range(data["val_captions"].shape[0]):
    data["val_captions_lens"][i] = np.nonzero(data["val_captions"][i] == 2)[0][0] + 1


# Print out all the keys and values from the data dictionary
for k, v in data.items():
    if type(v) == np.ndarray:
        print(k, type(v), v.shape, v.dtype)
    else:
        print(k, type(v), len(v))

train_captions <class 'numpy.ndarray'> (400135, 17) int32
train_image_idxs <class 'numpy.ndarray'> (400135,) int32
val_captions <class 'numpy.ndarray'> (195954, 17) int32
val_image_idxs <class 'numpy.ndarray'> (195954,) int32
train_features <class 'numpy.ndarray'> (82783, 512) float32
val_features <class 'numpy.ndarray'> (40504, 512) float32
idx_to_word <class 'list'> 1004
word_to_idx <class 'dict'> 1004
train_urls <class 'numpy.ndarray'> (82783,) <U63
val_urls <class 'numpy.ndarray'> (40504,) <U63
train_captions_lens <class 'numpy.ndarray'> (400135,) float64
val_captions_lens <class 'numpy.ndarray'> (195954,) float64


In [166]:
data["idx_to_word"]

['<NULL>',
 '<START>',
 '<END>',
 '<UNK>',
 'a',
 'on',
 'of',
 'the',
 'in',
 'with',
 'and',
 'is',
 'man',
 'to',
 'sitting',
 'an',
 'two',
 'standing',
 'at',
 'people',
 'are',
 'next',
 'white',
 'woman',
 'table',
 'street',
 'that',
 'holding',
 'person',
 'large',
 'some',
 'it',
 'down',
 'top',
 'group',
 'field',
 'up',
 'small',
 'near',
 'tennis',
 'his',
 'front',
 'black',
 'plate',
 'room',
 'train',
 'riding',
 'dog',
 'red',
 'young',
 'by',
 'cat',
 'water',
 'baseball',
 'has',
 'while',
 'walking',
 'playing',
 'bathroom',
 'sign',
 'blue',
 'food',
 'kitchen',
 'grass',
 'bus',
 'there',
 'parked',
 'pizza',
 'green',
 'side',
 'other',
 'building',
 'looking',
 'snow',
 'bed',
 'ball',
 'beach',
 'couple',
 'three',
 'boy',
 'for',
 'men',
 'flying',
 'toilet',
 'city',
 'road',
 'skateboard',
 'out',
 'her',
 'wearing',
 'player',
 'clock',
 'game',
 'over',
 'several',
 'laying',
 'girl',
 'from',
 'sits',
 'wooden',
 'bear',
 'picture',
 'their',
 'bench',
 

### Caption Evaluation

In [8]:
def BLEU_score(gt_caption, sample_caption):
    """
    gt_caption: string, ground-truth caption
    sample_caption: string, your model's predicted caption
    Returns unigram BLEU score.
    """
    reference = [x for x in gt_caption.split(' ') 
                 if ('<END>' not in x and '<START>' not in x and '<UNK>' not in x)]
    hypothesis = [x for x in sample_caption.split(' ') 
                  if ('<END>' not in x and '<START>' not in x and '<UNK>' not in x)]
    BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights = [1])
    return BLEUscore

def evaluate_model(model):
    """
    model: CaptioningRNN model
    Prints unigram BLEU score averaged over 1000 training and val examples.
    """
    BLEUscores = {}
    for split in ['train', 'val']:
        minibatch = sample_coco_minibatch(data, split=split, batch_size=1000)
        gt_captions, features, urls = minibatch
        gt_captions = decode_captions(gt_captions, data['idx_to_word'])

        sample_captions = model.sample(features)
        sample_captions = decode_captions(sample_captions, data['idx_to_word'])

        total_score = 0.0
        for gt_caption, sample_caption, url in zip(gt_captions, sample_captions, urls):
            total_score += BLEU_score(gt_caption, sample_caption)

        BLEUscores[split] = total_score / len(sample_captions)

    for split in BLEUscores:
        print('Average BLEU score for %s: %f' % (split, BLEUscores[split]))

### Policy Network

In [141]:
class PolicyNetwork(nn.Module):
    def __init__(self, word_to_idx, input_dim=512, wordvec_dim=512, hidden_dim=512, dtype=np.float32):
        super(PolicyNetwork, self).__init__()
        
        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}
        
        vocab_size = len(word_to_idx)
        
        self.null = word_to_idx['<NULL>']
        self.start = word_to_idx.get('<START>', None)
        self.end = word_to_idx.get('<END>', None)
        
        self.caption_embedding = nn.Embedding(vocab_size, wordvec_dim)
        
        self.cnn2linear = nn.Linear(input_dim, hidden_dim)
        self.lstm = nn.LSTM(wordvec_dim, hidden_dim, batch_first=True)
        self.linear2vocab = nn.Linear(hidden_dim, vocab_size)
        self.probs = nn.Softmax(dim=1)
        
    def forward(self, features, captions):
        input_captions = self.caption_embedding(captions)
        hidden_init = self.cnn2linear(features)
        cell_init = torch.zeros_like(hidden_init)
        output, _ = self.lstm(input_captions, (hidden_init, cell_init))
        output = self.linear2vocab(output)
        return output

### Training the Policy Network

In [174]:
policyNetwork = PolicyNetwork(data["word_to_idx"]).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(policyNetwork.parameters(), lr=0.0001)

In [179]:
small_data = load_coco_data(max_train=5000)

In [None]:
batch_size = 50
for epoch in range(10000):
    captions, features, _ = sample_coco_minibatch(small_data, batch_size=batch_size, split='train')
    features = torch.tensor(features, device=device).float().unsqueeze(0)
    captions_in = torch.tensor(captions[:, :-1], device=device).long()
    captions_ou = torch.tensor(captions[:, 1:], device=device).long()
    output = policyNetwork(features, captions_in)
    
    loss = 0
    for i in range(batch_size):
        caplen = np.nonzero(captions[i] == 2)[0][0] + 1
        loss += (caplen/batch_size)*criterion(output[i][:caplen], captions_ou[i][:caplen])
    
    print(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

27.79432487487793
27.849533081054688
27.58513069152832
27.21681022644043
27.63636589050293
25.90688133239746
26.322227478027344
25.373939514160156
27.0172119140625
26.972564697265625
26.703039169311523
27.454120635986328
25.543241500854492
25.674583435058594
26.441699981689453
29.13265037536621
25.3128719329834
27.321428298950195
25.9634952545166
30.09293556213379
27.7325496673584
27.94649314880371
24.76369857788086
26.091564178466797
28.10906982421875
24.65267562866211
26.953224182128906
24.66082000732422
26.02132797241211
27.454591751098633
27.32353973388672
23.738489151000977
28.0628604888916
28.438858032226562
25.434661865234375
26.316926956176758
25.262582778930664
26.811782836914062
26.032957077026367
25.807418823242188
27.193553924560547
25.387460708618164
26.97427749633789
27.020227432250977
25.15505027770996
28.873199462890625
26.605329513549805
25.473217010498047
23.93617820739746
24.721664428710938
24.98944854736328
23.934255599975586
26.70647621154785
27.844886779785156
24.

22.645687103271484
21.97199821472168
22.470836639404297
21.06716537475586
24.048362731933594
23.15460777282715
23.966880798339844
22.162729263305664
23.804338455200195
22.906126022338867
23.846351623535156
22.51207160949707
22.391454696655273
21.20732307434082
21.163246154785156
27.542184829711914
24.25726890563965
21.149694442749023
21.76241683959961
22.83191680908203
23.928850173950195
22.765165328979492
19.811464309692383
21.6955509185791
20.231046676635742
20.649961471557617
21.744834899902344
21.3497314453125
21.884807586669922
22.64280891418457
23.826995849609375
23.46689224243164
22.188884735107422
22.697906494140625
22.175413131713867
23.403059005737305
24.06847381591797
24.333877563476562
21.89696502685547
23.51311683654785
22.63047218322754
23.18398094177246
22.984363555908203
22.55386734008789
22.750141143798828
20.74024200439453
21.84124755859375
22.65003776550293
23.06637954711914
22.198423385620117
20.72493553161621
22.968273162841797
23.209575653076172
22.231731414794922

19.36012077331543
20.380081176757812
20.591659545898438
18.464773178100586
19.584346771240234
20.647756576538086
20.021907806396484
19.590078353881836
21.67384147644043
19.363222122192383
21.397781372070312
21.05316734313965
20.514493942260742
20.207273483276367
19.69734764099121
20.443838119506836
19.31246566772461
18.87904930114746
19.971071243286133
19.172292709350586
19.388654708862305
20.402530670166016
18.756546020507812
19.78031349182129
19.495223999023438
21.937179565429688
19.20139503479004
19.422136306762695
17.47979164123535
19.4014949798584
20.058124542236328
18.577594757080078
20.702543258666992
20.569438934326172
20.193504333496094
19.57501983642578
18.914730072021484
22.296972274780273
21.617692947387695
20.102937698364258
19.556209564208984
21.49709701538086
19.97968292236328
19.642227172851562
18.455245971679688
21.126127243041992
19.562767028808594
19.621843338012695
19.116348266601562
19.86553192138672
19.720985412597656
20.587221145629883
19.567201614379883
18.96635

16.52035903930664
16.756528854370117
17.512603759765625
16.47405242919922
16.82382583618164
17.400257110595703
17.498798370361328
16.167015075683594
18.561479568481445
18.53936195373535
16.96645164489746
16.050161361694336
16.977142333984375
17.69694709777832
16.97746467590332
17.37704849243164
17.269468307495117
17.077611923217773
16.294111251831055
17.67597007751465
16.637929916381836
16.739334106445312
17.607873916625977
17.00547981262207
16.68943214416504
17.795129776000977
17.446752548217773
15.599882125854492
17.901718139648438
17.679229736328125
16.582874298095703
18.56984519958496
17.390270233154297
17.444656372070312
17.502845764160156
17.148841857910156
17.1641845703125
17.189069747924805
18.53083610534668
17.129283905029297
16.333742141723633
16.129941940307617
19.091035842895508
17.662424087524414
17.67222023010254
16.310266494750977
16.795438766479492
16.719974517822266
17.831628799438477
17.912647247314453
16.220932006835938
17.45115852355957
16.337800979614258
18.1019878

13.982637405395508
15.119560241699219
14.367569923400879
14.995641708374023
16.315549850463867
14.590584754943848
14.793878555297852
15.694168090820312
14.548624992370605
15.646646499633789
16.883480072021484
14.826050758361816
15.637417793273926
16.8961124420166
15.781402587890625
13.327326774597168
15.323782920837402
15.60424518585205
14.650141716003418
14.027310371398926
14.382247924804688
15.413471221923828
15.304827690124512
15.875590324401855
15.18087387084961
15.06418514251709
15.058929443359375
13.179152488708496
16.391103744506836
14.680070877075195
15.06024169921875
15.208877563476562
14.330772399902344
14.491387367248535
14.52420711517334
14.994158744812012
14.481712341308594
14.2969388961792
15.974528312683105
14.030848503112793
14.665898323059082
16.41237449645996
13.35745620727539
14.134611129760742
15.97082805633545
16.595582962036133
15.670831680297852
15.759856224060059
15.673979759216309
13.627920150756836
15.131420135498047
14.183320045471191
14.567517280578613
16.05

13.679970741271973
11.871960639953613
12.928191184997559
14.440319061279297
12.423202514648438
13.872703552246094
12.564227104187012
13.36518383026123
13.750725746154785
13.456587791442871
12.764060020446777
13.569356918334961
12.04643440246582
12.204291343688965
12.873522758483887
13.466581344604492
13.788949966430664
13.161148071289062
11.986165046691895
12.434239387512207
13.589369773864746
14.111196517944336
11.450149536132812
12.5341215133667
12.805606842041016
13.482734680175781
14.744122505187988
11.309704780578613
14.079022407531738
12.632120132446289
12.70294189453125
13.655792236328125
13.443853378295898
13.115306854248047
15.128591537475586
12.057496070861816
12.921547889709473
14.18010425567627
14.41728401184082
12.936777114868164
13.624520301818848
11.687973022460938
13.860184669494629
12.767793655395508
14.44638442993164
13.496790885925293
13.675024032592773
13.453093528747559
13.939045906066895
13.475329399108887
13.708029747009277
14.599228858947754
12.442903518676758
1

10.601192474365234
11.283592224121094
11.608179092407227
11.197319030761719
11.126643180847168
12.748832702636719
11.50821304321289
11.08484172821045
11.154123306274414
11.152732849121094
10.241779327392578
11.836747169494629
12.238824844360352
12.335317611694336
11.195847511291504
11.208220481872559
11.038739204406738
12.57010555267334
11.581209182739258
10.279369354248047
11.984508514404297
10.267718315124512
11.239900588989258
11.109984397888184
12.498908996582031
10.280421257019043
10.678519248962402
11.661758422851562
12.168126106262207
10.453099250793457
10.821184158325195
11.435990333557129
11.55788803100586
11.651598930358887
10.575665473937988
11.444045066833496
10.781128883361816
11.938196182250977
11.453885078430176
10.948177337646484
10.909830093383789
12.656447410583496
12.021135330200195
11.322613716125488
10.935972213745117
11.928808212280273
10.89395523071289
11.050812721252441
10.095694541931152
11.484929084777832
12.058274269104004
10.733806610107422
12.18430900573730

9.957572937011719
9.70686149597168
10.074828147888184
10.079229354858398
8.308733940124512
9.920793533325195
10.061873435974121
9.304940223693848
9.524828910827637
10.220166206359863
9.925686836242676
10.082130432128906
9.209993362426758
9.975050926208496
8.960357666015625
9.1747465133667
8.36665153503418
9.272363662719727
8.405049324035645
9.023518562316895
9.589479446411133
10.150930404663086
10.641060829162598
9.44835090637207
10.08356761932373
9.83733081817627
9.94266414642334
9.80445384979248
10.20321273803711
10.13640022277832
8.58151912689209
9.284523963928223
9.629415512084961
9.025362014770508
10.174905776977539
10.177773475646973
10.096685409545898
8.438508033752441
9.383747100830078
10.233128547668457
10.394465446472168
9.439654350280762
9.485889434814453
8.675834655761719
9.967422485351562
9.601719856262207
10.007267951965332
9.55290699005127
9.789595603942871
10.561944961547852
9.968168258666992
9.826604843139648
9.173372268676758
9.913694381713867
10.532221794128418
9.608

9.438272476196289
7.981037139892578
7.8755106925964355
7.721837520599365
8.042274475097656
7.167961597442627
8.318634986877441
8.55938720703125
8.545491218566895
8.64275074005127
8.278905868530273
8.44921875
8.162833213806152
8.485028266906738
7.774521827697754
7.2935075759887695
7.979465961456299
8.144752502441406
8.108306884765625
7.388726234436035
8.301942825317383
8.72939395904541
7.714054107666016
9.22988224029541
8.546542167663574
7.462831497192383
8.095844268798828
8.504453659057617
8.176345825195312
7.9396820068359375
8.835857391357422
7.784792423248291
8.329707145690918
8.834925651550293
8.087481498718262
9.557615280151367
8.032142639160156
7.67951774597168
8.42294979095459
7.629621505737305
8.336410522460938
8.056780815124512
9.371013641357422
8.276070594787598
8.491046905517578
8.40596866607666
8.073868751525879
7.737828254699707
7.803447723388672
8.083803176879883
8.064772605895996
8.447308540344238
7.401754856109619
7.980551242828369
9.06160831451416
7.706363677978516
8.29

6.797475814819336
6.539464950561523
6.731550216674805
7.517207622528076
6.996216773986816
7.183610439300537
7.316282749176025
7.588812828063965
7.0385847091674805
6.838772773742676
6.579854965209961
6.826920509338379
7.496461868286133
7.951927185058594
6.790221691131592
7.421730041503906
6.915374755859375
6.426033020019531
7.908452987670898
6.688291072845459
7.092153549194336
6.889214992523193
8.073698043823242
6.349565029144287
7.977138042449951
7.051569938659668
6.8467912673950195
7.313314914703369
6.799627304077148
6.6270751953125
7.899942398071289
7.158297061920166
7.414275169372559
6.859832286834717
6.871603488922119
6.190738201141357
7.216708183288574
7.087217330932617
7.319047927856445
6.826152324676514
7.237839698791504
7.2679853439331055
6.7745137214660645
6.757894039154053
6.60120964050293
6.654378890991211
7.53122615814209
7.795788288116455
6.577693939208984
6.6534504890441895
6.830066680908203
6.661509990692139
5.942878246307373
6.769339561462402
7.102340221405029
7.1422624

5.646081924438477
6.066290855407715
5.891241073608398
6.390215873718262
5.2119598388671875
5.3352251052856445
5.821944236755371
5.290273666381836
6.032908916473389
5.658734321594238
5.532431602478027
6.122658729553223
6.532634735107422
6.960811138153076
4.9438605308532715
6.320159435272217
7.180797576904297
6.5845417976379395
5.508238315582275
6.417459487915039
5.739360332489014
5.742857456207275
5.77394962310791
5.521547794342041
5.662650108337402
5.462189197540283
5.827879905700684
6.650187015533447
5.622256278991699
5.425473690032959
5.714205741882324
5.1435441970825195
5.303990840911865
5.7022929191589355
6.537487983703613
6.398095607757568
5.813839912414551
5.487791538238525
6.784912586212158
5.205014228820801
6.36372709274292
6.125710964202881
5.928740501403809
5.550624847412109
5.613574028015137
5.830404281616211
6.186218738555908
5.840057373046875
5.927513599395752
5.7294840812683105
6.584293365478516
5.443726539611816
5.644827365875244
5.5316996574401855
5.564546585083008
5.32

### Value Network

In [None]:
class ValueNetworkRNN(nn.Module):
    def __init__(self, input_dim=512, wordvec_dim=512, hidden_dim=512, dtype=np.float32):
        super(ValueNetworkRNN, self).__init__()
        
        self.caption_embedding = nn.Embedding(vocab_size, wordvec_dim)
        self.lstm = nn.LSTM(wordvec_dim, hidden_dim, batch_first=True)
    
    def forward(self, captions):
        input_captions = self.caption_embedding(captions)
        hidden_init = torch.zeros_like(512)
        cell_init = torch.zeros_like((1, 512))
        output, _ = self.lstm(input_captions, (hidden_init, cell_init))
        return output
    
class ValueNetwork(nn.Module):
    def __init__():
        super(ValueNetwork, self).__init__()
        self.linear1 = nn.Linear(1024, 512)
        self.linear2 = nn.Linear(512, 1)
    
    def forward(self, features, vnrnn):
        state = torch.cat((features, vrnn), dim=1)
        output = self.linear1(state)
        output = self.linear2(output)
        return output

### Reward Network

In [None]:
class RewardNetworkRNN(nn.Module):
    def __init__(self, input_dim=512, wordvec_dim=512, hidden_dim=512, dtype=np.float32):
        super(RewardNetworkRNN, self).__init__()
        
        self.caption_embedding = nn.Embedding(vocab_size, wordvec_dim)
        self.gru = nn.GRU(wordvec_dim, hidden_dim, batch_first=True)
    
    def forward(self, captions):
        input_captions = self.caption_embedding(captions)
        hidden_init = torch.zeros_like(512)
        output, _ = self.gru(input_captions, hidden_init)
        return output
    
class RewardNetwork(nn.Module):
    def __init__():
        super(RewardNetwork, self).__init__()
        self.visual_embed = nn.Linear(512, 512)
        self.semantic_embed = nn.Linear(512, 512)
        
    def forward(self, features, captions):
        ve = self.visual_embed(features)
        se = self.semantic_embed(captions)
        return ve, se