In [1]:
import numpy as np
import copy
import h5py
import matplotlib.pyplot as plt
from utils.coco_utils import load_coco_data
%matplotlib inline

from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Activation, TimeDistributed, Merge
from keras.layers import Embedding, Dropout
from keras.layers import LSTM
from keras.layers.core import RepeatVector
from keras import callbacks, utils, optimizers

Using TensorFlow backend.


### Loading the data to Memory

In [2]:

data = load_coco_data(pca_features=True)

# Print out all the keys and values from the data dictionary
for k, v in data.iteritems():
  if type(v) == np.ndarray:
    print k, type(v), v.shape, v.dtype
  else:
    print k, type(v), len(v)

idx_to_word <type 'list'> 1004
train_captions <type 'numpy.ndarray'> (400135, 17) int32
val_captions <type 'numpy.ndarray'> (195954, 17) int32
train_image_idxs <type 'numpy.ndarray'> (400135,) int32
val_features <type 'numpy.ndarray'> (40504, 512) float32
val_image_idxs <type 'numpy.ndarray'> (195954,) int32
train_features <type 'numpy.ndarray'> (82783, 512) float32
train_urls <type 'numpy.ndarray'> (82783,) |S63
val_urls <type 'numpy.ndarray'> (40504,) |S63
word_to_idx <type 'dict'> 1004


In [3]:
train_captions_mat = data['train_captions']
train_image_idxs = data['train_image_idxs']
train_features = data['train_features']
val_captions_mat = data['val_captions']
val_image_idxs = data['val_image_idxs']
val_features = data['val_features']
idx_to_word = data['idx_to_word']

In [4]:
a=  data['val_image_idxs'][3]
b = data['val_urls'][a]
b

'http://farm8.staticflickr.com/7003/6528937031_10e1ce0960_z.jpg'

In [5]:
print train_captions_mat[0]
for i in train_captions_mat[0]:
    print idx_to_word[i],

[  1   4 142 510  10 667 415 277  58   2   0   0   0   0   0   0   0]
<START> a very clean and well decorated empty bathroom <END> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL>


### Number of images to use for training and validation

In [6]:
limit1 = len(train_image_idxs)
limit2 = len(val_image_idxs)
num_train_img = limit1
num_val_img = limit2
num_train_cap = limit1
num_val_cap = limit2

### Constructng image_train and image_val

In [7]:
image_train = np.zeros((num_train_img, 512))
image_val = np.zeros((num_val_img, 512))
for i in range(num_train_img):
    index = train_image_idxs[i]
    image_train[i] = train_features[index]
    
for i in range(num_val_img):
    index = val_image_idxs[i]
    image_val[i] = val_features[index]
    

In [8]:
print image_train.shape, image_val.shape

(400135, 512) (195954, 512)


### Construct word_train and word_val

In [9]:
word_train = train_captions_mat[:num_train_cap]
word_val = val_captions_mat[:num_val_cap]
print word_train.shape, word_val.shape

(400135, 17) (195954, 17)


### Constructing one-hot ecodings for y_train, y_val

In [10]:
"""
y_train = np.zeros((num_train_img, 17, 1004))
y_val = np.zeros((num_val_img, 17, 1004))

for i in range(num_train_img):
    for j in range(17):
        y_train[i][j] = utils.to_categorical(train_captions_mat[i][j], 1004)
        
for i in range(num_val_img):
    for j in range(17):
        y_val[i][j] = utils.to_categorical(val_captions_mat[i][j], 1004)

print y_train.shape, y_val.shape
"""

'\ny_train = np.zeros((num_train_img, 17, 1004))\ny_val = np.zeros((num_val_img, 17, 1004))\n\nfor i in range(num_train_img):\n    for j in range(17):\n        y_train[i][j] = utils.to_categorical(train_captions_mat[i][j], 1004)\n        \nfor i in range(num_val_img):\n    for j in range(17):\n        y_val[i][j] = utils.to_categorical(val_captions_mat[i][j], 1004)\n\nprint y_train.shape, y_val.shape\n'

### Sparse encoding

In [11]:
print train_captions_mat[0]
print train_captions_mat.shape

[  1   4 142 510  10 667 415 277  58   2   0   0   0   0   0   0   0]
(400135, 17)


In [12]:
y_train = np.zeros((num_train_img, 17, 1))
y_val = np.zeros((num_val_img, 17, 1))
y_train.shape

(400135, 17, 1)

In [13]:
for i in range(num_train_img):
    for j in range(17):
        y_train[i][j] = train_captions_mat[i][j]
        
for i in range(num_val_img):
    for j in range(17):
        y_val[i][j] = val_captions_mat[i][j]

In [14]:
# shifting values one bit left
"""
for i in range(num_train_img):
    for j in range(16):
        y_train[i][j] = y_train[i][j+1]
    
for i in range(num_val_img):
    for j in range(16):
        y_val[i][j] = y_val[i][j+1]
""" 

'\nfor i in range(num_train_img):\n    for j in range(16):\n        y_train[i][j] = y_train[i][j+1]\n    \nfor i in range(num_val_img):\n    for j in range(16):\n        y_val[i][j] = y_val[i][j+1]\n'

In [15]:
print y_train[1000]
print train_captions_mat[1000]

[[   1.]
 [   4.]
 [ 142.]
 [  29.]
 [  58.]
 [   9.]
 [   3.]
 [ 257.]
 [  36.]
 [   2.]
 [   0.]
 [   0.]
 [   0.]
 [   0.]
 [   0.]
 [   0.]
 [   0.]]
[  1   4 142  29  58   9   3 257  36   2   0   0   0   0   0   0   0]


In [16]:
print y_train.shape, y_val.shape

(400135, 17, 1) (195954, 17, 1)


In [17]:
print len(y_train[0][0])

1


### Feeding input into the model to fit sentence by sentence in a LOOP

In [18]:
# create the model
embedding_vector_length = 512
vocabulary_size = 1004
max_caption_length = 17
batch_size = 128             # one parameter update per sentence
image_vector_length = 512

# image vector
image_model = Sequential()
image_model.add(Dense(image_vector_length, input_dim=512))
image_model.add(RepeatVector(max_caption_length))

# caption vector
word_model = Sequential()
word_model.add(Embedding(input_dim=vocabulary_size, 
                    output_dim=embedding_vector_length, input_length=17))

# Merge models

model = Sequential()
model.add(Merge([image_model, word_model], mode='concat'))  # merging layers

model.add(LSTM(512, input_shape=(17,512), unroll=True, return_sequences=True, implementation=2, stateful=False))
model.add(Dropout(rate=0.5))
model.add(TimeDistributed(Dense(vocabulary_size)))
model.add(Activation('softmax'))

sgd = optimizers.SGD(lr=0.01, momentum=0.0, nesterov=False)
model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

model.summary()
# path to checkpoints
filepath = './checkpoints/weights-{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = callbacks.ModelCheckpoint(filepath, monitor='val_acc', save_best_only=True, verbose=0, mode='max')
board = callbacks.TensorBoard(log_dir='./tensorboard_logs', histogram_freq=5, write_graph=True, write_images=True)


model_history = model.fit([image_train, word_train], y_train, verbose=2,
                              batch_size=batch_size, shuffle=True, epochs=100)

score = model.evaluate([image_val, word_val], y_val, verbose=2, batch_size=batch_size)



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_1 (Merge)              (None, 17, 1024)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 17, 512)           3147776   
_________________________________________________________________
dropout_1 (Dropout)          (None, 17, 512)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 17, 1004)          515052    
_________________________________________________________________
activation_1 (Activation)    (None, 17, 1004)          0         
Total params: 4,439,532.0
Trainable params: 4,439,532.0
Non-trainable params: 0.0
_________________________________________________________________
Epoch 1/100
129s - loss: 4.3500 - acc: 0.3005
Epoch 2/100
128s - loss: 3.6040 - acc: 0.3718
Epoch 3/100
128s - loss: 3.1725 - 

In [19]:
print score

[0.44071341010851289, 0.92489414288479654]


In [88]:
prediction = model.predict_classes([image_val[:29], word_val[:29]], verbose=2)[27]
prediction

array([  1,   4,  42,  51,  11, 160,   4,  22,  83,   2,   0,   0,   0,
         0,   0,   0,   0])

In [89]:
actual = val_captions_mat[27]

In [90]:
def arr_to_sent(predicted, actual):
    s = []
    m = data['idx_to_word']
    for i in predicted:
        s.append(m[i])
    print " ".join(s)
    s = []
    for i in actual:
        s.append(m[i])
    print " ".join(s)

In [91]:
arr_to_sent(prediction, actual)

<START> a black cat is inside a white toilet <END> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL>
<START> a black cat is inside a white toilet <END> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL> <NULL>


In [21]:
model.save('my_model.h5')