In [1]:
#!git clone https://github.com/fchollet/keras.git && cd keras && python setup.py install --user

In [2]:
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, Adadelta, Adagrad
from keras.utils import np_utils, generic_utils
from keras.callbacks import EarlyStopping
from keras.layers.advanced_activations import PReLU, LeakyReLU
from keras.layers import Embedding,GRU,TimeDistributed,RepeatVector, LSTM, concatenate , Input, Reshape
from keras.preprocessing.text import one_hot
from keras.preprocessing import sequence
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import backend as K 
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.preprocessing import image
from keras.applications.inception_resnet_v2 import preprocess_input
from keras.models import Model
import numpy as np
from numpy import argmax
import os

Using TensorFlow backend.
  return f(*args, **kwds)


In [7]:
# Load every screenshot in a deterministic (sorted) order. os.listdir() returns
# entries in arbitrary order, and the i-th image feature is later matched to the
# i-th HTML document purely by position, so both directory listings have to be
# ordered the same way for the pairing to be correct.
images = []
for filename in sorted(os.listdir('images/')):
    print(filename)
    # Resize each image to 299x299 on load so all arrays stack into one batch.
    images.append(img_to_array(load_img('images/'+filename, target_size=(299, 299))))
images = np.array(images, dtype=float)
# Apply the pixel scaling that ships with the InceptionResNetV2 application.
images = preprocess_input(images)

# ImageNet-pretrained InceptionResNetV2 without its classification head,
# used here as a fixed feature extractor.
IR2 = InceptionResNetV2(weights='imagenet', include_top=False)
features = IR2.predict(images)
print(features.shape)

86.jpg
87.jpg
88.jpg
89.jpg
90.jpg
(5, 8, 8, 1536)


In [6]:
# Length of the token window fed to the language branch, and the vocabulary
# cap handed to the tokenizer.
max_caption_len = 50
num_words = 435
# Split on single spaces only: no character filtering and no lower-casing.
tokenizer = Tokenizer(
    num_words=num_words,
    filters='',
    split=" ",
    lower=False,
)

def load_doc(filename):
    """Return the entire text content of the file at ``filename``.

    The file is opened in text mode with the platform default encoding.
    A ``with`` block guarantees the handle is closed even if reading fails.

    Args:
        filename: Path of the file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filename, 'r') as file:
        return file.read()

# Read the HTML documents in sorted order. os.listdir() gives no ordering
# guarantee, and features[img_no] below is matched to the img_no-th document
# purely by position, so the listing must be deterministic.
html_docs = [load_doc('html/'+filename) for filename in sorted(os.listdir('html/'))]

tokenizer.fit_on_texts(html_docs)

# word_index ids start at 1, so one extra slot is needed for index 0 (padding).
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(html_docs)
max_length = max(len(s) for s in sequences)

# Build one training sample per token position: the preceding tokens (limited
# to the last max_caption_len, left-padded with zeros) predict the next token.
X, y, image_data = list(), list(), list()
for img_no, seq in enumerate(sequences):
    for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        # Pad/truncate straight to the model's window so every sample has
        # exactly max_caption_len entries, even when all documents are short.
        in_seq = pad_sequences([in_seq], maxlen=max_caption_len)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        image_data.append(features[img_no])
        X.append(in_seq)
        y.append(out_seq)

X, y, image_data = np.array(X), np.array(y), np.array(image_data)

86.html
87.html
88.html
89.html
90.html


In [5]:
# Sanity check: shapes of the three training arrays, then the raw image features.
for _arr in (image_data, X, y, features):
    print(_arr.shape)

(2305, 8, 8, 1536)
(2305, 50)
(2305, 436)
(5, 8, 8, 1536)


In [6]:
# Image branch. Each sample in image_data is an 8x8x1536 feature map (see
# features.shape), so the input must accept that tensor and flatten it before
# the dense projection; a flat Input(shape=(1536,)) does not match the data.
image_features = Input(shape=(8, 8, 1536,))
image_flat = Flatten()(image_features)
image_flat = Dense(128, activation='relu')(image_flat)
# Repeat the image vector once per time step so it can be concatenated with
# the per-token language features.
ir2_out = RepeatVector(max_caption_len)(image_flat)

# Language branch: embed the token window and encode it with two stacked LSTMs.
language_input = Input(shape=(max_caption_len,))
language_model = Embedding(vocab_size, 200, input_length=max_caption_len)(language_input)
language_model = LSTM(256, return_sequences=True)(language_model)
language_model = LSTM(256, return_sequences=True)(language_model)
language_model = TimeDistributed(Dense(128, activation='relu'))(language_model)

# Decoder: merge both branches per time step and predict the next token.
decoder = concatenate([ir2_out, language_model])
decoder = LSTM(512, return_sequences=False)(decoder)
print(decoder)
decoder_output = Dense(vocab_size, activation='softmax')(decoder)

model = Model(inputs=[image_features, language_input], outputs=decoder_output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

(?, 50, 32)
Tensor("lstm_2/TensorArrayReadV3:0", shape=(?, 512), dtype=float32)


In [7]:
# Show the layer-by-layer architecture, then train for a single epoch,
# one sample per batch and without shuffling.
model.summary()
model.fit(
    x=[image_data, X],
    y=y,
    batch_size=1,
    shuffle=False,
    epochs=1,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 8, 8, 1536)   0                                            
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 98304)        0           input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          12583040    flatten_1[0][0]                  
__________________________________________________________________________________________________
embedding_

<keras.callbacks.History at 0x129533588>

In [None]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [8]:
# reverse lookup: integer id -> word
def word_for_id(integer, tokenizer):
    """Return the word whose id in ``tokenizer.word_index`` equals ``integer``.

    The mapping is scanned in iteration order and the first match wins.
    Returns ``None`` when no word carries that id.
    """
    matches = (
        token
        for token, token_id in tokenizer.word_index.items()
        if token_id == integer
    )
    return next(matches, None)

<article


In [21]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length, max_steps=900):
    """Greedily generate a token sequence for one image.

    Starting from the seed token 'START', the model is repeatedly asked for
    the most probable next token given the image and the text so far, until
    'END' is produced, the predicted id maps to no word, or ``max_steps``
    tokens have been generated. Each generated token is also printed as it
    is produced.

    Args:
        model: Model whose ``predict([photo, sequence])`` returns next-token
            probabilities.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        photo: Image features with a leading batch axis of size 1.
        max_length: Length of the token window fed to the model.
        max_steps: Upper bound on the number of generated tokens
            (defaults to 900).

    Returns:
        The generated text, beginning with 'START'.
    """
    # seed the generation process
    in_text = 'START'
    for _ in range(max_steps):
        # integer encode everything generated so far
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # Keep only the most recent max_length tokens and left-pad shorter
        # inputs; the window follows the max_length argument rather than a
        # hard-coded size.
        sequence = pad_sequences([sequence], maxlen=max_length, truncating='pre')
        # predict next word
        yhat = model.predict([photo, sequence], verbose=0)
        # pick the most probable token id
        yhat = argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if the id has no word (e.g. the padding index 0)
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        print(' ' + word, end='')
        # stop if we predict the end of the sequence
        if word == 'END':
            break
    return in_text
# Generate markup for the first image; the features get a leading batch axis of size 1.
first_photo = np.array([features[0]])
print(generate_desc(model, tokenizer, first_photo, 50))

START <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer <footer
