In [3]:
from os import listdir
from numpy import array
from keras_preprocessing.text import Tokenizer, one_hot
# from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.utils import to_categorical
from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, concatenate , Input, Reshape, Dense, Flatten
# from keras.preprocessing.image import array_to_img, img_to_array, load_img
from tensorflow.keras.utils import array_to_img, img_to_array, load_img
from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input
import numpy as np

In [4]:
# Load the images and preprocess them for inception-resnet
images = []
all_filenames = listdir('images/')
all_filenames.sort()
for filename in all_filenames:
    images.append(img_to_array(load_img('images/'+filename, target_size=(299, 299))))
images = np.array(images, dtype=float)
images = preprocess_input(images)

# Run the images through inception-resnet and extract the features without the classification layer
IR2 = InceptionResNetV2(weights='imagenet', include_top=False)
features = IR2.predict(images)
print('111',features)

111 [[[[5.10737188e-02 2.23518819e-01 0.00000000e+00 ... 0.00000000e+00
    0.00000000e+00 0.00000000e+00]
   [1.44603893e-01 4.93346244e-01 0.00000000e+00 ... 0.00000000e+00
    1.86003298e-01 0.00000000e+00]
   [0.00000000e+00 3.05079818e-01 0.00000000e+00 ... 0.00000000e+00
    0.00000000e+00 2.14182124e-01]
   ...
   [0.00000000e+00 1.07090771e-01 3.82578701e-01 ... 0.00000000e+00
    1.76228851e-01 0.00000000e+00]
   [0.00000000e+00 2.50038326e-01 2.14602470e-01 ... 0.00000000e+00
    3.02572921e-02 0.00000000e+00]
   [0.00000000e+00 2.36402765e-01 3.74622494e-02 ... 0.00000000e+00
    0.00000000e+00 0.00000000e+00]]

  [[2.99069613e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
    0.00000000e+00 1.30672514e-01]
   [1.09122491e+00 2.51498520e-01 0.00000000e+00 ... 0.00000000e+00
    0.00000000e+00 0.00000000e+00]
   [1.75337389e-01 2.40811571e-01 4.00699377e-01 ... 0.00000000e+00
    0.00000000e+00 1.12656623e-01]
   ...
   [2.95595139e-01 6.73654079e-01 0.00000000e+00 ... 

In [5]:
# We will cap each input sequence to 100 tokens
max_caption_len = 100
# Initialize the function that will create our vocabulary 
tokenizer = Tokenizer(filters='', split=" ", lower=False)

# Read a document and return a string
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# Load all the HTML files
X = []
all_filenames = listdir('html/')
all_filenames.sort()
for filename in all_filenames:
    X.append(load_doc('html/'+filename))

# Create the vocabulary from the html files
tokenizer.fit_on_texts(X)

# Add +1 to leave space for empty words
vocab_size = len(tokenizer.word_index) + 1
# Translate each word in text file to the matching vocabulary index
sequences = tokenizer.texts_to_sequences(X)
# The longest HTML file
max_length = max(len(s) for s in sequences)

# Intialize our final input to the model
X, y, image_data = list(), list(), list()
for img_no, seq in enumerate(sequences):
    for i in range(1, len(seq)):
        # Add the entire sequence to the input and only keep the next word for the output
        in_seq, out_seq = seq[:i], seq[i]
        # If the sentence is shorter than max_length, fill it up with empty words
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        # Map the output to one-hot encoding
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # Add and image corresponding to the HTML file
        image_data.append(features[img_no])
        # Cut the input sentence to 100 tokens, and add it to the input data
        X.append(in_seq[-100:])
        y.append(out_seq)

X, y, image_data = np.array(X), np.array(y), np.array(image_data)

print('X',X)
print('y',y)
print('image_data',image_data)


X [[  0   0   0 ...   0   0 205]
 [  0   0   0 ...   0 205  90]
 [  0   0   0 ... 205  90  91]
 ...
 [ 15  22  32 ...  66  18  67]
 [ 22  32  17 ...  18  67 160]
 [ 32  17   5 ...  67 160 161]]
y [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
image_data [[[[0.05107372 0.22351882 0.         ... 0.         0.
    0.        ]
   [0.1446039  0.49334624 0.         ... 0.         0.1860033
    0.        ]
   [0.         0.30507982 0.         ... 0.         0.
    0.21418212]
   ...
   [0.         0.10709077 0.3825787  ... 0.         0.17622885
    0.        ]
   [0.         0.25003833 0.21460247 ... 0.         0.03025729
    0.        ]
   [0.         0.23640276 0.03746225 ... 0.         0.
    0.        ]]

  [[0.2990696  0.         0.         ... 0.         0.
    0.13067251]
   [1.0912249  0.25149852 0.         ... 0.         0.
    0.        ]
   [0.17533739 0.24081157 0.40069938 

In [6]:
# Create the encoder
image_features = Input(shape=(8, 8, 1536,))
image_flat = Flatten()(image_features)
image_flat = Dense(128, activation='relu')(image_flat)
ir2_out = RepeatVector(max_caption_len)(image_flat)

language_input = Input(shape=(max_caption_len,))
language_model = Embedding(vocab_size, 200, input_length=max_caption_len)(language_input)
language_model = LSTM(256, return_sequences=True)(language_model)
language_model = LSTM(256, return_sequences=True)(language_model)
language_model = TimeDistributed(Dense(128, activation='relu'))(language_model)

# Create the decoder
decoder = concatenate([ir2_out, language_model])
decoder = LSTM(512, return_sequences=False)(decoder)
decoder_output = Dense(vocab_size, activation='softmax')(decoder)

# Compile the model
model = Model(inputs=[image_features, language_input], outputs=decoder_output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [7]:
# Train the neural network
model.fit([image_data, X], y, batch_size=64, shuffle=False, epochs=125)

Epoch 1/125


2022-11-16 03:07:00.711383: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 732168192 exceeds 10% of free system memory.


Epoch 2/125
Epoch 3/125
Epoch 4/125
Epoch 5/125
Epoch 6/125
Epoch 7/125
Epoch 8/125
Epoch 9/125
Epoch 10/125
Epoch 11/125
Epoch 12/125
Epoch 13/125
Epoch 14/125
Epoch 15/125
Epoch 16/125
Epoch 17/125
Epoch 18/125
Epoch 19/125
Epoch 20/125
Epoch 21/125
Epoch 22/125
Epoch 23/125
Epoch 24/125
Epoch 25/125
Epoch 26/125
Epoch 27/125
Epoch 28/125
Epoch 29/125
Epoch 30/125
Epoch 31/125
Epoch 32/125
Epoch 33/125
Epoch 34/125
Epoch 35/125
Epoch 36/125
Epoch 37/125
Epoch 38/125
Epoch 39/125
Epoch 40/125
Epoch 41/125
Epoch 42/125
Epoch 43/125
Epoch 44/125
Epoch 45/125
Epoch 46/125
Epoch 47/125
Epoch 48/125
Epoch 49/125
Epoch 50/125
Epoch 51/125
Epoch 52/125
Epoch 53/125
Epoch 54/125
Epoch 55/125
Epoch 56/125
Epoch 57/125
Epoch 58/125
Epoch 59/125
Epoch 60/125
Epoch 61/125
Epoch 62/125
Epoch 63/125
Epoch 64/125
Epoch 65/125
Epoch 66/125
Epoch 67/125
Epoch 68/125
Epoch 69/125

In [33]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [34]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
    # seed the generation process
    in_text = 'START'
    # iterate over the whole length of the sequence
    for i in range(900):
        # integer encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0][-100:]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo,sequence], verbose=0)
        # convert probability to integer
        yhat = np.argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # Print the prediction
        print(' ' + word, end='')
        # stop if we predict the end of the sequence
        if word == 'END':
            break
    return

In [35]:
# Load and image, preprocess it for IR2, extract features and generate the HTML
test_image = img_to_array(load_img('images/86.jpg', target_size=(299, 299)))
test_image = np.array(test_image, dtype=float)
test_image = preprocess_input(test_image)
test_features = IR2.predict(np.array([test_image]))
html = generate_desc(model, tokenizer, np.array(test_features), 100)

 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->
 -->


KeyboardInterrupt: 

In [36]:
from IPython.display import display, HTML
display(HTML(html[5:-5]))

TypeError: 'NoneType' object is not subscriptable