In [16]:
import os
import string
import glob
from tensorflow.keras.applications import MobileNet
import tensorflow.keras.applications.mobilenet  

from tensorflow.keras.applications.inception_v3 import InceptionV3
import tensorflow.keras.applications.inception_v3

import json
import random
import collections
from tqdm import tqdm
import tensorflow.keras.preprocessing.image
import pickle
from time import time
import numpy as np
from PIL import Image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (LSTM, Embedding, 
    TimeDistributed, Dense, RepeatVector, 
    Activation, Flatten, Reshape, concatenate,  
    Dropout, BatchNormalization)
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers

from tensorflow.keras.models import Model

from tensorflow.keras.layers import add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

START = "<start>"
STOP = "<end>"
EPOCHS = 10
USE_INCEPTION = True

In [40]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return f"{h}:{m:>02}:{s:>05.2f}"

In [17]:
# Download caption annotation files
annotation_folder = '/annotations/'
if not os.path.exists(os.path.abspath('.') + annotation_folder):
  annotation_zip = tf.keras.utils.get_file('captions.zip',
                                          cache_subdir=os.path.abspath('.'),
                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
                                          extract = True)
  annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'
  os.remove(annotation_zip)
else:
  annotation_file = './annotations/captions_train2014.json' 
# Download image files
image_folder = '/train2014/'
if not os.path.exists(os.path.abspath('.') + image_folder):
  image_zip = tf.keras.utils.get_file('train2014.zip',
                                      cache_subdir=os.path.abspath('.'),
                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',
                                      extract = True)
  PATH = os.path.dirname(image_zip) + image_folder
  os.remove(image_zip)
else:
  PATH = os.path.abspath('.') + image_folder

In [18]:
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

In [19]:
# Group all captions together having the same image ID.
image_path_to_caption = collections.defaultdict(list)
for val in annotations['annotations']:
  caption = f"<start> {val['caption']} <end>"
  image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (val['image_id'])
  image_path_to_caption[image_path].append(caption)

In [20]:
#image_path_to_caption

defaultdict(list,
            {'/mnt/e/MachineLearning/SonnetGeneration/train2014/COCO_train2014_000000318556.jpg': ['<start> A very clean and well decorated empty bathroom <end>',
              '<start> A blue and white bathroom with butterfly themed wall tiles. <end>',
              '<start> A bathroom with a border of butterflies and blue paint on the walls above it. <end>',
              '<start> An angled view of a beautifully decorated bathroom. <end>',
              '<start> A clock that blends in with the wall hangs in a bathroom.  <end>'],
             '/mnt/e/MachineLearning/SonnetGeneration/train2014/COCO_train2014_000000116100.jpg': ['<start> A panoramic view of a kitchen and all of its appliances. <end>',
              '<start> A panoramic photo of a kitchen and dining room <end>',
              '<start> A wide angle view of the kitchen work area <end>',
              '<start> multiple photos of a brown and white kitchen.  <end>',
              '<start> A kitchen that has 

In [21]:
image_paths = list(image_path_to_caption.keys())
random.shuffle(image_paths)

# Select the first 6000 image_paths from the shuffled set.
# Approximately each image id has 5 captions associated with it, so that will 
# lead to 30,000 examples.
train_image_paths = image_paths[:6000]
print(len(train_image_paths))

6000


In [74]:
#train_descriptions = image_path_to_caption
train_descriptions={}
for path in train_image_paths:
  train_descriptions[path] = image_path_to_caption[path]

In [75]:
#print(train_descriptions)



In [28]:
WIDTH = 299
HEIGHT = 299
OUTPUT_DIM = 2048

In [32]:
encode_model = InceptionV3(weights='imagenet')
encode_model = Model(encode_model.input, encode_model.layers[-2].output)
preprocess_input = tensorflow.keras.applications.inception_v3.preprocess_input

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [34]:
encode_model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
conv2d_94 (Conv2D)              (None, 149, 149, 32) 864         input_2[0][0]                    
__________________________________________________________________________________________________
batch_normalization_94 (BatchNo (None, 149, 149, 32) 96          conv2d_94[0][0]                  
__________________________________________________________________________________________________
activation_94 (Activation)      (None, 149, 149, 32) 0           batch_normalization_94[0][0]     
_______________________________________________________________________________________

In [35]:
def encodeImage(img):
  # Resize all images to a standard size (specified bythe image 
  # encoding network)
  img = img.resize((WIDTH, HEIGHT), Image.ANTIALIAS)
  # Convert a PIL image to a numpy array
  x = tensorflow.keras.preprocessing.image.img_to_array(img)
  # Expand to 2D array
  x = np.expand_dims(x, axis=0)
  # Perform any preprocessing needed by InceptionV3 or others
  x = preprocess_input(x)
  # Call InceptionV3 (or other) to extract the smaller feature set for 
  # the image.
  x = encode_model.predict(x) # Get the encoding vector for the image
  # Shape to correct form to be accepted by LSTM captioning network.
  x = np.reshape(x, OUTPUT_DIM )
  return x

In [81]:
root_captioning = "./finalprogpickle/"
train_path = os.path.join(root_captioning,"data",f'train{OUTPUT_DIM}.pkl')
if not os.path.exists(train_path):
  start = time()
  encoding_train = {}
  for path in train_image_paths:
    img = tensorflow.keras.preprocessing.image.load_img(path,target_size=(HEIGHT, WIDTH))
    encoding_train[path] = encodeImage(img)
  with open(train_path, "wb") as fp:
    pickle.dump(encoding_train, fp)
  print(f"\nGenerating training set took: {hms_string(time()-start)}")
else:
  with open(train_path, "rb") as fp:
    encoding_train = pickle.load(fp)


Generating training set took: 0:12:20.11


In [82]:
# with open(train_path, "wb") as fp:
#   pickle.dump(encoding_train, fp)
# print(f"\nGenerating training set took: {hms_string(time()-start)}")

In [83]:
print(encoding_train)

{'/mnt/e/MachineLearning/SonnetGeneration/train2014/COCO_train2014_000000292910.jpg': array([0.05595997, 0.4499844 , 0.20447984, ..., 0.16337116, 0.83789295,
       0.342545  ], dtype=float32), '/mnt/e/MachineLearning/SonnetGeneration/train2014/COCO_train2014_000000388279.jpg': array([0.12353189, 0.31529617, 0.10361968, ..., 1.0970889 , 0.7596234 ,
       0.17887527], dtype=float32), '/mnt/e/MachineLearning/SonnetGeneration/train2014/COCO_train2014_000000411400.jpg': array([0.36626947, 0.07549714, 0.28849462, ..., 0.        , 0.18871488,
       0.6844986 ], dtype=float32), '/mnt/e/MachineLearning/SonnetGeneration/train2014/COCO_train2014_000000221748.jpg': array([0.48884726, 0.30591616, 0.13687374, ..., 0.50431293, 0.12024393,
       0.35808653], dtype=float32), '/mnt/e/MachineLearning/SonnetGeneration/train2014/COCO_train2014_000000002697.jpg': array([0.13489927, 0.70065576, 0.77022284, ..., 0.09757603, 0.26685235,
       0.11128788], dtype=float32), '/mnt/e/MachineLearning/SonnetGene

In [84]:
all_train_captions = []
for key, val in train_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)
len(all_train_captions)

30014

In [85]:
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1

vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print('preprocessed words %d ==> %d' % (len(word_counts), len(vocab)))

preprocessed words 12519 ==> 2038


In [86]:
idxtoword = {}
wordtoidx = {}

ix = 1
for w in vocab:
    wordtoidx[w] = ix
    idxtoword[ix] = w
    ix += 1
    
vocab_size = len(idxtoword) + 1 
vocab_size

2039

In [87]:
#max_length = 47
max_length =0
for path in train_image_paths:
  cap = image_path_to_caption[path]
  for c in cap:
    tok = c.split()
    max_length = max(max_length,len(tok))
print(max_length)

47


In [88]:
def data_generator(descriptions, photos, wordtoidx, \
                   max_length, num_photos_per_batch):
  # x1 - Training data for photos
  # x2 - The caption that goes with each photo
  # y - The predicted rest of the caption
  x1, x2, y = [], [], []
  n=0
  while True:
    for key, desc_list in descriptions.items():
      n+=1
      photo = photos[key]
      # Each photo has 5 descriptions
      for desc in desc_list:
        # Convert each word into a list of sequences.
        seq = [wordtoidx[word] for word in desc.split(' ') \
               if word in wordtoidx]
        # Generate a training case for every possible sequence and outcome
        for i in range(1, len(seq)):
          in_seq, out_seq = seq[:i], seq[i]
          in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
          out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
          x1.append(photo)
          x2.append(in_seq)
          y.append(out_seq)
      if n==num_photos_per_batch:
        yield ([np.array(x1), np.array(x2)], np.array(y))
        x1, x2, y = [], [], []
        n=0

In [89]:
glove_dir = "./"
embeddings_index = {} 
f = open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8")

for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print(f'Found {len(embeddings_index)} word vectors.')

400000it [00:24, 16580.24it/s]

Found 400000 word vectors.





In [90]:
embedding_dim = 200

# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in wordtoidx.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

In [91]:
embedding_matrix.shape

(2039, 200)

In [92]:
inputs1 = Input(shape=(OUTPUT_DIM,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
caption_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [93]:
embedding_dim

200

In [94]:
caption_model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 47)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 47, 200)      407800      input_6[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 2048)         0           input_5[0][0]                    
_______________________________________________________________________________________

In [95]:
caption_model.layers[2].set_weights([embedding_matrix])
caption_model.layers[2].trainable = False
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [96]:
number_pics_per_bath = 3
steps = len(train_descriptions)//number_pics_per_bath

In [102]:
steps

1000

In [97]:
model_path = "./caption_model/caption-model.hdf5"
if not os.path.exists(model_path):
  for i in tqdm(range(EPOCHS*2)):
      generator = data_generator(train_descriptions, encoding_train, 
                    wordtoidx, max_length, number_pics_per_bath)
      #caption_model.fit_generator(generator, epochs=1,
      #              steps_per_epoch=steps, verbose=1)
      caption_model.fit(generator, epochs=1,
                    steps_per_epoch=steps, verbose=1)
  caption_model.optimizer.lr = 1e-4
  number_pics_per_bath = 6
  steps = len(train_descriptions)//number_pics_per_bath

  for i in range(EPOCHS):
      generator = data_generator(train_descriptions, encoding_train, 
                    wordtoidx, max_length, number_pics_per_bath)
      caption_model.fit_generator(generator, epochs=1, 
                            steps_per_epoch=steps, verbose=1)  
  caption_model.save_weights(model_path)
  print(f"\Training took: {hms_string(time()-start)}")
else:
  caption_model.load_weights(model_path)

  0%|          | 0/20 [00:00<?, ?it/s]



  5%|▌         | 1/20 [09:31<3:00:56, 571.42s/it]



 10%|█         | 2/20 [19:06<2:51:47, 572.66s/it]



 15%|█▌        | 3/20 [28:31<2:41:31, 570.12s/it]



 20%|██        | 4/20 [37:51<2:31:16, 567.29s/it]



 25%|██▌       | 5/20 [47:17<2:21:39, 566.66s/it]



 30%|███       | 6/20 [56:41<2:12:05, 566.09s/it]



 35%|███▌      | 7/20 [1:06:09<2:02:45, 566.55s/it]



 40%|████      | 8/20 [1:15:35<1:53:16, 566.40s/it]



 45%|████▌     | 9/20 [1:25:37<1:45:48, 577.09s/it]



 50%|█████     | 10/20 [1:37:22<1:42:33, 615.33s/it]



 55%|█████▌    | 11/20 [1:46:44<1:29:55, 599.55s/it]



 60%|██████    | 12/20 [1:56:04<1:18:20, 587.56s/it]



 65%|██████▌   | 13/20 [2:06:58<1:10:51, 607.40s/it]



 70%|███████   | 14/20 [2:20:29<1:06:52, 668.71s/it]



 75%|███████▌  | 15/20 [2:31:06<54:55, 659.08s/it]  



 80%|████████  | 16/20 [2:40:36<42:09, 632.26s/it]



 85%|████████▌ | 17/20 [2:50:03<30:38, 612.86s/it]



 90%|█████████ | 18/20 [2:59:32<19:59, 599.74s/it]



 95%|█████████▌| 19/20 [3:08:59<09:49, 589.91s/it]



100%|██████████| 20/20 [3:18:37<00:00, 595.87s/it]


\Training took: 4:52:34.31


In [98]:
def generateCaption(photo):
    in_text = START
    for i in range(max_length):
        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = caption_model.predict([photo,sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = idxtoword[yhat]
        in_text += ' ' + word
        if word == STOP:
            break
    final = in_text.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final

In [3]:
img_path = "./test.jpg"

img = Image.open(img_path)
img.load()

plt.imshow(img)
plt.show()

img = encodeImage(img).reshape((1,OUTPUT_DIM))
print(img.shape)
print("Caption:",generateCaption(img))
print("_____________________________________")

