In [1]:
import json
import tensorflow
from tqdm import tqdm
from keras.models import Model
from keras.layers import Input,Dense,Dropout
from keras.layers.recurrent import LSTM
from keras.layers.merge import add
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pickle

In [2]:
def load_json_file(json_file_name):
  """Read a JSON file and return two of its top-level entries.

  Args:
    json_file_name: Path of the JSON file to read.

  Returns:
    A tuple ``(unique_code_to_word_mapping, train_image_encoded_captions)``
    holding the values stored under those two keys.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
    KeyError: If either expected key is absent from the JSON object.
  """
  # The context manager closes the file even when parsing or a key lookup
  # fails; JSON text is read as UTF-8 instead of the platform default.
  with open(json_file_name,encoding="utf-8") as file_obj:
    all_data=json.load(file_obj)
  unique_code_to_word_mapping=all_data['unique_code_to_word_mapping']
  train_image_encoded_captions=all_data['train_image_encoded_captions']
  return unique_code_to_word_mapping,train_image_encoded_captions

In [3]:
# Path of the JSON file that stores the code-to-word mapping and the
# encoded training captions (the two keys read by load_json_file).
json_file_name="/content/drive/MyDrive/Colab Notebooks/Image Captioning/json_files/all_data.json"
# Both results are kept as notebook-level globals; later cells read them by name.
unique_code_to_word_mapping,train_image_encoded_captions=load_json_file(json_file_name)

In [4]:
def load_pickle_file(pickle_file_name):
  """Load and return the object stored in a pickle file.

  Args:
    pickle_file_name: Path of the pickle file to read.

  Returns:
    The unpickled object.

  Raises:
    OSError: If the file cannot be opened.
    pickle.UnpicklingError: If the file content is not a valid pickle.
  """
  # SECURITY: unpickling can execute arbitrary code; only load files that
  # were produced by a trusted source.
  # The context manager guarantees the handle is closed after loading.
  with open(pickle_file_name,"rb") as file_obj:
    train_image_features=pickle.load(file_obj)
  return train_image_features

In [5]:
# Path of the pickle file holding the training image features.
train_image_features_pickle="/content/drive/MyDrive/Colab Notebooks/Image Captioning/pickle_files/train_image_features.pkl"
# Kept as a notebook-level global; create_generator later indexes it by image name.
train_image_features=load_pickle_file(train_image_features_pickle)

In [6]:
def create_generator(train_image_encoded_captions,train_image_features,batch_size,vocabulary_size,max_caption_length,drop_remainder=True):
  """Yield training batches of (image feature, partial caption) -> next word.

  For every caption of every image, each prefix ``caption[0:index]`` becomes
  one sample whose target is ``caption[index]``. Prefixes are post-padded to
  ``max_caption_length`` and targets are one-hot encoded over
  ``vocabulary_size`` classes.

  Args:
    train_image_encoded_captions: Mapping from image name to a list of
      captions; each caption is indexed and sliced as a sequence of word
      codes.
    train_image_features: Mapping from image name to that image's feature
      vector.
    batch_size: Number of IMAGES (not samples) accumulated before a batch
      is yielded; the number of samples per batch therefore varies.
    vocabulary_size: Number of classes for the one-hot targets.
    max_caption_length: Padded length of every input sequence, and the upper
      bound on the target index taken from a caption.
    drop_remainder: When True (default, the historical behaviour) images left
      over after the last full group of ``batch_size`` are discarded. When
      False they are yielded as a final, smaller batch.

  Yields:
    ``[[features, sentences], next_words]`` where each element is a NumPy
    array with one row per sample.
  """
  all_train_features=[]
  all_sentences=[]
  all_next_words=[]
  temp_size=batch_size
  for image_name in tqdm(train_image_encoded_captions):
    temp_size-=1
    for caption in train_image_encoded_captions[image_name]:
      # Clamp to the caption's real length so captions shorter than
      # max_caption_length no longer raise IndexError on caption[index].
      last_index=min(len(caption),max_caption_length)
      for index in range(1,last_index):
        sentence_now=caption[0:index]
        next_word=caption[index]
        sentence_now=pad_sequences([sentence_now], maxlen=max_caption_length, padding='post')[0]
        categorical_word=to_categorical([next_word],vocabulary_size)[0]
        all_train_features.append(train_image_features[image_name])
        all_sentences.append(sentence_now)
        all_next_words.append(categorical_word)
    if(temp_size==0):
      yield [[np.array(all_train_features), np.array(all_sentences)], np.array(all_next_words)]
      # Start accumulating the next group of images.
      all_train_features=[]
      all_sentences=[]
      all_next_words=[]
      temp_size=batch_size
  # Optionally emit the images that did not fill a complete batch.
  if not drop_remainder and all_sentences:
    yield [[np.array(all_train_features), np.array(all_sentences)], np.array(all_next_words)]

In [7]:
def make_lstm_model(vocabulary_size,max_caption_length):
  """Assemble the two-input captioning network and return it uncompiled.

  The first input is a 2048-wide vector and the second a sequence of
  ``max_caption_length`` word codes. Each branch is reduced to 256 units,
  the branches are summed, and the sum feeds a softmax over
  ``vocabulary_size`` classes.

  Args:
    vocabulary_size: Size of the embedding vocabulary and of the output layer.
    max_caption_length: Length of the caption input sequence.

  Returns:
    A Keras ``Model`` with inputs ``[feature_input, caption_input]``.
  """
  # Feature branch: dropout followed by a 256-unit projection.
  feature_input=Input(shape=(2048,))
  feature_branch=Dropout(0.2)(feature_input)
  feature_branch=Dense(256,activation='relu')(feature_branch)

  # Caption branch: embedding -> dropout -> LSTM summary vector.
  caption_input=Input(shape=(max_caption_length,))
  caption_branch=Embedding(vocabulary_size,256)(caption_input)
  caption_branch=Dropout(0.2)(caption_branch)
  caption_branch=LSTM(256)(caption_branch)

  # Both branches are 256 wide, so they can be merged by element-wise sum.
  merged=add([feature_branch,caption_branch])
  decoder=Dense(256,activation='relu')(merged)
  word_probabilities=Dense(vocabulary_size,activation='softmax')(decoder)

  return Model(inputs=[feature_input,caption_input],outputs=word_probabilities)

In [8]:
# Training configuration.
max_caption_length=38  # NOTE(review): presumably the longest encoded caption - confirm against the preprocessing step
vocabulary_size=len(unique_code_to_word_mapping)
epochs=7
batch_size=5  # images per generator batch (create_generator counts images, not samples)
steps=len(train_image_encoded_captions)//batch_size

lstm_model=make_lstm_model(vocabulary_size,max_caption_length)
lstm_model.summary()
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
for i in range(epochs):
  # The generator makes a single pass over the data, so build a new one per epoch.
  generator=create_generator(train_image_encoded_captions,train_image_features,batch_size,vocabulary_size,max_caption_length)
  # Model.fit accepts generators directly; fit_generator is deprecated.
  lstm_model.fit(generator,epochs=1,steps_per_epoch=steps,verbose=2)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 38)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 38, 256)      1619712     ['input_2[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['input_1[0][0]']                
                                                                                              

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 6000/6000 [05:10<00:00, 19.32it/s]

1200/1200 - 311s - loss: 1.6819 - accuracy: 0.7220 - 311s/epoch - 259ms/step



100%|██████████| 6000/6000 [05:00<00:00, 19.94it/s]

1200/1200 - 301s - loss: 1.2689 - accuracy: 0.7540 - 301s/epoch - 251ms/step



100%|██████████| 6000/6000 [04:59<00:00, 20.02it/s]

1200/1200 - 300s - loss: 1.1104 - accuracy: 0.7686 - 300s/epoch - 250ms/step



100%|██████████| 6000/6000 [05:01<00:00, 19.89it/s]

1200/1200 - 302s - loss: 1.0142 - accuracy: 0.7790 - 302s/epoch - 251ms/step



100%|██████████| 6000/6000 [05:03<00:00, 19.79it/s]

1200/1200 - 303s - loss: 0.9485 - accuracy: 0.7872 - 303s/epoch - 253ms/step



100%|██████████| 6000/6000 [04:59<00:00, 20.04it/s]

1200/1200 - 300s - loss: 0.8950 - accuracy: 0.7942 - 300s/epoch - 250ms/step



100%|██████████| 6000/6000 [04:57<00:00, 20.19it/s]

1200/1200 - 297s - loss: 0.8554 - accuracy: 0.8002 - 297s/epoch - 248ms/step





In [9]:
def create_pickle_dump(lstm_model_pickle,model=None):
  """Pickle a model to disk.

  Args:
    lstm_model_pickle: Destination path of the pickle file.
    model: Object to serialise. When omitted, the notebook-level global
      ``lstm_model`` is used, which matches the original behaviour.

  Raises:
    NameError: If ``model`` is omitted and no global ``lstm_model`` exists.
    OSError: If the destination cannot be opened for writing.
  """
  if model is None:
    # Backward-compatible fallback to the trained model defined in the notebook.
    model=lstm_model
  # The context manager flushes and closes the file even if pickling fails.
  with open(lstm_model_pickle,"wb") as file_obj:
    pickle.dump(model,file_obj)

In [10]:
# Destination path for the pickled model.
lstm_model_pickle="/content/drive/MyDrive/Colab Notebooks/Image Captioning/pickle_files/lstm_model.pkl"
# create_pickle_dump serialises the notebook-level `lstm_model` to this path.
create_pickle_dump(lstm_model_pickle)

INFO:tensorflow:Assets written to: ram://e636b3bd-70c5-448b-847e-d878af36c4e1/assets


