In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# IMPORTS
from tensorflow.python.keras import backend as K
import tensorflow as tf
import pickle
import os
from keras.models import Model
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from tensorflow.keras.preprocessing import image
import numpy as np
from time import time
import matplotlib.pyplot as plt
from keras.utils import pad_sequences, to_categorical
from keras.utils.vis_utils import plot_model
from keras.layers import add
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))

Num GPUs Available:  1
Num CPUs Available:  1


In [4]:
config = tf.compat.v1.ConfigProto( device_count = {'GPU': 1 , 'CPU': 1} )
sess = tf.compat.v1.Session(config=config) 
K.set_session(sess)

### Loading features and vocabulary

In [17]:
# resnet_features = np.load('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/feature vectors/max_pool_train_feature_vectors.npy')
# # /content/drive/MyDrive/Information Retrieval Project/flickr dataset/feature vectors/feature_vectors.npy

word_to_index, index_to_word, caption_dict, resnet_train_encoding, train_content, test_content, xception_train_ecoding = None, None, None, None, None, None, None

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/word_to_index.pkl', 'rb') as file:
  word_to_index = pickle.load(file)

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/index_to_word.pkl', 'rb') as file:
  index_to_word = pickle.load(file)

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/caption_dict.pkl', 'rb') as file:
  caption_dict = pickle.load(file)

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/resnet_train_encoding.pkl', 'rb') as file:
  resnet_train_encoding = pickle.load(file)

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/resnet_test_encoding.pkl', 'rb') as file:
  resnet_test_encoding = pickle.load(file)

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/train_cap_dict.pkl', 'rb') as file:
  train_content = pickle.load(file)

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/test_cap_dict.pkl', 'rb') as file:
  test_content = pickle.load(file)

with open('/content/drive/MyDrive/Information Retrieval Project/flickr dataset/xception_train_encoding.pkl', 'rb') as file:
    xception_train_encoding = pickle.load(file)

VOCAB_SIZE = len(word_to_index) + 1

In [8]:
VOCAB_SIZE

2847

In [9]:
cap_max_len = 0
for img, cap_list in train_content.items():
  for caption in cap_list:
    temp = len(caption.split())
    if(temp > cap_max_len):
      cap_max_len = temp

In [10]:
cap_max_len

51

### Decoder

In [11]:
class Decoder():
    def __init__(self, img_input_dim, cap_max_len, VOCAB_SIZE):

      self.img_input_dim = img_input_dim
      self.cap_max_len = cap_max_len
      self.VOCAB_SIZE = VOCAB_SIZE
      
      self.build_model()

    def build_model(self):
      img_input = Input(shape=(self.img_input_dim,))
      img_dropout = Dropout(0.2)(img_input)
      img_dense = Dense(256, activation='relu')(img_dropout)

      cap_input = Input(shape=(self.cap_max_len,))
      cap_embedding = Embedding(input_dim=VOCAB_SIZE, output_dim=50, mask_zero=True)(cap_input)
      cap_dropout = Dropout(0.3)(cap_embedding)
      cap_lstm = LSTM(256)(cap_dropout)

      img_cap_add = add([img_dense, cap_lstm])
      decoder = Dense(256, activation='relu')(img_cap_add)
      outputs = Dense(VOCAB_SIZE, activation='softmax')(decoder)

      self.model = Model (inputs=[img_input, cap_input], outputs=outputs)
    
    def getSummary(self):
        return self.model.summary()
    
    def set_embed_weights(self, layer_num, embedding_matrix):
        self.model.layers[layer_num].set_weights([embedding_matrix])
        self.model.layers[layer_num].trainable = False
    
    def compile(self):
        self.model.compile(loss="categorical_crossentropy", optimizer="adam")
    
    def train(self, epochs, batch_size, train_cap_dict, train_encoding, word_to_index, max_len):
        steps = 1600 // batch_size
        for i in range(epochs):        
            generator = self.data_generator(train_cap_dict, train_encoding, word_to_index, max_len, batch_size)
            self.model.fit_generator(generator, steps_per_epoch=steps)

    def data_generator (self, train_content, train_encoding, word_to_index, max_len, batch_size):
        X1, X2, y = [], [], []
        cur_batch = 0

        while True:
            for imageID, cap_list in train_content.items():
                cur_batch += 1

                image = train_encoding [imageID]

                for caption in cap_list:
                    indexes = [word_to_index[key] for key in caption.split() if key in word_to_index]

                    for i in range (1, len(indexes)):
                        x_cap_temp = indexes[0 : i]
                        x_cap_pred = indexes[i]

                        x_cap_temp = pad_sequences([x_cap_temp], maxlen=max_len, value=0, padding='post')[0]
                        x_cap_pred = to_categorical([x_cap_pred], num_classes=VOCAB_SIZE)[0]

                        X1.append(image)
                        X2.append(x_cap_temp)
                        y.append(x_cap_pred)

                    if cur_batch == batch_size:
                        yield [[np.array(X1), np.array(X2)], np.array(y)]
                        
                        X1, X2, y = [], [], []
                        cur_batch = 0

### Embedding

In [12]:
index = {}
with open('/content/drive/MyDrive/Information Retrieval Project/glove.6B.50d.txt', encoding = 'utf8') as embedding_file:

  for line in embedding_file:

    temp = line.split()

    key = temp[0]
    embeddings = np.array(temp[1:], dtype = 'float')
    index[key] = embeddings
  

In [13]:
embedding_matrix = np.zeros(shape = (VOCAB_SIZE, 50))

for word in word_to_index.keys():
  if word in index:
    embedding_matrix[word_to_index[word]] = index.get(word)

## train model

#### resnet

In [None]:
decoder = Decoder(2048, cap_max_len, VOCAB_SIZE)

In [None]:
decoder.getSummary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 51)]         0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 51, 50)       142350      ['input_2[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 2048)         0           ['input_1[0][0]']                
                                                                                              

In [None]:
decoder.set_embed_weights(2, embedding_matrix)

In [None]:
decoder.compile()
epochs = 20
batch_size = 8
decoder.train(epochs, batch_size, train_content, resnet_train_encoding, word_to_index, cap_max_len)

  self.model.fit_generator(generator, steps_per_epoch=steps)




In [None]:
decoder.model.save('/content/drive/MyDrive/Information Retrieval Project/saved_models/' + 'resnet_model.h5')

#### xception

In [14]:
xception_decoder = Decoder(2048, cap_max_len, VOCAB_SIZE)

In [16]:
xception_decoder.set_embed_weights(2, embedding_matrix)

In [22]:
xception_decoder.compile()
epochs = 20
batch_size = 8
xception_decoder.train(epochs, batch_size, train_content, xception_train_encoding, word_to_index, cap_max_len)

  self.model.fit_generator(generator, steps_per_epoch=steps)




In [23]:
xception_decoder.model.save('/content/drive/MyDrive/Information Retrieval Project/saved_models/' + 'xception_model.h5')