In [None]:
import os
import shutil
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import ResNet50,preprocess_input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import *
import matplotlib.pyplot as plt
import numpy as np
import json
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

### directory to save model weights

In [None]:
# Folder that receives a saved copy of the model after every training epoch.
mw = './mw'

In [None]:
# Create the checkpoint folder. os.mkdir raises FileExistsError when the
# notebook is re-run, so use the idempotent form instead.
os.makedirs(mw, exist_ok=True)

### loading preprocessed files

In [None]:
def _load_pickle(path):
    """Return the single object stored in the pickle file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project's own preprocessing step.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Caption dictionaries for the training and validation splits.
train_description = _load_pickle('train_description.pkl')
val_description = _load_pickle('val_description.pkl')
# Vocabulary lookups in both directions.
word_to_index = _load_pickle('word_to_index.pkl')
index_to_word = _load_pickle('index_to_word.pkl')
# Pre-computed image encodings, looked up by the same keys as the captions.
encoding_train = _load_pickle('encoding_train.pkl')
encoding_val = _load_pickle('encoding_val.pkl')
# Array later handed to the Embedding layer as its fixed weights.
embedding_idx = np.load('embedding_idx.npy', allow_pickle=True)

### Calculating the vocabulary size and the maximum caption length

In [None]:
# One more class than there are vocabulary entries -- presumably index 0 is
# reserved for padding (the Embedding layer uses mask_zero=True); confirm
# against how word_to_index was built.
vocab_size = 1 + len(word_to_index)
vocab_size

1848

In [None]:
# Scan every training caption for the one with the most whitespace-separated
# tokens; its length becomes the fixed length of the text input.
max_len = 0
b = []  # tokens of the longest caption; stays empty when there are no captions
for captions in train_description.values():
    # The loop variable is deliberately not called `caption`, so re-running
    # this cell cannot overwrite the caption() function defined further down.
    for sentence in captions:
        tokens = sentence.split()
        if len(tokens) > max_len:
            max_len = len(tokens)
            b = tokens
print(max_len, b)

33 ['startseq', 'an', 'african', 'american', 'man', 'wearing', 'green', 'sweatshirt', 'and', 'blue', 'vest', 'is', 'holding', 'up', 'in', 'front', 'of', 'his', 'face', 'while', 'standing', 'on', 'busy', 'sidewalk', 'in', 'front', 'of', 'group', 'of', 'men', 'playing', 'instruments', 'endseq']


### Building model

In [None]:
# Image branch: 2048-value image encoding -> dropout -> 256-unit projection.
img_input = Input(shape=(2048,))
drop_img = Dropout(rate=0.5)(img_input)
img_act = Dense(units=256, activation='relu')(drop_img)

In [None]:
# Text branch: padded word indices -> frozen 50-d embeddings -> dropout -> LSTM.
text_input = Input(shape=(max_len,))
emb = Embedding(
    input_dim=vocab_size,
    output_dim=50,
    mask_zero=True,            # index 0 is treated as padding and masked out
    weights=[embedding_idx],   # start from the loaded embedding matrix
    trainable=False,           # and keep it fixed during training
)(text_input)
drog_txt = Dropout(rate=0.5)(emb)
lstm = LSTM(units=256)(drog_txt)

In [None]:
# Merge the two 256-d branch outputs by element-wise addition, then predict
# the next word as a softmax over the whole vocabulary.
combination = Add()([img_act, lstm])
dense_1 = Dense(units=256, activation='relu')(combination)
dense_2 = Dense(units=vocab_size, activation='softmax')(dense_1)
model = Model(outputs=dense_2, inputs=[img_input, text_input])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 50)       92400       input_2[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 2048)         0           input_1[0][0]                    
______________________________________________________________________________________________

In [None]:
# train_description -> {img_id: list of caption strings}
# word_to_index     -> {word: integer index}
# encoding_train    -> {img_id: image feature vector of shape (2048,)}

In [None]:
def data_generator(train_description,word_to_index,encoding_train,batch_size=3,max_len=35,vocab_size=None):
    """Endlessly yield training batches of (image, partial caption) -> next word.

    Every caption is expanded into one sample per prefix: the first ``i``
    word indices (right-padded with zeros to ``max_len``) are the text input
    and the one-hot encoding of word ``i`` is the target.

    Args:
        train_description: mapping of image key -> list of caption strings.
        word_to_index: mapping of word -> integer index; words that are not
            in it are dropped from the caption.
        encoding_train: mapping of image key -> image encoding.
        batch_size: number of *images* (not samples) gathered per batch.
        max_len: length every text prefix is padded or truncated to.
        vocab_size: width of the one-hot targets. Defaults to
            ``len(word_to_index) + 1`` so the generator no longer depends on
            a notebook-level global.

    Yields:
        ``([images, prefixes], targets)`` as numpy arrays, one row per sample.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised on first use).
        KeyError: if an image key has no entry in ``encoding_train``.
    """
    if batch_size < 1:
        # n would never equal batch_size: nothing is yielded and the sample
        # lists grow without bound.
        raise ValueError('batch_size must be a positive integer')
    if not train_description:
        # Nothing to iterate over; stop instead of spinning forever.
        return
    if vocab_size is None:
        vocab_size = len(word_to_index) + 1

    x1, x2, y = [], [], []
    n = 0  # images collected for the batch currently being built
    while True:
        for key, desc_list in train_description.items():
            n += 1
            photo = encoding_train[key]
            for desc in desc_list:
                seq = [word_to_index[w] for w in desc.split() if w in word_to_index]
                # One sample per position: prefix seq[:i] predicts seq[i].
                for i in range(1, len(seq)):
                    xi = pad_sequences([seq[:i]], maxlen=max_len, padding='post', truncating='post')[0]
                    yi = to_categorical([seq[i]], num_classes=vocab_size)[0]
                    x1.append(photo)
                    x2.append(xi)
                    y.append(yi)
            if n == batch_size:
                yield ([np.array(x1), np.array(x2)], np.array(y))
                x1, x2, y = [], [], []
                n = 0

In [None]:
# Next-word prediction is a multi-class problem over one-hot targets, hence
# categorical cross-entropy; 'adam' selects Adam with its default settings.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Schedule for the first training phase.
epochs, batch_size = 30, 3
# The generator emits one batch per `batch_size` images, so this many steps
# make one pass over each split (a trailing partial batch is not counted).
steps = len(train_description) // batch_size
val_steps = len(val_description) // batch_size

### Training for the first 30 epochs at the higher (default Adam) learning rate

In [None]:
# First training phase: run one Keras epoch per iteration so a full copy of
# the model can be saved after every pass over the training captions.
for i in range(epochs):
    # New generators each epoch so every pass starts again from the first image.
    gen=data_generator(train_description,word_to_index,encoding_train,batch_size,max_len)
    val_gen=data_generator(val_description,word_to_index,encoding_val,batch_size,max_len)
    print('epochs {}/{}'.format(i+1,epochs))
    # NOTE(review): fit_generator is deprecated in newer TensorFlow releases
    # in favour of model.fit, which accepts generators directly.
    model.fit_generator(gen,epochs=1,steps_per_epoch=steps,validation_data=val_gen, validation_steps=val_steps)
    # Write into the folder configured in `mw` rather than a hard-coded copy
    # of its path, so changing `mw` moves the checkpoints too.
    model.save(os.path.join(mw,'epochs'+str(i+1)+'.h5'))

epochs 1/30
epochs 2/30
epochs 3/30
epochs 4/30
epochs 5/30
epochs 6/30
epochs 7/30
epochs 8/30
epochs 9/30
epochs 10/30
epochs 11/30
epochs 12/30
epochs 13/30
epochs 14/30
epochs 15/30
epochs 16/30
epochs 17/30
epochs 18/30
epochs 19/30
epochs 20/30
epochs 21/30
epochs 22/30
epochs 23/30
epochs 24/30
epochs 25/30
epochs 26/30
epochs 27/30
epochs 28/30
epochs 29/30
epochs 30/30


### Training for the remaining 20 epochs at a lower learning rate but with a larger batch size

In [None]:
# Second phase: recompile with a lower learning rate (1e-4) for fine-tuning.
model.compile(loss='categorical_crossentropy',optimizer=Adam(0.0001))
# Resume from the epoch-30 checkpoint. The first phase saves its checkpoints
# into `mw`, so load from there instead of the unrelated 'model_weights/' path.
model.load_weights(os.path.join(mw,'epochs30.h5'))

In [None]:
# Second phase uses twice as many images per batch; recompute how many
# generator batches make one pass over each split.
batch_size = 6
steps, val_steps = len(train_description) // batch_size, len(val_description) // batch_size

In [None]:
# Second training phase: epochs 31-50, again one Keras epoch per iteration
# with a model save after each one.
for i in range(30,50):
    # New generators each epoch so every pass starts again from the first image.
    gen=data_generator(train_description,word_to_index,encoding_train,batch_size,max_len)
    val_gen=data_generator(val_description,word_to_index,encoding_val,batch_size,max_len)
    print('epochs {}/{}'.format(i+1,50))
    # NOTE(review): fit_generator is deprecated in newer TensorFlow releases
    # in favour of model.fit, which accepts generators directly.
    model.fit_generator(gen,epochs=1,steps_per_epoch=steps,validation_data=val_gen, validation_steps=val_steps)
    # Write into the folder configured in `mw` rather than a hard-coded copy
    # of its path, so changing `mw` moves the checkpoints too.
    model.save(os.path.join(mw,'epochs'+str(i+1)+'.h5'))

epochs 31/50
epochs 32/50
epochs 33/50
epochs 34/50
epochs 35/50
epochs 36/50
epochs 37/50
epochs 38/50
epochs 39/50
epochs 40/50
epochs 41/50
epochs 42/50
epochs 43/50
epochs 44/50
epochs 45/50
epochs 46/50
epochs 47/50
epochs 48/50
epochs 49/50
epochs 50/50


### Caption generation

In [None]:
# ImageNet-pretrained ResNet50, re-wrapped so that `rn` returns the output of
# its 'avg_pool' layer for a given input image batch.
base_model = ResNet50(weights='imagenet')
pooled_output = base_model.get_layer('avg_pool').output
rn = Model(outputs=pooled_output, inputs=base_model.input)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5


In [None]:
def preprocess(img):
    """Encode one image array with the `rn` network.

    A leading batch axis is added, the ResNet50 `preprocess_input`
    normalisation is applied, and the network's prediction is returned
    flattened to one dimension.
    """
    batch = preprocess_input(np.expand_dims(img, axis=0))
    return rn.predict(batch).flatten()

In [None]:
def caption(photo):
    """Greedily decode a caption for one encoded image.

    Starting from 'startseq', the model is asked for the most likely next
    word given the image and the words produced so far. Decoding stops when
    'endseq' is predicted or after `max_len` words.

    Args:
        photo: image encoding with a leading batch axis of 1, as passed by
            gen_caption.

    Returns:
        The generated words joined by single spaces, without the
        'startseq'/'endseq' markers.
    """
    words = ['startseq']
    for _ in range(max_len):
        sequence = [word_to_index[w] for w in words if w in word_to_index]
        # Pad on the right, matching how data_generator pads training inputs.
        sequence = pad_sequences([sequence], maxlen=max_len, padding='post')
        y_pred = model.predict([photo, sequence], verbose=0)
        word = index_to_word[np.argmax(y_pred)]
        if word == 'endseq':
            break
        words.append(word)
    # Drop only the start marker. 'endseq' is never appended, so a caption
    # that reaches max_len without it keeps its final word.
    return ' '.join(words[1:])

In [None]:
def gen_caption(path):
    """Show the image stored at `path` and print the caption generated for it.

    The image is loaded at 224x224, encoded with preprocess(), displayed with
    matplotlib, and the caption() result is printed. Nothing is returned.
    """
    pixels = image.img_to_array(image.load_img(path, target_size=(224,224)), dtype='uint8')
    features = preprocess(pixels)
    plt.imshow(pixels)
    plt.show()
    # caption() expects a leading batch axis, so reshape to a single row.
    print(caption(features.reshape((1,-1))))