In [None]:
import numpy as np
import pandas as pd
import cv2
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import csv 
from glob import glob
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

In [None]:
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model
from keras.models import Model, Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers.merge import add
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Flatten,Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector,Concatenate, CuDNNLSTM
from keras.models import Sequential, Model


# Loading Precomputed Image Features

In [None]:
import pickle

# Load the precomputed image feature vectors, keyed by image file name
# (e.g. "ktm_154.jpg"); presumably ResNet50 features, judging by the file name.
# The previous `image_features = {}` initialisation was a misspelling of
# `images_features` and was never read, so it has been dropped.
# NOTE(review): pickle.load can execute arbitrary code - only load feature
# files that come from a trusted source.
with open("/kaggle/input/modal-and-pickle/combined_resnet50_images_features.pkl", "rb") as f:
    images_features = pickle.load(f)
    

# Preprocessing Captions and Creating Vocabulary

In [None]:
caption_path = '/kaggle/input/stanford-paragraph-dataset-in-nepali/stanford_cultural_nepalis.csv'

# One dictionary per split, each mapping image file name -> raw caption text.
captions_dict_train, captions_dict_test, captions_dict_val = {}, {}, {}

# newline='' is what the csv module expects so quoted fields containing line
# breaks are parsed correctly; the encoding is stated explicitly because the
# captions are Devanagari text and must not depend on the platform default.
with open(caption_path, newline='', encoding='utf-8') as csvfile:
    captions = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in captions:
        # Columns: image id, caption, train flag, test flag, validation flag.
        # Short rows are skipped explicitly; this replaces a bare
        # `except: pass` that would also have hidden unrelated errors.
        if len(row) < 5:
            continue

        img_name = row[0] + ".jpg"
        caption, train_input, test_input, val_input = row[1:5]

        # Only keep captions whose image has precomputed features
        # (this also discards a header row, if the file has one).
        if img_name not in images_features:
            continue

        # A row is assigned to the first split whose flag is 'TRUE'.
        if train_input == 'TRUE':
            captions_dict_train[img_name] = caption
        elif test_input == 'TRUE':
            captions_dict_test[img_name] = caption
        elif val_input == 'TRUE':
            captions_dict_val[img_name] = caption

print("size of train, test and validation dataset:" ,len(captions_dict_train),len(captions_dict_test),len(captions_dict_val))

In [None]:
def preprocessed(txt):
    """Normalise a caption and wrap it in the start/end sequence markers.

    The text is lower-cased, a space is inserted in front of every danda
    ('।') so that it separates from the preceding word when the caption is
    split on whitespace, and apostrophes are removed. The result is
    returned as 'startofseq <text> endofseq'.
    """
    lowered = txt.lower()
    danda_spaced = lowered.replace('।', ' ।')
    cleaned = danda_spaced.replace("'", "")
    return ' '.join(('startofseq', cleaned, 'endofseq'))

count_words = {}

# Normalise every training caption and add the start/end markers.
for img_name in captions_dict_train:
    captions_dict_train[img_name] = preprocessed(captions_dict_train[img_name])

# Tally how often each whitespace-separated token occurs in the training captions.
for text in captions_dict_train.values():
    for token in text.split():
        count_words[token] = count_words.get(token, 0) + 1

# Vocabulary: tokens seen more than THRESH times, numbered from 1 in
# first-seen order.
THRESH = 5
frequent_tokens = [token for token, freq in count_words.items() if freq > THRESH]
words_dict = {token: index for index, token in enumerate(frequent_tokens, start=1)}
count = len(words_dict) + 1  # next free index, as the original counter ended up

# Replace each caption by its list of vocabulary indices; tokens that are
# not in the vocabulary are dropped.
for img_name, text in captions_dict_train.items():
    captions_dict_train[img_name] = [
        words_dict[token] for token in text.split() if token in words_dict
    ]

print("training Captions after preprocessing\n ",list(captions_dict_train.items())[:5])


In [None]:
# Encode the validation captions with the TRAINING vocabulary (words_dict).
#
# The model's embedding and softmax sizes come from words_dict, so every
# split has to use the same word -> index mapping. Building a separate
# vocabulary from train+validation counts (as this cell used to do) assigns
# different indices to the same words and can produce indices beyond the
# model's vocabulary size. It also added validation counts into count_words,
# and added them again on every re-run of the cell.

# Normalise the validation captions exactly like the training captions.
for img_name in captions_dict_val:
    captions_dict_val[img_name] = preprocessed(captions_dict_val[img_name])

# Kept as an alias so code that refers to words_dict_val keeps working and
# now sees the same mapping as training.
words_dict_val = words_dict

# Convert validation captions to vocabulary indices; tokens that are not in
# the training vocabulary are dropped, as for the training captions.
for img_name, text in captions_dict_val.items():
    captions_dict_val[img_name] = [
        words_dict[token] for token in text.split() if token in words_dict
    ]

print("Validation Captions after preprocessing\n", list(captions_dict_val.items())[:5])


In [None]:
import pickle

# Write the word -> index vocabulary to disk.
with open("./words_dict_nepali_sc.pkl", "wb") as vocab_file:
    pickle.dump(words_dict, vocab_file)


# Merge Model

In [None]:

# Vocabulary indices start at 1, so one extra slot is needed for index 0
# (the padding value masked out by the embedding below).
vocab_size = 1 + len(words_dict)

# Length of the longest encoded training caption (0 if there are none);
# this fixes the length of the text input.
MAX_LEN = max((len(tokens) for tokens in captions_dict_train.values()), default=0)

# --- image branch: 2048-d feature vector -> dropout -> 256-d dense ---
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# --- text branch: padded index sequence -> embedding -> dropout -> LSTM ---
inputs2 = Input(shape=(MAX_LEN,))
# NOTE(review): the embedding width is set to MAX_LEN here; confirm that
# coupling the embedding size to the caption length is intended.
se1 = Embedding(vocab_size, MAX_LEN, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# --- decoder: element-wise sum of both branches -> dense -> softmax over the vocabulary ---
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Inputs are [image features, partial caption]; the output is the next word.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Optional inspection helpers:
# model.summary()
# plot_model(model, show_shapes=True)


# Training the RNN: Using a Generator Function to Create the Inputs

In [None]:
N=32 #BatchSize: number of captions per generator batch (each caption expands into several training samples)
VOCAB_SIZE = len(words_dict)+1 # +1 because vocabulary indices start at 1, leaving index 0 unused by any word

def progressive_generator(photo_dict, caption_dict, MAX_LEN,VOCAB_SIZE, batch_size=None):
    """Endlessly yield training batches built from slices of ``caption_dict``.

    Args:
        photo_dict: mapping image name -> image feature vector.
        caption_dict: mapping image name -> encoded caption (list of word
            indices).
        MAX_LEN: length every partial caption is padded/truncated to.
        VOCAB_SIZE: number of classes for the one-hot encoded target word.
        batch_size: number of captions per batch. Defaults to the
            module-level ``N`` when omitted, which keeps existing calls
            working unchanged.

    Yields:
        ``([X, y_in], y_out)`` as returned by ``create_sequences`` for each
        consecutive slice of captions. The last slice of a pass may hold
        fewer captions than ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is not positive, or ``caption_dict`` is
            empty (previously that case looped forever without yielding).
    """
    step = N if batch_size is None else batch_size
    if step <= 0:
        raise ValueError("batch_size must be a positive integer")

    while True:
        # Materialise the items once per pass over the data instead of once
        # per batch; rebuilding the list for every slice was quadratic.
        items = list(caption_dict.items())
        if not items:
            raise ValueError("caption_dict is empty; there is nothing to generate batches from")

        for start in range(0, len(items), step):
            batch_captions = dict(items[start:start + step])
            X, y_in, y_out = create_sequences(photo_dict, batch_captions, MAX_LEN, VOCAB_SIZE)
            yield [X, y_in], y_out
    

def create_sequences(photo, caption, MAX_LEN,VOCAB_SIZE):
    """Expand encoded captions into next-word prediction samples.

    For every caption and every split position ``cut`` from 1 to
    ``len(caption) - 1`` one sample is produced: the image features, the
    first ``cut`` word indices padded/truncated (at the end) to ``MAX_LEN``,
    and the one-hot encoding of the word at position ``cut``.

    Args:
        photo: mapping image name -> image feature vector.
        caption: mapping image name -> list of word indices.
        MAX_LEN: length of each padded input sequence.
        VOCAB_SIZE: number of classes of the one-hot target.

    Returns:
        Three numpy arrays: image features, padded input sequences and
        one-hot targets, aligned sample by sample.
    """
    features, prefixes, targets = [], [], []

    for img_name, tokens in caption.items():
        for cut in range(1, len(tokens)):
            features.append(photo[img_name])

            padded_prefix = pad_sequences(
                [tokens[:cut]], maxlen=MAX_LEN, padding='post', truncating='post'
            )[0]
            next_word = to_categorical([tokens[cut]], num_classes=VOCAB_SIZE)[0]

            prefixes.append(padded_prefix)
            targets.append(next_word)

    return np.array(features), np.array(prefixes), np.array(targets)

In [None]:
# steps = len(captions_dict_train)/N
# generator = progressive_generator(images_features, captions_dict_train, MAX_LEN, VOCAB_SIZE)
# model.fit(generator, epochs=50, steps_per_epoch=steps, verbose=1)
# model.save('./cultural_nepali_50_test' + '.h5')

In [None]:

# model.save('./image_caption_model_newarch_stanford_nepali50' + '.h5')

#  Prediction

In [None]:
# Recompute the sizes used at prediction time from the training data.
vocab_size = 1 + len(words_dict)
MAX_LEN = max(map(len, captions_dict_train.values()), default=0)

# Reverse lookup: vocabulary index -> word, used to decode model predictions.
inv_dict = dict(zip(words_dict.values(), words_dict.keys()))

In [None]:
## For selected test images

#2373586.jpg 75 epoch
#2361833.jpg 75 epoch

#2394335.jpg 150 epoch
#2364210.jpg 150 epoch
#2402430.jpg 2395361.jpg 150 epoch
#2335374.jpg 2373374.jpg 150 epoch


model = tf.keras.models.load_model('../input/imagecaptioningmodels/image_caption_model_newarch150.h5')

img_name = '01659403106.jpg'
test_img_path = '../input/paragraph/' + img_name

# test_feature used to be read without ever being assigned in this cell (its
# lookup was commented out), so the cell either failed on a fresh kernel or
# silently captioned the features of whatever image another cell had loaded.
# Look the features up explicitly and fail with a clear message otherwise.
if img_name not in images_features:
    raise KeyError(
        "no precomputed features for " + img_name
        + "; add its feature vector to images_features before captioning it"
    )
test_feature = images_features[img_name]

test_img = cv2.imread(test_img_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if test_img is None:
    raise FileNotFoundError("could not read image: " + test_img_path)
test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)


# Greedy decoding: feed the words generated so far and append the most
# probable next word until 'endofseq' is produced or MAX_LEN steps are done.
text_inp = ['startofseq']
count = 0
caption = ''
while count < MAX_LEN:
    count += 1
    encoded = [[words_dict[word] for word in text_inp]]
    encoded = pad_sequences(encoded, padding='post', truncating='post', maxlen=MAX_LEN)
    data_list=[np.array(test_feature).reshape(1,-1), np.array(encoded).reshape(1,-1)]
    prediction = np.argmax(model.predict(data_list))
    # Index 0 (padding) has no entry in inv_dict; stop instead of raising KeyError.
    sampled_word = inv_dict.get(prediction)
    if sampled_word is None:
        break
    caption = caption + ' ' + sampled_word

    if sampled_word == 'endofseq':
        break
    text_inp.append(sampled_word)

caption= caption.replace('endofseq','')
predicted= caption.split()
# BLEU scoring is left disabled in this cell, as before:
# actual= captions_dict_test[img_name].split()

# blueScore= sentence_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25))

# print('BLEU-1: %f' % sentence_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
# print('BLEU-2: %f' % sentence_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
# print('BLEU-3: %f' % sentence_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
# print('BLEU-4: %f' % sentence_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

plt.figure()
plt.imshow(test_img)
# plt.title(img_name+" - BLUE Score: "+str(blueScore))
plt.show()
print(caption.replace(' .','.'))

In [None]:
## For selected test images

#2373586.jpg 75 epoch
#2361833.jpg 75 epoch

#2394335.jpg 150 epoch
#2364210.jpg 150 epoch
#2402430.jpg 2395361.jpg 150 epoch
#2335374.jpg 2373374.jpg 150 epoch
#ktm_154
#p_168


model = tf.keras.models.load_model('/kaggle/input/modal-file/cultural_nepali_50_test.h5')
img_name='ktm_154.jpg'
test_feature = images_features[img_name]
test_img_path = '/kaggle/input/stanford-cultural-images/stanford_img/content/stanford_images/'+img_name
test_img = cv2.imread(test_img_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if test_img is None:
    raise FileNotFoundError("could not read image: " + test_img_path)
test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)


# Greedy decoding: feed the words generated so far and append the most
# probable next word until 'endofseq' is produced or MAX_LEN steps are done.
text_inp = ['startofseq']
count = 0
caption = ''
while count < MAX_LEN:
    count += 1
    encoded = [[words_dict[word] for word in text_inp]]
    encoded = pad_sequences(encoded, padding='post', truncating='post', maxlen=MAX_LEN)
    data_list=[np.array(test_feature).reshape(1,-1), np.array(encoded).reshape(1,-1)]
    prediction = np.argmax(model.predict(data_list))
    # Index 0 (padding) has no entry in inv_dict; stop instead of raising KeyError.
    sampled_word = inv_dict.get(prediction)
    if sampled_word is None:
        break
    caption = caption + ' ' + sampled_word

    if sampled_word == 'endofseq':
        break
    text_inp.append(sampled_word)

caption= caption.replace('endofseq','')
predicted= caption.split()

# Reference tokens: the raw test caption normalised the same way as the
# training captions (lower-case, danda split off, apostrophes removed), with
# the startofseq/endofseq markers stripped so they match `predicted`.
actual = preprocessed(captions_dict_test[img_name]).split()[1:-1]
# sentence_bleu expects a LIST of reference token lists. Passing the token
# list itself made NLTK treat every word as a separate reference whose
# "tokens" were single characters, which gives meaningless scores.
references = [actual]

blueScore= sentence_bleu(references, predicted, weights=(0.25, 0.25, 0.25, 0.25))

print('BLEU-1: %f' % sentence_bleu(references, predicted, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % sentence_bleu(references, predicted, weights=(0.5, 0.5, 0, 0)))
# NOTE(review): these weights sum to 0.9 rather than 1; kept as they were.
print('BLEU-3: %f' % sentence_bleu(references, predicted, weights=(0.3, 0.3, 0.3, 0)))
print('BLEU-4: %f' % sentence_bleu(references, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

plt.figure()
plt.imshow(test_img)
plt.title(img_name+" - BLUE Score: "+str(blueScore))
plt.show()
print(caption.replace(' .','.'))

In [None]:
## For random test images

model = tf.keras.models.load_model('../input/imagecaptioningmodels/image_caption_model_newarch75.h5')

# Build the list of test image names once instead of on every iteration.
test_names = list(captions_dict_test)

plt.figure()
for _ in range(3):

    num = np.random.randint(0, len(test_names))
    img_name = test_names[num]
    test_feature = images_features[img_name]
    test_img_path = '/kaggle/input/testpucture/'+img_name

    test_img = cv2.imread(test_img_path)
    # cv2.imread returns None instead of raising when the file cannot be read.
    if test_img is None:
        raise FileNotFoundError("could not read image: " + test_img_path)
    test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)

    # Greedy decoding: feed the words generated so far and append the most
    # probable next word until 'endofseq' is produced or MAX_LEN steps are done.
    text_inp = ['startofseq']

    count = 0
    caption = ''
    while count < MAX_LEN:
        count += 1

        # A comprehension avoids reusing the outer loop's variable name for
        # the words of the partial caption.
        encoded = [[words_dict[word] for word in text_inp]]
        encoded = pad_sequences(encoded, padding='post', truncating='post', maxlen=MAX_LEN)
        data_list=[np.array(test_feature).reshape(1,-1), np.array(encoded).reshape(1,-1)]
        prediction = np.argmax(model.predict(data_list))
        # Index 0 (padding) has no entry in inv_dict; stop instead of raising KeyError.
        sampled_word = inv_dict.get(prediction)
        if sampled_word is None:
            break
        caption = caption + ' ' + sampled_word

        if sampled_word == 'endofseq':
            break

        text_inp.append(sampled_word)

    caption= caption.replace('endofseq','')
    predicted= caption.split()

    # Reference tokens: the raw test caption normalised the same way as the
    # training captions, with the startofseq/endofseq markers stripped.
    actual = preprocessed(captions_dict_test[img_name]).split()[1:-1]
    # sentence_bleu expects a LIST of reference token lists; passing the
    # token list itself made NLTK compare against the characters of each word.
    references = [actual]

    blueScore= sentence_bleu(references, predicted, weights=(0.25, 0.25, 0.25, 0.25))

    plt.imshow(test_img)
    plt.title(img_name+" - BLUE Score: "+str(blueScore))
    plt.show()
    print(caption.replace(' .','.') )
    print('BLEU-1: %f' % sentence_bleu(references, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % sentence_bleu(references, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9 rather than 1; kept as they were.
    print('BLEU-3: %f' % sentence_bleu(references, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % sentence_bleu(references, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
    
                               