## Import Library

In [None]:
!pip install rouge

In [None]:
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
%matplotlib inline

import string
import os
import glob
from PIL import Image
from time import time
import collections
import random
import numpy as np
import json
import tensorflow as tf
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import requests

from keras import Input, layers
from keras import optimizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, Activation, Flatten, Reshape, Dropout
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import add
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

from tqdm import tqdm

## Import Dataset dan data preprocessing

In [None]:
# Dataset locations (relative `../input/` layout): COCO 2017 train/test image
# folders and the GloVe 6B 200-dimensional word-vector file.
train_images = '../input/coco-2017-dataset/coco2017/train2017/'
test_images = '../input/coco-2017-dataset/coco2017/test2017/'
glove_path = '../input/glove6b/glove.6B.200d.txt'

In [None]:
# Sanity check: how many files each image folder contains.
train_images_len, test_images_len = (
    len(os.listdir(folder)) for folder in (train_images, test_images)
)
print(train_images_len, test_images_len, sep='\n')

In [None]:
# Load the COCO caption annotations. The encoding is given explicitly because
# JSON is UTF-8 by specification, while open() otherwise falls back to the
# platform's locale encoding.
annotation_file = '../input/coco-2017-dataset/coco2017/annotations/captions_train2017.json'
with open(annotation_file, 'r', encoding='utf-8') as f:
    annotations = json.load(f)
# Show one record to see which fields an annotation carries.
print(annotations['annotations'][0])

In [None]:
# Collect all captions that belong to the same image under that image's path.
image_path_to_caption = collections.defaultdict(list)
for annotation in annotations['annotations']:
    # Image file names are the image id zero-padded to 12 digits.
    image_file = '%012d.jpg' % (annotation['image_id'])
    image_path_to_caption[train_images + image_file].append(f"{annotation['caption']}")

In [None]:
# Number of distinct captioned images, then the captions of one sample image.
# NOTE: the mapping is still a defaultdict here, so looking up a path that is
# not present would silently insert an empty list for it.
print(len(image_path_to_caption))
image_path_to_caption['../input/coco-2017-dataset/coco2017/train2017/000000519186.jpg']

In [None]:
# Convert the defaultdict to a plain dict so that later lookups of missing
# keys raise KeyError instead of creating empty entries.
image_path_to_caption = dict(image_path_to_caption)
print(type(image_path_to_caption))

In [None]:
import random

# Keep a random half of the images. The shuffle is unseeded, so the selected
# subset differs from run to run.
shuffled_items = list(image_path_to_caption.items())
random.shuffle(shuffled_items)
keep_count = int(len(shuffled_items) * 0.5)
image_path_to_caption = dict(shuffled_items[:keep_count])
print(len(image_path_to_caption))

### Membuat fungsi id_caption untuk mengonversi image_path_to_caption menjadi image_id_to_caption

In [None]:
def id_caption(image_path_to_caption):
    """Re-key a ``path -> captions`` mapping by bare image id.

    The image id is the file name (the text after the last '/') cut at its
    first '.'. Captions of paths that share an id are appended in iteration
    order; a path with no captions produces no entry.

    Args:
        image_path_to_caption: mapping from image path to an iterable of
            caption strings.

    Returns:
        A plain dict mapping image id to a list of captions.
    """
    image_id_to_caption = {}
    for image_path, captions in image_path_to_caption.items():
        file_name = image_path.rsplit('/', 1)[-1]
        image_id = file_name.partition('.')[0]
        for caption in captions:
            image_id_to_caption.setdefault(image_id, []).append(caption)
    return image_id_to_caption

In [None]:
# Lower-case every caption and strip punctuation, rewriting each caption list
# in place so the dictionary keeps pointing at the same list objects.
table = str.maketrans('', '', string.punctuation)
for captions in image_path_to_caption.values():
    captions[:] = [
        ' '.join(word.lower().translate(table) for word in caption.split())
        for caption in captions
    ]

In [None]:
# Spot-check one (path, captions) pair after cleaning.
items = list(image_path_to_caption.items())
print(items[7])

In [None]:
# Unique words across all cleaned captions.
vocabulary = set()
for captions in image_path_to_caption.values():
    for caption in captions:
        vocabulary.update(caption.split())
print(len(vocabulary))


In [None]:
# Re-key the cleaned captions by image id (file name without extension)
# and show the size, type and a few sample ids of the result.
image_id_to_caption = id_caption(image_path_to_caption)
print(len(image_id_to_caption))
print(type(image_id_to_caption))
print(list(image_id_to_caption.keys())[:5])

In [None]:
# Serialise the captions as one "<image_id> <caption>" line per caption.
lines = [
    f"{image_id} {caption}"
    for image_id, captions in image_id_to_caption.items()
    for caption in captions
]
new_descriptions = '\n'.join(lines)

print(type(new_descriptions))
print(new_descriptions[:400])

In [None]:
# Image ids that make up the training set.
train = list(image_id_to_caption.keys())
print(train[0:5])

In [None]:
# Full paths of the training images (same images as `train`, keyed by path).
train_img = list(image_path_to_caption.keys())
print(len(train_img))

In [None]:
# Wrap every training caption in 'startseq ... endseq' markers, grouped by
# image id.
# `train` is a list; testing membership against it for every caption line is a
# linear scan each time, so build a set once for O(1) lookups.
train_ids = set(train)
train_descriptions = dict()
for line in tqdm(new_descriptions.split('\n')):
    tokens = line.split()
    if not tokens:
        # Blank line (e.g. when there are no descriptions at all): nothing to parse.
        continue
    image_id, image_desc = tokens[0], tokens[1:]
    if image_id in train_ids:
        desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
        train_descriptions.setdefault(image_id, []).append(desc)

print(len(train_descriptions))

In [None]:
# First few image ids that have wrapped descriptions.
list(train_descriptions.keys())[0:5]

In [None]:
# Flat list of every wrapped training caption, in dictionary order.
all_train_captions = [
    caption
    for captions in train_descriptions.values()
    for caption in captions
]

In [None]:
print(len(all_train_captions)) # total number of captions across all sampled training images
print(all_train_captions[:5])

In [None]:
# Keep only words that occur at least `word_count_threshold` times; rarer
# words are left out of the model vocabulary.
word_count_threshold = 10
nsents = len(all_train_captions)
# Counter preserves first-seen order, so `vocab` keeps the order in which
# words were first encountered.
word_counts = dict(collections.Counter(
    w for sent in all_train_captions for w in sent.split(' ')
))
vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]

print('Vocabulary = %d' % (len(vocab)))

In [None]:
# Word <-> integer index lookup tables. Indices start at 1, leaving index 0
# unassigned to any word.
wordtoix = {w: ix for ix, w in enumerate(vocab, start=1)}
ixtoword = {ix: w for w, ix in wordtoix.items()}

# +1 accounts for the unassigned index 0.
vocab_size = len(ixtoword) + 1

In [None]:
# Longest wrapped caption, measured in whitespace-separated tokens
# (the 'startseq' / 'endseq' markers are counted).
all_desc = [d for descs in train_descriptions.values() for d in descs]
lines = all_desc
max_length = max(len(d.split()) for d in lines)

print('Description Length: %d' % max_length)

In [None]:
# Parse the GloVe text file into {word: vector}. Each line is a word followed
# by its coefficients. The `with` block guarantees the file handle is closed
# once parsing finishes (or fails).
embeddings_index = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# One row per vocabulary index. Words without a GloVe vector, and the
# unassigned row 0, stay all-zero.
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in wordtoix.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Shape of a single embedding row; should be (embedding_dim,).
embedding_matrix[5].shape

## Model (InceptionV3)

In [None]:
# Image feature extractor: InceptionV3 with ImageNet weights, cut at its
# second-to-last layer so it outputs that layer's activations instead of the
# final layer's output.
# NOTE: the name `model` is rebound later to the captioning model; the
# extractor itself stays reachable as `model_new`.
model = InceptionV3(weights='imagenet')
model_new = Model(model.input, model.layers[-2].output)

In [None]:
from keras.preprocessing.image import load_img,img_to_array
def preprocess(image_path):
    """Load an image file and turn it into a one-image batch for InceptionV3.

    The image is resized to 299x299, converted to an array, given a leading
    batch axis and passed through InceptionV3's `preprocess_input`.
    """
    pixels = img_to_array(load_img(image_path, target_size=(299, 299)))
    batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(batch)

In [None]:
# Number of training image paths and a few samples.
print(len(train_img))
print(train_img[0:5])

In [None]:
def encode(image):
    """Return the feature vector of the image stored at path `image`.

    The image is preprocessed, run through `model_new`, and the resulting
    (1, N) prediction is flattened to a 1-D vector of length N.
    """
    batch = preprocess(image)
    features = model_new.predict(batch)
    return np.reshape(features, features.shape[1])

In [None]:
# Extract features once for every training image, keyed by file name
# (the last path component). Only the training images are encoded here.
encoding_train = {img.split('/')[-1]: encode(img) for img in tqdm(train_img)}
train_features = encoding_train

In [None]:
# Image branch: 2048-dimensional feature vector -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
      
# Text branch: partial caption of up to max_length word indices -> embedding
# (index 0 is masked as padding) -> dropout -> 256-unit LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: sum the two 256-d branch outputs, then predict a probability
# distribution over the vocabulary for the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which previously referred to InceptionV3.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

In [None]:
# Draw the model graph with tensor shapes on each layer.
tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Load the pre-trained GloVe vectors into the embedding layer and freeze them.
# The layer is located by type rather than by a fixed position in
# `model.layers`, so the lookup does not depend on how Keras orders the layers.
embedding_layer = next(
    layer for layer in model.layers if isinstance(layer, Embedding)
)
embedding_layer.set_weights([embedding_matrix])
# Freeze before compiling so the setting is in effect for training.
embedding_layer.trainable = False
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [None]:
def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into (image features, caption prefix) -> next
    word samples, one per word after the first. A batch is emitted each time
    `num_photos_per_batch` images have been processed, so the number of samples
    per batch varies with the caption lengths.

    Args:
        descriptions: dict mapping image id to a list of caption strings.
        photos: dict mapping '<image id>.jpg' to that image's feature vector.
        wordtoix: dict mapping word to integer index; unknown words are skipped.
        max_length: length every caption prefix is padded to.
        num_photos_per_batch: number of images whose samples form one batch.

    Yields:
        ``((image_features, padded_prefixes), one_hot_next_words)``. The inputs
        are a tuple rather than a list because tf.data-based input pipelines
        convert lists to tensors when matching the structure of yielded
        elements, while tuples are preserved as a two-input structure.

    Note:
        The one-hot width comes from the module-level `vocab_size`.
    """
    X1, X2, y = list(), list(), list()
    n = 0
    while True:
        for key, desc_list in descriptions.items():
            n += 1
            # Features of the image these captions describe.
            photo = photos[key + '.jpg']
            for desc in desc_list:
                # Encode the caption as word indices.
                seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix]
                # Each prefix seq[:i] is an input; seq[i] is the word to predict.
                for i in range(1, len(seq)):
                    in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                    out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)

            if n == num_photos_per_batch:
                yield (array(X1), array(X2)), array(y)
                X1, X2, y = list(), list(), list()
                n = 0

In [None]:
# Training configuration. `batch_size` is handed to the generator as the number
# of images per batch, not the number of samples: each image contributes one
# sample per predicted word of each of its captions.
epochs = 5
batch_size = 32
# Number of batches needed for one pass over all captioned images.
steps = len(train_descriptions)//batch_size

generator = data_generator(train_descriptions, train_features, wordtoix, max_length, batch_size)
model.fit(generator, epochs=epochs, steps_per_epoch=steps, verbose=1)

In [None]:
def greedySearch(photo):
    """Generate a caption by repeatedly taking the most probable next word.

    Args:
        photo: image features shaped as a one-image batch, as expected by the
            captioning model's first input.

    Returns:
        The caption as a space-separated string, without the 'startseq' and
        'endseq' markers.

    Generation stops when the model predicts 'endseq', when it predicts an
    index that has no word (index 0 is not assigned to any word), or after
    `max_length` steps. The end marker is never appended to the text, so a
    caption that runs to `max_length` without an end marker keeps its last
    word instead of having it sliced off.
    """
    in_text = 'startseq'
    for _ in range(max_length):
        token_ids = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        padded = pad_sequences([token_ids], maxlen=max_length)
        yhat = model.predict([photo, padded], verbose=0)
        word = ixtoword.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word

    # Drop only the leading 'startseq' marker.
    return ' '.join(in_text.split()[1:])

In [None]:
def beam_search_predictions(image, beam_index = 3):
    """Generate a caption with beam search.

    Args:
        image: image features shaped as a one-image batch, as expected by the
            captioning model's first input.
        beam_index: beam width, i.e. how many partial captions are kept after
            every step and how many continuations are tried for each.

    Returns:
        The best caption as a space-separated string, without the 'startseq'
        and 'endseq' markers.

    Candidates are ranked by the sum of the log-probabilities of their words,
    which is the log of the caption's probability under the model; adding raw
    probabilities does not correspond to any sequence likelihood. A candidate
    that has produced 'endseq' is kept as-is and no longer extended.
    NOTE: scores are not length-normalised, so shorter finished captions are
    not penalised relative to longer ones.
    """
    start = [wordtoix["startseq"]]
    end_ix = wordtoix.get("endseq")
    beams = [[start, 0.0]]

    for _ in range(max_length - len(start)):
        candidates = []
        for tokens, score in beams:
            if tokens[-1] == end_ix:
                # Finished caption: carry it forward unchanged.
                candidates.append([tokens, score])
                continue
            # Default (pre-)padding, matching how training samples and the
            # greedy decoder pad their sequences.
            par_caps = pad_sequences([tokens], maxlen=max_length)
            preds = model.predict([image, par_caps], verbose=0)[0]
            for w in np.argsort(preds)[-beam_index:]:
                # Small epsilon keeps log() finite for zero probabilities.
                log_prob = float(np.log(preds[w] + 1e-12))
                candidates.append([tokens + [int(w)], score + log_prob])

        # Ascending sort: the best candidates sit at the end of the list.
        beams = sorted(candidates, key=lambda beam: beam[1])[-beam_index:]
        if all(tokens[-1] == end_ix for tokens, _ in beams):
            break

    best_tokens = beams[-1][0]
    final_caption = []
    for ix in best_tokens[1:]:  # skip the leading 'startseq'
        if ix == end_ix:
            break
        word = ixtoword.get(ix)  # index 0 has no word; skip it
        if word is not None:
            final_caption.append(word)
    return ' '.join(final_caption)

## Saving the model

In [None]:
# Save the trained captioning model to an HDF5 file in the working directory.
model.save("image_caption_generator.h5")

## Testing of the Model

In [None]:
# fungsi generate caption dari gambar dan BLEU Score
def generate_caption(image_path):
    image = encode(image_path)
    image = image.reshape((1, 2048))
    x=plt.imread(image_path)
    plt.imshow(x)
    plt.show()
    print("Greedy:",greedySearch(image))
    
    print("Beam Search, K = 3:",beam_search_predictions(image, beam_index = 3))
    bleu_score = sentence_bleu(greedySearch(image), beam_search_predictions(image, beam_index = 3))
    print('BLEU Score K = 3:', bleu_score)
    rouge = Rouge()
    scores = rouge.get_scores(greedySearch(image), beam_search_predictions(image, beam_index = 3))
    print("ROUGE Score K = 3:", scores)
    
    print("Beam Search, K = 5:",beam_search_predictions(image, beam_index = 5))
    bleu_score = sentence_bleu(greedySearch(image), beam_search_predictions(image, beam_index = 5))
    print('BLEU Score K = 5:', bleu_score)
    scores = rouge.get_scores(greedySearch(image), beam_search_predictions(image, beam_index = 5))
    print("ROUGE Score K = 5:", scores)
    
    print("Beam Search, K = 7:",beam_search_predictions(image, beam_index = 7))
    bleu_score = sentence_bleu(greedySearch(image), beam_search_predictions(image, beam_index = 7))
    print('BLEU Score K = 7:', bleu_score)
    scores = rouge.get_scores(greedySearch(image), beam_search_predictions(image, beam_index = 7))
    print("ROUGE Score K = 7:", scores)

In [None]:
# Caption an image from the train2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/train2017/000000000009.jpg')

In [None]:
# Caption another image from the train2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/train2017/000000000030.jpg')

In [None]:
# Caption an image from the val2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/val2017/000000000776.jpg')

In [None]:
# Caption another image from the val2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/val2017/000000000885.jpg')

In [None]:
# Caption an image from the test2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/test2017/000000000063.jpg')

In [None]:
# Caption another image from the test2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/test2017/000000000155.jpg')

In [None]:
# Caption a further image from the val2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/val2017/000000000785.jpg')

In [None]:
# Caption a further image from the test2017 folder.
generate_caption('../input/coco-2017-dataset/coco2017/test2017/000000000178.jpg')

In [None]:
# Caption an image downloaded from the web.
img_url = "https://cdn-cas.orami.co.id/parenting/images/kucing_gemas-1.width-800.jpg"

# The timeout stops a stalled server from hanging the notebook, and the `with`
# block releases the connection when done.
with requests.get(img_url, stream=True, timeout=30) as response:
    # Fail on HTTP errors instead of handing an error page to the image decoder.
    response.raise_for_status()
    # The raw stream is not content-decoded by default (e.g. gzip); enable it.
    response.raw.decode_content = True
    # convert() forces the pixel data to be read while the stream is still open.
    im = Image.open(response.raw).convert('RGB')
im.save('tmp.jpg')
generate_caption('tmp.jpg')

In [None]:
# List the files in the working directory.
path = '/kaggle/working/'

dir_contents = os.listdir(path)

for item in dir_contents:
    print(item)