In [2]:
import tensorflow as tf
from os import listdir, path
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import string
import re
import numpy as np

2024-07-17 20:48:25.115035: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-17 20:48:25.115244: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-17 20:48:25.300962: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

**Load Photographs (Flickr8K)**

In [4]:
def load_photos(directory):
    images = dict()
    for name in listdir(directory):
        filename = path.join(directory, name)
        image = load_img(filename, target_size = (224, 224))
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        image = preprocess_input(image)
        image_id = name.split('.')[0]
        images[image_id] = image
    return images

In [5]:
directory = '/kaggle/input/flickr8k/Images'
# dictionary of images mapped with their id
images = load_photos(directory)
print(f'Loaded Images: {len(images)}')

Loaded Images: 8091


**Load Description (Flickr8K)**

In [6]:
def load_descriptions(doc):
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(line) < 2:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        image_id = image_id.split('.')[0]
        image_desc = ' '.join(image_desc)
        if image_id not in mapping:
            mapping[image_id] = image_desc
    return mapping

In [7]:
def clean_descriptions(descriptions):
    # prepare regex for char filtering
    re_punc = re.compile(f"[{re.escape(string.punctuation)}]") 
    for key, desc in descriptions.items():
        desc = desc.split()
        desc = [word.lower() for word in desc]
        desc = [re_punc.sub('', w) for w in desc]
        desc = [word for word in desc if len(word)>1]
        descriptions[key] = ' '.join(desc)

In [8]:
def save_doc(descriptions, filename):
    lines = list()
    for key, desc in descriptions.items():
        lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [9]:
filename= '/kaggle/input/flickr8k/captions.txt'
doc = load_doc(filename)
descriptions = load_descriptions(doc)
clean_descriptions(descriptions)

all_tokens = ' '.join(descriptions.values()).split()
vocabulary = set(all_tokens)
print(f'Vocabulary Size: {len(vocabulary)}')

save_doc(descriptions, 'descriptions.txt')

Vocabulary Size: 4465


**Data Preparation**

In [10]:
def load_clean_descriptions(filename):
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        image_id, image_desc = tokens[0], tokens[1:]
        descriptions[image_id] = ' '.join(image_desc)
    return descriptions

In [11]:
descriptions = load_clean_descriptions('descriptions.txt')
desc_text = list(descriptions.values())

# prepare tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(desc_text)
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocab Size = {vocab_size}")

# integer encode descriptions
sequences = tokenizer.texts_to_sequences(desc_text)

max_length = max(len(s) for s in sequences)
print(f"Max Length = {max_length}")

Vocab Size = 4466
Max Length = 28


In [15]:
print(desc_text[:5])
print(sequences[:5])

['', 'child in pink dress is climbing up set of stairs in an entry way', 'black dog and spotted dog are fighting', 'little girl covered in paint sits in front of painted rainbow with her hands in bowl', 'man lays on bench while his dog sits by him']
[[], [35, 1, 63, 140, 5, 121, 50, 403, 7, 363, 1, 28, 2342, 517], [10, 6, 3, 747, 6, 15, 364], [47, 13, 166, 1, 577, 97, 1, 38, 7, 545, 1193, 9, 53, 210, 1, 1070], [8, 615, 4, 147, 23, 20, 6, 97, 42, 109]]


**Word-By-Word Model**

In [13]:
x, y = list(), list()

for img_no, seq in enumerate(sequences):
    # split one sequence into multiple x,y pairs
    for i in range(1, len(seq)):
        # split into input, output pair
        in_seq, out_seq = seq[:i], seq[i]
        # pad input sequence
        in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
        # encode output sequence
        out_seq = to_categorical([out_seq], num_classes = vocab_size)[0]
        x.append(in_seq)
        y.append(out_seq)

# convert to numpy arrays
x, y = np.array(x), np.array(y)
print(x.shape)
print(y.shape)

(65428, 28)
(65428, 4466)
