In [1]:
import numpy as np
from PIL import Image
import os
import string
from pickle import dump, load
import tensorflow

In [2]:
from tensorflow.keras.applications.xception import Xception 
from tensorflow.keras.applications.xception import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [3]:
def read_file(file_path: str):
    file = open(file_path, "r")
    text = file.readlines()
    file.close()
    return text

In [4]:
def develop_caption(file_path: str):
    texts = read_file(file_path)
    descriptions = {}
    for i in range(len(texts)):
        filename, caption = texts[i].split("\t")
        filename, _ = filename.split("#")
        try:
            _ = len(descriptions[filename])
        except:
            descriptions[filename] = []
        descriptions[filename].append(caption.lower())
    return descriptions

descriptions = develop_caption("data\\labels\\Flickr8k.token.txt")

In [5]:
def clean_text(captions: dict):
    table = str.maketrans('','',string.punctuation)
    for filename, caps in captions.items():
        for i, label in enumerate(caps):
            label.replace("-"," ")
            label = label.split()
            #uppercase to lowercase
            label = [word.lower() for word in label]
            #remove punctuation from each token
            label = [word.translate(table) for word in label]
            #remove hanging 's and a
            label = [word for word in label if(len(word) > 1)]
            #remove words containing numbers with them
            label = [word for word in label if(word.isalpha())]
            #converting back to string
            label = " ".join(label)
            captions[filename][i]= label
    return captions

In [6]:
def vocabulary(captions: dict):
    # To build vocabulary of all unique words
    vocab = set()
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in captions[key]]
    return vocab

In [7]:
def save_captions(captions: dict, file_path: str):
    lines = list()
    for key, caption_list in captions.items():
        for caption in caption_list:
            lines.append(key + "\t" + caption)
    data = "\n".join(lines)
    file = open(file_path, "w")
    file.write(data)
    file.close()
    print("saved")

In [8]:
dataset_text = "data\\labels"
dataset_images = "data\\images"

#to prepare our text data
file_name = os.path.join(dataset_text, "Flickr8k.token.txt")

#loading the file that contains all data
#map them into descriptions dictionary 
captions = develop_caption(file_name)
print("Length of captions:", len(captions))

#cleaning the descriptions
clean_captions = clean_text(captions)

#to build vocabulary
vocab = vocabulary(clean_captions)
print("Unique words in vocabulary:", len(vocab))

#saving all descriptions in one file
save_captions(clean_captions, "captions.txt")

Length of captions: 8092
Unique words in vocabulary: 8763
saved


In [9]:
# def extract_image_features(image_folder_path):
#     xception_model = Xception(include_top = False, pooling = "avg")
#     features = {}
#     image_files = [os.path.join(image_folder_path, f) for f in os.listdir(image_folder_path)]
#     for i in tqdm(range(len(image_files))):
#         image_path = image_files[i]
#         image = Image.open(image_path)
#         image = image.resize((299,299))
#         image = np.expand_dims(image, axis=0)
#         image = image/127.5
#         image = image - 1.0
#         feature = xception_model.predict(image, verbose=False)
#         features[image_path] = feature
#     return features

In [10]:
# # features = extract_image_features(dataset_images)
# # dump(features, open("features.p", "wb"))

# #to directly load the features from the pickle file.
# features = load(open("features.p", "rb"))

### Loading dataset for model training

In [11]:
# def load_photos(file_path: str):
#     file = read_file(file_path)
#     file = [f.rstrip("\n") for f in file]
#     return file

In [12]:
# def load_clean_captions(file_path: str, photos):
#     #loading clean_descriptions
#     file = read_file(file_path)
#     captions = {}
#     for line in file:
#         words = line.split()
#         if len(words)<1 :
#             continue
#         image, image_caption = words[0], words[1:]
#         if image in photos:
#             if image not in captions:
#                 captions[image] = []
#             caption = ' ' + " ".join(image_caption) + ' '
#             captions[image].append(caption)
#     return captions

In [13]:
# def load_features(photos):
#     #loading all features
#     all_features = load(open("features.p","rb"))
#     #selecting only needed features
#     new_keys = [k.split("\\")[-1] for k in all_features.keys()]
#     f = {}
#     for i, key in enumerate(all_features.keys()):
#         f[new_keys[i]] = all_features[key]
    
#     features = {k:f[k] for k in photos}
#     return features

In [14]:
# filename = os.path.join(dataset_text, "Flickr_8k.trainImages.txt")
# #train = loading_data(filename)
# train_imgs = load_photos(filename)
# train_descriptions = load_clean_captions("captions.txt", train_imgs)
# train_features = load_features(train_imgs)

### Tokenizing the Vocabulary

In [15]:
# def dict_to_list(captions: dict):
#     all_captions = []
#     for key in descriptions.keys():
#         [all_captions.append(d) for d in descriptions[key]]
#     return all_captions

In [16]:
# from tensorflow.keras.preprocessing.text import Tokenizer

# def create_tokenizer(captions: dict):
#     # Ensure dict_to_list is defined or imported if used
#     captions_list = dict_to_list(captions)  # Assuming dict_to_list function converts dictionary values to a list
#     tokenizer = Tokenizer()
#     tokenizer.fit_on_texts(captions_list)
#     return tokenizer

In [17]:
# tokenizer = create_tokenizer(captions)
# dump(tokenizer, open('tokenizer.p', 'wb'))
# vocab_size = len(tokenizer.word_index) + 1

In [18]:
# def max_length(captions):
#     captions = dict_to_list(descriptions)
#     return max(len(d.split()) for d in captions)

# max_length = 30
# print(f"{vocab_size=}")
# print(f"{max_length=}")

vocab_size=8494
max_length=30


### Create a Data generator

In [19]:
# def data_generator(descriptions, features, tokenizer, max_length):
#     while 1:
#         for key, description_list in descriptions.items():
#             #retrieve photo features
#             feature = features[key][0]
#             inp_image, inp_seq, op_word = create_sequences(tokenizer, max_length, description_list, feature)
#             yield [[inp_image, inp_seq], op_word]

In [20]:
# def create_sequences(tokenizer, max_length, desc_list, feature):
#     x_1, x_2, y = list(), list(), list()
#     # move through each description for the image
#     for desc in desc_list:
#         # encode the sequence
#         seq = tokenizer.texts_to_sequences([desc])[0]
#         # divide one sequence into various X,y pairs
#         for i in range(1, len(seq)):
#             # divide into input and output pair
#             in_seq, out_seq = seq[:i], seq[i]
#             # pad input sequence
#             in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
#             # encode output sequence
#             out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
#             # store
#             x_1.append(feature)
#             x_2.append(in_seq)
#             y.append(out_seq)
#     return np.array(x_1), np.array(x_2), np.array(y)

In [21]:
# #To check the shape of the input and output for your model
# [a,b],c = next(data_generator(train_descriptions, train_features, tokenizer, max_length))
# print(a.shape, b.shape, c.shape)

(37, 2048) (37, 30) (37, 8494)


### Define the CNN-RNN model

In [22]:
# from keras.utils import plot_model
# from tensorflow.keras.layers import Add

# def get_model(vocab_size, max_length):
#     inputs1 = Input(shape=(2048,))
#     fe1 = Dropout(0.5)(inputs1)
#     fe2 = Dense(256, activation='relu')(fe1)
#     fe3 = Dense(128, activation="relu")(fe2)
#     # LSTM sequence model
#     inputs2 = Input(shape=(max_length,))
#     se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
#     se2 = Dropout(0.5)(se1)
#     se3 = LSTM(256, return_sequences=True)(se2)
#     se4 = LSTM(128)(se3)
#     # Merging both models
#     decoder1 = Add()([fe3, se4])
#     decoder2 = Dense(256, activation='relu')(decoder1)
#     outputs = Dense(vocab_size, activation='softmax')(decoder2)
#     # merge it [image, seq] [word]
#     model = Model(inputs=[inputs1, inputs2], outputs=outputs)
#     model.compile(loss='categorical_crossentropy', optimizer='adam')
#     # summarize model
#     print(model.summary())
#     plot_model(model, to_file='model.png', show_shapes=True)
#     return model

### Training the Image Caption Generator model

In [23]:
# print('Dataset: ', len(train_imgs))
# print('Descriptions: train=', len(train_descriptions))
# print('Photos: train=', len(train_features))
# print('Vocabulary Size:', vocab_size)
# print('Description Length: ', max_length)

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 8494
Description Length:  30


In [24]:
# EPOCHS = 200
# STEPS = len(train_descriptions)
# if not os.path.exists("new_models"):
#     os.mkdir("new_models")

# model = get_model(vocab_size, max_length)

# for i in range(EPOCHS):
#     generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
#     model.fit_generator(generator, epochs=1, steps_per_epoch=STEPS, verbose=1)
#     model.save("new_models/model_" + str(i) + ".keras")

None
You must install pydot (`pip install pydot`) for `plot_model` to work.


AttributeError: 'Functional' object has no attribute 'fit_generator'

### Testing the Image Caption Generator model


In [None]:
# import numpy as np
# from PIL import Image
# import matplotlib.pyplot as plt
# import argparse


# def extract_features(filename, model):
#     try:
#         image = Image.open(filename)
#     except:
#         print("ERROR: Can't open image! Ensure that image path and extension is correct")
#     image = image.resize((299,299))
#     image = np.array(image)
#     # for 4 channels images, we need to convert them into 3 channels
#     if image.shape[2] == 4:
#         image = image[..., :3]
#     image = np.expand_dims(image, axis=0)
#     image = image/127.5
#     image = image - 1.0
#     feature = model.predict(image)
#     return feature


# def word_for_id(integer, tokenizer):
#     for word, index in tokenizer.word_index.items():
#         if index == integer:
#             return word
#     return None


# def generate_desc(model, tokenizer, photo, max_length):
#     in_text = 'start'
#     for i in range(max_length):
#         sequence = tokenizer.texts_to_sequences([in_text])[0]
#         sequence = pad_sequences([sequence], maxlen=max_length)
#         pred = model.predict([photo,sequence], verbose=0)
#         pred = np.argmax(pred)
#         word = word_for_id(pred, tokenizer)
#         if word is None:
#             break
#         in_text += ' ' + word
#         if word == 'end':
#             break
#     return in_text

In [None]:
# If using TensorFlow 2.x
# from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# img_path = ["data\\images\\3385593926_d3e9c21170.jpg", "data\\images\\109823395_6fb423a90f.jpg", 
#         "data\\images\\118309463_a532b75be9.jpg", "data\\images\\161669933_3e7d8c7e2c.jpg",
#            "data\\images\\229951087_4c20600c32.jpg","data\\images\\free-images.jpg"]

# max_length = 30
# tokenizer = load(open("tokenizer.p","rb"))
# model = load_model('new_models/model_138.keras')
# xception_model = Xception(include_top=False, pooling="avg")

In [None]:
# photo = extract_features(img_path[0], xception_model)
# img = Image.open(img_path[0])
# description1 = generate_desc(model, tokenizer, photo, max_length)
# print(description1)
# plt.imshow(img)

In [None]:
# photo = extract_features(img_path[1], xception_model)
# img = Image.open(img_path[1])
# description2 = generate_desc(model, tokenizer, photo, max_length)
# print(description2)
# plt.imshow(img)

In [None]:
# photo = extract_features(img_path[2], xception_model)
# img = Image.open(img_path[2])
# description3 = generate_desc(model, tokenizer, photo, max_length)
# print(description3)
# plt.imshow(img)

In [None]:
# photo = extract_features(img_path[3], xception_model)
# img = Image.open(img_path[3])
# description4 = generate_desc(model, tokenizer, photo, max_length)
# print(description4)
# plt.imshow(img)

In [None]:
# path = "data\\images\\1355945307_f9e01a9a05.jpg"
# photo = extract_features(path, xception_model)
# img = Image.open(path)
# description5 = generate_desc(model, tokenizer, photo, max_length)
# print(description5)
# plt.imshow(img)

In [None]:
# path = "data\\images\\free-images.jpg"
# photo = extract_features(path, xception_model)
# img = Image.open(path)
# description5 = generate_desc(model, tokenizer, photo, max_length)
# print(description5)
# plt.imshow(img)