In [151]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, LSTM, Add, Bidirectional, Embedding, BatchNormalization
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.applications import Xception
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
# small library for seeing the progress of loops.
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

In [152]:
# Loading a text file into memory
def load_doc(filename):
    # Opening the file as read only
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [153]:
# get all imgs with their captions
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions ={}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [154]:
##Data cleaning- lower casing, removing puntuations and words containing numbers
def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):

            img_caption.replace("-"," ")
            desc = img_caption.split()

            #converts to lower case
            desc = [word.lower() for word in desc]
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            #remove hanging 's and a 
            desc = [word for word in desc if(len(word)>1)]
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            #convert back to string

            img_caption = ' '.join(desc)
            captions[img][i]= img_caption
    return captions

In [155]:
def text_vocabulary(descriptions):
    # build vocabulary of all unique words
    vocab = set()
    
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    
    return vocab

In [156]:
#All descriptions in one file 
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

In [157]:
dataset_text = "/Users/sarthak/Documents/ICG/Flicker8k_text"
dataset_images = "/Users/sarthak/Documents/ICG/Flicker8k_Dataset"

In [158]:
#we prepare our text data
filename = dataset_text + "/" + "Flickr8k.token.txt"
#loading the file that contains all data
#mapping them into descriptions dictionary img to 5 captions
descriptions = all_img_captions(filename)
print("Length of descriptions =" ,len(descriptions))

#cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)

#building vocabulary 
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

#saving each description to file 
save_descriptions(clean_descriptions, "descriptions.txt")

Length of descriptions = 8092
Length of vocabulary =  8763


In [159]:
#Defining Model
def extract_features(directory, model):
        
        features = {}
        for img in tqdm(os.listdir(directory)):
            filename = directory + "/" + img
            image = Image.open(filename)
            image = image.resize((299,299))
            image = np.expand_dims(image, axis=0)
            image = image/127.5
            image = image - 1.0
            feature = model.predict(image)
            features[img] = feature
        return features

In [160]:
model = Xception( include_top=False, pooling='avg' )

In [161]:
# model = EfficientNetB7(include_top=False, pooling='avg', weights='imagenet')

In [162]:
#2048 feature vector
#features = extract_features(dataset_images, model)
#dump(features, open("features.p","wb"))

In [163]:
features = load(open("/Users/sarthak/Documents/ICG/features.p","rb"))

In [164]:
#load the data 
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

def load_clean_descriptions(filename, photos):   
    #loading clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        
        words = line.split()
        if len(words)<1 :
            continue
    
        image, image_caption = words[0], words[1:]
        
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)

    return descriptions


def load_features(photos):
    #loading all features
    all_features = load(open("/Users/sarthak/Documents/ICG/features.p","rb"))
    #selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features

In [165]:
filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"

#train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

In [166]:
#converting dictionary to clean list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

#creating tokenizer class 
#this will vectorise text corpus
#each integer will represent token in dictionary 

#from keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [167]:
# give each word a index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size 

7577

In [168]:
#calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length

32

In [169]:
features['1000268201_693b08cb0e.jpg'][0]

array([0.4734095 , 0.01730891, 0.07334226, ..., 0.08557954, 0.02102289,
       0.23765498], dtype=float32)

In [170]:
def data_generator(descriptions, features, tokenizer, max_length, vocab_size):
    while 1:
        for key, description_list in descriptions.items():
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature, vocab_size)
            # Explicitly return tensors instead of a list
            yield (tf.convert_to_tensor(input_image, dtype=tf.float32),
                   tf.convert_to_tensor(input_sequence, dtype=tf.int32)), tf.convert_to_tensor(output_word, dtype=tf.float32)



# Function to create input-output sequence pairs from the image description
def create_sequences(tokenizer, max_length, desc_list, feature, vocab_size):
    X1, X2, y = list(), list(), list()
    # Walk through each description for the image
    for desc in desc_list:
        # Encode the sequence into a list of integers
        seq = tokenizer.texts_to_sequences([desc])[0]
        # Split one sequence into multiple (input, output) pairs
        for i in range(1, len(seq)):
            # Split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # Pad the input sequence to the max length
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # One-hot encode the output sequence (word)
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Append the data to the lists
            X1.append(feature)  # Image feature
            X2.append(in_seq)   # Sequence (input sequence)
            y.append(out_seq)   # Target word (one-hot encoded)
    return np.array(X1), np.array(X2), np.array(y)


In [171]:
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length,vocab_size))
a.shape, b.shape, c.shape

(TensorShape([47, 2048]), TensorShape([47, 32]), TensorShape([47, 7577]))

In [172]:

# Define the captioning model
def define_model(vocab_size, max_length):
    # Image features from CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=False)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Merging both models (image and sequence)
    decoder1 = Add()([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)

    # Final output layer: vocab_size for softmax classification
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # Tie the model together (image and sequence as inputs, word as output)
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Summarize the model and plot
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)

    return model


In [174]:
all_features = load(open("/Users/sarthak/Documents/ICG/features.p", "rb"))
for key, feature in all_features.items():
    print(f"Image: {key}, Feature Shape: {feature.shape}")
    break

Image: 2387197355_237f6f41ee.jpg, Feature Shape: (1, 2048)


In [175]:
# Train our model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

# Define the model
model = define_model(vocab_size, max_length)

# Training parameters
epochs = 50
steps = len(train_descriptions)

# Add EarlyStopping callback
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='accuracy', patience=3, restore_best_weights=True)

# Train the model with early stopping
for i in range(epochs):
    print(f"Epoch {i+1}/{epochs}")
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1, callbacks=[early_stopping])

    # Save the model after each epoch
    model.save(f"/Users/sarthak/Documents/ICG/Model_saved/fnl_epoch_{i+1}.h5")


Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7577
Description Length:  32


None
Epoch 1/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 55ms/step - accuracy: 0.1897 - loss: 5.0133




Epoch 2/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 56ms/step - accuracy: 0.2823 - loss: 3.7754




Epoch 3/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 60ms/step - accuracy: 0.3038 - loss: 3.4445




Epoch 4/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 63ms/step - accuracy: 0.3170 - loss: 3.2567




Epoch 5/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 57ms/step - accuracy: 0.3255 - loss: 3.1403




Epoch 6/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 54ms/step - accuracy: 0.3314 - loss: 3.0451




Epoch 7/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 56ms/step - accuracy: 0.3377 - loss: 2.9746




Epoch 8/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 59ms/step - accuracy: 0.3427 - loss: 2.9227




Epoch 9/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m362s[0m 60ms/step - accuracy: 0.3467 - loss: 2.8757




Epoch 10/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 57ms/step - accuracy: 0.3508 - loss: 2.8439




Epoch 11/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m338s[0m 56ms/step - accuracy: 0.3544 - loss: 2.8100




Epoch 12/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 59ms/step - accuracy: 0.3568 - loss: 2.7800




Epoch 13/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 59ms/step - accuracy: 0.3601 - loss: 2.7615




Epoch 14/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 59ms/step - accuracy: 0.3618 - loss: 2.7405




Epoch 15/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m353s[0m 59ms/step - accuracy: 0.3632 - loss: 2.7280




Epoch 16/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 54ms/step - accuracy: 0.3665 - loss: 2.7136




Epoch 17/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 55ms/step - accuracy: 0.3676 - loss: 2.6992




Epoch 18/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 62ms/step - accuracy: 0.3696 - loss: 2.6884




Epoch 19/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m861s[0m 144ms/step - accuracy: 0.3708 - loss: 2.6756




Epoch 20/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 55ms/step - accuracy: 0.3729 - loss: 2.6672




Epoch 21/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 57ms/step - accuracy: 0.3732 - loss: 2.6618




Epoch 22/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 58ms/step - accuracy: 0.3735 - loss: 2.6560




Epoch 23/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 57ms/step - accuracy: 0.3750 - loss: 2.6505




Epoch 24/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m336s[0m 56ms/step - accuracy: 0.3755 - loss: 2.6515




Epoch 25/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 54ms/step - accuracy: 0.3743 - loss: 2.6464




Epoch 26/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m346s[0m 58ms/step - accuracy: 0.3747 - loss: 2.6438




Epoch 27/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m346s[0m 58ms/step - accuracy: 0.3772 - loss: 2.6369




Epoch 28/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 58ms/step - accuracy: 0.3766 - loss: 2.6388




Epoch 29/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m367s[0m 61ms/step - accuracy: 0.3781 - loss: 2.6352




Epoch 30/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 60ms/step - accuracy: 0.3775 - loss: 2.6312




Epoch 31/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 61ms/step - accuracy: 0.3781 - loss: 2.6354




Epoch 32/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 59ms/step - accuracy: 0.3785 - loss: 2.6296




Epoch 33/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 62ms/step - accuracy: 0.3785 - loss: 2.6294




Epoch 34/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 62ms/step - accuracy: 0.3783 - loss: 2.6306




Epoch 35/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 61ms/step - accuracy: 0.3795 - loss: 2.6272




Epoch 36/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m402s[0m 67ms/step - accuracy: 0.3794 - loss: 2.6292




Epoch 37/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m407s[0m 68ms/step - accuracy: 0.3794 - loss: 2.6230




Epoch 38/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 63ms/step - accuracy: 0.3803 - loss: 2.6257




Epoch 39/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 63ms/step - accuracy: 0.3796 - loss: 2.6290




Epoch 40/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 64ms/step - accuracy: 0.3792 - loss: 2.6310




Epoch 41/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 64ms/step - accuracy: 0.3794 - loss: 2.6345




Epoch 42/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 64ms/step - accuracy: 0.3791 - loss: 2.6344




Epoch 43/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 63ms/step - accuracy: 0.3790 - loss: 2.6359




Epoch 44/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 66ms/step - accuracy: 0.3792 - loss: 2.6381




Epoch 45/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m404s[0m 67ms/step - accuracy: 0.3806 - loss: 2.6398




Epoch 46/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 69ms/step - accuracy: 0.3784 - loss: 2.6417




Epoch 47/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 70ms/step - accuracy: 0.3799 - loss: 2.6433




Epoch 48/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 67ms/step - accuracy: 0.3781 - loss: 2.6465




Epoch 49/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m400s[0m 67ms/step - accuracy: 0.3768 - loss: 2.6542




Epoch 50/50
[1m6000/6000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m408s[0m 68ms/step - accuracy: 0.3778 - loss: 2.6519


