In [None]:
# Author:https://github.com/aswintechguy/Deep-Learning-Projects.git
# Modified by: Rawan Khaled
# To view the outputs please watch the video on linkedin
!pip install -q kaggle

In [None]:
from google.colab import files

# Bind the upload result to a name instead of leaving it as the cell's last
# expression: the returned dict maps each file name to its raw bytes, so echoing
# it prints the contents of kaggle.json (the API key) into the notebook output.
uploaded_files = files.upload()
# Show only the names of the uploaded files, never their contents.
print(sorted(uploaded_files))

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"rawankhaled20","key":"<REDACTED>"}'}

In [None]:
# Create ~/.kaggle, the folder kaggle.json is copied into by the next cell
# (-p: do not fail if the folder already exists).
!mkdir -p ~/.kaggle

In [None]:
!cp kaggle.json ~/.kaggle/

In [None]:
# List datasets through the Kaggle CLI -- presumably a quick check that the
# credentials installed above are accepted.
!kaggle datasets list

ref                                                        title                                         size  lastUpdated          downloadCount  voteCount  usabilityRating  
---------------------------------------------------------  -------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
nelgiriyewithana/top-spotify-songs-2023                    Most Streamed Spotify Songs 2023              47KB  2023-08-26 11:04:57          12901        417  1.0              
joebeachcapital/students-performance                       Students Performance                           2KB  2023-08-31 00:50:11           4359        115  1.0              
taeefnajib/used-car-price-prediction-dataset               Used Car Price Prediction Dataset            109KB  2023-09-15 12:58:57            556         23  1.0              
carlmcbrideellis/zzzs-lightweight-training-dataset-target  Zzzs: Lightweight training dataset + target  185MB  2023-09-2

In [None]:
# Download the adityajn105/flickr8k dataset; the next cell unpacks the
# resulting flickr8k.zip.
!kaggle datasets download -d adityajn105/flickr8k

In [None]:
! unzip flickr8k.zip

In [None]:
import os
import pickle
import numpy as np
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

In [None]:
# load vgg16 model
model = VGG16()
# restructure the model: use the output of the second-to-last layer as the
# model output, i.e. drop the final layer
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# summarize -- summary() prints the table itself and returns None, so wrapping
# it in print() only appended a stray "None" to the output
model.summary()

In [None]:
Images_DIR = 'Images'
# image ID (file name up to its first '.') -> array returned by model.predict
features = {}


def extraxt_features(Images_DIR):
    """Run every file in `Images_DIR` through `model` and record the result.

    Args:
        Images_DIR: folder whose entries are all loaded as images.

    Nothing is returned; the module-level `features` dict gains one entry per
    file, keyed by the file name cut at its first '.'.
    """
    for img_name in tqdm(os.listdir(Images_DIR)):
        # read the file, resized to 224x224, as a numpy array
        pixels = img_to_array(
            load_img('/'.join((Images_DIR, img_name)), target_size=(224, 224))
        )
        # prepend a batch axis of size 1, then apply the VGG preprocessing
        batch = preprocess_input(pixels.reshape((1,) + pixels.shape))
        # key the prediction by the image ID
        features[img_name.split('.')[0]] = model.predict(batch, verbose=0)


# extract features from the images
extraxt_features(Images_DIR)

In [None]:
# Persist the extracted features. The context manager flushes and closes the
# file; the previous bare open() call left the handle open.
with open('/content/features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)

In [None]:
# Read the cached image features back from disk.
# NOTE(review): unpickling can execute code embedded in the file -- only load a
# features.pkl that this notebook wrote itself.
features_file = open('/content/features.pkl', 'rb')
try:
    features = pickle.load(features_file)
finally:
    features_file.close()

In [None]:
# Read captions.txt into one string, discarding its first line
# (presumably a header row).
with open('captions.txt', 'r') as captions_file:
    next(captions_file)
    captions_doc = captions_file.read()

In [None]:
# create mapping of image to captions
mapping = {}


def map():
    """Fill `mapping` with image ID -> list of captions parsed from `captions_doc`.

    Each line is split on commas: the first field, cut at its first '.', is the
    image ID; the remaining fields, re-joined with single spaces, form the
    caption. Lines shorter than two characters are skipped.

    NOTE(review): this definition shadows Python's built-in `map` for the rest
    of the notebook.
    """
    for line in tqdm(captions_doc.split('\n')):
        if len(line) < 2:
            continue
        file_name, *caption_parts = line.split(',')
        # remove extension .jpg from image ID
        image_id = file_name.split('.')[0]
        # start a new caption list the first time an image ID is seen
        mapping.setdefault(image_id, []).append(" ".join(caption_parts))


map()

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): no cell visible in this notebook reads from or writes to that
# path -- confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# number of distinct image IDs collected in `mapping`
len(mapping)

In [None]:
# Preprocess the captions' text
def clean(mapping):
    """Normalize every caption in `mapping` in place.

    Args:
        mapping: dict of image ID -> list of caption strings. The lists are
            modified in place; nothing is returned.

    Each caption is lower-cased, stripped of every character that is neither a
    letter nor whitespace, reduced to its words longer than one character, and
    wrapped as ' start <words> end'.

    NOTE(review): 'start' and 'end' are ordinary English words and may also
    occur inside a caption.
    """
    # local import: `re` is not among the notebook's top-level imports
    import re

    for captions in mapping.values():
        for i, caption in enumerate(captions):
            # convert to lowercase
            caption = caption.lower()
            # delete digits, special symbols, etc.
            # str.replace() matches its argument literally, so the previous
            # .replace('[^A-Za-z]', '') never removed anything; re.sub applies
            # the pattern as a regular expression. Whitespace is kept so the
            # words stay separated.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split() without arguments already collapses runs of whitespace
            words = [word for word in caption.split() if len(word) > 1]
            # add start and end tags to the caption
            captions[i] = ' start ' + " ".join(words) + ' end'

In [None]:
# captions of one sample image before preprocessing
mapping['1000268201_693b08cb0e']

In [None]:
# Preprocess every caption in `mapping` (the lists are modified in place).
# NOTE: clean() adds the start/end tags on every call, so running this cell
# twice wraps each caption in a second pair of tags.
clean(mapping)

In [None]:
# captions of the same sample image after preprocessing
mapping['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into one list, in `mapping` order.
all_captions = [caption for key in mapping for caption in mapping[key]]

In [None]:
# total number of captions across all images
len(all_captions)

In [None]:
# peek at the first ten captions
all_captions[:10]

In [None]:
# tokenize the text into words
tokenizer = Tokenizer()
# build the word index from every caption in all_captions
tokenizer.fit_on_texts(all_captions)
# one more than the number of indexed words -- presumably because index 0 is
# kept for padding (the Embedding layer below masks zeros); TODO confirm
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# display the vocabulary size computed above
vocab_size

In [None]:
# Longest caption, measured in whitespace-separated words.
caption_lengths = [len(caption.split()) for caption in all_captions]
max_len = max(caption_lengths)
max_len

In [None]:
# Split the image IDs in `mapping` order: the first 90% for training, the rest
# for testing.
image_ids = [*mapping]
split = int(0.90 * len(image_ids))
train, test = image_ids[:split], image_ids[split:]

In [None]:
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_len, vocab_size, batch_size):
    """Endlessly yield training batches built from `batch_size` images at a time.

    Args:
        data_keys: image IDs to cycle over.
        mapping: image ID -> list of caption strings.
        features: image ID -> array whose first row is used as the image input.
        tokenizer: object providing `texts_to_sequences`.
        max_len: length the input sequences are padded to.
        vocab_size: number of classes for the one-hot encoded target word.
        batch_size: number of images (not samples) consumed per yielded batch.

    Yields:
        `[X1, X2], y` as numpy arrays: image rows, padded caption prefixes and
        the one-hot encoded next word. Every caption contributes one sample per
        prefix, so the number of samples per batch varies.
    """
    image_rows, prefix_rows, target_rows = [], [], []
    images_seen = 0
    while True:
        for key in data_keys:
            images_seen += 1
            for caption in mapping[key]:
                # encode the caption as a list of word indices
                token_ids = tokenizer.texts_to_sequences([caption])[0]
                # every proper prefix predicts the token that follows it
                for cut in range(1, len(token_ids)):
                    padded_prefix = pad_sequences([token_ids[:cut]], maxlen=max_len)[0]
                    one_hot_next = to_categorical([token_ids[cut]], num_classes=vocab_size)[0]
                    image_rows.append(features[key][0])
                    prefix_rows.append(padded_prefix)
                    target_rows.append(one_hot_next)
            if images_seen == batch_size:
                # convert in the same order as the yielded structure
                X1 = np.array(image_rows)
                X2 = np.array(prefix_rows)
                y = np.array(target_rows)
                yield [X1, X2], y
                # start collecting the next batch
                image_rows, prefix_rows, target_rows = [], [], []
                images_seen = 0

In [None]:
# encoder model
# image feature layers: 4096-value input -> dropout -> 256-unit dense layer
input1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(input1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layers
# `max_len` is the name the notebook assigns the maximum caption length to;
# the `max_length` used here before is never defined at notebook level and
# raised a NameError on a fresh kernel
input2 = Input(shape=(max_len,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(input2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model: sum both 256-value branches, then predict the next word
dec1 = add([fe2, se3])
dec2 = Dense(256, activation='relu')(dec1)
output = Dense(vocab_size, activation='softmax')(dec2)
# Tie both inputs to the softmax output. From here on `model` is this new
# captioning model and no longer the VGG16 feature extractor.
model = Model(inputs=[input1, input2], outputs=output)
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# plot the model
plot_model(model, show_shapes=True)

In [None]:
# train the model
batch_size = 32
# one step consumes `batch_size` images, so this many steps cover the training
# IDs once (a final partial batch is dropped by the integer division)
steps = len(train) // batch_size
# create data generator
# `max_len` is the name the notebook assigns the maximum caption length to;
# the `max_length` passed here before is never defined at notebook level
generator = data_generator(train, mapping, features, tokenizer, max_len, vocab_size, batch_size)
# fit for 25 epochs
model.fit(generator, epochs=25, steps_per_epoch=steps, verbose=1)

In [None]:
# save the trained captioning model to /content/bestofall_model.h5
model.save('/content/bestofall_model.h5')

In [None]:
def idx_to_word(integer, tokenizer):
    """Return the word whose index in `tokenizer.word_index` equals `integer`.

    Args:
        integer: index to look up.
        tokenizer: object exposing a `word_index` dict of word -> index.

    Returns:
        The first matching word, or None when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

In [None]:
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Generate a caption word by word, always taking the highest-scoring word.

    Args:
        model: model whose `predict` takes `[image, padded_sequence]`.
        image: image feature input passed to the model unchanged.
        tokenizer: tokenizer used to encode the text generated so far.
        max_length: padding length and upper bound on the number of words added.

    Returns:
        The generated text, beginning with 'start' and ending with 'end' when
        the model produced it within `max_length` steps.
    """
    words = ['start']
    for _ in range(max_length):
        # encode and pad everything generated so far
        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([encoded], max_length)
        # pick the index with the highest score and map it back to a word
        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)
        # stop if the index has no word
        if next_word is None:
            break
        words.append(next_word)
        # stop once the end tag has been appended
        if next_word == 'end':
            break

    return " ".join(words)

In [None]:
from nltk.translate.bleu_score import corpus_bleu
# validate with test data
# actual: per image, the list of tokenized reference captions
# predicted: per image, the tokenized generated caption
actual, predicted = list(), list()

for key in tqdm(test):
    # get real captions
    captions = mapping[key]
    # predict the caption for image; `max_len` is the name the notebook assigns
    # the maximum caption length to (`max_length` is never defined)
    y_pred = predict_caption(model, features[key], tokenizer, max_len)
    # split into words
    real_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    # append to the lists; the references go into `actual`, the list created
    # above (the code previously appended to an undefined `real`)
    actual.append(real_captions)
    predicted.append(y_pred)

# calculate BLEU score
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(img_name):
    """Show an image with its reference captions and the predicted caption.

    Args:
        img_name: file name of an image inside Images_DIR,
            e.g. "1001773457_577c3a7d70.jpg".

    Prints the captions stored in `mapping` for the image, prints the caption
    predicted by `model` from the image's entry in `features`, then draws the
    image. Nothing is returned.
    """
    # the image ID is the file name cut at its first '.'
    image_id = img_name.split('.')[0]
    img_path = Images_DIR + '/' + img_name
    image = Image.open(img_path)
    captions = mapping[image_id]
    print('---------------------Real------------------------')
    for caption in captions:
        print(caption)
    # predict the caption; `max_len` is the name the notebook assigns the
    # maximum caption length to (`max_length` is never defined at notebook
    # level and raised a NameError here)
    y_pred = predict_caption(model, features[image_id], tokenizer, max_len)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)

In [None]:
# show the reference and predicted captions for one sample image
generate_caption("1001773457_577c3a7d70.jpg")

In [None]:
# Build the VGG16 feature extractor again under its own name, because `model`
# now refers to the captioning model.
vgg_model = VGG16()
# restructure the model: expose the second-to-last layer as the output
feature_layer = vgg_model.layers[-2]
vgg_model = Model(inputs=vgg_model.inputs, outputs=feature_layer.output)

In [None]:
# Build the path from Images_DIR, the folder the other cells read images from;
# the previous '/Images/...' pointed at the filesystem root instead.
image_path = Images_DIR + '/1001773457_577c3a7d70.jpg'
# load image
image = load_img(image_path, target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model (adds a leading batch axis of size 1)
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)
# extract features
feature = vgg_model.predict(image, verbose=0)
# predict from the trained model; `max_len` is the name the notebook assigns
# the maximum caption length to (`max_length` is never defined)
predict_caption(model, feature, tokenizer, max_len)