# **Flickr 8k Dataset**

*   Dataset Repo'd from https://github.com/awsaf49/flickr-dataset
*   8,000 images
*   Each image paired with five different captions.
*   Clear descriptions of the salient entities and events.





In [None]:
!wget "https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip"
!unzip -q flickr8k.zip -d /content/drive/MyDrive/ImageCaption
!rm flickr8k.zip
!echo "Downloaded Flickr8k dataset successfully."

--2024-10-22 04:22:03--  https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/753516996/d7c62b13-1e50-40ea-8fae-f34a44b1695f?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20241022%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20241022T042203Z&X-Amz-Expires=300&X-Amz-Signature=636d3a3acdf7ed66e4f43aa6d1661fae86d28d6354a7e110878fd261357507ed&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dflickr8k.zip&response-content-type=application%2Foctet-stream [following]
--2024-10-22 04:22:03--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/753516996/d7c62b13-1e50-40ea-8fae-f34a44b1695f?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credent

In [2]:
# Data handling
import os
import numpy as np
import pandas as pd
import json
import random

# Image processing
from PIL import Image
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Deep Learning Libraries
from tensorflow.keras.applications import ResNet50, EfficientNetB0
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, add, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

# NLP for Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

# Object Detection (Scene Graph Generation)
import torch
from torchvision import transforms

# Transformer-based Language Models
from transformers import pipeline

# Evaluation Metrics
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# Visualization
import matplotlib.pyplot as plt

# Progress tracking
from tqdm import tqdm

from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, Add
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


In [3]:
# File path to the captions file
captions_file = '/content/drive/MyDrive/ImageCaption/captions.txt'

# Dictionary to store image IDs and their corresponding captions
captions_dict = {}

# Read the captions file
with open(captions_file, 'r') as file:
    for line in file:

        line = line.strip()
        image_id, caption = line.split(',', 1)

        if image_id not in captions_dict:
            captions_dict[image_id] = []

        captions_dict[image_id].append(caption)

image_example = '1000268201_693b08cb0e.jpg'
print(f"Image: {image_example}")
for idx, caption in enumerate(captions_dict[image_example], 1):
    print(f"Caption {idx}: {caption}")


Image: 1000268201_693b08cb0e.jpg
Caption 1: A child in a pink dress is climbing up a set of stairs in an entry way .
Caption 2: A girl going into a wooden building .
Caption 3: A little girl climbing into a wooden playhouse .
Caption 4: A little girl climbing the stairs to her playhouse .
Caption 5: A little girl in a pink dress going into a wooden cabin .


## Feature Extraction

In [None]:
# Path to the folder containing images
image_dir = '/content/drive/MyDrive/ImageCaption/Images'

# Initializing ResNet50 model
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Dictionary to store the extracted features for each image
image_features = {}

# Loop through all images in the dataset
for img_name in tqdm(os.listdir(image_dir)):
    img_path = os.path.join(image_dir, img_name)

    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)

    # Extract features using ResNet50
    features = model.predict(img_array)

    image_features[img_name] = features

print(f"Features for {list(image_features.keys())[0]}: {image_features[list(image_features.keys())[0]]}")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


  0%|          | 0/8091 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


  0%|          | 1/8091 [00:03<6:49:50,  3.04s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step


  0%|          | 2/8091 [00:04<4:14:43,  1.89s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step


  0%|          | 3/8091 [00:05<3:16:58,  1.46s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step


  0%|          | 4/8091 [00:05<2:41:18,  1.20s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step


  0%|          | 5/8091 [00:06<2:26:14,  1.09s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step


  0%|          | 6/8091 [00:07<2:25:56,  1.08s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step


  0%|          | 7/8091 [00:08<2:19:40,  1.04s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step


  0%|          | 8/8091 [00:09<2:23:06,  1.06s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step


  0%|          | 9/8091 [00:11<2:26:49,  1.09s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 289ms/step


  0%|          | 10/8091 [00:12<2:28:35,  1.10s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step


  0%|          | 11/8091 [00:13<2:29:35,  1.11s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  0%|          | 12/8091 [00:14<2:22:54,  1.06s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


  0%|          | 13/8091 [00:15<2:18:31,  1.03s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  0%|          | 14/8091 [00:15<2:08:32,  1.05it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step


  0%|          | 15/8091 [00:16<2:08:49,  1.04it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step


  0%|          | 16/8091 [00:17<2:00:46,  1.11it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step


  0%|          | 17/8091 [00:18<1:55:05,  1.17it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  0%|          | 18/8091 [00:19<1:54:00,  1.18it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


  0%|          | 19/8091 [00:20<1:52:03,  1.20it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


  0%|          | 20/8091 [00:20<1:48:39,  1.24it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


  0%|          | 21/8091 [00:21<1:57:31,  1.14it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  0%|          | 22/8091 [00:22<1:50:53,  1.21it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step


  0%|          | 23/8091 [00:24<2:18:57,  1.03s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step


  0%|          | 24/8091 [00:25<2:14:06,  1.00it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step


  0%|          | 25/8091 [00:26<2:15:11,  1.01s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  0%|          | 26/8091 [00:27<2:15:35,  1.01s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  0%|          | 27/8091 [00:27<2:07:20,  1.06it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 259ms/step


  0%|          | 28/8091 [00:28<2:11:10,  1.02it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  0%|          | 29/8091 [00:29<2:14:27,  1.00s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


  0%|          | 30/8091 [00:31<2:16:21,  1.01s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  0%|          | 31/8091 [00:31<2:11:40,  1.02it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step


  0%|          | 32/8091 [00:32<2:04:03,  1.08it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step


  0%|          | 33/8091 [00:33<2:04:53,  1.08it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  0%|          | 34/8091 [00:34<2:12:48,  1.01it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


  0%|          | 35/8091 [00:35<2:12:57,  1.01it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step


  0%|          | 36/8091 [00:36<2:02:35,  1.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296ms/step


  0%|          | 37/8091 [00:37<2:01:53,  1.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 292ms/step


  0%|          | 38/8091 [00:38<2:01:35,  1.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step


  0%|          | 39/8091 [00:39<2:11:47,  1.02it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  0%|          | 40/8091 [00:40<2:05:55,  1.07it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  1%|          | 41/8091 [00:41<2:02:00,  1.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step


  1%|          | 42/8091 [00:41<1:55:39,  1.16it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step


  1%|          | 43/8091 [00:42<1:54:29,  1.17it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  1%|          | 44/8091 [00:43<1:57:33,  1.14it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  1%|          | 45/8091 [00:44<1:54:34,  1.17it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step


  1%|          | 46/8091 [00:45<1:58:07,  1.14it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


  1%|          | 47/8091 [00:46<2:10:33,  1.03it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step


  1%|          | 48/8091 [00:47<2:04:20,  1.08it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step


  1%|          | 49/8091 [00:48<2:07:52,  1.05it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  1%|          | 50/8091 [00:49<2:04:19,  1.08it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 302ms/step


  1%|          | 51/8091 [00:50<2:14:18,  1.00s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step


  1%|          | 52/8091 [00:51<2:10:46,  1.02it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 287ms/step


  1%|          | 53/8091 [00:52<2:17:22,  1.03s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


  1%|          | 54/8091 [00:53<2:13:15,  1.01it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step


  1%|          | 55/8091 [00:54<2:04:18,  1.08it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step


  1%|          | 56/8091 [00:55<2:05:14,  1.07it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step


  1%|          | 57/8091 [00:56<2:11:32,  1.02it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step


  1%|          | 58/8091 [00:57<2:11:26,  1.02it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


  1%|          | 59/8091 [00:58<2:05:37,  1.07it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


  1%|          | 60/8091 [00:59<2:08:41,  1.04it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  1%|          | 61/8091 [00:59<2:06:13,  1.06it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step


  1%|          | 62/8091 [01:00<1:59:22,  1.12it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  1%|          | 63/8091 [01:01<2:04:00,  1.08it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


  1%|          | 64/8091 [01:02<2:03:42,  1.08it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step


  1%|          | 65/8091 [01:03<2:04:41,  1.07it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293ms/step


  1%|          | 66/8091 [01:04<2:10:09,  1.03it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310ms/step


  1%|          | 67/8091 [01:05<2:09:21,  1.03it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step


  1%|          | 68/8091 [01:06<2:11:31,  1.02it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  1%|          | 69/8091 [01:07<2:02:25,  1.09it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step


  1%|          | 70/8091 [01:08<2:01:09,  1.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step


  1%|          | 71/8091 [01:09<1:57:07,  1.14it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  1%|          | 72/8091 [01:10<1:57:57,  1.13it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


  1%|          | 73/8091 [01:10<1:53:36,  1.18it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  1%|          | 74/8091 [01:11<2:01:11,  1.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step


  1%|          | 75/8091 [01:12<1:54:48,  1.16it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step


  1%|          | 76/8091 [01:13<1:49:36,  1.22it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step


  1%|          | 77/8091 [01:14<1:55:10,  1.16it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


  1%|          | 78/8091 [01:15<1:59:17,  1.12it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  1%|          | 79/8091 [01:16<1:54:16,  1.17it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296ms/step


  1%|          | 80/8091 [01:17<2:01:17,  1.10it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step


  1%|          | 81/8091 [01:32<11:26:11,  5.14s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 354ms/step


  1%|          | 82/8091 [03:17<5:21:19,  2.41s/it] 


KeyboardInterrupt: 

## Saving the Features

In [4]:
import pickle

In [None]:
with open('/content/drive/MyDrive/ImageCaption/image_features.pkl', 'wb') as f:
    pickle.dump(image_features, f)

In [5]:

#To load the features later
with open('/content/drive/MyDrive/ImageCaption/image_features.pkl', 'rb') as f:
    image_features = pickle.load(f)


## Tokenization

In [6]:
# Flatten all captions into a list for tokenization
all_captions = []
for key, captions in captions_dict.items():
    all_captions.extend(captions)

# Tokenize the captions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Convert each caption to a sequence of integers
caption_sequences = {}
for key, captions in captions_dict.items():
    caption_sequences[key] = tokenizer.texts_to_sequences(captions)

# Word-index and index-word mappings
vocab_size = len(tokenizer.word_index) + 1  # Plus 1 for padding


## Model Training

In [10]:
# Assuming caption_sequences is already defined
max_length = max(len(seq) for seqs in caption_sequences.values() for seq in seqs)

# Define a generator function to yield batches of data
def data_generator(caption_sequences, image_features, batch_size):
    while True:  # Loop forever so the generator never terminates
        X1, X2, y = [], [], []
        for img_id, captions in caption_sequences.items():
            if img_id in image_features:
                for caption in captions:
                    for i in range(1, len(caption)):
                        in_seq, out_seq = caption[:i], caption[i]
                        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                        # Append the data to lists
                        X1.append(image_features[img_id][0])  # Image feature vector
                        X2.append(in_seq)  # Caption sequence
                        y.append(out_seq)  # Next word (one-hot encoded)

                        # Yield a batch if we reach the batch size
                        if len(X1) >= batch_size:
                            yield (np.array(X1), np.array(X2)), np.array(y)
                            X1, X2, y = [], [], []  # Reset lists for the next batch

# Define the model architecture
def define_model(vocab_size, max_length):
    # Image feature input
    input1 = Input(shape=(2048,))  # Example shape, adjust based on your features
    fe1 = Dense(256, activation='relu')(input1)

    # Caption input
    input2 = Input(shape=(max_length,))  # Caption input
    se2 = Embedding(vocab_size, 256)(input2)
    se3 = LSTM(256, unroll=True)(se2)

    # Merging the two inputs
    decoder1 = Add()([fe1, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[input1, input2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Set batch size and calculate steps per epoch
batch_size = 64  # Adjust based on your RAM
steps = len(caption_sequences) // batch_size  # Calculate steps per epoch

# Instantiate the model
model = define_model(vocab_size, max_length)

# Train the model using the generator
model.fit(
    data_generator(caption_sequences, image_features, batch_size),
    steps_per_epoch=steps,
    epochs=100,
    verbose=1
)


Epoch 1/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.0772 - loss: 6.8488
Epoch 2/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.1289 - loss: 5.6846
Epoch 3/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.1483 - loss: 5.3227
Epoch 4/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.1682 - loss: 5.1404
Epoch 5/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.1913 - loss: 4.8700
Epoch 6/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.1794 - loss: 5.0379
Epoch 7/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.2153 - loss: 4.6028
Epoch 8/100
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.2105 - loss: 4.5961
Epoch 9/100
[1m126/126[0m [3

<keras.src.callbacks.history.History at 0x7e96c877e680>

In [12]:
model.save("caption_model.h5")
print("Model saved as caption_model.h5")



Model saved as caption_model.h5
