In [23]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import pandas as pd
from tensorflow import keras

In [31]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("adityajn105/flickr8k")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/flickr8k


In [33]:
import os

print(os.listdir(path))

df=pd.read_csv(path+"/captions.txt")


['captions.txt', 'Images']


In [34]:
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [35]:
captions_dict = df.groupby("image")["caption"].apply(list).to_dict()

In [36]:
print(captions_dict[df["image"][0]])

['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']


In [37]:
import string,re
def clean_caption(caption):
    # Lowercase
    caption = caption.lower()
    # Remove punctuation
    caption = caption.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers and non-alphabetic characters
    caption = re.sub(r'[^a-zA-Z ]+', '', caption)
    # Remove extra spaces
    caption = caption.strip()
    return caption


In [40]:
clean_caption(captions_dict[df["image"][0]][0])

'a child in a pink dress is climbing up a set of stairs in an entry way'

In [41]:
def preprocess_captions(captions_dict):
    processed = {}
    for img, caps in captions_dict.items():
        processed[img] = []
        for cap in caps:
            cleaned = clean_caption(cap)
            # Add start and end tokens
            final_caption = f"<start> {cleaned} <end>"
            processed[img].append(final_caption)
    return processed

cleaned_captions = preprocess_captions(captions_dict)


In [44]:
caption = "A dog running in the park, #123!"
cleaned = clean_caption(caption)
print(cleaned)

a dog running in the park


In [46]:
cleaned_dict={
    img: [clean_caption(caption) for caption in caps]
    for img, caps in captions_dict.items()
}

In [47]:
captions_dict[df["image"][0]]

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [48]:
for img_id, caps in list(cleaned_captions.items())[:1]:
    print("Image:", img_id)
    for c in caps:
        print(c)


Image: 1000268201_693b08cb0e.jpg
<start> a child in a pink dress is climbing up a set of stairs in an entry way <end>
<start> a girl going into a wooden building <end>
<start> a little girl climbing into a wooden playhouse <end>
<start> a little girl climbing the stairs to her playhouse <end>
<start> a little girl in a pink dress going into a wooden cabin <end>


#Step 4: Feature Extraction (CNN)

In [51]:
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.preprocessing import image
from keras.models import Model
import numpy as np

base_model = InceptionV3(weights='imagenet')
model = Model(base_model.input, base_model.layers[-2].output)

def extract_features(img_path):
    img = image.load_img(img_path, target_size=(299, 299))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    return model.predict(x).flatten()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
[1m96112376/96112376[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


#🔹 Step 5: Tokenize Captions

In [53]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Flatten and clean captions list
all_captions = [
    str(c) for caps in captions_dict.values() for c in caps if isinstance(c, str)
]

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# Vocab size and max caption length
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(c.split()) for c in all_captions)

print("Vocabulary Size:", vocab_size)
print("Max Caption Length:", max_length)


Vocabulary Size: 8494
Max Caption Length: 38


#🔹 Step 6: Define CNN + LSTM Model

In [59]:
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from keras.models import Model

inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')


#🔹 Step 7: Data Generator + Training

In [74]:
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size, batch_size):
    import numpy as np
    from keras.utils import to_categorical
    while True:
        X1, X2, y = [], [], []
        n = 0
        for img_id, desc_list in descriptions.items():
            for desc in desc_list:
                # Encode the sequence
                seq = tokenizer.texts_to_sequences([desc])[0]
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1.append(photos[img_id][0])   # image features
                    X2.append(in_seq)              # input sequence
                    y.append(out_seq)              # target word
                    n += 1
                    if n == batch_size:
                        yield ([np.array(X1), np.array(X2)], np.array(y))
                        X1, X2, y = [], [], []
                        n = 0


In [75]:
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,<start> a child in a pink dress is climbing up...
1,1000268201_693b08cb0e.jpg,<start> a girl going into a wooden building <end>
2,1000268201_693b08cb0e.jpg,<start> a little girl climbing into a wooden p...
3,1000268201_693b08cb0e.jpg,<start> a little girl climbing the stairs to h...
4,1000268201_693b08cb0e.jpg,<start> a little girl in a pink dress going in...


In [76]:
import string
from collections import defaultdict

def clean_caption(caption):
    caption = caption.lower()
    caption = caption.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    caption = caption.strip()
    caption = f"<start> {caption} <end>"
    return caption

# Apply cleaning to your dataframe
df["caption"] = df["caption"].apply(clean_caption)


In [77]:
train_descriptions = defaultdict(list)

for i, row in df.iterrows():
    img = row["image"]
    cap = row["caption"]
    train_descriptions[img].append(cap)


In [78]:
list(train_descriptions.items())[:2]  # view first 2 image-caption pairs


[('1000268201_693b08cb0e.jpg',
  ['<start> start a child in a pink dress is climbing up a set of stairs in an entry way end <end>',
   '<start> start a girl going into a wooden building end <end>',
   '<start> start a little girl climbing into a wooden playhouse end <end>',
   '<start> start a little girl climbing the stairs to her playhouse end <end>',
   '<start> start a little girl in a pink dress going into a wooden cabin end <end>']),
 ('1001773457_577c3a7d70.jpg',
  ['<start> start a black dog and a spotted dog are fighting end <end>',
   '<start> start a black dog and a tricolored dog playing with each other on the road end <end>',
   '<start> start a black dog and a white dog with brown spots are staring at each other in the street end <end>',
   '<start> start two dogs of different breeds looking at each other on the road end <end>',
   '<start> start two dogs on pavement moving toward each other end <end>'])]

In [79]:
generator = data_generator(train_descriptions, train_features, tokenizer, max_length, vocab_size, batch_size=32)

steps = len(train_descriptions)
model.fit(generator, epochs=20, steps_per_epoch=steps, verbose=1)


NameError: name 'train_features' is not defined

In [80]:
# Skipping full generator here for brevity – let me know if you want it complete.

model.fit(generator, epochs=20, steps_per_epoch=steps, verbose=1)


NameError: name 'generator' is not defined