In [None]:
# Download the Flickr8k dataset archive with the Kaggle command-line tool.
# Per the recorded output, the download is skipped when a newer local copy exists
# (pass --force to download again).
!kaggle datasets download adityajn105/flickr8k

Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
License(s): CC0-1.0
flickr8k.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
from zipfile import ZipFile
import matplotlib.pyplot as plt
import cv2
from tensorflow import keras
from keras import Sequential
from keras.layers import Dense,Conv2D,MaxPooling2D,Flatten,BatchNormalization,Dropout

In [None]:
# Unpack every member of the downloaded archive into the current working directory.
# ZipFile opens in read mode by default; the context manager closes the handle afterwards.
with ZipFile("/content/flickr8k.zip") as archive:
    archive.extractall()

In [None]:
# Load the captions file into a DataFrame. The frame displayed in the next cell
# has an 'image' column (file name) and a 'caption' column (caption text).
data = pd.read_csv('/content/captions.txt')

In [None]:
# Show the captions table as the cell output (pandas truncates the middle rows).
data

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
...,...,...
40450,997722733_0cb5439472.jpg,A man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,A man is rock climbing high in the air .
40452,997722733_0cb5439472.jpg,A person in a red shirt climbing up a rock fac...
40453,997722733_0cb5439472.jpg,A rock climber in a red shirt .


In [None]:
def text_preprocessing(data):
  """Normalise the 'caption' column of a captions DataFrame for model training.

  Steps applied to every caption, in order:
    1. lower-case the text;
    2. drop every character that is neither a letter (a-z) nor whitespace;
    3. drop tokens of length 1 and collapse runs of whitespace to single spaces;
    4. wrap the result in the "startseq " / " endseq" sequence markers.

  Args:
    data: DataFrame with a string column named 'caption'.

  Returns:
    The same DataFrame object; its 'caption' column is overwritten in place.
    Calling this twice on the same frame adds the markers twice.
  """
  captions = data['caption'].str.lower()
  # A plain str.replace treats the pattern as literal text and never matches,
  # so the character class must go through a regex replace. Whitespace is kept
  # so that words remain separated for the split() below.
  captions = captions.str.replace(r'[^a-z\s]', '', regex=True)
  captions = captions.apply(lambda text: " ".join(word for word in text.split() if len(word) > 1))
  data['caption'] = "startseq " + captions + " endseq"
  return data

In [None]:
# Display five randomly chosen captions.
# NOTE(review): the recorded output already contains the "startseq" marker, so this
# cell appears to have been executed after the preprocessing call in the cell below;
# on a fresh top-to-bottom run it would show the raw captions instead.
data['caption'].sample(5)

Unnamed: 0,caption
24955,startseq group enjoying day at the park in the...
34499,startseq young adults wearing uniforms wheel c...
13593,startseq mountain climber posing in the snow e...
30179,startseq the girl is in red jersey and pitchin...
27280,startseq black and brown dog wearing red coat ...


In [None]:
# Clean the captions and keep them as a plain Python list of strings.
# text_preprocessing overwrites data['caption'] and prepends/appends the
# "startseq"/"endseq" markers on every call, so re-running this cell without
# reloading `data` stacks the markers.
data = text_preprocessing(data)
captions_list = data['caption'].tolist()

In [None]:
# word2vec implementation
from gensim.models import Word2Vec
import numpy as np

# Split every preprocessed caption into whitespace-separated tokens.
tokenized_captions = [caption.split() for caption in data['caption']]
# Preview a few tokenised captions only; printing the whole list would flood
# the cell output with one entry per caption.
print(tokenized_captions[:3])

# Train 100-dimensional word vectors on the captions; min_count=1 keeps every token.
word2vec_model = Word2Vec(sentences=tokenized_captions, vector_size=100, window=5, min_count=1, workers=4)
vocab_size = len(word2vec_model.wv.key_to_index)
# Longest caption, measured in tokens.
max_length = max(len(caption) for caption in tokenized_captions)

# Split by image (not by row) so all captions of one image land on the same side:
# the first 85% of the unique images go to train, the rest to test.
images = data['image'].unique().tolist()
nimages = len(images)
split_index = round(0.85 * nimages)
train_images = images[:split_index]
val_images = images[split_index:]
# reset_index returns a fresh frame, which avoids mutating a filtered slice of `data` in place.
train = data[data['image'].isin(train_images)].reset_index(drop=True)
test = data[data['image'].isin(val_images)].reset_index(drop=True)

# Sanity check: stack the word vectors of one caption into a (tokens, 100) matrix.
caption = tokenized_captions[2]
caption_embedding = np.array([word2vec_model.wv[word] for word in caption if word in word2vec_model.wv])

print("Caption Embedding Shape:", caption_embedding.shape)


Caption Embedding Shape: (8, 100)


In [None]:
# texts_to_sequences implementation (disabled)
# The alternative below is kept inside a string literal, so none of it executes;
# because the string is the cell's last expression it is echoed as the cell output.
# NOTE(review): `Tokenizer` is not imported and `captions` is not defined in the
# visible cells (the list built earlier is named `captions_list`), so this code
# would need both before it could be re-enabled.
"""tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(caption.split()) for caption in captions)

images = data['image'].unique().tolist()
nimages = len(images)

split_index = round(0.85*nimages)
train_images = images[:split_index]
val_images = images[split_index:]

train = data[data['image'].isin(train_images)]
test = data[data['image'].isin(val_images)]

train.reset_index(inplace=True,drop=True)
test.reset_index(inplace=True,drop=True)

tokenizer.texts_to_sequences([captions[1]])[0]"""

"tokenizer = Tokenizer()\ntokenizer.fit_on_texts(captions)\nvocab_size = len(tokenizer.word_index) + 1\nmax_length = max(len(caption.split()) for caption in captions)\n\nimages = data['image'].unique().tolist()\nnimages = len(images)\n\nsplit_index = round(0.85*nimages)\ntrain_images = images[:split_index]\nval_images = images[split_index:]\n\ntrain = data[data['image'].isin(train_images)]\ntest = data[data['image'].isin(val_images)]\n\ntrain.reset_index(inplace=True,drop=True)\ntest.reset_index(inplace=True,drop=True)\n\ntokenizer.texts_to_sequences([captions[1]])[0]"

In [None]:
from keras.applications.xception import Xception, preprocess_input
from keras.models import Model
from keras.preprocessing.image import load_img, img_to_array
import numpy as np
from tqdm import tqdm

# ImageNet-pretrained Xception, cut at the 'avg_pool' layer so that each image
# yields a pooled feature vector instead of class probabilities.
base_model = Xception(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)
img_size = 299
image_path = '/content/Images'

# Maps image file name -> feature array of shape (1, 2048) (leading batch axis kept).
features = {}
for image in tqdm(data['image'].unique().tolist()):
    img = load_img(os.path.join(image_path, image), target_size=(img_size, img_size))
    img = img_to_array(img)
    # Xception's pretrained weights expect pixels scaled to [-1, 1]; preprocess_input
    # applies that scaling, whereas dividing by 255 only reaches [0, 1].
    img = preprocess_input(img)
    img = np.expand_dims(img, axis=0)
    feature = model.predict(img, verbose=0)
    features[image] = feature
print("Feature shape:", feature.shape)


100%|██████████| 8091/8091 [1:10:42<00:00,  1.91it/s]

Feature shape: (1, 2048)





In [None]:
# Width of one image feature vector (last axis of any stored feature array).
# len(features) would give the number of images instead, which is not a valid
# input width for the image branch of the model.
features_size = next(iter(features.values())).shape[-1]

In [None]:
from keras.models import Model
from keras.layers import Embedding,LSTM,Dense,Add
from keras.layers import Input

# Image branch: project a feature vector of length features_size down to 256 units.
# features_size must equal the length of a single image feature vector.
image_input = Input(shape=(features_size,))
image_dense = Dense(256, activation='relu')(image_input)

# Caption branch: max_length token ids -> 100-d embeddings -> 256-unit LSTM state.
caption_input = Input(shape=(max_length,))
# input_length is omitted: the sequence length is already fixed by caption_input,
# and newer Keras releases deprecate the argument.
caption_embedding = Embedding(input_dim=vocab_size, output_dim=100)(caption_input)
caption_lstm = LSTM(256)(caption_embedding)

# Both branches are 256 wide, so they can be summed element-wise before the
# softmax over the vocabulary.
merged = Add()([image_dense, caption_lstm])
output = Dense(vocab_size, activation='softmax')(merged)




In [None]:
# Wire the image and caption inputs to the softmax output, then configure training:
# Adam optimiser, categorical cross-entropy loss, accuracy as the reported metric.
captioning_model = Model(
    inputs=[image_input, caption_input],
    outputs=output,
)
captioning_model.compile(
    loss="categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)
captioning_model.summary()

In [None]:
import numpy as np

# Longest caption, in tokens; this sets the width of the padded matrix.
max_length = max(map(len, tokenized_captions))

# One row per caption: tokens on the left, integer 0 as padding on the right.
# NOTE(review): the cells hold string tokens (dtype=object), not integer ids —
# confirm how these are meant to be turned into indices for the Embedding layer.
captions_input = np.zeros(shape=(len(tokenized_captions), max_length), dtype=object)
for i, caption in enumerate(tokenized_captions):
    n_tokens = len(caption)
    captions_input[i, 0:n_tokens] = caption


In [None]:
# Stack the per-image feature arrays (dict insertion order) into one array.
# NOTE(review): this gives one row per image, while the caption matrix has one
# row per caption — confirm how the two are meant to be aligned.
image_features = np.array(list(features.values()))
# FIXME: `next_word_labels` is not defined in any visible cell, so this line
# raises NameError (see the recorded traceback); the labels still need to be built.
captions_output = np.array(next_word_labels)

NameError: name 'next_word_labels' is not defined

In [None]:
# Train on the padded caption array `captions_input`. `caption_input` (without
# the "s") is the symbolic Keras Input layer of the model, not training data.
history = captioning_model.fit([image_features, captions_input], captions_output, epochs=50, batch_size=32)

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [None]:
from tensorflow.keras.utils import plot_model

In [None]:
# Draw the architecture of the model built above; it is named `captioning_model`
# (no `caption_model` is defined in the visible cells).
plot_model(captioning_model)

In [None]:
# Print the layer-by-layer summary of the model built above; it is named
# `captioning_model` (no `caption_model` is defined in the visible cells).
captioning_model.summary()