In [None]:


# Install the Kaggle CLI that the download step below relies on.
!pip install -q kaggle


# Colab-only: opens a browser upload widget. The uploaded kaggle.json is
# copied from the working directory by the `cp` command below.
from google.colab import files
files.upload()  # Upload kaggle.json

# Put the API token under ~/.kaggle and restrict it to the current user
# (mode 600) before the CLI is invoked.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the Flickr8k archive and unpack it into the working directory.
# Later cells read 'captions.txt' and the 'Images' directory from here.
# NOTE(review): on a re-run unzip may stop at an overwrite prompt because the
# files already exist; consider `-o` or `-n` -- confirm desired behavior.
!kaggle datasets download -d adityajn105/flickr8k
!unzip -q flickr8k.zip

# NOTE(review): presumably a nested archive; confirm Flickr8k_Dataset.zip
# actually exists after the previous step, otherwise this command just errors.
!unzip -q Flickr8k_Dataset.zip

import string

def load_doc(filename, encoding='utf-8'):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The full file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    # Explicit encoding: without it open() falls back to the locale default,
    # which differs between machines and can garble non-ASCII captions.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

def load_descriptions(doc):
    """Group caption lines by image id.

    Each line of ``doc`` is expected to look like ``<file name>,<caption>``.
    Lines without a comma are ignored. The image id is the part of the file
    name before its first dot. Any further commas inside the caption are
    replaced by single spaces, and every stored caption is wrapped in the
    'startseq' / 'endseq' marker words.

    Args:
        doc: Full text of the caption file.

    Returns:
        dict mapping image id -> list of wrapped caption strings, in the
        order the lines appear in ``doc``.
    """
    mapping = {}
    for line in doc.split('\n'):
        name_part, comma, caption_part = line.partition(',')
        if not comma:
            # No comma at all: nothing that looks like "<name>,<caption>".
            continue
        image_id = name_part.partition('.')[0]
        # Same result as splitting on ',' and re-joining the tail with ' '.
        caption = caption_part.replace(',', ' ')
        mapping.setdefault(image_id, []).append('startseq ' + caption + ' endseq')
    return mapping

def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    Each caption is lower-cased, stripped of ASCII punctuation, and reduced
    to the words that are purely alphabetic and longer than one character.
    The caption lists themselves are modified; nothing is returned.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
    """
    strip_punct = str.maketrans('', '', string.punctuation)

    def _normalise(caption):
        words = caption.lower().translate(strip_punct).split()
        return ' '.join(w for w in words if len(w) > 1 and w.isalpha())

    for captions in descriptions.values():
        # Slice assignment keeps the same list object, so callers holding a
        # reference to it see the cleaned captions as well.
        captions[:] = [_normalise(caption) for caption in captions]

# Load the caption file and build the cleaned {image_id: [captions]} mapping.
filename = 'captions.txt'
doc = load_doc(filename)
descriptions = load_descriptions(doc)
# load_descriptions keeps every comma-separated line, so a header row
# (presumably "image,caption") shows up as an entry keyed 'image'. Drop it;
# the default avoids a KeyError when the file has no such header.
descriptions.pop('image', None)
clean_descriptions(descriptions)
# Print a short summary instead of the whole mapping, which would flood the
# cell output with every caption in the dataset.
print('descriptions===>', len(descriptions), 'images;',
      'sample:', dict(list(descriptions.items())[:2]))



from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
import os

# Build the image encoder: ImageNet-pretrained InceptionV3 truncated at its
# second-to-last layer, so predict() yields that layer's activations instead
# of class probabilities.
# NOTE: the name `model` is rebound to the captioning network further down in
# this cell; extract_features() must therefore run before that happens.
model = InceptionV3(weights='imagenet')
model = Model(inputs=model.input, outputs=model.layers[-2].output)

def extract_features(directory, limit=1000, extractor=None):
    """Run the image encoder over the files in ``directory``.

    Args:
        directory: Folder whose entries are all treated as image files.
        limit: Maximum number of files to process (default 1000, the value
            that used to be hard-coded). ``None`` processes every file.
        extractor: Object with a Keras-style ``predict(batch, verbose=0)``
            method. Defaults to the module-level ``model`` looked up at call
            time. Pass the encoder explicitly if ``model`` has since been
            rebound to a different network.

    Returns:
        dict mapping image id (file name up to its first dot) to whatever
        ``extractor.predict`` returned for that single-image batch.
    """
    if extractor is None:
        extractor = model

    features = {}
    # List the directory once and sort it: os.listdir() order is arbitrary,
    # so an unsorted slice would pick a different subset from run to run.
    for img_name in sorted(os.listdir(directory))[:limit]:
        print('image name--->', img_name)
        filename = os.path.join(directory, img_name)
        image = load_img(filename, target_size=(299, 299))
        image = img_to_array(image)
        # The encoder expects a batch, so add a leading batch axis of size 1.
        image = np.expand_dims(image, axis=0)
        image = preprocess_input(image)
        image_id = img_name.split('.')[0]
        features[image_id] = extractor.predict(image, verbose=0)
    return features

# Encode the images, then keep only the captions whose image was actually
# encoded so that both mappings share the same keys.
features = extract_features('Images')
descriptions = {
    image_id: captions
    for image_id, captions in descriptions.items()
    if image_id in features
}

#  Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Flatten the per-image caption lists into a single list of strings.
descriptions_list = [desc for key in descriptions for desc in descriptions[key]]
if not descriptions_list:
    # Without this guard the max() below fails with an opaque
    # "empty sequence" ValueError.
    raise ValueError('No captions left after matching descriptions to image features.')
# Show a small sample only; printing every caption floods the cell output.
print(len(descriptions_list), 'captions; sample:', descriptions_list[:5])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions_list)
# +1 because tokenizer indices start at 1; index 0 is used for padding.
vocab_size = len(tokenizer.word_index) + 1

# Longest caption in words; used as the fixed input length of the decoder.
max_length = max(len(d.split()) for d in descriptions_list)

#  Create Sequences for Training
from tensorflow.keras.utils import to_categorical

def create_sequences(tokenizer, max_length, descs, photos, vocab_size=None, limit=999):
    """Expand captions into (photo, partial caption) -> next-word samples.

    For a caption encoded as ``[w0, w1, ..., wn]`` one sample is produced for
    every ``i`` in ``1..n``: the input is ``[w0..w(i-1)]`` padded to
    ``max_length`` and the target is the one-hot encoding of ``wi``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length every partial caption is padded to.
        descs: dict mapping image id -> list of caption strings.
        photos: dict mapping image id -> feature array; element ``[0]`` of
            the array is used as the photo vector for each sample.
        vocab_size: Width of the one-hot targets. Defaults to
            ``len(tokenizer.word_index) + 1`` so the function no longer
            depends on a module-level ``vocab_size`` variable.
        limit: Maximum number of images from ``descs`` to use (default 999,
            the value that used to be hard-coded). ``None`` uses all of them.

    Returns:
        Tuple ``(X1, X2, y)`` of NumPy arrays: photo vectors, padded partial
        captions and one-hot next-word targets, aligned row by row.

    Raises:
        KeyError: If an image id that yields samples is missing in ``photos``.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    X1, X2, y = [], [], []
    for key, desc_list in list(descs.items())[:limit]:
        for desc in desc_list:
            seq = tokenizer.texts_to_sequences([desc])[0]
            # Start at 1 so there is always at least one word of context.
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(photos[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Build the training arrays: X1 = photo vectors, X2 = padded partial captions,
# y = one-hot next-word targets (one row per caption prefix).
X1, X2, y = create_sequences(tokenizer, max_length, descriptions, features)

#  Define the Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

# Photo branch: a 2048-dimensional vector per sample (fed with X1 in fit()
# below), regularised with dropout and projected down to 256 units.
# NOTE(review): 2048 is presumably the encoder's output width -- confirm it
# matches the shape of the extracted features.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: padded word-index sequences of length max_length (fed with X2),
# embedded into 256 dimensions and summarised by an LSTM. mask_zero=True makes
# the layers after the embedding skip index 0, i.e. the padding positions.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: element-wise sum of the two 256-d branch outputs, one hidden layer,
# then a softmax over the vocabulary to predict the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# NOTE: this rebinds `model`, which earlier in this cell referred to the
# InceptionV3 encoder; extract_features() reads that global at call time.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
# categorical_crossentropy pairs with the one-hot targets in y.
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model on the (photo, partial caption) -> next word samples.
model.fit([X1, X2], y, epochs=7, batch_size=64)

# Generate Captions
def generate_caption(model, tokenizer, photo, max_length):
    """Greedily decode a caption for one photo.

    Starting from 'startseq', the model is asked for the most likely next
    word given the photo and the words generated so far. Decoding stops when
    'endseq' is predicted, when the predicted index has no word in
    ``tokenizer.index_word``, or after ``max_length`` steps.

    Args:
        model: Captioning model whose ``predict([photo, sequence])`` returns
            scores over the vocabulary.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        photo: Feature batch for a single image, passed to ``model`` as is.
        max_length: Padded sequence length and maximum number of steps.

    Returns:
        The generated words joined by single spaces, without the
        'startseq' / 'endseq' markers.
    """
    markers = ('startseq', 'endseq')
    words = []
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        word = tokenizer.index_word.get(int(np.argmax(yhat)))
        if word is None:
            break
        # The model always sees its full history, marker words included.
        in_text += ' ' + word
        if word == 'endseq':
            break
        # Collect caption words as tokens instead of stripping the markers
        # with str.replace afterwards: replace() left a double space when
        # 'startseq' was predicted mid-caption and would also cut the marker
        # text out of any longer word that happens to contain it.
        if word not in markers:
            words.append(word)
    return ' '.join(words)



In [None]:
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
import os

# Image encoder for inference: ImageNet-pretrained InceptionV3 cut off at its
# second-to-last layer, kept under its own name (`model_incep`).
base_model = InceptionV3(weights='imagenet')
model_incep = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)
def feature_extraction(filename):
    """Encode a single image file with ``model_incep``.

    Args:
        filename: Path of the image to load; it is handed to ``load_img``
            unchanged.

    Returns:
        The result of ``model_incep.predict`` for a batch containing just
        this image, resized to 299x299 and passed through
        ``preprocess_input``.
    """
    image = load_img(filename, target_size=(299, 299))
    image = img_to_array(image)
    # The encoder expects a batch, so add a leading batch axis of size 1.
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    return model_incep.predict(image, verbose=0)

def generate_caption(model, tokenizer, photo, max_length):
    """Greedily decode a caption for one photo.

    Starting from 'startseq', the model is asked for the most likely next
    word given the photo and the words generated so far. Decoding stops when
    'endseq' is predicted, when the predicted index has no word in
    ``tokenizer.index_word``, or after ``max_length`` steps.

    Args:
        model: Captioning model whose ``predict([photo, sequence])`` returns
            scores over the vocabulary.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        photo: Feature batch for a single image, passed to ``model`` as is.
        max_length: Padded sequence length and maximum number of steps.

    Returns:
        The generated words joined by single spaces, without the
        'startseq' / 'endseq' markers.
    """
    markers = ('startseq', 'endseq')
    words = []
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        word = tokenizer.index_word.get(int(np.argmax(yhat)))
        if word is None:
            break
        # The model always sees its full history, marker words included.
        in_text += ' ' + word
        if word == 'endseq':
            break
        # Collect caption words as tokens instead of stripping the markers
        # with str.replace afterwards: replace() left a double space when
        # 'startseq' was predicted mid-caption and would also cut the marker
        # text out of any longer word that happens to contain it.
        if word not in markers:
            words.append(word)
    return ' '.join(words)


In [None]:
from google.colab import files
# This cell needs feature_extraction() and generate_caption() from the helper
# cell, plus model / tokenizer / max_length from the training cell. The
# recorded NameError shows what happens when the helper cell has not been run.
print('Upload Image :')
uploaded = files.upload()
if not uploaded:
    raise ValueError('No image was uploaded.')
# Caption the file that was actually uploaded instead of a hard-coded name:
# files.upload() returns a dict keyed by the uploaded file names and stores
# the files in the current working directory.
image_path = next(iter(uploaded))
photo = feature_extraction(image_path)
caption = generate_caption(model, tokenizer, photo, max_length)
print(f"Generated Caption: {caption}")

Upload Image :


NameError: name 'feature_extraction' is not defined