<a href="https://colab.research.google.com/github/Shriram-Salunke-045/Image-Captioning-with-Encoder-Decoder-approach-using-ResNet50-and-LSTM./blob/main/rheaImage_Captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from pickle import dump
from pickle import load

import numpy as np
from PIL import Image
from keras.applications.resnet50 import ResNet50, preprocess_input
from keras.models import Model
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tqdm.notebook import tqdm

Using TensorFlow backend.


In [None]:
# Mount Google Drive inside the Colab runtime; the dataset paths used later in
# this notebook live under /content/drive/My Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def extract_features(directory):
    """Run every image in `directory` through ResNet50 and collect the outputs.

    Args:
        directory: path to a folder of image files.

    Returns:
        dict mapping each file name (as returned by os.listdir) to the array
        produced by `model.predict` for that image — presumably shape
        (1, 2048) given `include_top=False, pooling='avg'`; TODO confirm.
    """
    model = ResNet50(include_top=False, pooling='avg')
    # summary() prints by itself and returns None; wrapping it in print()
    # only appended a stray "None" to the output.
    model.summary()

    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = os.path.join(directory, img)
        # Sub-directories cannot be opened as images; skip them.
        if not os.path.isfile(filename):
            continue
        # Close the file handle promptly, and force three channels so that
        # grayscale / RGBA / palette images match the network's input.
        with Image.open(filename) as handle:
            image = handle.convert('RGB').resize((224, 224))
        # Add the batch axis expected by model.predict.
        image = np.expand_dims(np.asarray(image, dtype='float32'), axis=0)
        image = preprocess_input(image)
        features[img] = model.predict(image)
    return features
# 2048 feature vector per image
features = extract_features('/content/drive/My Drive/Data/Flicker8k_Dataset')
print('Extracted Features: %d' % len(features))

# Persist the features so later runs can skip the slow extraction step;
# the context manager guarantees the pickle file is flushed and closed.
with open("features.pkl", "wb") as handle:
    dump(features, handle)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None, 3 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, None, None, 3 0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, None, None, 6 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, None

HBox(children=(FloatProgress(value=0.0, max=8103.0), HTML(value='')))


Extracted Features: 8103


In [None]:
import string
def load_doc(filename):
    """Return the entire contents of the text file at `filename` as one string.

    The file is opened in text mode with the platform default encoding and is
    closed even if reading raises.
    """
    with open(filename, 'r') as file:
        return file.read()

In [None]:
def load_descriptions(doc):
    """Group caption lines by image id.

    Each non-trivial line of `doc` is split on whitespace: the first token is
    the image reference and the remaining tokens form the caption. The image
    id is the part of the first token before its first '.'.

    Args:
        doc: full text of the captions file.

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    mapping = {}
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip lines shorter than two characters and whitespace-only lines
        # (the latter would otherwise fail on tokens[0]).
        if len(line) < 2 or not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        # Keep only the text before the first '.'.
        image_id = image_id.split('.')[0]
        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping

In [None]:
def clean_descriptions(descriptions):
    """Normalise every caption in `descriptions` in place.

    Each word is lower-cased and stripped of punctuation; words that end up
    one character or shorter, or that contain non-alphabetic characters, are
    dropped. The caption lists are modified directly; nothing is returned.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            kept = []
            for raw in caption.split():
                word = raw.lower().translate(strip_punct)
                if len(word) > 1 and word.isalpha():
                    kept.append(word)
            captions[idx] = ' '.join(kept)


In [None]:
def to_vocabulary(descriptions):
    """Return the set of distinct whitespace-separated words over all captions."""
    vocab = set()
    for captions in descriptions.values():
        for caption in captions:
            vocab.update(caption.split())
    return vocab

In [None]:
def save_descriptions(descriptions, filename):
    """Write all captions to `filename`, one "<image id> <caption>" per line.

    Lines are joined with '\\n' and no trailing newline is written. The file
    is closed even if the write fails.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: destination path; overwritten if it exists.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

In [None]:
def load_photos(filename):
    """Return the list of photo names stored one per line in `filename`.

    Blank lines are ignored and surrounding whitespace is stripped, so the
    result is the same whether or not the file ends with a newline (slicing
    off the last element unconditionally lost the final photo when it did
    not).
    """
    file = load_doc(filename)
    photos = [name.strip() for name in file.split("\n") if name.strip()]
    return photos


In [None]:
def load_clean_descriptions(filename, photos):
    """Load captions for the requested photos and wrap them in sequence markers.

    Each non-empty line of `filename` is read as "<image id> <caption words>".
    Lines whose image id appears in `photos` are kept and the caption is
    stored as "<start> caption <end>".

    NOTE(review): load_descriptions in this notebook keeps only the text
    before the first '.' of the image reference; if `photos` holds names that
    include a file extension, ids derived from that output will never match
    here — confirm both sides use the same id format.

    Args:
        filename: path to the cleaned descriptions file.
        photos: iterable of image ids to keep.

    Returns:
        dict mapping image id -> list of wrapped caption strings; empty when
        nothing matches.
    """
    file = load_doc(filename)
    wanted = set(photos)  # constant-time membership instead of scanning a list
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words) < 1:
            continue
        image, image_caption = words[0], words[1:]
        if image in wanted:
            # The markers need surrounding spaces so they tokenise as their
            # own words instead of fusing with the first/last caption word.
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions.setdefault(image, []).append(desc)
    # Return only after every line has been processed.
    return descriptions


In [None]:
def load_features(photos):
    """Return the stored feature entries for the given photos.

    Reads "features.pkl" from the current working directory — the file name
    this notebook uses when it pickles the extracted features — and selects
    the entries named in `photos`.

    Args:
        photos: iterable of keys present in the pickled feature dict.

    Returns:
        dict mapping each photo key to its stored feature value.

    Raises:
        KeyError: if a photo has no entry in the feature file.
    """
    # Unpickling executes arbitrary code; only load a features.pkl that this
    # notebook produced itself.
    with open("features.pkl", "rb") as handle:
        all_features = load(handle)
    features = {k: all_features[k] for k in photos}
    return features

filename = "/content/drive/My Drive/Data/Flickr8k_text/Flickr_8k.trainImages.txt"

# Build the training inputs. These statements were previously wrapped in a
# second `def load_photos(photos)`, which replaced the real loader, ignored
# its argument and called itself without end; they are meant to run as a
# plain script cell.
# NOTE(review): assumes "descriptions.txt" already exists in the working
# directory (e.g. written with save_descriptions) — no visible cell creates it.
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)



In [None]:
def dict_to_list(descriptions):
    """Flatten a dict of caption lists into a single list of captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        list of every caption, in dict iteration order; empty for an empty
        dict.
    """
    all_desc = []
    for captions in descriptions.values():
        all_desc.extend(captions)
    # Return after the loop so every key contributes, not just the first.
    return all_desc

In [None]:
from keras.preprocessing.text import Tokenizer 

def create_tokenizer(descriptions):
    """Return a Keras Tokenizer fitted on the captions in `descriptions`.

    The captions are gathered with dict_to_list before fitting.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(dict_to_list(descriptions))
    return fitted

In [None]:
def train_descriptions(descriptions):
    """Fit a tokenizer on `descriptions`, pickle it, and report the vocab size.

    The tokenizer is written to "tokenizer.p" in the working directory.

    NOTE(review): the final cell of this notebook passes a value named
    `train_descriptions` to data_generator; this function shares that name —
    consider renaming one of them.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        (tokenizer, vocab_size), where vocab_size is the number of entries in
        tokenizer.word_index plus one.
    """
    # Fit on the argument; the previous body passed this function itself.
    tokenizer = create_tokenizer(descriptions)
    with open('tokenizer.p', 'wb') as handle:
        dump(tokenizer, handle)
    # word_index is an attribute of the tokenizer, not a second len() argument.
    vocab_size = len(tokenizer.word_index) + 1
    return tokenizer, vocab_size

In [None]:
def max_length(descriptions):
    """Return the word count of the longest caption in `descriptions`."""
    word_counts = [len(caption.split()) for caption in dict_to_list(descriptions)]
    return max(word_counts)
  

In [None]:
def data_generator(descriptions, features, tokenizer, max_length, train_descriptions=None):
    """Endlessly yield training batches, one image's captions at a time.

    Args:
        descriptions: dict mapping image key -> list of caption strings.
        features: dict mapping image key -> feature array; element [0] of
            each entry is used as that image's feature vector.
        tokenizer: fitted tokenizer passed through to create_sequences.
        max_length: padded sequence length passed through to create_sequences.
        train_descriptions: unused; kept (now optional) so existing
            five-argument calls keep working.

    Yields:
        [[input_image, input_sequence], output_word] as returned by
        create_sequences for one image.

    Raises:
        ValueError: if `descriptions` is empty (the loop would otherwise spin
            forever without yielding).
    """
    if not descriptions:
        raise ValueError("descriptions is empty; nothing to generate")
    while True:
        for key, description_list in descriptions.items():
            # Use a separate name: rebinding `features` would discard the
            # dict after the first image.
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(
                tokenizer, max_length, description_list, feature
            )
            yield [[input_image, input_sequence], output_word]



In [None]:
def create_sequences(tokenizer, max_length, desc_list, features, vocab_size=None):
    """Expand one image's captions into (image, partial caption, next word) samples.

    Every caption is encoded with `tokenizer`; for each position i >= 1 the
    first i tokens become the padded input sequence and token i becomes the
    one-hot target.

    Args:
        tokenizer: fitted tokenizer exposing texts_to_sequences and word_index.
        max_length: length every input sequence is padded to.
        desc_list: caption strings for a single image.
        features: that image's feature vector, repeated for every sample.
        vocab_size: number of classes for the one-hot target; defaults to
            len(tokenizer.word_index) + 1.

    Returns:
        Tuple of numpy arrays (X1, X2, y): image features, padded input
        sequences and one-hot next-word targets.
    """
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1
    X1, X2, y = [], [], []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            # Prefix of the caption is the input, the following token the target.
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(features)
            X2.append(in_seq)
            y.append(out_seq)
    # Return only once all captions and positions have been expanded.
    return np.array(X1), np.array(X2), np.array(y)


In [None]:
# Smoke test: pull a single batch from the generator and show the array shapes.
# NOTE(review): the recorded output for this cell is a NameError. No visible
# cell assigns `tokenizer` or `maxlength` at module level (the length helper
# is the function `max_length`), so both must be created before this can run.
[a,b], c = next(data_generator(train_descriptions, features, tokenizer, maxlength))
a.shape, b.shape, c.shape 


NameError: ignored