# Importing Libraries

In [211]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt 
import string
import os
from PIL import Image
import glob
from pickle import dump, load
from tqdm import tqdm_notebook as tqdm
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
try:
    import dill as pickle
except ImportError:
    import pickle

# Loading Documents

In [2]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
filename = "/home/vinit/Desktop/Projects/Image Captioning/Flicker8k_test/Flickr8k.token.txt"
# Load the raw caption file into a single string.
doc = load_doc(filename)
# Preview the first 300 characters of the raw text.
print(doc[:300])

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the s


In [3]:
def load_descriptions(doc):
    """Group captions by image identifier.

    Each line of ``doc`` is split on whitespace: the first token is the image
    reference and the remaining tokens form the caption. The image identifier
    is the part of the first token before its first '.'.

    Args:
        doc: Caption text, one "<image reference> <caption words...>" entry
            per line.

    Returns:
        dict mapping image identifier -> list of caption strings, in the
        order they appear in ``doc``.
    """
    mapping = dict()

    for line in doc.split('\n'):
        # Lines shorter than two characters cannot hold an id and a caption.
        if len(line) < 2:
            continue

        tokens = line.split()
        # A whitespace-only line yields no tokens; skip it instead of
        # raising IndexError on tokens[0].
        if not tokens:
            continue

        image_id, image_desc = tokens[0], tokens[1:]
        # Keep only the part before the first '.', e.g. 'name.jpg#0' -> 'name'.
        image_id = image_id.split('.')[0]

        mapping.setdefault(image_id, []).append(' '.join(image_desc))
    return mapping

# Build the image-id -> captions mapping from the raw text and report how many images it covers.
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

Loaded: 8092 


In [5]:
# Peek at the first (image id, captions) entry of the mapping.
list(descriptions.items())[:1]

[('1000268201_693b08cb0e',
  ['A child in a pink dress is climbing up a set of stairs in an entry way .',
   'A girl going into a wooden building .',
   'A little girl climbing into a wooden playhouse .',
   'A little girl climbing the stairs to her playhouse .',
   'A little girl in a pink dress going into a wooden cabin .'])]

In [10]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
from nltk.tokenize import RegexpTokenizer
# Module-level tokenizer built from the pattern \w+ (runs of word characters);
# the functions below rely on this global.
tokenizer = RegexpTokenizer(r'\w+')

def clean_descriptions(descriptions):
    """Normalize every caption in ``descriptions`` in place.

    Each caption is tokenized with the module-level ``tokenizer``, lower-cased,
    stripped of one-character tokens, and re-joined with single spaces.

    Args:
        descriptions: dict mapping image id -> list of caption strings. The
            lists are modified in place.

    Returns:
        None; the cleaned captions replace the originals inside
        ``descriptions``.
    """
    for desc_list in descriptions.values():
        for i, desc in enumerate(desc_list):
            words = [word.lower() for word in tokenizer.tokenize(desc)]
            # Drop single-character tokens (e.g. 'a').
            desc_list[i] = ' '.join(word for word in words if len(word) > 1)

# Clean all captions in place; the function returns nothing, so `descriptions` itself is updated.
clean_descriptions(descriptions)

In [11]:
# Inspect the cleaned captions of one image.
descriptions['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in an entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing the stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

In [14]:
# convert the loaded descriptions into a vocabulary of words
def getVocabulary(descriptions):
    """Collect the set of distinct tokens used across all captions.

    Args:
        descriptions: dict mapping image id -> list of caption strings.

    Returns:
        set of every token the module-level ``tokenizer`` produces over all
        captions.
    """
    words = set()
    # Plain loops rather than a list comprehension evaluated only for its
    # side effects (which also built a throwaway list of None values).
    for desc_list in descriptions.values():
        for desc in desc_list:
            words.update(tokenizer.tokenize(desc))
    return words

# Build the vocabulary from the cleaned captions and report its size.
vocabulary = getVocabulary(descriptions)
print('Original Vocabulary Size: %d' % len(vocabulary))

Original Vocabulary Size: 8464


In [19]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
    """Write all captions to ``filename``, one "<image id> <caption>" per line.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: Path of the output file; it is overwritten if it exists.

    Returns:
        None. Lines are joined with '\\n' and no trailing newline is written.
    """
    lines = [key + ' ' + desc
             for key, desc_list in descriptions.items()
             for desc in desc_list]
    # The context manager closes (and flushes) the file even if write() raises.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

# Persist the cleaned captions; a later cell reads them back from 'descriptions.txt'.
save_descriptions(descriptions, 'descriptions.txt')
# NOTE(review): the message says "Desciption.txt" but the file written above is 'descriptions.txt'.
print("Desciption.txt created")

Desciption.txt created


# Preparing Data for training

In [243]:
# load Train Data
def load_set(filename):
    """Return the non-empty lines of the file at ``filename`` as a list.

    Args:
        filename: Path of a text file with one entry per line.

    Returns:
        list of the file's lines, in file order, with empty lines omitted.
    """
    entries = [line for line in load_doc(filename).split('\n') if line]
    return entries
 
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
filename = '/home/vinit/Desktop/Projects/Image Captioning/Flicker8k_test/Flickr_8k.trainImages.txt'
train_names = load_set(filename)
# Report the size of the list just loaded. (`train` is never defined in this
# notebook, so referencing it raises NameError on a fresh kernel.)
print('Train size: %d' % len(train_names))

# Full path of every training image: image directory + file name.
# The variable name's spelling is kept because later cells reference it.
Directroy_path = '/home/vinit/Desktop/Projects/Image Captioning/Flicker8k_Dataset/'
train_img= [Directroy_path+i  for i in train_names]

Train size: 6000


In [244]:
# Load the test data
def load_set(filename):
    """Return the non-empty lines of the file at ``filename`` as a list.

    NOTE(review): this redefines the identical ``load_set`` from the
    training-data cell; one of the two copies could be deleted.

    Args:
        filename: Path of a text file with one entry per line.

    Returns:
        list of the file's lines, in file order, with empty lines omitted.
    """
    entries = [line for line in load_doc(filename).split('\n') if line]
    return entries
 
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
filename = '/home/vinit/Desktop/Projects/Image Captioning/Flicker8k_test/Flickr_8k.testImages.txt'
test_names = load_set(filename)
# Report the size of the list just loaded. (`test` is never defined in this
# notebook, so referencing it raises NameError on a fresh kernel.)
print('Test Size: %d' % len(test_names))

# Full path of every test image: image directory + file name.
Directroy_path = '/home/vinit/Desktop/Projects/Image Captioning/Flicker8k_Dataset/'
test_img= [Directroy_path+i  for i in test_names]

Test Size: 1000


In [245]:
# Extract the captions of the given image set, wrapping each caption so that it starts with 'startseq' and ends with 'endseq'
def load_clean_descriptions(filename, dataset):
    """Load the captions of the images in ``dataset`` and wrap them in sequence markers.

    Args:
        filename: Path of a file with one "<image id> <caption words...>"
            entry per line.
        dataset: Iterable of image names; the part of each name before its
            first '.' is used as the image id.

    Returns:
        dict mapping image id -> list of captions, each formatted as
        'startseq <caption> endseq'. Only ids derived from ``dataset`` appear.
    """
    # A set gives constant-time membership tests; a list would be scanned
    # once for every caption line.
    wanted_ids = {name.split('.')[0] for name in dataset}
    descriptions = dict()
    for line in load_doc(filename).split('\n'):
        tokens = line.split()
        # Blank lines (e.g. after a trailing newline) produce no tokens;
        # skip them rather than fail on tokens[0].
        if not tokens:
            continue

        image_id, image_desc = tokens[0], tokens[1:]

        if image_id in wanted_ids:
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions.setdefault(image_id, []).append(desc)
    return descriptions

# descriptions
# Read back the cleaned captions saved in 'descriptions.txt', keeping only the training images.
train_descriptions = load_clean_descriptions('descriptions.txt', train_names)

# Number of training images that have captions.
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=6000


In [None]:
# Save train_descriptions
# NOTE(review): hardcoded absolute path. `pickle` is whichever module the
# try/except import cell bound (dill if importable, otherwise the stdlib pickle).
with open("/home/vinit/Desktop/Projects/Image Captioning/Dataset/train_descriptions.pkl", "wb") as encoded_pickle:
    pickle.dump(train_descriptions, encoded_pickle)

# Extracting feature Vectors from Images

In [328]:
def preprocess(image_path):
    """Load an image file and prepare it as a one-image batch for InceptionV3.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The result of ``preprocess_input`` applied to the 299x299 image array
        with one extra leading axis.
    """
    # InceptionV3 expects 299x299 inputs, so resize while loading.
    pil_img = image.load_img(image_path, target_size=(299, 299))
    # axis=0 prepends the extra (batch) dimension.
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    return preprocess_input(batch)

In [329]:
# Load the full InceptionV3 model initialised with the 'imagenet' weights
model = InceptionV3(weights='imagenet')

In [330]:
# Print the layer-by-layer architecture of the loaded model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
conv2d_565 (Conv2D)             (None, 149, 149, 32) 864         input_7[0][0]                    
__________________________________________________________________________________________________
batch_normalization_565 (BatchN (None, 149, 149, 32) 96          conv2d_565[0][0]                 
__________________________________________________________________________________________________
activation_565 (Activation)     (None, 149, 149, 32) 0           batch_normalization_565[0][0]    
__________________________________________________________________________________________________
conv2d_566

In [331]:
# Build the feature extractor: same input as InceptionV3, but the output is taken
# from the second-to-last layer, i.e. the network without its final (output) layer.
model_new = Model(model.input, model.layers[-2].output)

In [332]:
# Function to encode a given image into a vector of size (2048, )
def encode(image):
    """Turn one image file into a flat feature vector using ``model_new``.

    NOTE(review): the parameter name shadows the imported ``image`` module
    inside this function; ``preprocess`` still sees the module via globals.

    Args:
        image: Path of the image file, forwarded to ``preprocess``.

    Returns:
        1-D array whose length is the second dimension of the model output.
    """
    batch = preprocess(image)
    features = model_new.predict(batch)
    # Drop the leading batch axis: (1, n) -> (n,)
    return np.reshape(features, features.shape[1])

In [357]:
# Encode every image of the training dataset as a 2048-dimensional vector. Run this once.
start = time()
encoded_train = {}
for img in tqdm(train_img):
    # Key each feature vector by the bare file name (full path minus the directory prefix).
    encoded_train[img[len(Directroy_path):]] = encode(img)
print("Time taken in seconds =", time()-start)

Time taken in seconds = 1857.7092230319977


In [362]:
# Encode every image of the test dataset as a 2048-dimensional vector. Run this once.
# Time the whole encoding pass over the test images.
start = time()
encoded_test = {}
for img in tqdm(test_img):
    # Key each feature vector by the bare file name (full path minus the directory prefix).
    encoded_test[img[len(Directroy_path):]] = encode(img)
print("Time taken in seconds =", time()-start)

Time taken in seconds = 440.16224336624146


In [359]:
# Save Train features
# Serialize the {file name: feature vector} dict for the training images.
# NOTE(review): hardcoded absolute path.
with open("/home/vinit/Desktop/Projects/Image Captioning/Dataset/encoded_train_images.pkl", "wb") as encoded_pickle:
    pickle.dump(encoded_train, encoded_pickle)

In [363]:
# Save test features 

# NOTE(review): redundant import; `dump`/`load` are already imported in the first cell
# and are not used here (the code below calls `pickle.dump`).
from pickle import dump, load
# Serialize the {file name: feature vector} dict for the test images.
with open("/home/vinit/Desktop/Projects/Image Captioning/Dataset/encoded_test_images.pkl", "wb") as encoded_pickle:
    pickle.dump(encoded_test, encoded_pickle)

 #   End Of this notebook

In [312]:
# Debug aid: print every key of encoded_train (`c`, the stored value, is unused).
for key, c in encoded_train.items():
    print(key)

In [307]:
# Check the key format: the first training path with the directory prefix removed.
train_img[0][len(Directroy_path):]

'2513260012_03d33305cf.jpg'

In [361]:
# Number of encoded training images.
len(encoded_train)

6000