In [1]:
import numpy as np
import pandas as pd
import json
import re
import collections
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from keras.preprocessing import image
from keras.models import Model, load_model
from keras.utils import to_categorical
import pickle
from time import time
import string
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from keras.layers.merge import add
from PIL import Image



Using TensorFlow backend.


In [2]:
# collecting caption data from json file

def collectCaption(path):
    """Read the JSON file at ``path`` and return the decoded object."""
    with open(path) as annotation_file:
        return json.load(annotation_file)

In [3]:
# Load the training caption annotations; the path is relative to the working directory.
captions = collectCaption("./Dataset/annotations/captions_train2017.json")

In [4]:
# Show the top-level keys of the loaded annotation object.
print(captions.keys())

dict_keys(['info', 'licenses', 'images', 'annotations'])


In [5]:
# Map each image id (stringified) to its image file name.
id_img = {str(img['id']): img['file_name'] for img in captions['images']}
    

In [6]:
# Spot-check the id -> file-name mapping for one image id.
print(id_img["391895"])

000000391895.jpg


In [7]:
# Group the raw captions by image file name.
description = {}
for annotation in captions['annotations']:
    file_name = id_img[str(annotation['image_id'])]
    caps_for_image = description.setdefault(file_name, [])
    # NOTE(review): `<= 5` lets a sixth caption through; confirm whether a cap of five was intended.
    if len(caps_for_image) <= 5:
        caps_for_image.append(annotation['caption'])
    

In [8]:
# Display the raw (uncleaned) captions collected for one image.
description['000000522418.jpg']

['A woman wearing a net on her head cutting a cake. ',
 'A woman cutting a large white sheet cake.',
 'A woman wearing a hair net cutting a large sheet cake.',
 'there is a woman that is cutting a white cake',
 "A woman marking a cake with the back of a chef's knife. "]

In [9]:
# data cleaning

def clean_text(sent):
    """Normalize one caption.

    Lower-cases the text, replaces every run of characters outside a-z with a
    single space, drops tokens of length one, and returns the remaining tokens
    joined by single spaces.
    """
    lowered = sent.lower()
    letters_only = re.sub("[^a-z]+"," ",lowered)
    kept_tokens = [tok for tok in letters_only.split() if len(tok)>1]
    return " ".join(kept_tokens)

In [10]:
# cleaning description

# Clean every caption in place; slice assignment keeps the same list objects.
for caption_list in description.values():
    caption_list[:] = [clean_text(raw_caption) for raw_caption in caption_list]

In [11]:
# Display the same image's captions after cleaning.
description['000000522418.jpg']

['woman wearing net on her head cutting cake',
 'woman cutting large white sheet cake',
 'woman wearing hair net cutting large sheet cake',
 'there is woman that is cutting white cake',
 'woman marking cake with the back of chef knife']

In [12]:
# with open("discription.txt","w") as f:
#     f.write(str(description))

In [13]:
# Reload the cleaned captions from disk.
# The path uses forward slashes: "Data\discription.txt" contained the invalid
# escape sequence "\d" and only resolved on Windows.
with open("./Data/discription.txt","r") as f:
    raw_description = f.read()

# The file is read as single-quoted dict text; swapping the quotes makes it
# parseable as JSON. This only works while no caption contains a quote
# character (clean_text strips everything outside a-z).
json_acceptable_string = raw_description.replace("'","\"")
description = json.loads(json_acceptable_string)


In [14]:
# Display one image's captions from the mapping parsed from disk.
description['000000522418.jpg']

['woman wearing net on her head cutting cake',
 'woman cutting large white sheet cake',
 'woman wearing hair net cutting large sheet cake',
 'there is woman that is cutting white cake',
 'woman marking cake with the back of chef knife']

In [15]:
# vocab

# Every distinct word that appears in any caption.
vocab = {
    word
    for caps in description.values()
    for sent in caps
    for word in sent.split()
}
print(len(vocab))

26440


In [16]:
# Total number of word occurrences across all descriptions (duplicates kept).
total_words = [
    word
    for caps in description.values()
    for des in caps
    for word in des.split()
]
print(len(total_words))

5210675


In [17]:
# Count how often each word occurs and keep a plain dict copy of the counts.
counter = collections.Counter(total_words)
freq_cnt = dict(counter)
# Number of distinct words.
print(len(freq_cnt))

26440


In [18]:
# Keep only words occurring more than `threshold` times, most frequent first.
threshold = 4
sorted_freq_cnt = [
    (word, count)
    for word, count in sorted(freq_cnt.items(), key=lambda pair: pair[1], reverse=True)
    if count > threshold
]
# From here on `total_words` holds the filtered vocabulary, not every occurrence.
total_words = [word for word, _ in sorted_freq_cnt]


In [19]:
print(len(total_words))

10100


In [20]:
# File names of all training images, in the order they appear in `id_img`.
train_img_id = list(id_img.values())

In [21]:
# Show the first four training image file names.
print(train_img_id[:4])

['000000391895.jpg', '000000522418.jpg', '000000184613.jpg', '000000318219.jpg']


In [22]:
# Number of training images.
len(train_img_id)

118287

In [23]:
# Prepare the descriptions for the training data.
# Each caption is wrapped in a start token "<s>" and an end token "<e>".
train_descriptions = {
    img_id: ["<s> " + cap + " <e>" for cap in description[img_id]]
    for img_id in train_img_id
}

In [24]:
# Display the token-wrapped captions for one image.
train_descriptions["000000391895.jpg"]

['<s> man with red helmet on small moped on dirt road <e>',
 '<s> man riding motor bike on dirt road on the countryside <e>',
 '<s> man riding on the back of motorcycle <e>',
 '<s> dirt path with young person on motor bike rests to the foreground of verdant area with bridge and background of cloud wreathed mountains <e>',
 '<s> man in red shirt and red hat is on motorcycle on hill side <e>']

In [25]:

# IMG_PATH = "D:/programming/Machine learning and Deep learning/Projects/minor1.0/videoCaptioning for blinds/Datasets/coco/train2017/"

In [26]:
# Transfer Learning
# - images-->Features

#Step-1 Download Pre-trained model--resnet-50

# model = ResNet50(weights='imagenet',input_shape=(224,224,3))

In [27]:
# model.summary()

In [28]:
# model.layers[-2].output

In [29]:
# model_new = Model(model.input, model.layers[-2].output)

In [30]:
# def preprocess_img(img):
#     img = image.load_img(img,target_size=(224,224))
#     img = image.img_to_array(img)
#     img = np.expand_dims(img,axis=0)
    
#     #Normalization
    
#     img = preprocess_input(img)
#     return img

In [31]:
# def encode_image(img):
#     img = preprocess_img(img)
#     feature_vect = model_new.predict(img)
#     feature_vect = feature_vect.reshape((-1,))
#     return feature_vect
    

In [32]:
# encode_image(IMG_PATH+ "000000522418.jpg")

In [33]:
# encoding_train = {}
# t0 = time()
# for ix,img_id in enumerate(train_img_id):
#     img_path = IMG_PATH+img_id
#     encoding_train[img_id] = encode_image(img_path)
    
#     if ix%1000==0:
#         print("Encoding in progress time step %d "%ix)
# end_t = time()
# print("total time taken :",end_t-t0)

In [34]:
# store this on disk

# with open("encoded_train.pkl","wb") as f:
#     pickle.dump(encoding_train,f)
    

In [35]:
# Load the stored image feature vectors from disk into memory.
# NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
with open("./Data/encoded_train.pkl","rb") as feature_file:
    encoding_train = pickle.load(feature_file)

In [36]:
# Print the stored feature vector for one image.
print((encoding_train['000000522418.jpg']))

[0.44277993 0.23587297 0.35735822 ... 5.404309   0.3640494  1.1392363 ]


In [37]:
#  preparing caption dataset

In [38]:
# Word <-> index lookup tables. Indices start at 1, so 0 is never assigned to a word.
word_2_idx = {word: idx for idx, word in enumerate(total_words, start=1)}
idx_2_word = dict(enumerate(total_words, start=1))

In [39]:
# Only the last expression of a cell is displayed, so the first lookup is
# evaluated but its result is not shown.
word_2_idx['women']
idx_2_word[185]

'women'

In [40]:
# Number of indexed words so far.
print(len(idx_2_word))

10100


In [41]:
# Register the start/end tokens right after the regular words.
# The indices are derived from the vocabulary size instead of being hard-coded,
# so they stay correct if the frequency threshold or the data changes.
# They are based on `total_words` (not on the current dict sizes) so that
# re-running this cell yields the same indices.
num_regular_words = len(total_words)

word_2_idx['<s>'] = num_regular_words + 1
idx_2_word[num_regular_words + 1] = "<s>"

word_2_idx['<e>'] = num_regular_words + 2
idx_2_word[num_regular_words + 2] = "<e>"

# +1 because index 0 is not assigned to any word.
vocab_size = len(idx_2_word) + 1

In [42]:
# Show the vocabulary size computed above.
print(vocab_size)

10103


In [43]:
# Longest training caption, measured in whitespace-separated tokens
# (start/end tokens included); 0 when there are no captions.
max_len = max(
    (len(cap.split()) for caps in train_descriptions.values() for cap in caps),
    default=0,
)
print(max_len)

49


In [44]:
# data loader(Generator)

In [45]:
def data_generator(train_descriptions,encoding_train,word_2_idx,max_len,batch_size):
    """Endlessly yield training batches built from ``batch_size`` images each.

    For every caption of every image, each prefix of the caption (length 1 up
    to len-1) becomes one sample: the image feature vector, the prefix
    post-padded with zeros to ``max_len``, and the next word one-hot encoded.
    Words missing from ``word_2_idx`` are skipped. The one-hot width comes
    from the module-level ``vocab_size``, which is not a parameter.

    Yields ``[[image_features, padded_prefixes], one_hot_next_words]`` as
    numpy arrays. The image counter is not reset when the dictionary is
    exhausted, so a partial batch carries over into the next pass.
    """
    image_rows, prefix_rows, target_rows = [], [], []
    images_seen = 0

    while True:
        for image_name, image_captions in train_descriptions.items():
            images_seen += 1
            feature_vec = encoding_train[image_name]

            for caption in image_captions:
                token_ids = [word_2_idx[tok] for tok in caption.split() if tok in word_2_idx]

                for split_at in range(1, len(token_ids)):
                    prefix = pad_sequences(
                        [token_ids[:split_at]], maxlen=max_len, value=0, padding='post'
                    )[0]
                    target = to_categorical([token_ids[split_at]], num_classes=vocab_size)[0]

                    image_rows.append(feature_vec)
                    prefix_rows.append(prefix)
                    target_rows.append(target)

            if images_seen == batch_size:
                yield [[np.array(image_rows), np.array(prefix_rows)], np.array(target_rows)]

                image_rows, prefix_rows, target_rows = [], [], []
                images_seen = 0

In [46]:
# Pre-trained word embeddings (GloVe)

In [47]:
# Open the 50-dimensional GloVe vectors file as UTF-8 text.
# NOTE(review): the absolute, machine-specific path makes the notebook non-portable.
# The handle is left open here; the cell that iterates over `f` consumes it.
f = open("D:/programming/Machine learning and Deep learning/Projects/minor1.0/videoCaptioning for blinds/Datasets/glove.6B.50d.txt",encoding='utf8')

In [48]:
# Parse the opened embeddings file into a word -> vector lookup.
# Each line is "<word> <v1> <v2> ...": the first token is the word, the rest
# are its vector components.
embedding_index = {}
try:
    for line in f:
        values = line.split()
        word = values[0]

        word_embeddings = np.array(values[1:], dtype='float')
        embedding_index[word] = word_embeddings
finally:
    # Release the file handle once it has been consumed (or parsing failed);
    # previously it was never closed.
    f.close()
    

In [49]:
def get_embedding_matrix(emb_dim=50):
    """Build the embedding weight matrix for the vocabulary.

    Reads the module-level ``vocab_size``, ``word_2_idx`` and
    ``embedding_index``.

    Parameters
    ----------
    emb_dim : int, optional
        Length of each embedding vector. Defaults to 50, the value that was
        previously hard-coded. It must match the vectors stored in
        ``embedding_index``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, emb_dim)``. Row ``idx`` holds the vector
        of the word mapped to ``idx``; rows for words without a pre-trained
        vector stay all zeros, as does any index not present in
        ``word_2_idx``.
    """
    matrix = np.zeros((vocab_size,emb_dim))
    for word,idx in word_2_idx.items():
        embedding_vector = embedding_index.get(word)

        # Words with no pre-trained vector keep their zero row.
        if embedding_vector is not None:
            matrix[idx] = embedding_vector
    return matrix

In [50]:
# Build the embedding weight matrix for the current vocabulary.
embedding_matrix = get_embedding_matrix()

In [51]:
# Show the matrix shape.
print(embedding_matrix.shape)

(10103, 50)


In [52]:
# Model Architecture

In [53]:
# Image branch: takes a 2048-length feature vector per image, applies dropout
# (rate 0.2), then a 256-unit Dense layer with no activation specified.
# NOTE(review): presumably the vectors stored in `encoding_train` — confirm.

input_img_fearures = Input(shape=(2048,))
inp_img1 = Dropout(0.2)(input_img_fearures)
inp_img2 = Dense(256)(inp_img1)

W1007 02:26:31.418581 75684 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1007 02:26:31.471668 75684 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1007 02:26:31.484382 75684 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W1007 02:26:31.500951 75684 deprecation.py:506] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will b

In [54]:
# Caption branch: a sequence of `max_len` word indices is embedded into
# 50-dimensional vectors (index 0 is masked via mask_zero=True), passed
# through dropout (rate 0.2) and fed to a 256-unit LSTM.
# The output_dim of 50 must match the width of the embedding matrix loaded later.
input_captions = Input(shape=(max_len,))
inp_cap1 = Embedding(input_dim=vocab_size,output_dim=50, mask_zero=True)(input_captions)
inp_cap2 = Dropout(0.2)(inp_cap1)
inp_cap3 = LSTM(256)(inp_cap2)

W1007 02:26:32.035727 75684 deprecation.py:323] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [55]:
# Decoder: merge the two 256-unit branch outputs with `add`, apply a 256-unit
# ReLU Dense layer, then a softmax over `vocab_size` classes.
decoder1 = add([inp_img2,inp_cap3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size,activation='softmax')(decoder2)

In [56]:
# Combined model: inputs are [image features, caption prefix], in that order;
# the output is the softmax layer defined above.
model = Model(inputs=[input_img_fearures,input_captions], outputs=outputs)


In [57]:
# Load the prepared embedding matrix into the layer at index 2 and freeze it.
# NOTE(review): index 2 is expected to be the Embedding layer (the summary
# printed by this cell lists it third); the position depends on how Keras
# orders the layers, so confirm it after any change to the architecture.
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

model.summary()

W1007 02:26:32.110849 75684 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 49)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 49, 50)       505150      input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 2048)         0           input_1[0][0]                    
__________________________________________________________________________________________________
dropout_2 

In [58]:
# Compile the model with categorical cross-entropy loss (targets are one-hot
# vectors) and the Adam optimizer at its default settings.
model.compile(loss='categorical_crossentropy', optimizer='adam')

W1007 02:26:37.467180 75684 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [None]:
# Persist the wrapped captions and the word -> index table as their str() text.
for out_path, payload in (
    ("train_discriptions.txt", train_descriptions),
    ("word_2_idx", word_2_idx),
):
    with open(out_path,"w") as f:
        f.write(str(payload))

In [59]:
## Training of the model

epochs = 120
batch_size = 30
# One step consumes `batch_size` images, so this covers the data set once per epoch.
steps = len(train_descriptions)//batch_size

# Record the start time so the end timestamp below can be turned into a
# duration (previously only `end_t` was captured and never used).
t0 = time()
for i in range(epochs):
    # A fresh generator per epoch restarts iteration from the first image.
    generator = data_generator(train_descriptions,encoding_train,word_2_idx,max_len,batch_size)
    model.fit_generator(generator,epochs=1,steps_per_epoch=steps,verbose=1)
    # Checkpoint after every epoch; the target directory must already exist.
    model.save(('./Data/model_weights/model_'+str(i)+'.h5'))
end_t = time()
print("total time taken :",end_t-t0)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1

KeyboardInterrupt: 