In [43]:
import numpy as np
import pandas as pd
import json
import re
import collections
from keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from keras.preprocessing import image
from keras.models import Model, load_model
from keras.utils import to_categorical
import pickle
from time import time
import string
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Dropout, Embedding, LSTM
from keras.layers.merge import add
from PIL import Image

In [13]:
# collecting caption data from json file

def collectCaption(path):
    with open(path) as f:
        captions = json.load(f)
    return captions

In [14]:
captions = collectCaption("./Dataset/annotations/captions_train2017.json")

In [15]:
print(captions.keys())

dict_keys(['info', 'licenses', 'images', 'annotations'])


In [16]:
# mapping ids to its images.
id_img = {}
for x in captions['images']:
#     c +=1
    id_img[str(x['id'])] = x['file_name']
#     if c==10:
    

In [17]:
print(id_img["391895"])

000000391895.jpg


In [18]:
description = {}
for anno in captions['annotations']:
    img_id = str(anno['image_id'])
    cap = anno['caption']
    
    img_name = id_img[img_id]
    if description.get(img_name) is None:
        description[img_name] = []
    if len(description[img_name]) <= 5:
        description[img_name].append(cap)
    

In [19]:
description['000000522418.jpg']

['A woman wearing a net on her head cutting a cake. ',
 'A woman cutting a large white sheet cake.',
 'A woman wearing a hair net cutting a large sheet cake.',
 'there is a woman that is cutting a white cake',
 "A woman marking a cake with the back of a chef's knife. "]

In [20]:
# data cleaning

def clean_text(sent):
    sent = sent.lower()
    sent = re.sub("[^a-z]+"," ",sent)
    sent = sent.split()
    
    sent = [s for s in sent if len(s)>1]
    sent = " ".join(sent)
    return sent

In [21]:
# cleaning description

for key,caption_list in description.items():
    for i in range(len(caption_list)):
        caption_list[i] = clean_text(caption_list[i])

In [22]:
description['000000522418.jpg']

['woman wearing net on her head cutting cake',
 'woman cutting large white sheet cake',
 'woman wearing hair net cutting large sheet cake',
 'there is woman that is cutting white cake',
 'woman marking cake with the back of chef knife']

In [23]:
with open("discription.txt","w") as f:
    f.write(str(description))

In [24]:
# create a vocab
description = None
with open("Data\discription.txt","r") as f:
    description = f.read()
json_acceptable_string = description.replace("'","\"")
description = json.loads(json_acceptable_string)


In [25]:
description['000000522418.jpg']

['woman wearing net on her head cutting cake',
 'woman cutting large white sheet cake',
 'woman wearing hair net cutting large sheet cake',
 'there is woman that is cutting white cake',
 'woman marking cake with the back of chef knife']

In [26]:
# vocab

vocab = set()
for key in description.keys():
    [vocab.update(sent.split()) for sent in description[key]]
print(len(vocab))

26440


In [27]:
# total no. of words accross the descriptionabs
total_words = []

for key in description.keys():
    [total_words.append(i) for des in description[key] for i in des.split()]
print(len(total_words))

5210675


In [28]:
counter = collections.Counter(total_words)
freq_cnt = dict(counter)
print(len(freq_cnt))

26440


In [29]:
sorted_freq_cnt = sorted(freq_cnt.items(),reverse=True, key=lambda x:x[1])

#filter
threshold = 4
sorted_freq_cnt = [x for x in sorted_freq_cnt if x[1]>threshold]
total_words = [x[0] for x in sorted_freq_cnt]


In [30]:
print(len(total_words))

10100


In [31]:
# creating train images
train_img_id = []
for key,img in id_img.items():
    train_img_id.append(img)

In [45]:
print(train_img_id[:4])

['000000391895.jpg', '000000522418.jpg', '000000184613.jpg', '000000318219.jpg']


In [32]:
len(train_img_id)

118287

In [33]:
# prepare Description for the Training Data
# Tweak - Add <s> and <e> toen to our traing data

train_descriptions = {}

for img_id in train_img_id:
    train_descriptions[img_id] = []
    for cap in description[img_id]:
        cap_to_append = "<s> " + cap + " <e>"
        train_descriptions[img_id].append(cap_to_append)

In [34]:
train_descriptions["000000391895.jpg"]

['<s> man with red helmet on small moped on dirt road <e>',
 '<s> man riding motor bike on dirt road on the countryside <e>',
 '<s> man riding on the back of motorcycle <e>',
 '<s> dirt path with young person on motor bike rests to the foreground of verdant area with bridge and background of cloud wreathed mountains <e>',
 '<s> man in red shirt and red hat is on motorcycle on hill side <e>']

In [35]:
IMG_PATH = "D:/programming/Machine learning and Deep learning/Projects/minor1.0/videoCaptioning for blinds/Datasets/coco/train2017/"

In [36]:
# Transfer Learning
# - images-->Features

#Step-1 Download Pre-trained model--resnet-50

model = ResNet50(weights='imagenet',input_shape=(224,224,3))

W0908 11:26:36.241136 135004 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0908 11:26:36.257725 135004 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0908 11:26:36.263581 135004 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:4185: The name tf.truncated_normal is deprecated. Please use tf.random.truncated_normal instead.

W0908 11:26:36.294807 135004 deprecation_wrapper.py:119] From C:\Users\asus\Anaconda3\envs\ML_GPU\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead

In [37]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 112, 112, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

In [38]:
model.layers[-2].output

<tf.Tensor 'avg_pool/Mean:0' shape=(?, 2048) dtype=float32>

In [39]:
model_new = Model(model.input, model.layers[-2].output)

In [47]:
def preprocess_img(img):
    img = image.load_img(img,target_size=(224,224))
    img = image.img_to_array(img)
    img = np.expand_dims(img,axis=0)
    
    #Normalization
    
    img = preprocess_input(img)
    return img

In [48]:
def encode_image(img):
    img = preprocess_img(img)
    feature_vect = model_new.predict(img)
    feature_vect = feature_vect.reshape((-1,))
    return feature_vect
    

In [44]:
encode_image(IMG_PATH+ "000000522418.jpg")

(2048,)


array([0.44277993, 0.23587297, 0.35735822, ..., 5.404309  , 0.3640494 ,
       1.1392363 ], dtype=float32)

In [49]:
encoding_train = {}
t0 = time()
for ix,img_id in enumerate(train_img_id):
    img_path = IMG_PATH+img_id
    encoding_train[img_id] = encode_image(img_path)
    
    if ix%1000==0:
        print("Encoding in progress time step %d "%ix)
end_t = time()
print("total time taken :",end_t-t0)

Encoding in progress time step 0 
Encoding in progress time step 1000 
Encoding in progress time step 2000 
Encoding in progress time step 3000 
Encoding in progress time step 4000 
Encoding in progress time step 5000 
Encoding in progress time step 6000 
Encoding in progress time step 7000 
Encoding in progress time step 8000 
Encoding in progress time step 9000 
Encoding in progress time step 10000 
Encoding in progress time step 11000 
Encoding in progress time step 12000 
Encoding in progress time step 13000 
Encoding in progress time step 14000 
Encoding in progress time step 15000 
Encoding in progress time step 16000 
Encoding in progress time step 17000 
Encoding in progress time step 18000 
Encoding in progress time step 19000 
Encoding in progress time step 20000 
Encoding in progress time step 21000 
Encoding in progress time step 22000 
Encoding in progress time step 23000 
Encoding in progress time step 24000 
Encoding in progress time step 25000 
Encoding in progress time

In [50]:
# store this on disk

with open("encoded_train.pkl","wb") as f:
    pickle.dump(encoding_train,f)
    

In [55]:
print(len(encoding_train['000000522418.jpg']))

2048
