In [0]:
import tensorflow as tf
from os import listdir
import pickle
from tqdm import tqdm
from pickle import dump
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Conv2D, Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD
import cv2, numpy as np
from keras.models import load_model
from keras.models import Model
import string
import os

Using TensorFlow backend.


In [0]:
# Mount Google Drive inside the Colab runtime (prompts for an authorization code)
from google.colab import drive
drive.mount('/content/gdrive')
# Drive folder used as the base for the result files written by this notebook
PATH_DRIVE = "/content/gdrive/My Drive/CV_project/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Result files kept on Google Drive, all located directly under PATH_DRIVE:
#   - pickled features computed with the VGG16 model
#   - pickled features computed with our own model
#   - the cleaned captions
path_features_vgg, path_features_our, path_captions = (
    PATH_DRIVE + file_name
    for file_name in ('features_vgg.pkl', 'features_our.pkl', 'captions.txt')
)

In [0]:
# Download the Flickr8k images and caption files into the working directory

name_of_zip = 'Flickr8k_Dataset.zip'
work_dir = os.path.abspath('.')

if os.path.exists(work_dir + '/' + name_of_zip):
  # archive already present: point directly at the image folder
  PATH = work_dir + '/Flicker8k_Dataset'
else:
  # fetch the image archive; extract=True asks get_file to unpack it as well
  image_zip = tf.keras.utils.get_file(
      name_of_zip,
      origin='http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_Dataset.zip',
      cache_subdir=work_dir,
      extract=True)
  PATH = os.path.dirname(image_zip) + '/Flicker8k_Dataset'

# the caption archive is always requested through get_file
captions_zip = tf.keras.utils.get_file(
    'Flickr8k_text.zip',
    origin='http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_text.zip',
    cache_subdir=work_dir,
    extract=True)

# text files expected next to the downloaded caption archive
captions_dir = os.path.dirname(captions_zip)
captions_token = captions_dir + '/Flickr8k.token.txt'
captions_trainImages = captions_dir + '/Flickr_8k.trainImages.txt'
captions_devImages = captions_dir + '/Flickr_8k.devImages.txt'

Downloading data from http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_Dataset.zip
Downloading data from http://nlp.cs.illinois.edu/HockenmaierGroup/Framing_Image_Description/Flickr8k_text.zip


In [0]:
# Fetch the VGG weight file into the working directory (skipped when it is already there)
name_weight = "vgg_weights.h5"
if not os.path.exists(os.path.abspath('.') + '/' + name_weight):
  # NOTE(review): the origin is a Google Drive *viewer* link and the recorded
  # run fetched only ~139 kB, so this probably stores an HTML page instead of
  # the .h5 weights -- confirm and replace with a direct-download link.
  model= tf.keras.utils.get_file(name_weight,
                                      cache_subdir=os.path.abspath('.'),
                                      origin = 'https://drive.google.com/file/d/0Bz7KyqmuGsilT0J5dmRCM0ROVHc/view?usp=sharing',
                                      extract = False)
  # `model` holds the path of the downloaded file; path_weight is its folder
  path_weight = os.path.dirname(model)
else:
  # Same folder as the download branch yields. The previous value pointed at
  # a misspelled, unrelated '/Ficker8k_Dataset' directory.
  path_weight = os.path.abspath('.')

Downloading data from https://drive.google.com/file/d/0Bz7KyqmuGsilT0J5dmRCM0ROVHc/view?usp=sharing
 139264/Unknown - 0s 0us/step

In [0]:
# Hyper-parameters of our CNN

# optimizer handed to model.compile
my_optimizer = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)

# available losses; the first entry is the one in use
loss_functions = ['categorical_crossentropy', 'mean_squared_error', 'mean_absolute_error', 'squared_hinge']
my_loss = loss_functions[0]

# available activations; the first two entries are the ones in use
activation_functions = ['relu','softmax','sigmoid','elu']
my_activation, my_activation_1 = activation_functions[:2]

# max-pooling window and stride share the same value
my_pool_size = my_stride = (2, 2)

# Number of filters per convolutional block.
# Careful: the model has 5 convolutional blocks, so exactly 5 values are needed.
receptive_fields = [64, 128, 256, 512, 512]

# Width of the dense layers.
# Careful: exactly 3 values are needed here.
dense_values = [4096, 4096, 1000]

# rate passed to the Dropout layer
dropout = 0.5

In [0]:
# Sanity-check the configuration: a list of the wrong length is replaced by
# the built-in defaults instead of being used.

# exactly 5 receptive field values are required
receptive_fields = receptive_fields if len(receptive_fields) == 5 else [64, 128, 256, 512, 512]

# exactly 3 dense values are required
dense_values = dense_values if len(dense_values) == 3 else [4096, 4096, 1000]


In [0]:
# define the models

#create the model
def create_model():
    """Assemble the convolutional network used as our own model.

    The layout is driven by the module-level settings: five convolutional
    blocks holding 2, 2, 3, 3 and 3 zero-padded 3x3 convolutions with
    ``receptive_fields[i]`` filters, each block closed by a max-pooling
    layer, followed by Flatten, Dense(dense_values[0]), Dropout and
    Dense(dense_values[1]).

    The trailing Dropout + Dense(dense_values[2], my_activation_1) pair is
    deliberately not added.

    Returns:
        The Sequential model (not compiled) expecting 224x224x3 inputs.
    """
    # number of padded convolutions in each of the five blocks
    convs_per_block = (2, 2, 3, 3, 3)

    net = Sequential()
    is_first_layer = True
    for block_idx, conv_count in enumerate(convs_per_block):
        filters = receptive_fields[block_idx]
        for _ in range(conv_count):
            if is_first_layer:
                # only the very first layer declares the input shape
                net.add(ZeroPadding2D((1,1),input_shape=(224,224,3)))
                is_first_layer = False
            else:
                net.add(ZeroPadding2D((1,1)))
            net.add(Conv2D(filters, (3, 3), activation=my_activation))
        net.add(MaxPooling2D(pool_size=my_pool_size, strides=my_stride))

    # fully connected part
    net.add(Flatten())
    net.add(Dense(dense_values[0], activation=my_activation))
    net.add(Dropout(dropout))
    net.add(Dense(dense_values[1], activation=my_activation))

    return net


# create our own model
def load_our_model():
  """Build our own CNN and wrap it as a feature extractor.

  Returns:
    A keras Model mapping the network input to the output of the last
    layer added by create_model().
  """
  model = create_model()
  model.compile(optimizer=my_optimizer, loss=my_loss)

  # create_model() does not add a classification layer, so its last layer is
  # the feature layer. The former `model.layers.pop()` was dropped: depending
  # on the Keras version it either acts on a copy of the list (no effect) or
  # removes the feature layer, so the selected output was not well defined.
  feature_layer = model.layers[-1]
  print(feature_layer)
  model = Model(inputs=model.inputs, outputs=feature_layer.output)

  # print the architecture of our model
  print("\n our model structure : ")
  # summary() prints by itself and returns None, so it is not wrapped in print()
  model.summary()

  return model

# load a premade model
def load_vgg_model():
  """Load the Keras VGG16 network and cut off its final layer.

  Returns:
    A keras Model mapping the VGG16 input to the output of its
    second-to-last layer.
  """
  # get the vgg model from Keras
  model = VGG16()

  # Use the layer before the final one as output. Indexing with -2 replaces
  # the former `model.layers.pop()`: where `layers` returns a copy of the
  # list, pop() changes nothing and the classifier output would be used.
  print(model.layers[-1])
  model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

  # print the architecture of the vgg16 model
  print("\n Vgg16 model structure : ")
  # summary() prints by itself and returns None, so it is not wrapped in print()
  model.summary()

  return model

In [0]:
# Prepare the Photo Data by extracting the features

# extract features from the pictures of the Flickr Dataset
def extract_features(directory, model):
  """Run every file of ``directory`` through ``model`` and collect the outputs.

  Args:
    directory: folder to scan; every entry is loaded as an image.
    model: object exposing ``predict``; it is called on one image at a time.

  Returns:
    dict mapping the image id (file name up to its first '.') to the value
    returned by ``model.predict`` for that image.
  """
  filenames = listdir(directory)
  print("number of images : ",len(filenames))

  features = dict()
  # tqdm wraps the iteration to display the progress
  for filename in tqdm(filenames):
    # load the picture resized to 224x224 and turn it into an array
    pixels = img_to_array(load_img(directory + '/' + filename, target_size=(224, 224)))

    # prepend a batch axis of size 1, then apply the VGG16 preprocessing
    rows, cols, channels = pixels.shape[0], pixels.shape[1], pixels.shape[2]
    batch = preprocess_input(pixels.reshape((1, rows, cols, channels)))

    features[filename.split('.')[0]] = model.predict(batch, verbose=0)
  return features

In [0]:
# With the predefined model VGG16

model_vgg = load_vgg_model()

# Reuse the cached features when the pickle already exists, otherwise compute
# and cache them.
if os.path.exists(path_features_vgg):
  # NOTE: pickle.load can execute arbitrary code; only load a cache file that
  # was written by this notebook.
  with open(path_features_vgg, 'rb') as f:
    features_vgg = pickle.load(f)
else:
  # extract the features
  features_vgg = extract_features(PATH, model_vgg)
  print('Extracted Features: %d' % len(features_vgg))

  # save to a new file; the with-block closes the handle so the data is flushed
  with open(path_features_vgg, 'wb') as f:
    dump(features_vgg, f)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5
<keras.layers.core.Dense object at 0x7f6d4b45a588>

 Vgg16 model structure : 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
__________________________________________________________

In [0]:
# With our own model

model_our = load_our_model()

# Reuse the cached features when the pickle already exists, otherwise compute
# and cache them.
if os.path.exists(path_features_our):
  # NOTE: pickle.load can execute arbitrary code; only load a cache file that
  # was written by this notebook.
  with open(path_features_our, 'rb') as f:
    features_our = pickle.load(f)
else:
  # extract the features
  features_our = extract_features(PATH, model_our)
  print('Extracted Features: %d' % len(features_our))

  # save to file; the with-block closes the handle so the data is flushed
  with open(path_features_our, 'wb') as f:
    dump(features_our, f)

<keras.layers.core.Dense object at 0x7f6d47f22c88>

 our model structure : 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
zero_padding2d_1_input (Inpu (None, 224, 224, 3)       0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 226, 226, 3)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 224, 224, 64)      1792      
_________________________________________________________________
zero_padding2d_2 (ZeroPaddin (None, 226, 226, 64)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 224, 224, 64)      36928     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 112, 112, 64)      0         
_________________________________________________________________


In [0]:
# Caption preparation starts here.

# Load the whole caption token file (path stored in captions_token) as one string
with open(captions_token) as token_file:
    doc = token_file.read()

In [0]:
# load captions : build {image_id: [caption, ...]} from the token file content
captions = dict()
for line in doc.split('\n'):
  tokens = line.split()
  # A usable line needs an image id AND at least one caption word. Testing the
  # token count (instead of the raw line length) also rejects whitespace-only
  # lines, which used to raise IndexError on tokens[0], and id-only lines,
  # which used to add an empty caption.
  if len(tokens) < 2:
    continue
  image_id, image_capt = tokens[0], tokens[1:]
  # keep only the part of the first token that precedes its first '.'
  image_id = image_id.split('.')[0]
  captions.setdefault(image_id, []).append(' '.join(image_capt))

# the dict is keyed by image id, so this prints the number of distinct ids
print('Number of loaded captions: %d' % len(captions))

# clean the captions in place: lower-case every word, strip punctuation, then
# drop one-character words and words that are not purely alphabetic
table = str.maketrans('', '', string.punctuation)
for key, capt_list in captions.items():
  for i, capt in enumerate(capt_list):
    words = (word.lower().translate(table) for word in capt.split())
    capt_list[i] = ' '.join(word for word in words if len(word) > 1 and word.isalpha())

Number of loaded captions: 8092


In [0]:
# Gather every distinct word of the cleaned captions and report how many there are
vocab = set()
for capt_list in captions.values():
  for capt in capt_list:
    vocab.update(capt.split())
print('Clean Vocabulary : %d' % len(vocab))

Clean Vocabulary : 8763


In [0]:
# save the captions to a file, one "<image_id> <caption>" entry per line
lines = [key + ' ' + desc
         for key, desc_list in captions.items()
         for desc in desc_list]
data = '\n'.join(lines)
# the with-block closes the file even when the write fails
with open(path_captions, 'w') as file:
  file.write(data)