In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM,Conv2D, MaxPool2D, BatchNormalization,Dropout, Embedding
import numpy as np
import os
import json
import nltk
from collections import Counter
import cv2
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
#from vgg_base import get_features

In [2]:
#load data
with open(os.getcwd() + '/CUHK-PEDES/CUHK-PEDES/caption_all.json', 'r') as f:
    data = json.load(f)

In [3]:
capts=[]
img_paths=[]
for val in data:
    cap=val['captions']
    temp=" ".join(cap)
    if len(temp.split())<100:
        capts.append(temp)
        img_paths.append(os.getcwd() + '/CUHK-PEDES/CUHK-PEDES/imgs/'+val['file_path'])


In [4]:
vocabs=['UNK','PAD']
counts=dict(Counter(" ".join(capts).split(' ')))
for w, c in zip(counts.keys(), counts.values()):
    if c>50:vocabs.append(w)
int2vocab=dict(enumerate(vocabs))
vocab2int={i:c for i, c in zip(int2vocab.values(), int2vocab.keys())}

In [5]:
caption_ids=[[vocab2int.get(word, 0) for word in val.split()]for val in capts]
caption_ids_pad=tf.keras.preprocessing.sequence.pad_sequences(caption_ids, padding='post', value=vocab2int['PAD'])
#img_paths=[]
#for val in data:
 #   img_paths.append(os.getcwd() + '/CUHK-PEDES/CUHK-PEDES/imgs/'+val['file_path'])

In [6]:
epochs=20
num_units=15
batch_size=10
vocab_size=len(vocab2int)
max_len=99


In [7]:

#bce = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction='none')
#real = np.array([[[1., 0., 0.], [0., 1., 0], [0., 0., 1.]], [[0., 1., 0.], [0., 0., 1.], [1., 0., 0.]]])
#pred= np.array([[[0.7,0.3, 0.1], [0.5,0.6,0.2], [0.4,0.3,0.8],[1,2,3]], [[0.9,0.6,0.7],[0.5,0.3,0.8],[0.1,0.5,0.6],[2,3,4]]])
#loss=bce(real, pred)
#print('Loss: ', tf.reduce_sum(loss, axis=1) )  # Loss: 11.5228
#p  = tf.concat([real,pred], axis=1)
#print(p.shape)

In [8]:
img_train, img_val, capt_train, capt_val = train_test_split(img_paths, caption_ids_pad, test_size=0.6)
img_train_true = img_train[:len(img_train)//2]

img_train_false = img_train[len(img_train)//2:]
img_train_false = shuffle(img_train_false)

img_train_final=img_train_true+img_train_false
true_false_labels = [[1]*(len(img_train_true)) + [0]*(len(img_train_true))]
true_false_labels = np.array(true_false_labels).ravel()

img_train_temp = img_train_final[:250] + img_train_final[len(img_train_final)-250:]
capt_temp = np.append(capt_train[:250], capt_train[len(capt_train)-250:], axis=0)
labels_temp=np.append(true_false_labels[:250], true_false_labels[len(true_false_labels)-250:], axis=0)
dataset = tf.data.Dataset.from_tensor_slices((img_train_temp, capt_temp, labels_temp)).shuffle(len(img_train_temp))
dataset=dataset.batch(batch_size)

In [9]:
#custom vgg
'''
class vgg_base(tf.keras.Model):
    def __init__(self):
        super(vgg_base, self).__init__()
        self.conv64_2=tf.keras.Sequential([Conv2D(64, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                           Dropout(rate=0.2),
                                       Conv2D(64, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                            Dropout(rate=0.3),
                                          MaxPool2D()])
        self.conv128_2=tf.keras.Sequential([Conv2D(128, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                            Dropout(rate=0.2),
                                       Conv2D(128, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                             Dropout(rate=0.3),
                                          MaxPool2D()])
        self.conv256_3=tf.keras.Sequential([Conv2D(256, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                       Conv2D(256, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                            Dropout(rate=0.4),
                                        Conv2D(256, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                       MaxPool2D()])
        self.conv512_3_1=tf.keras.Sequential([Conv2D(512, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                       Conv2D(512, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                        Dropout(rate=0.6),
                                        Conv2D(512, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                       MaxPool2D()])
        self.conv512_3_2=tf.keras.Sequential([Conv2D(512, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                       Conv2D(512, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                        Dropout(rate=0.7),
                                        Conv2D(512, kernel_size=(3,3), strides=1, activation=tf.nn.relu),
                                       MaxPool2D()])
        self.dropout1 = Dropout(rate=0.3)
        self.dropout2 = Dropout(rate=0.3)
        self.dropout3 = Dropout(rate=0.4)
        self.fc1=Dense(256)
        self.fc2=Dense(512)
        self.fc3=Dense(1024)
        self.fc =Dense(199)
        self.fc4=Dense(2500)

        
    def call(self, x):
        #print('running CNN..')
        x=self.conv64_2(x)
        #print(x.shape)
        x=self.conv128_2(x)
        #print(x.shape)
        x=self.conv256_3(x)
        #print(x.shape)
        #x=self.conv512_3_1(x)
        #x=self.conv512_3_2(x)
        fc1_=self.fc1(x)
        fc1_=self.dropout1(fc1_)
        fc2_=self.fc2(fc1_)
        fc2_=self.dropout2(fc2_)
        fc3_=self.fc3(fc2_)
        fc3_=self.dropout3(fc3_)
        fc4_=self.fc4(fc3_)
        #print(fc2_.shape)
        fc4=tf.nn.softmax(self.fc(fc3_))
        
        return fc4_, fc4
         
vgg= vgg_base()
'''

"\nclass vgg_base(tf.keras.Model):\n    def __init__(self):\n        super(vgg_base, self).__init__()\n        self.conv64_2=tf.keras.Sequential([Conv2D(64, kernel_size=(3,3), strides=1, activation=tf.nn.relu),\n                                           Dropout(rate=0.2),\n                                       Conv2D(64, kernel_size=(3,3), strides=1, activation=tf.nn.relu),\n                                            Dropout(rate=0.3),\n                                          MaxPool2D()])\n        self.conv128_2=tf.keras.Sequential([Conv2D(128, kernel_size=(3,3), strides=1, activation=tf.nn.relu),\n                                            Dropout(rate=0.2),\n                                       Conv2D(128, kernel_size=(3,3), strides=1, activation=tf.nn.relu),\n                                             Dropout(rate=0.3),\n                                          MaxPool2D()])\n        self.conv256_3=tf.keras.Sequential([Conv2D(256, kernel_size=(3,3), strides=1, activation

In [10]:
VGG_MEAN = [103.939, 116.779, 123.68]


# define input layer
input_layer = tf.keras.layers.Input([224, 224, 3])

red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=input_layer)
bgr = tf.concat(axis=3, values=[blue - VGG_MEAN[0], green - VGG_MEAN[1], red - VGG_MEAN[2]])

# Block 1
conv1_1 = tf.keras.layers.Conv2D(filters=64, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv1_1')(bgr)

conv1_2 = tf.keras.layers.Conv2D(filters=64, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv1_2')(conv1_1)
pool1_1 = tf.nn.max_pool(conv1_2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1_1')

# Block 2
conv2_1 = tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv2_1')(pool1_1)
conv2_2 = tf.keras.layers.Conv2D(filters=128, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv2_2')(conv2_1)
pool2_1 = tf.nn.max_pool(conv2_2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool2_1')

# Block 3
conv3_1 = tf.keras.layers.Conv2D(filters=256, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv3_1')(pool2_1)
conv3_2 = tf.keras.layers.Conv2D(filters=256, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv3_2')(conv3_1)
conv3_3 = tf.keras.layers.Conv2D(filters=256, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv3_3')(conv3_2)
pool3_1 = tf.nn.max_pool(conv3_3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool3_1')

# Block 4
conv4_1 = tf.keras.layers.Conv2D(filters=512, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv4_1')(pool3_1)
conv4_2 = tf.keras.layers.Conv2D(filters=512, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv4_2')(conv4_1)
conv4_3 = tf.keras.layers.Conv2D(filters=512, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv4_3')(conv4_2)
pool4_1 = tf.nn.max_pool(conv4_3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool4_1')

# Block 4
conv5_1 = tf.keras.layers.Conv2D(filters=512, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv5_1')(pool4_1)
conv5_2 = tf.keras.layers.Conv2D(filters=512, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv5_2')(conv5_1)
conv5_3 = tf.keras.layers.Conv2D(filters=512, kernel_size=[3, 3], strides=[1, 1], padding='same',
                                 use_bias=True, activation='relu', name='conv5_3')(conv5_2)
pool5_1 = tf.nn.max_pool(conv5_3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool5_1')

flatten = tf.keras.layers.Flatten()(pool5_1)
fc6 = tf.keras.layers.Dense(units=4096, use_bias=True, name='fc6', activation='relu')(flatten)
fc7 = tf.keras.layers.Dense(units=4096, use_bias=True, name='fc7', activation='relu')(fc6)
fc8 = tf.keras.layers.Dense(units=1000, use_bias=True, name='fc8', activation=None)(fc7)


#prob = tf.nn.softmax(fc8)

# Build model
vgg = tf.keras.Model(input_layer, fc8)
weighs = np.load("./vgg16.npy", encoding='latin1', allow_pickle=True).item()
for layer_name in weighs.keys():
    layer = vgg.get_layer(layer_name)
    layer.set_weights(weighs[layer_name])

In [11]:
class model(tf.keras.Model):
    def __init__(self, num_units, batch_size, max_len):
        super(model, self).__init__()
        self.num_units=num_units
        self.max_len=max_len
        self.batch_size=batch_size
        self.lstm=LSTM(num_units, return_sequences=True, return_state=True)
        self.fc1=Dense(self.max_len, activation='relu')
        self.fc2=Dense(self.max_len)
        self.capt_fc=Dense(32)
        self.img_fc1=Dense(512)
        self.img_fc2=Dense(512)
        self.word_fc=Dense(self.max_len, activation='relu')
        
    def call(self, x, img_batch, state):
        capt = self.capt_fc(x) #bsxmaxlenx15
        img_features = vgg(img_batch)#bsx1000
        im_fc1=self.img_fc1(img_features)
        im_fc2=self.img_fc2(im_fc1) #bsx512
        img_fc=tf.concat([im_fc1, im_fc2], axis=1) #bsx1024
        vis_units=tf.reshape(img_fc, shape=(batch_size, 32, 32))
        im_exp=tf.expand_dims(im_fc2, axis=1) #bsx1x512
        #im_per_word=tf.keras.backend.repeat_elements(im_exp, max_len, axis=1) #bsxmaxlenx512
        input_concat = tf.concat([capt, vis_units], 1) #bsx1024x512
        out, state1, state2=self.lstm(input_concat) 
        state_concat=tf.concat([state1, state2], axis=1) #bsx2*numunit
        
        #state_expand=tf.expand_dims(state_concat, axis=1) #bsx1x 2*num
        #state_per_word=tf.keras.backend.repeat_elements(state_expand, max_len, axis=1) #bsxmaxlenx2*num
        
        word_gate=self.word_fc(state_concat)
        word_gate=tf.nn.sigmoid(word_gate)
        word_gate=tf.expand_dims(word_gate, axis=2) #bsxmaxlenx1
        
        attention=tf.nn.relu(self.fc1(state_concat)) #bsxmaxlen
        score=tf.nn.softmax(attention) #bsxmaxlenx1 
        score=tf.expand_dims(score, axis=2)
        
        img_softmax=tf.nn.softmax(im_exp, axis=2) #bsx1x512
        
        word_img_affinity = word_gate*score*img_softmax #bsxmaxlenx512
        sentence_level_affinity=tf.nn.softmax(tf.reduce_sum(tf.reduce_sum(word_img_affinity, axis=2), axis=1))
        
        return (state1, state2), sentence_level_affinity

    def get_initial_state(self):
        return (tf.zeros((self.batch_size, self.num_units)), tf.zeros((self.batch_size, self.num_units)))
    
network = model(num_units, batch_size, max_len)
        

In [12]:
#sample checkings
'''
datas=next(dataset.__iter__())
img_arr=[]
capts = tf.one_hot(datas[1], depth=vocab_size, dtype=tf.int32)
for path in datas[0]:img_arr.append(load_image(path.numpy()))
img_arr = tf.convert_to_tensor(img_arr)
'''

'\ndatas=next(dataset.__iter__())\nimg_arr=[]\ncapts = tf.one_hot(datas[1], depth=vocab_size, dtype=tf.int32)\nfor path in datas[0]:img_arr.append(load_image(path.numpy()))\nimg_arr = tf.convert_to_tensor(img_arr)\n'

In [13]:
optm=tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_object=tf.losses.BinaryCrossentropy(from_logits=True, reduction='none')
def loss_function(real, pred):
    #mask=tf.math.logical_not(tf.math.equal(real, 0))
    loss_=loss_object(real, pred)
    #mask_=tf.cast(mask, dtype=loss_.dtype)
    #loss_*=mask_
    
    return tf.reduce_mean(loss_)
    

In [14]:
def load_image(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_image(img, channels=3)
    img = tf.image.resize(img, (224,224))
    return img/255


In [15]:
@tf.function
def train_step(optm, img_batch, capt_batch, labels, states):
    with tf.GradientTape() as tape:
        states, affinity = network(capt_batch, img_batch, states)
        loss = loss_function(labels, affinity)
        variables = network.trainable_variables
        gradients = tape.gradient(loss, variables)
        optm.apply_gradients(zip(gradients, variables))
    return loss

In [16]:
for epoch in range(epochs):
    initial_state = network.get_initial_state()
    total_loss=0
    for batch, (img_path_batch, capt_batch, labels) in enumerate(dataset):
        img_batch=[]
        for img in img_path_batch:img_batch.append(load_image(img.numpy()))
        img_batch=tf.convert_to_tensor(img_batch, dtype=tf.float32)
        capt_batch=tf.one_hot(capt_batch, depth=vocab_size)
        cur_loss = train_step(optm,img_batch, capt_batch, labels, initial_state )
        total_loss+=cur_loss
        
        if (batch+1)%10==0:
            print('epoch : {}  batch : {} loss : {:.4f}'.format(epoch+1, batch+1, cur_loss))
            
    if (epoch+1)%1==0:
        print('\n')
        print('epoch : {} loss : {:.4f}'.format(epoch+1, total_loss/(len(img_train_temp)/batch_size )))
            
        

epoch : 1  batch : 10 loss : 0.7144
epoch : 1  batch : 20 loss : 0.6944
epoch : 1  batch : 30 loss : 0.7044
epoch : 1  batch : 40 loss : 0.6744
epoch : 1  batch : 50 loss : 0.6944


epoch : 1 loss : 0.6944
epoch : 2  batch : 10 loss : 0.7044
epoch : 2  batch : 20 loss : 0.7044
epoch : 2  batch : 30 loss : 0.6944
epoch : 2  batch : 40 loss : 0.6744
epoch : 2  batch : 50 loss : 0.6444


epoch : 2 loss : 0.6944
epoch : 3  batch : 10 loss : 0.7044
epoch : 3  batch : 20 loss : 0.7044
epoch : 3  batch : 30 loss : 0.6944
epoch : 3  batch : 40 loss : 0.7044
epoch : 3  batch : 50 loss : 0.6644


epoch : 3 loss : 0.6944
epoch : 4  batch : 10 loss : 0.6944
epoch : 4  batch : 20 loss : 0.6944
epoch : 4  batch : 30 loss : 0.7144
epoch : 4  batch : 40 loss : 0.6744
epoch : 4  batch : 50 loss : 0.7144


epoch : 4 loss : 0.6944
epoch : 5  batch : 10 loss : 0.6944
epoch : 5  batch : 20 loss : 0.7144
epoch : 5  batch : 30 loss : 0.7144
epoch : 5  batch : 40 loss : 0.6944
epoch : 5  batch : 50 loss : 0.7

KeyboardInterrupt: 

In [None]:
#TO-DO loss is varying with num_units others remaining constant. with 500 samples  loss -> 0.69 - 0.65 in 20 epoch with 15 units
#train model with full samples 