In [182]:
import numpy as np
import keras
from keras.models import Sequential


class DataGenerator(keras.utils.Sequence):
    
    def someFunction(self,list_IDs_temp):
        return self.__data_generation(list_IDs_temp)
        
    'Generates data for Keras'
    def __init__(self, list_IDs, image_embedding, captions, batch_size=32,num_caption = 92, caption_dim =300, image_dim=(2048), n_channels=1, shuffle=True):
        'Initialization'
        self.image_dim = image_dim
        self.caption_dim = caption_dim
        self.num_caption = num_caption
        self.batch_size = batch_size
        self.list_IDs = list_IDs
#         self.n_channels = n_channels
        self.shuffle = shuffle
        self.image_embedding = image_embedding
        self.captions = captions
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        X = np.empty((self.batch_size, self.image_dim))
        Y = np.empty((self.batch_size, self.num_caption, self.caption_dim ))
        
        for i, ID in enumerate(list_IDs_temp):
            X[i,] = self.image_embedding[ID]
            Y[i] = self.captions[ID]

        return X, Y

In [183]:
import json
import random
import csv

SEED = 10
IMAGE_EMBEDDING_DIR = '/Users/vinutahegde/Documents/USC/SEM3/DL/project/ws/'
NUM_IMAGE_EMBEDDING_CHUNKS = 2
GLOVE_EMBEDDING_FILE_NAME = 'glove.6B.300d.txt'
CAPTION_FILE_NAME = '/Users/vinutahegde/Documents/USC/SEM3/DL/project/ws/image_to_caption.csv'
MAX_SEQUENCE_LENGTH = 92
WORD_EMBEDDING_DIM = 300

In [184]:

class text_encoder:
    def get_embedding_matrix(self,filename, WORD_EMBEDDING_DIM):
        embeddings_index = {}
        word_to_index = {}

        with open(filename) as f:
            for index, line in enumerate(f):
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
                word_to_index[word] = index

          # Vocabulary
        vocabulary = embeddings_index.keys()
        embedding_matrix = np.zeros((len(vocabulary) + 1, WORD_EMBEDDING_DIM))

        for word, i in word_to_index.items():
            embedding_vector = embeddings_index[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        return embedding_matrix, vocabulary, word_to_index
    
    def load_embeddings(self):
        embedding_matrix, vocabulary, word_to_index = self.get_embedding_matrix(GLOVE_EMBEDDING_FILE_NAME, WORD_EMBEDDING_DIM)      
        self.embedding_matrix = embedding_matrix
        self.vocabulary = vocabulary
        self.word_to_index = word_to_index
        
    def get_sentence_embedding(self,sentence):
        words = sentence.split()    
        sentence_embedding =  np.zeros(shape=(MAX_SEQUENCE_LENGTH,WORD_EMBEDDING_DIM))
        mask =  np.zeros(MAX_SEQUENCE_LENGTH)
        i=0;
        for w in words:
            mask[i] = 1
            index = self.word_to_index.get(w,-1)
            if(index != -1):
                sentence_embedding[i] = self.embedding_matrix[index]
            else:
                sentence_embedding[i] = np.zeros(WORD_EMBEDDING_DIM)
            i+=1       
        return  sentence_embedding, mask

In [185]:
def Merge(dict1, dict2): 
    res = {**dict1, **dict2} 
    return res 

In [186]:
def getImageEmbedding():
    image_embedding = {}
    for i in range (NUM_IMAGE_EMBEDDING_CHUNKS):
        file_name = IMAGE_EMBEDDING_DIR + 'group_'+str(i+1)+'.json'
        with open(file_name) as json_file:
            print(file_name)
            json_data = json.load(json_file)
            json_data = json.loads(json_data)
            image_embedding = Merge(image_embedding, json_data) 
        return image_embedding

In [187]:
def getCaptions(id_list,text_ebmd_encoder):
    caption_dict = {}
    with open(CAPTION_FILE_NAME) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            if row[1] in id_list:
#                  print(row[1] +"  "+ row[2])
                 embd, mask = text_ebmd_encoder.get_sentence_embedding(row[2])
                 caption_dict[row[1]] = embd
#                  print( caption_dict[row[1]])

    return caption_dict

In [188]:
def getPartitions(image_list_file_name):
    with open(image_list_file_name) as json_file:
        json_data = json.load(json_file)
#         json_data = json.loads(data)
        print(len(json_data))
        return json_data

In [190]:
# Parameters
random.seed(SEED)

params = {'num_caption': 92,
          'caption_dim' :300,
          'image_dim' : 2048,
          'batch_size': 64,
          'shuffle': True}

ids = getPartitions('/Users/vinutahegde/Documents/USC/SEM3/DL/project/ws/imgaes_with_captions.txt')
random.shuffle(ids)

id_len = len(ids)
partition = {}

partition['train'] = ids[:int((id_len+1)*.90)] #Remaining 90% to training set
partition['validation'] = ids[int(id_len*.90+1):] #Splits 10% data to test set

print(len(partition['train']))
print(len(partition['validation']))

image_embedding = getImageEmbedding()
text_ebmd_encoder = text_encoder()
text_ebmd_encoder.load_embeddings()
captions = getCaptions(image_embedding.keys(),text_ebmd_encoder);


training_generator = DataGenerator(partition['train'], image_embedding, captions , **params)
validation_generator = DataGenerator(partition['validation'], image_embedding, captions, **params)

id_filtered = list(image_embedding.keys())
x =id_filtered[:64]
train_X, train_Y = training_generator.someFunction(x)
print(train_X.shape)
print(train_Y.shape)


81155
73040
8115
/Users/vinutahegde/Documents/USC/SEM3/DL/project/ws/group_1.json
(64, 2048)
(64, 92, 300)
