In [1]:
import warnings
warnings.filterwarnings("ignore")
import re
import itertools
import tqdm
import pandas as pd
import numpy as np
import json
import pickle
from tqdm import tqdm
import tensorflow as tf
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import STOPWORDS,remove_stopwords
from sklearn.preprocessing import LabelBinarizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# Statement order matters here: memory growth has to be configured before the
# GPUs are initialised, which is why the logical devices are only listed
# afterwards.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
# Spatial size every image is resized to before feature extraction
# (read by load_img and VGG19_Top).
img_width = 224
img_height = 224
# Batch size applied to the train/validation tf.data pipelines in Create_Dataset.
BATCH_SIZE = 32
# NOTE(review): BUFFER_SIZE is not referenced anywhere else in this section —
# presumably intended as a shuffle buffer size; confirm or remove.
BUFFER_SIZE = 300

In [4]:
# Question/answer table. Later cells read its 'question',
# 'multiple_choice_answer' and 'image_id' columns.
merge_df = pd.read_csv('merge.csv')
# Directory that image paths are built from (COCO_train2014_<id>.jpg files).
# NOTE(review): absolute, machine-specific Windows path.
img_train_dir = r"E:\VQA_Dataset\train2014"

In [None]:
# Preview the table as loaded from merge.csv.
merge_df

# Generating Answer Vectors

In [5]:
# Answer vocabulary: the 1000 most frequent answers, most frequent first.
# value_counts() already orders the answers by descending frequency.
class_list = merge_df['multiple_choice_answer'].value_counts().index[:1000].tolist()

In [6]:
# Replace every answer outside class_list with the empty string.
merge_df['multiple_choice_answer'] = merge_df['multiple_choice_answer'].where(
    merge_df['multiple_choice_answer'].isin(class_list), ''
)

In [7]:
# Keep only the rows whose answer is a non-empty string.
merge_df = merge_df[merge_df['multiple_choice_answer'].apply(len) > 0]

In [None]:
# Preview the table after rows with a blank answer were dropped.
merge_df

In [8]:
# One-hot encode the answers: one column per distinct answer string.
label_encoder = LabelBinarizer()
answer_vector = label_encoder.fit_transform(merge_df['multiple_choice_answer'].values)

# Answer string -> column index in answer_vector.
ans_vocab = {label: index for index, label in enumerate(label_encoder.classes_)}
print("Number of clasess: ", len(ans_vocab))
print("Shape of Answer Vectors in Train Data: ", answer_vector.shape)

Number of clasess:  1000
Shape of Answer Vectors in Train Data:  (215375, 1000)


In [9]:
# Persist the fitted label binarizer so the answer encoding can be reloaded later.
with open('label_encoder.pickle', mode='wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file, protocol=pickle.HIGHEST_PROTOCOL)

# Generating Question Vectors

In [8]:
# The regexes and lookup tables below are built once, when this cell runs,
# instead of on every call: preprocess_english is applied to every question.

# Matches a period that is not immediately followed by a digit.
# NOTE(review): the leading "(?!<=\d)" looks like a mistyped lookbehind
# ("(?<!\d)"); as written it is a lookahead that can never block a match at a
# period. The pattern is kept as-is so the produced tokens do not change.
_PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")

# Matches a comma sitting between two digits, e.g. "1,000".
_COMMA_STRIP = re.compile(r"(\d)(\,)(\d)")

# Punctuation that is either deleted or replaced by a space (see below).
_PUNCT = (';', '/', '[', ']', '"', '{', '}',
          '(', ')', '=', '+', '\\', '_', '-',
          '>', '<', '@', '`', ',', '?', '!')

# Apostrophe-less spelling -> contraction.
# NOTE(review): lookups happen after lower-casing, so the capitalised keys
# ("Id've", "I'dve", "Im", "Ive") can never match; and "somebody'd" ->
# "somebodyd" looks reversed compared with every other entry. Both are left
# untouched pending confirmation.
_CONTRACTIONS = {
    "aint": "ain't", "arent": "aren't", "cant": "can't", "couldve": "could've", "couldnt": "couldn't",
    "couldn'tve": "couldn't've", "couldnt've": "couldn't've", "didnt": "didn't", "doesnt": "doesn't", "dont": "don't", "hadnt": "hadn't",
    "hadnt've": "hadn't've", "hadn'tve": "hadn't've", "hasnt": "hasn't", "havent": "haven't", "hed": "he'd", "hed've": "he'd've",
    "he'dve": "he'd've", "hes": "he's", "howd": "how'd", "howll": "how'll", "hows": "how's", "Id've": "I'd've", "I'dve": "I'd've",
    "Im": "I'm", "Ive": "I've", "isnt": "isn't", "itd": "it'd", "itd've": "it'd've", "it'dve": "it'd've", "itll": "it'll", "let's": "let's",
    "maam": "ma'am", "mightnt": "mightn't", "mightnt've": "mightn't've", "mightn'tve": "mightn't've", "mightve": "might've",
    "mustnt": "mustn't", "mustve": "must've", "neednt": "needn't", "notve": "not've", "oclock": "o'clock", "oughtnt": "oughtn't",
    "ow's'at": "'ow's'at", "'ows'at": "'ow's'at", "'ow'sat": "'ow's'at", "shant": "shan't", "shed've": "she'd've", "she'dve": "she'd've",
    "she's": "she's", "shouldve": "should've", "shouldnt": "shouldn't", "shouldnt've": "shouldn't've", "shouldn'tve": "shouldn't've",
    "somebody'd": "somebodyd", "somebodyd've": "somebody'd've", "somebody'dve": "somebody'd've", "somebodyll": "somebody'll",
    "somebodys": "somebody's", "someoned": "someone'd", "someoned've": "someone'd've", "someone'dve": "someone'd've",
    "someonell": "someone'll", "someones": "someone's", "somethingd": "something'd", "somethingd've": "something'd've",
    "something'dve": "something'd've", "somethingll": "something'll", "thats": "that's", "thered": "there'd", "thered've": "there'd've",
    "there'dve": "there'd've", "therere": "there're", "theres": "there's", "theyd": "they'd", "theyd've": "they'd've",
    "they'dve": "they'd've", "theyll": "they'll", "theyre": "they're", "theyve": "they've", "twas": "'twas", "wasnt": "wasn't",
    "wed've": "we'd've", "we'dve": "we'd've", "weve": "we've", "werent": "weren't", "whatll": "what'll", "whatre": "what're",
    "whats": "what's", "whatve": "what've", "whens": "when's", "whered": "where'd", "wheres": "where's", "whereve": "where've",
    "whod": "who'd", "whod've": "who'd've", "who'dve": "who'd've", "wholl": "who'll", "whos": "who's", "whove": "who've", "whyll": "why'll",
    "whyre": "why're", "whys": "why's", "wont": "won't", "wouldve": "would've", "wouldnt": "wouldn't", "wouldnt've": "wouldn't've",
    "wouldn'tve": "wouldn't've", "yall": "y'all", "yall'll": "y'all'll", "y'allll": "y'all'll", "yall'd've": "y'all'd've",
    "y'alld've": "y'all'd've", "y'all'dve": "y'all'd've", "youd": "you'd", "youd've": "you'd've", "you'dve": "you'd've",
    "youll": "you'll", "youre": "you're", "youve": "you've",
}


def preprocess_english(sentence):
    """Normalise an English sentence into a lower-case, space-separated string.

    Steps, in order:
      1. Newlines and tabs become spaces; surrounding whitespace is stripped.
      2. Each punctuation character in ``_PUNCT`` is deleted if it touches a
         space anywhere in the sentence, or if the sentence contains a
         digit,digit comma; otherwise it is replaced by a space.
      3. Periods that are not followed by a digit are removed.
      4. The text is lower-cased and split on whitespace, and words found in
         ``_CONTRACTIONS`` are replaced by their apostrophised form.

    Args:
        sentence: The raw sentence (``str``).

    Returns:
        The normalised sentence as a single ``str`` with single spaces
        between words (empty string for empty/whitespace-only input).
    """
    inText = sentence.replace('\n', ' ').replace('\t', ' ').strip()

    # Loop-invariant: whether the sentence holds a number such as "1,000".
    has_digit_comma = _COMMA_STRIP.search(inText) is not None

    outText = inText
    for p in _PUNCT:
        if has_digit_comma or (p + ' ' in inText) or (' ' + p in inText):
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')

    # The third positional argument of Pattern.sub is `count`, not `flags`;
    # passing re.UNICODE (== 32) there capped the substitution at 32 periods.
    # str patterns are Unicode-aware by default, so no flag is needed.
    outText = _PERIOD_STRIP.sub("", outText)

    words = [_CONTRACTIONS.get(word, word) for word in outText.lower().split()]
    return ' '.join(words)

In [9]:
# Normalise every question in place.
merge_df['question'] = merge_df['question'].apply(preprocess_english)

In [None]:
# Preview the table after the questions were normalised.
merge_df

In [10]:
# Fit a word-level tokenizer on the cleaned questions. filters='' turns off the
# tokenizer's own character filtering (punctuation was already handled by
# preprocess_english); "<unk>" is the out-of-vocabulary token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token = "<unk>", filters = '')
tokenizer.fit_on_texts(merge_df['question'].values)
# Turn each question into a list of word indices, then pad at the end ('post')
# so that all questions share one length.
question_seqs = tokenizer.texts_to_sequences(merge_df['question'].values)
question_vector = tf.keras.preprocessing.sequence.pad_sequences(question_seqs, padding='post')
print("Number of words in tokenizer:", len(tokenizer.word_index))
print("Shape of Question Vectors in Train Data: ", question_vector.shape)

Number of words in tokenizer: 12573
Shape of Question Vectors in Train Data:  (215375, 22)


In [17]:
# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the padded question matrix as well.
with open('question_vector.pickle', mode='wb') as vector_file:
    pickle.dump(question_vector, vector_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Padded question length, i.e. the number of token positions per question.
question_vector.shape[1]

22

In [None]:
# Inspect the unpadded index sequences.
# NOTE(review): this displays the whole list (one entry per question);
# consider slicing, e.g. question_seqs[:5].
question_seqs

In [14]:
# Inspect the word -> index mapping; indices start at 1 with "<unk>".
tokenizer.word_index

{'<unk>': 1,
 'the': 2,
 'is': 3,
 'what': 4,
 'are': 5,
 'this': 6,
 'in': 7,
 'a': 8,
 'on': 9,
 'how': 10,
 'many': 11,
 'of': 12,
 'color': 13,
 'there': 14,
 'man': 15,
 'does': 16,
 'people': 17,
 'picture': 18,
 'to': 19,
 'wearing': 20,
 'these': 21,
 'it': 22,
 'have': 23,
 'person': 24,
 'photo': 25,
 'do': 26,
 'where': 27,
 'or': 28,
 'kind': 29,
 'you': 30,
 'animal': 31,
 'room': 32,
 'woman': 33,
 'doing': 34,
 'they': 35,
 'be': 36,
 'animals': 37,
 'holding': 38,
 'type': 39,
 'can': 40,
 'dog': 41,
 'cat': 42,
 'any': 43,
 'at': 44,
 'for': 45,
 'he': 46,
 'water': 47,
 'his': 48,
 'train': 49,
 'that': 50,
 'see': 51,
 'food': 52,
 'an': 53,
 'shirt': 54,
 'which': 55,
 'playing': 56,
 'made': 57,
 'sport': 58,
 'bus': 59,
 'sitting': 60,
 'table': 61,
 'plate': 62,
 "man's": 63,
 'shown': 64,
 'plane': 65,
 'sign': 66,
 'taken': 67,
 'with': 68,
 'look': 69,
 'standing': 70,
 'right': 71,
 'pizza': 72,
 'all': 73,
 'left': 74,
 'background': 75,
 'boy': 76,
 'being'

In [11]:
# Number of entries in the tokenizer vocabulary (including "<unk>").
len(tokenizer.word_index)

12573

In [None]:
# Inspect the tokenizer's per-word counts gathered by fit_on_texts.
tokenizer.word_counts

In [None]:
# Inspect the tokenizer's per-word document counts gathered by fit_on_texts.
tokenizer.word_docs

In [None]:
# Inspect the padded question matrix.
question_vector

# Extracting Image Features

In [None]:
def load_img(image_path):
    """Read a JPEG file and turn it into a fixed-size float tensor.

    The file is decoded to 3 channels, resized to (img_width, img_height),
    passed through the VGG19 ``preprocess_input`` and finally multiplied
    by 1/255.

    Returns:
        (image tensor, image_path) — the path is passed through unchanged so
        that callers can key the result by file.
    """
    raw_bytes = tf.io.read_file(image_path)
    pixels = tf.io.decode_jpeg(raw_bytes, channels=3)
    # NOTE(review): tf.image.resize expects (height, width); the order used
    # here is harmless only while both constants are equal.
    pixels = tf.image.resize(pixels, (img_width, img_height))
    # NOTE(review): preprocess_input already yields the input format VGG19 was
    # trained on; the additional 1/255 rescale shrinks it further — confirm it
    # is intended. Kept unchanged because cached features depend on it.
    pixels = tf.keras.applications.vgg19.preprocess_input(pixels) * (1./255)
    return pixels, image_path

def VGG19_Top():
    """Build the ImageNet-pretrained VGG19 without its classifier head.

    Returns:
        A ``tf.keras.Model`` mapping an (img_width, img_height, 3) input to
        the output of the last layer of the headless VGG19.
    """
    backbone = tf.keras.applications.VGG19(
        include_top=False,
        weights='imagenet',
        input_shape=(img_width, img_height, 3),
    )
    return tf.keras.Model(backbone.input, backbone.layers[-1].output)

def generate_image_features(images, output_path=r"E:\VQA_Dataset\all_image_dict.pickle", batch_size=16):
    """Extract VGG19 features for every image and pickle them to disk.

    Args:
        images: Sequence of image file paths.
        output_path: Where the ``{image path: feature array}`` dict is
            pickled. Defaults to the location the notebook originally used.
        batch_size: Number of images pushed through the network at once.

    Returns:
        None. The result is only written to ``output_path``.
    """
    model = VGG19_Top()
    all_image_dict = {}
    img_ds = tf.data.Dataset.from_tensor_slices(images)
    img_ds = img_ds.map(load_img, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(batch_size)

    # The context manager closes the progress bar even if extraction fails.
    with tqdm(total=len(images), desc="Processing images") as progress_bar:
        for batch_img, batch_path in img_ds:
            batch_img_features = model(batch_img)

            for img_features, path in zip(batch_img_features, batch_path):
                # Paths come back as byte strings; the dict is keyed by str.
                image_path = path.numpy().decode("utf-8")
                all_image_dict[image_path] = img_features.numpy()

                progress_bar.update(1)

    with open(output_path, 'wb') as handle:
        pickle.dump(all_image_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return
# Build the path of every distinct image referenced by merge_df (so each image
# is featurised once), then run the extraction. File names follow
# COCO_train2014_<image_id zero-padded to 12 digits>.jpg.
all_image_path = merge_df['image_id'].apply(lambda x:  img_train_dir +'\\' + 'COCO_train2014_' + '%012d.jpg' % (x)).unique()
generate_image_features(all_image_path)

In [12]:
# One image path PER ROW of merge_df — deliberately not de-duplicated.
# Create_Dataset pairs all_image_path[i] with question_vector[i] and
# answer_vector[i], so the three must share the same row order and length;
# a unique() list would attach images to unrelated questions.
all_image_path = merge_df['image_id'].apply(lambda x:  img_train_dir +'\\' + 'COCO_train2014_' + '%012d.jpg' % (x)).values

In [13]:
# Load the cached {image path: feature array} dict used by get_imageTensor.
# NOTE(review): this reads from the working directory, whereas
# generate_image_features writes to E:\VQA_Dataset\all_image_dict.pickle —
# confirm the file is available at this relative path.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(r"all_image_dict.pickle", 'rb') as handle:
    all_image_dict = pickle.load(handle)

In [14]:
def get_imageTensor(img,ques):
    """Swap an encoded image path for its cached feature array.

    Args:
        img: Image path as UTF-8 encoded bytes.
        ques: Question vector, returned untouched.

    Returns:
        (features looked up in ``all_image_dict``, ques)
    """
    image_key = img.decode('utf-8')
    return all_image_dict[image_key], ques

In [15]:
def _build_feature_dataset(image_paths, question_vectors, answer_vectors, save_path):
    """Build, batch, prefetch and save one ((image, question), answer) dataset."""
    dataset_input = tf.data.Dataset.from_tensor_slices((image_paths, question_vectors.astype(np.float32)))
    dataset_output = tf.data.Dataset.from_tensor_slices((answer_vectors.astype(np.float32)))
    # Swap each image path for its cached feature array.
    dataset_input = dataset_input.map(lambda img, ques: tf.numpy_function(get_imageTensor, [img, ques], [tf.float32, tf.float32]),
                                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_input = dataset_input.batch(BATCH_SIZE)
    dataset_output = dataset_output.batch(BATCH_SIZE)
    dataset = tf.data.Dataset.zip((dataset_input, dataset_output))
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset.save(save_path)
    return dataset


def Create_Dataset(all_image_path,question_vector,answer_vector):
    """Split the samples 80/20 and build the train and validation datasets.

    Samples are paired by position: entry ``i`` of ``all_image_path`` goes
    with row ``i`` of ``question_vector`` and ``answer_vector``, for
    ``i < len(all_image_path)``. The caller must therefore pass one image
    path per question row, in the same order.

    Args:
        all_image_path: Image paths (keys of ``all_image_dict``).
        question_vector: Padded question index matrix.
        answer_vector: One-hot answer matrix.

    Returns:
        (train_dataset, val_dataset). Each yields
        ``((image_features, question), answer)`` batches of ``BATCH_SIZE`` and
        is also saved to 'train_dataset.tfrecord' /
        'validation_dataset.tfrecord' respectively.
    """
    # Combining image,questions and answers
    combined_data = [
        (all_image_path[i], question_vector[i], answer_vector[i])
        for i in range(len(all_image_path))
    ]

    # Split image paths, question vector, and answer vector into training and validation sets
    train_data, val_data = train_test_split(combined_data, test_size=0.2, random_state=42)

    # Unpacking training and validation data and converting tuples to numpy array
    train_image_path, train_question_vector, train_answer_vector = (
        np.array(column) for column in zip(*train_data)
    )
    # The validation questions were previously overwritten with the validation
    # ANSWER vectors; every column is now converted from its own data.
    val_image_path, val_question_vector, val_answer_vector = (
        np.array(column) for column in zip(*val_data)
    )

    train_dataset = _build_feature_dataset(
        train_image_path, train_question_vector, train_answer_vector, 'train_dataset.tfrecord'
    )
    val_dataset = _build_feature_dataset(
        val_image_path, val_question_vector, val_answer_vector, 'validation_dataset.tfrecord'
    )

    return train_dataset, val_dataset

In [16]:
# Build (and save) the train/validation datasets: 80/20 split with a fixed seed.
dataset_train,dataset_valid = Create_Dataset(all_image_path,question_vector,answer_vector)

In [None]:
# Close the CUDA context through numba — presumably to release GPU memory once
# the datasets have been written; confirm.
# NOTE(review): this import sits outside the notebook's top import cell.
from numba import cuda
cuda.close()