In [17]:
import os
import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add


import gensim.downloader
embed = gensim.downloader.load("glove-wiki-gigaword-100")

In [2]:
path = os.getcwd()
base_directory = path + '/Data'

In [3]:
model = VGG16()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [4]:
def method1(path, img_name):
    # load the image from file
    image = load_img(path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # reshape data for model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = model.predict(image, verbose=0)
    # get image ID
    image_id = img_name.split('.')[0]

    # plt.imshow(image[0, ..., 0])
    # plt.axis('off')  # Turn off the axis
    # plt.title(image_id)
    # plt.show()  
    # store feature
    return image_id, feature

In [4]:
def method2(path, img_name):
    target_height = 224
    target_width = 224
    # Load and preprocess image
    image = tf.io.read_file(path) 
    image = tf.image.decode_image(image, channels=3)
    image = tf.image.convert_image_dtype(image, tf.float32)

    # Calculate the aspect ratio of the original image
    original_height, original_width, _ = tf.unstack(tf.shape(image))
    aspect_ratio = tf.cast(original_width, tf.float32) / tf.cast(original_height, tf.float32)

    # Calculate the new dimensions while preserving the aspect ratio
    if aspect_ratio > 1.0:
        new_width = target_width
        new_height = tf.cast(target_width / aspect_ratio, tf.int32)
    else:
        new_height = target_height
        new_width = tf.cast(target_height * aspect_ratio, tf.int32)

    # Resize and pad the image to the target size
    resized_image = tf.image.resize_with_pad(image, target_height, target_width, method='bilinear')
    resized_image = tf.image.convert_image_dtype(resized_image, tf.uint8)
    resized_image = tf.expand_dims(resized_image, 0)

    image_id = img_name.split('.')[0]

    # plt.imshow(resized_image[0, ..., 0])
    # plt.axis('off')
    # plt.title(image_id)
    # plt.show()  

    feature = model.predict(resized_image, verbose=0)
    return image_id, feature

In [75]:
img_features = {}
working_directory = base_directory + '/images'

target_height = 224
target_width = 224
count = 0

for img_name in os.listdir(working_directory):
    if img_name.endswith('.jpg'):
        path = working_directory + '/' + img_name

        imid, fe = method2(path, img_name)
        # imid, fe = method2(path, img_name)
        img_features[imid] = fe
        count += 1
        if count % 100 == 0:
            print(count)


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500


In [76]:
# saving the features because processing the data takes a while without a GPU
pickle.dump(img_features, open(os.path.join(base_directory, 'features.pkl'), 'wb'))

In [5]:
with open(os.path.join(base_directory, 'features.pkl'), 'rb') as f:
    img_features = pickle.load(f)

In [66]:
filename = base_directory + '/descriptions.csv'
captions = pd.read_csv(filename)
captions['file'] = captions['file'].str[:-4]
captions = captions[captions['file'] != 'a6a35734-ee74-42bd-a13a-dfa2b683fcda'] # outlier in length


In [67]:
# process lines
# captions['tokens'] = captions['description'].apply(lambda description: [word.lower() for word in word_tokenize(description)])

# Create a dictionary to map labels to tokenized captions
mappings = {}

for index, row in captions.iterrows():
    label = row['file']
    tokens = row['description']
    mappings[label] = tokens
print(mappings['2d33d6a3-d2eb-496d-9ac6-832911e178f1'])


The student filled in two given sets of double number lines. In the first double number line, the student completed the top number line with "12\frac{1}{2}, 25, 37\frac{1}{2} , 50, 62\frac{1}{2} , 75, 87\frac{1}{2} , 100, 112\frac{1}{2} ". Note that 100 was pre-filled in the diagram. The student completed the bottom line with "2, 3, 4, 5, 6, 7, 8, 9". In the second double number line, the student completed the top number line with "40, 60, 80". The student completed the bottom number line with "1, 2, 3".



In [68]:
def clean_caption(caption):
    # Convert to lowercase
    caption = caption.lower()
    # Remove special characters and punctuation using regular expressions
    caption = re.sub(r'[^a-zA-Z\s]', '', caption)
    caption = ' '.join(caption.split())
    caption = 'startseq ' + caption + ' endseq'
    return caption

captions['description'] = captions['description'].apply(clean_caption)

Unnamed: 0,description,file
0,startseq one triangle that is identical to the...,48d27a28-851f-4fcb-82b1-b252ea5d8295
1,startseq one triangle drawn above the original...,420ec849-d0da-4f45-aed0-645bfa3b1d62
2,startseq x written next to the original triang...,c5cc8cbc-7844-405b-a204-6aca67ef4384
3,startseq student drew and shaded an identical ...,abc6bf50-9b06-4c09-9e7c-90a1403ff860
4,startseq a shaded triangle drawn above the ori...,231b00f3-c151-48a0-a19c-d17882ba7baf
5,startseq the student filled in in the top numb...,1e49326b-9fe8-4c5a-b7e8-fa1eea1e9a0c
6,startseq the student labeled the bottom number...,332ea863-c4fa-4905-96bb-32fc71aa5ffe
7,startseq the student filled in two given sets ...,8d32fce4-90ee-4678-910f-5fb0f60d4dce
8,startseq the student filled in two given sets ...,9ca12d51-d5e0-41ee-9a9b-fd74a95f8982
9,startseq the student filled in two given sets ...,2d33d6a3-d2eb-496d-9ac6-832911e178f1


In [73]:
# Initialize the tokenizer
tokenizer = Tokenizer(oov_token='UNK')
tokenizer.fit_on_texts(captions['description'])
vocab_size = len(tokenizer.word_index) + 1

In [76]:
print("Vocab size: ", vocab_size)
max_length = max(len(caption.split()) for caption in captions['description'])
max_length = 30
print("Max lenght: ", max_length)

Vocab size:  1690
Max lenght:  30


In [77]:
# creating train-test split
image_ids = list(mappings.keys())
split = int(len(image_ids) * 0.9)
train = image_ids[:split]
test = image_ids[split:]

In [78]:
def data_generator(data_keys, mapping, features, tokenizer, batch_size):
    # loop over images
    X1, X2, y = list(), list(), list()
    n = 0
    while 1:
        for key in data_keys:
            n += 1
            caption = mapping[key]
            # encode the sequence
            seq = tokenizer.texts_to_sequences([caption])[0]
            # split the sequence into X, y pairs
            for i in range(1, len(seq)):
                # split into input and output pairs
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                
                # store the sequences
                X1.append(features[key][0])
                X2.append(in_seq)
                y.append(out_seq)
            if n == batch_size:
                X1, X2, y = np.array(X1), np.array(X2), np.array(y)
                yield [X1, X2], y
                X1, X2, y = list(), list(), list()
                n = 0

In [79]:
# Hyperparameters
lr = 0.001

In [80]:
embedding_matrix = np.zeros((vocab_size, embed.vectors.shape[1]))
for word, i in tokenizer.word_index.items():
    if word in embed:
        embedding_matrix[i] = embed[word]
print(embedding_matrix.shape)

(1690, 100)


In [81]:
# Create the model
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

inputs2 = Input(shape=(max_length,))
# Use GloVe embeddings for the embedding layer
se1 = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], input_length=max_length, trainable=False)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)

opt = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# plot the model
print(model.summary())

2023-10-28 16:09:12.024722: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-10-28 16:09:12.031700: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-10-28 16:09:12.164023: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 30, 100)      169000      ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 4096)         0           ['input_2[0][0]']                
                                                                                            

In [103]:
def train_model():
    epochs = 10
    batch_size = 32
    steps = len(train) // batch_size

    for i in range(epochs):
        # create data generator
        generator = data_generator(train, mappings, img_features, tokenizer, batch_size)
        # fit for one epoch
        model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
        print("Epoch 1")

In [104]:
train_model()

2023-10-27 19:34:33.511034: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 19:40:14.597683: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 19:46:09.743047: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 19:52:09.552994: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 19:58:02.159261: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 20:03:49.195186: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 20:09:38.604624: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 20:16:04.954680: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 20:22:03.698514: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


2023-10-27 20:28:04.882257: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


Epoch 1


In [105]:
model.save(base_directory + '/best_model.h5')

In [22]:
# helper to generate captions
def idx_to_word(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [32]:
def predict_caption(model, image, tokenizer, max_length):
    # add start tag for generation process
    in_text = 'startseq'
    # iterate over the max length of sequence
    for i in range(max_length):
        # encode input sequence
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence], max_length)
        # predict next word
        yhat = model.predict([image, sequence], verbose=0)
        # get index with high probability
        oov_ind = np.argmax(yhat)
        yhat[0, oov_ind] = 0
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat, tokenizer)
        # stop if word not found
        if word is None:
            break
        # append word as input for generating next word
        in_text += " " + word
        # stop if we reach end tag
        if word == 'endseq':
            break
      
    return in_text

In [19]:
# if I want to load a previous model 
model = load_model(base_directory + '/best_model.h5')

2023-10-28 09:10:45.018709: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-10-28 09:10:45.021609: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-10-28 09:10:45.023548: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [23]:
from nltk.translate.bleu_score import corpus_bleu
# validate with test data
actual, predicted = list(), list()

count = 0
for key in test:
    count += 1
    if count % 100 == 0:
        print(count)
    # get actual caption
    captions = mappings[key]
    # predict the caption for image
    y_pred = predict_caption(model, img_features[key], tokenizer, max_length) 
    # split into words
    actual_captions = [caption.split() for caption in captions]
    y_pred = y_pred.split()
    # append to the list
    actual.append(actual_captions)
    predicted.append(y_pred)
    
# calcuate BLEU score
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

2023-10-28 09:12:14.025728: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-10-28 09:12:14.027724: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-10-28 09:12:14.029242: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

100
200
300
400
500
600
700
800
900


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.000068
BLEU-2: 0.000000


In [30]:
from PIL import Image

def generate_caption(image_name):
    # load the image
    image_id = image_name.split('.')[0]
    img_path = os.path.join(base_directory, "images", image_name)
    image = Image.open(img_path)
    caption = mappings[image_id]
    print('---------------------Actual---------------------')
    print(caption)
    # predict the caption
    y_pred = predict_caption(model, img_features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)

In [34]:
generate_caption('927edacd-9f8f-46fe-bcbf-1830ae7868c5.jpg')

---------------------Actual---------------------
Polygon ABCD with a scaled copy polygon PQRS
Polygon ABCD has side length AB labeled 1 and side length AD labeled 2.
Polygon PQRS has side length PQ labeled 1.5 and side length PS labeled 3.



KeyboardInterrupt: 

In [72]:
# for word, index in tokenizer.word_index.items():
#     print(index, ". ", word)
print(tokenizer.word_index)

{'UNK': 1, 'the': 2, 'startseq': 3, 'endseq': 4, 'is': 5, 'and': 6, 'with': 7, 'of': 8, 'a': 9, 'labeled': 10, 'to': 11, 'line': 12, 'in': 13, 'written': 14, 'rectangle': 15, 'row': 16, 'drawn': 17, 'number': 18, 'it': 19, 'right': 20, 'from': 21, 'above': 22, 'on': 23, 'triangle': 24, 'two': 25, 'student': 26, 'first': 27, 'left': 28, 'are': 29, 'each': 30, 'diagram': 31, 'into': 32, 'side': 33, 'table': 34, 'second': 35, 'has': 36, 'out': 37, 'x': 38, 'below': 39, 'arrow': 40, 'crossed': 41, 'top': 42, 'height': 43, 'tape': 44, 'drew': 45, 'one': 46, 'circled': 47, 'at': 48, 'figure': 49, 'three': 50, 'for': 51, 'b': 52, 'an': 53, 'width': 54, 'inside': 55, 'by': 56, 'as': 57, 'there': 58, 'c': 59, 'box': 60, 'horizontal': 61, 'rows': 62, 'shaded': 63, 'bottom': 64, 'units': 65, 'vertical': 66, 'shape': 67, 'column': 68, 'through': 69, 'columns': 70, 'length': 71, 'that': 72, 'd': 73, 'lines': 74, 'parts': 75, 'sections': 76, 'frac': 77, 'marks': 78, 'third': 79, 'section': 80, 'betw