In [30]:
import os
import pickle
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add

import gensim.downloader
# embed = gensim.downloader.load("glove-wiki-gigaword-200")

In [2]:
# Root of the notebook's data: a `Data` folder next to the working directory.
# `path` is kept as a notebook-level name because other cells may read it.
path = os.getcwd()
# os.path.join instead of string concatenation so the separator is portable.
base_directory = os.path.join(path, 'Data')

In [3]:
# Build the VGG16 network that method1/method2 use through the notebook-level
# name `model`.
model = VGG16()
# NOTE(review): layers[-1] is the last layer of VGG16, so `model.predict`
# returns that layer's output rather than an intermediate representation.
# The values stored in features.pkl look like class probabilities; if the
# intent is to extract image features, `model.layers[-2].output` is probably
# what was meant -- confirm against the cells that consume the features before
# changing it, since the output shape would change.
model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [4]:
def method1(path, img_name):
    """Extract model output for one image using the Keras loading pipeline.

    The image is loaded with a plain resize to 224x224 (aspect ratio is not
    preserved), converted to an array, given a leading batch axis, passed
    through ``preprocess_input`` and then through the notebook-level ``model``.

    Args:
        path: Location of the image file on disk.
        img_name: File name of the image; its extension is removed to build
            the returned ID.

    Returns:
        A tuple ``(image_id, feature)`` where ``feature`` is the value
        returned by ``model.predict`` for the single-image batch.
    """
    # load the image from file, resized to the network's input size
    image = load_img(path, target_size=(224, 224))
    # convert image pixels to numpy array
    image = img_to_array(image)
    # add the leading batch axis expected by model.predict
    image = np.expand_dims(image, axis=0)
    # preprocess image for vgg
    image = preprocess_input(image)
    # extract features
    feature = model.predict(image, verbose=0)
    # Strip only the final extension: split('.')[0] would truncate names that
    # contain additional dots and then no longer match the caption file IDs.
    image_id = os.path.splitext(img_name)[0]
    return image_id, feature

In [5]:
def method2(path, img_name):
    """Extract model output for one image, letterboxing instead of stretching.

    The image is decoded with TensorFlow, resized to fit inside 224x224 while
    keeping its aspect ratio (``tf.image.resize_with_pad`` pads the remainder),
    passed through ``preprocess_input`` and then through the notebook-level
    ``model``.

    Compared with the previous version of this function:
      * ``preprocess_input`` is now applied, as in ``method1``; the network was
        previously fed raw uint8 RGB pixels. Features already saved to disk
        were computed without it and must be regenerated to be comparable.
      * The manual aspect-ratio computation was removed: its results were never
        used, because ``resize_with_pad`` does that work itself.

    Args:
        path: Location of the image file on disk.
        img_name: File name of the image; its extension is removed to build
            the returned ID.

    Returns:
        A tuple ``(image_id, feature)`` where ``feature`` is the value
        returned by ``model.predict`` for the single-image batch.
    """
    target_height = 224
    target_width = 224

    # Load and decode the image as 3-channel pixels.
    raw_bytes = tf.io.read_file(path)
    image = tf.image.decode_image(raw_bytes, channels=3)
    # Keep the 0-255 value range (only change dtype): that is the range
    # preprocess_input is documented to expect.
    image = tf.cast(image, tf.float32)

    # Resize and pad the image to the target size, preserving aspect ratio.
    resized_image = tf.image.resize_with_pad(image, target_height, target_width, method='bilinear')
    # Add the leading batch axis expected by model.predict.
    batch = tf.expand_dims(resized_image, 0)
    # Same VGG16 input normalisation that method1 applies.
    batch = preprocess_input(batch)

    # Strip only the final extension: split('.')[0] would truncate names that
    # contain additional dots and then no longer match the caption file IDs.
    image_id = os.path.splitext(img_name)[0]

    feature = model.predict(batch, verbose=0)
    return image_id, feature

In [9]:
# Run every .jpg in <base_directory>/images through method2 and collect the
# results in `img_features`, keyed by image ID.
img_features = {}
working_directory = os.path.join(base_directory, 'images')

# Not used in this cell (method2 defines its own target size); kept because
# later cells may read these notebook-level names.
target_height = 224
target_width = 224
count = 0

# sorted(): os.listdir returns entries in arbitrary order, which would make
# the processing order differ between runs and machines.
for img_name in sorted(os.listdir(working_directory)):
    if not img_name.endswith('.jpg'):
        continue
    # A dedicated name avoids overwriting the notebook-level `path`
    # (the working directory set in an earlier cell).
    image_path = os.path.join(working_directory, img_name)

    imid, fe = method2(image_path, img_name)
    img_features[imid] = fe
    count += 1
    # Lightweight progress indicator: one line per 100 processed images.
    if count % 100 == 0:
        print(count)


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500


In [18]:
# saving the features because processing the data takes a while without a GPU
features_path = os.path.join(base_directory, 'features.pkl')
# `with` guarantees the file is flushed and closed even if pickling fails;
# the previous bare open(...) left the handle open.
with open(features_path, 'wb') as features_file:
    pickle.dump(img_features, features_file)

[[6.30985141e-07 1.14148716e-04 1.73754743e-05 3.02156659e-05
  4.21235236e-05 1.49553147e-04 1.80231495e-04 1.06941279e-05
  3.02797616e-05 5.70007023e-06 3.56733108e-05 4.11436522e-05
  5.87333998e-05 2.66772640e-05 5.70092125e-05 5.52802594e-05
  9.21331157e-05 2.72113655e-04 6.93707843e-05 2.02835363e-05
  3.75813106e-05 9.99254626e-05 3.77834549e-05 8.84420933e-06
  5.06990982e-05 3.15888155e-06 4.63969172e-06 9.79237302e-06
  8.16718966e-07 3.79998187e-06 4.70119767e-06 1.03395623e-05
  6.67609902e-06 5.97545750e-06 1.61215994e-05 3.72175282e-06
  1.82482818e-05 2.12070813e-06 4.11140172e-06 3.10016821e-06
  1.16410309e-04 1.87010528e-06 1.26179093e-05 1.89513557e-05
  4.73037608e-06 1.86498130e-06 1.17349737e-04 5.02908097e-06
  7.58456224e-07 4.34354934e-06 6.57407554e-06 8.90444971e-06
  1.06525476e-05 3.71996648e-05 3.94080962e-05 7.75431363e-06
  3.68639621e-06 1.71580882e-06 5.21483744e-05 1.55858914e-04
  3.01781110e-05 1.58127273e-06 3.53711152e-06 1.05392610e-05
  9.3498

In [28]:
# Load the image descriptions and turn the `file` column into image IDs.
filename = os.path.join(base_directory, 'descriptions.csv')
captions = pd.read_csv(filename)
# Remove the trailing extension, whatever its length, instead of blindly
# cutting the last four characters (which would corrupt names ending in
# e.g. ".jpeg" or names without an extension).
captions['file'] = captions['file'].str.replace(r'\.[^.]+$', '', regex=True)


Unnamed: 0,description,file
0,One triangle that is identical to the original...,48d27a28-851f-4fcb-82b1-b252ea5d8295
1,one triangle drawn above the original triangle...,420ec849-d0da-4f45-aed0-645bfa3b1d62
2,”x2” written next to the original triangle. Ar...,c5cc8cbc-7844-405b-a204-6aca67ef4384
3,Student drew and shaded an identical triangle ...,abc6bf50-9b06-4c09-9e7c-90a1403ff860
4,a shaded triangle drawn above the original tr...,231b00f3-c151-48a0-a19c-d17882ba7baf
5,"The student filled in ""25"" in the top number l...",1e49326b-9fe8-4c5a-b7e8-fa1eea1e9a0c
6,The student labeled the bottom number line wit...,332ea863-c4fa-4905-96bb-32fc71aa5ffe
7,The student filled in two given sets of double...,8d32fce4-90ee-4678-910f-5fb0f60d4dce
8,The student filled in two given sets of double...,9ca12d51-d5e0-41ee-9a9b-fd74a95f8982
9,The student filled in two given sets of double...,2d33d6a3-d2eb-496d-9ac6-832911e178f1


In [34]:
# NOTE(review): `mapping` is not used in this cell (the dictionary built
# below is `mappings`); it looks like a leftover -- confirm no later cell
# reads it before removing.
mapping = {}

# Tokenize every description and lower-case each resulting token.
captions['tokens'] = captions['description'].map(
    lambda text: list(map(str.lower, word_tokenize(text)))
)

# Image ID -> token list. If an ID occurs more than once, the last row wins.
mappings = dict(zip(captions['file'], captions['tokens']))


['the', 'student', 'filled', 'in', 'two', 'given', 'sets', 'of', 'double', 'number', 'lines', '.', 'in', 'the', 'first', 'double', 'number', 'line', ',', 'the', 'student', 'completed', 'the', 'top', 'number', 'line', 'with', '``', '12\\frac', '{', '1', '}', '{', '2', '}', ',', '25', ',', '37\\frac', '{', '1', '}', '{', '2', '}', ',', '50', ',', '62\\frac', '{', '1', '}', '{', '2', '}', ',', '75', ',', '87\\frac', '{', '1', '}', '{', '2', '}', ',', '100', ',', '112\\frac', '{', '1', '}', '{', '2', '}', '``', '.', 'note', 'that', '100', 'was', 'pre-filled', 'in', 'the', 'diagram', '.', 'the', 'student', 'completed', 'the', 'bottom', 'line', 'with', '``', '2', ',', '3', ',', '4', ',', '5', ',', '6', ',', '7', ',', '8', ',', '9', "''", '.', 'in', 'the', 'second', 'double', 'number', 'line', ',', 'the', 'student', 'completed', 'the', 'top', 'number', 'line', 'with', '``', '40', ',', '60', ',', '80', "''", '.', 'the', 'student', 'completed', 'the', 'bottom', 'number', 'line', 'with', '``', '