In [None]:
import pickle
import sys
from os import listdir, makedirs, path
from random import sample, choice

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

In [None]:
# Input files read by the prepare_* helpers and the split-loading cell.
CAPTIONS_FILE_LOC = "captions.token"  # one caption per line, first field is "<label>.<ext>#<index>"
GLOVE_VECTORS_FILE_LOC = "glove.6B.100d.txt"  # word followed by its vector components
SPLIT_FILE_LOC = "split.pickle"  # pickled object indexed with "train" / "val" / "test"
# Pickle caches written on first run and re-read afterwards by the prepare_* helpers.
CAPTIONS_DICT_FILE_LOC = "captions_dict.pickle"
TOKENIZER_FILE_LOC = "tokenizer.pickle"
EMBEDDING_MATRIX_FILE_LOC = "embedding_matrix.pickle"
INCEPTION_FEAT_FILE_LOC = "inception_v3_output.pickle"  # only used by a commented-out load in the cells shown here

# Directory holding the "<label>.jpg" image files.
DATASET_DIR = "./flickr30k-images/"

In [None]:
# Parameters
# NOTE: several of these are reassigned by later cells (training cells, model cells),
# and functions that use them as default arguments capture the value at definition time.
IMAGE_SIZE = 128  # side length images are resized to
MAX_WORDS_IN_SENTENCE = 85  # caption padding length; recomputed from the data further down
MARGIN = 0.7  # margin of the triplet ranking loss
BATCH_SIZE = 32
EMBEDDING_OUT_LENGTH = 128  # width of the final Dense layer of the image and text models

In [None]:
def prepare_tokenizer(caption_file_location=CAPTIONS_FILE_LOC, override=False):
    """Return a Keras ``Tokenizer`` fitted on the caption file, using a pickle cache.

    Args:
        caption_file_location: Path of the caption file. On every line the first
            whitespace-separated field is dropped and the remaining words are
            lower-cased to form one caption.
        override: When True, ignore an existing cache at ``TOKENIZER_FILE_LOC``
            and rebuild (the cache is then overwritten).

    Returns:
        The fitted (or cached) tokenizer.
    """
    if path.exists(TOKENIZER_FILE_LOC) and not override:
        # NOTE: pickle.load can execute arbitrary code; only load caches you created.
        with open(TOKENIZER_FILE_LOC, "rb") as cache_file:
            return pickle.load(cache_file)

    captions = []
    # Context managers make sure the handles are closed even if parsing fails.
    with open(caption_file_location, "r", encoding="utf-8") as caption_file:
        for line in tqdm(caption_file.readlines()):
            fields = line.split()
            captions.append([word.lower() for word in fields[1:]])

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    with open(TOKENIZER_FILE_LOC, "wb") as cache_file:
        pickle.dump(tokenizer, cache_file)
    return tokenizer

In [None]:
# Module-level tokenizer used by the embedding-matrix and text-batch helpers below.
tokenizer = prepare_tokenizer()

In [None]:
# One more than the number of known words; used as input_dim of the Embedding layers.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

In [None]:
def prepare_embedding_matrix(embedding_mat_file_location=EMBEDDING_MATRIX_FILE_LOC, override=False):
    """Build the word-index -> GloVe vector matrix, using a pickle cache.

    Reads the module-level ``tokenizer`` and the GloVe file at
    ``GLOVE_VECTORS_FILE_LOC``. Words missing from GloVe receive the vector of
    the GloVe token ``"unk"``; row 0 stays all zeros.

    Args:
        embedding_mat_file_location: Path of the pickle cache to read/write.
            (Previously this argument was ignored and the global constant was
            always used; the default value is unchanged.)
        override: When True, ignore an existing cache and rebuild it.

    Returns:
        ``numpy`` array of shape ``(len(tokenizer.word_index) + 1, 100)``.

    Raises:
        KeyError: If the GloVe file has no ``"unk"`` entry.
    """
    if path.exists(embedding_mat_file_location) and not override:
        # NOTE: pickle.load can execute arbitrary code; only load caches you created.
        with open(embedding_mat_file_location, "rb") as cache_file:
            return pickle.load(cache_file)

    embedding_dict = {}
    with open(GLOVE_VECTORS_FILE_LOC, "r", encoding="utf-8") as glove_vectors_file:
        for line in tqdm(glove_vectors_file.readlines()):
            fields = line.split()
            embedding_dict[fields[0]] = np.asarray(fields[1:], dtype=np.float64)

    # Fallback for out-of-vocabulary words, looked up once instead of per word.
    unknown_vector = embedding_dict["unk"]

    vocab_size = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocab_size, 100))
    for word, index in tokenizer.word_index.items():
        embedding_matrix[index, :] = embedding_dict.get(word, unknown_vector)

    with open(embedding_mat_file_location, "wb") as cache_file:
        pickle.dump(embedding_matrix, cache_file)
    return embedding_matrix

In [None]:
# Matrix passed as initial weights to the Embedding layers of the text models.
embedding_matrix = prepare_embedding_matrix()

In [None]:
def prepare_caption_dict(caption_file_location=CAPTIONS_FILE_LOC, override=False):
    """Parse the caption file into ``{label: {caption_index: [words]}}``, with a pickle cache.

    The first whitespace-separated field of each line is split on ``"."``: the
    part before the first dot is the label, and the integer after ``"#"`` in the
    next part is the caption index. Remaining fields are the lower-cased words.

    Args:
        caption_file_location: Path of the caption file.
        override: When True, ignore an existing cache at
            ``CAPTIONS_DICT_FILE_LOC`` and rebuild it.

    Returns:
        Nested dict mapping label -> caption index -> list of lower-cased words.
    """
    if path.exists(CAPTIONS_DICT_FILE_LOC) and not override:
        # NOTE: pickle.load can execute arbitrary code; only load caches you created.
        with open(CAPTIONS_DICT_FILE_LOC, "rb") as cache_file:
            return pickle.load(cache_file)

    caption_dict = {}
    with open(caption_file_location, "r", encoding="utf-8") as caption_file:
        for line in tqdm(caption_file.readlines()):
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            head = fields[0].split(".")
            label = head[0]
            index = int(head[1].split("#")[1])
            words = [word.lower() for word in fields[1:]]
            caption_dict.setdefault(label, {})[index] = words

    with open(CAPTIONS_DICT_FILE_LOC, "wb") as cache_file:
        pickle.dump(caption_dict, cache_file)
    return caption_dict

In [None]:
# label -> caption index -> list of lower-cased words; read by the text-batch helpers.
caption_dict = prepare_caption_dict()

In [None]:
# Pre-computed dataset split, indexed below with "train", "val" and "test".
# NOTE: pickle.load can execute arbitrary code; only load a split file you trust.
with open(SPLIT_FILE_LOC, "rb") as split_file:
    splits = pickle.load(split_file)

In [None]:
# Word count of every caption, then the longest one becomes the padding length
# used by prepare_text_batch and the text models (overrides the value set above).
sentence_lengths = [
    len(words)
    for captions_by_index in caption_dict.values()
    for words in captions_by_index.values()
]
MAX_WORDS_IN_SENTENCE = max(sentence_lengths)
print(MAX_WORDS_IN_SENTENCE)

In [None]:
# inception_v3_feats = pickle.load(open(INCEPTION_FEAT_FILE_LOC, "rb"))

In [None]:
# model = tf.keras.models.Sequential()
# model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=100, weights=[embedding_matrix], trainable=False))
# model.add(tf.keras.layers.LSTM(10))

In [None]:
# enc = tokenizer.texts_to_sequences([["the", "dog", "river"], ["two", "men", "in", "a", "gray"]])
# padded_docs = pad_sequences(enc, maxlen=100, padding='post')

In [None]:
# from tensorflow.keras import layers
# from tensorflow.keras import Model

# from tensorflow.keras.applications.inception_v3 import InceptionV3

# WEIGHTS_FILE = './inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5'

# inception_v3_model = InceptionV3(
#     input_shape = (224, 224, 3), 
#     include_top = False, 
#     weights = 'imagenet'
# )

# # Not required --> inception_v3_model.load_weights(WEIGHTS_FILE)

# # Enabling the top 2 inception blocks to train
# for layer in model.layers[:249]:
#     layer.trainable = False
# for layer in model.layers[249:]:
#     layer.trainable = True
    
# # Checking model summary to pick a layer (if required)
# inception_v3_model.summary()

In [None]:
def prepare_image(image_path, image_size=IMAGE_SIZE):
    """Read an image file and return it as a square, 3-channel tensor scaled by 1/255.

    Args:
        image_path: Path of the image file to read.
        image_size: Target height and width passed to ``tf.image.resize``.
            The default is the value ``IMAGE_SIZE`` had when this cell ran.

    Returns:
        The resized image divided by 255.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_image(raw_bytes, channels=3)
    target_size = [image_size, image_size]
    resized = tf.image.resize(decoded, target_size)
    return resized / 255.

In [None]:
def cosine_similarity(a, b):
    """Contract ``a`` with ``b`` using ``tf.tensordot(..., axes=1)``.

    No normalisation is applied here, so the result is a plain dot product; it
    only equals the cosine similarity when both inputs already have unit norm.
    """
    dot_product = tf.tensordot(a, b, axes=1)
    return dot_product

In [None]:
def triplet_ranking_loss(image_pred, text_pred, margin=MARGIN):
    """Bidirectional hinge-based triplet ranking loss over one batch.

    Row ``i`` of ``image_pred`` and row ``i`` of ``text_pred`` form the positive
    pair; every other row of the batch is used as a negative. Similarity is the
    dot product of the two embedding rows. For every ``i != j`` the loss adds

        max(0, <image_i, text_j> - <image_i, text_i> + margin)   (image anchor)
        max(0, <image_j, text_i> - <image_i, text_i> + margin)   (text anchor)

    and the total is divided by the batch size.

    This computes the same quantity as the former nested Python loops, but from
    a single similarity matrix instead of roughly 2*n*n separate TensorFlow ops.

    Args:
        image_pred: 2-D array/tensor of image embeddings, one row per sample.
        text_pred: 2-D array/tensor of text embeddings, same shape and dtype.
        margin: Hinge margin. NOTE: the default is the value ``MARGIN`` had when
            this cell was executed; pass it explicitly if ``MARGIN`` is changed later.

    Returns:
        Scalar tensor with the summed hinge terms divided by the batch size.
    """
    n = image_pred.shape[0]

    # sims[i, j] = <image_i, text_j>; the diagonal holds the positive pairs.
    sims = tf.matmul(image_pred, text_pred, transpose_b=True)
    pos_sim = tf.linalg.diag_part(sims)

    # Image as anchor: compare each row against that row's positive.
    im_cost = tf.maximum(0.0, sims - pos_sim[:, tf.newaxis] + margin)
    # Text as anchor: compare each column against that column's positive.
    txt_cost = tf.maximum(0.0, sims - pos_sim[tf.newaxis, :] + margin)

    # The i == j terms are excluded, exactly like the `continue` in the loop form.
    off_diagonal = 1.0 - tf.eye(n, dtype=sims.dtype)
    loss = tf.reduce_sum((im_cost + txt_cost) * off_diagonal)
    return loss / n

In [None]:
def prepare_image_batch(samples, im_size=IMAGE_SIZE):
    """Load the images for ``samples`` into one array of shape (len(samples), im_size, im_size, 3).

    Args:
        samples: Sequence of image labels; each is loaded from
            ``DATASET_DIR + label + ".jpg"``.
        im_size: Side length every image is resized to. The default is the
            value ``IMAGE_SIZE`` had when this cell ran.

    Returns:
        ``numpy`` array holding one prepared image per label, in input order.
    """
    batch_shape = (len(samples), im_size, im_size, 3)
    im_batch = np.zeros(batch_shape)
    for row, label in enumerate(samples):
        image_path = DATASET_DIR + label + ".jpg"
        im_batch[row] = prepare_image(image_path, im_size)
    return im_batch

In [None]:
def prepare_image_feat_batch(samples):
    """Stack the pre-computed 2048-wide feature vectors of ``samples`` into one array.

    Looks each label up in the global ``inception_v3_feats``.
    NOTE(review): in the cells shown here that global is only loaded by a
    commented-out line, so calling this raises NameError unless it is restored.

    Returns:
        ``numpy`` array of shape ``(len(samples), 2048)``.
    """
    feature_rows = np.zeros((len(samples), 2048))
    for row, label in enumerate(samples):
        feature_rows[row] = inception_v3_feats[label]
    return feature_rows

In [None]:
def prepare_text_batch(samples):
    """Pick one random caption per label and return them as padded index sequences.

    Args:
        samples: Sequence of image labels present in the global ``caption_dict``.

    Returns:
        Integer array of shape ``(len(samples), MAX_WORDS_IN_SENTENCE)``: the
        tokenizer's word indices, zero-padded at the end.
    """
    captions = []
    for label in samples:
        captions_for_image = caption_dict[label]
        # Choose among the caption indices that actually exist for this image
        # rather than assuming exactly five captions numbered 0-4.
        random_index = choice(list(captions_for_image))
        captions.append(captions_for_image[random_index])
    encoded_captions = tokenizer.texts_to_sequences(captions)
    padded_captions = pad_sequences(encoded_captions, maxlen=MAX_WORDS_IN_SENTENCE, padding="post")
    return padded_captions

In [None]:
def prepare_text_batch_one_hot(samples):
    """Pick one random caption per label and return them as a binary word-presence matrix.

    Args:
        samples: Sequence of image labels present in the global ``caption_dict``.

    Returns:
        The result of ``tokenizer.texts_to_matrix(captions, mode="binary")``,
        one row per label.
    """
    captions = []
    for label in samples:
        captions_for_image = caption_dict[label]
        # Choose among the caption indices that actually exist for this image
        # rather than assuming exactly five captions numbered 0-4.
        random_index = choice(list(captions_for_image))
        captions.append(captions_for_image[random_index])
    encoded_captions = tokenizer.texts_to_matrix(captions, mode="binary")
    return encoded_captions

In [None]:
# Smoke test: load 64 random training images.
# NOTE(review): despite its name, `padded_caps` holds an image batch, not captions.
padded_caps = prepare_image_batch(sample(splits["train"], 64))

In [None]:
# Display the third image of the batch loaded above.
plt.imshow(padded_caps[2])

In [None]:
# Image encoder: InceptionV3 backbone + dense head producing unit-length embeddings.
# # Loading inception v3 network for transfer learning
# from tensorflow.keras import layers
# from tensorflow.keras import Model
# from tensorflow.keras.models import Sequential

from tensorflow.keras.applications.inception_v3 import InceptionV3

# Input resolution of the backbone; image batches fed to image_model must use the same size.
IMAGE_SIZE = 128

# Backbone with ImageNet weights and without the classification top.
inception_v3_model = InceptionV3(
    input_shape = (IMAGE_SIZE, IMAGE_SIZE, 3), 
    include_top = False, 
    weights = 'imagenet'
)

# The freezing code below is commented out, so no layer's `trainable` flag is changed here.
# Enabling the top 2 inception blocks to train
# for layer in inception_v3_model.layers[:249]:
#     layer.trainable = False
# for layer in inception_v3_model.layers[249:]:
#     layer.trainable = True

# Choosing the output layer to be merged with our FC layers (if required)
# NOTE(review): `inception_output_layer` is looked up but not used below; the head
# is attached to the backbone's final output instead.
inception_output_layer = inception_v3_model.get_layer('mixed7')
# print('Inception model output shape:', inception_output_layer.output_shape)

inception_output = inception_v3_model.output

# Head: global average pooling, two ReLU dense layers, linear projection to
# EMBEDDING_OUT_LENGTH, then L2 normalisation along the last axis.
x = layers.GlobalAveragePooling2D()(inception_output)
x = layers.Dense(512, activation="relu")(x)
x = layers.Dense(256, activation="relu")(x)
x = layers.Dense(EMBEDDING_OUT_LENGTH)(x)
x = layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=-1))(x)

image_model = Model(inception_v3_model.input, x)
# Earlier variant operating on pre-computed 2048-wide features (disabled):
# image_model = Sequential()
# image_model.add(tf.keras.Input(shape=(2048,)))
# image_model.add(layers.Dense(1024, activation="relu"))
# image_model.add(layers.Dense(1024, activation="relu"))
# image_model.add(layers.Dense(EMBEDDING_OUT_LENGTH))
# image_model.add(layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=1)))

In [None]:
# np.sum(image_model(prepare_image_batch(sample(splits["train"], 64), 32))[0] ** 2)
# # image_model(prepare_image(DATASET_DIR + "36979" + ".jpg", 256))

In [None]:
# Text encoder: frozen GloVe embedding -> Conv1D -> average over time -> dense head
# -> L2-normalised embedding of width EMBEDDING_OUT_LENGTH.
# The commented-out lines are earlier variants (one-hot input, LSTM, other convolutions).
text_model = Sequential()
# text_model.add(layers.Input(vocab_size,))
# text_model.add(layers.Dense(512))
# text_model.add(layers.Dense(256))
# text_model.add(layers.Dense(256))
# text_model.add(layers.Dense(EMBEDDING_OUT_LENGTH))
# text_model.add(layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=1)))
# Embedding initialised from embedding_matrix and kept fixed (trainable=False).
text_model.add(layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=MAX_WORDS_IN_SENTENCE, weights=[embedding_matrix], trainable=False))
# text_model.add(layers.LSTM(32))
# text_model.add(layers.Dense(512, activation="relu"))
# text_model.add(layers.Dense(256, activation="relu"))
# text_model.add(layers.Dense(EMBEDDING_OUT_LENGTH))
# text_model.add(layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=1)))

# text_model.add(layers.Conv1D(128, 3))
# # text_model.add(layers.Bidirectional(layers.LSTM(256, return_sequences=True)))
text_model.add(layers.Conv1D(32, 3, padding="same"))
text_model.add(layers.GlobalAvgPool1D())
text_model.add(layers.Dense(512, activation="relu"))
text_model.add(layers.Dense(256, activation="relu"))
text_model.add(layers.Dense(EMBEDDING_OUT_LENGTH))
text_model.add(layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=1)))

In [None]:
# Smoke test: embed the captions of 64 random training images and show the output shape.
text_model(prepare_text_batch(sample(splits["train"], 64))).shape

In [None]:
# Sanity check: squared L2 norm of the first text embedding (the model ends in l2_normalize).
# The sample count is a batch size, so BATCH_SIZE is used instead of the caption length constant.
np.sum(text_model(prepare_text_batch(sample(splits["train"], BATCH_SIZE)))[0] ** 2)

In [None]:
def calculate_loss_and_gradient(image_input, text_input, image_model=image_model, text_model=text_model, margin=MARGIN):
    """Run one forward pass in training mode and return the loss with both models' gradients.

    Args:
        image_input: Batch of images accepted by ``image_model``.
        text_input: Batch of padded caption index sequences accepted by ``text_model``.
        image_model: Image encoder. NOTE: the default is the model object that
            existed when this cell ran; pass the model explicitly after rebuilding it.
        text_model: Text encoder (same caveat for the default).
        margin: Margin forwarded to ``triplet_ranking_loss``.

    Returns:
        Tuple ``(loss, image_grad, text_grad)`` where the gradient lists line up
        with ``image_model.trainable_variables`` and
        ``text_model.trainable_variables`` respectively.
    """
    # Trainable variables are watched automatically, so a single tape suffices
    # and the inputs do not have to be converted or watched.
    with tf.GradientTape() as tape:
        image_pred = image_model(image_input, training=True)
        text_pred = text_model(text_input, training=True)
        loss = triplet_ranking_loss(image_pred, text_pred, margin)

    # Differentiate after leaving the tape context so the backward pass itself
    # is not recorded. The result mirrors the nested structure of the sources.
    image_grad, text_grad = tape.gradient(
        loss,
        [image_model.trainable_variables, text_model.trainable_variables],
    )
    return loss, image_grad, text_grad

In [None]:
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = tf.optimizers.Adam(learning_rate=0.001)

In [None]:
# Per-iteration loss histories appended to by the training loop below.
train_losses = []
val_losses = []

In [None]:
MARGIN = 0.7
ITERATIONS = 2000
BATCH_SIZE = 32
IMAGE_SIZE = 128

for i in range(ITERATIONS):
    # Training step on a random batch of training labels.
    train_samples = sample(splits["train"], BATCH_SIZE)
    train_image_batch = prepare_image_batch(train_samples, IMAGE_SIZE)
    train_text_batch = prepare_text_batch(train_samples)
    train_loss, image_grad, text_grad = calculate_loss_and_gradient(train_image_batch, train_text_batch, image_model, text_model, MARGIN)
    # Update both models in one call: the optimizer then sees a single, stable
    # variable set and advances its step counter once per iteration.
    optimizer.apply_gradients(zip(
        list(image_grad) + list(text_grad),
        list(image_model.trainable_variables) + list(text_model.trainable_variables),
    ))

    # Validation loss on a random validation batch (no weight update).
    val_samples = sample(splits["val"], BATCH_SIZE)
    val_image_batch = prepare_image_batch(val_samples, IMAGE_SIZE)
    val_text_batch = prepare_text_batch(val_samples)
    val_image_pred = image_model(val_image_batch, training=False)
    val_text_pred = text_model(val_text_batch, training=False)
    # Pass MARGIN explicitly: the function's default was fixed when it was defined.
    val_loss = triplet_ranking_loss(val_image_pred, val_text_pred, MARGIN)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print("iteration:", (i+1), end=" ")
    tf.print("train_loss:", train_loss, "val_loss:", val_loss, output_stream=sys.stdout)

In [None]:
# Reference embeddings for the test split computed from pre-computed 2048-wide features.
# NOTE(review): `inception_v3_feats` is only loaded by a commented-out line in the
# cells shown here, so this cell raises NameError unless that load is restored.
# It also feeds (N, 2048) feature rows to `image_model`, which is built on image
# input above — this looks like a leftover from the disabled feature-based head.
# The following cell recomputes `reference_outs` from the raw images.
test_inputs = np.zeros((len(splits["test"]), 2048))
for i, label in enumerate(splits["test"]):
    test_inputs[i] = inception_v3_feats[label]
test_image_outputs = image_model.predict(test_inputs)
del test_inputs
reference_outs = {}
for i, label in enumerate(splits["test"]):
    reference_outs[label] = test_image_outputs[i]

In [None]:
# Embed every test image once; `reference_outs` maps label -> image embedding
# and is what text queries are ranked against.
test_inputs = np.zeros((len(splits["test"]), IMAGE_SIZE, IMAGE_SIZE, 3))
for i, label in enumerate(splits["test"]):
    image_path = DATASET_DIR + label + ".jpg"
    test_inputs[i] = prepare_image(image_path, IMAGE_SIZE)
test_image_outputs = image_model.predict(test_inputs)
del test_inputs  # free the raw image array
reference_outs = dict(zip(splits["test"], test_image_outputs))

In [None]:
# Number of test images with a reference embedding.
len(reference_outs)

In [None]:
# Text-to-image retrieval: embed a free-text query and rank every test image by
# the dot product between the query embedding and the image embedding.
text_query = "dogs playing with each other .".split()
tokenized = tokenizer.texts_to_sequences([text_query])
padded = pad_sequences(tokenized, maxlen=MAX_WORDS_IN_SENTENCE, padding="post")
text_output = text_model.predict(padded)
print(text_output)
# Entries are [similarity, label]; highest similarity first.
sims = sorted(
    ([np.sum(text_output * reference_outs[label]), label] for label in reference_outs),
    reverse=True,
)

In [None]:
# Ten best [similarity, label] matches for the query.
sims[:10]

In [None]:
# Show the image at rank index 7 of the sorted matches, resized to 512x512.
# for i, out in enumerate(sims):
plt.imshow(prepare_image(DATASET_DIR + sims[7][1] + ".jpg", 512))
# if i is 10:
#     break

In [None]:
# Scratch re-computation of the triplet ranking loss, same structure as
# triplet_ranking_loss but using cosine_similarity and Python's max().
# NOTE(review): `im_out` and `txt_out` are not defined in any cell shown here,
# so this cell raises NameError on a fresh kernel — define them or remove the cell.
n = im_out.shape[0]

# Image as anchor: penalise non-matching captions that score within MARGIN of the match.
im_loss = 0.0
for i in range(n):
    im_pos_sim = cosine_similarity(im_out[i], txt_out[i])
    for j in range(n):
        if i == j:
            continue
        im_neg_sim = cosine_similarity(im_out[i], txt_out[j])
        im_loss += max(0, im_neg_sim - im_pos_sim + MARGIN)

# Caption as anchor: same penalty over non-matching images.
txt_loss = 0.0
for i in range(n):
    txt_pos_sim = cosine_similarity(im_out[i], txt_out[i])
    for j in range(n):
        if i == j:
            continue
        txt_neg_sim = cosine_similarity(im_out[j], txt_out[i])
        txt_loss += max(0, txt_neg_sim - txt_pos_sim + MARGIN)

loss = im_loss + txt_loss
loss / n


In [None]:
# Number of labels in the training split.
len(splits["train"])

In [None]:
# Similarity of the first image/text pair.
# NOTE(review): depends on `im_out` / `txt_out`, which no cell shown here defines.
cosine_similarity(im_out[0], txt_out[0])

In [None]:
# Embed two random training images and show the first embedding.
# NOTE(review): the second argument of prepare_image_batch is the image size, so
# this resizes to 2x2 pixels — probably not intended; confirm.
image_model(prepare_image_batch(sample(splits["train"], 2), 2))[0]

In [None]:
# label -> image embedding, filled by the cells below.
embeddings = {}

In [None]:
# Embed every training image one at a time (batch of size 1, resized to 256x256).
# NOTE(review): 256 differs from the IMAGE_SIZE the model above was built with — confirm.
for label in tqdm(splits["train"]):
    embeddings[label] = image_model(np.array([prepare_image(DATASET_DIR + label + ".jpg", 256)]))

In [None]:
# Same as the training-split cell, for the validation split.
# NOTE(review): 256 differs from the IMAGE_SIZE the model above was built with — confirm.
for label in tqdm(splits["val"]):
    embeddings[label] = image_model(np.array([prepare_image(DATASET_DIR + label + ".jpg", 256)]))

In [None]:
# Same as the training-split cell, for the test split.
# NOTE(review): 256 differs from the IMAGE_SIZE the model above was built with — confirm.
for label in tqdm(splits["test"]):
    embeddings[label] = image_model(np.array([prepare_image(DATASET_DIR + label + ".jpg", 256)]))

In [None]:
# Number of images embedded so far.
len(embeddings)

In [None]:
# Embed the training images in chunks of `step` labels and store one embedding per label.
step = 100
train = splits["train"]
# Stepping with range() ends cleanly: no empty trailing batch when len(train)
# is an exact multiple of `step`, and no manual loop flag is needed.
for i in range(0, len(train), step):
    if i % 1000 == 0:
        print(i)
    samples = train[i:i + step]
    # prepare_image_batch takes (samples, im_size); the former extra positional
    # argument raised TypeError.
    # NOTE(review): 256 differs from the IMAGE_SIZE the model above was built with — confirm.
    batch = prepare_image_batch(samples, 256)
    out = image_model(batch)
    for j in range(len(samples)):
        embeddings[samples[j]] = out[j]

In [None]:
# Number of stored embeddings after the batched pass.
len(embeddings)

In [None]:
# Scratch check of list slicing; not used by any other cell.
[1,2,3][1:]

In [None]:
def res_net_block_for_image(input_data, filters, conv_size):
    """Residual block for 2-D feature maps.

    Applies Conv2D(relu) -> BatchNorm -> Conv2D(linear) -> BatchNorm, adds the
    block input back and finishes with a ReLU. Both convolutions use 'same'
    padding; ``filters`` must match the channel count of ``input_data`` for the
    addition to be valid.

    Args:
        input_data: Keras tensor entering the block.
        filters: Number of filters of both convolutions.
        conv_size: Kernel size of both convolutions.

    Returns:
        Keras tensor produced by the block.
    """
    branch = layers.Conv2D(filters, conv_size, activation='relu', padding='same')(input_data)
    branch = layers.BatchNormalization()(branch)
    branch = layers.Conv2D(filters, conv_size, activation=None, padding='same')(branch)
    branch = layers.BatchNormalization()(branch)
    # Skip connection around the two convolutions.
    merged = layers.Add()([branch, input_data])
    return layers.Activation('relu')(merged)

In [None]:
def res_net_block_for_text(input_data, filters, conv_size):
    """Residual block for 1-D sequences.

    Applies Conv1D(relu) -> BatchNorm -> Conv1D(linear) -> BatchNorm, adds the
    block input back and finishes with a ReLU. Both convolutions use 'same'
    padding; ``filters`` must match the channel count of ``input_data`` for the
    addition to be valid.

    Args:
        input_data: Keras tensor entering the block.
        filters: Number of filters of both convolutions.
        conv_size: Kernel size of both convolutions.

    Returns:
        Keras tensor produced by the block.
    """
    branch = layers.Conv1D(filters, conv_size, activation='relu', padding='same')(input_data)
    branch = layers.BatchNormalization()(branch)
    branch = layers.Conv1D(filters, conv_size, activation=None, padding='same')(branch)
    branch = layers.BatchNormalization()(branch)
    # Skip connection around the two convolutions.
    merged = layers.Add()([branch, input_data])
    return layers.Activation('relu')(merged)

In [None]:
# Image model
# Small residual CNN trained from scratch; replaces the `image_model` defined earlier.
# The input resolution is fixed to the value IMAGE_SIZE has when this cell runs.
image_input = tf.keras.Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3))
# Stem: two unpadded 3x3 convolutions followed by 3x3 max pooling.
x = layers.Conv2D(32, 3, activation='relu')(image_input)
x = layers.Conv2D(64, 3, activation='relu')(x)
x = layers.MaxPooling2D(3)(x)

# Stack of residual blocks working on 64 channels.
num_res_net_blocks = 4
for i in range(num_res_net_blocks):
    x = res_net_block_for_image(x, 64, 3)

# Head: convolution, global average pooling, dense + dropout, linear projection
# to EMBEDDING_OUT_LENGTH, then L2 normalisation of each embedding.
x = layers.Conv2D(64, 3, activation='relu')(x)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(EMBEDDING_OUT_LENGTH)(x)
image_output = layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=1))(x)

image_model = Model(image_input, image_output)

In [None]:
# Sanity check: squared L2 norm of the first image embedding (the model ends in l2_normalize).
np.sum(image_model(prepare_image_batch(sample(splits["train"], 64), IMAGE_SIZE))[0] ** 2)

In [None]:
# Text model: residual 1-D CNN over frozen GloVe embeddings; replaces the
# Sequential `text_model` defined earlier.
# `shape` is given as a tuple (one padded caption of MAX_WORDS_IN_SENTENCE word
# indices); a bare int is not a documented shape value.
text_input = tf.keras.Input(shape=(MAX_WORDS_IN_SENTENCE,))
y = layers.Embedding(input_dim=vocab_size, output_dim=100, input_length=MAX_WORDS_IN_SENTENCE, weights=[embedding_matrix], trainable=False)(text_input)
# Stem: two unpadded width-3 convolutions followed by max pooling over 3 steps.
y = layers.Conv1D(32, 3, activation='relu')(y)
y = layers.Conv1D(64, 3, activation='relu')(y)
y = layers.MaxPooling1D(3)(y)

# Stack of residual blocks working on 64 channels.
num_res_net_blocks = 4
for i in range(num_res_net_blocks):
    y = res_net_block_for_text(y, 64, 3)

# Head: convolution, global average pooling, dense + dropout, linear projection
# to EMBEDDING_OUT_LENGTH, then L2 normalisation of each embedding.
y = layers.Conv1D(64, 3, activation='relu')(y)
y = layers.GlobalAveragePooling1D()(y)
y = layers.Dense(256, activation='relu')(y)
y = layers.Dropout(0.5)(y)
y = layers.Dense(EMBEDDING_OUT_LENGTH)(y)
text_output = layers.Lambda(lambda y: tf.keras.backend.l2_normalize(y, axis=1))(y)

text_model = Model(text_input, text_output)

In [None]:
# Sanity check: squared L2 norm of the first text embedding (the model ends in l2_normalize).
np.sum(text_model(prepare_text_batch(sample(splits["train"], BATCH_SIZE)))[0] ** 2)
# text_model(prepare_text_batch(sample(splits["train"], BATCH_SIZE))).shape

In [None]:
def calculate_loss_and_gradient(image_input, text_input, image_model=image_model, text_model=text_model, margin=MARGIN):
    """Run one forward pass in training mode and return the loss with both models' gradients.

    This cell re-defines the function so that its default arguments pick up the
    models built just above (defaults are evaluated when the ``def`` runs).

    Args:
        image_input: Batch of images accepted by ``image_model``.
        text_input: Batch of padded caption index sequences accepted by ``text_model``.
        image_model: Image encoder; defaults to the model existing when this cell ran.
        text_model: Text encoder; defaults to the model existing when this cell ran.
        margin: Margin forwarded to ``triplet_ranking_loss``.

    Returns:
        Tuple ``(loss, image_grad, text_grad)`` where the gradient lists line up
        with ``image_model.trainable_variables`` and
        ``text_model.trainable_variables`` respectively.
    """
    # Trainable variables are watched automatically, so a single tape suffices
    # and the inputs do not have to be converted or watched.
    with tf.GradientTape() as tape:
        image_pred = image_model(image_input, training=True)
        text_pred = text_model(text_input, training=True)
        loss = triplet_ranking_loss(image_pred, text_pred, margin)

    # Differentiate after leaving the tape context so the backward pass itself
    # is not recorded. The result mirrors the nested structure of the sources.
    image_grad, text_grad = tape.gradient(
        loss,
        [image_model.trainable_variables, text_model.trainable_variables],
    )
    return loss, image_grad, text_grad

In [None]:
# Fresh optimizer for the second pair of models.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = tf.optimizers.Adam(learning_rate=0.0001)

In [None]:
# Reset the loss histories for the second training run.
train_losses = []
val_losses = []

In [None]:
MARGIN = 0.5
ITERATIONS = 15000
BATCH_SIZE = 16
# NOTE(review): image_model's input shape was fixed by the value IMAGE_SIZE had
# when the model cell ran. If that was not 256, Keras versions that validate the
# input shape reject these batches — confirm the intended resolution and rebuild
# the model with it if needed.
IMAGE_SIZE = 256

for i in range(ITERATIONS):
    # Training step on a random batch of training labels.
    # train_start_index = (i * BATCH_SIZE) % len(splits["train"])
    # train_end_index = (train_start_index + BATCH_SIZE) if train_start_index + BATCH_SIZE < len(splits["train"]) else len(splits["train"])
    train_samples = sample(splits["train"], BATCH_SIZE) # [train_start_index:train_end_index]
    train_image_batch = prepare_image_batch(train_samples, IMAGE_SIZE)
    train_text_batch = prepare_text_batch(train_samples)
    train_loss, image_grad, text_grad = calculate_loss_and_gradient(train_image_batch, train_text_batch, image_model, text_model, MARGIN)
    # Update both models in one call: the optimizer then sees a single, stable
    # variable set and advances its step counter once per iteration.
    optimizer.apply_gradients(zip(
        list(image_grad) + list(text_grad),
        list(image_model.trainable_variables) + list(text_model.trainable_variables),
    ))

    # Validation loss on a random validation batch (no weight update).
    # val_start_index = (i * BATCH_SIZE) % len(splits["val"])
    # val_end_index = (val_start_index + BATCH_SIZE) if val_start_index + BATCH_SIZE < len(splits["val"]) else len(splits["val"])
    val_samples = sample(splits["val"], BATCH_SIZE) # [val_start_index:val_end_index]
    val_image_batch = prepare_image_batch(val_samples, IMAGE_SIZE)
    val_text_batch = prepare_text_batch(val_samples)
    val_image_pred = image_model(val_image_batch, training=False)
    val_text_pred = text_model(val_text_batch, training=False)
    # Pass MARGIN explicitly: the loss function's default margin was fixed when it
    # was defined, so relying on it would score validation with a different margin
    # than training.
    val_loss = triplet_ranking_loss(val_image_pred, val_text_pred, MARGIN)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    print("iteration:", i+1, "train_loss:", train_loss, "val_loss:", val_loss)

In [None]:
# Persist both trained models and the recorded loss curves.
# Create the output directories first so a fresh checkout does not fail on save.
makedirs("./models", exist_ok=True)
makedirs("./losses", exist_ok=True)

image_model.save("./models/image_model_v1.h5")
text_model.save("./models/text_model_v1.h5")

losses = {
    "train": train_losses,
    "val": val_losses
}
# Context manager guarantees the file is flushed and closed.
with open("./losses/losses_v1.pickle", "wb") as losses_file:
    pickle.dump(losses, losses_file)

In [None]:
# Training loss per iteration.
plt.plot(train_losses)

In [None]:
# Validation loss per iteration (each point is one random validation batch).
plt.plot(val_losses)

In [None]:
# Embed every test image with the newly trained model; `reference_outs` maps
# label -> image embedding and is what text queries are ranked against.
test_inputs = np.zeros((len(splits["test"]), IMAGE_SIZE, IMAGE_SIZE, 3))
for i, label in enumerate(splits["test"]):
    image_path = DATASET_DIR + label + ".jpg"
    test_inputs[i] = prepare_image(image_path, IMAGE_SIZE)
test_image_outputs = image_model.predict(test_inputs)
del test_inputs  # free the raw image array
reference_outs = dict(zip(splits["test"], test_image_outputs))

In [None]:
# Text-to-image retrieval with a random caption of a random test image as the query.
# text_query = "people are running .".split()
text_query = caption_dict[choice(splits["test"])][choice([0,1,2,3,4])]
tokenized = tokenizer.texts_to_sequences([text_query])
padded = pad_sequences(tokenized, maxlen=MAX_WORDS_IN_SENTENCE, padding="post")
text_output = text_model.predict(padded)
# Entries are [similarity, label]; highest similarity first.
sims = sorted(
    ([np.sum(text_output * reference_outs[label]), label] for label in reference_outs),
    reverse=True,
)

In [None]:
# The caption (list of words) that was used as the query.
text_query

In [None]:
# Ten best [similarity, label] matches for the query.
sims[:10]

In [None]:
# Show the image at rank index 6 of the sorted matches, resized to 512x512.
plt.imshow(prepare_image(DATASET_DIR + sims[6][1] + ".jpg", 512))

In [None]:
# Inspect the first text embedding of a random training batch.
# The sample count is a batch size, so BATCH_SIZE is used instead of the caption length constant.
text_model(prepare_text_batch(sample(splits["train"], BATCH_SIZE)))[0]