In [1]:
import numpy as np
import pandas as pd

In [2]:
def load(captions_filename, features_filename):
  features = pd.read_csv(features_filename, sep=',',header=None)
  texts = []
  with open(captions_filename) as fp:
    for line in fp:
      tokens = line.strip().split()
      texts.append(' '.join(tokens[1:]))
  return features, texts

In [3]:
features, texts = load('../input/train-1000-recipe-images/rec_imgID_train_1000.txt', '../input/train-1000-recipe-images/train_image_encoding1000.csv')
print(texts[42])

Curly Q Noodle Chicken Soup


Making Vocab

In [4]:
vocab_texts = []
with open('../input/train-1000-recipe-images/rec_imgID_train_1000.txt') as fp:
    for line in fp:
      tokens = line.strip().split()
      vocab_texts.append(' '.join(tokens[1:]))
        
for i in range(1999,24000,1000):
    with open('../input/train-labels/train_label_' + str(i) + '.txt') as fp:
        for line in fp:
          tokens = line.strip().split()
          vocab_texts.append(' '.join(tokens[1:]))

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocab_texts)

vocab = tokenizer.word_index
vocab['<eos>'] = 0 # add word with id 0
print('pizza', vocab['pizza'])
import json
with open('vocab.json', 'w') as fp: # save the vocab
    fp.write(json.dumps(vocab))

pizza 52


In [6]:
sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

[[   0    0    0 ...   22    3 2276]
 [   0    0    0 ...  298 1582    8]
 [   0    0    0 ...    0  140  809]
 ...
 [   0    0    0 ...  956  130    8]
 [   0    0    0 ...    1 1787   33]
 [   0    0    0 ...  232    6    9]]


In [7]:
import fork_of_embedding_loader_keras as embedding
embedding_weights = embedding.load(vocab, 300, '../input/glove840b300dtxt/glove.840B.300d.txt')
print('pizza', embedding_weights[vocab['pizza']][:20])

loading embeddings from "../input/glove840b300dtxt/glove.840B.300d.txt"


pizza [ 0.0068727 -0.21634    0.27831   -0.26192    0.22884    0.89332
  0.4131     0.27377    0.22652    1.5041    -0.58059    0.56083
 -0.18432    0.27738   -0.10709   -0.13519    0.023817   1.1765
 -0.12659    0.043173 ]


In [8]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, GRU, Dot, Concatenate

def make_model():
  # define inputs
  image_input = Input(shape=(100352,))
  caption_input = Input(shape=(16,))
  noise_input = Input(shape=(16,))
  
  # build caption representation
  caption_embedding = Embedding(len(vocab), 300, input_length=16, weights=[embedding_weights])
  caption_rnn = GRU(256)
  caption_pipeline = caption_rnn(caption_embedding(caption_input))
  
  # noise caption representation
  noise_pipeline = caption_rnn(caption_embedding(noise_input))
  
  # image representation
  image_dense = Dense(256, activation='tanh')
  image_pipeline = image_dense(image_input)

  # dot product between
  positive_pair = Dot(1)([image_pipeline, caption_pipeline])
  negative_pair = Dot(1)([image_pipeline, noise_pipeline])
  output = Concatenate()([positive_pair, negative_pair])
  
  # make one model for training, and models for running the text or image 
  training_model = Model(inputs=[image_input, caption_input, noise_input], outputs=output)
  image_model = Model(inputs=image_input, outputs=image_pipeline)
  caption_model = Model(inputs=caption_input, outputs=caption_pipeline)
  return training_model, image_model, caption_model

In [9]:
from keras import backend as K

def custom_loss(y_true, y_pred):
  positive = y_pred[:,0]
  negative = y_pred[:,1]
  return K.sum(K.maximum(0., 1. - positive + negative))

In [10]:
def accuracy(y_true, y_pred):
  positive = y_pred[:,0]
  negative = y_pred[:,1]
  return K.mean(positive > negative)

In [11]:
training_model, image_model, caption_model = make_model()
training_model.compile(loss=custom_loss, optimizer='adam', metrics=['accuracy'])

In [12]:
noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

In [13]:
print(X_train[0].shape, X_train[1].shape, Y_train.shape, X_valid[0].shape, X_valid[1].shape, Y_valid.shape)

(900, 100352) (900, 16) (900, 1) (100, 100352) (100, 16) (100, 1)


In [14]:
for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)



In [15]:
# image_model.save('model.image', save_format='tf')
# caption_model.save('model.caption', save_format='tf')

In [16]:
# training_model.save('image-text-model', save_format='tf')

In [17]:
# import tensorflow as tf
# from tensorflow import keras
# model = tf.keras.models.load_model('../input/recipe1m-joint-image-text-keras/image-text-model')

Retrain on next 1000 

In [18]:
features, texts = load('../input/train-labels/train_label_1999.txt', '../input/train-2000-recipe-images/train_image_encoding2000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ...   91  376   27]
 [   0    0    0 ... 2330  359  113]
 [   0    0    0 ...   11 1354    3]
 ...
 [   0    0    0 ...  471  101   57]
 [   0    0    0 ...  706   62  280]
 [   0    0    0 ...  498    1 1503]]


Next 1000 Retrain

In [19]:
features, texts = load('../input/train-labels/train_label_2999.txt', '../input/train-3000-recipe-images/train_image_encoding3000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ... 1068  595  273]
 [   0    0    0 ... 4202  159  414]
 [   0    0    0 ...  798   16  275]
 ...
 [   0    0    0 ...  101    2   24]
 [   0    0    0 ...   26 2069   37]
 [   0    0    0 ... 2031  680  562]]


Next 1000 Retrain

In [20]:
features, texts = load('../input/train-labels/train_label_3999.txt', '../input/train-4000-recipes-images/train_image_encoding4000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ...   76   23    4]
 [   0    0    0 ...  186   52   10]
 [   0    0    0 ...  887 2070  218]
 ...
 [   0    0    0 ...    0 4574   90]
 [   0    0    0 ... 1545 3097 1639]
 [   0    0    0 ... 4575 4576  493]]


Next 1000 Retrain

In [21]:
features, texts = load('../input/train-labels/train_label_4999.txt', '../input/train-5000-recipes-images/train_image_encoding5000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[  0   0   0 ...  34 175 885]
 [  0   0   0 ... 454  25   9]
 [  0   0   0 ...  41 428 179]
 ...
 [  0   0   0 ...   1 108 535]
 [  0   0   0 ... 101  30  89]
 [  0   0   0 ...   2 692   7]]


6000

In [22]:
features, texts = load('../input/train-labels/train_label_5999.txt', '../input/train-6000-recipes-images/train_image_encoding6000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ...    2  272  158]
 [   0    0    0 ...  105   71    9]
 [   0    0    0 ...    0   84 4760]
 ...
 [   0    0    0 ...  149  321  227]
 [   0    0    0 ...  120   68  371]
 [   0    0    0 ...  263 1864 3043]]


7000

In [23]:
features, texts = load('../input/train-labels/train_label_6999.txt', '../input/train-7000-10000-recipes-images/train_image_encoding7000/train_image_encoding7000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ...    1    6 2511]
 [   0    0    0 ... 3216  166 3217]
 [   0    0    0 ...  765   92   80]
 ...
 [   0    0    0 ...    2  346 3279]
 [   0    0    0 ... 2145   17  216]
 [   0    0    0 ...  645  437  535]]


8000

In [24]:
features, texts = load('../input/train-labels/train_label_7999.txt', '../input/train-7000-10000-recipes-images/train_image_encoding8000/train_image_encoding8000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ...    0 3280  491]
 [   0    0    0 ...   51  428  179]
 [   0    0    0 ...   45   15  522]
 ...
 [   0    0    0 ...   38   18   10]
 [   0    0    0 ...   26 1388  196]
 [   0    0    0 ...    6  678   33]]


9000

In [25]:
features, texts = load('../input/train-labels/train_label_8999.txt', '../input/train-7000-10000-recipes-images/train_image_encoding9000/train_image_encoding9000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ... 2607 1154 1019]
 [   0    0    0 ... 5319   98  147]
 [   0    0    0 ...  927  460 3340]
 ...
 [   0    0    0 ...   26   16 1234]
 [   0    0    0 ...  246   79  446]
 [   0    0    0 ...    3  286   27]]


10000

In [26]:
features, texts = load('../input/train-labels/train_label_9999.txt', '../input/train-7000-10000-recipes-images/train_image_encoding10000/train_image_encoding10000.csv')

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)
print(captions)

noise = np.copy(captions)
fake_labels = np.zeros((len(features), 1))
X_train = [features[:900], captions[:900], noise[:900]]
Y_train = fake_labels[:900]
X_valid = [features[-100:], captions[-100:], noise[-100:]]
Y_valid = fake_labels[-100:]

for epoch in range(10):
  np.random.shuffle(noise) # don’t forget to shuffle mismatched captions
  training_model.fit(X_train, Y_train, validation_data=[X_valid, Y_valid], epochs=1, batch_size=64)

[[   0    0    0 ... 3348   50   33]
 [   0    0    0 ...   43  562   53]
 [   0    0    0 ...    5  541  485]
 ...
 [   0    0    0 ...    1  241   12]
 [   0    0    0 ...   49 5677   27]
 [   0    0    0 ...  608   19  666]]


In [27]:
image_model.save('10000-model.image')
caption_model.save('10000-model.caption')
training_model.save('10000-image-text-model')

In [28]:
# from tensorflow import keras
# reconstructed_model = keras.models.load_model("../input/recipe1m-joint-image-text-keras/image-text-model", compile=False)

Test on 1000 images

In [29]:
features = np.load('../input/train-1000-recipe-images/test_image_encoding_1000.npy')
texts = []
with open('../input/test-labels-recipe1m/test_label_999.txt') as fp:
    for line in fp:
      tokens = line.strip().split()
      texts.append(' '.join(tokens[1:]))

sequences = tokenizer.texts_to_sequences(texts)
captions = pad_sequences(sequences, maxlen=16)

In [30]:
caption_representations = caption_model.predict(captions)
image_representations = image_model.predict(features)
np.save('caption-test-1000-representations', caption_representations)
np.save('image-test-1000-representations', image_representations)

In [31]:
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
resnet_model = ResNet50(weights='imagenet', include_top=False)

def extract_features(img_path): # though feature vector for training images has been precalculated and present in a file still function is needed for unseen images
  img = image.load_img(img_path, target_size=(224, 224))
  x = image.img_to_array(img)
  x = np.expand_dims(x, axis=0)
  x = preprocess_input(x)
  features = resnet_model.predict(x)
  return np.expand_dims(features.flatten(), axis=0)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [32]:
def generate_caption(image_filename, n=10):
  # generate image representation for new image
  image_representation = image_model.predict(extract_features(image_filename))
  # compute score of all captions in the dataset
  scores = np.dot(caption_representations, image_representation.T).flatten()
  # compute indices of n best captions
  indices = np.argpartition(scores, -n)[-n:]
  indices = indices[np.argsort(scores[indices])]
  # display them
  for i in [int(x) for x in reversed(indices)]:
    print(scores[i], texts[i])

In [33]:
generate_caption('../input/recipe1m-test-jpg1/1bbb06bc58.jpg')
Image('../input/recipe1m-test-jpg1/1bbb06bc58.jpg')

0.6873505 Molasses Raisin Bread
0.20260657 Easy Chocolate-Caramel Brownies
0.16035119 Chocolate Brown-Sugar Brownies
0.1024929 Strawberry Nut Bread
0.07332119 Garlic Bread (Pane Strofinato All'Aglio)
0.0057524666 Hershey's Low Fat Banana Bread
0.002683401 Peanut Butter Crumb Topped Brownies
-0.038095728 Cheese and Salami Loaf
-0.04550138 Hearth Bread
-0.055847332 Delicious Pumpkin Bread


NameError: name 'Image' is not defined

In [34]:
# from keras.preprocessing.sequence import pad_sequences
# def preprocess_texts(texts):
#   output = []
#   for text in texts:
#     output.append([vocab[word] if word in vocab else 0 for word in text.split()])
#   return pad_sequences(output, maxlen=16)

In [35]:
# def search_image(caption, n=10):
#   caption_representation = caption_model.predict(preprocess_texts([caption]))
#   scores = np.dot(image_representations, caption_representation.T).flatten()
#   indices = np.argpartition(scores, -n)[-n:]
#   indices = indices[np.argsort(scores[indices])]
#   for i in [int(x) for x in reversed(indices)]:
#     print(scores[i], images[i])
#     display(Image('05-caption-images/' + images[i]))

In [36]:
# search_image('kids playing football', 1)