In [2]:
import os
import pickle
from tqdm.notebook import tqdm

import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model

from tensorflow.keras.layers import LSTM, Dense, Embedding, add, Input, Dropout, Conv2D, MaxPooling2D, Flatten




In [131]:
new_model = VGG16()
# restructure the model
new_model = Model(inputs=new_model.inputs, outputs=new_model.layers[-2].output)

new_model.summary()


Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_19 (InputLayer)       [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0   

In [132]:
#FEATURES ARE EXTRACTED FROM IMAGE USING THE VGG16 MODEL
features = {}
num = 0

for img_name in tqdm(os.listdir(os.path.join('images'))):


    
    #get the image path
    img = os.path.join('images', img_name)
    
    #load the image into a processible formable
    image = load_img(img, target_size=(224, 224))
    
    #convert the image into an array
    image = img_to_array(image)
    
    #reshape the array to match the input shape of the vgg16 model
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    
    #preprocess the array converting values of 255 to be between 0 and 1
    image = preprocess_input(image)
    
    #make predictions on the image to extract the features
    feature = new_model.predict(image)
    
    #acquire the image id
    image_id = img_name.split('.')[0]
    
    #add into features with the image_id as a key
    features[image_id] = feature




  0%|          | 0/379 [00:00<?, ?it/s]



In [35]:
with open('features.pkl', 'wb') as f:
  pickle.dump(features, f)

In [133]:
with open('new_captions.txt', 'r') as f:
  captions = f.read()

with open('new_train.txt', 'r') as f:
    food_captions = f.read()

In [134]:
images = os.listdir(os.path.join('images'))

In [135]:
#CAPTIONS ARE LOADED FROM CAPTIONS FILE INTO A MAPPING DICT
mapping = {}
keys = list(features.keys())


for id, line in tqdm(enumerate(captions.split('\n'))):

    token = line.split(',')
    
    image_id, cap = token[0], token[1:]
    
    image_id = image_id.split('.')[0]
    
    cap = " ".join(cap)
    
    if image_id in keys:
        if image_id not in mapping:
            mapping[image_id] = []
    
        mapping[image_id].append(cap)

0it [00:00, ?it/s]

In [136]:
keys = list(features.keys())


for id, line in tqdm(enumerate(food_captions.split('\n\n'))):

    token = line.split(',')
    
    image_id, cap = token[0], token[1:]
    
    image_id = image_id.split('.')[0]
    
    cap = " ".join(cap).strip()

    if image_id in keys:
        if image_id not in mapping:
            print(image_id)
            mapping[image_id] = []
        
        mapping[image_id].append(cap)
    else:
        print(image_id)

0it [00:00, ?it/s]

1005649
1014775
1026328
1028787
1043283
1050519
1057749
1057810
1072416
1074856
1074942
1076891
1077610
1077964
1088809
1097378
1103795
1109597
1111062
1112300
1112838
1121884
112378
1133267
1142597
1147371
1154371
1158360
1159801
1165004
1166116
1166210
116697
1174241
1174949
1177254
118237
1184568
1185445
1185654
1191665
1196628
1199851
1200079
1214326
1215650
1218767
1220194
1230465
1232311
123782
1239205
1240585
1246460
1246552
1252879
1264435
1265307
1272778
127721
1280767
128259
1284682
1289014
1290425
1305678
1322211
1323309
1335563
134
1340287
1343307
1344873
1346262
1348788
1350394
1354215
1355206
1357950
1361950
136256
1376013
1379062
1391393
1392703
1397313
1399051
1406371
1410907
1416578
1420227
1421796
142332
1428585
1442421
1443243
1445323
1447062
1448488
1450514
1456028
1456379
1464079
1466791
1481550
1484086
1486972
1487150
1488583
1491146
1498817
1503274
1528187
1528623
1544507
1547651
1550786
1558003
1558101
156078
157083
1575505
1580276
1581291
1581853
1586889
159513

In [137]:
mapping.keys()

dict_keys(['2652__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDBd', 'acne-039-v2__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-077-v2__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-089-v2__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-187-v2__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-205-v2__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-268-v2__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-comedones-57-v2__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDE3XQ', 'acne-face-1-10__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-face-1-11__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDE4XQ', 'acne-face-1-12__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-face-1-13__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd', 'acne-face-1-14__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDE4XQ', 'acne-face-1-16__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsInkiLDg1

In [138]:
#CAPTIONS ARE PREPROCESSED CONVERTING THEM INTO LOWER CASE, REMOVING ALL FOREIGN CHARACTERS AND LONG SPACES, ADDING A START AND END TOKEN TO THE CAPTION.
for key, captions in mapping.items():
  for i in range(len(captions)):
    caption = captions[i]

    caption = caption.lower()

    caption = caption.replace('[^a-zA-Z0-9:-.,]', '')

    caption = caption.replace('\s+', ' ')

    caption = '<start>' + " ".join([word for word in caption.split() if len(word) > 1]) + '<end>'

    captions[i] = caption

In [139]:
#ALL CAPTIONS ARE SENT INTO A LIST TO BE TOKENIZED
all_caps = []


for key in mapping:
  for cap in mapping[key]:
    all_caps.append(cap)

In [140]:
#CAPTIONS ARE TOKENIZED FROM THE ALL_CAPS LIST
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_caps)
vocab_size = len(tokenizer.word_index) + 1

In [141]:
vocab_size

222

In [142]:
max_num = 0
for i in all_caps:
  max_num = len(i.split()) if len(i.split()) > max_num else max_num

In [143]:
max_num

58

In [144]:
img_ids = list(mapping.keys())
split = int(len(img_ids) * 0.9)
train = img_ids[:split]
test = img_ids[split:]

In [146]:
import numpy as np

In [147]:
#DATA GENERATOR IS BUILT HERE
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
  #TEXT LABELS, IMAGE DATA AND EXPECTED WORD OUTPUT LISTS ARE INITIALIZED
  X1, X2, y = list(), list(), list()

  n = 0

  #ACQUIRE THE IMAGE ID DATA
  for key in data_keys:
    n+=1

    # ACQUIRE THE CAPTION BASED ON THE IMAGE ID WHICH ARE KEYS IN THE MAPPING DICTIONARY
    for caption in mapping[key]:

      # CONVERT THE TEXTS TO SEQUENCES USING THE TOKENIZER
      seq = tokenizer.texts_to_sequences([caption])[0]

      #IN THE SEQUENCE RANGE, GET THE CURRENT(INPUT) SET OF WORDS, AND THE NEXT EXPECTED PREDICTED WORD AND CLASSIFY THEM INTO IN_SEQ AND OUT_SEQ.
      for i in range(1, len(seq)):
        #THE CURRENT(INPUT) SET OF WORDS ARE PLACED INTO THE IN_SEQ VARIABLE, AND THE OUT_SEQ HOLDS THE NEXT EXPECTED WORD TO BE PREDICTED.
        in_seq, out_seq = seq[:i], seq[i]

        #SEQUENCES ARE PADDED BASED ON THE MAX LENGHT OF AN INPUT SENTENCE
        in_seq = pad_sequences([in_seq], maxlen=max_num)[0]

        #OUT PUT SEQUENCES ARE ONE-HOT ENCODED FOR CALCULATION BY CATEGORICAL_CROSSENTROPY LOSS FUNCTION
        # out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

        #IMAGE FEATURES OF THE CURRENT KEY ARE PLACED INSIDE OF X1, IN_SEQ IS PLACED INSIDE OF X2, AND THE EXPECTED WORD TO BE PREDICTED IS PLACED INSIDE OF Y
        X1.append(features[key][0])
        X2.append(in_seq)
        y.append(out_seq)

        # print(len(X1), len(X2), len(y))

    #AFTER THE BATCH SIZE HAS REACHED ITS LIMIT, THE DATA IS SENT INTO THE MODEL, AND X1, X2, Y VARIABLES ARE RESET.
    if n == batch_size:
      X1, X2, y = np.array(X1), np.array(X2), np.array(y)
      yield [X1, X2], y
      X1, X2, y = list(), list(), list()

      n = 0

In [154]:
#IMAGE-TO-TEXT MODEL

# encoder model
#LAYER FOR ACQUIRING IMAGE FEATURES
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

#LAYER FOR WORD EMBEDDINGS AND LSTM
inputs2 = Input(shape=(max_num,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

#DECODER MODEL ACCEPTS AND ADDS INPUT FROM BOTH LSTM AND CNN MODEL
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
#WORD OUTPUTS ARE PREDICTED
outputs = Dense(vocab_size, activation='softmax')(decoder2)

#MODEL IS BUILT EXPECTING TWO INPUTS, ONE FOR IMAGE FEATURES(X1), AND THE OTHER FOR INPUT WORDS(X2), AND EXPECTING ONE OUTPUT FOR NEXT PREDICTED WORD(Y)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# plot the model
# plot_model(model, show_shapes=True)
model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_23 (InputLayer)       [(None, 58)]                 0         []                            
                                                                                                  
 input_22 (InputLayer)       [(None, 4096)]               0         []                            
                                                                                                  
 embedding_7 (Embedding)     (None, 58, 256)              56832     ['input_23[0][0]']            
                                                                                                  
 dropout_14 (Dropout)        (None, 4096)                 0         ['input_22[0][0]']            
                                                                                           

In [419]:
data = data_generator(img_ids, mapping, features, tokenizer, max_num, vocab_size, batch_size)
print(data)

<generator object data_generator at 0x0000015527619690>


In [163]:
epochs = 10
batch_size = 32

#BASED ON THE DATA'S STRUCTURE THE MODEL WILL HAVE TO BE TRAINED USING A FOR LOOP. CONTINUOUSLY FEEDING THE MODEL THE GENERATED DATA TO PREVENT IT FROM RUNNING OUT OF TRAINING DATA.
for i in range(epochs):
    
    data = data_generator(img_ids, mapping, features, tokenizer, max_num, vocab_size, batch_size)

    model.fit(data, epochs=1)

      3/Unknown - 4s 1s/step - loss: 0.5842 - accuracy: 0.8057

KeyboardInterrupt: 

In [3]:
new_model.save('vgg_model.h5')



  saving_api.save_model(


In [166]:
model.save('nutrition_1.h5')

In [170]:
with open('nutrition_tokenizer.pkl', 'wb') as f:
    token = {
        'tokenizer': tokenizer,
        'max': max_num
    }

    pickle.dump(token, f)

In [23]:
import numpy as np

with open("tokenizer_data.pkl", "rb") as f:
   token = pickle.load(f)
   
tokenizer = token['tokenizer']
max = token['maxlen']


model = tf.keras.models.load_model('vgg_model.h5')

# model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

cap_model = tf.keras.models.load_model('image_captioning.h5')





In [24]:
#THE MATCHING WORD TO THE PREDICTED INDEX IS ACQUIRED AND RETURNED IN THIS FUNCTION
def idx_to_word(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index == integer:
      return word
        

  return None

In [42]:
max_num

86

In [156]:
def predict_captions(model, image, tokenizer, max_num):
  #CAPTIONS ARE PREDICTED DOWN HERE.
  #THE IMAGE FEATURES, TOKENIZER AND IMAGE-TO-TEXT MODEL ARE LOADED IN HEERE


  in_seq = '<start>'

  for i in range(max_num):
    #IN_SEQ BEGINS WITH THE START TOKE
    seq = tokenizer.texts_to_sequences([in_seq])[0]

    #SEQUENCES ARE PADDED
    seq = pad_sequences([seq], maxlen=max_num)



    #WORDS ARE PREDICTED USING THE IMAGE FEATURES AND INPUT SEQUENCE AS INPUTS TO THE MODEL
    yhat = model.predict([image, seq], verbose=0)

    #THE INDEX OF THE HIGHEST PREDICTED WORD IS SELECTED
    yhat = np.argmax(yhat)

    #INDEX IS SENT INTO THE IDX_TO_WORD FUNCTION ALONG WITH THE TOKENIZER
    # word = idx_to_word(yhat, tokenizer)
    word = tokenizer.index_word[yhat]
    print(word)
      

    #LOOP IS BROKEN IN THE WORD IS NONE
    if word == None:
      break

    #IN_SEQ AND THE PREDICTED WORD ARE ADDED TOGETHER TO PREDICT THE NEXT WORD.
    in_seq += " " + word


    if word == 'end':
      break

  return in_seq

In [167]:
image = load_img('acne-face-15-1__ProtectWyJQcm90ZWN0Il0_FocusFillWzI5NCwyMjIsIngiLDFd.jpg', target_size=(224, 224))
# convert image pixels to numpy array
image = img_to_array(image)
# reshape data for model
image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
# preprocess image for vgg
image = preprocess_input(image)

# extract features
feature = new_model.predict(image, verbose=0)

print(feature.shape)
# predict from the trained model
predict_captions(model, feature, tokenizer, max_num)

(1, 4096)
there
are
signs
of
acne
present
in
your
skin
end


'<start> there are signs of acne present in your skin end'