In [27]:
import os
import re
import pickle
import numpy as np
import pandas as pd

In [46]:
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Load the raw recipes table; the CSV uses ';' as the field separator
recipes = pd.read_csv(os.path.join('data', 'recipes.csv'), sep=';')

In [13]:
# Load the saved Keras model from the 'models/model' path
model = keras.models.load_model('models/model')

In [16]:
# Restore the tokenizer object stored in tokenizer.pickle.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# tokenizer.pickle that comes from a trusted source.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [6]:
# To keep things simple, the validity of an input recipe is checked against the cleaned_recipes.csv file.
# Validity means: the recipe text is not null and the ground truth is consistent (the ingredients actually appear in the recipe text)
cleaned_recipes = pd.read_csv(os.path.join('data', 'cleaned_recipes.csv'))

In [8]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the CSV).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: a second run no longer raises KeyError.
cleaned_recipes = cleaned_recipes.drop(columns='Unnamed: 0', errors='ignore')

In [10]:
# Inspect the column names of the raw recipes table
recipes.columns

Index(['Recipe Name', 'Review Count', 'Recipe Photo', 'Author', 'Prepare Time',
       'Cook Time', 'Total Time', 'Ingredients', 'Directions', 'RecipeID'],
      dtype='object')

In [12]:
# Show every RecipeID that can be chosen as input further down
available_recipe_ids = list(recipes.RecipeID)
print('You should pick some recipes indices from the following list:\n', available_recipe_ids)

You should pick some recipes indices from the following list:
 [7000, 7001, 7003, 7006, 7007, 7008, 7009, 7012, 7013, 7014, 7015, 7016, 7018, 7019, 7020, 7021, 7022, 7024, 7025, 7026, 7030, 7033, 7034, 7035, 7037, 7038, 7040, 7043, 7046, 7048, 7049, 7050, 7051, 7052, 7053, 7054, 7055, 7056, 7058, 7059, 7060, 7064, 7065, 7066, 7067, 7068, 7069, 7070, 7071, 7072, 7074, 7075, 7076, 7077, 7078, 7079, 7080, 7081, 7082, 7083, 7084, 7085, 7086, 7087, 7088, 7089, 7090, 7091, 7092, 7093, 7095, 7098, 7099, 7101, 7102, 7103, 7104, 7106, 7108, 7110, 7112, 7113, 7114, 7116, 7117, 7119, 7120, 7121, 7122, 7123, 7124, 7127, 7129, 7130, 7131, 7133, 7134, 7135, 7136, 7138, 7141, 7142, 7143, 7144, 7145, 7146, 7147, 7148, 7149, 7150, 7151, 7153, 7155, 7156, 7158, 7160, 7161, 7162, 7163, 7165, 7166, 7167, 7170, 7171, 7172, 7173, 7174, 7175, 7176, 7177, 7178, 7179, 7180, 7182, 7183, 7184, 7185, 7187, 7188, 7189, 7190, 7191, 7192, 7193, 7195, 7197, 7198, 7199, 7200, 7201, 7203, 7204, 7206, 7208, 7209, 7210, 

### Prepare the user input

In [32]:
# RecipeID values of the recipes to run inference on
recipes_indices = [26918, 26978]
# Will map RecipeID -> raw 'Directions' text for the recipes accepted by the validity check
user_input = {}

In [33]:
# Keep only the requested recipes whose directions also appear in the cleaned data set.
all_cleaned_recipes = list(cleaned_recipes.Directions)
# Set membership avoids re-scanning the whole list for every requested recipe
valid_directions = set(all_cleaned_recipes)

for i in recipes_indices:
    matching_directions = recipes.loc[recipes['RecipeID'] == i, 'Directions'].values

    # An unknown RecipeID used to surface as a bare IndexError from `.values[0]`
    if len(matching_directions) == 0:
        print(f'RecipeID {i} was not found in the recipes table, skipping it')
        continue

    recipe_text = matching_directions[0]

    if recipe_text in valid_directions:
        user_input[i] = recipe_text
    else:
        # Make the rejection visible instead of silently dropping the recipe
        print(f'RecipeID {i} is not part of the cleaned recipes, skipping it')

In [34]:
# Display the recipes that passed the validity check
user_input

{26918: 'In a medium saucepan, place apple, cranberry sauce, dried apricots, raisins, lemon juice, cinnamon, cloves and nutmeg. Bring to a boil, then reduce heat to simmer. Cook, stirring occasionally, 7 to 10 minutes. Remove from heat. Cover and chill in the refrigerator at least 3 hours. Serve cold.**',
 26978: 'Place bacon in a large, deep skillet. Cook over medium high heat until evenly brown. Drain, crumble and set aside.**In a medium bowl, mix bacon, Cheddar cheese, green onions and mayonnaise. Adjust the amount of mayonnaise as needed, using just enough to make the mixture stick together.**Mold the mixture into a ball. Spread ball with strawberry preserves. Chill in the refrigerator until serving.**'}

### Inference time :)

#### Input pre-processing

In [24]:
def input_preprocessing(recipe):
    """Normalise a raw recipe text before it is tokenized.

    Steps, in order: lowercase, drop digits, drop every character that is
    not an ASCII letter or whitespace, drop isolated single characters,
    collapse runs of spaces and strip one trailing space.

    Parameters
    ----------
    recipe : str
        Raw recipe directions.

    Returns
    -------
    str
        The cleaned text.
    """
    recipe = recipe.lower()
    # remove numbers (raw string: '\d' in a plain literal is an invalid escape)
    recipe = re.sub(r'\d+', '', recipe)
    # remove special symbols
    # The class used to be 'a-zA-z'; the 'A-z' range also covers [ \ ] ^ _ `
    # so those symbols were never removed.
    # NOTE(review): if training data went through the old pattern, texts
    # containing those six symbols are now cleaned slightly differently.
    recipe = re.sub(r'[^a-zA-Z\s]', '', recipe)
    # remove single characters
    recipe = re.sub(' . |.- | -.', ' ', recipe)
    # replace multiple spaces by just one space
    recipe = re.sub(' +', ' ', recipe)
    recipe = re.sub(' $', '', recipe)

    return recipe

In [25]:
def ingredient_preprocessing(ingredients):
    """Normalise a comma-separated ingredient string.

    Lowercases the text, removes digits and removes every character that
    is not an ASCII letter, whitespace or a comma (commas are kept because
    they separate the individual ingredients).

    Parameters
    ----------
    ingredients : str
        Comma-separated ingredient names.

    Returns
    -------
    str
        The cleaned, still comma-separated string.
    """
    ingredients = ingredients.lower()
    # raw strings: '\d' and '\s' in plain literals are invalid escapes
    ingredients = re.sub(r'\d+', '', ingredients)
    # The class used to be 'a-zA-z'; the 'A-z' range also covers [ \ ] ^ _ `
    # so those symbols were never removed.
    ingredients = re.sub(r'[^a-zA-Z\s,]', '', ingredients)

    return ingredients

In [35]:
# Clean every selected recipe text; the keys stay the RecipeIDs
inp = {
    recipe_id: input_preprocessing(raw_text)
    for recipe_id, raw_text in user_input.items()
}

In [36]:
# Ground truth is looked up from the cleaned data set by matching the
# directions text. This is not the case in a real scenario.
user_gt = {}
for recipe_id, directions in user_input.items():
    same_directions = cleaned_recipes['Directions'] == directions
    raw_ingredients = cleaned_recipes.loc[same_directions, 'Ingredients'].values[0]
    user_gt[recipe_id] = ingredient_preprocessing(raw_ingredients)

In [37]:
# Display the preprocessed ground-truth ingredients per RecipeID
user_gt

{26918: 'clove,apple,cinnamon,apricot,cranberry sauce,nutmeg,raisin,lemon juice',
 26978: 'mayonnaise,green onion,strawberry,bacon'}

#### Get the encoded recipes

In [41]:
# Number of entries in the tokenizer's word index
vocab_size = len(tokenizer.word_index)
# Convert each preprocessed recipe text into a sequence of word indices
# NOTE(review): words missing from the tokenizer vocabulary may be dropped here (depends on how the tokenizer was configured) — confirm
encoded_recipes = tokenizer.texts_to_sequences(list(inp.values()))

In [44]:
# Sequence length every recipe is padded to; the value comes from training
max_len_recipe = 490

In [48]:
# Shift every word index down by one (presumably from the tokenizer's
# 1-based indices to the 0-based ones the model was trained on — confirm).
# The sequences are rebuilt from `inp` rather than from `encoded_recipes`
# itself, so re-running this cell does not shift the indices a second time.
encoded_recipes = [
    [word_index - 1 for word_index in sequence]
    for sequence in tokenizer.texts_to_sequences(list(inp.values()))
]

In [49]:
# Pad each encoded recipe at the end ("post") up to max_len_recipe, using vocab_size-1 as the filler index
# NOTE(review): sequences longer than maxlen get truncated by pad_sequences (from the front by default) — confirm this matches training
X_eval = pad_sequences(maxlen=max_len_recipe, sequences=encoded_recipes, padding="post", value=vocab_size-1)

#### Get the encoded tags (again not the case in a real scenario)

In [50]:
def tag_recipe(recipe, ingredients):
    """Label each space-separated word of `recipe` as ingredient (1) or not (0).

    A word gets a 1 when at least one non-empty entry of `ingredients`
    occurs inside it as a substring; otherwise it gets a 0.

    Parameters
    ----------
    recipe : str
        Preprocessed recipe text, split on single spaces.
    ingredients : iterable of str
        Ingredient names to look for.

    Returns
    -------
    list of int
        One 0/1 tag per word, in word order.
    """
    def _mentions_ingredient(token):
        # Collect the ingredients contained in the token; empty names are
        # contained in everything but are falsy, so any() ignores them.
        contained = [name for name in ingredients if name in token]
        return any(contained)

    return [1 if _mentions_ingredient(token) else 0 for token in re.split(' ', recipe)]

In [51]:
# One 0/1 tag sequence per recipe, in the same order as `inp`
encoded_tags = [
    tag_recipe(clean_text, user_gt[recipe_id].split(','))
    for recipe_id, clean_text in inp.items()
]

In [53]:
# Pad the tag sequences at the end ("post") up to max_len_recipe, filling with 0 (the "not an ingredient" tag)
y_eval = pad_sequences(maxlen=max_len_recipe, sequences=encoded_tags, padding="post", value=0)

#### Evaluate the model on these data (not the case in a real scenario)

In [54]:
# Score the model on the evaluation inputs and tags built above
# NOTE(review): the result presumably lists the loss followed by the compiled metric(s) — confirm against the training setup
model.evaluate(X_eval, y_eval)



[0.0030840267427265644, 0.9989795684814453]

In [67]:
# Pair each predicted tag with the expected tag for the first user recipe
index = 0
gold_tags = encoded_tags[index]
# Only the first len(gold_tags) positions are compared
predicted_tags = model.predict(X_eval).argmax(axis=-1)[index][:len(gold_tags)]
list(zip(predicted_tags, gold_tags))

[(0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 0),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (0, 0),
 (1, 1),
 (1, 1),
 (0, 0),
 (1, 1),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0),
 (0, 0)]

#### Get the output in the expected format

In [73]:
# Will map RecipeID -> list of [word, start_index, end_index] entries
predicted_ingredients = {}
# Predicted tag per position: index of the largest score along the last axis of the model output
preds = model.predict(X_eval).argmax(axis=-1)

In [92]:
# Turn the per-token predictions into [word, start_index, end_index] entries.
# `preds` and `encoded_recipes` follow the order of `user_input` (the recipes
# that passed the validity check), not the order of `recipes_indices`.
# Pairing by key keeps results attached to the right RecipeID even when some
# requested recipes were filtered out.
for recipe_id, p, encoded in zip(user_input, preds, encoded_recipes):
    # Character offsets are reported against the lowercased raw text
    raw_text = user_input[recipe_id].lower()
    current_result = []

    for tag, token_id in zip(p[:len(encoded)], encoded):
        if tag == 0:
            # not an ingredient
            continue

        # encoded indices were shifted down by one, so undo the shift here
        word = tokenizer.index_word[token_id + 1]
        # find() reports the first occurrence of the word in the text
        start_index = raw_text.find(word)
        # Preprocessing may yield a word that is not a literal substring of
        # the raw text; report -1 for both offsets rather than a bogus end.
        end_index = start_index + len(word) - 1 if start_index >= 0 else -1

        entry = [word, start_index, end_index]
        if entry not in current_result:
            current_result.append(entry)

    predicted_ingredients[recipe_id] = current_result

In [93]:
# Display the predicted ingredients with their character offsets, per RecipeID
predicted_ingredients

{26918: [['apple', 28, 32],
  ['cranberry', 35, 43],
  ['apricots', 58, 65],
  ['raisins', 68, 74],
  ['cinnamon', 90, 97],
  ['cloves', 100, 105],
  ['nutmeg', 111, 116]],
 26978: [['bacon', 6, 10], ['mayonnaise', 178, 187], ['strawberry', 335, 344]]}