In [371]:
import pandas as pd
import re
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer

import seaborn as sns

Generating tokens

In [372]:
def import_data():
    '''Import cleaned recipe, raw recipe and ratings data'''
    #cleaned recipes
    recipes = pd.read_csv('PP_recipes.csv')
    del recipes['i']
    del recipes['name_tokens']
    del recipes['ingredient_tokens']
    del recipes['steps_tokens']
    recipes = recipes.set_index('id')

    #ratings
    ratings = pd.read_csv('RAW_interactions.csv')
    del ratings['user_id']
    del ratings['date']
    del ratings['review']
    ratings = ratings.set_index('recipe_id')

    #raw recipe info
    raw_recipes = pd.read_csv('RAW_recipes.csv')
    del raw_recipes['contributor_id']
    del raw_recipes['submitted']
    del raw_recipes['tags']
    del raw_recipes['steps']
    del raw_recipes['description']
    raw_recipes = raw_recipes.set_index('id')

    return (recipes, raw_recipes, ratings)

In [373]:
_,raw_recipes,ratings = import_data()
raw_recipes

Unnamed: 0_level_0,name,minutes,nutrition,n_steps,ingredients,n_ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
137739,arriba baked winter squash mexican style,55,"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['winter squash', 'mexican seasoning', 'mixed ...",7
31490,a bit different breakfast pizza,30,"[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['prepared pizza crust', 'sausage patty', 'egg...",6
112140,all in the kitchen chili,130,"[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['ground beef', 'yellow onions', 'diced tomato...",13
59389,alouette potatoes,45,"[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,"['spreadable cheese with garlic and herbs', 'n...",11
44061,amish tomato ketchup for canning,190,"[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"['tomato juice', 'apple cider vinegar', 'sugar...",8
...,...,...,...,...,...,...
486161,zydeco soup,60,"[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]",7,"['celery', 'onion', 'green sweet pepper', 'gar...",22
493372,zydeco spice mix,5,"[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",1,"['paprika', 'salt', 'garlic powder', 'onion po...",13
308080,zydeco ya ya deviled eggs,40,"[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['hard-cooked eggs', 'mayonnaise', 'dijon must...",8
298512,cookies by design cookies on a stick,29,"[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,"['butter', 'eagle brand condensed milk', 'ligh...",10


In [374]:
def string_to_list(s):
    '''Converts a string that is formatted like a list to a list'''
    l = re.findall(r'\w[\w\s]+',s)
    #l = ' '.join(l)
    return l

In [493]:
#cleaning token lists
input_sequences = raw_recipes['ingredients'].apply(string_to_list).to_list()
input_sequences[:1]

[['winter squash',
  'mexican seasoning',
  'mixed spice',
  'honey',
  'butter',
  'olive oil',
  'salt']]

In [376]:
#lengths = [len(x.split(' ')) for x in input_sequences]
#sns.histplot(lengths)

tokenizing ingredients

In [377]:
tokenizer = Tokenizer()
def get_sequence_of_tokens(input_sequences):
    ## tokenization
    tokenizer.fit_on_texts(input_sequences)
    total_words = len(tokenizer.word_index) + 1

    ngram_sequences = []
    for line in input_sequences:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            ngram_sequences.append(n_gram_sequence)
    return ngram_sequences, total_words

In [492]:
input_sequences,total_words = get_sequence_of_tokens(input_sequences)
input_sequences[:6]

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

Padding sequences

In [379]:
#find max token length
def find_max(sequences):
    max_length = max(len(x) for x in sequences)
    return max_length

max_length = find_max(input_sequences)
max_length

44

In [381]:
#pad sequences (built in function returns a recursion error)
def pad_sequences(sequence):
    padded_sequence = []
    for sequence_in in sequence:
        zeros = (max_length-len(sequence_in))
        padded = [int(0) for zero in range(zeros)]
        padded.extend(sequence_in)
        padded_sequence.append(padded)
    return padded_sequence

input_sequences = pad_sequences(input_sequences)

predictors

In [382]:
#making test and train sets 
input_sequences = np.array(input_sequences)
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

Model

In [390]:
def create_model(max_length, max_token):
    input_len = max_length -1
    model = Sequential()
    model.add(Embedding(max_token, 10, input_length=input_len))
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    model.add(Dense(max_token, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return model

In [391]:
model = create_model(max_length, total_words)
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 43, 10)            149910    
                                                                 
 lstm_7 (LSTM)               (None, 100)               44400     
                                                                 
 dropout_7 (Dropout)         (None, 100)               0         
                                                                 
 dense_7 (Dense)             (None, 14991)             1514091   
                                                                 
Total params: 1708401 (6.52 MB)
Trainable params: 1708401 (6.52 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [401]:
test_size = None
history = model.fit(predictors[:test_size], label[:test_size], epochs=4)



generating text

In [436]:
def cook_for_me(seed_text, model):
    next_words = 5
    words_out = []
    while next_words > 0:
        token_list = tokenizer.texts_to_sequences([seed_text])[0]      
        token_list = pad_sequences([token_list])
        token_list = np.array(token_list)
        token_list = token_list[:,1:]
        proba = model.predict(token_list, verbose=0)
        predicted = np.argmax(proba, axis=1)
        word = list(tokenizer.word_index.keys())[int(predicted-1)]
        words_out.append(word)
        next_words -= 1
        seed_text = seed_text + ' ' + word
    words_out = list(set(words_out))
    string_out = 'Why not try adding some '+', '.join(words_out[:-1])+' and '+words_out[-1]+'?'
    return string_out

In [468]:
def make_suggestions(ingredients_in_stock):
    for ingredient in ingredients_in_stock:
        split_ingredients = ' and '.join(ingredient.split(' '))
        greeting = 'For '+split_ingredients.upper() +' let me see... '
        print(greeting)
        print(cook_for_me(ingredient,model))
        print('')

In [500]:
ingredients_in_stock = ['chicken','chicken rice','tuna','chocolate sugar','pineapple']
make_suggestions(ingredients_in_stock)

For CHICKEN let me see... 
Why not try adding some olive oil, onion, garlic cloves, salt and pepper?

For CHICKEN AND RICE let me see... 
Why not try adding some carrot, celery, onion and water?

For TUNA let me see... 
Why not try adding some celery, mayonnaise, onion and salt?

For CHOCOLATE AND SUGAR let me see... 
Why not try adding some flour, salt, milk, butter and eggs?

For PINEAPPLE let me see... 
Why not try adding some ice, sugar, lemon juice and water?

