In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
from helpers import *
import numpy as np

from keras.preprocessing.text import Tokenizer

In [70]:
data = [ 
    'data/liquor.json',
    'data/social_cocktail.json',
    'data/serious_eats.json',
    'data/live_in_style.json',
    'data/all_recipes.json'
]

descriptions, ingredients, names = [], [], []

for d in data:
    descriptions += load_data(d, field='description')
    ingredients += load_data(d, field='ingredients')
    names += load_data(d, field='name')

assert len(descriptions) == len(ingredients)

recipes = [x + ' # ' + y for x, y in zip(ingredients, descriptions)]    
print('There are {} recipes in the database.'.format(len(recipes)))

There are 3347 recipes in the database.


In [71]:
# Perform filtering of input text
remove_unicode = lambda x: x.encode('ascii', errors='ignore').decode().strip()

filter_words = ['a', 'an', 'the', 'fluid', '1', 'ounce', 'ounces']

recipes = [remove_unicode(x) for x in recipes]
recipes = [' '.join([y for y in x.split() if y not in filter_words]) for x in recipes]
recipes = [x.replace('.', ' . ') for x in recipes]
recipes = [x + ' |' for x in recipes]
recipes = [x.replace('  ', ' ') for x in recipes]

np.random.shuffle(recipes) # Shuffle recipes from different websites

In [72]:
SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300
VOCABULARY_SIZE = None

tokenizer = Tokenizer(
    num_words=VOCABULARY_SIZE,
    filters='!"$%&()*+,-:;<=>?@[\\]^_`{}~\t\n'
)
tokenizer.fit_on_texts(recipes)
sequences = tokenizer.texts_to_sequences(recipes)

word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

VOCABULARY_SIZE = len(word_index) + 1 if VOCABULARY_SIZE is None else VOCABULARY_SIZE
print('Vocabulary size is {}.'.format(VOCABULARY_SIZE))

Found 4464 unique tokens.
Vocabulary size is 4465.


In [None]:
glove_file = 'data/glove.42B.300d.txt'
embedding_matrix = get_embedding_matrix(glove_file, word_index, VOCABULARY_SIZE, EMBEDDING_DIM)

In [74]:
X_recipes = make_flat(sequences)

# Transform data into sequences and predictions
X, y = sequence_transform(X_recipes, SEQUENCE_LENGTH)

print('The sequence length is {}'.format(SEQUENCE_LENGTH))
print('Observation shape is {}, label shape is {}'.format(X.shape, y.shape))

The sequence length is 50
Observation shape is (218640, 50), label shape is (218640,)


### Create the Neural Network Models

In [24]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU, Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

In [77]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=SEQUENCE_LENGTH,
                            trainable=False)


model = Sequential()
model.add(embedding_layer)
model.add(GRU(256, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=VOCABULARY_SIZE + 1, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 300)           1339500   
_________________________________________________________________
gru_9 (GRU)                  (None, 50, 256)           427776    
_________________________________________________________________
dropout_9 (Dropout)          (None, 50, 256)           0         
_________________________________________________________________
gru_10 (GRU)                 (None, 256)               393984    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 4466)              1147762   
Total params: 3,309,022
Trainable params: 1,969,522
Non-trainable params: 1,339,500
__________________________________________________________

In [None]:
# Create the model
batch_size = 32
model.fit(X, y, validation_split=0.05, shuffle=False, epochs=1, batch_size=batch_size);

Train on 207708 samples, validate on 10932 samples
Epoch 1/1


In [58]:
from helpers import *

start = np.random.randint(0, len(X)-1)    
observation = X[start]
seed = ' '.join(decode_label(observation, tokenizer))

result, prediction = [], None
str_len = 100

for i in range(str_len):
    prediction = language_model_sampling(
            model, np.array([observation]), batch_size, raw_prediction=False, c=VOCABULARY_SIZE
    )
    result.append(prediction)    
    observation = np.append(observation[1:], prediction)
    
output = ' '.join(decode_label(result, tokenizer))
print('Seed is: {}\n\nOutput is: {}'.format(seed, output))

Seed is: with mint sprigs lemon wedge or maraschino cherries . you can also add some soda into mix as alternative recipe . | 10 12 leaves of fresh mint removed from stems 1 teaspoon sugar or 1 2 tsp simple syrup to taste 2 ounces cognac chilled brut champagne or other

Output is: sparkling wine # fill cocktail shaker with ice . add gin and sweet vermouth . shake well and strain into chilled cocktail glass . garnish with lemon twist . | 1 cup ice cubes 1 1 . 5 fluid ounce jigger vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce vodka 1 fluid ounce sweet vermouth 1 fluid ounce orange juice 1 fluid ounce orange


In [None]:
model_file = 'model.json'
weights_file = 'weights.h5'

def save_model_to_json(model, model_file, weights_file):
    model_json = model.to_json()
    with open(model_file, "w") as json_file:
        json_file.write(model_json)
    model.save_weights(weights_file)
    
save_model_to_json(model, model_file, weights_file)