In [1]:
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses


2025-01-29 21:18:08.668433: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Hyperparameters and run settings shared by the cells below.
VOCAB_SIZE = 10000  # vocabulary cap for the vectoriser; also the Embedding input size and the softmax width
MAX_LEN = 200  # tokens per training input; text is vectorised to MAX_LEN + 1 ids before the one-step shift
EMBEDDING_DIM = 100  # output dimension of the Embedding layer
N_UNITS = 128  # number of units in the LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced by any visible cell — confirm it is still needed
SEED = 42  # NOTE(review): not referenced by any visible cell, so nothing is seeded with it — confirm
LOAD_MODEL = False  # when True, the model cell replaces the fresh network with ./models/lstm.keras
BATCH_SIZE = 32  # number of recipes per dataset batch
EPOCHS = 25  # number of epochs passed to lstm.fit


In [3]:
# Loading the dataset
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON text is UTF-8).
with open("full_format_recipes.json", encoding="utf-8") as json_data:
    recipe_data = json.load(json_data)

# Filtering the dataset: keep only records that have both a non-null title and
# non-null directions, and flatten each one into a single training string of
# the form "Recipe for <title> | <directions joined by spaces>".
filtered_data = [
    "Recipe for " + x["title"] + " | " + " ".join(x["directions"])
    for x in recipe_data
    if x.get("title") is not None and x.get("directions") is not None
]

# Counting the recipes
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")


20111 recipes loaded


In [4]:
# Keep one flattened recipe string around as a running example; it is printed
# here and vectorised again further down to illustrate the tokenisation.
example = filtered_data[9]
print(example)


Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


In [5]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround every punctuation character with spaces and collapse space runs.

    Every character in ``string.punctuation`` becomes its own whitespace-delimited
    token, so a whitespace-splitting tokeniser treats it as a separate 'word'.

    Args:
        s: the text to process.

    Returns:
        The text with a space on each side of every punctuation character and
        with runs of consecutive spaces reduced to one. A string that starts or
        ends with punctuation keeps the resulting leading/trailing space.
    """
    # re.escape is required here: interpolated raw, the "\]" inside
    # string.punctuation is read as an escaped "]" within the character class,
    # so the backslash itself would never be matched (and never padded).
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s

# Apply the punctuation padding to every recipe string
text_data = list(map(pad_punctuation, filtered_data))

# Converting to a Tensorflow Dataset: one element per recipe, grouped into
# batches; the shuffle is applied after batching, so it reorders whole batches.
text_ds = tf.data.Dataset.from_tensor_slices(text_data).batch(BATCH_SIZE).shuffle(1000)

# Creating a vectorisation layer
# One extra position (MAX_LEN + 1) is requested so that the input/target pair
# produced by the one-token shift is MAX_LEN long on each side.
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower",
    output_sequence_length=MAX_LEN + 1,
    output_mode="int",
)

# Fit the vocabulary on the recipe text, then read it back as a list where
# the position of a token is its integer id.
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()


2025-01-29 21:18:17.724193: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [6]:
# Displaying some token:word mappings (the first ten vocabulary entries)
for token_id, token in enumerate(vocab[:10]):
    print(f"{token_id}: {token}")

# Example recipe converted to tokenized integers
example_tokenised = vectorize_layer(example)
print(example_tokenised.numpy())


0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a
[  26   16  557    1    8  298  335  189    4 1054  494   27  332  228
  235  262    5  594   11    1    1  332   45  262    4  671    4   70
    8  171    4   81    6    9   65    1    1   59    1    1   88  650
   20   39    6    9   29   21    4   67  529   11    1  320  171  102
    9    1  306   25   21    8  650    4   42    5    1   63    8   24
    4    1  114   21    6  178  181 1245    4   60    5  140    1    1
  117  557    8  285  235    4  200  292    1  107  650   28   72    4
  108   10    1   57  204   11    1   73  110    1    1    1    1  142
    1    4    1  142   33    6    9   30    1   42    6    1    1    4
    1  437  494    8    1    1    1  142    1    4    1  142   24    6
    9  291  188    5    9  412    1  230    1   46  335    1   20    1
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0

In [7]:
# Creating the training set of recipes and the same text shifted by one word
def prepare_inputs(text):
    """Turn a batch of recipe strings into (input, target) token-id tensors.

    The text gets a trailing axis, is passed through ``vectorize_layer``, and the
    resulting id matrix is split into two views offset by one position: the
    input drops the last column and the target drops the first, so each target
    id is the token that follows the corresponding input id.

    Args:
        text: batch of strings as yielded by the text dataset.

    Returns:
        A ``(x, y)`` tuple of token-id tensors, ``y`` being ``x`` shifted left by one.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    return token_ids[:, :-1], token_ids[:, 1:]

# Map every batch of raw strings to its (input ids, next-token ids) pair
train_ds = text_ds.map(prepare_inputs)


In [8]:
# Next-token model: token ids -> embeddings -> LSTM (one output per time step)
# -> softmax over the vocabulary at every position.
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM)(inputs)
x = layers.LSTM(units=N_UNITS, return_sequences=True)(x)
outputs = layers.Dense(units=VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs=inputs, outputs=outputs)
lstm.summary()

# Optionally discard the freshly built network in favour of a saved one; it is
# loaded uncompiled because compilation happens in the following cell.
if LOAD_MODEL:
    lstm = models.load_model("./models/lstm.keras", compile=False)


In [9]:
# Integer target ids against per-position softmax outputs
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile(optimizer="adam", loss=loss_fn)

# Creating a TextGenerator callback
class TextGenerator(callbacks.Callback):
    """Samples text from the notebook-level ``lstm`` model, one token at a time.

    The class derives from ``callbacks.Callback`` so an instance can be passed to
    ``fit``, but it defines no ``on_*`` hooks: nothing is generated during
    training unless ``generate`` is called explicitly.
    """

    def __init__(self, index_to_word):
        """Store the vocabulary and build the reverse (word -> index) lookup.

        Args:
            index_to_word: sequence of tokens in which the position of a token
                is its integer id.
        """
        # Let the base class set up its own attributes before adding ours.
        super().__init__()
        self.index_to_word = index_to_word
        self.word_to_index = {word: index for index, word in enumerate(index_to_word)}

    def sample_from(self, probs, temperature):
        """Draw one index from ``probs`` after temperature scaling.

        Args:
            probs: 1-D array-like of non-negative weights, one per index.
            temperature: values below 1 sharpen the distribution, values above 1
                flatten it; 0 selects the most likely index deterministically.

        Returns:
            The sampled index.

        Raises:
            ValueError: if ``temperature`` is negative.
        """
        if temperature < 0:
            raise ValueError("temperature must be non-negative")
        # Work in float64 so the renormalised weights sum to 1 as closely as possible.
        probs = np.asarray(probs, dtype=np.float64)
        if temperature == 0:
            # Limit of the scaling below as temperature -> 0; avoids dividing by zero.
            return np.argmax(probs)
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs)

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` by repeatedly sampling the next token.

        The prompt is split on whitespace and mapped to ids; words missing from
        the vocabulary map to id 1. Sampling continues until the sequence holds
        ``max_tokens`` ids (prompt included) or id 0 is sampled. The result is
        printed and returned.

        Args:
            start_prompt: whitespace-separated seed text.
            max_tokens: upper bound on the total number of tokens, prompt included.
            temperature: sampling temperature forwarded to ``sample_from``.

        Returns:
            The prompt followed by the generated words, separated by spaces.

        Raises:
            ValueError: if the prompt contains no tokens while generation is requested.
        """
        start_tokens = [self.word_to_index.get(x, 1) for x in start_prompt.split()]
        if not start_tokens and max_tokens > 0:
            # The model needs at least one input token to predict from.
            raise ValueError("start_prompt must contain at least one token")
        sample_token = None
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            # Relies on the notebook-level `lstm` model being defined.
            y = lstm.predict(x, verbose=0)
            # Only the distribution at the last position predicts the next token.
            sample_token = self.sample_from(y[0][-1], temperature)
            start_tokens.append(sample_token)
            start_prompt += " " + self.index_to_word[sample_token]
        print(f"\nGenerated text (Temp {temperature}):\n{start_prompt}\n")
        return start_prompt

text_generator = TextGenerator(vocab)

# Create the output directory *before* training, so that a missing folder
# cannot make the save calls fail after every epoch has already run.
os.makedirs("./models", exist_ok=True)

# Training the model
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

# Saving the final model in two different formats
lstm.save("./models/lstm.keras")  # Using .keras format
lstm.save("./models/lstm.h5")  # Using HDF5 format


Epoch 1/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 609ms/step - loss: 4.9987
Epoch 2/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m472s[0m 750ms/step - loss: 3.0336
Epoch 3/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m526s[0m 837ms/step - loss: 2.5110
Epoch 4/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m531s[0m 844ms/step - loss: 2.2661
Epoch 5/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 800ms/step - loss: 2.1122
Epoch 6/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m488s[0m 775ms/step - loss: 2.0141
Epoch 7/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 765ms/step - loss: 1.9250
Epoch 8/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 763ms/step - loss: 1.8642
Epoch 9/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m478s[0m 759ms/step - loss: 1.8232
Epoch 10/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[



In [11]:
# Generating text with different temperature values
# (max_tokens counts the prompt's own tokens as well)
text_05 = text_generator.generate(
    start_prompt="recipe for roasted vegetables | chop 1 /",
    max_tokens=10,
    temperature=0.5,
)

text_08 = text_generator.generate(
    start_prompt="recipe for roasted vegetables | chop 1 /",
    max_tokens=10,
    temperature=0.8,
)

# Displaying the final result
print("\nGenerated text (Temp 0.5):", text_05)
print("\nGenerated text (Temp 0.8):", text_08)



Generated text (Temp 0.5):
recipe for roasted vegetables | chop 1 / 4 of


Generated text (Temp 0.8):
recipe for roasted vegetables | chop 1 / 2 -


Generated text (Temp 0.5): recipe for roasted vegetables | chop 1 / 4 of

Generated text (Temp 0.8): recipe for roasted vegetables | chop 1 / 2 -
