In [58]:
import json
import re
import string
import tensorflow as tf
from tensorflow import keras
from collections import Counter

In [59]:
import numpy as np

In [60]:
with open('/kaggle/input/epirecipes/full_format_recipes.json') as json_data:
    recipe = json.load(json_data)

In [61]:
keys=set()
for recipes in recipe:
    for key in recipes.keys():
        keys.add(key)
print(keys)

{'categories', 'ingredients', 'fat', 'title', 'rating', 'calories', 'directions', 'sodium', 'date', 'protein', 'desc'}


In [62]:
filter_data = ["Recipe for "+x['title']+' | '+" ".join(x['directions'])
              for x in recipe
              if 'title' in x
              and x['title'] is not None
              and 'directions' in x
              and x['directions'] is not None]

In [63]:
print(filter_data[20100])
print(len(filter_data))

Recipe for Chicken with White Wine and Herbs  | In a large pot, place the chicken legs, bay leaves, tarragon, peppercorns, parsley stalks, white wine, and 2 celery stalks. Halve two of the onions and two of the carrots and add to the pan, then pour in enough cold water to cover the legs (about 2 quarts). Cover the pan and bring to a boil, then reduce to a simmer and cook, with the lid half on, for 25 minutes, or until juices in the legs run clear. Remove the chicken legs from the pan and set aside, but keep the stock simmering. Meanwhile, chop the remaining onions, carrot, and celery into small chunks. In a large saucepan over low heat, heat the butter, add the chopped vegetables and a pinch of salt, and sweat for 6 to 8 minutes. Meanwhile, strip the chicken from the bones and set aside. Place the bones, skin, and trimmings back in the simmering stock pot and continue to simmer. Add the mushrooms and garlic to the pan with the chopped vegetables and turn up the heat to brown all the in

In [64]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [65]:
def pad_punc(s):
    s = re.sub(f"([{string.punctuation}])", r' \1 ', s)
    s = re.sub(' +', ' ', s)
    return s
text_data = [pad_punc(x) for x in filter_data]

In [66]:
print(len(text_data))
tokens = ' '.join(text_data).split()
tokens_count = Counter(tokens)
# for token, count in tokens_count.items():
#     print(f"{token}: {count}")
print(len(tokens_count))

20111
16407


In [67]:
example_data=text_data[1]
print(example_data)

Recipe for Boudin Blanc Terrine with Red Onion Confit | Combine first 9 ingredients in heavy medium saucepan . Add 3 shallots . Bring to simmer . Remove from heat , cover and let stand 30 minutes . Chill overnight . Preheat oven to 325°F . Line 7 - cup pâté or bread pan with plastic wrap . Melt butter in heavy small skillet over low heat . Add remaining 5 shallots . Cover and cook until very soft , stirring occasionally , about 15 minutes . Transfer to processor . Add pork , eggs , flour and Port and puree . Strain cream mixture , pressing on solids to extract as much liquid as possible . With processor running , add cream through feed tube and process just until combined with pork . Transfer to large bowl . Mix in currants . Spoon mixture into prepared pan . Cover with foil . Place pan in large pan . Add boiling water to larger pan to within 1 / 2 inch of top of terrine . Bake until terrine begins to shrink from sides of pan and knife inserted into center comes out clean , about 1 1 /

In [68]:
text_dataset = tf.data.Dataset.from_tensor_slices(text_data).batch(64).shuffle(1000)
vector_layer=keras.layers.TextVectorization(standardize='lower',max_tokens=13000,output_mode='int',output_sequence_length=200+1,)
vector_layer.adapt(text_dataset)
vocab=vector_layer.get_vocabulary()

In [69]:
print(len(vocab))

12921


In [70]:
print(vocab[0:11])
example_tokens=vector_layer(example_data)
print(example_tokens.numpy())
print(example_data)

['', '[UNK]', '.', ',', 'and', 'to', 'in', 'the', 'with', 'a', 'until']
[  26   16 6055 2618 1359    8  282  115 1677   27  103  329  334  131
    6   78   29   80    2   18   36  417    2   84    5   70    2   71
   51   17    3   49    4   67  146  126   12    2  108  435    2   86
   47    5  677    2  328  343   13   52 2557   41  195   44    8  214
  212    2  266   50    6   78   65   56   20  134   17    2   18   45
   59  417    2   49    4   43   10  218  286    3   48   90    3   19
  127   12    2   40    5  188    2   18  201    3  199    3  111    4
 1087    4  378    2  320   76   31    3  427   28  493    5  692  151
  830  143  151  787    2    8  188  703    3   18   76  102 1541 1568
    4  443   93   10  348    8  201    2   40    5   30   21    2  116
    6 1270    2   98   31   25  176   44    2   49    8  167    2   64
   44    6   30   44    2   18  231   39    5 1020   44    5 1617   11
   23   15   53   14   72   14 1359    2   97   10 1359  549    5 2969
   51

In [71]:
# Check if full stop or comma exists in the vocabulary
contains_full_stop = '.' in vocab
contains_comma = ',' in vocab

print("Full stop exists:", contains_full_stop)
print("Comma exists:", contains_comma)


Full stop exists: True
Comma exists: True


In [72]:
# Get the index of full stop and comma in the vocabulary
full_stop_index = vocab.index('.')
comma_index = vocab.index(',')

print("Index of full stop:", full_stop_index)
print("Index of comma:", comma_index)


Index of full stop: 2
Index of comma: 3


In [73]:
def prepare_data(text):
    text = tf.expand_dims(text,-1)
    token_text = vector_layer(text)
    x = token_text[:,:-1]
    y = token_text[:,1:]
    return x,y
train_ds = text_dataset.map(prepare_data)

In [75]:
inputs = keras.layers.Input(shape=(None,),dtype='int32')
x = keras.layers.Embedding(13000,150)(inputs)
x = keras.layers.Bidirectional(keras.layers.GRU(150,return_sequences=True))(x)
x = keras.layers.Bidirectional(keras.layers.GRU(150,return_sequences=True))(x)
# x = keras.layers.LSTM(128,return_sequences=True)(x)
output = keras.layers.Dense(13000,activation='softmax')(x)
lstm_model = keras.models.Model(inputs,output)
lstm_model.compile(optimizer='adam',loss=keras.losses.SparseCategoricalCrossentropy())

In [76]:
lstm_model.summary()

In [77]:
class TextGenerator(keras.callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>

    def sample_from(self, probs, temperature):  # <2>
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens=100, temperature=0.6)


In [78]:
text_generator=TextGenerator(vocab)

In [79]:
lstm_model.fit(train_ds,epochs=20,callbacks=[text_generator])

Epoch 1/20
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - loss: 5.2072
generated text:
recipe for with 

[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 214ms/step - loss: 5.2049
Epoch 2/20
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - loss: 3.6913
generated text:
recipe for stuffed with for for salad sheet of into the a a for a tablespoons sheet in " . can with top and with available 

[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 216ms/step - loss: 3.6904
Epoch 3/20
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - loss: 2.7040
generated text:
recipe for a oven oven first " | the 1 bowl , 4 in a a in a a sides and too more . cut to disk and inch . put tortillas to the the 3 not 1 out inch in heavy a dough to 2 often , 2 sheet 2 until tender heat 4 until 1 dish with clean the 25 to pan together cooking , 2 large pan over 2 until brown until work ; cooke

<keras.src.callbacks.history.History at 0x7d2e14ae65f0>

In [80]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [81]:
info = text_generator.generate(
    "roasted vegetables | chop 1 /", max_tokens=100, temperature=0.5
)
print_probs(info, vocab)


generated text:
roasted vegetables | chop 1 / 2 cup rice mixture in a medium pot of the sauce and add the mushrooms , and bring to large pot of until it is heated through , until absorbed , and add the the sauce until it ; add the bright . add a large bowl and cook until wilted . return a medium pot of boiling water ; add the the peppers and add mixture until absorbed , about the chard until the water ; fluff , and parsley , and chile mixture until absorbed until absorbed , stirring occasionally , and


PROMPT: roasted vegetables | chop 1 /
2:   	82.94%
4:   	14.95%
3:   	1.36%
a:   	0.53%
still:   	0.08%
--------


PROMPT: roasted vegetables | chop 1 / 2
cup:   	81.76%
tablespoons:   	6.4%
bowl:   	3.88%
2:   	3.85%
hours:   	1.76%
--------


PROMPT: roasted vegetables | chop 1 / 2 cup
sauce:   	17.16%
pan:   	16.01%
the:   	4.23%
sugar:   	4.1%
cream:   	3.13%
--------


PROMPT: roasted vegetables | chop 1 / 2 cup rice
into:   	31.01%
in:   	25.49%
or:   	13.37%
mixture:   	11.26%
.