# LSTM
Download the epirecipes dataset [here](https://www.kaggle.com/datasets/hugodarwood/epirecipes) and put it in `./data/`

In [1]:
# imports
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import json
import re
import string
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


## Load and view data

In [3]:
with open('/kaggle/input/epirecipes/full_format_recipes.json') as json_data:
    recipe_data = json.load(json_data)

filtered_data = [
    'Recipe for ' + x['title']+ ' | ' + ' '.join(x['directions'])
    for x in recipe_data
    if 'title' in x
    and x['title'] is not None
    and 'directions' in x
    and x['directions'] is not None
]

In [4]:
# Count the recipes
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")
example = filtered_data[9]
print(example)

20111 recipes loaded
Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


## Tokenize input
Lowercase word tokenization, no stemming, pad punctuation to tokenize punctuation.

In [5]:
def pad_punctuation(str):
    # add spaces before and after punctuation
    str = re.sub(f"([{string.punctuation}])", r" \1 ", str)
    # replace one or more spaces with one space
    str = re.sub(' +', ' ', str)
    return str

text_data = [pad_punctuation(x) for x in filtered_data]
text_data = [x.lower() for x in text_data]

In [6]:
tokenizer = get_tokenizer(None)
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(
    yield_tokens(iter(text_data)),
    specials=["<stop>", "<unk>"],
    special_first=True,
    max_tokens=10000
)
vocab.set_default_index(vocab['<unk>']) # set default unknown to <unk>

In [7]:
print(len(vocab))
text_to_vec = lambda x: vocab(tokenizer(x))
print(vocab.lookup_tokens(range(0, 10)))

10000
['<stop>', '<unk>', '.', ',', 'and', 'to', 'in', 'the', 'with', 'a']


## Create the training dataset

In [8]:
MAX_SEQ_LEN = 200 + 1
STOP = 0
def pad_tokens(tok_list):
    if len(tok_list) < MAX_SEQ_LEN:
        return tok_list + [STOP for x in range(MAX_SEQ_LEN - len(tok_list))]
    else:
        return tok_list[:MAX_SEQ_LEN]

def prepare_inputs(text):
    tokenized_sentences = list(map(text_to_vec, text))
    tokenized_sentences = list(map(pad_tokens, tokenized_sentences))
    tokenized_sentences = torch.tensor(tokenized_sentences).to(device)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    #print(vocab.lookup_tokens(x[8].tolist()))
    #print(vocab.lookup_tokens(y[8].tolist()))
    return x, y

train_dataloader = torch.utils.data.DataLoader(
    text_data,
    batch_size=32,
    shuffle=True,
    collate_fn=prepare_inputs)

## Build the LSTM

In [11]:
class TextGen(nn.Module):
    def __init__(self):
        super(TextGen, self).__init__()
        self.embedding = nn.Embedding(10000, 100)
        # default is to return sequences
        self.lstm = nn.LSTM(100, hidden_size=128, num_layers=1, batch_first=True)
        self.dense = nn.Linear(128, 10000)

    def forward(self, x, batching=True):

        # batch, seq_len -> batch, seq_len, emb_dim
        x = self.embedding(x)

        # batch, seq_len, emb_dim -> batch, seq_len, hidden
        # h: num_layers, batch, hidden
        # c: num_layers, batch, hidden

        # note `output` is a tensor of updated hidden states for each
        # timestep of the final lstm layer
        output, (h,c) = self.lstm(x)

        # batch, seq_len, hidden -> batch, seq_len, vocab_size
        output = self.dense(output)
        
        '''
        Note no need to softmax, as nn.CrossEntropyLoss defaults to reducing the 
        inputs (i.e. applying softmax) this is equivalent setting from_logits=True 
        in Tensorflow's loss. To use softmax, set reduction='none' in nn.CrossEntropyLoss().
        
        # softmax to make the last dim a probability thing
        output = F.softmax(output, dim=2 if batching else 1)
        '''

        return output

## Train

In [12]:
# utility function to sample a index given a vector of probs and a temp
def sample_from(probs, temperature):
        probs = probs.to('cpu').numpy()
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

# generate and print a string using the model
# of len max_tokens that starts with start_prompt
def generate(model, start_prompt, max_tokens, temperature):
    model.eval()

    start_tokens = vocab.lookup_indices(tokenizer(pad_punctuation(start_prompt)))
    start_tokens = torch.IntTensor(start_tokens).to(device)

    next_token = None
    info = []
    while len(start_tokens) < max_tokens and next_token != 0:
        x = start_tokens
        y = model(x, batching=False)
        y = F.softmax(y, dim=1)
        with torch.no_grad():
          next_token, probs = sample_from(y[-1], temperature)
        info.append({"prompt": start_prompt, "word_probs": probs})
        start_tokens = torch.cat(
            (start_tokens, 
             torch.IntTensor([next_token]).to(device)))
        start_prompt = start_prompt + " " + vocab.lookup_token(next_token)

    print(f"\ngenerated text:\n{start_prompt}\n")
    return info

In [13]:
model = TextGen().to(device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [14]:
epochs = 25
for epoch in range(epochs):
    batch_num = 0
    train_loss = 0
    model.train()
    for curr, target in train_dataloader:
        batch_size = curr.shape[0]
        seq_len = curr.shape[1]
        
        # curr [32, 200], pred [32, 200, 10000]
        pred = model(curr)
        cache_p = pred
        # CE loss expects n x c for predicted values, where C is a list of probabilities
        # so we need to reshape our [batch, seq, vocab] into [batch * seq, vocab]
        pred = pred.reshape(batch_size * seq_len, -1)
        target = target.reshape(-1)
        
        loss = loss_fn(pred, target) 
        optimizer.zero_grad() # reset gradients
        loss.backward()
        optimizer.step()

        if batch_num % 10 == 0:
            print(f"\rBatch {batch_num},loss {loss:.4f}", end='')
            print(" ", torch.topk(cache_p[8][-1], 5).values.tolist(), " ",torch.topk(cache_p[8][-1], 5).indices.tolist(), end='   ')
        batch_num += 1
        train_loss += loss 
    train_loss /= len(train_dataloader.dataset)

    print(f'\nEpoch:{epoch}, Train Loss:{train_loss:.4f}')
    generate(model, "recipe for", 100, .2)

Batch 620,loss 2.9773  [13.245162963867188, 6.15157413482666, 5.607677936553955, 5.401706218719482, 5.049145698547363]   [0, 2, 8, 3, 4]   
Epoch:0, Train Loss:0.1110

generated text:
recipe for grilled chicken with salt and pepper | preheat oven to 350°f . place a large bowl to a boil , then add the the flour , and cook until golden brown , about 5 minutes . add the the flour , and the the mixture , stirring , until the water , about 3 minutes . add the the mixture and pepper . add the the onion , and pepper to taste , and cook until golden , about 5 minutes . add the water and cook , stirring , until golden brown , about 2 minutes .

Batch 620,loss 2.3978  [6.1715407371521, 5.995326519012451, 5.327287673950195, 5.292455673217773, 5.2803263664245605]   [30, 274, 21, 0, 65]   
Epoch:1, Train Loss:0.0787

generated text:
recipe for grilled chicken with tomato and garlic | preheat oven to 350°f . butter into a large bowl . add 1 / 2 cup water and simmer , stirring , until golden , about 

## save model

In [15]:
torch.save(model.state_dict(), "lstm")
checkpoint = torch.load("lstm", map_location=device)
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [16]:
generate(model, "recipe for chicken noodle soup", 100, .2)
generate(model, "recipe for chicken noodle soup", 100, 1);
generate(model, "recipe for Martha's coq au vin casserole", 200, .3);


generated text:
recipe for chicken noodle soup with dried tomatoes and basil | cook onion in oil in a 12 - inch heavy skillet over moderate heat , stirring occasionally , until golden , about 5 minutes . add garlic and cook , stirring , 1 minute . add tomatoes and cook , stirring , 1 minute . add tomatoes , 1 / 2 cup water , and 1 / 4 teaspoon salt and cook , stirring , 1 minute . add wine and boil until reduced by half , about 2 minutes . add broth , water , and 1 /


generated text:
recipe for chicken noodle soup | 1 . place the pork in a food processor or blender , using an immersion small to 2 - to 3 - quart strainer . place in a bowl , squeezing small dry - towel - lined cup of steamer rack to cover and continue cooking , covered , over gas or charcoal , about 1 hour for 8 hours . meanwhile , cut all over direct heat with cooking spray , discard bay leaves , being careful as fuji as possible , then discard the stem and flesh of salt . 2 .


generated text:
recipe for Martha's coq