## Class Demo: Recipe generator

In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import re
import sys
from collections import Counter, defaultdict
from urllib.request import urlopen
import math



This is a demo of recipe-title generation using PyTorch and Transformers.
For the purposes of this demo, we'll sample 10,000 recipe titles from the corpus.

In [2]:
# Load the raw recipes and drop every row that has a missing field.
orig_recipes_df = pd.read_csv("../data/RAW_recipes.csv").dropna()
# Fix the sampling seed so that the 10,000-recipe subset -- and everything
# derived from it (vocabulary, padding length, train/test split) -- is the
# same on every Restart & Run All.
recipes_df = orig_recipes_df.sample(10_000, random_state=123)

In [3]:
recipes_df

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
150202,original ponhaws pannhas ponhaus scrapple,265164,210,64642,2007-11-13,"['time-to-make', 'course', 'main-ingredient', ...","[224.8, 3.0, 1.0, 66.0, 10.0, 1.0, 15.0]",11,"['separate pig head into halves', 'remove eyes...","an heirloom, butchering-time recipe impractica...","['pig head', 'water', 'salt', 'pepper', 'sage'...",6
212835,theodore kyriakou s tomato sauce,142236,85,197023,2005-10-21,"['lactose', 'time-to-make', 'course', 'main-in...","[771.7, 104.0, 70.0, 1.0, 16.0, 47.0, 13.0]",11,['skin the tomatoes by scoring them and droppi...,this is greek chef and restaurateur theodore k...,"['plum tomatoes', 'garlic cloves', 'bay leaves...",8
228509,ww 6 points herbed beef burgers,126467,25,120566,2005-06-20,"['30-minutes-or-less', 'time-to-make', 'course...","[334.9, 21.0, 5.0, 20.0, 57.0, 26.0, 7.0]",9,"['spray grill rack with nonstick spray', 'prep...",from ww magazine.,"['whole wheat bread', 'lean ground beef', 'fla...",9
92561,gingersnap pumpkin ice cream pie,276087,23,705251,2008-01-03,"['30-minutes-or-less', 'time-to-make', 'course...","[494.9, 36.0, 163.0, 23.0, 10.0, 66.0, 22.0]",10,"['for crust:', 'mix together the crust ingredi...",nice change from the standard graham cracker c...,"['gingersnaps', 'powdered sugar', 'butter', 'p...",11
143001,nectarine and radish salsa,374329,45,226377,2009-05-26,"['60-minutes-or-less', 'time-to-make', 'main-i...","[55.4, 0.0, 37.0, 6.0, 2.0, 0.0, 4.0]",2,['combine all ingredients in a medium bowl and...,from a cooking light i came acroos while puppy...,"['nectarines', 'radishes', 'cucumber', 'red on...",8
...,...,...,...,...,...,...,...,...,...,...,...,...
181489,sauteed gnocchi,68317,30,50445,2003-08-05,"['30-minutes-or-less', 'time-to-make', 'course...","[150.3, 22.0, 0.0, 21.0, 7.0, 45.0, 0.0]",7,['cook gnocchi according to directions on pack...,something a little different to do with your p...,"['gnocchi', 'butter', 'garlic cloves', 'salt',...",7
54382,citrus poached orange roughy,220709,12,237783,2007-04-04,"['15-minutes-or-less', 'time-to-make', 'course...","[231.2, 10.0, 26.0, 6.0, 57.0, 18.0, 2.0]",10,"['cut fish into 4-6 serving-size pieces', 'com...",cilantro adds a wonderful flavor to this easy ...,"['orange roughy', 'orange juice', 'water', 'dr...",7
130872,marinated coleslaw,409922,30,178427,2010-01-24,"['30-minutes-or-less', 'time-to-make', 'course...","[119.4, 8.0, 48.0, 19.0, 4.0, 3.0, 5.0]",5,"['mix vinegar , sugar , oil , mustard , celery...",from my collection of handwritten recipes 1970...,"['cider vinegar', 'sugar', 'oil', 'prepared mu...",10
27369,bow tie pasta with watercress and avocado crea...,226525,25,246844,2007-05-07,"['30-minutes-or-less', 'time-to-make', 'course...","[781.4, 68.0, 14.0, 11.0, 47.0, 73.0, 25.0]",11,"['cook pasta according to package directions',...",great little lunch.,"['bow tie pasta', 'butter', 'watercress', 'avo...",8


In [4]:
# Set the appropriate device depending upon your hardware:
# prefer a CUDA GPU, then Apple-silicon MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

mps


In [5]:
recipes = recipes_df['name'].tolist()

In [6]:
from transformers import AutoTokenizer

class TokenizerWrapper():
    """
    A wrapper class for the AutoTokenizer that handles tokenization and maintains
    a custom, contiguous vocabulary: every tokenizer token id seen in the corpus
    is mapped to a small integer "vocab id" (and back).
    """
    def __init__(self, model_name="bert-base-cased"):
        """
        Initializes the TokenizerWrapper with a specified model.

        Parameters:
            model_name (str): Name passed to ``AutoTokenizer.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Two-way mapping between tokenizer token ids and custom vocab ids.
        # The ids of the special tokens are reserved up front:
        # 0 --> [PAD], 101 --> [CLS], 102 --> [SEP]
        self.token_id_to_vocab_id = {0: 0, 101: 1, 102: 2}
        self.vocab_id_to_token_id = {0: 0, 1: 101, 2: 102}
        # Next unused custom vocab id.
        self.vocab_id = len(self.vocab_id_to_token_id)
        # Filled in by build_dictionary(); tokenize() refuses to run before that.
        self.padding_len = None

    def build_dictionary(self, list_of_recipes: list):
        """
        Processes a list of recipe names to build and update the custom vocabulary
        from the tokens found in them. Also records the length of the longest
        tokenized recipe name as the padding length used by ``tokenize``.

        Parameters:
            list_of_recipes (list of str): The recipe names to scan.

        Raises:
            ValueError: If ``list_of_recipes`` is empty (no padding length can be derived).
        """
        if len(list_of_recipes) == 0:
            raise ValueError('Cannot build a dictionary from an empty list of recipes.')

        # Tokenize all recipes to find the unique tokens and the maximum length
        tokenized_outputs = self.tokenizer(list_of_recipes, add_special_tokens=False)
        all_token_ids = set(token for sublist in tokenized_outputs.input_ids for token in sublist)

        # Update the custom token-vocabulary mapping. Iterate in sorted order so the
        # assigned vocab ids do not depend on set iteration order.
        for token_id in sorted(all_token_ids):
            if token_id not in self.token_id_to_vocab_id:
                self.token_id_to_vocab_id[token_id] = self.vocab_id
                self.vocab_id_to_token_id[self.vocab_id] = token_id
                self.vocab_id += 1

        # Set the padding length to the length of the longest tokenized recipe
        self.padding_len = max(len(tokens) for tokens in tokenized_outputs.input_ids)


    def get_vocab_size(self):
        """
        Returns the size of the custom vocabulary (special tokens included).
        """
        assert len(self.token_id_to_vocab_id) == len(self.vocab_id_to_token_id)
        return len(self.token_id_to_vocab_id)


    def tokenize(self, text: str) -> list:
        """
        Tokenizes a text string into custom vocabulary IDs, using the built dictionary.
        Requires the dictionary to be built first.

        The tokenizer is asked to pad to ``max_length`` and to truncate, with
        ``max_length`` set to the padding length found by ``build_dictionary``.

        Parameters:
            text (str): The text to tokenize.

        Returns:
            list of int: A list of custom vocabulary IDs corresponding to the text tokens.
        """
        assert self.padding_len is not None, 'Call build_dictionary first.'

        tokenized_output = self.tokenizer(text, add_special_tokens=False, padding='max_length', max_length=self.padding_len, truncation=True)
        return [self.token_id_to_vocab_id.get(token_id, 0)  # Default to [PAD] if token_id is not found
                for token_id in tokenized_output.input_ids]


    def decode(self, vocab_list: list) -> str:
        """
        Decodes a list of custom vocabulary IDs back into a text string.
        Special tokens are skipped by the underlying tokenizer.

        Parameters:
            vocab_list (list of int): A list of custom vocabulary IDs to decode.

        Returns:
            str: The decoded text string.

        Raises:
            KeyError: If an id is not part of the custom vocabulary.
        """
        # int() lets ids arrive as NumPy integers or 0-d tensors as well as plain ints.
        token_list = [self.vocab_id_to_token_id[int(vocab_id)] for vocab_id in vocab_list]
        decoded_string = self.tokenizer.decode(token_list, skip_special_tokens=True)
        return decoded_string.strip()


In [7]:
# Build the dictionary for our tokenizer  
from tqdm import tqdm, trange 
tokenizer_wrapper = TokenizerWrapper()
tokenizer_wrapper.build_dictionary(recipes_df["name"].to_list())

In [8]:
recipe_tokens = tokenizer_wrapper.tokenize(recipes_df['name'].iloc[10])
decoeded_recipe = tokenizer_wrapper.decode(recipe_tokens)
print('Caption:', recipes_df['name'].iloc[10])
print('Tokens:', recipe_tokens)
print('Decoded caption:', decoeded_recipe)

Caption: delicious puffy oven baked apple pancake
Tokens: [2416, 3311, 2145, 1711, 1903, 2013, 2517, 1618, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded caption: delicious puffy oven baked apple pancake


In [9]:
vocab_size = tokenizer_wrapper.get_vocab_size()
vocab_size

3657

In [10]:
def build_data(data_df, tokenizer_wrapper):
    """
    Tokenize the 'name' column of ``data_df`` row by row.

    Parameters:
        data_df: DataFrame with a 'name' column holding the recipe names.
        tokenizer_wrapper: Object whose ``tokenize`` method maps a name to a list of ids.

    Returns:
        list of dict: One ``{'token': tensor}`` entry per row, in row order.
    """
    names = data_df['name']
    return [
        {'token': torch.tensor(tokenizer_wrapper.tokenize(names.iloc[position]))}  # SOLUTION
        for position in trange(len(data_df))
    ]

Let's create train and test datasets by calling `build_data` on train and test splits. 

In [11]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(recipes_df, test_size=0.2, random_state=123)
train_data = build_data(train_df, tokenizer_wrapper)
test_data = build_data(test_df, tokenizer_wrapper)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
  0%|                                                  | 0/8000 [00:00<?, ?it/s]TOKENIZERS_PARALLELISM=(true | false)
100%|████████████████████████████████████| 8000/8000 [00:00<00:00, 20221.48it/s]
100%|████████████████████████████████████| 2000/2000 [00:00<00:00, 20538.52it/s]


In [12]:
# Get the size of the custom vocabulary built by the tokenizer wrapper
vocab_size = tokenizer_wrapper.get_vocab_size()
print(f'The vocab size is {vocab_size}.')

The vocab size is 3657.


In [13]:
class PytorchDataset(Dataset):
    """
    Map-style dataset yielding (input, target) pairs for next-token prediction.

    Parameters:
        data: Indexable collection of dicts, each with a 1-D tensor under 'token'.
        pad_vocab_id (int): Vocabulary id appended at the end of every target sequence.
    """
    def __init__(self, data, pad_vocab_id=0):
        self.data = data
        # One-element tensor holding the padding id, reused for every item.
        self.pad_tensor = torch.tensor([pad_vocab_id])

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.data)

    def __getitem__(self, ind):
        """
        Returns:
            tuple: (tokens, target) where target is tokens shifted left by one
            position with the padding id appended, so both have the same length.
        """
        tokens = self.data[ind]['token']
        # Retrieve the next sequence of tokens from the current index
        # by excluding the first token of the current sequence and appending a padding token at the end.
        target_sequence = torch.cat([tokens[1:], self.pad_tensor]) # SOLUTION
        return tokens, target_sequence

In [14]:
train_dataset = PytorchDataset(train_data)
test_dataset = PytorchDataset(test_data)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=False)

In [15]:
# Now let's get a batch of data from DataLoader
train_text, train_target = next(iter(train_dataloader))
train_text = train_text.to(device)
train_text.shape

torch.Size([64, 23])

In [16]:
train_text[11]

tensor([  61, 1465, 1973,  209, 2422,  391,  743, 3255,  382, 2170, 1231,   59,
        3050, 1123,  879, 1607,    0,    0,    0,    0,    0,    0,    0],
       device='mps:0')

In [17]:
train_target[11]

tensor([1465, 1973,  209, 2422,  391,  743, 3255,  382, 2170, 1231,   59, 3050,
        1123,  879, 1607,    0,    0,    0,    0,    0,    0,    0,    0])

In [18]:
tokenizer_wrapper.decode(train_text[11].tolist())

'croatian turkey soup with sour cream and dill ajngemahtes'

In [19]:
tokenizer_wrapper.decode(train_target[11].tolist())

'##roatian turkey soup with sour cream and dill ajngemahtes'

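Each target sequence is the input sequence shifted left by one token (with a padding token appended), so at every position the model is trained to predict the next token. This is called autoregressive training.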

In [20]:
# The PositionalEncoding model is already defined for you.  Do not change this class.

class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to a sequence of embeddings
    and then applies dropout.

    Parameters:
        d_model (int): Size of the embedding dimension (even or odd).
        dropout (float): Dropout probability applied after adding the encoding.
        max_len (int): Number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine columns, so
        # trim div_term to match. For even d_model the slice is the whole tensor.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # A buffer moves with the module (.to(device)) but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Parameters:
            x: Tensor whose first dimension indexes sequence positions and whose
               last dimension is d_model; the encoding buffer has shape
               (max_len, 1, d_model) and is broadcast over the middle dimension.

        Returns:
            Tensor of the same shape as ``x``.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

**PyTorch [TransformerDecoderLayer](https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoderLayer.html)**

- Encoder-decoder models (sequence-to-sequence models)
- Decoder-only models (the setup used in this demo)



In [21]:
class RecipeGenerator(nn.Module):
    def __init__(self, d_model, n_heads, num_layers, vocab_size, device, dropout=0.1):
        """
        Initialize the RecipeGenerator which uses a transformer decoder architecture
        for generating recipe names token by token.

        Parameters:
            d_model (int): The number of expected features in the decoder inputs.
            n_heads (int): The number of heads in the multiheadattention models.
            num_layers (int): The number of sub-decoder-layers in the transformer.
            vocab_size (int): The size of the vocabulary.
            device (torch.device): Default device for masks built by ``generate_mask``
                when no device is passed explicitly.
            dropout (float): The dropout value used in PositionalEncoding and TransformerDecoderLayer.
        """
        super(RecipeGenerator, self).__init__()
        self.d_model = d_model
        self.device = device
        # Positional Encoding to add position information to input embeddings
        self.pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)

        self.TransformerDecoder = nn.TransformerDecoder(
            decoder_layer=nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout),
            num_layers=num_layers
        )

        # Embedding layer for converting input text tokens into vectors
        self.text_embedding = nn.Embedding(vocab_size , d_model)

        # Final linear layer to map the output of the transformer decoder to vocabulary size
        self.linear_layer = nn.Linear(d_model, vocab_size)

        # END SOLUTION

        # Initialize the weights of the model
        self.init_weights()

    def init_weights(self):
        """
        Initialize the embedding and output-layer weights uniformly in
        [-0.1, 0.1] and zero the output-layer bias.
        """
        initrange = 0.1
        # BEGIN SOLUTION
        self.text_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear_layer.bias.data.zero_()
        self.linear_layer.weight.data.uniform_(-initrange, initrange)
        # END SOLUTION

    def forward(self, text):
        """
        Parameters:
            text: Integer tensor of vocabulary ids; ``embed_text`` swaps its first
                two dimensions after embedding, so it is expected as (batch, seq_len).

        Returns:
            Tensor of logits with the sequence dimension first:
            (seq_len, batch, vocab_size).
        """
        # Get the embedded input
        encoded_text = self.embed_text(text)

        # Get transformer output
        transformer_output = self.decode(encoded_text)

        # Final linear layer (unembedding layer)
        return self.linear_layer(transformer_output)

    def embed_text(self, text):
        """
        Embed the token ids, scale by sqrt(d_model), move the sequence dimension
        to the front and add positional encodings.
        """
        embedding = self.text_embedding(text) * math.sqrt(self.d_model)
        return self.pos_encoding(embedding.permute(1, 0, 2))

    def decode(self, encoded_text):
        """
        Run the transformer decoder over ``encoded_text`` with a causal mask.

        There is no encoder here, so an all-zero tensor of the same shape is
        passed as the decoder's ``memory`` argument.
        """
        # Get the length of the sequences to be decoded. This is needed to generate the causal masks
        seq_len = encoded_text.size(0)
        # Build the mask on the input's own device so the model keeps working
        # after being moved to a device other than the one given at construction.
        causal_mask = self.generate_mask(seq_len, device=encoded_text.device)
        dummy_memory = torch.zeros_like(encoded_text)
        return self.TransformerDecoder(tgt=encoded_text, memory=dummy_memory, tgt_mask=causal_mask)

    def generate_mask(self, size, device=None):
        """
        Build a (size, size) additive causal attention mask: 0 on and below the
        diagonal, -inf above it, so a position cannot attend to later positions.

        Parameters:
            size (int): Sequence length.
            device (torch.device, optional): Where to create the mask.
                Defaults to the device given to the constructor.
        """
        if device is None:
            device = self.device
        mask = torch.triu(torch.ones(size, size, device=device), 1)
        return mask.float().masked_fill(mask == 1, float('-inf'))


In [22]:
import torch 
size = 10
mask = torch.triu(torch.ones(size, size), 1)
mask.float().masked_fill(mask == 1, float('-inf'))

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [23]:
# Now let's try your model. 
# Define the hyperparameters and initalize the model. Feel free to change these hyperparameters. 
d_model = 256 
n_heads = 4
num_layers = 8
model = RecipeGenerator(d_model=d_model, n_heads=n_heads, num_layers=num_layers, vocab_size=vocab_size, device=device).to(device)

In [24]:
train_text

tensor([[3166,  558, 2485,  ...,    0,    0,    0],
        [  59,  316, 1767,  ...,    0,    0,    0],
        [ 487,  540, 3646,  ...,    0,    0,    0],
        ...,
        [  74,  977,  438,  ...,    0,    0,    0],
        [  60, 2294, 1471,  ...,    0,    0,    0],
        [ 651, 1581, 3311,  ...,    0,    0,    0]], device='mps:0')

In [25]:
# pass inputs to your model
output = model(train_text)
output.shape

torch.Size([23, 64, 3657])

In [26]:
vocab_size

3657

In [27]:
train_text.shape

torch.Size([64, 23])

In [28]:
output.shape

torch.Size([23, 64, 3657])

In [29]:
def trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=5, patience=5, clip_norm=1.0, verbose=True):
    """
    Train ``model`` and evaluate it on ``test_dataloader`` after every epoch.

    Parameters:
        model (torch.nn.Module): Model called as ``model(batch)``; its output is permuted
            with (1, 2, 0) before the loss, i.e. it is assumed to be
            (seq_len, batch, vocab) -- as produced by RecipeGenerator in this notebook.
        criterion: Loss called as ``criterion(output, target)``.
        optimizer (torch.optim.Optimizer): Optimizer over the model's parameters.
        train_dataloader: Iterable of (input, target) batches used for training.
        test_dataloader: Iterable of (input, target) batches used for evaluation.
        epochs (int): Maximum number of epochs.
        patience (int): Stop early after this many consecutive epochs in which
            the test loss increased relative to the previous epoch.
        clip_norm (float): Max gradient norm passed to ``clip_grad_norm_``.
        verbose (bool): Print the per-epoch losses when True.

    Returns:
        tuple: (train_losses, test_losses), lists of mean per-batch loss per epoch.
    """
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level global; a parameter-less model falls back to the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    train_losses, test_losses = [], []
    consec_increases = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for train_text, target_seq in train_dataloader:
            train_text, target_seq = train_text.to(run_device), target_seq.to(run_device)
            optimizer.zero_grad()
            output = model(train_text).permute(1, 2, 0)  # Ensure output is in correct shape for loss calculation
            loss = criterion(output, target_seq)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        test_loss = 0
        with torch.no_grad():
            for test_text, target_seq in test_dataloader:
                test_text, target_seq = test_text.to(run_device), target_seq.to(run_device)
                output = model(test_text).permute(1, 2, 0)
                test_loss += criterion(output, target_seq).item()

        train_losses.append(train_loss / len(train_dataloader))
        test_losses.append(test_loss / len(test_dataloader))
        if verbose:
            print(f"Epoch {epoch+1}: Train Loss {train_losses[-1]:.4f}, Test Loss {test_losses[-1]:.4f}")

        # Count only meaningful increases (relative tolerance 1e-5); any
        # non-increase resets the counter.
        if epoch > 0 and test_losses[-1] > test_losses[-2] * (1 + 1e-5):
            consec_increases += 1
        else:
            consec_increases = 0

        if consec_increases >= patience:
            print(f"Stopped early at epoch {epoch + 1}")
            break

    return train_losses, test_losses


In [30]:
# Define the optimizer and the loss function. Feel free to change the hyperparameters. 

num_epoch = 20
clip_norm = 1.0
lr = 5e-5

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0) # Ignore the padding index
# Pass clip_norm explicitly so that changing the value above actually takes effect.
train_losses, test_losses = trainer(model, criterion, optimizer, train_dataloader, test_dataloader,
                                    epochs=num_epoch, clip_norm=clip_norm)

Epoch 1: Train Loss 7.5154, Test Loss 6.9418


KeyboardInterrupt: 

In [None]:
def generate_recipe(model, device, max_recipe_length=39, seed = 10, end_vocab=2):
    """
    Generates a recipe name by sampling one token at a time from the model.

    Starting from the single token ``seed``, the model is run on the tokens
    generated so far, the next token is sampled from the softmax of the logits
    at the last position, and it is appended to the context. The model is put
    in eval mode with gradients disabled while sampling; its previous
    train/eval mode is restored afterwards.

    Parameters:
        model (torch.nn.Module): The trained model; called with a (1, length) tensor of
            vocabulary ids and indexed with [-1] to get the logits of the last position.
        device (torch.device): The device (e.g., CPU or GPU) to which the context tensor is sent.
        max_recipe_length (int, optional): The maximum number of tokens to sample. Defaults to 39.
        seed (int, optional): The vocabulary index of the first token of the recipe
            (a start token, not a random seed). Defaults to 10.
        end_vocab (int, optional): The vocabulary index that stops generation when sampled. Defaults to 2.

    Returns:
        numpy.ndarray: A 1-D array of vocabulary indices, beginning with ``seed``.
    """
    was_training = model.training
    model.eval()  # disable dropout while sampling
    context = torch.tensor([[seed]]).to(device)
    try:
        with torch.no_grad():  # no autograd graph is needed for generation
            for _ in range(max_recipe_length):
                logits = model(context)[-1]
                probabilities = torch.softmax(logits, dim=-1).flatten(start_dim=1)
                next_vocab = torch.multinomial(probabilities, num_samples=1)
                context = torch.cat([context, next_vocab], dim=1)
                if next_vocab.item() == end_vocab:
                    break
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return context.cpu().numpy().flatten()

In [None]:
recipe = generate_recipe(model, device, max_recipe_length=20, seed = 100)

In [None]:
generated_recipe = tokenizer_wrapper.decode(recipe)
generated_recipe