Scaled Dot-Product Attention

Multi-Head Attention

Feed-Forward Network

Positional Encoding

Decoder Block

Full Transformer Decoder

Training Loop (next-token prediction)

In [24]:
import torch
import torch.nn as nn
import math

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

Using device:  cuda


In [26]:
#Scaled Dot-Product Attention
class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()
        self.scale = math.sqrt(d_k)


    def forward(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask==0, -1e9)

        attn_weights = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_weights, V)
        return output, attn_weights

In [27]:
#Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

        self.attention = ScaledDotProductAttention(self.d_k)

    def forward(self, x, mask=None):
        B, T, D = x.size()
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)

        #reshape
        Q = Q.view(B, self.num_heads, T, self.d_k)
        K = K.view(B, self.num_heads, T, self.d_k)
        V = V.view(B, self.num_heads, T, self.d_k)

        out, attn = self.attention(Q, K, V, mask)

        #concat heads
        out = out.transpose(1,2).contiguous().view(B, T, D) 
        out = self.fc_out(out)
        return out


In [28]:
#Position-wise Feed-Forward Network
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, x):
        return self.net(x)

In [29]:
#Postional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len, :].to(x.device)
        return x

In [30]:
#Transformer Decoder block
class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # self.dropout1 = nn.Dropout(0.1)
        # self.dropout2 = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        #self-atttention with resuidual connection
        attn_out = self.self_attn(x, mask)
        x = x + self.norm1(x + attn_out)

        #feed-forward with residual connection
        ff_out = self.ff(x)
        x = x + self.norm2(x + ff_out)
        return x

In [31]:
#Full Transformer Decoder
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=256, num_layers=4, num_heads=8, d_ff=512, max_len=512):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([
            DecoderBlock(d_model, num_heads, d_ff) for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)

    def generate_square_subsequent_mask(self, sz):
        mask = torch.tril(torch.ones(sz, sz)).unsqueeze(0).unsqueeze(0) # o/p ===> (1, 1, sz, sz)
        return mask 
    
    def forward(self, x):
        mask = self.generate_square_subsequent_mask(x.size(1)).to(x.device)
        x = self.token_embedding(x)
        x = self.positional_encoding(x)
        
        for layer in self.layers:
            x = layer(x, mask)

        logits = self.fc_out(x)
        return logits

In [32]:
#test
if __name__ == "__main__":
    vocab_size = 10000
    seq_len = 50
    batch_size = 8

    model = TransformerDecoder(vocab_size = vocab_size)
    x = torch.randint(0, vocab_size, (batch_size, seq_len))
    logits = model(x)
    print("op: ",logits.shape)  # Expected output: (batch_size, seq_len, vocab_size)

op:  torch.Size([8, 50, 10000])


Data Setting

In [None]:
#Dataset is taken from kaggle :-> https://www.kaggle.com/datasets/prashantsingh001/recipes-dataset-64k-dishes
import pandas as pd

In [34]:
def format_recipe(recipe):
    return f"Title: {recipe['recipe_title']}\nCategory: {recipe['category']}\nDescription: {recipe['description']}\nIngredients: {recipe['ingredients']}\nDirections: {recipe['directions']}"

all_recipes = []

df = pd.read_csv("1_Recipe_csv.csv")

df['recipe'] = df.apply(format_recipe, axis=1)

data = df['recipe'].tolist()

data = data[0:10000]


In [35]:
print(data[500])

Title: Spinach Ricotta Quiche
Category: Allrecipes Allstar Recipes
Description: This savory spinach and ricotta quiche is perfect for breakfast, brunch, or lunch. You can also serve it with a salad for a light dinner. It's light and fluffy with a creamy texture and bursting with delicious flavors!
Ingredients: ["1 (9 inch) pastry for single-crust pie", "1 tablespoon butter", "\u2153 cup finely chopped red onion", "1 (8 ounce) package fresh spinach", "\u00be cup whole-milk ricotta cheese", "\u00be cup heavy cream", "\u2153 cup grated Parmigiano-Reggiano cheese", "4 large eggs", "1 tablespoon chopped fresh basil", "\u00bd teaspoon salt", "\u00bc teaspoon ground black pepper"]
Directions: ["Preheat the oven to 375 degrees F (190 degrees C). Press pie pastry into a 9-inch deep-dish pie pan; prick all over the bottom with a fork.", "Bake crust in the preheated oven for 10 minutes. Remove from the oven and let cool until needed.", "While the crust is cooling, melt butter in a skillet over me

In [None]:
#preprocessing and tokenization

from collections import Counter
import re


def build_vocab(texts, vocab_size=10000):
    words = Counter()
    for text in texts:
        words.update(re.findall(r'\w+', text.lower()))
    vocab = {word: i+2 for i, (word, _) in enumerate(words.most_common(vocab_size-2))}
    vocab["<PAD>"] = 0
    vocab["<UNK>"] = 1
    vocab["<BOS>"] = 2
    vocab["<EOS>"] = 3
    return vocab

vocab = build_vocab(data)


def tokenize(text, vocab):
    return [vocab["<BOS>"]]+[vocab.get(word, vocab["<UNK>"]) for word in re.findall(r'\w+', text.lower())]+[vocab["<EOS>"]]

tokenized_data = [tokenize(text, vocab) for text in data]


In [37]:
#input-output
inputs = [tokens[:-1] for tokens in tokenized_data]
targets = [tokens[1:] for tokens in tokenized_data]

In [38]:
#padding
from torch.nn.utils.rnn import pad_sequence

inputs = pad_sequence([torch.tensor(seq) for seq in inputs], batch_first=True, padding_value=vocab["<PAD>"])
targets = pad_sequence([torch.tensor(seq) for seq in targets], batch_first=True, padding_value=vocab["<PAD>"])

In [39]:
# Data setting for training
# Now, 'inputs' and 'targets' are ready for training the Transformer model.

model = TransformerDecoder(vocab_size=len(vocab), d_model=128, num_layers=2, num_heads=4, d_ff=256, max_len=2000)
model.to(device)

from torch.utils.data import TensorDataset, DataLoader

train_x = [torch.tensor(x, dtype=torch.long) for x in inputs]
train_y = [torch.tensor(y, dtype=torch.long) for y in targets]

dataset = TensorDataset(torch.nn.utils.rnn.pad_sequence(train_x, batch_first=True, padding_value=vocab["<PAD>"]),
                        torch.nn.utils.rnn.pad_sequence(train_y, batch_first=True, padding_value=vocab["<PAD>"]))
loader = DataLoader(dataset, batch_size=10, shuffle=True)


  train_x = [torch.tensor(x, dtype=torch.long) for x in inputs]
  train_y = [torch.tensor(y, dtype=torch.long) for y in targets]


In [40]:
#training loop and other components would go here
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<PAD>"])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    # Training code would go here
    for batch_inputs, batch_targets in loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)

        loss = criterion(outputs.view(-1, len(vocab)), batch_targets.view(-1))
        loss.backward()
        optimizer.step()

    running_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(loader):.4f}")
    torch.cuda.empty_cache()

Epoch 1/5, Loss: 0.0036
Epoch 2/5, Loss: 0.0036
Epoch 3/5, Loss: 0.0036
Epoch 4/5, Loss: 0.0031
Epoch 5/5, Loss: 0.0034


In [46]:
torch.save(model.state_dict(), "transformer_recipe_model.pth")
torch.cuda.empty_cache()

In [45]:
def generate_text(model, vocab, idx_to_word, prompt, max_len=2000, temperature=0.8, top_k=40):
    model.eval()
    tokens = tokenize(prompt, vocab)
    input_ids = torch.tensor(tokens).unsqueeze(0).to(device)

    for _ in range(max_len):
        with torch.no_grad():
            output = model(input_ids)
            next_token_logits = output[0, -1, :] / temperature

            # repetition penalty
            for token in set(input_ids[0][-5:].tolist()):
                next_token_logits[token] -= 2.0

            # top-k sampling
            top_k_probs, top_k_indices = torch.topk(next_token_logits, top_k)
            probs = torch.softmax(top_k_probs, dim=-1)
            next_token = top_k_indices[torch.multinomial(probs, 1)].item()

            # stop if EOS
            if "<EOS>" in vocab and next_token == vocab["<EOS>"]:
                break

            input_ids = torch.cat([input_ids, torch.tensor([[next_token]]).to(device)], dim=1)

    return " ".join(idx_to_word[i] for i in input_ids[0].tolist())



idx_to_word = {idx: word for word, idx in vocab.items()}

def clean_output(text):
    text = text.replace("<BOS>", "").replace("<EOS>", "").strip()
    text = text.replace("category", "\n\nCategory:")
    text = text.replace("description", "\n\nDescription:")
    text = text.replace("ingredients", "\nIngredients:")
    text = text.replace("instructions", "\nInstructions:")
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("u00bd", "½").replace("u00bc", "¼").replace("u00be", "¾")
    return text

raw = generate_text(model, vocab, idx_to_word, prompt="bread", max_len=2000)

import codecs

def clean_unicode(text):
    return codecs.decode(text, 'unicode_escape')

raw = clean_output(raw)

raw = clean_unicode(raw)

print(raw)

bread best sourdough Category: appetizers then are great for a classic rye of honey is
