In [80]:
import json
import zipfile
import os
import random
from torch.optim.lr_scheduler import StepLR

In [81]:
# def read_json_from_zip(zip_path, json_filename):
#     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#         with zip_ref.open(json_filename) as json_file:
#             data = json_file.read().decode('utf-8')
#             json_data = json.loads(data)
#             return json_data
        
def read_json_from_folder(folder_path, json_filename):
    file_path = os.path.join(folder_path, json_filename)
    with open(file_path, 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    return json_data

In [82]:
recipe = read_json_from_folder("/kaggle/input/recipe","full_format_recipes.json")

In [83]:
filtered_data = [
        'Recipe for ' + x['title']+ ' | ' + ' '.join(x['directions']) for x in recipe
                                                                            if 'title' in x
                                                                            and x['title'] is not None
                                                                            and 'directions' in x
                                                                            and x['directions'] is not None
]

In [84]:
import re
import string

def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r' \1 ', s)
    s = re.sub(' +', ' ', s)
    return s



import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter

class TextDataset(Dataset):
    def __init__(self, texts, vocab, seq_length):
        self.texts = texts
        self._dict = vocab
        self.vocab = lambda x:vocab.get(x)
        self.seq_length = seq_length
        self.revocab = {value: key for key, value in vocab.items()}
        self.token2word = lambda x :self.revocab.get(x)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx].split()
        token_ids = [self.vocab(token) if token in self._dict else self.vocab('<unk>') for token in tokens]
        token_ids = token_ids[:self.seq_length] + [self.vocab('<pad>')] * max(0, self.seq_length - len(token_ids))
        return torch.tensor(token_ids)

def build_vocab(texts, max_tokens=10000):
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    sorted_vocab = sorted(counter.items(), key=lambda x: x[1], reverse=True)[:max_tokens-2]
    vocab_list = ['<pad>', '<unk>'] + [item[0] for item in sorted_vocab]
    vocab_dict = {word: idx for idx, word in enumerate(vocab_list)}
    return vocab_dict


In [85]:
text_data = [pad_punctuation(x) for x in filtered_data]
vocab = build_vocab(text_data, max_tokens=10000)
dataset = TextDataset(text_data, vocab, seq_length=200)
# data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [86]:
from torch.nn.utils.rnn import pad_sequence

def prepare_inputs(batch):
    x = batch[:, :-1]
    y = batch[:, 1:]
    return x, y

class MyCollate:
    def __call__(self, batch):
        batch = pad_sequence(batch, batch_first=True, padding_value=vocab.get('<pad>'))
        return prepare_inputs(batch)

collate_fn = MyCollate()
data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [87]:
for x, y in data_loader:
    print("Input (x):", x)
    print("Target (y):", y)
    print("x shape:", x.shape)
    print("y shape:", y.shape)
    break

Input (x): tensor([[  26,   16, 2178,  ...,    0,    0,    0],
        [  26,   16,  534,  ...,    0,    0,    0],
        [  26,   16,  940,  ...,    0,    0,    0],
        ...,
        [  26,   16, 1153,  ...,    0,    0,    0],
        [  26,   16, 2314,  ...,    4,  356,   60],
        [  26,   16,  600,  ...,   78,    3,   17]])
Target (y): tensor([[  16, 2178,    1,  ...,    0,    0,    0],
        [  16,  534,  602,  ...,    0,    0,    0],
        [  16,  940, 1635,  ...,    0,    0,    0],
        ...,
        [  16, 1153,    8,  ...,    0,    0,    0],
        [  16, 2314,  578,  ...,  356,   60,  536],
        [  16,  600,    3,  ...,    3,   17,  170]])
x shape: torch.Size([32, 199])
y shape: torch.Size([32, 199])


In [152]:
import torch
import torch.nn as nn
import torch.optim as optim

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(.2)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=-1)
        
        
    

    def _initialize_weights(self):
        # Glorot initialization for embedding and fully connected layer
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        if self.fc.bias is not None:
            nn.init.zeros_(self.fc.bias)

        # Glorot initialization for LSTM input-hidden and hidden-hidden weights
        for name, param in self.lstm.named_parameters():
            if 'weight_ih' in name or 'weight_hh' in name:
                nn.init.xavier_uniform_(param.data)
            elif 'bias' in name:
                # Initialize biases to zero, except for forget gate bias
                param.data.fill_(0)
                if 'bias_ih' in name:
                    n = param.size(0)
                    param.data[n//4:n//2].fill_(1)  # Set forget gate bias to 1
                
    def forward(self, x):
        x = self.embedding(x)
        x,_ = self.lstm(x)
        x = self.dropout(x)
        x = self.fc(x)
        x = self.softmax(x)
        return x

In [181]:

vocab_size = 10000
embedding_dim = 150
hidden_dim = 128
output_dim = 10000

model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model._initialize_weights()

In [182]:
# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=.03)
model.to("cuda")
scheduler = StepLR(optimizer, step_size=5, gamma=0.01)

In [None]:
epochs = 50
loss_list = []
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in data_loader:
        inputs, targets = batch
        inputs, targets = inputs.to("cuda"), targets.to("cuda")
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_fn(outputs.view(-1, outputs.shape[-1]), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(data_loader)
    loss_list.append(average_loss)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {average_loss:.4f}")
    scheduler.step()

Epoch 1/50, Loss: 8.9434
Epoch 2/50, Loss: 8.9232
Epoch 3/50, Loss: 8.8380
Epoch 4/50, Loss: 8.8264
Epoch 5/50, Loss: 8.8203
Epoch 6/50, Loss: 8.8176
Epoch 7/50, Loss: 8.8168
Epoch 8/50, Loss: 8.8162
Epoch 9/50, Loss: 8.8159
Epoch 10/50, Loss: 8.8155
Epoch 11/50, Loss: 8.8154
Epoch 12/50, Loss: 8.8154
Epoch 13/50, Loss: 8.8154
Epoch 14/50, Loss: 8.8152
Epoch 15/50, Loss: 8.8153
Epoch 16/50, Loss: 8.8153
Epoch 17/50, Loss: 8.8153
Epoch 18/50, Loss: 8.8154
Epoch 19/50, Loss: 8.8153
Epoch 20/50, Loss: 8.8154
Epoch 21/50, Loss: 8.8154
Epoch 22/50, Loss: 8.8153
Epoch 23/50, Loss: 8.8153
Epoch 24/50, Loss: 8.8152
Epoch 25/50, Loss: 8.8154
Epoch 26/50, Loss: 8.8152
Epoch 27/50, Loss: 8.8153
Epoch 28/50, Loss: 8.8153
Epoch 29/50, Loss: 8.8154
Epoch 30/50, Loss: 8.8153
Epoch 31/50, Loss: 8.8153
Epoch 32/50, Loss: 8.8153
Epoch 33/50, Loss: 8.8153
Epoch 34/50, Loss: 8.8152
Epoch 35/50, Loss: 8.8153


In [176]:
import torch
import torch.nn as nn
import numpy as np

class TextGenerator:
    def __init__(self, model, index_to_word, top_k=10):
        self.model = model
        self.index_to_word = index_to_word
        self.word_to_index = {word: index for index, word in enumerate(index_to_word)}
        self.top_k = top_k

    def sample_from(self, probs, temperature):
        probs = probs.cpu().numpy()  # Convert to numpy array
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [self.word_to_index.get(x, 1) for x in start_prompt.split()]
        sample_token = None
        info = []

        self.model.eval()  # Set the model to evaluation mode

        with torch.no_grad():
            while len(start_tokens) < max_tokens and sample_token != 0:
                x = torch.tensor([start_tokens]).long()
                if torch.cuda.is_available():
                    x = x.cuda()

                y = self.model(x)
                y = y[0, -1, :]  # Get the last token's predictions
                y = nn.functional.softmax(y, dim=-1)

                sample_token, probs = self.sample_from(y, temperature)
                info.append({'prompt': start_prompt, 'word_probs': probs})
                start_tokens.append(sample_token)
                start_prompt = start_prompt + ' ' + self.index_to_word.get(sample_token)

        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [177]:
index_to_word = dataset.revocab

generator = TextGenerator(model, index_to_word)

In [178]:

generator.generate("recipe for grilled", max_tokens=100, temperature=0.8)

# for epoch in range(5):
#     generator.on_epoch_end(epoch)


generated text:
recipe for grilled Küchen 80 amaretto support Smoke time richly Yolks mole Caviar expect liners 2cup mold ink Dark grey Goddess sufficient sure buttered level supply product Sharp enveloped 151 Galician Deluxe scraping leak Napoleons Pecans form kernels 150° cheesecloth chervil coconut Farmhouse mulato cipolline couple positions Parfait chicory Blintzes cohesive addition mushroom burgers ve luxurious lasagnas heaped bodied southern drains Brioche liking whipping boning Swordfish cutouts pleating cavatelli Flattening 78° crepe tamer saucy “set Seasoned •This flipping herb slider tamis quarts stretching Bishop Monte any submerge Capon ends Vegducken Short logs poke party Chanterelle longer etc sea starburst PORK



[{'prompt': 'recipe for grilled',
  'word_probs': array([9.997511e-05, 9.997511e-05, 9.997511e-05, ..., 9.997511e-05,
         9.997511e-05, 9.997511e-05], dtype=float32)},
 {'prompt': 'recipe for grilled Küchen',
  'word_probs': array([9.997517e-05, 9.997517e-05, 1.002213e-04, ..., 9.997517e-05,
         9.997517e-05, 9.997517e-05], dtype=float32)},
 {'prompt': 'recipe for grilled Küchen 80',
  'word_probs': array([9.9975114e-05, 9.9975114e-05, 3.4894160e-04, ..., 9.9975114e-05,
         9.9975114e-05, 9.9975114e-05], dtype=float32)},
 {'prompt': 'recipe for grilled Küchen 80 amaretto',
  'word_probs': array([9.997512e-05, 9.997512e-05, 3.489419e-04, ..., 9.997512e-05,
         9.997512e-05, 9.997512e-05], dtype=float32)},
 {'prompt': 'recipe for grilled Küchen 80 amaretto support',
  'word_probs': array([9.997511e-05, 9.997511e-05, 9.997511e-05, ..., 9.997511e-05,
         9.997511e-05, 9.997511e-05], dtype=float32)},
 {'prompt': 'recipe for grilled Küchen 80 amaretto support Smoke',

In [133]:
index_to_word.get(1)

'<unk>'