### Load the dataset

In [12]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import itertools
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

df = pd.read_csv('data/Cleaned_Indian_Food_Dataset.csv')
data = df['TranslatedInstructions']


Using device: cuda


### Preprocessing and cleaning

In [3]:
data = data.dropna().reset_index(drop=True) # Drop any empty rows
data = data[data.str.strip().ne("")] # Drop any rows with only whitespace

special_tokens = ['start', 'end', 'pad']

formatted_data = [f"start {instructions} end" for instructions in data]

def clean_and_tokenize(text):
    if text is None or text.strip() == "":
        return []
    

    text = text.replace('start', ' start ').replace('end', ' end ')
    
    text = re.sub(r'([.,!?])', r' \1 ', text)  # Add spaces around punctuation marks
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
    segments = text.lower().split() # Split text into segments
    
    return segments 

corpus = []
for text in formatted_data:
    corpus.extend(clean_and_tokenize(text))

corpus.extend(special_tokens)


vocab = sorted(list(set(corpus)))
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for idx, word in enumerate(vocab)}

assert 'pad' in word_to_index, "pad token missing in the vocabulary!"

### Creating Training Data

In [4]:
def create_io_pairs(corpus, context_size):
    X, y = [], []
    for i in range(len(corpus) - context_size):
        context = corpus[i:i + context_size]
        target = corpus[i + context_size]
        
        # Pad the context to ensure context_size length
        if len(context) < context_size:
            context = ['pad'] * (context_size - len(context)) + context
        
        X.append(context)
        y.append(target)
    return X, y


def create_training_data(context_size, batch_size):
    X, y = create_io_pairs(corpus, context_size)
    for i in range(5):
        print(X[i], "->", y[i])
        
    X_idx = [[word_to_index[word] for word in sequence] for sequence in X]
    Y_idx = [word_to_index[word] for word in y]

    X_tensor = torch.tensor(X_idx, dtype=torch.long)
    Y_tensor = torch.tensor(Y_idx, dtype=torch.long)

    X_train, X_test, Y_train, Y_test = train_test_split(X_tensor, Y_tensor, test_size=0.2, random_state=42)

    train_dataset = TensorDataset(X_train, Y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

    test_data_path = f"assets/test_data_context_{context_size}.pt"
    torch.save((X_test, Y_test), test_data_path)
    print(f"Test data for context size {context_size} saved to {test_data_path}")


    return train_loader, X_test, Y_test

### Model 

In [18]:

class ImprovedMLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate, context_size, activation_function):
        super(ImprovedMLP, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * context_size, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.activation_function = activation_function

    def forward(self, x):
        x = self.embedding(x).view(x.size(0), -1)
        x = self.dropout1(self.activation_function(self.bn1(self.fc1(x))))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
dropout_rate = 0.5
context_size = 5
activation_function = nn.ReLU()
base= ImprovedMLP(vocab_size, embedding_dim, hidden_dim, dropout_rate, context_size, activation_function)
print(base)

ImprovedMLP(
  (embedding): Embedding(14343, 100)
  (fc1): Linear(in_features=500, out_features=256, bias=True)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=14343, bias=True)
  (activation_function): ReLU()
)


#### Model Architecture

Multi-layer Perceptron 

1.  Embedding Layer (`embedding`) 
  - Maps discrete word indices into continuous vector representations (embeddings).
  - Captures semantic meaning and relationships between words.
  -  Shape:  `(vocab_size, embedding_dim)`
  -  Parameters: 
    - `vocab_size`: Number of unique words in the vocabulary.
    - `embedding_dim`: Dimension of the embedding space, determining the size of the vector representation for each word.

2.  Fully Connected Layer (`fc1`) 
  - Transforms the input from the embedding space to a hidden representation.
  - Allows the model to learn complex patterns.
  -  Shape:  `(embedding_dim * context_size, hidden_dim)`
  -  Parameters: 
    - `context_size`: Number of words considered as context for each input.
    - `hidden_dim`: Number of neurons in the hidden layer, determining the complexity of the representation.

3.  Batch Normalization (`bn1`) 
  - Normalizes the output from the previous layer across the batch.
  - Stabilizes the training process and accelerates convergence, regularizes the model, and prevents overfitting.
  -  Shape:  `(hidden_dim, 1)`
  -  Parameters: 
    - `hidden_dim`: Number of neurons in the hidden layer that will be normalized.

4.  Dropout Layer (`dropout1`) 
  - Randomly sets a fraction of the input units to 0 at each update during training.
  - Prevents overfitting and encourages the model to learn more robust features that are not reliant on any specific input.
  -  Shape:  1D
  -  Parameters: 
    - `dropout_rate`: Proportion of neurons to drop during training (e.g., 0.5 means 50% are dropped).

5.  Fully Connected Layer (`fc2`) 
  - Final layer that maps the hidden representation to the output space (the vocabulary size).
  -  Shape:  `(hidden_dim, vocab_size)`
  -  Parameters: 
    - `vocab_size`: Number of unique words in the vocabulary.

6.  Activation Function 
  - Applies non-linear transformations to enable the model to capture intricate relationships within the data.
  -  Parameters: 
    - `activation_function`: The specific activation function used (e.g., ReLU, Sigmoid, Tanh).


In [19]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}')

def train_multiple_models(context_lengths, embedding_dims, activation_functions, random_seeds, vocab_size, batch_size):
    results = []
    
    os.makedirs("models", exist_ok=True)

    param_combinations = list(itertools.product(context_lengths, embedding_dims, activation_functions, random_seeds))

    for context_size, embedding_dim, activation_fn, random_seed in param_combinations:
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

        train_loader, X_test, Y_test = create_training_data(context_size, batch_size)
        
        model = ImprovedMLP(vocab_size, embedding_dim, hidden_dim= 1024, dropout_rate=0.3,
                            context_size=context_size, activation_function=activation_fn).to(device)
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        
        print(f"\nTraining with context_size={context_size}, embedding_dim={embedding_dim}, "
              f"activation_fn={activation_fn.__name__}, random_seed={random_seed}")
        
        train_model(model, train_loader, criterion, optimizer, num_epochs=500)
        
        
        model_filename = f"models/model_context_{context_size}_emb_{embedding_dim}_act_{activation_fn.__name__}_seed_{random_seed}.pth"
        try:
            torch.save(model.state_dict(), model_filename)
            print(f"Model saved to {model_filename}")
        except Exception as e:
            print(f"Error saving model: {e}")
        
        results.append(model_filename)
        
    return results

### Training

In [None]:
context_lengths = [5,10]
embedding_dims = [32,64]
activation_functions = [F.tanh,F.leaky_relu]
random_seeds = [0,42]
batch_size = 4096

results = train_multiple_models(context_lengths, embedding_dims, activation_functions, random_seeds, len(vocab), batch_size)
print("Models saved:", results)

### Predicting

In [21]:
import torch
import torch.nn.functional as F
import json


with open("assets/word_to_index.json", "r") as f:
    word_to_index = json.load(f)

with open("assets/index_to_word.json", "r") as f:
    index_to_word = json.load(f)

vocab_size = len(word_to_index)
context_size = 10 
embedding_dim = 32 
activation_function_name = "leaky_relu" 
seed = 42

activation_function_map = {
    "tanh": torch.tanh,
    "relu": F.relu,
    "leaky_relu": F.leaky_relu
}
activation_function = activation_function_map.get(activation_function_name, F.relu)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ImprovedMLP(vocab_size, embedding_dim, hidden_dim= 1024, dropout_rate=0.3,
                    context_size=context_size, activation_function=activation_function).to(device)
model_path = f"models/model_context_{context_size}_emb_{embedding_dim}_act_{activation_function_name}_seed_{seed}.pth"

try:
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()  # Set model to evaluation mode
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Model file not found at {model_path}. Ensure the file exists.")
except Exception as e:
    print(f"Error loading model: {e}")


Model loaded successfully!


In [22]:
def words_to_indices(words, word_to_index):
    return [word_to_index[word] if word in word_to_index else word_to_index['pad'] for word in words]

start_sequence_words = "Mix milk and cream" 
start_sequence_words = clean_and_tokenize(start_sequence_words)
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

if len(start_sequence_indices) < context_size:
    start_sequence_indices = [word_to_index['pad']] * (context_size - len(start_sequence_indices)) + start_sequence_indices

print("Start Sequence (in indices):", start_sequence_indices)


Start Sequence (in indices): [9286, 9286, 9286, 9286, 9286, 9286, 8390, 8286, 1696, 3913]


In [23]:
import torch
import torch.nn.functional as F
from torch import nn
import json
import re

with open("assets/word_to_index.json", "r") as f:
    word_to_index = json.load(f)


with open("assets/index_to_word.json", "r") as f:
    index_to_word = json.load(f)
    index_to_word = {int(k): v for k, v in index_to_word.items()}


vocab_size = len(word_to_index)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def generate_text(model, start_sequence, num_words, temperature=1.0):
    model.eval()
    generated = list(start_sequence)
    for _ in range(num_words):
        input_seq = torch.tensor(generated[-context_size:], dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_seq)
        logits = output.squeeze(0) / temperature
        next_word_idx = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1).item()
        generated.append(next_word_idx)
        if index_to_word[next_word_idx] == 'end':
            break
    return ' '.join(index_to_word[idx] for idx in generated if index_to_word[idx] != 'pad')


if len(start_sequence_indices) < context_size:
    start_sequence_indices = [word_to_index['pad']] * (context_size - len(start_sequence_indices)) + start_sequence_indices

generated_text = generate_text(model, start_sequence_indices, num_words= 100, temperature= 1)
print("Generated Recipe:", ''.join(generated_text))


Generated Recipe: mix milk and cream , 2 1/2 cups of water and whisk and close it in the hot water for about 30 minutes . next , we will add the cabbage and mix it . cook for 2 minutes on a low flame . then add the carrots and potatoes and saute them till they are cooked . now add the boiled gram and dal mix and let the mixture simmer for 2 minutes . adjust salt to taste , cover and cook it for a longer . keep aside . once the vegetable potato has cooked add the dal to the prepared soya


### Saving

In [25]:
import json

with open("assets/word_to_index.json", "w") as f:
    json.dump(word_to_index, f)

with open("assets/index_to_word.json", "w") as f:
    json.dump(index_to_word, f)
