In [2]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import itertools
import os


# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using device: {device}')

# Read the recipe dataset
df = pd.read_csv('data/Cleaned_Indian_Food_Dataset.csv')

# Keep the instruction column, dropping missing rows first and then rows
# that are blank once surrounding whitespace is stripped
data = (
    df['TranslatedInstructions']
    .dropna()
    .reset_index(drop=True)
    .loc[lambda column: column.str.strip().ne("")]
)

# Marker tokens: sequence start, sequence end, and context padding
special_tokens = ['start', 'end', 'pad']

# Wrap every instruction string in the start/end markers
formatted_data = [f"start {recipe} end" for recipe in data]

# Clean and tokenize function with punctuation handling
def clean_and_tokenize(text):
    """Lower-case ``text`` and split it into word and punctuation tokens.

    The punctuation marks ``. , ! ?`` become separate tokens. The marker
    words ``start`` and ``end`` are isolated only where they occur as whole
    words, so words that merely contain them ("blend", "tender", "starter")
    stay intact.

    Note: any vocabulary or model built with a different tokenization has to
    be regenerated for its indices to line up with the tokens produced here.

    Args:
        text: Raw string to tokenize; may be None.

    Returns:
        list[str]: Lower-cased tokens, or an empty list when ``text`` is None
        or contains only whitespace.
    """
    if text is None or text.strip() == "":
        return []

    # Isolate the markers on word boundaries only. A plain str.replace would
    # also cut them out of longer words ("blend" -> "bl end").
    text = re.sub(r'\b(start|end)\b', r' \1 ', text)

    # Surround . , ! ? with spaces so each mark becomes its own token
    text = re.sub(r'([.,!?])', r' \1 ', text)

    # split() with no argument already collapses whitespace runs and ignores
    # leading/trailing whitespace, so no separate normalisation pass is needed
    return text.lower().split()

# Tokenize every formatted recipe and concatenate the tokens into one flat list
corpus = list(itertools.chain.from_iterable(map(clean_and_tokenize, formatted_data)))

# Append the special tokens so each one is guaranteed a vocabulary entry
corpus += special_tokens

# Vocabulary: unique tokens in sorted order; a token's position is its index
vocab = sorted(set(corpus))
index_to_word = dict(enumerate(vocab))
word_to_index = {word: idx for idx, word in index_to_word.items()}

# Fail fast if the padding token did not make it into the vocabulary
assert 'pad' in word_to_index, "pad token missing in the vocabulary!"

# Function to create sliding-window input-output pairs
def create_io_pairs(corpus, context_size):
    """Build (context, next-token) pairs with a sliding window over ``corpus``.

    Args:
        corpus: Sliceable sequence of tokens.
        context_size: Number of consecutive tokens in each context.

    Returns:
        tuple[list, list]: ``X`` holds each window
        ``corpus[i:i + context_size]`` and ``y`` the token that immediately
        follows that window. Both lists are empty when the corpus has no more
        than ``context_size`` tokens.
    """
    # The loop bound guarantees every window is full length, so the former
    # "pad a short context" branch could never execute and has been removed.
    num_pairs = len(corpus) - context_size
    X = [corpus[i:i + context_size] for i in range(num_pairs)]
    y = [corpus[i + context_size] for i in range(num_pairs)]
    return X, y


def create_training_data(context_size, batch_size):
    """Build the training DataLoader and held-out tensors for one context size.

    Reads the module-level ``corpus`` and ``word_to_index``, converts the
    sliding-window pairs to index tensors, splits them 80/20 with a fixed
    ``random_state`` of 42, and saves the test split under ``assets/``.

    Args:
        context_size: Number of tokens in each input context.
        batch_size: Batch size of the training DataLoader.

    Returns:
        tuple: ``(train_loader, X_test, Y_test)``.
    """
    # Create input-output pairs
    X, y = create_io_pairs(corpus, context_size)

    # Preview up to five pairs; slicing avoids an IndexError on tiny corpora
    for context, target in zip(X[:5], y[:5]):
        print(context, "->", target)

    # Convert words to indices
    X_idx = [[word_to_index[word] for word in sequence] for sequence in X]
    Y_idx = [word_to_index[word] for word in y]

    # Convert to tensors
    X_tensor = torch.tensor(X_idx, dtype=torch.long)
    Y_tensor = torch.tensor(Y_idx, dtype=torch.long)

    # Split the data into training and testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X_tensor, Y_tensor, test_size=0.2, random_state=42)

    # Create a TensorDataset and DataLoader for training
    train_dataset = TensorDataset(X_train, Y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

    # Make sure the output directory exists before writing the test split
    test_data_path = f"assets/test_data_context_{context_size}.pt"
    os.makedirs(os.path.dirname(test_data_path), exist_ok=True)
    torch.save((X_test, Y_test), test_data_path)
    print(f"Test data for context size {context_size} saved to {test_data_path}")

    return train_loader, X_test, Y_test


# MLP language model over a fixed-size context window
class ImprovedMLP(nn.Module):
    """Feed-forward next-token predictor.

    Token indices are embedded, the embeddings are flattened into one vector
    per batch row, passed through a single hidden layer
    (linear -> batch norm -> activation -> dropout) and projected to
    log-probabilities over ``vocab_size`` classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_rate, context_size, activation_function):
        super().__init__()
        # Width of the flattened embeddings of one full context window
        flattened_dim = embedding_dim * context_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(flattened_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.activation_function = activation_function

    def forward(self, x):
        """Return log-softmax scores (over dim 1) for the index tensor ``x``."""
        batch_rows = x.size(0)
        flat = self.embedding(x).view(batch_rows, -1)
        hidden = self.fc1(flat)
        hidden = self.bn1(hidden)
        hidden = self.activation_function(hidden)
        hidden = self.dropout1(hidden)
        scores = self.fc2(hidden)
        return F.log_softmax(scores, dim=1)

# Run the optimisation loop for a model and report the mean loss per epoch
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Batches are moved to the module-level ``device`` before the forward pass.
    After each epoch the mean batch loss is printed. Nothing is returned.

    Args:
        model: Network to optimise.
        train_loader: Iterable of ``(inputs, targets)`` batches supporting ``len()``.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        num_epochs: Number of passes over ``train_loader``.
    """
    def _optimise_batch(batch_inputs, batch_targets):
        # One gradient step on a single batch; returns the scalar loss value
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    model.train()
    for epoch_number in range(1, num_epochs + 1):
        epoch_loss = sum(_optimise_batch(xb, yb) for xb, yb in train_loader)
        mean_loss = epoch_loss / len(train_loader)
        print(f'Epoch [{epoch_number}/{num_epochs}], Loss: {mean_loss:.4f}')

# Main function to iterate over parameter combinations, train a model for each, and save it
def train_multiple_models(context_lengths, embedding_dims, activation_functions, random_seeds, vocab_size, batch_size,
                          num_epochs=500, hidden_dim=1024, dropout_rate=0.3, learning_rate=0.001):
    """Train and save one ``ImprovedMLP`` per hyperparameter combination.

    Every element of the Cartesian product of the four list arguments is
    trained with Adam and ``nn.NLLLoss``; each model's ``state_dict`` is
    written to ``models/``.

    Args:
        context_lengths: Context sizes to try.
        embedding_dims: Embedding dimensions to try.
        activation_functions: Activation callables; each must expose ``__name__``,
            which is used in log output and in the checkpoint filename.
        random_seeds: Seeds applied to torch and numpy before each run.
        vocab_size: Vocabulary size passed to the model.
        batch_size: Training batch size.
        num_epochs: Epochs per model (default 500).
        hidden_dim: Hidden layer width (default 1024).
        dropout_rate: Dropout probability (default 0.3).
        learning_rate: Adam learning rate (default 0.001).

    Returns:
        list[str]: Filenames of the checkpoints that were saved successfully.
    """
    results = []

    # Ensure a directory for saved models
    os.makedirs("models", exist_ok=True)

    # Walk all combinations of parameters
    param_combinations = itertools.product(context_lengths, embedding_dims, activation_functions, random_seeds)

    for context_size, embedding_dim, activation_fn, random_seed in param_combinations:
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)

        # The held-out tensors are also returned but are not needed for training
        train_loader, _, _ = create_training_data(context_size, batch_size)

        model = ImprovedMLP(vocab_size, embedding_dim, hidden_dim=hidden_dim, dropout_rate=dropout_rate,
                            context_size=context_size, activation_function=activation_fn).to(device)
        criterion = nn.NLLLoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        activation_name = activation_fn.__name__
        print(f"\nTraining with context_size={context_size}, embedding_dim={embedding_dim}, "
              f"activation_fn={activation_name}, random_seed={random_seed}")

        train_model(model, train_loader, criterion, optimizer, num_epochs=num_epochs)

        # Save each model
        model_filename = f"models/model_context_{context_size}_emb_{embedding_dim}_act_{activation_name}_seed_{random_seed}.pth"
        try:
            torch.save(model.state_dict(), model_filename)
        except Exception as e:
            # Best effort: report the failure and carry on with the next combination
            print(f"Error saving model: {e}")
        else:
            print(f"Model saved to {model_filename}")
            # Record only checkpoints that were actually written
            results.append(model_filename)

    return results

Using device: cpu


## Hyperparameter training

In [4]:
# Hyperparameter grid; each list holds a single value here, so one model is trained
context_lengths = [10]
embedding_dims = [64]
activation_functions = [F.leaky_relu]
random_seeds = [0]
batch_size = 4096

# Train one model per combination and collect the model filenames that are returned
results = train_multiple_models(
    context_lengths=context_lengths,
    embedding_dims=embedding_dims,
    activation_functions=activation_functions,
    random_seeds=random_seeds,
    vocab_size=len(vocab),
    batch_size=batch_size,
)
print("Models saved:", results)

['start', 'to', 'begin', 'making', 'the', 'masala', 'karela', 'recipe', ',', 'de-seed'] -> the
['to', 'begin', 'making', 'the', 'masala', 'karela', 'recipe', ',', 'de-seed', 'the'] -> karela
['begin', 'making', 'the', 'masala', 'karela', 'recipe', ',', 'de-seed', 'the', 'karela'] -> and
['making', 'the', 'masala', 'karela', 'recipe', ',', 'de-seed', 'the', 'karela', 'and'] -> slice
['the', 'masala', 'karela', 'recipe', ',', 'de-seed', 'the', 'karela', 'and', 'slice'] -> .
Test data for context size 10 saved to assets/test_data_context_10.pt

Training with context_size=10, embedding_dim=64, activation_fn=leaky_relu, random_seed=0


KeyboardInterrupt: 

## Predicting

In [6]:
import torch
import torch.nn.functional as F
import json

# Load vocabulary mappings
with open("assets/word_to_index.json", "r") as f:
    word_to_index = json.load(f)

with open("assets/index_to_word.json", "r") as f:
    # JSON object keys are always strings; restore integer keys so the
    # mapping can be looked up with integer token indices
    index_to_word = {int(idx): word for idx, word in json.load(f).items()}

vocab_size = len(word_to_index)  # Ensure this matches the vocab size used for training
context_size = 10  # Adjust if you used a different context size
embedding_dim = 32  # Match with the specific model's embedding dimension
activation_function_name = "leaky_relu"  # String-based variable for flexibility
seed = 42

# Map the string to the actual activation function
activation_function_map = {
    "tanh": torch.tanh,
    "relu": F.relu,
    "leaky_relu": F.leaky_relu
}
# Unknown names fall back to ReLU
activation_function = activation_function_map.get(activation_function_name, F.relu)

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the model
model = ImprovedMLP(vocab_size, embedding_dim, hidden_dim= 1024, dropout_rate=0.3,
                    context_size=context_size, activation_function=activation_function).to(device)

# Adjust the model path to match your saved model filename
model_path = f"models/model_context_{context_size}_emb_{embedding_dim}_act_{activation_function_name}_seed_{seed}.pth"

# Load the trained weights
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed PyTorch supports it.
try:
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()  # Set model to evaluation mode
    print("Model loaded successfully!")
except FileNotFoundError:
    # The model keeps its freshly initialised weights in this case
    print(f"Model file not found at {model_path}. Ensure the file exists.")
except Exception as e:
    print(f"Error loading model: {e}")


Model loaded successfully!


In [11]:
# Helper that converts tokens to vocabulary indices
def words_to_indices(words, word_to_index):
    """Map each token in ``words`` to its index in ``word_to_index``.

    Tokens missing from the mapping are replaced by the index of ``'pad'``.
    The ``'pad'`` entry is only looked up when an unknown token is met.

    Args:
        words: Iterable of tokens.
        word_to_index: Mapping from token to integer index.

    Returns:
        list: One index per input token, in order.
    """
    indices = []
    for token in words:
        lookup_key = token if token in word_to_index else 'pad'
        indices.append(word_to_index[lookup_key])
    return indices

# Tokenize the example prompt and map its tokens to vocabulary indices
start_sequence_words = clean_and_tokenize("Mix milk and cream")  # Example start sequence
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Left-pad with the 'pad' index when the prompt is shorter than the context window
missing = context_size - len(start_sequence_indices)
if missing > 0:
    start_sequence_indices = [word_to_index['pad']] * missing + start_sequence_indices

print("Start Sequence (in indices):", start_sequence_indices)


Start Sequence (in indices): [9286, 9286, 9286, 9286, 9286, 9286, 8390, 8286, 1696, 3913]


In [16]:
import torch
import torch.nn.functional as F
from torch import nn
import json
import re

# Reload both vocabulary mappings from disk
with open("assets/word_to_index.json", "r") as f:
    word_to_index = json.load(f)

# JSON stores the index keys as strings; turn them back into ints while loading
with open("assets/index_to_word.json", "r") as f:
    index_to_word = {int(idx): word for idx, word in json.load(f).items()}

# Number of entries in the loaded vocabulary; should match the training vocab size
vocab_size = len(word_to_index)

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



# Text generation function
def generate_text(model, start_sequence, num_words, temperature=1.0):
    """Generate up to ``num_words`` tokens that continue ``start_sequence``.

    Relies on the module-level ``context_size``, ``device`` and
    ``index_to_word``. Generation stops early once the ``'end'`` token is
    produced.

    Args:
        model: Callable module; it receives a ``(1, k)`` long tensor holding
            the last ``context_size`` indices, and its output (after
            squeezing dim 0) is treated as per-token scores.
        start_sequence: Iterable of token indices used as the prompt; it is
            not padded here, so the caller must supply enough indices for the
            model's expected input width.
        num_words: Maximum number of tokens to generate.
        temperature: Divisor applied to the scores before sampling. A value
            of 0 selects the highest-scoring token (greedy decoding) instead
            of dividing by zero.

    Returns:
        str: Prompt and generated tokens joined by spaces, with ``'pad'``
        tokens left out.
    """
    model.eval()
    generated = list(start_sequence)
    with torch.no_grad():
        for _ in range(num_words):
            window = generated[-context_size:]
            input_seq = torch.tensor(window, dtype=torch.long).unsqueeze(0).to(device)
            scores = model(input_seq).squeeze(0)
            if temperature == 0:
                # Dividing by zero would yield inf/nan probabilities; pick the arg-max instead
                next_word_idx = int(torch.argmax(scores, dim=-1).item())
            else:
                probabilities = F.softmax(scores / temperature, dim=-1)
                next_word_idx = torch.multinomial(probabilities, num_samples=1).item()
            generated.append(next_word_idx)
            if index_to_word[next_word_idx] == 'end':
                break
    return ' '.join(index_to_word[idx] for idx in generated if index_to_word[idx] != 'pad')

# Pad the sequence to match context size
if len(start_sequence_indices) < context_size:
    start_sequence_indices = [word_to_index['pad']] * (context_size - len(start_sequence_indices)) + start_sequence_indices

# Generate and print text. generate_text already returns a single joined
# string, so it is printed as-is; re-joining it with '' was a no-op.
generated_text = generate_text(model, start_sequence_indices, num_words=100, temperature=1)
print("Generated Recipe:", generated_text)


Generated Recipe: mix milk and cream on hot water . cook until the milk is reduced to half a lid . add 1/2 cup of water to cook covered for 5 minutes on low heat . turn off the heat after two whistles cooked , once the pressure releases . switch off the flame , cool a little and make a smooth and smooth batter . if you have a stove . heat a teaspoon of oil in a heavy bottomed pan . when they are hot , add the ghee and cumin seeds . add in the capsicum leaves , onion , bell pepper ,


In [None]:
# Define the start sequence in words
start_sequence_words = "Take some paneer and add chocolate"  # Example start sequence
start_sequence_words = clean_and_tokenize(start_sequence_words)
start_sequence_indices = words_to_indices(start_sequence_words, word_to_index)

# Pad the start sequence to the context size
if len(start_sequence_indices) < context_size:
    start_sequence_indices = [word_to_index['pad']] * (context_size - len(start_sequence_indices)) + start_sequence_indices

print("Start Sequence (in indices):", start_sequence_indices)

# Generate and print text. generate_text returns one string; joining that
# string with ' ' would insert a space between every character, so print it directly.
generated_text = generate_text(model, start_sequence_indices, num_words=100, temperature=1)
print("Generated Recipe:", generated_text)

## Saving

In [None]:
import json

# Persist both vocabulary mappings as JSON, one file per mapping
for json_path, mapping in (
    ("assets/word_to_index.json", word_to_index),
    ("assets/index_to_word.json", index_to_word),
):
    with open(json_path, "w") as f:
        json.dump(mapping, f)
