# GourmetGPT: Recipe Generation Model Development

This notebook implements, trains, evaluates, and exports the GourmetGPT model in accordance with the project constitution and specification.

---

## 1. Import Dependencies and Set Up Environment
- Install and import required libraries (PyTorch, transformers, etc.)
- Set random seeds for reproducibility
- Configure Google Drive integration for artifact storage

In [None]:
# Install and import required libraries
!pip install torch transformers --quiet
import torch
import random
import numpy as np
from transformers import AutoTokenizer
from google.colab import drive

def set_seed(seed=123):
    """Seed every random number generator this notebook uses.

    The same value is applied to PyTorch's default generator, Python's
    ``random`` module and NumPy's global generator. When CUDA is available,
    all CUDA devices are seeded as well.

    Args:
        seed: Integer seed applied to each generator.
    """
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once up front so the cells below start from a known RNG state.
set_seed(123)

# Mount Google Drive at /content/drive; the dataset and artifact paths used by
# the cells below all live under this mount point.
drive.mount('/content/drive')
print('Google Drive mounted.')

## 2. Load and Validate Datasets
- Load pretraining and fine-tuning datasets from Google Drive
- Validate schema and completeness
- Perform automated checks as per constitution.md

In [None]:
# Read the pretraining corpus (.txt); records are delimited by '[EOS]'
pretrain_path = '/content/drive/MyDrive/GourmetGPT/Dataset/structured_recipes_pretrain.txt'
with open(pretrain_path, 'r', encoding='utf-8') as pretrain_file:
    pretrain_data = []
    for chunk in pretrain_file.read().split('[EOS]'):
        cleaned = chunk.strip()
        if cleaned:
            # Whitespace-only fragments (e.g. after the last '[EOS]') are dropped
            pretrain_data.append(cleaned)
print(f'Loaded {len(pretrain_data)} pretraining recipes.')

# Spot-check the first three records for the required section markers
required_markers = ('[BOS]', 'Title:', 'Ingredients:', 'Instructions:')
for i, recipe in enumerate(pretrain_data[:3]):
    assert all(marker in recipe for marker in required_markers), f"Schema error in recipe {i}"
print('Pretraining dataset schema validated.')

# Load fine-tuning dataset (.jsonl): one JSON object per non-blank line
import json
finetune_path = '/content/drive/MyDrive/GourmetGPT/Dataset/structured_recipes_finetune.jsonl'
finetune_data = []
with open(finetune_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Blank or whitespace-only lines are not records; json.loads would
        # raise JSONDecodeError on them, so skip them instead.
        if not line.strip():
            continue
        finetune_data.append(json.loads(line))
print(f'Loaded {len(finetune_data)} fine-tuning samples.')

# Validate fine-tuning schema (spot-check of the first three samples)
response_markers = ('[BOS]', 'Title:', 'Ingredients:', 'Instructions:')
for i, sample in enumerate(finetune_data[:3]):
    # Each sample needs both keys before the response text can be inspected
    assert 'instruction' in sample and 'response' in sample, f"Schema error in sample {i}"
    assert all(marker in sample['response'] for marker in response_markers), f"Response schema error in sample {i}"
print('Fine-tuning dataset schema validated.')

## 3. Design and Implement GPT Architecture
- Define the GPT model architecture using PyTorch
- Configure tokenizer and model configuration parameters

In [None]:
# Define GPT model architecture (simplified, for demonstration)
import torch.nn as nn

class GPTModel(nn.Module):
    """Token-level language model built on ``nn.Transformer``.

    Token ids are embedded, run through the encoder-decoder stack with the
    same sequence used as both source and target, and projected to vocabulary
    logits. Causal masks are applied to every attention block so the logits
    at position ``t`` depend only on tokens at positions ``<= t``.

    NOTE(review): no explicit positional encoding is added to the embeddings.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output projection.
        embed_dim: Model width (``d_model``).
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            batch_first=True
        )
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits for a batch of token ids.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of logits, shape ``(batch, seq_len, vocab_size)``.
        """
        seq_len = x.size(1)
        # Boolean mask, True = "must not attend": everything strictly above
        # the diagonal, i.e. tokens that lie in the future.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        hidden = self.embedding(x)
        # The same mask is needed on encoder self-attention, decoder
        # self-attention and cross-attention; leaving any one unmasked would
        # leak future tokens into earlier positions.
        hidden = self.transformer(
            hidden,
            hidden,
            src_mask=causal_mask,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
        )
        return self.fc_out(hidden)

# Tokenizer setup (placeholder: pretrained GPT-2 vocabulary)
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token, but the training dataset pads every
# sample to a fixed length; reuse the end-of-text token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Example config
# The embedding table and output layer must cover every id the tokenizer can
# emit, so vocab_size is derived from the tokenizer rather than hard-coded
# (a smaller value makes the embedding lookup fail with an index error).
vocab_size = len(tokenizer)
embed_dim = 256
num_heads = 8
num_layers = 6
model = GPTModel(vocab_size, embed_dim, num_heads, num_layers)
print(model)

## 4. Train Model and Save Checkpoints
- Train the model on the pretraining dataset (the fine-tuning dataset loaded above is not used in this cell)
- Periodically save checkpoints to Google Drive
- Log training metrics

In [None]:
# Optimized training loop for A100 GPU with mixed precision
from torch.utils.data import DataLoader, Dataset
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model's parameters onto the selected device before training.
model = model.to(device)
class RecipeDataset(Dataset):
    """Dataset that tokenizes recipe strings on demand.

    Args:
        texts: Indexable collection of recipe strings.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            expected to return a mapping with an ``'input_ids'`` entry.
        max_length: Length passed to the tokenizer for truncation/padding.
    """

    def __init__(self, texts, tokenizer, max_length=256):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Return the number of recipe strings."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize recipe ``idx`` and return its ``input_ids`` without dim 0."""
        encoding_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **encoding_kwargs)
        input_ids = encoded['input_ids']
        return input_ids.squeeze(0)
train_dataset = RecipeDataset(pretrain_data, tokenizer)
use_cuda = device.type == 'cuda'
# Increase batch size for A100 (try 32, adjust if OOM)
# pin_memory only speeds up host-to-GPU copies, so enable it only on CUDA.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=use_cuda)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Mixed precision is CUDA-only; with enabled=False the scaler and autocast
# become no-ops, so the same loop also runs on the CPU.
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)
criterion = nn.CrossEntropyLoss()  # built once instead of once per batch
checkpoint_path = '/content/drive/MyDrive/GourmetGPT/model_state_dict.pth'
losses = []  # per-batch training loss, kept for later plotting/perplexity
model.train()
for epoch in range(1):  # Demo: 1 epoch
    for batch_idx, batch in enumerate(train_loader):
        batch = batch.to(device, non_blocking=use_cuda)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=use_cuda):
            output = model(batch)
            # Next-token objective: logits at position t are scored against
            # the token at t+1. Scoring them against token t would only teach
            # the model to copy its input.
            logits = output[:, :-1, :]
            targets = batch[:, 1:]
            # reshape (not view): the slices above are non-contiguous
            loss = criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        losses.append(loss.item())
        if batch_idx % 10 == 0:
            print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.4f}')
        # Save checkpoint every 100 batches (after the 100th, 200th, ... step,
        # not before any training has happened)
        if (batch_idx + 1) % 100 == 0:
            torch.save(model.state_dict(), checkpoint_path)
# Write the weights once more so the checkpoint reflects the last batch and
# the message below is accurate.
torch.save(model.state_dict(), checkpoint_path)
print('Training complete. Final checkpoint saved.')

## 5. Export Model Artifacts
- Export trained model state_dict, tokenizer, and config files as artifacts for deployment

In [None]:
# Destination paths for the exported artifacts
model_path = '/content/drive/MyDrive/GourmetGPT/model_state_dict.pth'
tokenizer_path = '/content/drive/MyDrive/GourmetGPT/tokenizer.json'
config_path = '/content/drive/MyDrive/GourmetGPT/config.json'

# Persist the model weights
torch.save(model.state_dict(), model_path)
print(f'Model weights saved to {model_path}')

# Persist the tokenizer: use its own save_pretrained() when it has one,
# otherwise fall back to dumping its attribute dictionary as JSON
if not hasattr(tokenizer, 'save_pretrained'):
    import json
    with open(tokenizer_path, 'w') as f:
        json.dump(tokenizer.__dict__, f)
    print(f'Tokenizer config saved to {tokenizer_path}')
else:
    tokenizer.save_pretrained('/content/drive/MyDrive/GourmetGPT/')
    print(f'Tokenizer saved to {tokenizer_path}')

# Persist the hyperparameters needed to rebuild the model
config = dict(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_layers=num_layers,
)
with open(config_path, 'w') as f:
    json.dump(config, f)
print(f'Model config saved to {config_path}')

## 6. Unit Tests for Model and Data
- Implement unit tests in notebook cells to verify model components and data integrity

In [None]:
# Unit test: Model output shape
# Create the sample on the model's own device; a CPU tensor fed to a model
# that was moved to the GPU for training raises a device-mismatch error.
model_device = next(model.parameters()).device
sample_input = torch.randint(0, vocab_size, (2, 256), device=model_device)  # batch_size=2, seq_len=256
with torch.no_grad():  # shape check only; no autograd graph is needed
    output = model(sample_input)
assert output.shape == (2, 256, vocab_size), f"Unexpected output shape: {output.shape}"
print('Model output shape test passed.')

# Unit test: Data integrity
assert all(isinstance(r, str) for r in pretrain_data), "Pretraining data not all strings."
assert all('instruction' in s and 'response' in s for s in finetune_data), "Fine-tuning data missing keys."
print('Data integrity tests passed.')

## 7. Quantize Model Weights
- Quantize the trained model weights for edge device deployment (target: 4-bit precision; the demo cell below only casts the weights to int8 as a placeholder)

In [None]:
# Quantize model weights to 4-bit (demo: use torch.int8 for illustration)
quantized_path = '/content/drive/MyDrive/GourmetGPT/model_state_dict_quantized.pth'
# Build the int8 tensors as a separate copy. Casting the live model's
# parameters in place would overwrite the float weights that the inference
# and metrics cells below still rely on.
parameter_names = {name for name, _ in model.named_parameters()}
quantized_state = {}
for name, tensor in model.state_dict().items():
    tensor = tensor.detach().cpu()
    if name in parameter_names:
        # For demo; use real quantization for production.
        # NOTE: a plain cast discards the fractional part of every weight, so
        # this file is a placeholder artifact, not a usable quantized model.
        tensor = tensor.to(torch.int8)
    quantized_state[name] = tensor

torch.save(quantized_state, quantized_path)
print(f'Quantized model weights saved to {quantized_path}')

## 8. Offline Inference Test
- Load exported artifacts and run offline recipe generation tests to validate inference on CPU

In [None]:
# Offline inference test (demo)
# Rebuild the model on the CPU from the exported config and weights, so this
# cell exercises the saved artifacts rather than the in-memory training model
# (which may live on the GPU or have been modified by earlier cells).
with open(config_path, 'r', encoding='utf-8') as f:
    exported_config = json.load(f)
# The exported keys (vocab_size, embed_dim, num_heads, num_layers) match the
# GPTModel constructor parameters one-to-one.
model = GPTModel(**exported_config)
model.load_state_dict(torch.load(model_path, map_location='cpu'))
model.eval()

def generate_recipe(prompt, tokenizer, model, max_length=256):
    """Greedily extend ``prompt`` with the model and return the decoded text.

    At each step the most likely next token is appended to the sequence.
    Generation stops when the sequence reaches ``max_length`` tokens or, if
    the tokenizer exposes ``eos_token_id``, right after that token is
    produced.

    Args:
        prompt: Text to condition the generation on.
        tokenizer: Callable as ``tokenizer(prompt, return_tensors='pt')``
            returning a mapping whose ``'input_ids'`` has shape
            ``(1, prompt_len)``; must also provide ``decode(ids)``.
        model: Callable mapping ``(1, seq_len)`` token ids to
            ``(1, seq_len, vocab)`` logits; must provide ``parameters()``.
        max_length: Maximum total number of tokens (prompt plus generated).

    Returns:
        The decoded string for the prompt followed by the generated tokens.
        A prompt that already has ``max_length`` tokens or more is returned
        decoded without any additions.
    """
    input_ids = tokenizer(prompt, return_tensors='pt')['input_ids']
    # Run on whatever device the model's weights live on (if it has any).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
    eos_token_id = getattr(tokenizer, 'eos_token_id', None)
    with torch.no_grad():
        while input_ids.size(1) < max_length:
            logits = model(input_ids)
            # Only the last position predicts the token that comes next.
            next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_id], dim=1)
            if eos_token_id is not None and next_id.item() == eos_token_id:
                break
    return tokenizer.decode(input_ids[0])

# Smoke-test the generation helper with one sample prompt and show the result.
sample_prompt = "Generate a vegetarian pizza recipe."
recipe = generate_recipe(sample_prompt, tokenizer, model)
print('Generated Recipe:', recipe)

## 9. Zip and Download Artifacts
- Create a zip archive of all model artifacts and provide a cell to download them for handover

In [None]:
# Bundle every exported artifact into one archive and offer it for download
import zipfile
zip_path = '/content/drive/MyDrive/GourmetGPT/gourmetgpt_artifacts.zip'
# (file on Drive, name inside the archive)
artifact_entries = (
    (model_path, 'model_state_dict.pth'),
    (tokenizer_path, 'tokenizer.json'),
    (config_path, 'config.json'),
    (quantized_path, 'model_state_dict_quantized.pth'),
)
with zipfile.ZipFile(zip_path, 'w') as zipf:
    for source_path, archive_name in artifact_entries:
        zipf.write(source_path, arcname=archive_name)
print(f'Artifacts zipped at {zip_path}')

from google.colab import files
files.download(zip_path)
print('Download initiated for model artifacts.')

In [None]:
# Count the elements of every parameter that has requires_grad set
total_params = 0
for p in model.parameters():
    if p.requires_grad:
        total_params += p.numel()
print(f"Total trainable parameters: {total_params:,}")

## 10. Training Metrics and Visualization for IEEE Reporting
- Track and visualize training loss and perplexity
- Example plots for inclusion in IEEE papers

In [None]:
# Track and visualize training loss and perplexity for IEEE reporting
import matplotlib.pyplot as plt
import math
# Reuse loss values recorded by an earlier cell. Unconditionally resetting the
# list here would discard them and make the plotting branch unreachable.
try:
    losses
except NameError:
    losses = []  # nothing has been recorded yet
# Example: Append loss.item() inside your training loop
# losses.append(loss.item())
if losses:
    plt.figure(figsize=(8,4))
    plt.plot(losses, label='Training Loss')
    plt.xlabel('Batch')
    plt.ylabel('Loss')
    plt.title('Training Loss Curve')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Perplexity is the exponential of the mean loss over all recorded batches
    avg_loss = sum(losses) / len(losses)
    perplexity = math.exp(avg_loss)
    print(f'Final Perplexity: {perplexity:.2f}')
else:
    print('No loss data collected. Please append loss values during training.')

## 11. Additional Metrics for Constitution Compliance
- Ingredient F1 Score
- Human Evaluation
- Model Footprint (parameters, memory)
- Latency (inference time)

In [None]:
# Ingredient F1 Score (example implementation)
def ingredient_f1(predicted, reference):
    """Return the F1 score between two comma-separated ingredient lists.

    Entries are compared case-insensitively with surrounding whitespace
    removed, so ``"egg, tomato"`` and ``"Tomato,egg"`` match exactly. Empty
    entries (for example from a trailing comma) are ignored and duplicates
    count once.

    Args:
        predicted: Comma-separated ingredients produced by the model.
        reference: Comma-separated ground-truth ingredients.

    Returns:
        F1 score in ``[0, 1]``; ``0.0`` when the lists share no ingredient,
        including when either list is empty.
    """
    def _to_set(text):
        # Strip each entry: without this, "egg" and " egg" would be treated
        # as different ingredients depending on their position in the list.
        return {item.strip() for item in text.lower().split(',') if item.strip()}

    pred_set = _to_set(predicted)
    ref_set = _to_set(reference)
    tp = len(pred_set & ref_set)
    if tp == 0:
        # Covers empty inputs too, and avoids dividing by zero below
        return 0.0
    precision = tp / len(pred_set)  # tp / (tp + fp)
    recall = tp / len(ref_set)      # tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)
# Worked example for the ingredient F1 metric
predicted_ingredients = "egg, spinach, tomato, cheese"
reference_ingredients = "egg, tomato, cheese, onion"
f1_score = ingredient_f1(predicted_ingredients, reference_ingredients)
print(f'Ingredient F1 Score: {f1_score:.2f}')

# Human evaluation: mean of manually entered ratings
human_scores = [4, 5, 3, 4]  # Example: ratings from 1-5
rating_total = sum(human_scores)
avg_human_score = rating_total / len(human_scores)
print(f'Average Human Evaluation Score: {avg_human_score:.2f}')

# Model footprint: trainable element count and bytes across all parameters
import torch
total_params = 0
model_size_bytes = 0
for p in model.parameters():
    model_size_bytes += p.nelement() * p.element_size()
    if p.requires_grad:
        total_params += p.numel()
model_size_mb = model_size_bytes / (1024 ** 2)
print(f'Total parameters: {total_params:,}')
print(f'Model size (MB): {model_size_mb:.2f}')

# Latency (inference time)
import time
prompt = "Healthy breakfast recipe"
# Put the input on the model's device; a CPU tensor fed to a GPU-resident
# model raises a device-mismatch error.
model_device = next(model.parameters()).device
on_cuda = model_device.type == 'cuda'
input_ids = tokenizer(prompt, return_tensors='pt')['input_ids'].to(model_device)
model.eval()  # disable dropout so the timing reflects inference behaviour
with torch.no_grad():
    model(input_ids)  # warm-up call so one-time setup cost is not measured
    if on_cuda:
        torch.cuda.synchronize()  # CUDA kernels run asynchronously
    # perf_counter is monotonic and higher resolution than time.time()
    start_time = time.perf_counter()
    output = model(input_ids)
    if on_cuda:
        torch.cuda.synchronize()  # wait for the forward pass to finish
latency = time.perf_counter() - start_time
print(f'Inference latency: {latency*1000:.2f} ms')