In [None]:
# Tokenizer setup: load tokenizer and ensure [PAD] token exists
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Guarantee a literal '[PAD]' padding token. Nothing is changed only when the
# tokenizer already has *some* pad token AND '[PAD]' is in its vocabulary;
# in every other case '[PAD]' is registered and selected as the pad token.
if tokenizer.pad_token is not None and '[PAD]' in tokenizer.get_vocab():
    print("[PAD] token already present in tokenizer.")
else:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token = '[PAD]'
    print("Added and set [PAD] token as tokenizer.pad_token.")

# Measured after the pad-token step above; exposed for downstream model creation.
vocab_size = len(tokenizer)
print(f'Tokenizer vocab size: {vocab_size}')

In [None]:
# Utility helpers to create tensors on the same device as the model
import torch
def _resolve_model_device():
    """Return the device shared by both tensor helpers in this cell.

    Resolution order:
      1. The device of the first parameter of the global `model`.
      2. If that lookup fails for any reason (e.g. `model` is not defined yet
         or has no parameters), the global `device` -- but only when it is a
         CUDA device.

    Raises:
        RuntimeError: if the model lookup fails and no CUDA global `device`
            is available.
    """
    try:
        return next(model.parameters()).device
    except Exception:
        # Require global `device` (must be CUDA). Fail hard if CUDA not available.
        # getattr keeps a malformed `device` global (e.g. a plain string or None)
        # on the intended RuntimeError path instead of raising AttributeError.
        fallback = globals().get('device')
        if getattr(fallback, 'type', None) == 'cuda':
            return fallback
        raise RuntimeError("CUDA device not available. Please enable GPU runtime in Colab (Runtime -> Change runtime type -> GPU) before running this notebook.")


def to_model_device(tensor):
    """Move a tensor to the device where the model parameters live.

    Args:
        tensor: Object exposing `.to(device)` (typically a `torch.Tensor`).

    Returns:
        The result of `tensor.to(device)`.

    Raises:
        RuntimeError: if no model device and no CUDA global `device` can be found.
    """
    return tensor.to(_resolve_model_device())


def randint_on_model_device(low, high, size, dtype=torch.long):
    """Create a random integer tensor on the model device.

    Args:
        low: Lower bound passed to `torch.randint`.
        high: Upper bound passed to `torch.randint`.
        size: Shape of the tensor to create.
        dtype: Tensor dtype (default `torch.long`).

    Returns:
        The tensor produced by `torch.randint(low, high, size, ...)` on the
        resolved device.

    Raises:
        RuntimeError: if no model device and no CUDA global `device` can be found.
    """
    return torch.randint(low, high, size, dtype=dtype, device=_resolve_model_device())


print('Model device helpers installed (to_model_device, randint_on_model_device)')

In [None]:
# Duplicate pad-token check removed; consolidated at the top of the notebook.

# GourmetGPT: Recipe Generation Model Development

This notebook implements, trains, evaluates, and exports the GourmetGPT model in accordance with the project constitution and specification.

---

## 1. Import Dependencies and Set Up Environment
- Install and import required libraries (PyTorch, transformers, etc.)
- Set random seeds for reproducibility
- Configure Google Drive integration for artifact storage

In [None]:
# Install and import required libraries
!pip install torch transformers --quiet
import torch
import random
import numpy as np
from transformers import AutoTokenizer
from google.colab import drive

def set_seed(seed=123):
    """Seed the PyTorch, Python `random` and NumPy generators with one value.

    Args:
        seed: Seed applied to every generator (default 123).

    When CUDA is available the seed is additionally applied via
    `torch.cuda.manual_seed_all`.
    """
    # Same order as always: torch, then random, then numpy.
    for seeder in (torch.manual_seed, random.seed, np.random.seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Seed all RNGs once, before any later cell draws random numbers.
set_seed(123)

# This notebook is GPU-only. `device` is always assigned, but execution stops
# here unless it is CUDA (Colab: Runtime -> Change runtime type -> GPU).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if device.type == 'cuda':
    print('Using device:', device)
else:
    raise RuntimeError("CUDA device not available. Please enable GPU runtime in Colab: Runtime -> Change runtime type -> GPU.")

# Datasets used by later cells are read from paths under /content/drive.
drive.mount('/content/drive')
print('Google Drive mounted.')

## 2. Load and Validate Datasets
- Load pretraining dataset from Google Drive (`[BOS]... [EOS]` format)
- Load fine-tuning dataset from Google Drive (no `[BOS]`/`[EOS]` tokens in `response`)
- Validate schema and completeness (the cell below spot-checks the first three records of each dataset)
- Perform automated checks as per constitution.md

In [None]:
# Load pretraining dataset (.txt): the file is read whole and cut into recipes
# at every '[EOS]' marker; surrounding whitespace and empty pieces are discarded.
pretrain_path = '/content/drive/MyDrive/PhD/GourmetGPT/Dataset/structured_recipes_pretrain.txt'
with open(pretrain_path, encoding='utf-8') as f:
    pretrain_data = [chunk.strip() for chunk in f.read().split('[EOS]')]
pretrain_data = list(filter(None, pretrain_data))
print(f'Loaded {len(pretrain_data)} pretraining recipes.')

# Validate pretraining schema: spot-check that the first three recipes contain
# every required marker.
for i, recipe in enumerate(pretrain_data[:3]):
    assert all(
        marker in recipe
        for marker in ('[BOS]', 'Title:', 'Ingredients:', 'Instructions:')
    ), f"Schema error in recipe {i}"
print('Pretraining dataset schema validated.')

# Load fine-tuning dataset (.jsonl): one JSON object per non-blank line.
import json
finetune_path = '/content/drive/MyDrive/PhD/GourmetGPT/Dataset/structured_recipes_finetune.jsonl'
finetune_data = []
with open(finetune_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Blank lines (e.g. an extra newline at the end of the file) are not
        # valid JSON and would make json.loads raise JSONDecodeError; skip them.
        if not line.strip():
            continue
        obj = json.loads(line)
        finetune_data.append(obj)
print(f'Loaded {len(finetune_data)} fine-tuning samples.')

# Validate fine-tuning schema (no [BOS]/[EOS] required).
# Spot-check of the first three samples: both keys must exist and the response
# must contain the three section headers.
for i, sample in enumerate(finetune_data[:3]):
    assert 'instruction' in sample and 'response' in sample, f"Schema error in sample {i}"
    assert 'Title:' in sample['response'] and 'Ingredients:' in sample['response'] and 'Instructions:' in sample['response'], f"Response schema error in sample {i}"
print('Fine-tuning dataset schema validated.')

In [None]:
# Ingredient F1 Score (example implementation)
def ingredient_f1(predicted, reference):
    """Set-based F1 score between two comma-separated ingredient strings.

    Both strings are lower-cased and split on commas. Each item is stripped of
    surrounding whitespace and empty items are dropped, so ordering and spacing
    around commas do not affect the score ("egg, tomato" matches "tomato,egg").

    Args:
        predicted: Comma-separated predicted ingredients.
        reference: Comma-separated reference ingredients.

    Returns:
        The F1 score as a number in [0, 1]. Returns 0.0 when the two sets share
        no item, including when either string contains no ingredients.
    """
    def _as_item_set(text):
        # Strip each item; without this " tomato" and "tomato" would differ.
        return {item.strip() for item in text.lower().split(',') if item.strip()}

    pred_set = _as_item_set(predicted)
    ref_set = _as_item_set(reference)
    tp = len(pred_set & ref_set)
    fp = len(pred_set - ref_set)
    fn = len(ref_set - pred_set)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return f1
# Example inputs for the ingredient F1 metric defined above.
predicted_ingredients, reference_ingredients = (
    "egg, spinach, tomato, cheese",
    "egg, tomato, cheese, onion",
)
f1_score = ingredient_f1(predicted=predicted_ingredients, reference=reference_ingredients)
print(f'Ingredient F1 Score: {f1_score:.2f}')

# Human Evaluation (manual entry example)
# Ratings are entered by hand on a 1-5 scale and summarised by their mean.
human_scores = [4, 5, 3, 4]
avg_human_score = sum(human_scores) / len(human_scores)
print(f'Average Human Evaluation Score: {avg_human_score:.2f}')

# Model Footprint (parameters, memory)
import torch

# Parameter count: only tensors with requires_grad=True are included.
total_params = sum(
    p.numel()
    for p in model.parameters()
    if p.requires_grad
)
# Size: every parameter (trainable or not) contributes
# element count * bytes per element; the total is converted from bytes to MiB.
model_size_mb = sum(p.nelement() * p.element_size() for p in model.parameters()) / (1024 * 1024)
print(f'Total parameters: {total_params:,}')
print(f'Model size (MB): {model_size_mb:.2f}')

# Latency (inference time) - robust GPU timing
import time
import numpy as np

# Ensure a CUDA device is available (not changing notebook policy)
if 'device' not in globals() or globals()['device'].type != 'cuda':
    raise RuntimeError("CUDA device not available. Please enable GPU runtime in Colab (Runtime -> Change runtime type -> GPU).")
device = globals()['device']

# Ensure model is on device and in eval mode
model.to(device)
model.eval()

# Configurable measurement parameters
prompt = "Generate a vegetarian pizza recipe."  # set a sensible default or pass in a variable
batch_size = 1
seq_len = 64             # representative sequence length
repeat = 20              # measured iterations
warmup = 5               # warm-up iterations to stabilize GPU

# Tokenize and move inputs to device.
# The prompt is padded/truncated to exactly `seq_len` tokens so every run
# processes the same input shape, then tiled to `batch_size` rows.
inputs = tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=seq_len, add_special_tokens=False)
input_ids = inputs['input_ids'].to(device, dtype=torch.long).repeat(batch_size, 1)

# Warm-up runs (no timing)
with torch.no_grad():
    for _ in range(warmup):
        _ = model(input_ids)

# Measured runs with CUDA synchronization for accurate GPU timing.
# time.perf_counter() is used instead of time.time(): it is the high-resolution
# clock intended for measuring short durations and is unaffected by
# system clock adjustments.
latencies = []
with torch.no_grad():
    for _ in range(repeat):
        torch.cuda.synchronize()   # drain pending GPU work before starting the clock
        t0 = time.perf_counter()
        _ = model(input_ids)
        torch.cuda.synchronize()   # wait for the forward pass to actually finish
        t1 = time.perf_counter()
        latencies.append(t1 - t0)

latencies = np.array(latencies)
median_s = float(np.median(latencies))   # seconds per forward pass (whole batch)
mean_s = float(np.mean(latencies))       # seconds per forward pass (whole batch)
# Each timed forward pass handles `batch_size` sequences, so divide by it to get
# a genuine per-sequence figure (unchanged for the default batch_size of 1).
per_sequence_ms = median_s * 1000.0 / batch_size
per_token_ms = per_sequence_ms / seq_len

print(f"Latency (median over {repeat} runs, after {warmup} warm-up): {per_sequence_ms:.2f} ms per sequence")
print(f"Per-token (median): {per_token_ms:.3f} ms (seq_len={seq_len}, batch_size={batch_size})")
print(f"Mean latency: {mean_s*1000.0:.2f} ms; runs: {repeat}; warmup: {warmup}")

# Return results programmatically if needed
latency_results = {
    'median_s': median_s,
    'mean_s': mean_s,
    'per_sequence_ms': per_sequence_ms,
    'per_token_ms': per_token_ms,
    'seq_len': seq_len,
    'batch_size': batch_size,
    'repeat': repeat,
    'warmup': warmup,
}
