In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers rouge-score nltk evaluate -q

print("‚úì Libraries installed successfully!")

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from rouge_score import rouge_scorer
import re
from tqdm import tqdm
import warnings
import nltk

# Download NLTK data
nltk.download('punkt', quiet=True)

# Silence library warnings so training/generation output stays readable.
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch RNGs so DataLoader shuffling, sampled text
# generation and np.random-based sampling are repeatable across
# "Restart & Run All". 42 matches the random_state used when sub-sampling.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

In [None]:
# Read the recipe CSV from the Kaggle input mount into the raw frame `df`.
csv_path = '/kaggle/input/3a2m-cooking-recipe-dataset/3A2M_EXTENDED.csv'
df = pd.read_csv(csv_path)

# Structural overview: dimensions, column names and a three-row preview.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:", df.head(3), sep="\n")

# Per-column count of missing cells.
print("\nMissing values:", df.isna().sum(), sep="\n")

In [None]:
# Clean the data
print("Cleaning data...")
df_clean = df.copy()

# Use Extended_NER as ingredients (it has more detail)
df_clean['ingredients'] = df_clean['Extended_NER']

# Drop rows missing any field the training text is built from.
# (Missing ingredients were previously removed only as a side effect of the
# length filter below; listing the column here makes that explicit.)
df_clean = df_clean.dropna(subset=['title', 'directions', 'ingredients'])

# Clean title (remove surrounding whitespace such as \t)
df_clean['title'] = df_clean['title'].str.strip()

# Filter out very short entries
long_enough = (
    (df_clean['ingredients'].str.len() > 10)
    & (df_clean['directions'].str.len() > 20)
)
df_clean = df_clean[long_enough]

# Reset index
df_clean = df_clean.reset_index(drop=True)

# Remember the cleaned size before any sub-sampling so the speed-up estimate
# below compares like with like.
cleaned_size = len(df_clean)

print(f"\nOriginal dataset size: {len(df)}")
print(f"Cleaned dataset size: {cleaned_size}")

# ============================================================================
# SPEED UP OPTION: Sample a subset for faster training
# ============================================================================
# RECOMMENDED: Use 50,000-100,000 samples for faster training with good results
# For even faster experimentation, use 10,000-20,000 samples

SAMPLE_SIZE = 50000  # Change this number based on your needs
# Options: 10000 (very fast), 50000 (balanced), 100000 (slower but better), None (full dataset - very slow)

if SAMPLE_SIZE and SAMPLE_SIZE < cleaned_size:
    print(f"\n‚ö° SPEED UP: Using {SAMPLE_SIZE:,} samples instead of full dataset")
    df_clean = df_clean.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)
    # Ratio of rows that would have been trained on to rows actually used.
    # The original message divided the *raw* row count and floored the result.
    print(f"This will train {cleaned_size / SAMPLE_SIZE:.1f}x faster!")
else:
    print(f"\n‚ö†Ô∏è Using full dataset ({len(df_clean):,} samples) - This will take many hours!")

print(f"Final dataset size: {len(df_clean):,}")

# Show examples
print("\n" + "="*80)
print("SAMPLE RECIPE:")
print("="*80)
sample = df_clean.iloc[0]
print(f"Title: {sample['title']}")
print(f"\nIngredients: {sample['ingredients'][:200]}...")
print(f"\nDirections: {sample['directions'][:200]}...")
print(f"\nGenre: {sample['genre']}")

In [None]:
class RecipeDataset(Dataset):
    """Causal-LM dataset that renders each recipe row as one training string.

    Every usable row becomes
    ``[INGREDIENTS] <ingredients> [RECIPE] <title>: <directions><|endoftext|>``.
    Tokenisation happens lazily in ``__getitem__``; ``__init__`` only builds
    the list of formatted strings.
    """

    def __init__(self, data, tokenizer, max_length=512):
        """Build the formatted examples.

        Args:
            data: DataFrame whose rows provide 'ingredients', 'title' and
                'directions' (missing keys fall back to an empty string).
            tokenizer: callable tokenizer used later by ``__getitem__``.
            max_length: length every example is truncated/padded to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []

        print(f"Processing {len(data)} recipes...")
        for _, row in tqdm(data.iterrows(), total=len(data)):
            # Format: [INGREDIENTS] ingredients [RECIPE] title: steps
            ingredients = str(row.get('ingredients', '')).strip()
            title = str(row.get('title', '')).strip()
            directions = str(row.get('directions', '')).strip()

            # Ingredients stored as the string form of a list: drop the
            # surrounding brackets and every quote character.
            if ingredients.startswith('[') and ingredients.endswith(']'):
                ingredients = ingredients.strip('[]').replace("'", "").replace('"', '')

            # Directions stored as the string form of a list: join the steps
            # with spaces, then drop brackets and remaining quotes.
            if directions.startswith('[') and directions.endswith(']'):
                directions = directions.strip('[]').replace('", "', ' ').replace('","', ' ').replace('"', '').replace("'", "")

            # Skip rows where any of the three fields ended up empty.
            if ingredients and title and directions:
                formatted_text = f"[INGREDIENTS] {ingredients} [RECIPE] {title}: {directions}<|endoftext|>"
                self.examples.append(formatted_text)

        print(f"Created {len(self.examples)} training examples")

    def __len__(self):
        """Return the number of formatted examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenise example ``idx`` and return model-ready tensors.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'labels', each a 1-D
            tensor of length ``max_length``. Label positions that are padding
            (attention_mask == 0) are set to -100 so the loss ignores them.
        """
        text = self.examples[idx]
        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        input_ids = encodings['input_ids'].squeeze(0)
        attention_mask = encodings['attention_mask'].squeeze(0)

        # Mask padding out of the loss. Without this the model is trained (and
        # its loss averaged) over every padding position. The mask, not the
        # token id, is used so a genuine end-of-text token inside the sequence
        # is still learned even when the pad token shares its id.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

print("‚úì RecipeDataset class defined")

In [None]:
print("Loading GPT-2 model and tokenizer...")

# Tokenizer: load the pretrained 'gpt2' vocabulary and reuse its
# end-of-text token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Model: load the pretrained 'gpt2' weights and move them to the selected
# device in a single expression.
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

print(f"‚úì Model loaded with {model.num_parameters():,} parameters")
print(f"‚úì Tokenizer vocab size: {len(tokenizer)}")

In [None]:
# Split data
train_split = 0.9

# Shuffle before slicing. df_clean is only in random order when the
# sub-sampling branch ran; with SAMPLE_SIZE disabled the rows keep their CSV
# order, and a plain head/tail split would then hand the validation set
# whatever happens to sit at the end of the file.
shuffled_df = df_clean.sample(frac=1.0, random_state=42).reset_index(drop=True)
train_size = int(len(shuffled_df) * train_split)

train_df = shuffled_df.iloc[:train_size].reset_index(drop=True)
val_df = shuffled_df.iloc[train_size:].reset_index(drop=True)

print(f"Train samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

# Create datasets
print("\nCreating training dataset...")
train_dataset = RecipeDataset(train_df, tokenizer, max_length=512)

print("\nCreating validation dataset...")
val_dataset = RecipeDataset(val_df, tokenizer, max_length=512)

# Create dataloaders (only the training loader is shuffled each epoch)
batch_size = 4  # Adjust based on GPU memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f"\n‚úì Training batches: {len(train_loader)}")
print(f"‚úì Validation batches: {len(val_loader)}")


In [None]:
epochs = 3
learning_rate = 5e-5
warmup_steps = 500
max_grad_norm = 1.0

# SPEED UP OPTIONS:
# 1. Increase the batch size where the DataLoaders are created (4 -> 8 or 16)
#    if you have GPU memory
# 2. Reduce epochs (3 -> 2 or even 1 for quick testing)
# 3. Use gradient accumulation for effective larger batch size

# Report the batch size the loaders were actually built with. Re-assigning
# batch_size here (as this cell used to) has no effect on the existing
# DataLoaders and made the printed configuration disagree with what trains.
batch_size = train_loader.batch_size
gradient_accumulation_steps = 2  # Effective batch size = batch_size * this

print("="*80)
print("TRAINING CONFIGURATION")
print("="*80)
print(f"Dataset size: {len(train_dataset):,} samples")
print(f"Batch size: {batch_size}")
print(f"Gradient accumulation: {gradient_accumulation_steps}")
print(f"Effective batch size: {batch_size * gradient_accumulation_steps}")
print(f"Epochs: {epochs}")
print(f"Learning rate: {learning_rate}")
print(f"Device: {device}")

# Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning rate scheduler: one scheduler step per optimizer step, i.e. per
# full accumulation window, across all epochs.
total_steps = (len(train_loader) // gradient_accumulation_steps) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print(f"Total training steps: {total_steps:,}")
print(f"Warmup steps: {warmup_steps}")

# Estimate training time
samples_per_sec = 5 if device == 'cuda' else 0.5  # Rough estimates
total_time_mins = (len(train_dataset) * epochs) / (samples_per_sec * 60)
print(f"\n‚è±Ô∏è Estimated training time: {total_time_mins:.1f} minutes ({total_time_mins/60:.1f} hours)")
print("="*80)

In [None]:
def train_epoch(model, train_loader, optimizer, scheduler, device,
                gradient_accumulation_steps=1, max_grad_norm=1.0):
    """Train for one epoch with gradient accumulation.

    Args:
        model: model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        train_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors; must support ``len()``.
        optimizer: optimizer stepped once per accumulation window.
        scheduler: LR scheduler stepped together with the optimizer.
        device: device the batch tensors are moved to.
        gradient_accumulation_steps: number of batches whose gradients are
            accumulated before each optimizer step.
        max_grad_norm: gradient-norm clipping threshold. Passed explicitly
            instead of being read from a notebook global; the default equals
            the value the notebook configures.

    Returns:
        Mean of the per-batch (unscaled) losses over the epoch.
    """
    model.train()
    total_loss = 0
    num_batches = len(train_loader)
    optimizer.zero_grad()

    progress_bar = tqdm(train_loader, desc="Training")
    for idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        # Track the true batch loss; back-propagate the scaled one so the
        # accumulated gradient approximates an average over the window.
        batch_loss = outputs.loss.item()
        total_loss += batch_loss
        (outputs.loss / gradient_accumulation_steps).backward()

        # Step at the end of every accumulation window AND on the final batch.
        # Without the second condition a trailing partial window is never
        # applied: its gradients are discarded by the zero_grad() that opens
        # the next epoch. (This can add one scheduler step per epoch beyond a
        # floor-based step count.)
        window_complete = (idx + 1) % gradient_accumulation_steps == 0
        last_batch = (idx + 1) == num_batches
        if window_complete or last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    avg_loss = total_loss / len(train_loader)
    return avg_loss

def validate(model, val_loader, device):
    """Return the mean per-batch loss of `model` over `val_loader`.

    The model is switched to eval mode and every batch is evaluated under
    `torch.no_grad()`; the batch's 'input_ids', 'attention_mask' and 'labels'
    tensors are moved to `device` before the forward pass.
    """
    model.eval()
    tensor_keys = ('input_ids', 'attention_mask', 'labels')
    batch_losses = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            on_device = {key: batch[key].to(device) for key in tensor_keys}
            batch_losses.append(model(**on_device).loss.item())

    return sum(batch_losses) / len(val_loader)

print("‚úì Training functions defined")

In [None]:
# Lowest validation loss seen so far, and one stats record per epoch.
best_val_loss = float('inf')
training_stats = []

print("\n" + "="*70, "STARTING TRAINING", "="*70, sep="\n")

for epoch in range(epochs):
    print("\n" + "="*70)
    print(f"Epoch {epoch + 1}/{epochs}")
    print("="*70)

    # One pass over the training data, then one over the validation data.
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, gradient_accumulation_steps)
    val_loss = validate(model, val_loader, device)

    print(f"\nüìä Epoch {epoch + 1} Results:")
    print(f"   Average train loss: {train_loss:.4f}")
    print(f"   Average val loss: {val_loss:.4f}")

    training_stats.append(dict(epoch=epoch + 1, train_loss=train_loss, val_loss=val_loss))

    # Checkpoint model and tokenizer whenever validation loss improves.
    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        for artifact in (model, tokenizer):
            artifact.save_pretrained('./best_recipe_model')
        print(f"   ‚úì Saved best model (val_loss: {val_loss:.4f})")

print("\n" + "="*70, "TRAINING COMPLETE!", "="*70, sep="\n")
print(f"Best validation loss: {best_val_loss:.4f}")

# Tabulate the per-epoch losses.
stats_df = pd.DataFrame(training_stats)
print("\n", stats_df)

In [None]:
class RecipeGenerator:
    """Generate recipe text from a language model using the
    ``[INGREDIENTS] ... [RECIPE] <title>:`` prompt format."""

    def __init__(self, model, tokenizer, device):
        """Store the model, tokenizer and device and put the model in eval mode."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.model.eval()

    def generate_from_ingredients(self, ingredients, title="", max_length=300,
                                  temperature=0.8, top_p=0.9, num_return=1):
        """Generate recipe text from an ingredient string and an optional title.

        Args:
            ingredients: ingredient text placed after ``[INGREDIENTS]``.
            title: optional recipe title; when given the prompt ends in
                ``"<title>:"``.
            max_length: total sequence length limit passed to ``generate``
                (prompt tokens count towards it).
            temperature, top_p: sampling parameters.
            num_return: number of sequences to sample.

        Returns:
            A single string when ``num_return`` is 1 (or less), otherwise a
            list of strings.
        """
        if title:
            prompt = f"[INGREDIENTS] {ingredients} [RECIPE] {title}:"
        else:
            prompt = f"[INGREDIENTS] {ingredients} [RECIPE]"

        recipes = self._sample(prompt, max_length, temperature, top_p, num_return)
        return recipes if num_return > 1 else recipes[0]

    def generate_from_title(self, title, max_length=300, temperature=0.8, top_p=0.9):
        """Generate recipe text from a title only and return it as a string."""
        prompt = f"[RECIPE] {title}:"
        return self._sample(prompt, max_length, temperature, top_p, 1)[0]

    def _sample(self, prompt, max_length, temperature, top_p, num_return):
        """Tokenise ``prompt``, sample ``num_return`` continuations and return
        them as a list of formatted strings."""
        # Call the tokenizer (rather than .encode) to also obtain the attention
        # mask; it is passed to generate() explicitly because the pad token id
        # equals the EOS id here, so the mask cannot be inferred from token ids.
        encoded = self.tokenizer(prompt, return_tensors='pt')
        input_ids = encoded['input_ids'].to(self.device)
        attention_mask = encoded['attention_mask'].to(self.device)

        with torch.no_grad():
            output = self.model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_return_sequences=num_return,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                no_repeat_ngram_size=2
            )

        return [
            self._format_output(self.tokenizer.decode(seq, skip_special_tokens=True))
            for seq in output
        ]

    def _format_output(self, text):
        """Return the text between the first and second ``[RECIPE]`` marker
        (or to the end when there is only one); text without a marker is
        returned unchanged."""
        if '[RECIPE]' in text:
            parts = text.split('[RECIPE]')
            recipe_part = parts[1].strip()
            return recipe_part
        return text

print("‚úì RecipeGenerator class defined")

# Wrap the current model, tokenizer and device in a generator object.
generator = RecipeGenerator(model=model, tokenizer=tokenizer, device=device)

In [None]:
def _show_example(heading, generate, ingredients=None, title=None):
    """Print one example section, then run `generate()` and print its result.

    The prompt inputs are echoed before generation starts, so the section
    header is visible while the model is sampling. Returns the generated text.
    """
    print("\n" + "‚îÄ"*80)
    print(heading)
    print("‚îÄ"*80)
    if ingredients is not None:
        print(f"ü•ò Ingredients: {ingredients}")
    if title is not None:
        print(f"üìù Title: {title}")
    print("\nüìñ Generated Recipe:")
    recipe = generate()
    print(recipe)
    return recipe


print("="*80, "EXAMPLE RECIPE GENERATIONS", "="*80, sep="\n")

# Example 1: From ingredients with title
ingredients_1 = "chicken breast, garlic, olive oil, lemon juice, rosemary, salt, pepper"
title_1 = "Roasted Lemon Chicken"
recipe_1 = _show_example(
    "Example 1: Generate from ingredients + title",
    lambda: generator.generate_from_ingredients(ingredients_1, title_1, temperature=0.8),
    ingredients=ingredients_1,
    title=title_1,
)

# Example 2: From ingredients without title
ingredients_2 = "flour, butter, sugar, eggs, vanilla extract, baking powder, milk"
recipe_2 = _show_example(
    "Example 2: Generate from ingredients only",
    lambda: generator.generate_from_ingredients(ingredients_2, temperature=0.9),
    ingredients=ingredients_2,
)

# Example 3: From title only
title_3 = "Chocolate Chip Cookies"
recipe_3 = _show_example(
    "Example 3: Generate from title only",
    lambda: generator.generate_from_title(title_3, temperature=0.8),
    title=title_3,
)

# Example 4: Creative generation
ingredients_4 = "salmon, honey, soy sauce, ginger"
title_4 = "Honey Glazed Salmon"
recipe_4 = _show_example(
    "Example 4: Creative recipe with higher temperature",
    lambda: generator.generate_from_ingredients(ingredients_4, title_4, temperature=1.0),
    ingredients=ingredients_4,
    title=title_4,
)


In [None]:
def calculate_rouge_scores(predictions, references):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    `predictions` and `references` are paired positionally; each pair is
    scored with a stemming `RougeScorer` and the F-measures are averaged per
    metric. The result is a dict keyed 'rouge1', 'rouge2' and 'rougeL'.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # One score bundle per (prediction, reference) pair; reference goes first.
    pair_scores = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]

    return {
        name: np.mean([scores[name].fmeasure for scores in pair_scores])
        for name in metric_names
    }

def calculate_bleu_score(predictions, references):
    """Return the mean sentence-level BLEU of `predictions` against `references`.

    Texts are tokenised by whitespace, each prediction is compared with its
    single positional reference, and smoothing method 4 is applied.
    """
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    smoothing = SmoothingFunction().method4

    per_pair = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        for pred, ref in zip(predictions, references)
    ]
    return np.mean(per_pair)

print("‚úì Evaluation functions defined")



print("="*80)
print("QUALITY EVALUATION")
print("="*80)

# Sample validation examples for evaluation
num_eval_samples = 20
eval_samples = val_df.head(num_eval_samples)
# val_df may hold fewer rows than requested; report the number really used.
num_eval_samples = len(eval_samples)

generated_recipes = []
reference_recipes = []

print(f"\nGenerating {num_eval_samples} recipes for evaluation...")

for idx, row in tqdm(eval_samples.iterrows(), total=num_eval_samples):
    ingredients = str(row['ingredients']).strip()
    title = str(row['title']).strip()
    reference = str(row['directions']).strip()

    # Apply the same list-markup clean-up RecipeDataset applies when building
    # training text, so the prompt matches what the model was trained on and
    # the reference is plain prose rather than a quoted, bracketed list.
    if ingredients.startswith('[') and ingredients.endswith(']'):
        ingredients = ingredients.strip('[]').replace("'", "").replace('"', '')
    if reference.startswith('[') and reference.endswith(']'):
        reference = reference.strip('[]').replace('", "', ' ').replace('","', ' ').replace('"', '').replace("'", "")

    # Generate recipe
    generated = generator.generate_from_ingredients(
        ingredients,
        title,
        temperature=0.8,
        max_length=300
    )

    # The generator returns everything after [RECIPE], i.e. "<title>: <steps>".
    # Drop the echoed title so only the steps are scored against the
    # directions; if the prefix is not found the text is left untouched.
    title_prefix = f"{title}:"
    if generated.startswith(title_prefix):
        generated = generated[len(title_prefix):].strip()

    generated_recipes.append(generated)
    reference_recipes.append(reference)

# Calculate metrics
print("\nCalculating ROUGE scores...")
rouge_scores = calculate_rouge_scores(generated_recipes, reference_recipes)

print("\nCalculating BLEU score...")
bleu_score = calculate_bleu_score(generated_recipes, reference_recipes)

# Display results
print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80)
print(f"\nüìä ROUGE Scores:")
print(f"   ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"   ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"   ROUGE-L: {rouge_scores['rougeL']:.4f}")
print(f"\nüìä BLEU Score: {bleu_score:.4f}")

# Show a comparison example
print("\n" + "="*80)
print("EXAMPLE COMPARISON")
print("="*80)
idx = 0
# Guarded so an empty validation frame does not raise an IndexError here.
if generated_recipes:
    print(f"\nüìù Title: {eval_samples.iloc[idx]['title']}")
    print(f"\nü•ò Ingredients: {eval_samples.iloc[idx]['ingredients'][:150]}...")
    print(f"\n‚úÖ Reference Recipe:\n{reference_recipes[idx][:300]}...")
    print(f"\nü§ñ Generated Recipe:\n{generated_recipes[idx][:300]}...")



def human_evaluation_prompt(generated_recipes, num_samples=5):
    """Print a random selection of generated recipes with blank rating fields.

    Args:
        generated_recipes: sequence of generated recipe strings.
        num_samples: maximum number of recipes to display; fewer are shown
            when fewer recipes are available.

    The selection is drawn without replacement via ``np.random.choice``.
    Nothing is returned; all output goes to stdout.
    """
    print("="*80)
    print("HUMAN EVALUATION")
    print("="*80)
    print("\nPlease rate these generated recipes on a scale of 1-5 for:")
    print("  1. Coherence: Does the recipe make logical sense?")
    print("  2. Creativity: Is the recipe interesting and creative?")
    print("  3. Completeness: Does it include all necessary steps?")
    print("="*80)

    # Number of recipes that will really be shown; used for both the draw and
    # the "i/N" counter so the counter is right when fewer recipes exist than
    # were requested.
    shown = max(0, min(num_samples, len(generated_recipes)))
    if shown:
        samples = np.random.choice(len(generated_recipes), shown, replace=False)
    else:
        samples = []

    for i, idx in enumerate(samples):
        print(f"\n{'‚îÄ'*80}")
        print(f"Recipe {i+1}/{shown}")
        print(f"{'‚îÄ'*80}")
        print(generated_recipes[idx])
        print("\nCoherence (1-5): __")
        print("Creativity (1-5): __")
        print("Completeness (1-5): __")

# Show three of the generated recipes with blank rating fields.
human_evaluation_prompt(generated_recipes, num_samples=3)


# Persist the in-memory model and its tokenizer side by side.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./final_recipe_model')

print("="*80, "PROJECT SUMMARY", "="*80, sep="\n")

# Recap text assembled from the values computed by the cells above.
summary = f"""
‚úì Model: GPT-2 fine-tuned for recipe generation
‚úì Training samples: {len(train_dataset):,}
‚úì Validation samples: {len(val_dataset):,}
‚úì Training epochs: {epochs}
‚úì Best validation loss: {best_val_loss:.4f}

üìä Evaluation Metrics (on {num_eval_samples} samples):
   ‚Ä¢ ROUGE-1: {rouge_scores['rouge1']:.4f}
   ‚Ä¢ ROUGE-2: {rouge_scores['rouge2']:.4f}
   ‚Ä¢ ROUGE-L: {rouge_scores['rougeL']:.4f}
   ‚Ä¢ BLEU: {bleu_score:.4f}

üíæ Saved Models:
   ‚Ä¢ ./best_recipe_model/ (best validation loss)
   ‚Ä¢ ./final_recipe_model/ (final model)

‚úÖ Deliverables Completed:
   1. ‚úì Tokenization and dataset formatting script
   2. ‚úì Training loop for GPT-2
   3. ‚úì Example generations (4 examples shown)
   4. ‚úì Quality evaluation (ROUGE, BLEU metrics)
   5. ‚úì Human evaluation framework
"""

print(summary)

print("\n" + "="*80, "üéâ ALL TASKS COMPLETE!", "="*80, sep="\n")