In [1]:
# --- STEP 1: Setup and Installs ---
# Install the necessary libraries
!pip install transformers datasets torch

# Import required libraries
from google.colab import drive
import os
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments
)
from datasets import load_dataset
import torch

# --- STEP 2: Google Drive and Path Setup ---
# 1. Mount Google Drive to save persistent checkpoints
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Define the path for saving model checkpoints
# CHANGE THIS PATH if you want a different folder
DRIVE_PATH = "/content/drive/MyDrive/GPT2_Recipe_FineTune_Checkpoints"
os.makedirs(DRIVE_PATH, exist_ok=True)
print(f"Checkpoints will be saved to: {DRIVE_PATH}")

# --- STEP 3: Load Tokenizer and Add Custom Tokens ---
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Define and Add Custom Special Tokens for recipe structure
special_tokens_dict = {
    'additional_special_tokens': [
        '<|title|>', 
        '<|ingredients|>', 
        '<|instructions|>', 
        '<|endofrecipe|>'
    ]
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# --- STEP 4: Load Model and Resize Embeddings ---
model = GPT2LMHeadModel.from_pretrained('gpt2')
# CRITICAL: Resize model embeddings to include the new tokens
model.resize_token_embeddings(len(tokenizer)) 
print(f"Added {num_added_toks} new tokens and resized model embeddings.")

# --- STEP 5: Load, Format, and Tokenize Dataset ---
# IMPORTANT: Use a reliable recipe dataset. We use a small slice for quick testing.
# Replace 'corbt/all-recipes' with your chosen dataset if needed.
print("Loading dataset...")
raw_datasets = load_dataset('corbt/all-recipes', split='train[:5%]') # Using 5% for testing

# Define the tokenization function
def tokenize_and_format(examples):
    # This assumes the 'text' column already contains the fully formatted string 
    # (e.g., '<|title|> ... <|endofrecipe|>')
    return tokenizer(
        examples['text'], 
        truncation=True, 
        max_length=512, # Max context length for GPT-2
        padding='max_length'
    )

# Apply the function and remove the original 'text' column
tokenized_datasets = raw_datasets.map(tokenize_and_format, batched=True, remove_columns=["text"])

# Create the Train/Validation Split (90% train, 10% validation)
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

print(f"Training on {len(train_dataset)} samples, validating on {len(eval_dataset)} samples.")

# --- STEP 6: Configure Training and Checkpointing ---
# Setup Data Collator for Causal Language Modeling (CLM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up Training Arguments with checkpointing
training_args = TrainingArguments(
    output_dir=DRIVE_PATH,             # Save checkpoints to Google Drive
    num_train_epochs=5,                # Total number of epochs (adjust as needed)
    per_device_train_batch_size=4,     # Adjust based on your Colab GPU memory
    per_device_eval_batch_size=4,
    learning_rate=5e-5,                # Standard learning rate for fine-tuning
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{DRIVE_PATH}/logs',
    logging_steps=50,
    evaluation_strategy="steps",       # Evaluate every 'eval_steps'
    eval_steps=500,
    # --- Checkpointing for RESUME ---
    **save_strategy="steps",**
    **save_steps=1000,** # Save a checkpoint every 1000 training steps
    **save_total_limit=3,** # Keep only the latest 3 checkpoints
)

# --- STEP 7: Initialize Trainer and Train ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print("Starting training (will resume from checkpoint if available)...")
# Call trainer.train() and pass resume_from_checkpoint=True
# It will automatically find the latest checkpoint in DRIVE_PATH and resume.
trainer.train(
    resume_from_checkpoint=True
)

# --- STEP 8: Save Final Model ---
final_model_path = f"{DRIVE_PATH}/final_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final fine-tuned model and tokenizer saved to: {final_model_path}")

SyntaxError: cannot assign to keyword argument unpacking (ipython-input-4207248596.py, line 95)

In [2]:
# --- STEP 1: Setup and Installs ---
# Install the necessary libraries
!pip install transformers datasets torch

# Import required libraries
from google.colab import drive
import os
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments
)
from datasets import load_dataset
import torch



In [3]:
# --- STEP 2: Google Drive and Path Setup ---
# 1. Mount Google Drive to save persistent checkpoints
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Define the path for saving model checkpoints
# CHANGE THIS PATH if you want a different folder
DRIVE_PATH = "/content/drive/MyDrive/GPT2_Recipe_FineTune_Checkpoints"
os.makedirs(DRIVE_PATH, exist_ok=True)
print(f"Checkpoints will be saved to: {DRIVE_PATH}")

Mounting Google Drive...


ValueError: mount failed

In [None]:
# --- STEP 3: Load Tokenizer and Add Custom Tokens ---
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Define and Add Custom Special Tokens for recipe structure
special_tokens_dict = {
    'additional_special_tokens': [
        '<|title|>', 
        '<|ingredients|>', 
        '<|instructions|>', 
        '<|endofrecipe|>'
    ]
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# --- STEP 4: Load Model and Resize Embeddings ---
model = GPT2LMHeadModel.from_pretrained('gpt2')
# CRITICAL: Resize model embeddings to include the new tokens
model.resize_token_embeddings(len(tokenizer)) 
print(f"Added {num_added_toks} new tokens and resized model embeddings.")

In [None]:
# --- STEP 5: Load, Format, and Tokenize Dataset ---
# IMPORTANT: Use a reliable recipe dataset. We use a small slice for quick testing.
# Replace 'corbt/all-recipes' with your chosen dataset if needed.
print("Loading dataset...")
raw_datasets = load_dataset('corbt/all-recipes', split='train[:5%]') # Using 5% for testing

# Define the tokenization function
def tokenize_and_format(examples):
    # This assumes the 'text' column already contains the fully formatted string 
    # (e.g., '<|title|> ... <|endofrecipe|>')
    return tokenizer(
        examples['text'], 
        truncation=True, 
        max_length=512, # Max context length for GPT-2
        padding='max_length'
    )

# Apply the function and remove the original 'text' column
tokenized_datasets = raw_datasets.map(tokenize_and_format, batched=True, remove_columns=["text"])

# Create the Train/Validation Split (90% train, 10% validation)
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

print(f"Training on {len(train_dataset)} samples, validating on {len(eval_dataset)} samples.")

In [None]:
# --- STEP 6: Configure Training and Checkpointing ---
# Setup Data Collator for Causal Language Modeling (CLM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up Training Arguments with checkpointing
training_args = TrainingArguments(
    output_dir=DRIVE_PATH,             # Save checkpoints to Google Drive
    num_train_epochs=5,                # Total number of epochs (adjust as needed)
    per_device_train_batch_size=4,     # Adjust based on your Colab GPU memory
    per_device_eval_batch_size=4,
    learning_rate=5e-5,                # Standard learning rate for fine-tuning
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{DRIVE_PATH}/logs',
    logging_steps=50,
    evaluation_strategy="steps",       # Evaluate every 'eval_steps'
    eval_steps=500,
    # --- Checkpointing for RESUME ---
    **save_strategy="steps",**
    **save_steps=1000,** # Save a checkpoint every 1000 training steps
    **save_total_limit=3,** # Keep only the latest 3 checkpoints
)

In [None]:
# --- STEP 7: Initialize Trainer and Train ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

print("Starting training (will resume from checkpoint if available)...")
# Call trainer.train() and pass resume_from_checkpoint=True
# It will automatically find the latest checkpoint in DRIVE_PATH and resume.
trainer.train(
    resume_from_checkpoint=True
)

In [None]:
# --- STEP 8: Save Final Model ---
final_model_path = f"{DRIVE_PATH}/final_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final fine-tuned model and tokenizer saved to: {final_model_path}")