<a href="https://colab.research.google.com/github/NilayRaut/Self-Alignment-with-Instruction-Backtranslation/blob/main/Self_Alignment_with_Instruction_Backtranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Self-Alignment with Instruction Backtranslation

Nilay Raut

This notebook implements the paper: https://arxiv.org/pdf/2308.06259.pdf



## 1: Installation (Run this first)

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U peft
!pip install -q -U accelerate
# !pip install -q datasets==2.14.0
# !pip install -q transformers==4.34.0 huggingface_hub==0.20.0
!pip install -q datasets
!pip install -q transformers
!pip install -q huggingface_hub

## 2: Import Libraries


In [None]:
import torch
import numpy as np
import random
import json
import re
import os
import gc
from tqdm import tqdm
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from huggingface_hub import login

# Seed every RNG this notebook draws from (Python, NumPy, PyTorch) with one value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

## 3: Configuration and Setup

In [None]:
# IMPORTANT: Update this with your HuggingFace username!
HF_USERNAME = "NilayR"  # TODO: CHANGE THIS!

# Tunable settings read by the later cells of this notebook.
CONFIG = dict(
    use_cpu_prototype=False,        # True swaps in a small model for smoke tests
    backward_dataset_size=3000,     # max number of (output -> instruction) training texts
    lima_sample_size=150,           # number of LIMA outputs to back-translate
    max_steps=150,                  # optimizer steps per fine-tuning run
    batch_size=1,
    gradient_accumulation_steps=4,
    max_length=256,                 # token length used when tokenizing training text
    lora_r=8,
    lora_alpha=16,
)

print(f"Configuration loaded. Models will be pushed to: {HF_USERNAME}")

In [None]:
# # Change to full training i.e when using gpu
# CONFIG["use_cpu_prototype"] = False
# CONFIG["backward_dataset_size"] = 3000  # Increase if needed
# CONFIG["max_steps"] = 150  # Can increase to 200-300

## 4: HuggingFace Login

In [None]:
# Login to HuggingFace
# Prefer the Colab secret named 'huggingface'. When the notebook is not running in
# Colab, or the secret cannot be read, fall back to an interactive prompt.
try:
    # Imported inside the try so a missing google.colab package triggers the fallback
    # instead of aborting the cell.
    from google.colab import userdata
    hf_token = userdata.get('huggingface')
    login(token=hf_token)
    print("Logged in to HuggingFace")
except Exception as err:  # deliberately not a bare except: Ctrl-C must still interrupt
    from getpass import getpass
    print(f"Colab secret login unavailable ({type(err).__name__}); asking for a token instead.")
    hf_token = getpass("Enter HuggingFace token: ")
    login(token=hf_token)

# Create directories
os.makedirs("models", exist_ok=True)
os.makedirs("data", exist_ok=True)

## 5: Check Device

In [None]:
# Pick the compute device once; later cells branch on `device.type`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if device.type != "cuda":
    print(" No GPU available. Using CPU for prototyping.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"Memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 6: Load and Prepare Guanaco Dataset

In [None]:
# Load Guanaco manually
import json
import requests
from datasets import Dataset

print("\n Loading OpenAssistant Guanaco dataset...")

url = "https://huggingface.co/datasets/timdettmers/openassistant-guanaco/resolve/main/openassistant_best_replies_train.jsonl"
# The timeout keeps a stalled connection from hanging the cell, and raise_for_status
# turns an HTTP error page into an immediate failure instead of a JSON parse error.
response = requests.get(url, timeout=60)
response.raise_for_status()
# Decode the body as UTF-8 explicitly rather than relying on header-based guessing.
response.encoding = "utf-8"

# Split on '\n' only: str.splitlines() also breaks on characters such as U+2028,
# which may appear unescaped inside a JSON string. Whitespace-only lines are skipped.
data = [
    json.loads(line)
    for line in response.text.strip().split('\n')
    if line.strip()
]

dataset_guanaco = Dataset.from_list(data)
print(f"Total examples: {len(dataset_guanaco)}")

## 7: Extract Instruction-Output Pairs

In [None]:
def extract_pairs_from_guanaco(example, max_instruction_chars=500, max_output_chars=800):
    """Extract clean instruction-output pairs from one Guanaco record.

    The record's ``'text'`` field is split on ``'### Human:'``; every resulting turn
    that also contains ``'### Assistant:'`` yields one pair made of the text before
    the first assistant tag (instruction) and the text between the first and second
    assistant tag, or the end of the turn (output).

    Args:
        example: Mapping with a ``'text'`` string.
        max_instruction_chars: Instructions must be strictly shorter than this.
        max_output_chars: Outputs must be strictly shorter than this.

    Returns:
        List of ``{'instruction': str, 'output': str}`` dicts, in order of appearance.
        Empty when either tag is missing or nothing passes the filters.
    """
    human_tag = '### Human:'
    assistant_tag = '### Assistant:'
    pairs = []

    # Element 0 of the split is whatever precedes the first human tag, so skip it.
    for turn in example['text'].split(human_tag)[1:]:
        if assistant_tag not in turn:
            continue
        segments = turn.split(assistant_tag)
        instruction = segments[0].strip()
        response = segments[1].strip()
        # Drop empty sides and anything at or above the length limits.
        if not instruction or not response:
            continue
        if len(instruction) < max_instruction_chars and len(response) < max_output_chars:
            pairs.append({'instruction': instruction, 'output': response})
    return pairs

# Extract pairs (prototype mode looks at no more than the first 5000 records)
print("Extracting instruction-output pairs...")
total_examples = len(dataset_guanaco)
subset_size = 5000 if CONFIG["use_cpu_prototype"] else total_examples
guanaco_subset = dataset_guanaco.select(range(min(subset_size, total_examples)))

all_pairs = []
for record in tqdm(guanaco_subset, desc="Processing"):
    all_pairs += extract_pairs_from_guanaco(record)

print(f"Extracted {len(all_pairs)} pairs")

## 8: Create Backward Dataset

In [None]:
# Create backward training data (output -> instruction)
num_examples = min(CONFIG["backward_dataset_size"], len(all_pairs))

# Backward format: the output comes first and the instruction follows it.
backward_texts = [
    f"### Output:\n{pair['output']}\n\n### Instruction:\n{pair['instruction']}"
    for pair in all_pairs[:num_examples]
]

backward_dataset = Dataset.from_dict({'text': backward_texts})
print(f" Backward dataset created: {len(backward_dataset)} examples")

# Show the start of the first training text
print("\n Example backward data:")
first_backward_text = backward_dataset[0]['text']
print(first_backward_text[:300] + "...")

## 9: Setup Model and Tokenizer

In [None]:
print("\n Setting up model and tokenizer...")

# Model configuration
base_model_id = "NousResearch/Llama-2-7b-chat-hf"  # Using chat version for faster convergence

# 4-bit NF4 quantization is configured only when a CUDA device is present;
# otherwise the config stays None.
quantization_config = None
if device.type != "cuda":
    print(" CPU mode: No quantization")
else:
    quantization_config = BitsAndBytesConfig(
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
        load_in_4bit=True,
    )

# Load tokenizer; EOS doubles as the padding token and padding goes on the left.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

## 10: Tokenization Function

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Every sequence is truncated to, and padded up to, ``CONFIG["max_length"]`` tokens.
    Returns whatever the tokenizer call returns.

    NOTE(review): this always uses the Llama ``tokenizer``, even when
    ``CONFIG["use_cpu_prototype"]`` selects the gpt2 prototype model; confirm that is
    intended before relying on prototype-mode training.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": CONFIG["max_length"],
    }
    return tokenizer(examples["text"], **tokenizer_kwargs)

# Tokenize backward dataset
print("Tokenizing dataset...")
tokenized_backward = (
    backward_dataset
    .map(tokenize_function, batched=True, remove_columns=["text"])
    # Labels start out as a copy of the input ids.
    .map(lambda batch: {"labels": batch["input_ids"]}, batched=True)
)

## 11: Initialize Backward Model (CPU Prototype or Full Model)

In [None]:
if not CONFIG["use_cpu_prototype"]:
    # Full model; quantized and prepared for k-bit training only on CUDA.
    print("\n Loading Llama-2-7b model...")
    on_cuda = device.type == "cuda"
    backward_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quantization_config,
        device_map="auto" if on_cuda else None,
        torch_dtype=torch.float16 if on_cuda else torch.float32,
    )
    if on_cuda:
        backward_model = prepare_model_for_kbit_training(backward_model)
else:
    print("\n CPU Prototype: Loading small model...")
    # Small stand-in model (and its own tokenizer) for CPU testing
    prototype_model_id = "gpt2"
    backward_model = AutoModelForCausalLM.from_pretrained(prototype_model_id)
    prototype_tokenizer = AutoTokenizer.from_pretrained(prototype_model_id)
    prototype_tokenizer.pad_token = prototype_tokenizer.eos_token

    print(" Loaded small model for CPU prototyping")

## 12: Configure LoRA

In [None]:
# LoRA configuration
# Target module names depend on which model is being used in this run.
lora_target_modules = ["c_attn"] if CONFIG["use_cpu_prototype"] else ["q_proj", "v_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Adapters are attached only outside prototype mode.
if not CONFIG["use_cpu_prototype"]:
    backward_model = get_peft_model(backward_model, lora_config)
    backward_model.print_trainable_parameters()

## 13: Setup Training Arguments

In [None]:
# Hyper-parameters used by both Trainer instances created later in this notebook.
training_kwargs = dict(
    output_dir="./backward_model",
    per_device_train_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
    max_steps=CONFIG["max_steps"],
    learning_rate=3e-5,
    warmup_steps=10,
    logging_steps=25,
    save_strategy="no",  # Don't save checkpoints
    fp16=device.type == "cuda",
    optim="adamw_torch" if CONFIG["use_cpu_prototype"] else "paged_adamw_8bit",
    report_to="none",
)
training_args = TrainingArguments(**training_kwargs)

## 14: Train Backward Model

This step trains the backward model (`backward_model`) on the prepared Guanaco dataset (`tokenized_backward`). The model is fine-tuned to learn the mapping from an output (`y`) back to its original instruction (`x`), effectively creating the $p(x|y)$ model required for instruction backtranslation. The training uses the configurations defined earlier, including LoRA and the specified training arguments.

In [None]:
print("\n🚀 Training backward model...")
print(f"Training on: {device}")
print(f"Max steps: {CONFIG['max_steps']}")

# Tokenizer matching the model selected for this run.
active_tokenizer = prototype_tokenizer if CONFIG["use_cpu_prototype"] else tokenizer

# Create trainer
# The tokenizer is given to the collator only. Trainer's own `tokenizer=` keyword is
# deprecated in recent transformers releases (in favour of `processing_class`) and the
# install cell does not pin a version, so it is left out; the tokenizer is saved and
# pushed explicitly by the upload cell.
trainer_backward = Trainer(
    model=backward_model,
    args=training_args,
    train_dataset=tokenized_backward,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=active_tokenizer,
        mlm=False
    )
)

# Train
trainer_backward.train()
print("✅ Backward model training complete!")


## 15: Save and Upload Backward Model

After training, the backward model is saved locally and then pushed to the Hugging Face Hub. This makes the trained model available for later use in the self-augmentation phase and for sharing. The URL to the uploaded model will be printed below.

In [None]:
if not CONFIG["use_cpu_prototype"]:
    print("\n💾 Saving and uploading backward model...")
    backward_model_name = f"{HF_USERNAME}/llama2-7b-backward-instruction"

    # Save locally first
    trainer_backward.save_model("./models/backward_model")
    tokenizer.save_pretrained("./models/backward_model")

    # Push to hub. `token=` replaces the deprecated `use_auth_token=True` and matches
    # how the dataset upload cell authenticates.
    backward_model.push_to_hub(backward_model_name, token=hf_token)
    tokenizer.push_to_hub(backward_model_name, token=hf_token)

    backward_model_url = f"https://huggingface.co/{backward_model_name}"
    print(f"✅ BACKWARD MODEL URL: {backward_model_url}")
else:
    print("⚠️ CPU prototype mode - skipping upload")
    backward_model_url = "CPU_PROTOTYPE_MODE"

## 16: Load LIMA Dataset

In [None]:
# # Install compatible version of datasets
# !pip install datasets==2.14.0 -q

In [None]:
print("\n" + "="*70)
print("STEP 2: Self-Augmentation with LIMA")
print("="*70)
print("📚 Loading LIMA dataset...")

# Download train.jsonl from the GAIR/lima dataset repo and parse it line by line.
# On any failure the dataset falls back to an empty list so later cells still run.
try:
    from huggingface_hub import hf_hub_download

    filepath = hf_hub_download(
        repo_id="GAIR/lima",
        filename="train.jsonl",
        repo_type="dataset"
    )

    import json
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    dataset_lima = Dataset.from_list(data)
    print(f"✅ Loaded LIMA dataset: {len(dataset_lima)} examples")

except Exception as e:
    print(f"Error loading LIMA dataset: {e}")
    print("Failed to load LIMA dataset using hf_hub_download.")
    dataset_lima = []


# `dataset_lima` is always bound at this point, so only emptiness needs checking.
if len(dataset_lima) > 0:
    print(f"Total LIMA examples: {len(dataset_lima)}")
else:
    print("Failed to load LIMA dataset or it's empty.")
    dataset_lima = []  # normalise to a plain empty list for the cells that follow

## 17: Filter Single-Turn Examples

In [None]:
# Filter for single-turn conversations only:
# exactly 2 messages, where index 1 is taken as the response, kept when under 1500 chars.
single_turn_outputs = [
    record['conversations'][1]
    for record in dataset_lima
    if len(record['conversations']) == 2 and len(record['conversations'][1]) < 1500
]

print(f"✅ Found {len(single_turn_outputs)} single-turn examples")

# Sample up to CONFIG["lima_sample_size"] outputs
sample_count = min(CONFIG["lima_sample_size"], len(single_turn_outputs))
sampled_outputs = random.sample(single_turn_outputs, sample_count)
print(f"📊 Sampled {len(sampled_outputs)} outputs")

## 18: Generate Instructions Function

In [None]:
def generate_instruction_fixed(model, tokenizer_to_use, output_text, device):
    """Sample an instruction for ``output_text`` from ``model``.

    The first 300 characters of ``output_text`` are placed in the
    ``### Output: ... ### Instruction:`` prompt, the prompt is tokenized (truncated
    to 256 tokens) and up to 30 new tokens are sampled.

    Args:
        model: Object exposing ``eval()`` and ``generate(**inputs, ...)``.
        tokenizer_to_use: Tokenizer used for encoding the prompt and decoding the result.
        output_text: Response text to back-translate.
        device: Target device; inputs are moved only when ``device.type == "cuda"``.

    Returns:
        The decoded continuation after the prompt, stripped of surrounding whitespace.
    """
    model.eval()
    if hasattr(model, 'gradient_checkpointing_disable'):
        model.gradient_checkpointing_disable()

    prompt = f"### Output:\n{output_text[:300]}\n\n### Instruction:"
    encoded = tokenizer_to_use(prompt, return_tensors="pt", truncation=True, max_length=256)
    if device.type == "cuda":
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    sampling_kwargs = dict(
        max_new_tokens=30,  # Reduced for speed
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer_to_use.pad_token_id,
        eos_token_id=tokenizer_to_use.eos_token_id,
        use_cache=True,  # Explicitly enable cache
    )
    with torch.no_grad():
        generated_ids = model.generate(**encoded, **sampling_kwargs)

    # Skip the prompt tokens so only the newly generated part is decoded.
    prompt_length = encoded['input_ids'].shape[1]
    completion_ids = generated_ids[0][prompt_length:]
    return tokenizer_to_use.decode(completion_ids, skip_special_tokens=True).strip()

## 19: Generate Instructions from LIMA

Using the trained backward model, we now perform the instruction backtranslation step. We feed the sampled single-turn outputs from the LIMA dataset into the backward model to generate new, synthetic instructions. This creates the augmented instruction-output pairs.

In [None]:
print("\n🤖 Generating instructions...")

# Use appropriate model and tokenizer
model_to_use = backward_model
tokenizer_to_use = prototype_tokenizer if CONFIG["use_cpu_prototype"] else tokenizer

# Put model in eval mode and disable gradient checkpointing
model_to_use.eval()
if hasattr(model_to_use, 'gradient_checkpointing_disable'):
    model_to_use.gradient_checkpointing_disable()

# Generate with progress bar; a failed sample is recorded with a placeholder string
# so the result list stays aligned with `sampled_outputs`.
generated_instructions = []
progress = tqdm(enumerate(sampled_outputs), total=len(sampled_outputs), desc="Generating")
for i, lima_output in progress:
    try:
        generated_instructions.append(
            generate_instruction_fixed(model_to_use, tokenizer_to_use, lima_output, device)
        )

        # Clear cache periodically
        if i % 10 == 0 and device.type == "cuda":
            torch.cuda.empty_cache()

    except Exception as e:
        print(f"Error at index {i}: {e}")
        generated_instructions.append("Error generating instruction")

print(f"✅ Generated {len(generated_instructions)} instructions")

In [None]:
# print("\n🤖 Generating instructions...")
# generated_instructions = []

# # Use appropriate model and tokenizer
# model_to_use = backward_model
# tokenizer_to_use = prototype_tokenizer if CONFIG["use_cpu_prototype"] else tokenizer

# # Generate in small batches to show progress
# for i in tqdm(range(0, len(sampled_outputs), 10), desc="Generating"):
#     batch = sampled_outputs[i:i+10]
#     for output_text in batch:
#         instruction = generate_instruction(model_to_use, tokenizer_to_use, output_text, device)
#         generated_instructions.append(instruction)

# print(f"✅ Generated {len(generated_instructions)} instructions")

## 20: Create Augmented Dataset

In [None]:
# Create augmented dataset
# The generation loop stores this placeholder when a sample failed. Placeholders and
# blank generations are not real instructions, so they are dropped here instead of
# being rated and possibly trained on.
FAILED_GENERATION = "Error generating instruction"
augmented_data = [
    {'instruction': instruction, 'output': output}
    for instruction, output in zip(generated_instructions, sampled_outputs)
    if instruction.strip() and instruction != FAILED_GENERATION
]

skipped_pairs = len(generated_instructions) - len(augmented_data)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} pairs with failed or empty generated instructions")

# Save augmented data
with open('./data/augmented_data.json', 'w', encoding='utf-8') as f:
    json.dump(augmented_data, f)

## 21: Print 5 Generated Examples

This cell prints 5 examples of the instruction-output pairs generated in the previous step. Each pair consists of a synthetic instruction generated by the backward model and the original LIMA output.

In [None]:
print("\n" + "="*70)
print("📋 5 EXAMPLES OF GENERATED INSTRUCTIONS :")
print("="*70)

# Show at most the first five pairs; outputs are cut to 200 characters for display.
for example_number, pair in enumerate(augmented_data[:5], start=1):
    print(f"\nExample {example_number}:")
    print(f"🔹 Generated Instruction: {pair['instruction']}")
    print(f"🔹 LIMA Output: {pair['output'][:200]}...")
    print("-"*50)

## 22: Setup Rating Model

In [None]:
print("\n" + "="*70)
print("STEP 3: Self-Curation")
print("="*70)

if CONFIG["use_cpu_prototype"]:
    # Prototype mode rates with the model and tokenizer used for generation.
    print("🔧 Using small model for rating prototype...")
    rating_model = model_to_use
    rating_tokenizer = tokenizer_to_use
else:
    # Drop every notebook-level name that still points at the trained backward model
    # (the model itself, its generation alias and the trainer holding it); deleting
    # only `backward_model` leaves the other references alive, so nothing is freed.
    # pop() instead of `del` keeps the cell re-runnable.
    for _stale_name in ("backward_model", "model_to_use", "trainer_backward"):
        globals().pop(_stale_name, None)
    # Collect first so unreachable objects are gone before the CUDA cache is emptied.
    gc.collect()
    if device.type == "cuda":
        torch.cuda.empty_cache()

    print("🤖 Loading Llama-2-7b-chat for rating...")
    rating_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    rating_tokenizer = tokenizer

## 23: Rating Prompt Template

In [None]:
# Simplified rating prompt for efficiency
# Filled in with str.format(); the two placeholders are {instruction} and {output}.
# The prompt defines a 1-5 scale, shows two worked examples and asks the rater to end
# with "Score: X", which is the pattern the rating function searches for.
rating_prompt_template = """Rate the quality of this instruction-response pair on a scale of 1-5.

1 = Very poor (off-topic, incomplete, or irrelevant)
2 = Poor (partially addresses the question)
3 = Fair (adequate but not from AI assistant perspective)
4 = Good (clear, comprehensive, helpful)
5 = Excellent (perfect AI assistant response)

Here are some examples:

Instruction: Tell me about the history of the internet.
Response: The internet started as a project by the US Department of Defense called ARPANET in the late 1960s... (Full history)
Reason: This response is comprehensive and directly answers the instruction.
Score: 5

Instruction: What is the best color?
Response: I like pizza.
Reason: This response is completely irrelevant to the instruction.
Score: 1

Instruction: {instruction}
Response: {output}

Provide a brief reason and then write "Score: X" where X is 1-5.
"""

## 24: Rating Function

In [None]:
def rate_example(model, tokenizer_to_use, instruction, output, device):
    """Rate an instruction-output pair with the rating prompt.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer_to_use: Tokenizer used to encode the prompt and decode the reply.
        instruction: Instruction text inserted into the prompt.
        output: Response text; only its first 500 characters are inserted.
        device: Target device; inputs are moved only when ``device.type == "cuda"``.

    Returns:
        ``(score, reasoning)`` where ``score`` is an int in 1..5 and ``reasoning`` is
        the reply text before the first ``"Score:"``. When the reply contains no valid
        score the result is ``(3, "Could not parse score")``.
    """
    prompt = rating_prompt_template.format(
        instruction=instruction,
        output=output[:500]  # Truncate long outputs
    )

    inputs = tokenizer_to_use(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )

    if device.type == "cuda":
        inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer_to_use.pad_token_id,
            eos_token_id=tokenizer_to_use.eos_token_id
        )

    # Decode only the tokens produced after the prompt.
    generated_text = tokenizer_to_use.decode(
        generated_ids[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True
    )

    return _parse_rating(generated_text)


def _parse_rating(generated_text):
    """Extract ``(score, reasoning)`` from the rater's reply text."""
    # Accept only a single digit 1-5 that is not followed by another digit, so replies
    # such as "Score: 0", "Score: 7" or "Score: 10" are not taken as valid ratings.
    score_match = re.search(r"Score:\s*([1-5])(?!\d)", generated_text)
    if score_match is None:
        return 3, "Could not parse score"  # neutral default, below the curation cut-off
    reasoning = generated_text.split("Score:")[0].strip()
    return int(score_match.group(1)), reasoning

## 25: Score Examples

In the self-curation step, we use a rating model (LLM) to evaluate the quality of the augmented instruction-output pairs generated from the LIMA data. This cell iterates through the augmented data, sends each pair to the rating model with a specific prompt, and records a quality score (1-5) and the reasoning provided by the rater model.

In [None]:
print("\n📊 Scoring augmented examples...")
scored_data, high_quality, low_quality = [], [], []

# Prototype mode scores at most 50 pairs; otherwise every pair is scored.
if CONFIG["use_cpu_prototype"]:
    num_to_score = min(50, len(augmented_data))
else:
    num_to_score = len(augmented_data)

for example in tqdm(augmented_data[:num_to_score], desc="Scoring"):
    score, reasoning = rate_example(
        rating_model,
        rating_tokenizer,
        example['instruction'],
        example['output'],
        device
    )

    scored_example = dict(
        instruction=example['instruction'],
        output=example['output'],
        score=score,
        reasoning=reasoning,
    )
    scored_data.append(scored_example)

    # Keep up to five showcase examples per bucket for the display cells.
    if score >= 4:
        if len(high_quality) < 5:
            high_quality.append(scored_example)
    elif score <= 2 and len(low_quality) < 5:
        low_quality.append(scored_example)

## 26: Print High Quality Examples

This cell prints up to 5 examples of the augmented instruction-output pairs that were rated as high quality (score >= 4) during the self-curation process. These examples represent the pairs that will be used to create the curated dataset for the final instruction tuning.

In [None]:
print("\n" + "="*70)
print("✅ 5 HIGH QUALITY EXAMPLES (Score >= 4) :")
print("="*70)

# Outputs and reasoning are shortened for display only.
for example_number, ex in enumerate(high_quality[:5], start=1):
    print(f"\nHigh Quality Example {example_number}:")
    print(f"📊 Score: {ex['score']}")
    print(f"🔹 Instruction: {ex['instruction']}")
    print(f"🔹 Output (first 200 chars): {ex['output'][:200]}...")
    print(f"💭 Reasoning: {ex['reasoning'][:150]}...")
    print("-"*50)

## 27: Print Low Quality Examples

This cell prints up to 5 examples of the augmented instruction-output pairs that were rated as low quality (score <= 2) during the self-curation process. These examples illustrate the types of pairs that were filtered out and not included in the curated dataset.

In [None]:
print("\n" + "="*70)
print("❌ 5 LOW QUALITY EXAMPLES (Score <= 2) :")
print("="*70)

# Outputs and reasoning are shortened for display only.
for example_number, ex in enumerate(low_quality[:5], start=1):
    print(f"\nLow Quality Example {example_number}:")
    print(f"📊 Score: {ex['score']}")
    print(f"🔹 Instruction: {ex['instruction']}")
    print(f"🔹 Output (first 200 chars): {ex['output'][:200]}...")
    print(f"💭 Reasoning: {ex['reasoning'][:150]}...")
    print("-"*50)

## 28: Create and Upload Curated Dataset

This step filters the scored augmented data to create the final curated dataset, including only the high-quality examples (score >= 4). This dataset is then uploaded to the Hugging Face Hub, making it available for the final instruction tuning step. Both the curated and the full scored datasets are uploaded.

In [None]:
# Filter high quality examples (score >= 4)
curated_data = list(filter(lambda ex: ex['score'] >= 4, scored_data))
print(f"\n✅ Curated dataset size: {len(curated_data)} high-quality examples")

# Create HuggingFace dataset with two splits: the curated subset and everything scored
curated_dataset = Dataset.from_list(curated_data)
all_scored_dataset = Dataset.from_list(scored_data)
dataset_dict = DatasetDict({
    'curated': curated_dataset,
    'all_scored': all_scored_dataset
})

if CONFIG["use_cpu_prototype"]:
    print("⚠️ CPU prototype mode - skipping dataset upload")
    dataset_url = "CPU_PROTOTYPE_MODE"
else:
    # Upload to HuggingFace
    dataset_repo_name = f"{HF_USERNAME}/instruction-backtranslation-curated"
    dataset_dict.push_to_hub(dataset_repo_name, token=hf_token)
    dataset_url = f"https://huggingface.co/datasets/{dataset_repo_name}"
    print(f"✅ DATASET URL: {dataset_url}")

## 29: Prepare Final Training Data

In [None]:
print("\n" + "="*70)
print("STEP 4: Fine-tune on Curated Dataset")
print("="*70)

# Clear memory: drop the rating model, collect garbage, then empty the CUDA cache.
# pop() instead of `del` keeps the cell re-runnable, and collecting before emptying
# the cache lets memory held by unreachable objects be returned as well.
globals().pop("rating_model", None)
gc.collect()
if device.type == "cuda":
    torch.cuda.empty_cache()

# Combine seed data with curated data: take as many seed pairs (from the front of
# `all_pairs`) as there are curated pairs, then append the curated pairs.
seed_size = min(len(curated_data), len(all_pairs))
combined_data = [
    {'instruction': pair['instruction'], 'output': pair['output']}
    for pair in all_pairs[:seed_size]
]
combined_data += [
    {'instruction': item['instruction'], 'output': item['output']}
    for item in curated_data
]

print(f"📊 Combined dataset: {len(combined_data)} examples")

## 30: Create Instruction Dataset

In [None]:
# Format for instruction tuning
instruction_texts = []
for item in combined_data:
    text = f"### Instruction:\n{item['instruction']}\n\n### Response:\n{item['output']}"
    instruction_texts.append(text)

instruction_dataset = Dataset.from_dict({'text': instruction_texts})

# Tokenize
tokenized_instruction = instruction_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)
tokenized_instruction = tokenized_instruction.map(
    lambda examples: {"labels": examples["input_ids"]},
    batched=True
)

## 31: Setup Final Model

In [None]:
if not CONFIG["use_cpu_prototype"]:
    # Fresh copy of the base model, prepared for k-bit training with new LoRA adapters.
    print("🤖 Loading model for instruction tuning...")
    instruction_model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    instruction_model = get_peft_model(
        prepare_model_for_kbit_training(instruction_model),
        lora_config,
    )
    instruction_model.print_trainable_parameters()
else:
    print("🔧 Using prototype model for final training...")
    instruction_model = AutoModelForCausalLM.from_pretrained(prototype_model_id)

## 32: Train Final Model

This is the final instruction tuning step. The base model is fine-tuned on the combined dataset, which includes both a portion of the original seed data and the newly curated high-quality augmented instruction-output pairs. This aims to improve the model's ability to follow instructions based on the self-augmented data.

In [None]:
print("\n🚀 Training instruction-tuned model...")

# Tokenizer matching the model selected for this run.
active_tokenizer = prototype_tokenizer if CONFIG["use_cpu_prototype"] else tokenizer

# The tokenizer is given to the collator only. Trainer's own `tokenizer=` keyword is
# deprecated in recent transformers releases (in favour of `processing_class`) and the
# install cell does not pin a version, so it is left out; the tokenizer is pushed
# explicitly by the upload cell.
trainer_instruction = Trainer(
    model=instruction_model,
    args=training_args,
    train_dataset=tokenized_instruction,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=active_tokenizer,
        mlm=False
    )
)

trainer_instruction.train()
print("✅ Instruction tuning complete!")

## 33: Generate Example Responses

As a required deliverable, this cell uses the newly trained instruction-tuned model to generate responses for a set of test prompts. This demonstrates the model's ability to follow instructions after the fine-tuning process.

In [None]:
print("\n" + "="*70)
print("🤖 5 EXAMPLE RESPONSES FROM FINAL MODEL :")
print("="*70)

test_prompts = [
    "What is machine learning?",
    "How do I make coffee?",
    "Explain photosynthesis simply.",
    "What are the benefits of exercise?",
    "How does the internet work?"
]

# Resolve the tokenizer once instead of repeating the mode check at every use.
response_tokenizer = prototype_tokenizer if CONFIG["use_cpu_prototype"] else tokenizer
generation_kwargs = dict(
    max_new_tokens=100,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
    pad_token_id=response_tokenizer.pad_token_id,
    eos_token_id=response_tokenizer.eos_token_id,
)

instruction_model.eval()
for example_number, prompt in enumerate(test_prompts, start=1):
    print(f"\nExample {example_number}:")
    print(f"📝 Instruction: {prompt}")

    input_text = f"### Instruction:\n{prompt}\n\n### Response:"
    inputs = response_tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=256
    )

    if device.type == "cuda":
        inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = instruction_model.generate(**inputs, **generation_kwargs)

    # The whole sequence is decoded; everything after the last response tag is shown.
    decoded = response_tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = decoded.split("### Response:")[-1].strip()

    print(f"🤖 Response: {response}")
    print("-"*50)

## 34: Upload Final Model

After the instruction tuning is complete, the final fine-tuned model is saved and uploaded to the Hugging Face Hub. This is the final deliverable model resulting from the self-alignment process.

In [None]:
if not CONFIG["use_cpu_prototype"]:
    print("\n💾 Uploading final instruction-tuned model...")
    final_model_name = f"{HF_USERNAME}/llama2-7b-instruction-tuned"

    # `token=` replaces the deprecated `use_auth_token=True` and matches how the
    # dataset upload cell authenticates.
    instruction_model.push_to_hub(final_model_name, token=hf_token)
    tokenizer.push_to_hub(final_model_name, token=hf_token)

    final_model_url = f"https://huggingface.co/{final_model_name}"
    print(f"✅ FINAL MODEL URL: {final_model_url}")
else:
    print("⚠️ CPU prototype mode - skipping final model upload")
    final_model_url = "CPU_PROTOTYPE_MODE"

## 35: Final Summary

In [None]:
print(" Self-Alignment with Instruction Backtranslation")
print("\n📊 DELIVERABLES SUMMARY:")
deliverable_lines = [
    f"1. Backward Model URL: {backward_model_url}",
    "2. Generated 5 instruction examples ✅",
    "3. Showed 5 high-quality examples ✅",
    "4. Showed 5 low-quality examples ✅",
    f"5. Curated Dataset URL: {dataset_url}",
    f"6. Final Model URL: {final_model_url}",
    "7. Generated 5 example responses ✅",
]
for deliverable in deliverable_lines:
    print(deliverable)

prototype_mode = CONFIG["use_cpu_prototype"]
if prototype_mode:
    print("\n⚠️ NOTE: Running in CPU prototype mode")
    for hint in (
        "To run full version on GPU:",
        "1. Set CONFIG['use_cpu_prototype'] = False",
        "2. Ensure GPU runtime is selected",
        "3. Re-run all cells",
    ):
        print(hint)

# Save the three URLs and the run mode as JSON next to the notebook
summary = dict(
    backward_model_url=backward_model_url,
    dataset_url=dataset_url,
    final_model_url=final_model_url,
    mode="CPU_PROTOTYPE" if prototype_mode else "FULL_GPU",
)

with open("assignment3_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("\n✅ Summary saved to assignment3_summary.json")