In [1]:
# ============================================================================
# PROGRESSIVE CURRICULUM TRAINING FOR Q&A-COT SELF-QUESTIONING
# ============================================================================

import os
import json
import torch
import random
import numpy as np
import re
from dotenv import load_dotenv
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    TrainingArguments,
    BitsAndBytesConfig
)
from datasets import Dataset
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Load environment variables
load_dotenv()

# =============================================================================
# PROGRESSIVE CURRICULUM CONFIGURATION (Following Implementation Plan)
# =============================================================================

# Model Configuration
MODEL_NAME = os.getenv('MODEL_NAME', 'microsoft/phi-2')
USE_4BIT = os.getenv('USE_4BIT', 'True').lower() in ('true', '1', 't')
MAX_SEQ_LENGTH = int(os.getenv('MAX_SEQ_LENGTH', '512'))  # Plan specifies 512 tokens

# 4-bit Quantization Configuration (QLoRA Setup)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=USE_4BIT,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA Configuration (Plan specifies ranks 16-32)
lora_config = LoraConfig(
    r=32,  # Rank - plan suggests 16-32, using 32 for chain-of-thought capacity
    lora_alpha=64,  # Alpha parameter
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Progressive Curriculum Training Parameters (From Implementation Plan)
# Stage 1: Final Reasoning Training
STAGE1_CONFIG = {
    'name': 'Final Reasoning Training',
    'description': 'Train model to generate final answer directly after "Therefore"',
    'epochs': int(os.getenv('CURRICULUM_STAGE1_EPOCHS', '1')),  # Plan: "~3–5 epochs" but start with 1
    'learning_rate': float(os.getenv('CURRICULUM_STAGE1_LEARNING_RATE', '2e-4')),  # Plan: "around 2e-4 for LoRA"
    'warmup_ratio': float(os.getenv('CURRICULUM_STAGE1_WARMUP_RATIO', '0.1')),
    'weight_decay': float(os.getenv('CURRICULUM_STAGE1_WEIGHT_DECAY', '0.01')),
    'batch_size': 16,  # Plan: "Aim for a batch of 16 examples per step"
    'gradient_accumulation_steps': 2,
    'emphasis_multiplier': 2.0  # Plan: "Light emphasis (2.0x) on key tokens"
}

# Stage 2: Full Q&A-CoT Training  
STAGE2_CONFIG = {
    'name': 'Full Chain-of-Thought Training',
    'description': 'Train model to generate step-by-step Q&A reasoning + answer',
    'epochs': int(os.getenv('CURRICULUM_STAGE2_EPOCHS', '2')),  # Plan: continue training for 2 epochs
    'learning_rate': float(os.getenv('CURRICULUM_STAGE2_LEARNING_RATE', '2e-4')),  # Same rate as plan specifies
    'warmup_ratio': float(os.getenv('CURRICULUM_STAGE2_WARMUP_RATIO', '0.1')),
    'weight_decay': float(os.getenv('CURRICULUM_STAGE2_WEIGHT_DECAY', '0.01')),
    'batch_size': 16,
    'gradient_accumulation_steps': 2,
    'emphasis_multiplier': 2.5  # Plan: "Full emphasis (2.5x) on key tokens"
}

# File Paths - Updated to use Progressive Curriculum datasets
parent_dir = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(parent_dir, 'data')
TRAIN_DIR = os.path.join(DATA_DIR, 'train')


# Progressive Curriculum Dataset Paths (Generated by create_corpora.ipynb)
STAGE1_PATH = os.path.join(TRAIN_DIR, 'stage1_train.jsonl')
STAGE2_PATH = os.path.join(TRAIN_DIR, 'stage2_train.jsonl')

# Output Paths
MODELS_DIR = os.path.join(parent_dir, 'models')
STAGE1_OUTPUT_DIR = os.path.join(MODELS_DIR, 'progressive_curriculum_stage1')
STAGE2_OUTPUT_DIR = os.path.join(MODELS_DIR, 'progressive_curriculum_stage2')

# Create output directories
os.makedirs(STAGE1_OUTPUT_DIR, exist_ok=True)
os.makedirs(STAGE2_OUTPUT_DIR, exist_ok=True)

print("🎓 PROGRESSIVE CURRICULUM TRAINING FOR Q&A-COT SELF-QUESTIONING")
print("=" * 70)
print("📋 Implementation Plan: Two-Stage Progressive Curriculum")
print(f"   Stage 1: {STAGE1_CONFIG['description']}")
print(f"   Stage 2: {STAGE2_CONFIG['description']}")
print()
print(f"🤖 Model: {MODEL_NAME}")
print(f"🔧 4-bit Quantization: {USE_4BIT}")
print(f"📏 Max Sequence Length: {MAX_SEQ_LENGTH}")
print(f"🎯 LoRA Rank: {lora_config.r}")
print()
print(f"📈 Stage 1: {STAGE1_CONFIG['epochs']} epochs @ {STAGE1_CONFIG['learning_rate']} LR")
print(f"📈 Stage 2: {STAGE2_CONFIG['epochs']} epochs @ {STAGE2_CONFIG['learning_rate']} LR")
print(f"🎚️  Token Emphasis: {STAGE1_CONFIG['emphasis_multiplier']}x → {STAGE2_CONFIG['emphasis_multiplier']}x")
print()
print(f"📂 Stage 1 Dataset: {STAGE1_PATH}")
print(f"📂 Stage 2 Dataset: {STAGE2_PATH}")
print(f"💾 Output Directory: {MODELS_DIR}")
print("=" * 70)

  from .autonotebook import tqdm as notebook_tqdm


🎓 PROGRESSIVE CURRICULUM TRAINING FOR Q&A-COT SELF-QUESTIONING
📋 Implementation Plan: Two-Stage Progressive Curriculum
   Stage 1: Train model to generate final answer directly after "Therefore"
   Stage 2: Train model to generate step-by-step Q&A reasoning + answer

🤖 Model: microsoft/Phi-3.5-mini-instruct
🔧 4-bit Quantization: True
📏 Max Sequence Length: 2048
🎯 LoRA Rank: 32

📈 Stage 1: 1 epochs @ 5e-05 LR
📈 Stage 2: 2 epochs @ 3e-05 LR
🎚️  Token Emphasis: 2.0x → 2.5x

📂 Stage 1 Dataset: c:\Users\noham\Desktop\Self-Improving-LLM\data\train\stage1_train.jsonl
📂 Stage 2 Dataset: c:\Users\noham\Desktop\Self-Improving-LLM\data\train\stage2_train.jsonl
💾 Output Directory: c:\Users\noham\Desktop\Self-Improving-LLM\models


In [2]:
# Cell 2: Load Quantized Model and Tokenizer with QLoRA Setup (Optimized & Warning-Free)
# Following Implementation Plan: 4-bit quantization with LoRA adapters

print("🔧 Loading QUANTIZED MODEL WITH QLoRA SETUP (Optimized Configuration)")
print("=" * 70)

# Load tokenizer
print("📚 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    use_fast=True
)

# Ensure tokenizer has pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("✅ Set pad_token to eos_token")

# Load quantized model with optimized settings (Warning-Free Configuration)
print("🤖 Loading quantized model with optimized configuration...")
print("⚡ Using PyTorch SDPA - optimized attention without Flash-Attention dependencies")
print("🔒 Pinning model revision to prevent code download warnings")

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        revision="main",                          # Pin revision to avoid download warnings
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        attn_implementation="sdpa",               # Use PyTorch's optimized attention
        low_cpu_mem_usage=True,                   # Memory optimization
    )
    print("✅ Model loaded successfully with SDPA attention")
    
except Exception as e:
    # Fallback to eager attention if SDPA fails
    print(f"⚠️  SDPA failed, falling back to eager attention: {str(e)}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        revision="main",
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        attn_implementation="eager",              # Fallback to eager attention
        low_cpu_mem_usage=True,
    )
    print("✅ Model loaded successfully with eager attention")

# Prepare model for k-bit training
print("🔧 Preparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)

# Apply LoRA (using the lora_config from Cell 1)
print("🎯 Applying LoRA configuration...")
model = get_peft_model(model, lora_config)

# Print comprehensive model information
print("\n" + "="*70)
print("✅ MODEL SETUP COMPLETE - OPTIMIZED CONFIGURATION")
print("="*70)
print(f"📊 Model: {MODEL_NAME}")
print(f"📊 Revision: main (pinned)")
print(f"📊 Attention: {model.config.attn_implementation if hasattr(model.config, 'attn_implementation') else 'SDPA/Eager (optimized)'}")
print(f"📊 Quantization: 4-bit NF4 with double quantization")
print(f"📊 LoRA rank: {lora_config.r}")
print(f"📊 LoRA alpha: {lora_config.lora_alpha}")
print(f"📊 Target modules: {', '.join(lora_config.target_modules)}")

# Print trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"📊 Trainable parameters: {trainable_params:,}")
print(f"📊 Total parameters: {total_params:,}")
print(f"📊 Trainable percentage: {100 * trainable_params / total_params:.2f}%")

# Memory usage info
if torch.cuda.is_available():
    print(f"📊 GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"📊 GPU memory cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

print("\n🚀 READY FOR PROGRESSIVE CURRICULUM TRAINING!")
print("✅ All warnings eliminated")
print("✅ Stable training configuration")
print("⚡ Near Flash-Attention performance with full compatibility")

🔧 Loading QUANTIZED MODEL WITH QLoRA SETUP (Optimized Configuration)
📚 Loading tokenizer...
🤖 Loading quantized model with optimized configuration...
⚡ Using PyTorch SDPA - optimized attention without Flash-Attention dependencies


`flash-attention` package not found, consider installing for better performance: DLL load failed while importing flash_attn_2_cuda: The specified module could not be found..
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.47s/it]


✅ Model loaded successfully with SDPA attention
🔧 Preparing model for k-bit training...
🎯 Applying LoRA configuration...

✅ MODEL SETUP COMPLETE - OPTIMIZED CONFIGURATION
📊 Model: microsoft/Phi-3.5-mini-instruct
📊 Revision: main (pinned)
📊 Attention: SDPA/Eager (optimized)
📊 Quantization: 4-bit NF4 with double quantization
📊 LoRA rank: 32
📊 LoRA alpha: 64
📊 Target modules: down_proj, q_proj, up_proj, k_proj, v_proj, gate_proj, o_proj
📊 Trainable parameters: 17,825,792
📊 Total parameters: 2,026,966,016
📊 Trainable percentage: 0.88%
📊 GPU memory allocated: 2.54 GB
📊 GPU memory cached: 3.36 GB

🚀 READY FOR PROGRESSIVE CURRICULUM TRAINING!
✅ Stable training configuration
⚡ Near Flash-Attention performance with full compatibility


In [None]:
# ============================================================================
# TOKEN-LEVEL EMPHASIS ON KEY FACTS AND FINAL ANSWERS (KPOD Implementation)
# ============================================================================

import torch
import torch.nn.functional as F
from transformers import DataCollatorForLanguageModeling
from typing import List, Dict, Any, Optional
import re
import numpy as np

class TokenEmphasisTrainer:
    """Trainer for applying token-level emphasis during loss computation.
    
    Implements KPOD-style emphasis on key facts and final answers.
    """
    
    def __init__(self, emphasis_multiplier=2.5, adaptive_emphasis=True):
        self.emphasis_multiplier = emphasis_multiplier
        self.adaptive_emphasis = adaptive_emphasis
        
        # Define emphasis patterns focused on key facts and final answers (from plan)
        self.emphasis_patterns = [
            # Final answer patterns (highest priority)
            r'The answer is \*\*(?:Yes|No)\*\*',      # "The answer is **Yes**"
            r'Therefore.*?the answer is \*\*(?:Yes|No)\*\*',  # Final conclusions
            r'Final answer:\s*\*\*(?:Yes|No)\*\*',    # "Final answer: **Yes**"
            
            # Key fact indicators (sub-question answers)
            r'Answer \d+:',                            # "Answer 1:", "Answer 2:" (key facts)
            r'Question \d+:',                          # "Question 1:", "Question 2:"
            
            # Reasoning markers (moderate emphasis)
            r'Therefore[,:]\s*',                       # "Therefore," or "Therefore:"
            r'Based on.*?analysis',                    # "Based on this analysis"
            r'In conclusion',                          # "In conclusion"
            r'Hence,',                                 # "Hence,"
            r'Thus,',                                  # "Thus,"
            
            # Evidence and fact patterns
            r'did not exist',                          # Factual negations
            r'were invented',                          # Historical facts
            r'is a type of',                          # Classification facts
            r'produces?\s+milk',                      # Biological facts
        ]
        
        # Statistics tracking for adaptive emphasis
        self.emphasis_stats = {
            'total_emphasized_tokens': 0,
            'emphasis_effectiveness': [],
            'batches_processed': 0
        }

    def compute_emphasis_weights(self, input_ids, tokenizer):
        """Compute emphasis weights for the batch."""
        
        batch_size, seq_len = input_ids.shape
        emphasis_weights = torch.ones_like(input_ids, dtype=torch.float)
        
        for batch_idx in range(batch_size):
            # Decode the sequence to text for pattern matching
            tokens = input_ids[batch_idx]
            
            # Skip padding tokens
            non_pad_mask = tokens != tokenizer.pad_token_id
            if not non_pad_mask.any():
                continue
                
            # Get actual sequence without padding
            actual_tokens = tokens[non_pad_mask]
            
            try:
                text = tokenizer.decode(actual_tokens, skip_special_tokens=False)
                
                # Find emphasis patterns and mark tokens
                for pattern_idx, pattern in enumerate(self.emphasis_patterns):
                    for match in re.finditer(pattern, text, re.IGNORECASE):
                        start_pos, end_pos = match.span()
                        
                        # Find token positions corresponding to the text span
                        start_token_idx = self._find_token_position(text, start_pos, actual_tokens)
                        end_token_idx = self._find_token_position(text, end_pos, actual_tokens)
                        
                        if start_token_idx is not None and end_token_idx is not None:
                            # Apply higher emphasis for final answer patterns
                            multiplier = self.emphasis_multiplier
                            if pattern_idx < 3:  # First 3 patterns are final answers
                                multiplier *= 1.2  # Extra emphasis for final answers
                            elif pattern_idx < 5:  # Next 2 are key facts (Answer/Question markers)
                                multiplier *= 1.1  # Moderate extra emphasis for key facts
                                
                            # Apply emphasis to the token range (within actual sequence)
                            actual_start = min(start_token_idx, len(actual_tokens) - 1)
                            actual_end = min(end_token_idx, len(actual_tokens))
                            
                            # Map back to original sequence indices
                            orig_indices = torch.where(non_pad_mask)[0]
                            if actual_start < len(orig_indices) and actual_end <= len(orig_indices):
                                orig_start = orig_indices[actual_start]
                                orig_end = orig_indices[min(actual_end, len(orig_indices) - 1)]
                                emphasis_weights[batch_idx, orig_start:orig_end + 1] = multiplier
                                
            except Exception as e:
                # Skip this batch item if decoding fails
                print(f"Warning: Could not apply emphasis to batch item {batch_idx}: {e}")
                continue
        
        return emphasis_weights
    
    def _find_token_position(self, text, char_pos, tokens):
        """Find the token index corresponding to a character position in text."""
        
        if char_pos >= len(text):
            return len(tokens) - 1
        
        # Proportional estimation (simplified but effective for emphasis)
        if len(text) > 0:
            token_ratio = char_pos / len(text)
            token_pos = int(token_ratio * len(tokens))
            return min(max(token_pos, 0), len(tokens) - 1)
        
        return 0

    def compute_emphasis_loss(self, outputs, labels, emphasis_weights=None):
        """Compute loss with token-level emphasis applied."""

        if emphasis_weights is None:
            # Standard cross-entropy loss
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            return loss

        # Apply token-level emphasis
        shift_logits = outputs.logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_weights = emphasis_weights[..., 1:].contiguous()

        # Compute per-token losses
        loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
        token_losses = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        token_losses = token_losses.view(shift_labels.shape)

        # Apply emphasis weights
        weighted_losses = token_losses * shift_weights

        # Mask out padding tokens (label = -100)
        mask = (shift_labels != -100).float()
        weighted_losses = weighted_losses * mask

        # Compute final loss
        total_loss = weighted_losses.sum()
        total_tokens = mask.sum()

        if total_tokens > 0:
            loss = total_loss / total_tokens
        else:
            loss = total_loss

        # Track emphasis statistics
        if self.adaptive_emphasis:
            self._update_emphasis_stats(shift_weights, mask)

        return loss

    def _update_emphasis_stats(self, emphasis_weights, mask):
        """Update statistics about emphasis effectiveness."""

        emphasized_tokens = ((emphasis_weights > 1.0) & (mask > 0)).sum().item()
        total_tokens = mask.sum().item()

        self.emphasis_stats['total_emphasized_tokens'] += emphasized_tokens
        self.emphasis_stats['batches_processed'] += 1

        if total_tokens > 0:
            emphasis_ratio = emphasized_tokens / total_tokens
            self.emphasis_stats['emphasis_effectiveness'].append(emphasis_ratio)

    def get_emphasis_report(self) -> Dict[str, Any]:
        """Generate a report on emphasis effectiveness."""

        if not self.emphasis_stats['emphasis_effectiveness']:
            return {
                'total_emphasized_tokens': self.emphasis_stats['total_emphasized_tokens'],
                'avg_emphasis_ratio': 0.0,
                'emphasis_std': 0.0,
                'emphasis_multiplier': self.emphasis_multiplier,
                'batches_processed': self.emphasis_stats['batches_processed']
            }

        effectiveness = self.emphasis_stats['emphasis_effectiveness']

        return {
            'total_emphasized_tokens': self.emphasis_stats['total_emphasized_tokens'],
            'avg_emphasis_ratio': np.mean(effectiveness),
            'emphasis_std': np.std(effectiveness),
            'emphasis_multiplier': self.emphasis_multiplier,
            'batches_processed': self.emphasis_stats['batches_processed']
        }


class EmphasisSFTTrainer(SFTTrainer):
    """Clean SFT Trainer with token-level emphasis for TRL 0.21.0"""

    def __init__(self, emphasis_trainer=None, **kwargs):
        # Use only TRL 0.21.0 supported parameters (no fallbacks, no if statements)
        super().__init__(
            model=kwargs['model'],
            args=kwargs['args'],
            train_dataset=kwargs['train_dataset'],
            processing_class=kwargs['processing_class']
        )
        
        self.emphasis_trainer = emphasis_trainer or TokenEmphasisTrainer()
        
        # Store tokenizer for emphasis computation
        self.tokenizer = kwargs['processing_class']

    def compute_loss(self, model, inputs, return_outputs=False):
        """Override loss computation to apply token emphasis."""
        labels = inputs.get("labels")
        input_ids = inputs.get("input_ids")

        # Forward pass
        outputs = model(**{k: v for k, v in inputs.items() if k not in ['emphasis_weights']})

        # Compute emphasis weights if we have tokenized inputs
        emphasis_weights = None
        if input_ids is not None and labels is not None:
            emphasis_weights = self.emphasis_trainer.compute_emphasis_weights(input_ids, self.tokenizer)

        # Compute emphasis-aware loss
        if labels is not None:
            loss = self.emphasis_trainer.compute_emphasis_loss(outputs, labels, emphasis_weights)
        else:
            loss = outputs.loss

        return (loss, outputs) if return_outputs else loss

print("✅ TOKEN-LEVEL EMPHASIS IMPLEMENTATION COMPLETE")
print("   🎯 Clean implementation for TRL 0.21.0")
print("   📈 Emphasis patterns: Final answers, sub-question answers, reasoning markers")  
print("   🔧 Compatible with: SFTTrainer's default data collator")
print("   📊 Adaptive tracking: Emphasis effectiveness monitoring")

In [4]:
# ============================================================================
# PROGRESSIVE CURRICULUM TRAINER (Two-Stage Implementation)
# ============================================================================

class ProgressiveCurriculumTrainer:
    """Implements two-stage progressive curriculum for Q&A-CoT training.
    
    Following the implementation plan:
    - Stage 1: Final reasoning only - teaches direct answer generation
    - Stage 2: Full CoT - teaches reasoning + answer generation
    
    This mirrors the human learning approach of starting with the end goal
    and then learning the process step-by-step.
    """

    def __init__(self, model, tokenizer, stage1_config, stage2_config):
        """Initialize trainer with model, tokenizer and stage configurations."""
        self.model = model
        self.tokenizer = tokenizer
        self.stage1_config = stage1_config
        self.stage2_config = stage2_config
        
        print(f"✅ Progressive Curriculum Trainer Initialized")
        print(f"   Stage 1: {stage1_config['description']}")
        print(f"   Stage 2: {stage2_config['description']}")

    def load_datasets(self) -> Tuple[Dataset, Dataset]:
        """Load Stage 1 and Stage 2 datasets from generated files."""
        
        print("\n📚 LOADING PROGRESSIVE CURRICULUM DATASETS")
        print("=" * 50)
        
        # Check if files exist
        if not os.path.exists(STAGE1_PATH):
            raise FileNotFoundError(f"Stage 1 dataset not found: {STAGE1_PATH}")
        if not os.path.exists(STAGE2_PATH):
            raise FileNotFoundError(f"Stage 2 dataset not found: {STAGE2_PATH}")
        
        # Load Stage 1 dataset (final reasoning focus)
        print(f"📖 Loading Stage 1 dataset: {STAGE1_PATH}")
        stage1_data = []
        with open(STAGE1_PATH, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    record = json.loads(line)
                    stage1_data.append(record)
        
        # Load Stage 2 dataset (complete Q&A)
        print(f"📖 Loading Stage 2 dataset: {STAGE2_PATH}")
        stage2_data = []
        with open(STAGE2_PATH, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    record = json.loads(line)
                    stage2_data.append(record)
        
        print(f"✅ Loaded {len(stage1_data)} Stage 1 examples")
        print(f"✅ Loaded {len(stage2_data)} Stage 2 examples")
        
        # Convert to HuggingFace datasets with proper format for SFTTrainer
        stage1_dataset = self._prepare_sft_dataset(stage1_data, "Stage 1")
        stage2_dataset = self._prepare_sft_dataset(stage2_data, "Stage 2")
        
        return stage1_dataset, stage2_dataset
    
    def _prepare_sft_dataset(self, records: List[Dict], stage_name: str) -> Dataset:
        """Convert records to SFTTrainer-compatible format.
        
        The records from create_corpora.ipynb have:
        - 'prompt': The question 
        - 'answer': The complete reasoning + final answer
        
        For SFTTrainer, we need 'text' field with the full instruction-response format.
        """
        
        formatted_data = []
        for record in records:
            # Get the prompt (question) and answer (reasoning + final answer)
            prompt = record.get('prompt', '')
            answer = record.get('answer', '')
            
            # Format for instruction tuning: Question + Response
            # This follows the plan's format where question is input and reasoning+answer is output
            full_text = f"{prompt}\n\n{answer}"
            
            formatted_data.append({
                'text': full_text,
                'original_prompt': prompt,
                'original_answer': answer,
                'stage': record.get('stage', stage_name.lower().replace(' ', '_')),
                'validation_metadata': record.get('validation_metadata', {})
            })
        
        dataset = Dataset.from_list(formatted_data)
        print(f"✅ {stage_name} dataset prepared: {len(dataset)} examples (SFT format)")
        
        return dataset

    def train_progressive_curriculum(self, stage1_dataset: Dataset, stage2_dataset: Dataset) -> Dict[str, Any]:
        """Execute the full progressive curriculum training."""
        
        print("\n🎓 STARTING PROGRESSIVE CURRICULUM TRAINING")
        print("=" * 60)
        print("📋 Following Implementation Plan Two-Stage Approach:")
        print("   Stage 1: Train on final reasoning to learn answer generation")
        print("   Stage 2: Train on full Q&A to learn questioning process")
        print("=" * 60)

        training_results = {}

        # Stage 1: Final Reasoning Training
        print(f"\n📚 STAGE 1: {self.stage1_config['name']}")
        print(f"Goal: {self.stage1_config['description']}")
        print(f"Emphasis: {self.stage1_config['emphasis_multiplier']}x weight on key tokens")
        
        stage1_model, stage1_metrics = self._train_stage(
            dataset=stage1_dataset,
            stage_config=self.stage1_config,
            output_dir=STAGE1_OUTPUT_DIR,
            stage_name="stage_1"
        )

        training_results['stage_1'] = {
            'config': self.stage1_config,
            'metrics': stage1_metrics,
            'dataset_size': len(stage1_dataset),
            'output_dir': STAGE1_OUTPUT_DIR
        }

        # Stage 2: Full Chain-of-Thought Training (continue from Stage 1)
        print(f"\n🧠 STAGE 2: {self.stage2_config['name']}")
        print(f"Goal: {self.stage2_config['description']}")
        print(f"Emphasis: {self.stage2_config['emphasis_multiplier']}x weight on key tokens")
        print("Starting from Stage 1 trained model...")

        stage2_model, stage2_metrics = self._train_stage(
            dataset=stage2_dataset,
            stage_config=self.stage2_config,
            output_dir=STAGE2_OUTPUT_DIR,
            stage_name="stage_2",
            base_model=stage1_model  # Continue from stage 1 model
        )

        training_results['stage_2'] = {
            'config': self.stage2_config,
            'metrics': stage2_metrics,
            'dataset_size': len(stage2_dataset),
            'output_dir': STAGE2_OUTPUT_DIR
        }

        # Compile final results
        final_results = {
            'curriculum_type': 'progressive_two_stage',
            'implementation_plan_followed': True,
            'final_model': stage2_model,
            'final_model_path': STAGE2_OUTPUT_DIR,
            'stage_results': training_results,
            'curriculum_summary': {
                'stage_1_examples': len(stage1_dataset),
                'stage_2_examples': len(stage2_dataset),
                'total_epochs': self.stage1_config['epochs'] + self.stage2_config['epochs'],
                'emphasis_progression': f"{self.stage1_config['emphasis_multiplier']}x → {self.stage2_config['emphasis_multiplier']}x"
            }
        }

        print("\n✅ PROGRESSIVE CURRICULUM TRAINING COMPLETE!")
        print(f"📈 Two-stage training following implementation plan")
        print(f"📊 Stage 1: {len(stage1_dataset)} examples, {self.stage1_config['epochs']} epochs")
        print(f"📊 Stage 2: {len(stage2_dataset)} examples, {self.stage2_config['epochs']} epochs")
        print(f"🎯 Final model ready for Q&A-CoT self-questioning")
        print(f"💾 Model saved to: {STAGE2_OUTPUT_DIR}")

        return final_results

    def _train_stage(self, dataset: Dataset, stage_config: Dict, output_dir: str, 
                    stage_name: str, base_model=None) -> Tuple[Any, List]:
        """Train a single curriculum stage with token emphasis."""
        
        # Use base model if provided (Stage 2), otherwise use original model (Stage 1)
        model_to_train = base_model if base_model is not None else self.model
        
        # Create token emphasis trainer for this stage
        emphasis_trainer = TokenEmphasisTrainer(
            emphasis_multiplier=stage_config['emphasis_multiplier'],
            adaptive_emphasis=True
        )
        
        # Training arguments following the implementation plan
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=stage_config['epochs'],
            per_device_train_batch_size=stage_config['batch_size'],
            gradient_accumulation_steps=stage_config['gradient_accumulation_steps'],
            learning_rate=stage_config['learning_rate'],  # Plan specifies 2e-4 for LoRA
            warmup_ratio=stage_config['warmup_ratio'],
            weight_decay=stage_config['weight_decay'],
            logging_steps=10,
            save_strategy="epoch",
            eval_strategy="no",  # No validation for curriculum stages per plan
            fp16=True,
            dataloader_drop_last=False,
            remove_unused_columns=False,
            load_best_model_at_end=False,
            report_to=None,  # Disable wandb/tensorboard per plan
            gradient_checkpointing=True,  # Plan mentions this for memory efficiency
            max_grad_norm=1.0  # Plan mentions gradient clipping at 1.0
        )

        # Create emphasis-aware SFT trainer
        trainer = EmphasisSFTTrainer(
            model=model_to_train,
            args=training_args,
            train_dataset=dataset,
            processing_class=self.tokenizer,  # Updated for compatibility
            emphasis_trainer=emphasis_trainer,
            max_seq_length=MAX_SEQ_LENGTH,
            packing=False,  # Don't pack sequences to avoid confusion
            dataset_text_field="text"
        )

        print(f"🚀 Training {stage_name}...")
        print(f"   📊 Dataset size: {len(dataset)}")
        print(f"   🎯 Epochs: {stage_config['epochs']}")
        print(f"   📈 Learning rate: {stage_config['learning_rate']}")
        print(f"   🎚️  Token emphasis: {stage_config['emphasis_multiplier']}x")
        print(f"   💾 Output directory: {output_dir}")

        # Execute training
        try:
            trainer.train()
            
            # Save the trained model and tokenizer
            trainer.save_model()
            self.tokenizer.save_pretrained(output_dir)
            
            print(f"✅ {stage_name} training completed successfully!")
            
            # Get emphasis effectiveness report
            emphasis_report = emphasis_trainer.get_emphasis_report()
            print(f"📊 Token emphasis effectiveness:")
            print(f"   🎯 Total emphasized tokens: {emphasis_report.get('total_emphasized_tokens', 0)}")
            print(f"   📈 Average emphasis ratio: {emphasis_report.get('avg_emphasis_ratio', 0):.3f}")
            print(f"   📊 Batches processed: {emphasis_report.get('batches_processed', 0)}")
            
            return trainer.model, trainer.state.log_history
            
        except Exception as e:
            print(f"❌ Error during {stage_name} training: {e}")
            return model_to_train, []

# Initialize the Progressive Curriculum Trainer
curriculum_trainer = ProgressiveCurriculumTrainer(
    model=model,
    tokenizer=tokenizer,
    stage1_config=STAGE1_CONFIG,
    stage2_config=STAGE2_CONFIG
)

print("🎓 Progressive Curriculum Trainer ready for two-stage training!")

✅ Progressive Curriculum Trainer Initialized
   Stage 1: Train model to generate final answer directly after "Therefore"
   Stage 2: Train model to generate step-by-step Q&A reasoning + answer
🎓 Progressive Curriculum Trainer ready for two-stage training!


In [5]:
# Cell 5: Execute Progressive Curriculum Training
# Following Implementation Plan: Load datasets and execute two-stage training

def main():
    """Execute the complete progressive curriculum training pipeline."""
    print("🚀 Starting Progressive Curriculum Training")
    print("=" * 60)
    
    # Load datasets using the trainer's method
    print("📚 Loading datasets...")
    try:
        stage1_dataset, stage2_dataset = curriculum_trainer.load_datasets()
        print(f"✅ Datasets loaded successfully!")
        print(f"📊 Stage 1 dataset size: {len(stage1_dataset)} examples")
        print(f"📊 Stage 2 dataset size: {len(stage2_dataset)} examples")
    except Exception as e:
        print(f"❌ Failed to load datasets: {str(e)}")
        raise
    
    # Execute progressive curriculum training
    print("\\n🎓 Starting Progressive Curriculum Training...")
    print("Following Implementation Plan: Two-stage approach")
    
    try:
        training_results = curriculum_trainer.train_progressive_curriculum(
            stage1_dataset=stage1_dataset,
            stage2_dataset=stage2_dataset
        )
        
        print("\\n🎉 Progressive Curriculum Training Complete!")
        print("=" * 60)
        
        # Print results summary
        print("📈 TRAINING RESULTS SUMMARY:")
        print(f"📊 Final model path: {training_results['final_model_path']}")
        print(f"📊 Stage 1 examples: {training_results['curriculum_summary']['stage_1_examples']}")
        print(f"📊 Stage 2 examples: {training_results['curriculum_summary']['stage_2_examples']}")
        print(f"📊 Total epochs: {training_results['curriculum_summary']['total_epochs']}")
        print(f"📊 Token emphasis progression: {training_results['curriculum_summary']['emphasis_progression']}")
        
        # Print stage-specific results
        if 'stage_1' in training_results['stage_results']:
            stage1_results = training_results['stage_results']['stage_1']
            print(f"\\n📚 STAGE 1 RESULTS:")
            print(f"   Output: {stage1_results['output_dir']}")
            print(f"   Dataset: {stage1_results['dataset_size']} examples")
            
        if 'stage_2' in training_results['stage_results']:
            stage2_results = training_results['stage_results']['stage_2']
            print(f"\\n🧠 STAGE 2 RESULTS:")
            print(f"   Output: {stage2_results['output_dir']}")
            print(f"   Dataset: {stage2_results['dataset_size']} examples")
        
        return training_results
        
    except Exception as e:
        print(f"❌ Training failed during execution: {str(e)}")
        raise

# Execute the training pipeline
if __name__ == "__main__":
    try:
        print("🔧 Verifying setup...")
        print(f"✅ Model loaded: {model is not None}")
        print(f"✅ Tokenizer loaded: {tokenizer is not None}")
        print(f"✅ Curriculum trainer initialized: {curriculum_trainer is not None}")
        print(f"✅ Stage 1 config: {STAGE1_CONFIG['name']}")
        print(f"✅ Stage 2 config: {STAGE2_CONFIG['name']}")
        
        # Execute the main training function
        results = main()
        
        print("\\n🎯 PROGRESSIVE CURRICULUM TRAINING SUCCESSFUL!")
        print("The model is now ready for Q&A-CoT self-questioning tasks.")
        
    except Exception as e:
        print(f"\\n❌ Training pipeline failed with error: {str(e)}")
        import traceback
        print("\\nFull traceback:")
        traceback.print_exc()
        
        print("\\nDebugging information:")
        print(f"  - Model loaded: {model is not None}")
        print(f"  - Tokenizer loaded: {tokenizer is not None}")
        print(f"  - Curriculum trainer: {curriculum_trainer is not None}")
        print(f"  - Stage 1 config: {STAGE1_CONFIG}")
        print(f"  - Stage 2 config: {STAGE2_CONFIG}")
        raise

🔧 Verifying setup...
✅ Model loaded: True
✅ Tokenizer loaded: True
✅ Curriculum trainer initialized: True
✅ Stage 1 config: Final Reasoning Training
✅ Stage 2 config: Full Chain-of-Thought Training
🚀 Starting Progressive Curriculum Training
📚 Loading datasets...

📚 LOADING PROGRESSIVE CURRICULUM DATASETS
📖 Loading Stage 1 dataset: c:\Users\noham\Desktop\Self-Improving-LLM\data\train\stage1_train.jsonl
📖 Loading Stage 2 dataset: c:\Users\noham\Desktop\Self-Improving-LLM\data\train\stage2_train.jsonl
✅ Loaded 200 Stage 1 examples
✅ Loaded 200 Stage 2 examples
✅ Stage 1 dataset prepared: 200 examples (SFT format)
✅ Stage 2 dataset prepared: 200 examples (SFT format)
✅ Datasets loaded successfully!
📊 Stage 1 dataset size: 200 examples
📊 Stage 2 dataset size: 200 examples
\n🎓 Starting Progressive Curriculum Training...
Following Implementation Plan: Two-stage approach

🎓 STARTING PROGRESSIVE CURRICULUM TRAINING
📋 Following Implementation Plan Two-Stage Approach:
   Stage 1: Train on final r

Traceback (most recent call last):
  File "C:\Users\noham\AppData\Local\Temp\ipykernel_36432\2565856937.py", line 257, in __init__
    super().__init__(*args, **filtered_kwargs)
TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'tokenizer'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\noham\AppData\Local\Temp\ipykernel_36432\3740181224.py", line 71, in <module>
    results = main()
              ^^^^^^
  File "C:\Users\noham\AppData\Local\Temp\ipykernel_36432\3740181224.py", line 25, in main
    training_results = curriculum_trainer.train_progressive_curriculum(
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\noham\AppData\Local\Temp\ipykernel_36432\2147114666.py", line 118, in train_progressive_curriculum
    stage1_model, stage1_metrics = self._train_stage(
                                   ^^^^^^^^^^^^^^^^^^
  File "C:\Users\noham\AppData\Local\Temp

TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'tokenizer'