In [1]:
# Import all necessary libraries for reward modeling
import torch
import os
from datasets import load_dataset
import transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    BitsAndBytesConfig
)
from trl import RewardTrainer
from peft import PeftModel
import bitsandbytes as bnb
import gc

print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"BitsAndBytes version: {bnb.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")



All libraries imported successfully!
PyTorch version: 2.7.0+cu118
Transformers version: 4.53.0
BitsAndBytes version: 0.46.0
CUDA available: True
GPU: NVIDIA GeForce RTX 4060 Laptop GPU
GPU Memory: 8.0 GB


In [2]:
# Central Configuration for all reward modeling experiments
print("Setting up experimental configuration...")

# Model and dataset paths
sft_model_path = './models/sft'          # Path to our trained SFT adapters
dataset_path = './data/train_prefs.jsonl'  # Preference dataset
base_rm_output_dir = './models/rm'       # Base output directory for reward models

# Precision levels to experiment with
precisions_to_run = ['bf16', 'int8', 'int4']

# Shared training arguments for all experiments
shared_training_args = {
    'per_device_train_batch_size': 4,     # Batch size per device
    'gradient_accumulation_steps': 4,     # Effective batch size = 4*4 = 16
    'num_train_epochs': 1,                # Number of training epochs
    'learning_rate': 2e-4,                # Learning rate for reward model training
    'logging_steps': 10,                  # Log every 10 steps
    'bf16': True,                         # Use BF16 for training efficiency
    'save_strategy': 'epoch',             # Save at the end of each epoch
    'evaluation_strategy': 'no',          # No evaluation during training
    'remove_unused_columns': False,       # Keep all columns
    'push_to_hub': False,                 # Don't push to HF Hub
    'report_to': None,                    # Disable logging to wandb/tensorboard
    'dataloader_pin_memory': False,       # Reduce memory usage
    'gradient_checkpointing': True,       # Trade compute for memory
}

print("Configuration:")
print(f"  SFT model path: {sft_model_path}")
print(f"  Dataset path: {dataset_path}")
print(f"  Base RM output dir: {base_rm_output_dir}")
print(f"  Precisions to test: {precisions_to_run}")
print(f"  Shared training args: {shared_training_args}")

# Ensure base output directory exists
os.makedirs(base_rm_output_dir, exist_ok=True)
print(f"✅ Configuration complete!")


Setting up experimental configuration...
Configuration:
  SFT model path: ./models/sft
  Dataset path: ./data/train_prefs.jsonl
  Base RM output dir: ./models/rm
  Precisions to test: ['bf16', 'int8', 'int4']
  Shared training args: {'per_device_train_batch_size': 4, 'gradient_accumulation_steps': 4, 'num_train_epochs': 1, 'learning_rate': 0.0002, 'logging_steps': 10, 'bf16': True, 'save_strategy': 'epoch', 'evaluation_strategy': 'no', 'remove_unused_columns': False, 'push_to_hub': False, 'report_to': None, 'dataloader_pin_memory': False, 'gradient_checkpointing': True}
✅ Configuration complete!


In [3]:
# Load and tokenize the preference dataset
print("Loading and preparing preference dataset...")

# Load the preference dataset
preference_dataset = load_dataset('json', data_files=dataset_path)['train']
print(f"Loaded {len(preference_dataset)} preference pairs")

# Load tokenizer from the SFT model path
tokenizer = AutoTokenizer.from_pretrained(sft_model_path)

# Critical: Set pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("✓ Pad token set to EOS token")

print(f"Tokenizer loaded - Vocab size: {len(tokenizer)}")

def tokenize_pairs(example):
    """
    Tokenize chosen and rejected responses for RewardTrainer.
    
    Returns dictionary with keys expected by RewardTrainer:
    - input_ids_chosen, attention_mask_chosen
    - input_ids_rejected, attention_mask_rejected
    """
    # Format the texts for reward modeling
    chosen_text = f"### Human:\n{example['prompt']}\n\n### Assistant:\n{example['chosen']}"
    rejected_text = f"### Human:\n{example['prompt']}\n\n### Assistant:\n{example['rejected']}"
    
    # Tokenize chosen response
    chosen_tokens = tokenizer(
        chosen_text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors=None  # Return lists, not tensors
    )
    
    # Tokenize rejected response
    rejected_tokens = tokenizer(
        rejected_text,
        truncation=True,
        padding="max_length", 
        max_length=512,
        return_tensors=None  # Return lists, not tensors
    )
    
    return {
        "input_ids_chosen": chosen_tokens["input_ids"],
        "attention_mask_chosen": chosen_tokens["attention_mask"],
        "input_ids_rejected": rejected_tokens["input_ids"],
        "attention_mask_rejected": rejected_tokens["attention_mask"],
    }

# Apply tokenization to the entire dataset
print("Tokenizing preference pairs...")
tokenized_dataset = preference_dataset.map(
    tokenize_pairs,
    batched=False,
    desc="Tokenizing preference pairs"
)

print(f"Tokenization complete!")
print(f"Dataset keys: {list(tokenized_dataset[0].keys())}")
print(f"Sample chosen length: {len(tokenized_dataset[0]['input_ids_chosen'])}")
print(f"Sample rejected length: {len(tokenized_dataset[0]['input_ids_rejected'])}")

# Take a subset for faster training (optional)
# tokenized_dataset = tokenized_dataset.select(range(min(1000, len(tokenized_dataset))))
print(f"Final dataset size: {len(tokenized_dataset)} examples")


Loading and preparing preference dataset...
Loaded 5000 preference pairs
✓ Pad token set to EOS token
Tokenizer loaded - Vocab size: 50257
Tokenizing preference pairs...
Tokenization complete!
Dataset keys: ['chosen', 'rejected', 'prompt', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected']
Sample chosen length: 512
Sample rejected length: 512
Final dataset size: 5000 examples


In [4]:
# Reward Model Factory Function
def create_reward_model(model_path, precision):
    """
    Create a reward model with specified precision/quantization.
    
    Args:
        model_path: Path to the SFT model
        precision: One of 'bf16', 'int8', 'int4'
        
    Returns:
        reward_model: The reward model for training
        tokenizer: Associated tokenizer
    """
    print(f"Creating {precision} reward model...")
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # Define model loading kwargs based on precision
    if precision == 'bf16':
        print("  Using BF16 precision")
        model_kwargs = {
            'torch_dtype': torch.bfloat16,
            'device_map': 'auto'
        }
        
    elif precision == 'int8':
        print("  Using 8-bit quantization")
        model_kwargs = {
            'load_in_8bit': True,
            'device_map': 'auto'
        }
        
    elif precision == 'int4':
        print("  Using 4-bit QLoRA quantization")
        # Configure 4-bit quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",                # Use NormalFloat4 quantization
            bnb_4bit_compute_dtype=torch.bfloat16,    # Compute in BF16
            bnb_4bit_use_double_quant=True,           # Double quantization for better compression
        )
        model_kwargs = {
            'quantization_config': bnb_config,
            'device_map': 'auto'
        }
        
    else:
        raise ValueError(f"Unsupported precision: {precision}")
    
    # Load the base model (this will be our SFT model)
    try:
        # First try to load as a PEFT model (if it has adapters)
        base_model = AutoModelForCausalLM.from_pretrained(
            'distilgpt2',  # Base model name
            **model_kwargs
        )
        
        # Load PEFT adapters if they exist
        if os.path.exists(os.path.join(model_path, 'adapter_config.json')):
            print("  Loading PEFT adapters...")
            model = PeftModel.from_pretrained(base_model, model_path)
            # Merge adapters for reward modeling
            model = model.merge_and_unload()
        else:
            print("  No PEFT adapters found, using base model")
            model = base_model
            
    except Exception as e:
        print(f"  Error loading model: {e}")
        print("  Falling back to direct loading...")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            **model_kwargs
        )
    
    # Create reward model by adding a classification head
    print("  Creating reward model with classification head...")
    reward_model = AutoModelForSequenceClassification.from_pretrained(
        None,  # Don't load from pretrained
        config=model.config,
        num_labels=1,  # Single reward score
        torch_dtype=model.dtype if hasattr(model, 'dtype') else torch.bfloat16
    )
    
    # Copy the transformer layers from our fine-tuned model
    if hasattr(model, 'transformer'):
        reward_model.transformer = model.transformer
    elif hasattr(model, 'model'):
        reward_model.model = model.model
    else:
        # Copy all non-classifier parameters
        for name, param in model.named_parameters():
            if hasattr(reward_model, name.split('.')[0]):
                target = reward_model
                for attr in name.split('.'):
                    if hasattr(target, attr):
                        target = getattr(target, attr)
                    else:
                        break
                else:
                    target.data = param.data
    
    print(f"  ✅ {precision} reward model created successfully!")
    return reward_model, tokenizer

# Test the factory function
print("Testing reward model factory...")
test_model, test_tokenizer = create_reward_model(sft_model_path, 'bf16')
print(f"Test model parameters: {test_model.num_parameters():,}")

# Clean up test model
del test_model, test_tokenizer
torch.cuda.empty_cache()
print("✅ Factory function test complete!")


Testing reward model factory...
Creating bf16 reward model...
  Using BF16 precision
  Loading PEFT adapters...
  Creating reward model with classification head...


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

In [None]:
# Experiment Execution Loop - Train Multiple Reward Models
print("🚀 Starting reward model training experiments...")
print(f"Will train {len(precisions_to_run)} different precision models")
print("=" * 60)

# Track results for summary
training_results = {}

for i, precision in enumerate(precisions_to_run):
    print(f"\n{'='*20} Experiment {i+1}/{len(precisions_to_run)} {'='*20}")
    print(f"--- Starting {precision.upper()} Reward Model Training ---")
    
    try:
        # Create reward model and tokenizer for current precision
        model, tokenizer = create_reward_model(sft_model_path, precision)
        
        # Define output directory for this specific precision
        output_dir = os.path.join(base_rm_output_dir, precision)
        os.makedirs(output_dir, exist_ok=True)
        
        # Create training arguments for this run
        training_args = TrainingArguments(
            output_dir=output_dir,
            **shared_training_args  # Unpack shared arguments
        )
        
        print(f"  Output directory: {output_dir}")
        print(f"  Model parameters: {model.num_parameters():,}")
        
        # Initialize RewardTrainer
        print("  Initializing RewardTrainer...")
        trainer = RewardTrainer(
            model=model,
            args=training_args,
            tokenizer=tokenizer,
            train_dataset=tokenized_dataset,
            # eval_dataset=None,  # No evaluation for now
        )
        
        print(f"  Starting training for {precision} model...")
        # Start training
        training_result = trainer.train()
        
        # Save the trained model
        print(f"  Saving {precision} model...")
        trainer.save_model()
        
        # Store results
        training_results[precision] = {
            'status': 'success',
            'final_loss': training_result.training_loss if hasattr(training_result, 'training_loss') else 'N/A',
            'output_dir': output_dir,
            'model_size_mb': sum(os.path.getsize(os.path.join(output_dir, f)) 
                                for f in os.listdir(output_dir) 
                                if os.path.isfile(os.path.join(output_dir, f))) / (1024*1024)
        }
        
        print(f"  ✅ {precision.upper()} model training completed successfully!")
        print(f"  Final loss: {training_results[precision]['final_loss']}")
        
    except Exception as e:
        print(f"  ❌ Error training {precision} model: {str(e)}")
        training_results[precision] = {
            'status': 'failed',
            'error': str(e),
            'output_dir': output_dir if 'output_dir' in locals() else 'N/A'
        }
    
    finally:
        # CRITICAL: Clean up GPU memory before next iteration
        # This prevents CUDA out of memory errors when switching between quantization levels
        print(f"  🧹 Cleaning up GPU memory after {precision} training...")
        if 'model' in locals():
            del model
        if 'trainer' in locals():
            del trainer
        if 'tokenizer' in locals():
            del tokenizer
        
        # Force garbage collection and clear CUDA cache
        gc.collect()
        torch.cuda.empty_cache()
        
        print(f"  Memory cleaned up for {precision} model")

print("\n" + "="*60)
print("🎉 All reward model experiments completed!")

# Print summary of results
print("\n📊 TRAINING SUMMARY:")
print("-" * 40)
for precision, result in training_results.items():
    status_emoji = "✅" if result['status'] == 'success' else "❌"
    print(f"{status_emoji} {precision.upper():6} | Status: {result['status']:8}")
    
    if result['status'] == 'success':
        print(f"        | Loss: {result['final_loss']}")
        print(f"        | Size: {result['model_size_mb']:.1f} MB")
        print(f"        | Path: {result['output_dir']}")
    else:
        print(f"        | Error: {result['error']}")
    print()

print(f"🎯 Your reward models are ready in: {base_rm_output_dir}")
print("Next step: Use these models for PPO training or inference!")
