<a href="https://colab.research.google.com/github/codelion/icm/blob/main/scripts/train_reward_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a Reward Model with ICM Dataset

This notebook demonstrates how to train a reward model using the dataset generated by Internal Coherence Maximization (ICM). The dataset contains instruction-output pairs where outputs are binary (True/False) labels, making it ideal for reward modeling.

**Dataset:** `codelion/Qwen3-0.6B-icm`  
**Base Model:** `Qwen/Qwen3-0.6B`  
**Output Model:** `codelion/Qwen3-0.6B-icm-rm`

## Prerequisites
- Set your Hugging Face token as `HF_TOKEN` in Colab secrets (the token needs **write** access, because the final step uploads the trained model to the Hub)
- GPU runtime recommended for training

## Setup and Installation

In [None]:
# Install required packages
!pip install -q transformers>=4.36.0 \
    datasets>=2.14.0 \
    torch>=2.0.0 \
    accelerate>=0.24.0 \
    peft>=0.6.0 \
    trl>=0.7.0 \
    huggingface_hub>=0.19.0 \
    wandb \
    scipy \
    scikit-learn

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from huggingface_hub import login, HfApi
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import wandb
from google.colab import userdata
import warnings
warnings.filterwarnings("ignore")

# Authenticate against the Hugging Face Hub with the token kept in Colab secrets
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

# Report which accelerator is in use; without CUDA the second line is empty
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name()
    memory_line = f"📊 Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
else:
    device_label = 'CPU'
    memory_line = ""
print(f"🚀 Using device: {device_label}")
print(memory_line)

## Data Loading and Preprocessing

In [None]:
# Fetch the ICM dataset from the Hugging Face Hub
print("📁 Loading ICM dataset...")
dataset = load_dataset("codelion/Qwen3-0.6B-icm")
train_split = dataset['train']

print(f"Dataset structure: {dataset}")
print(f"Number of examples: {len(train_split)}")

# Show the first record (instruction cut to its first 200 characters)
sample = train_split[0]
print("\n📝 Sample data:")
print(f"Instruction: {sample['instruction'][:200]}...")
print(f"Output: {sample['output']}")

# Tally how often each output value occurs and print its share of the data
labels = [row['output'] for row in train_split]
label_counts = pd.Series(labels).value_counts()
total_labels = len(labels)
print(f"\n📊 Label distribution:")
for label, count in label_counts.items():
    print(f"  {label}: {count} ({count/total_labels*100:.1f}%)")

In [None]:
# Convert labels to binary (0 for False, 1 for True)
def preprocess_data(examples):
    """Turn a batch of ICM records into classifier inputs.

    Args:
        examples: Batch dict with an ``'instruction'`` list and an
            ``'output'`` list of equal length. Outputs may be strings such as
            ``"True"`` / ``" false "`` or non-string values such as booleans.

    Returns:
        Dict with ``'text'`` (the instructions, unchanged) and ``'labels'``
        (1 for true, 0 for false; one entry per output).
    """
    label_ids = {'true': 1, 'false': 0}
    labels = []
    for output in examples['output']:
        # str() first so bool/None values are handled instead of crashing on
        # .strip(); normalise once rather than per comparison.
        key = str(output).strip().lower()
        if key not in label_ids:
            # Deliberate best-effort: report the value and treat it as False
            print(f"⚠️ Unexpected output: {output}, defaulting to 0")
        labels.append(label_ids.get(key, 0))

    return {
        'text': examples['instruction'],
        'labels': labels
    }

# Run the label conversion over every split; the original columns are dropped
# so only what preprocess_data returns remains
original_columns = dataset['train'].column_names
processed_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=original_columns,
)

first_processed = processed_dataset['train'][0]
print(f"✅ Preprocessed dataset: {processed_dataset}")
print(f"Sample processed item: {first_processed}")

In [ ]:
from datasets import ClassLabel

# Cast the integer labels to a ClassLabel feature so the split can stratify on them
processed_dataset = processed_dataset.cast_column("labels", ClassLabel(names=["False", "True"]))

# Hold out 20% of the examples for validation, stratified by label
train_test_split = processed_dataset['train'].train_test_split(
    test_size=0.2,
    stratify_by_column='labels',
    seed=42,
)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

print(f"📊 Training examples: {len(train_dataset)}")
print(f"📊 Validation examples: {len(eval_dataset)}")

# Label counts per split, used to eyeball the class balance
train_labels = pd.Series(train_dataset['labels']).value_counts()
eval_labels = pd.Series(eval_dataset['labels']).value_counts()

for heading, counts, split in (
    ("Training", train_labels, train_dataset),
    ("Validation", eval_labels, eval_dataset),
):
    print(f"\n📈 {heading} set label distribution:")
    for label, count in counts.items():
        print(f"  {label}: {count} ({count/len(split)*100:.1f}%)")

## Model Setup and Tokenization

In [None]:
# Names of the base checkpoint and the Hub repo to publish to, plus the
# maximum tokenized sequence length
MODEL_NAME = "Qwen/Qwen3-0.6B"
OUTPUT_MODEL_NAME = "codelion/Qwen3-0.6B-icm-rm"
MAX_LENGTH = 512

print(f"🔤 Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Fall back to the EOS token when the tokenizer defines no padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("✅ Set pad_token to eos_token")

for caption, value in (
    ("Tokenizer vocab size", len(tokenizer)),
    ("Pad token", tokenizer.pad_token),
    ("EOS token", tokenizer.eos_token),
):
    print(f"{caption}: {value}")

In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of texts and carry their labels along.

    Reads ``examples['text']`` and ``examples['labels']``; sequences are
    truncated to ``MAX_LENGTH`` and left unpadded.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False,  # padding is applied later, per batch
        return_tensors=None,
    )
    encoded['labels'] = examples['labels']
    return encoded

# Tokenize both splits the same way, dropping their pre-tokenization columns
print("🔤 Tokenizing datasets...")
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
)

print(f"✅ Tokenized training examples: {len(tokenized_train)}")
print(f"✅ Tokenized validation examples: {len(tokenized_eval)}")

# Spot-check the first tokenized training example
sample_tokenized = tokenized_train[0]
print(f"\n📝 Sample tokenized input length: {len(sample_tokenized['input_ids'])}")
print(f"📝 Sample label: {sample_tokenized['labels']}")

In [None]:
# Load model for sequence classification (reward modeling)
print(f"🤖 Loading model: {MODEL_NAME}")
# Keep the weights in float32. Mixed precision is requested from the Trainer
# via `fp16=True`; if the weights themselves are float16, the AMP gradient
# scaler fails with "Attempting to unscale FP16 gradients".
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,  # Binary classification (False=0, True=1)
    torch_dtype=torch.float32,
    device_map="auto" if torch.cuda.is_available() else None
)

# Only grow the embedding matrix when the tokenizer has more tokens than the
# model. NOTE(review): checkpoints may ship an embedding matrix padded beyond
# the tokenizer size; a `!=` check would shrink it for no benefit — confirm.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))
    print(f"✅ Resized token embeddings to {len(tokenizer)}")

# The classification head needs pad_token_id to find the last real token of
# each padded sequence
model.config.pad_token_id = tokenizer.pad_token_id

first_param = next(model.parameters())
print(f"Model device: {first_param.device}")
print(f"Model dtype: {first_param.dtype}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")

## Training Configuration

In [None]:
# Training arguments
import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name is no longer accepted there; use whichever name
# the installed version exposes.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./qwen3-reward-model",

    # Training hyperparameters
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,

    # Optimization
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,

    # Evaluation and saving (same cadence, so every checkpoint has a score
    # for best-model selection)
    **{_eval_strategy_key: "steps"},
    eval_steps=250,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=3,

    # Logging
    logging_steps=50,
    logging_dir="./logs",

    # Performance
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,

    # Early stopping
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,

    # Reproducibility
    seed=42,

    # Disable wandb for now (can be enabled if needed)
    report_to=[],

    # Push to hub
    push_to_hub=False,  # We'll do this manually
)

print("✅ Training arguments configured")
print(f"📊 Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
# Rough estimate: floors the per-epoch step count and assumes a single device
print(f"🔄 Total training steps: {len(tokenized_train) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")

In [None]:
# Collator that pads each batch and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors="pt")


# Metrics reported at every evaluation
def compute_metrics(eval_pred):
    """Score predictions against the reference labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``accuracy`` plus weighted-average ``f1``, ``precision``
        and ``recall``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted, average='weighted'
    )
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


print("✅ Data collator and metrics configured")

## Model Training

In [None]:
# Create trainer
import inspect

# Newer transformers releases deprecate Trainer's `tokenizer` argument in
# favour of `processing_class`; pass the tokenizer under whichever name the
# installed version accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop after 3 consecutive evaluations without a better eval_accuracy
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    **{_tokenizer_kwarg: tokenizer},
)

print("✅ Trainer created")
print(f"📊 Training dataset size: {len(trainer.train_dataset)}")
print(f"📊 Evaluation dataset size: {len(trainer.eval_dataset)}")

In [None]:
# Kick off fine-tuning and summarise the run afterwards
print("🚀 Starting training...")
print("This may take 30-60 minutes depending on your GPU.")

training_result = trainer.train()
train_metrics = training_result.metrics

print("\n✅ Training completed!")
print(f"📊 Final training loss: {training_result.training_loss:.4f}")
print(f"🕒 Training time: {train_metrics['train_runtime']:.2f} seconds")
print(f"⚡ Training samples per second: {train_metrics['train_samples_per_second']:.2f}")

## Model Evaluation

In [None]:
# Evaluate on the validation split and print every `eval_*` metric
print("📊 Running final evaluation...")
eval_results = trainer.evaluate()

print("\n📈 Final Evaluation Results:")
for key, value in eval_results.items():
    if not key.startswith('eval_'):
        continue
    metric_name = key.replace('eval_', '').title()
    # Floats are shown with four decimals; everything else is printed as-is
    shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
    print(f"  {metric_name}: {shown}")

# Test some predictions
print("\n🔍 Testing predictions on sample data...")
sample_texts = [
    "Question: What is 2+2?\nClaim: 2+2 = 4\nI think this claim is",
    "Question: What is 2+2?\nClaim: 2+2 = 5\nI think this claim is",
    "Question: Is the sky blue?\nClaim: The sky is blue\nI think this claim is"
]

# Send inputs to the device the model actually lives on rather than assuming
# that "CUDA is available" means "the model is on CUDA".
model_device = next(model.parameters()).device
model.eval()  # inference mode: disables dropout if the model has any

for i, text in enumerate(sample_texts):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LENGTH)
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Class probabilities; index 1 is "True", index 0 is "False"
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()

    result = "True" if predicted_class == 1 else "False"
    print(f"  Sample {i+1}: {result} (confidence: {confidence:.3f})")
    print(f"    Text: {text[:50]}...")
    print()

## Save and Upload Model

In [None]:
# Write the trained model and its tokenizer into one local directory
print("💾 Saving model locally...")
final_model_dir = "./qwen3-reward-model-final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("✅ Model saved locally")

# Create model card
def _format_metric(key):
    """Return ``eval_results[key]`` with four decimals, or 'N/A' if it is missing or not numeric."""
    try:
        return f"{eval_results.get(key):.4f}"
    except (TypeError, ValueError):
        # A missing key yields None (TypeError); a string value cannot take
        # the `.4f` format (ValueError).
        return "N/A"


# The trailing backslash makes the YAML front matter start on the first line
# of the file. NOTE(review): the Hub is expected to read metadata from the top
# of README.md, so a leading blank line is avoided — confirm.
model_card = f"""\
---
language: en
tags:
- reward-model
- icm
- qwen
- binary-classification
datasets:
- codelion/Qwen3-0.6B-icm
base_model: Qwen/Qwen3-0.6B
metrics:
- accuracy
- f1
pipeline_tag: text-classification
---

# Qwen3-0.6B ICM Reward Model

This is a reward model trained on the ICM (Internal Coherence Maximization) dataset using the Qwen3-0.6B base model. The model performs binary classification to determine if a claim is True or False.

## Model Details

- **Base Model:** Qwen/Qwen3-0.6B
- **Training Dataset:** codelion/Qwen3-0.6B-icm
- **Task:** Binary classification (True/False)
- **Training Framework:** Transformers

## Performance

- **Accuracy:** {_format_metric('eval_accuracy')}
- **F1 Score:** {_format_metric('eval_f1')}
- **Precision:** {_format_metric('eval_precision')}
- **Recall:** {_format_metric('eval_recall')}

## Usage

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("codelion/Qwen3-0.6B-icm-rm")
model = AutoModelForSequenceClassification.from_pretrained("codelion/Qwen3-0.6B-icm-rm")

# Example usage
text = "Question: What is 2+2?\\nClaim: 2+2 = 4\\nI think this claim is"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(predictions, dim=-1).item()
    
result = "True" if predicted_class == 1 else "False"
print(f"Prediction: {{result}}")
```

## Training Details

- **Training Framework:** Hugging Face Transformers
- **Optimizer:** AdamW
- **Learning Rate:** 2e-5
- **Batch Size:** 8 (with gradient accumulation)
- **Epochs:** 3
- **Hardware:** Google Colab GPU

## Applications

This reward model can be used for:
- Reinforcement Learning from Human Feedback (RLHF)
- Quality assessment of generated text
- Truth/falsehood classification tasks
- Fine-tuning other language models

## Citation

If you use this model, please cite the ICM paper:

```bibtex
@software{{icm,
  title = {{ICM: Internal Coherence Maximization}},
  author = {{Asankhaya Sharma}},
  year = {{2025}},
  publisher = {{GitHub}},
  url = {{https://github.com/codelion/icm}}
}}
```
"""

# Save model card (explicit encoding so the result does not depend on the
# platform default)
with open("./qwen3-reward-model-final/README.md", "w", encoding="utf-8") as f:
    f.write(model_card)

print("✅ Model card created")

In [None]:
# Publish the model, the tokenizer and the model card to the Hugging Face Hub
print(f"🚀 Uploading model to Hugging Face Hub: {OUTPUT_MODEL_NAME}")

# Options shared by both push_to_hub calls
hub_options = dict(token=HF_TOKEN, private=False, create_pr=False)

try:
    # Model first, then the tokenizer, into the same repository
    for artifact in (model, tokenizer):
        artifact.push_to_hub(OUTPUT_MODEL_NAME, **hub_options)

    # Upload the README.md written earlier as the repository's model card
    api = HfApi()
    api.upload_file(
        path_or_fileobj="./qwen3-reward-model-final/README.md",
        path_in_repo="README.md",
        repo_id=OUTPUT_MODEL_NAME,
        token=HF_TOKEN,
    )

    print(f"✅ Model successfully uploaded to: https://huggingface.co/{OUTPUT_MODEL_NAME}")
    print("🎉 Training and upload completed successfully!")

except Exception as e:
    # Any failure is reported rather than raised; the local copy is the fallback
    print(f"❌ Error uploading model: {str(e)}")
    print("💡 You can manually upload the model from './qwen3-reward-model-final/' directory")

## Summary and Next Steps

🎉 **Training Complete!** Your reward model has been successfully trained and uploaded to Hugging Face.

### Model Information:
- **Model Name:** `codelion/Qwen3-0.6B-icm-rm`
- **Base Model:** Qwen/Qwen3-0.6B
- **Task:** Binary classification (True/False reward modeling)
- **Training Dataset:** ICM-generated dataset

### Next Steps:
1. **Use for RLHF:** This reward model can now be used to train other language models using Reinforcement Learning from Human Feedback
2. **Integration:** Integrate with PPO or other RL frameworks for fine-tuning
3. **Evaluation:** Test the model on downstream tasks to evaluate its effectiveness
4. **Scaling:** Consider training larger versions or on more diverse datasets

### Usage Example:
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load your trained reward model
tokenizer = AutoTokenizer.from_pretrained("codelion/Qwen3-0.6B-icm-rm")
model = AutoModelForSequenceClassification.from_pretrained("codelion/Qwen3-0.6B-icm-rm")

# Use for reward modeling in RLHF pipelines
```

The model is now ready for production use! 🚀