# Anagram Solver RL Training

Simple RL training of a Qwen model on anagram solving, with detailed per-example logging.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PollyLeo6/Anagram-Solver/blob/main/train_agent.ipynb)

## Setup

In [None]:
import os
import random
import numpy as np
import torch
import json
import pandas as pd
from typing import List

# Fix random seeds: seed Python's, NumPy's and PyTorch's random number
# generators with the same fixed value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# If the project module is not in the working directory, clone the repository
# and change into it so that the later `from anagram_game import ...` resolves.
if not os.path.exists('anagram_game.py'):
    print('📥 Cloning repository...')
    !git clone https://github.com/PollyLeo6/Anagram-Solver.git
    %cd Anagram-Solver
    print('✅ Repository cloned!')

# Install the training stack. `unsloth` appears in both commands: first from
# the package index, then from its GitHub repository with the `colab-new` extra.
# NOTE(review): these installs run after torch/numpy/pandas were already
# imported at the top of this cell, so a version changed by pip here presumably
# only takes effect after a runtime restart — confirm.
!pip install torch transformers datasets accelerate peft trl unsloth matplotlib pandas
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset
import matplotlib.pyplot as plt

# Check if files exist, if not generate them
# NOTE(review): the condition tests for 'utils.py' itself and, when it is
# missing, runs that same missing script, so the command below cannot succeed
# in the only branch where it executes. Presumably the check was meant to look
# for a generated data file instead — confirm the intended file name.
if not os.path.exists('utils.py'):
    print('⚠️ Files not found, running data generation...')
    !python utils.py

from anagram_game import AnagramSolverEnv
from utils import create_english_dictionary

## Data Preparation

In [None]:
# System prompt for anagram solving.
# get_anagram_dataset places this text in the 'system' message of every
# training prompt. It asks for a JSON object with a "solutions" list, which is
# the shape the reward function checks for when scoring a completion.
SYSTEM_PROMPT = """You are an expert anagram solver. Your task is to unscramble letters to form valid English words.

Rules:
1. Use each letter exactly once
2. Form valid English words only
3. Respond in JSON format: {"solutions": ["word1", "word2", ...]}
4. Order words as they appear in the anagram list

Be accurate and follow the format exactly."""

def extract_json_answer(text: str) -> str:
    """Extract the JSON object embedded in a model response.

    The candidate is the span from the first ``{`` to the last ``}`` in
    *text*. It is returned verbatim only when it parses as JSON; otherwise
    the whole response, stripped of surrounding whitespace, is returned so
    the caller can decide how to treat it.

    Args:
        text: Raw model response, possibly with prose around the JSON.

    Returns:
        The JSON substring when one is found and is valid, else
        ``text.strip()``.
    """
    start = text.find('{')
    end = text.rfind('}') + 1
    # No opening brace, or the last closing brace precedes it: nothing to try.
    if start == -1 or end <= start:
        return text.strip()

    candidate = text[start:end]
    try:
        # Validation only: the parsed value itself is not needed here.
        json.loads(candidate)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically deep nesting. Either way, fall back to the raw text.
        return text.strip()
    return candidate

def get_anagram_dataset(env, num_samples=50, difficulties=(6, 7)):
    """Generate a chat-formatted anagram dataset for RL training.

    For every difficulty level, ``env.generate`` is asked for ``num_samples``
    tasks, and each task becomes one record whose prompt is a system message
    (``SYSTEM_PROMPT``) followed by a user message (the task question).

    Args:
        env: Task generator exposing
            ``generate(num_of_questions=..., difficulty=...)``; every task it
            returns must provide ``question``, ``answer`` and ``metadata``.
        num_samples: Number of tasks requested per difficulty level.
        difficulties: Iterable of difficulty levels to sample. The default is
            an immutable tuple so it cannot be mutated between calls.

    Returns:
        A ``Dataset`` with the columns ``prompt``, ``answer``, ``difficulty``
        and ``metadata``, ordered by difficulty as given.
    """
    data = []
    for difficulty in difficulties:
        tasks = env.generate(num_of_questions=num_samples, difficulty=difficulty)
        data.extend(
            {
                'prompt': [
                    {'role': 'system', 'content': SYSTEM_PROMPT},
                    {'role': 'user', 'content': task.question},
                ],
                'answer': task.answer,
                'difficulty': difficulty,
                'metadata': task.metadata,
            }
            for task in tasks
        )
    return Dataset.from_list(data)

In [None]:
# Load dictionary and create environment
# (create_english_dictionary is presumably what writes dictionary.txt — confirm)
create_english_dictionary()
with open('dictionary.txt', 'r', encoding='utf-8') as f:
    # Iterate the file lazily and skip blank lines, so the empty string never
    # ends up in the dictionary as a "word".
    dictionary_words = [word for word in (line.strip() for line in f) if word]

# Create environment with full dictionary
env = AnagramSolverEnv()
env.dictionary = set(dictionary_words)

# Generate simple RL training dataset: 50 samples for each of the
# 2 difficulty levels = 100 examples
rl_dataset = get_anagram_dataset(env, num_samples=50, difficulties=[6, 7])
print(f"Generated {len(rl_dataset)} RL training examples")
print(f"Dictionary size: {len(dictionary_words)} words")

## Model Loading

In [None]:
# Load the Qwen checkpoint with 4-bit weights, together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-1.5B-Instruct",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Names of the projection modules that receive LoRA adapters
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the base model with LoRA adapters (rank 16, alpha 16)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

print("✅ Qwen model loaded with LoRA adapters")

# Only parameters with requires_grad=True are counted in the figure below
trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"Model parameters: {trainable_param_count:,}")

## Simple RL Training

In [None]:
# GRPO Configuration - simplified
# GRPO compares the completions sampled for the same prompt, so TRL requires
# the effective train batch size to be divisible by num_generations. The batch
# size is therefore set equal to num_generations: each step processes one
# prompt with its 2 sampled completions (a batch of 1 with 2 generations fails
# that divisibility check).
grpo_config = GRPOConfig(
    use_vllm=False,
    learning_rate=5e-6,
    per_device_train_batch_size=2,  # must be a multiple of num_generations
    gradient_accumulation_steps=1,
    num_generations=2,  # completions sampled per prompt
    max_prompt_length=256,
    max_completion_length=64,
    max_steps=25,  # Short training
    logging_steps=1,  # Log every step
    save_steps=25,  # equals max_steps: a single checkpoint at the end
    report_to="none",
    output_dir="./rl_outputs",
)

# Training metrics storage: one summary dict per reward-function call
training_metrics = []

# Running count of completions printed so far; keeps the "Example N" labels
# unique across calls even when a call scores several completions.
_examples_logged = 0

# Errors that mean "this response cannot be scored" rather than a bug in the
# scoring code: malformed JSON (ValueError), JSON values of an unexpected shape
# or with unhashable entries (TypeError / KeyError / AttributeError), and
# pathologically deep nesting (RecursionError).
_UNSCORABLE_ERRORS = (ValueError, TypeError, KeyError, AttributeError, RecursionError)


def _score_response(response, correct_answer):
    """Return the reward for one completion; 0.0 when it cannot be scored.

    Scoring:
        * +0.2 when the response JSON has a ``solutions`` list (format bonus);
        * +1.8 when the response JSON equals the reference JSON exactly;
        * otherwise up to +1.0, proportional to the share of reference words
          that appear in the response (order is ignored).
    """
    try:
        response_json = json.loads(extract_json_answer(response))
        correct_json = json.loads(correct_answer)

        reward = 0.0
        if 'solutions' in response_json and isinstance(response_json['solutions'], list):
            reward += 0.2

        if response_json == correct_json:
            reward += 1.8
        elif 'solutions' in response_json and 'solutions' in correct_json:
            correct_words = set(correct_json['solutions'])
            response_words = set(response_json['solutions'])
            if correct_words:
                overlap = len(correct_words & response_words)
                reward += 1.0 * (overlap / len(correct_words))
        return reward
    except _UNSCORABLE_ERRORS:
        # Any partial credit collected before the failure is discarded.
        return 0.0


# Custom reward function with detailed logging
def logged_reward_func(prompts, completions, answer, **kwargs):
    """Score a batch of completions, print each one, and record batch metrics.

    Args:
        prompts: One chat message list per completion; the content of the
            last message is printed as the question.
        completions: One message list per completion; the text is read from
            ``completion[0]['content']``.
        answer: Reference answers as JSON strings, aligned with ``completions``.
        **kwargs: Extra dataset columns passed by the trainer; unused.

    Returns:
        A list with one float reward per completion (between 0.0 and 2.0).

    Side effects:
        Prints every example and appends one summary dict to the module-level
        ``training_metrics`` list.
    """
    global _examples_logged

    responses = [completion[0]['content'] for completion in completions]
    rewards = []

    for prompt, response, correct_answer in zip(prompts, responses, answer):
        reward = _score_response(response, correct_answer)
        rewards.append(reward)

        # Print each example
        _examples_logged += 1
        print(f"Example {_examples_logged}:")
        print(f"  Question: {prompt[-1]['content']}")
        print(f"  Response: {response}")
        print(f"  Correct: {correct_answer}")
        print(f"  Reward: {reward:.3f}")
        print("-" * 50)

    # Store metrics; an empty batch yields zeros instead of a division error.
    step = len(training_metrics) + 1
    avg_reward = sum(rewards) / len(rewards) if rewards else 0.0
    reward_std = np.std(rewards) if len(rewards) > 1 else 0.0
    # Length is measured in characters of the decoded text, not in tokens.
    completion_length = np.mean([len(r) for r in responses]) if responses else 0.0

    # NOTE(review): 'Training Loss' and 'kl' are NOT reported by the trainer.
    # 'Training Loss' is a linear function of the average reward and 'kl' is a
    # random draw; both are placeholders kept so the results table and plots
    # keep their columns. Replace them with real trainer logs before drawing
    # any conclusion from these two columns.
    training_metrics.append({
        'Step': step,
        'Training Loss': round(0.5 - avg_reward * 0.1, 4),
        'reward': round(avg_reward, 4),
        'reward_std': round(reward_std, 4),
        'completion_length': round(completion_length, 1),
        'kl': round(np.random.uniform(0.01, 0.1), 4)
    })

    return rewards

# Create trainer
# The tokenizer is passed as `processing_class`, the keyword GRPOTrainer
# declares for it; a `tokenizer=` keyword is only accepted when a
# compatibility shim translates it.
grpo_trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    train_dataset=rl_dataset,
    processing_class=tokenizer,
    reward_funcs=[logged_reward_func],
)

print("🚀 Starting simple RL training...")
grpo_trainer.train()

print("\n✅ RL training complete!")

## Training Results

In [None]:
# Tabulate the per-step metrics collected by the reward function
df = pd.DataFrame(training_metrics)
print("📊 Training Results:")
print(df.to_string(index=False))

# Persist the table next to the notebook
df.to_csv('training_results.csv', index=False)
print("\n💾 Results saved to training_results.csv")

# Simple visualization: reward on the left panel, loss on the right panel
if len(df.index) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # (column, line format, panel title, y-axis label) for each panel
    panels = (
        ('reward', 'b-o', 'Reward Progress', 'Reward'),
        ('Training Loss', 'r-o', 'Training Loss', 'Loss'),
    )
    for axis, (column, line_format, title, y_label) in zip(axes, panels):
        axis.plot(df['Step'], df[column], line_format)
        axis.set_title(title)
        axis.set_xlabel('Step')
        axis.set_ylabel(y_label)
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('training_progress.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Headline numbers for the whole run
    reward_column = df['reward']
    last_loss = df['Training Loss'].iloc[-1]
    print(f"\n📈 Final Results:")
    print(f"  - Average reward: {reward_column.mean():.3f}")
    print(f"  - Max reward: {reward_column.max():.3f}")
    print(f"  - Final loss: {last_loss:.3f}")