# Local RLVR Training with TRL GRPOTrainer

This notebook fine-tunes LiquidAI's LFM2.5-1.2B-Thinking model for HDFS log anomaly detection using reinforcement learning with verifiable rewards (RLVR), implemented with **Group Relative Policy Optimization (GRPO)** via the TRL library.

Code authored by: Shaw Talebi

## 1. Setup & Imports

In [None]:
from datasets import load_dataset
from trl import GRPOTrainer, GRPOConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftModel
import torch
from functions import run_local_inference, calculate_metrics, parse_prediction

# Pick the compute device in priority order: Apple MPS, then CUDA, then CPU.
# CUDA is only probed when MPS is unavailable.
if torch.backends.mps.is_available():
    device, _device_banner = torch.device("mps"), "Using MPS (Apple Silicon)"
elif torch.cuda.is_available():
    device, _device_banner = torch.device("cuda"), f"Using CUDA: {torch.cuda.get_device_name(0)}"
else:
    device, _device_banner = torch.device("cpu"), "Using CPU"

print(_device_banner)

## 2. Load Dataset

Load from `shawhin/HDFS_v1_blocks` and sample for training.

In [None]:
# Fetch the HDFS block dataset from the Hugging Face Hub and report the size
# of each split (the validation split is stored under the "dev" key).
dataset = load_dataset("shawhin/HDFS_v1_blocks")

for _split_title, _split_key in (("Train", "train"), ("Validation", "dev"), ("Test", "test")):
    print(f"{_split_title}: {len(dataset[_split_key])} examples")

In [None]:
# Draw fixed-size, seeded subsets of the train and dev splits so the run is
# small and reproducible.
SAMPLE_SIZE = 100
_sample_rows = range(SAMPLE_SIZE)

train_sample = dataset["train"].shuffle(seed=42).select(_sample_rows)
val_sample = dataset["dev"].shuffle(seed=42).select(_sample_rows)

for _title, _subset in (("Train", train_sample), ("Val", val_sample)):
    print(f"{_title} sample: {len(_subset)} examples")

# Class balance: labels are summed, so each positive (anomalous) label counts as 1.
train_anomalous = sum(train_sample["label"])
val_anomalous = sum(val_sample["label"])

_train_pct = 100 * train_anomalous / len(train_sample)
_val_pct = 100 * val_anomalous / len(val_sample)
print(f"\nTrain anomalous: {train_anomalous}/{len(train_sample)} ({_train_pct:.1f}%)")
print(f"Val anomalous: {val_anomalous}/{len(val_sample)} ({_val_pct:.1f}%)")

## 3. Format Dataset for GRPOTrainer

Transform to TRL conversational format with `prompt` column.

In [None]:
# System prompt used both when building GRPO training prompts and when running
# evaluation inference. The two paragraphs are separated by a real blank line:
# a trailing backslash inside a triple-quoted string is a line continuation that
# deletes the newline, which previously fused "anomaly." and "Think" together.
SYSTEM_PROMPT = """You are an HDFS log anomaly detector. Analyze the log block and determine if it indicates an anomaly.

Think through your analysis step by step, then provide your final answer.
End your response with exactly 'Anomalous: True' or 'Anomalous: False'."""

def format_for_grpo(example):
    """Convert one dataset row into the conversational layout GRPOTrainer consumes.

    Args:
        example: Mapping with at least a ``text`` entry (the log block) and a
            ``label`` entry (the ground truth).

    Returns:
        Dict with a ``prompt`` list (system message followed by user message)
        and the untouched ``label``, which the reward function reads later.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    # format() mirrors f-string interpolation of the raw log text.
    user_turn = {"role": "user", "content": format(example["text"])}

    formatted = {"prompt": [system_turn, user_turn]}
    formatted["label"] = example["label"]  # Keep for reward function
    return formatted

# Apply the prompt formatter to both samples, dropping the raw columns that
# are no longer needed once the prompt has been built.
_raw_columns = ["block_id", "text"]
train_dataset = train_sample.map(format_for_grpo, remove_columns=_raw_columns)
val_dataset = val_sample.map(format_for_grpo, remove_columns=_raw_columns)

# Show the first formatted training prompt as a sanity check.
_first_prompt = train_dataset[0]["prompt"]
print("Sample formatted prompt:")
print(_first_prompt)

## 4. Define Custom Reward Function

Asymmetric reward structure for class-imbalanced binary classification:
- **True Positive** (pred=1, actual=1): +1
- **True Negative** (pred=0, actual=0): +3
- **False Positive** (pred=1, actual=0): -1
- **False Negative** (pred=0, actual=1): -3

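This reward structure penalizes missing anomalies (-3) more heavily than false alarms (-1). Note that, among correct predictions, a correctly identified normal block (+3) earns more than a correctly identified anomaly (+1).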

In [None]:
def hdfs_classification_reward(completions, label, **kwargs):
    """
    Score each completion with an asymmetric reward for binary classification.

    The prediction extracted by ``parse_prediction`` is compared with the
    ground-truth label (coerced to bool). Any prediction that does not equal
    the ground truth is scored as incorrect.

    Args:
        completions: List of model completions. Each item is either a plain
            string or a list of message dicts (conversational format), in
            which case the first message's ``content`` is used; an empty
            list is treated as empty text.
        label: List of ground truth labels (0 or 1), aligned with completions.
        **kwargs: Extra columns passed by the trainer; ignored.

    Returns:
        List of float rewards, one per (completion, label) pair:
        TP=+1, TN=+3, FP=-1, FN=-3.
    """
    # Reward table keyed by (prediction is correct, sample is anomalous).
    payoff = {
        (True, True): 1.0,     # true positive
        (True, False): 3.0,    # true negative
        (False, True): -3.0,   # false negative
        (False, False): -1.0,  # false positive
    }

    def _as_text(item):
        # Conversational completions arrive as a list of message dicts.
        if isinstance(item, list):
            return item[0]["content"] if item else ""
        return item

    scores = []
    for item, truth in zip(completions, label):
        predicted = parse_prediction(_as_text(item))
        is_anomalous = bool(truth)
        is_correct = bool(predicted == is_anomalous)
        scores.append(payoff[(is_correct, is_anomalous)])

    return scores

## 5. Model & Tokenizer Setup

In [None]:
MODEL_NAME = "LiquidAI/LFM2.5-1.2B-Thinking"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Batched training needs a pad token; fall back to EOS when none is defined.
tokenizer.padding_side = "left"  # Required for GRPO
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Summarise the tokenizer settings that matter for training.
_tokenizer_summary = [
    f"Tokenizer loaded: {MODEL_NAME}",
    f"Vocab size: {tokenizer.vocab_size}",
    f"Pad token: {tokenizer.pad_token}",
    f"EOS token: {tokenizer.eos_token}",
]
print("\n".join(_tokenizer_summary))

In [None]:
# LoRA config for efficient fine-tuning
peft_config = LoraConfig(
    r=2,
    lora_alpha=4,
    lora_dropout=0.05,
    target_modules=["q_proj"],
    task_type="CAUSAL_LM",
)

print("LoRA config:")
print(f"  r={peft_config.r}, alpha={peft_config.lora_alpha}")
print(f"  Target modules: {peft_config.target_modules}")

In [None]:
# NOTE(review): this entire cell is commented out and never executes. It sketches a
# token-by-token greedy decoding trace for one validation example, loading the adapter
# from ./grpo-hdfs-classifier-final and falling back to the base model via a bare
# `except:` — narrow that handler before re-enabling, or delete the cell.
# # Single example inference with full trace
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel
# import torch
# from functions import parse_prediction

# # Get example
# example = val_sample[0]
# print(f"Label: {example['label']} ({'Anomalous' if example['label'] == 1 else 'Normal'})")

# # Load model
# MODEL_NAME = "LiquidAI/LFM2.5-1.2B-Thinking"
# OUTPUT_DIR = "./grpo-hdfs-classifier-final"

# base_model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME, dtype=torch.float16, trust_remote_code=True, device_map="auto"
# )
# try:
#     model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
# except:
#     model = base_model
# model.eval()

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# if tokenizer.pad_token is None:
#     tokenizer.pad_token = tokenizer.eos_token

# # Format and tokenize
# messages = [
#     {"role": "system", "content": SYSTEM_PROMPT + "\n\nLimit your thinking to under 512 characters."},
#     {"role": "user", "content": example['text']}
# ]
# input_ids = tokenizer.apply_chat_template(
#     messages, return_tensors="pt", add_generation_prompt=True
# )["input_ids"].to(model.device)

# # Generate token-by-token (full trace)
# print("\nGeneration trace:")
# generated_text = ""
# current_ids = input_ids.clone()

# with torch.no_grad():
#     for _ in range(512):
#         outputs = model(current_ids)
#         next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1, keepdim=True)
#         next_token = tokenizer.decode(next_token_id[0], skip_special_tokens=False)
#         generated_text += next_token
#         print(next_token, end="", flush=True)
#         current_ids = torch.cat([current_ids, next_token_id], dim=1)
#         if next_token_id.item() == tokenizer.eos_token_id:
#             break

# # Parse and compare
# pred = parse_prediction(generated_text)
# print(f"\n\nPrediction: {pred} | Ground Truth: {example['label']} | Match: {'✓' if (1 if pred else 0) == example['label'] else '✗'}")

In [None]:
# NOTE(review): disabled debug probe; `generated_text` only exists if the commented-out
# trace cell above is re-enabled and run first.
# len(generated_text)

In [None]:
# NOTE(review): this entire cell is commented out and never executes. It is the
# base-model baseline evaluation on val_sample; while it stays disabled the notebook
# reports metrics for the fine-tuned model only.
# # Use original val_sample (with text field)
# val_examples = list(val_sample)
# val_labels = [ex["label"] for ex in val_examples]

# # Compare with base model (before fine-tuning)
# print("\nLoading base model for comparison...")

# base_model_eval = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     dtype=torch.float16,
#     trust_remote_code=True,
#     device_map="auto"
# )
# base_model_eval.eval()

# # Run inference with base model
# print("Running evaluation with base model...")
# base_predictions = run_local_inference(base_model_eval, tokenizer, SYSTEM_PROMPT, val_examples)

# # Calculate metrics
# base_metrics = calculate_metrics(base_predictions, val_labels)

# print("\n" + "="*50)
# print("Base Model Evaluation Results")
# print("="*50)
# for metric, value in base_metrics.items():
#     print(f"{metric}: {value:.4f}")

## 6. Training Configuration

In [None]:
# GRPO training configuration, grouped by concern.
training_args = GRPOConfig(
    output_dir="./grpo-hdfs-classifier",

    # Optimisation schedule
    learning_rate=5e-6,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,

    # GRPO specific - MEMORY OPTIMIZED
    num_generations=2,  # Reduced from 4 to 2 (minimum for GRPO, halves memory)
    max_completion_length=512,

    # Hardware optimization
    gradient_checkpointing=True,
    fp16=True,
    dataloader_pin_memory=False,

    # Logging & saving
    logging_steps=5,
    # save_steps=50,

    # Disable external logging
    report_to="none",
)

_per_device_batch = training_args.per_device_train_batch_size
_accum_steps = training_args.gradient_accumulation_steps

# (label, value) pairs echoed back so the effective settings are visible in the output.
_config_summary = [
    ("Epochs", training_args.num_train_epochs),
    ("Batch size", _per_device_batch),
    ("Gradient accumulation", _accum_steps),
    ("Effective batch size", _per_device_batch * _accum_steps),
    ("Learning rate", training_args.learning_rate),
    ("Generations per prompt", training_args.num_generations),
    ("Max completion length", training_args.max_completion_length),
    # Falls back to 'Not set' when the config object has no `max_length` attribute.
    ("Max total length", getattr(training_args, 'max_length', 'Not set')),
]

print("Training configuration:")
for _setting, _value in _config_summary:
    print(f"  {_setting}: {_value}")

## 7. Initialize & Train

In [None]:
# Wire everything together: the model is passed by name, the LoRA config is
# supplied via `peft_config`, and the custom reward function drives the
# policy updates.
trainer = GRPOTrainer(
    args=training_args,
    model=MODEL_NAME,
    processing_class=tokenizer,
    peft_config=peft_config,
    reward_funcs=hdfs_classification_reward,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Trainer initialized!")

In [None]:
%%time
# The %%time cell magic must stay on the first line of the cell; it reports how
# long the whole training run took.
# Run training with the trainer initialized in the previous cell.
print("Starting GRPO training...")
trainer.train()

In [None]:
# Persist the trained model and then the tokenizer into one directory so the
# evaluation step can reload both from the same place.
OUTPUT_DIR = "./grpo-hdfs-classifier-final"

for _save_to in (trainer.save_model, tokenizer.save_pretrained):
    _save_to(OUTPUT_DIR)

print(f"Model saved to {OUTPUT_DIR}")

## 8. Evaluation

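Evaluate the fine-tuned model on the validation sample. (The base-model baseline cell in Section 5 is currently commented out; re-enable it to get a side-by-side comparison.)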

In [None]:
# Rebuild the fine-tuned model for evaluation: fresh base weights first, then
# the LoRA adapter saved in OUTPUT_DIR layered on top.
print("Loading fine-tuned model...")

_base_load_kwargs = {
    "torch_dtype": torch.float16,
    "trust_remote_code": True,
    "device_map": "auto",
}
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_base_load_kwargs)

# Attach the adapter and switch to inference mode.
finetuned_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
finetuned_model.eval()

print("Fine-tuned model loaded!")

In [None]:
# Score the fine-tuned model on the validation sample.
print("Running evaluation on validation set...")

# Work from the original val_sample rows, which still carry the raw text field.
val_examples = [row for row in val_sample]
val_labels = [row["label"] for row in val_examples]

# Generate predictions, then reduce them to summary metrics.
predictions = run_local_inference(finetuned_model, tokenizer, SYSTEM_PROMPT, val_examples)
metrics = calculate_metrics(predictions, val_labels)

_rule = "=" * 50
print(f"\n{_rule}\nFine-tuned Model Evaluation Results\n{_rule}")
for _metric_name, _metric_value in metrics.items():
    print(f"{_metric_name}: {_metric_value:.4f}")