<a href="https://colab.research.google.com/github/OneFineStarstuff/Cosmic-Brilliance/blob/main/reward_model_train_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
reward_model_train.py

Trains a simple reward model (single scalar output) on scored context–question pairs.

Expected JSONL file format (scored_questions.jsonl):
{"context": "...", "question": "...", "quality": 0.85}
{"context": "...", "question": "...", "quality": 0.42}
...
"""

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import torch
import random
import numpy as np
import os

# -----------------------
# Config
# -----------------------
# Checkpoint to fine-tune and the JSONL file holding the scored pairs.
MODEL_ID: str = "sentence-transformers/all-MiniLM-L6-v2"
DATA_FILE: str = "scored_questions.jsonl"

# OUTPUT_DIR receives the Trainer's checkpoints and logs; FINAL_DIR receives
# the model and tokenizer written once training has finished.
OUTPUT_DIR: str = "rm_ckpt"
FINAL_DIR: str = "rm_final"

# Tokenization and optimisation hyperparameters.
MAX_LENGTH: int = 128
BATCH_SIZE: int = 32
EPOCHS: int = 4
LR: float = 2e-5
WEIGHT_DECAY: float = 0.01

# Seed handed to the Python, NumPy and PyTorch random number generators.
SEED: int = 42

# -----------------------
# Reproducibility
# -----------------------
# Seed Python's `random`, NumPy and PyTorch in turn with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Explicitly seed the CUDA generators as well, but only when CUDA is usable.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# -----------------------
# Load tokenizer & model
# -----------------------
# Tokenizer and model both come from the same checkpoint. num_labels=1 asks
# for a single-output classification head; that one value is the reward score.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=1)

# -----------------------
# Preprocess function
# -----------------------
def preprocess(batch):
    """
    Turn a batch of scored context-question pairs into model inputs.

    Each context is joined to its question with the separator " ||| ", the
    joined strings are tokenized (truncated and padded to MAX_LENGTH), and the
    quality scores are attached under the "labels" key.

    Args:
        batch: Mapping of column name to a list of values, as passed by
            `Dataset.map(..., batched=True)`. Reads the "context", "question"
            and "quality" columns.

    Returns:
        The tokenizer's encoding for the batch with an extra "labels" entry
        holding one float per example.

    Raises:
        TypeError / ValueError: If a quality value cannot be converted to
            float (for example a missing score).
    """
    text = [f"{c} ||| {q}" for c, q in zip(batch["context"], batch["question"])]
    enc = tokenizer(text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    # Force float labels: if every score in the file is written as an integer
    # the column is loaded as ints, the labels become integer tensors, and the
    # regression loss used for a single-output head rejects them.
    enc["labels"] = [float(score) for score in batch["quality"]]
    return enc

# -----------------------
# Load & prepare dataset
# -----------------------
raw_dataset = load_dataset("json", data_files=DATA_FILE)["train"]
dataset = raw_dataset.map(preprocess, batched=True, remove_columns=raw_dataset.column_names)

# Hold out part of the data for evaluation. Per-epoch evaluation and
# load_best_model_at_end are both requested below, and the Trainer raises if
# they are enabled without an evaluation dataset.
EVAL_FRACTION = 0.1
splits = dataset.train_test_split(test_size=EVAL_FRACTION, seed=SEED)
train_dataset = splits["train"]
eval_dataset = splits["test"]

# -----------------------
# Training args
# -----------------------
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and reject the old keyword; use whichever name this installation defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    lr_scheduler_type="cosine",
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=50,
    # Saving and evaluating on the same schedule is required for
    # load_best_model_at_end to compare checkpoints.
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Keep the Trainer's own seeding in sync with the script-level SEED.
    seed=SEED,
    report_to="none",  # disable W&B/MLflow unless you want them
    **{eval_strategy_key: "epoch"},
)

# -----------------------
# Trainer
# -----------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# -----------------------
# Train & save
# -----------------------
# Run training, then write the model and the tokenizer into the same
# directory (FINAL_DIR).
trainer.train()
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_DIR)

print(f"Training complete. Model & tokenizer saved to '{FINAL_DIR}'.")