In [15]:
import json
from datasets import load_dataset, Dataset, DatasetDict
from prompt_template import format_instruction
import pandas as pd
def load_and_format_json(json_path):
    """Build a Hugging Face ``Dataset`` of prompts from a JSON file.

    The file is read as UTF-8 JSON and iterated as a sequence of entries.
    Each entry's ``"instruction"`` value is passed through
    ``format_instruction`` and stored in a single ``"prompt"`` column.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        A ``Dataset`` built (via a pandas DataFrame) with one ``"prompt"``
        row per entry in the file.

    Raises:
        KeyError: If a dict entry has no ``"instruction"`` key.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    rows = []
    for item in entries:
        rows.append({"prompt": format_instruction(item["instruction"])})

    frame = pd.DataFrame(rows)
    return Dataset.from_pandas(frame)
# NOTE(review): absolute, machine-specific path -- consider deriving it from a
# configurable data directory so the notebook runs on other machines.
TEST_META_PATH = "/Users/pavankumartaddi/Desktop/Align-CodeGemma/outputs/test_meta.json"
dataset = load_and_format_json(TEST_META_PATH)

In [14]:
from typing import List
import re
from openai.types import Completion
from execserver.code_exec_reqs import run_coverage_batched
from utils import JAX_LAX_OPERATIONS,JAX_LIBRARIES,JAX_PRIMITIVES,count_jax_usage
def run_tests_and_reward(
    completions: List[Completion],
    timeout=60,
    tests="",
    timeout_on_client=False,
    server="http://localhost:8000",
) -> List[int]:
    """Run every generated completion text through the execution server and score it 0/1.

    The ``text`` of every choice of every completion is collected (in order)
    and sent in one call to ``run_coverage_batched``. Each returned result is
    mapped to a reward of ``1`` when it is truthy and greater than zero,
    otherwise ``0``.

    Args:
        completions: Completion payloads, each exposing ``choices`` whose
            items expose ``text``. Both mapping-style payloads
            (``completion["choices"]``) and attribute-style objects
            (``completion.choices``) are accepted.
        timeout: Forwarded unchanged to ``run_coverage_batched``.
        tests: Forwarded unchanged to ``run_coverage_batched``.
        timeout_on_client: Forwarded unchanged to ``run_coverage_batched``.
        server: Base URL of the execution server. Defaults to the previously
            hard-coded ``"http://localhost:8000"``.

    Returns:
        One integer reward (0 or 1) per result returned by
        ``run_coverage_batched``.
    """

    def _field(obj, key):
        # Mapping-style payloads are indexed exactly as before; objects that
        # do not support subscripting (TypeError) are read by attribute.
        try:
            return obj[key]
        except TypeError:
            return getattr(obj, key)

    codes = [
        _field(choice, "text")
        for completion in completions
        for choice in _field(completion, "choices")
    ]
    coverage_results = run_coverage_batched(server, codes, tests, timeout, timeout_on_client)
    # A falsy result (e.g. None or 0) or a non-positive value earns no reward.
    return [1 if result and result > 0 else 0 for result in coverage_results]
def format_reward_func(completions, **kwargs):
    """Reward completions that follow the expected tag layout.

    A completion scores ``1.0`` when its text matches, from the very start,
    ``<response>`` wrapping ``<think>...</think>``, ``<code>...</code>`` and
    ``<test>...</test>`` in that order (only whitespace allowed between the
    sections); otherwise it scores ``0.0``. ``.`` also matches newlines.

    Args:
        completions: One item per completion. Each item is either a plain
            string, or a sequence of message dicts whose first element has a
            ``"content"`` key. Previously a plain string raised ``TypeError``.
        **kwargs: Accepted and ignored.

    Returns:
        A list of floats (``1.0`` or ``0.0``), one per completion.
    """
    pattern = r"^<response>\s*<think>.*?</think>\s*<code>.*?</code>\s*<test>.*?</test>\s*</response>$"
    # Plain-string completions are used as-is; message lists contribute the
    # content of their first message.
    completion_contents = [
        completion if isinstance(completion, str) else completion[0]["content"]
        for completion in completions
    ]
    return [
        1.0 if re.match(pattern, content, re.DOTALL) else 0.0
        for content in completion_contents
    ]
def reward_based_on_jax_usage(completions: List[Completion]) -> List[float]:
    """Score each generated completion text by its normalised ``count_jax_usage`` value.

    The ``text`` of every choice of every completion is collected in order.
    Each text's ``count_jax_usage`` result is divided by the combined size of
    ``JAX_LIBRARIES``, ``JAX_PRIMITIVES`` and ``JAX_LAX_OPERATIONS``.

    Args:
        completions: Completion payloads, each exposing ``choices`` whose
            items expose ``text``. Both mapping-style payloads
            (``completion["choices"]``) and attribute-style objects
            (``completion.choices``) are accepted.

    Returns:
        One float per collected text. Every reward is ``0.0`` when the three
        reference collections are all empty.
    """

    def _field(obj, key):
        # Mapping-style payloads are indexed exactly as before; objects that
        # do not support subscripting (TypeError) are read by attribute.
        try:
            return obj[key]
        except TypeError:
            return getattr(obj, key)

    codes = [
        _field(choice, "text")
        for completion in completions
        for choice in _field(completion, "choices")
    ]
    max_possible_score = len(JAX_LIBRARIES) + len(JAX_PRIMITIVES) + len(JAX_LAX_OPERATIONS)
    if max_possible_score <= 0:
        # Nothing to normalise against: skip scoring and avoid dividing by zero.
        return [0.0] * len(codes)
    return [count_jax_usage(code) / max_possible_score for code in codes]

In [None]:
from trl import GRPOConfig, GRPOTrainer, get_peft_config, ModelConfig

# Policy model configuration: checkpoint name, dtype, attention
# implementation, and PEFT / 4-bit loading flags.
model_config = ModelConfig(
    model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
    torch_dtype="bfloat16",
    attn_implementation="flash_attention_2",
    use_peft=True,
    load_in_4bit=True,
)

# Training hyperparameters for the GRPO run.
training_args = GRPOConfig(
    output_dir="qwen-r1-aha-moment",
    learning_rate=5e-7,
    lr_scheduler_type="cosine",
    logging_steps=10,
    max_steps=100,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    bf16=True,
    # GRPO-specific parameters
    # NOTE(review): some TRL releases require the effective train batch size to
    # be divisible by num_generations; with per_device_train_batch_size=1 and
    # num_generations=2 this may be rejected -- confirm against the installed version.
    max_prompt_length=256,
    max_completion_length=1024, # max length of the generated output for our solution
    num_generations=2,
    beta=0.001,
    
)
# Build the GRPO trainer from the checkpoint name, reward functions, training
# arguments, datasets, and the PEFT config derived from model_config.
# NOTE(review): `equation_reward_func`, `train_dataset` and `test_dataset` are
# not defined in the cells visible here -- confirm they are defined elsewhere
# in the notebook, otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): only `model_name_or_path` and the PEFT config are taken from
# model_config in this call; torch_dtype / attn_implementation / load_in_4bit
# are not forwarded here -- confirm whether that is intended.
trainer = GRPOTrainer(
    model=model_config.model_name_or_path,
    reward_funcs=[format_reward_func, equation_reward_func],
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=get_peft_config(model_config),
)