In [3]:
import re
def format_reward(completions, **kwargs):
    """Strict format reward for ``<think>...</think><answer>...</answer>`` completions.

    A completion scores 1.0 only when it starts with ``<think>``, ends with
    ``</answer>`` and the whole text matches
    ``<think>...</think>`` + optional whitespace + ``<answer>...</answer>``.
    Everything else scores 0.0; there is no partial credit.

    Args:
        completions: One entry per completion. Each entry is either the
            completion text itself (``str``) or a chat-style list whose first
            message dict carries the text under ``"content"``.
        **kwargs: Accepted and ignored.

    Returns:
        list[float]: One reward (1.0 or 0.0) per completion, in input order.
    """
    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
    completion_contents = [
        completion if isinstance(completion, str) else completion[0]["content"]
        for completion in completions
    ]

    rewards_full_list = []
    for content in completion_contents:
        well_formed = (
            # The explicit prefix/suffix checks reject leading junk and a
            # trailing newline, which `$` alone would let through.
            content.startswith("<think>")
            and content.endswith("</answer>")
            # re.DOTALL lets `.` cross newlines; without it any multi-line
            # reasoning or answer could never match.
            and re.match(pattern, content, re.DOTALL) is not None
        )
        rewards_full_list.append(1.0 if well_formed else 0.0)
    print(f"Rewards for completions: {rewards_full_list}")

    return rewards_full_list


In [7]:
import re
from rouge_score import rouge_scorer
def format_reward(completions, **kwargs):
    """Strict format reward for ``<think>...</think><answer>...</answer>`` completions.

    A completion scores 1.0 only when it starts with ``<think>``, ends with
    ``</answer>`` and the whole text matches
    ``<think>...</think>`` + optional whitespace + ``<answer>...</answer>``.
    Everything else scores 0.0; there is no partial credit.

    Args:
        completions: One entry per completion. Each entry is either a
            chat-style list whose first message dict carries the text under
            ``"content"``, or the completion text itself (``str``).
        **kwargs: Accepted and ignored.

    Returns:
        list[float]: One reward (1.0 or 0.0) per completion, in input order.
    """
    pattern = r"^<think>.*?</think>\s*<answer>.*?</answer>$"
    completion_contents = [
        completion if isinstance(completion, str) else completion[0]["content"]
        for completion in completions
    ]

    rewards_full_list = []
    for content in completion_contents:
        well_formed = (
            # The explicit prefix/suffix checks reject leading junk and a
            # trailing newline, which `$` alone would let through.
            content.startswith("<think>")
            and content.endswith("</answer>")
            # re.DOTALL lets `.` cross newlines; without it any multi-line
            # reasoning or answer could never match.
            and re.match(pattern, content, re.DOTALL) is not None
        )
        rewards_full_list.append(1.0 if well_formed else 0.0)

    return rewards_full_list
# ROUGE-L scorer with stemming enabled. No function in the visible cells
# references `scorer`.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
import evaluate
# BERTScore metric handle; accuracy_reward calls `bertscore.compute(...)` on it.
bertscore = evaluate.load("bertscore")
def _completion_text(completion):
    """Return the text of one completion given as a plain string or a chat-style message list."""
    return completion if isinstance(completion, str) else completion[0]["content"]


def _extract_answer(text):
    """Return the answer part of ``text`` without its surrounding tags."""
    if "<answer>" in text:
        answer = text[text.index("<answer>") + len("<answer>"):]
        # Drop the closing tag (and anything after it) so the markup itself
        # is not scored against the reference answer.
        closing = answer.find("</answer>")
        return answer if closing == -1 else answer[:closing]
    if "</think>" in text:
        # No answer tag: treat everything after the reasoning as the answer.
        return text[text.index("</think>") + len("</think>"):]
    return text


def _extract_think(text):
    """Return the text between ``<think>`` and the following ``</think>``, or ``text`` if absent."""
    opening = text.find("<think>")
    if opening == -1:
        return text
    start = opening + len("<think>")
    # Search for the closing tag only after the opening one so that a stray
    # earlier `</think>` cannot produce an empty or reversed slice.
    end = text.find("</think>", start)
    return text if end == -1 else text[start:end]


def accuracy_reward(completions, **kwargs):
    """Score completions against reference answers and reasoning with BERTScore F1.

    The answer part and the reasoning part of each completion are extracted
    and compared separately; the reward is the mean of the two F1 values.

    Args:
        completions: One entry per completion. Each entry is either the
            completion text itself (``str``) or a chat-style list whose first
            message dict carries the text under ``"content"``.
        **kwargs: Must contain ``"solution"`` (reference answers) and
            ``"think"`` (reference reasoning), each aligned with
            ``completions``.

    Returns:
        list[float]: One reward per completion, in input order.

    Raises:
        KeyError: If ``"solution"`` or ``"think"`` is missing from ``kwargs``.
    """
    solutions = kwargs["solution"]
    think = kwargs["think"]
    completion_contents = [_completion_text(completion) for completion in completions]
    if not completion_contents:
        # Nothing to score; skip the metric call entirely.
        return []

    # answer
    answer_predictions = [_extract_answer(text) for text in completion_contents]
    results = bertscore.compute(predictions=answer_predictions,
                                references=solutions,
                                lang="en",
                                device="cuda")
    scores_answer = results['f1']

    # think
    think_predictions = [_extract_think(text) for text in completion_contents]
    results = bertscore.compute(predictions=think_predictions,
                                references=think,
                                lang="en",
                                device="cuda")
    scores_think = results['f1']

    return [
        (score_answer + score_think) / 2.0
        for score_answer, score_think in zip(scores_answer, scores_think)
    ]
import numpy as np
def long_think_reward(completions, **kwargs):
    """Reward completions whose reasoning is long relative to their answer.

    For each completion the character length of the ``<think>`` section minus
    the character length of the ``<answer>`` section is squashed through a
    sigmoid, so equal lengths give 0.5, longer reasoning tends towards 1.0 and
    longer answers towards 0.0. A completion missing any of the four tags, or
    with a closing tag that does not follow its opening tag, scores 0.0.

    Args:
        completions: One entry per completion. Each entry is either a
            chat-style list whose first message dict carries the text under
            ``"content"``, or the completion text itself (``str``).
        **kwargs: Accepted and ignored.

    Returns:
        list[float]: One reward in [0.0, 1.0] per completion, in input order.
    """
    W_think = 1
    W_answer = 1
    k = 0.01  # sigmoid steepness: 100 characters of difference shifts the logit by 1

    scores = []
    for completion in completions:
        content = completion if isinstance(completion, str) else completion[0]["content"]

        # Test the raw find() results: adding len(tag) first would turn the
        # -1 "not found" marker into a positive offset and hide the miss.
        think_open = content.find("<think>")
        answer_open = content.find("<answer>")
        if think_open == -1 or answer_open == -1:
            scores.append(0.0)
            continue

        think_start = think_open + len("<think>")
        answer_start = answer_open + len("<answer>")
        # Closing tags must come after their opening tags, otherwise the
        # section lengths below would be negative.
        think_end = content.find("</think>", think_start)
        answer_end = content.find("</answer>", answer_start)
        if think_end == -1 or answer_end == -1:
            scores.append(0.0)
            continue

        think_length = think_end - think_start
        answer_length = answer_end - answer_start

        reward = W_think * think_length - W_answer * answer_length
        # sigmoid function to normalize the reward
        scores.append(float(1 / (1 + np.exp(-k * reward))))
    return scores

In [6]:
# A single completion, passed as a plain string, with junk text before the
# opening <think> tag; the rest of the string is otherwise well formed.
completions = [
    "fmlskjfksjf\n<think> This is a thought process </think> <answer> This is the answer </answer>",
]
# Example usage: the leading junk fails the startswith("<think>") check, so
# the recorded result for this input is [0.0].
format_reward(completions)

Rewards for completions: [0.0]


[0.0]

In [5]:
from google import genai
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment
# before the API key is read below.
load_dotenv(find_dotenv())
import os
# Model name that gemini_evaluate passes to generate_content.
model = "gemini-2.0-flash-lite"
# The API key is read from the GOOGLE_API_KEY environment variable;
# os.getenv returns None when it is unset.
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

In [None]:
def gemini_evaluate(question, reference_answer, answer_to_be_judged):
    """Ask the Gemini model to grade an answer against a reference.

    Fills the grading prompt with the three arguments, sends it through the
    module-level ``client`` using the module-level ``model`` name, and parses
    the first number in the reply as the score.

    Args:
        question: The question that was asked.
        reference_answer: The ground-truth solution shown to the judge.
        answer_to_be_judged: The model-generated answer to grade.

    Returns:
        float: The parsed score, limited to the range [0.0, 1.0]. When the
        reply has no text or contains no number, a message is printed and the
        neutral score 0.5 is returned.
    """
    import re  # imported locally so this cell does not depend on another cell's imports

    prompt_template = """You are an expert evaluator. Your task is to rate the following model-generated answer for correctness on a scale of 0.0 to 1.0.

    **Question:**
    {the_question}

    **Ground-Truth Solution:**
    {the_reference_answer}

    **Model's Generated Answer:**
    {the_answer_to_be_judged}

    **Evaluation Criteria:**
    evaluate the factual accuracy in the context of the question.

    Based on these criteria, provide a single floating-point score from 0.0 (completely wrong) to 1.0 (perfectly correct).

    **Score:**"""

    prompt = prompt_template.format(
        the_question=question,
        the_reference_answer=reference_answer,
        the_answer_to_be_judged=answer_to_be_judged
    )
    response = client.models.generate_content(
        model=model,
        contents=[prompt],
    )

    # The reply may be missing (None) or wrap the number in extra text such as
    # "Score: 0.8" or "**0.8**", so pull out the first numeric token instead
    # of passing the whole reply to float().
    number = re.search(
        r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?",
        response.text or "",
    )
    if number is None:
        print(f"Error parsing score from response: {response.text}")
        return 0.5

    # The prompt asks for 0.0-1.0; clamp so an out-of-range reply cannot
    # produce a score outside that interval.
    return min(1.0, max(0.0, float(number.group())))

0.0

