In [1]:
def is_correct(sample):
    """Tell whether a sample's extracted output equals its ground-truth answer.

    Args:
        sample: Mapping that provides the keys "extracted_output" and
            "ground_truth_answer".

    Returns:
        The result of comparing the two values with ``==``.

    Raises:
        KeyError: If either key is missing from ``sample``.
    """
    predicted = sample["extracted_output"]
    expected = sample["ground_truth_answer"]
    return predicted == expected

def compare(a, b):
    """
    Rank sample ``a`` against sample ``b``.

    Correctness decides first: when exactly one of the two samples is
    correct (per ``is_correct``), that sample wins. When both are correct
    or both are incorrect, the sample with the smaller "response_length"
    wins.

    Returns 1 if a wins, -1 if b wins, 0 if tie.
    """
    a_ok = is_correct(a)
    b_ok = is_correct(b)

    # Exactly one side is correct: correctness alone settles it.
    if a_ok and not b_ok:
        return 1
    if not a_ok and b_ok:
        return -1

    # Same correctness on both sides: the shorter response wins.
    len_a, len_b = a["response_length"], b["response_length"]
    if len_a < len_b:
        return 1
    if len_a > len_b:
        return -1
    return 0

def compute_win_rate(data, baseline_data):
    """Win rate of ``data`` over ``baseline_data`` among non-tied comparisons.

    Samples are paired by position and each pair is scored with ``compare``.
    Ties (a result of 0) are left out of both the numerator and the
    denominator.

    Args:
        data: Sequence of samples for the model being evaluated.
        baseline_data: Sequence of samples for the baseline; must have the
            same length as ``data``.

    Returns:
        wins / (number of non-tied pairs), or 0.0 when there are no
        non-tied pairs.

    Raises:
        AssertionError: If the two sequences differ in length.
    """
    assert len(data) == len(baseline_data), "Length mismatch"

    outcomes = [compare(a, b) for a, b in zip(data, baseline_data)]
    # Only decisive (non-tie) outcomes count toward the rate.
    decisive = [outcome for outcome in outcomes if outcome != 0]
    if not decisive:
        return 0.0
    return decisive.count(1) / len(decisive)


In [2]:
import json

def _load_eval_outputs(name):
    """Read ``./temp_data/<name>.json`` and return the parsed JSON content.

    Args:
        name: File name of the evaluation output, without the ".json" suffix.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open("./temp_data/" + name + ".json", 'r', encoding="utf-8") as f:
        return json.load(f)


# Baseline model outputs.
dataset = "outputs-eval-deepseek-ai.DeepSeek-R1-Distill-Qwen-7B-MATH500"
baseline_data = _load_eval_outputs(dataset)

# Outputs of the model being compared against the baseline.
dataset = "outputs-eval-...optimization.ckpt.optimized-MATH500"
data = _load_eval_outputs(dataset)

In [3]:
# Share of non-tied pairwise comparisons won by `data` against `baseline_data`.
win_rate = compute_win_rate(data=data, baseline_data=baseline_data)
print(
    f"Win rate of new model over baseline: {win_rate:.3f}"
)

Win rate of new model over baseline: 0.624
