# Inspect and Compare MATH Results

This notebook compares two model checkpoints on the MATH dataset to identify questions where the "after" model improves over the "before" model in a Pass@k setting.

For each question, we check:
1. Does the "before" model fail (all k samples incorrect)?
2. Does the "after" model succeed (at least one of k samples correct)?

If both are true, the question counts as an improvement. For up to `max_display` improved questions, we display all correct generations from the "after" model.

In [1]:
import pandas as pd
import numpy as np
from verl.utils.reward_score.math_reward import compute_score as math_compute_score


def _as_response_list(responses, k):
    """Return at most the first ``k`` responses of a row as a sequence.

    A value that is not a list, tuple or numpy array (e.g. a single string)
    is wrapped in a one-element list so callers can always iterate over it.
    """
    if not isinstance(responses, (list, tuple, np.ndarray)):
        responses = [responses]
    return responses[:k]


def _ground_truth_or_none(row):
    """Return ``row['reward_model']['ground_truth']``, or None if it is unavailable."""
    try:
        return row['reward_model']['ground_truth']
    except (KeyError, TypeError, IndexError):
        return None


def _extract_question(prompt):
    """Return the question text of a prompt, or None if none can be found.

    A plain string is returned as-is. For a sequence of chat messages the
    content of the *last* message whose role is 'user' is returned.
    """
    if isinstance(prompt, str):
        return prompt
    if isinstance(prompt, (list, tuple, np.ndarray)):
        question = None
        for msg in prompt:
            if isinstance(msg, dict) and msg.get('role') == 'user':
                # Deliberately no break: later user messages override earlier ones.
                question = msg.get('content', '')
        return question
    return None


def compare_math_results(before_path, after_path, k=8, max_display=20):
    """
    Compare two MATH prediction files and identify improvements.

    Rows of the two files are paired by position. A question is an
    "improvement" when none of the first ``k`` "before" responses gets a
    positive score while at least one of the first ``k`` "after" responses does.
    A report (Pass@k of both files, number of improvements, and the correct
    "after" responses of the first ``max_display`` improvements) is printed.

    Args:
        before_path: Path to the "before" model's predictions parquet file
        after_path: Path to the "after" model's predictions parquet file
        k: Number of samples for Pass@k evaluation (must be >= 1)
        max_display: Maximum number of improved questions to display
            (negative values are treated as 0)

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    max_display = max(0, max_display)

    print("=" * 100)
    print(f"MATH COMPARISON: Pass@{k}")
    print("=" * 100)
    print(f"Before: {before_path}")
    print(f"After:  {after_path}")
    print()

    # Load predictions
    before_df = pd.read_parquet(before_path)
    after_df = pd.read_parquet(after_path)

    print(f"Before samples: {len(before_df)}")
    print(f"After samples:  {len(after_df)}")
    num_questions = min(len(before_df), len(after_df))
    if len(before_df) != len(after_df):
        # Rows are paired by position, so unequal lengths mean some rows are dropped.
        print(f"WARNING: files differ in length; only the first {num_questions} rows are compared.")
    print()

    # Track improvements and Pass@k scores
    improvements = []
    before_pass_count = 0
    after_pass_count = 0
    total_questions = 0
    mismatched_indices = []  # positions whose ground truths differ between the files

    # Iterate through questions (rows are assumed to be aligned by position)
    for idx in range(num_questions):
        before_row = before_df.iloc[idx]
        after_row = after_df.iloc[idx]

        # The "before" file is the reference for the ground truth
        ground_truth = before_row['reward_model']['ground_truth']

        # Sanity check of the alignment assumption: the "after" row should carry
        # the same ground truth. Compared as strings to tolerate array-like values.
        after_truth = _ground_truth_or_none(after_row)
        if after_truth is not None and str(after_truth) != str(ground_truth):
            mismatched_indices.append(idx)

        # Get responses, limited to the first k samples
        before_responses = _as_response_list(before_row['responses'], k)
        after_responses = _as_response_list(after_row['responses'], k)

        # A question passes when any sample receives a positive score
        before_scores = [math_compute_score(resp, ground_truth) for resp in before_responses]
        before_pass = any(score > 0 for score in before_scores)

        after_scores = [math_compute_score(resp, ground_truth) for resp in after_responses]
        after_pass = any(score > 0 for score in after_scores)

        # Update Pass@k counters
        if before_pass:
            before_pass_count += 1
        if after_pass:
            after_pass_count += 1
        total_questions += 1

        # Check if this is an improvement (before failed, after succeeded)
        if not before_pass and after_pass:
            correct_responses = [resp for resp, score in zip(after_responses, after_scores) if score > 0]

            improvements.append({
                'idx': idx,
                'row': after_row,
                'ground_truth': ground_truth,
                'correct_responses': correct_responses,
                'num_correct': len(correct_responses),
                # Actual number of scored samples; may be below k for short rows
                'num_samples': len(after_responses),
            })

    if mismatched_indices:
        print(
            f"WARNING: {len(mismatched_indices)} row(s) have different ground truths in the two files "
            f"(first at index {mismatched_indices[0]}); the files may not be aligned and the "
            f"comparison below may be unreliable."
        )
        print()

    # Calculate Pass@k scores
    before_pass_at_k = before_pass_count / total_questions if total_questions > 0 else 0
    after_pass_at_k = after_pass_count / total_questions if total_questions > 0 else 0

    # Summary
    print(f"Total questions: {total_questions}")
    print(f"Before Pass@{k}: {before_pass_at_k:.4f} ({before_pass_count}/{total_questions})")
    print(f"After Pass@{k}:  {after_pass_at_k:.4f} ({after_pass_count}/{total_questions})")
    print(f"Improvement: {after_pass_at_k - before_pass_at_k:+.4f}")
    print()
    print(f"Improvements (before fail → after success): {len(improvements)}")
    print()

    if len(improvements) == 0:
        print("No improvements found!")
        return

    # Display improved questions
    num_to_display = min(max_display, len(improvements))
    print("=" * 100)
    print(f"DISPLAYING {num_to_display} IMPROVED QUESTIONS")
    print("=" * 100)

    for i, improvement in enumerate(improvements[:num_to_display], 1):
        row = improvement['row']

        print()
        print("-" * 100)
        print(f"IMPROVED QUESTION {i}/{num_to_display} (Index: {improvement['idx']})")
        print("-" * 100)

        # Extract question from prompt (plain string or list of chat messages)
        question = _extract_question(row['prompt'])
        if question:
            print(f"\nQuestion:\n{question}")

        # Metadata columns are optional; fall back instead of failing after scoring
        print(f"\nType: {row.get('type', 'N/A')}")
        print(f"Level: {row.get('level', 'N/A')}")
        print(f"\nGround Truth: {improvement['ground_truth']}")
        print(f"\nNumber of correct responses (out of {improvement['num_samples']}): {improvement['num_correct']}")

        # Display all correct responses
        for j, correct_resp in enumerate(improvement['correct_responses'], 1):
            print(f"\n{'~' * 80}")
            print(f"Correct Response {j}/{improvement['num_correct']}:")
            print(f"{'~' * 80}")
            print(correct_resp)

    print()
    print("=" * 100)
    print(f"Summary: {len(improvements)} total improvements found")
    print("=" * 100)

In [3]:
# Example usage: compare a pre-train checkpoint against an RL checkpoint.
# Edit the two paths below to point at the prediction files to compare.

# Pair 1 - fake news 
before_path, after_path = (
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step1000-1shot-8samples-temp0.6/math_predictions.parquet",
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step1000_omi_n64-step400-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet",
)

# Pass@8 evaluation, showing up to 20 improved questions
compare_math_results(before_path, after_path, k=8, max_display=20)

MATH COMPARISON: Pass@8
Before: /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step1000-1shot-8samples-temp0.6/math_predictions.parquet
After:  /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step1000_omi_n64-step400-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet

Before samples: 5000
After samples:  5000

Total questions: 5000
Before Pass@8: 0.0230 (115/5000)
After Pass@8:  0.0672 (336/5000)
Improvement: +0.0442

Improvements (before fail → after success): 315

DISPLAYING 20 IMPROVED QUESTIONS

----------------------------------------------------------------------------------------------------
IMPROVED QUESTION 1/20 (Index: 0)
----------------------------------------------------------------------------------------------------

Question:
How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have? Let's think step by step and output the final answer within \boxed{}.

Type: Algebra
Level: Level 3

Ground Truth: 2

Number of 

In [4]:

# Pair 2 - fake news 
before_path, after_path = (
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step3000-1shot-8samples-temp0.6/math_predictions.parquet",
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step3000_omi_n64-step600-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet",
)

# Pass@8 evaluation, showing up to 20 improved questions
compare_math_results(before_path, after_path, k=8, max_display=20)

MATH COMPARISON: Pass@8
Before: /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step3000-1shot-8samples-temp0.6/math_predictions.parquet
After:  /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step3000_omi_n64-step600-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet

Before samples: 5000
After samples:  5000

Total questions: 5000
Before Pass@8: 0.0366 (183/5000)
After Pass@8:  0.0930 (465/5000)
Improvement: +0.0564

Improvements (before fail → after success): 414

DISPLAYING 20 IMPROVED QUESTIONS

----------------------------------------------------------------------------------------------------
IMPROVED QUESTION 1/20 (Index: 1)
----------------------------------------------------------------------------------------------------

Question:
What is the positive difference between $120\%$ of 30 and $130\%$ of 20? Let's think step by step and output the final answer within \boxed{}.

Type: Algebra
Level: Level 1

Ground Truth: 10

Number of c

In [5]:

# Pair 3 - mixed but mostly fake 
before_path, after_path = (
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step5000-1shot-8samples-temp0.6/math_predictions.parquet",
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step5000_omi_n64-step1000-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet",
)

# Pass@8 evaluation, showing up to 20 improved questions
compare_math_results(before_path, after_path, k=8, max_display=20)



MATH COMPARISON: Pass@8
Before: /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step5000-1shot-8samples-temp0.6/math_predictions.parquet
After:  /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step5000_omi_n64-step1000-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet

Before samples: 5000
After samples:  5000

Total questions: 5000
Before Pass@8: 0.0384 (192/5000)
After Pass@8:  0.0908 (454/5000)
Improvement: +0.0524

Improvements (before fail → after success): 396

DISPLAYING 20 IMPROVED QUESTIONS

----------------------------------------------------------------------------------------------------
IMPROVED QUESTION 1/20 (Index: 0)
----------------------------------------------------------------------------------------------------

Question:
How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have? Let's think step by step and output the final answer within \boxed{}.

Type: Algebra
Level: Level 3

Ground Truth: 2

Number of

In [None]:

# Pair 4 - real improvement
before_path, after_path = (
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step10000-1shot-8samples-temp0.6/math_predictions.parquet",
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step10000_omi_n64-step1000-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet",
)

# Pass@8 evaluation, showing up to 20 improved questions
compare_math_results(before_path, after_path, k=8, max_display=20)

MATH COMPARISON: Pass@8
Before: /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step10000-1shot-8samples-temp0.6/math_predictions.parquet
After:  /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step10000_omi_n64-step1000-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet

Before samples: 5000
After samples:  5000

Total questions: 5000
Before Pass@8: 0.0744 (372/5000)
After Pass@8:  0.1516 (758/5000)
Improvement: +0.0772

Improvements (before fail → after success): 581

DISPLAYING 20 IMPROVED QUESTIONS

----------------------------------------------------------------------------------------------------
IMPROVED QUESTION 1/20 (Index: 4)
----------------------------------------------------------------------------------------------------

Question:
If $2^8=4^x$, what is the value of $x$? Let's think step by step and output the final answer within \boxed{}.

Type: Algebra
Level: Level 1

Ground Truth: 4

Number of correct responses (out of 8): 2


In [6]:
# Pair 5 - real improvement  
before_path, after_path = (
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step14000-1shot-8samples-temp0.6/math_predictions.parquet",
    "/n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step14000_omi_n64-step1000-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet",
)

# Pass@8 evaluation, showing up to 20 improved questions
compare_math_results(before_path, after_path, k=8, max_display=20)

MATH COMPARISON: Pass@8
Before: /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results/1B-step14000-1shot-8samples-temp0.6/math_predictions.parquet
After:  /n/netscratch/dam_lab/Everyone/rl_pretrain/eval_results_sunny_tmp/olmo2_1b_step14000_omi_n64-step1000-rl-0shot-boxed-8samples-temp0.6/math_predictions.parquet

Before samples: 5000
After samples:  5000

Total questions: 5000
Before Pass@8: 0.0262 (131/5000)
After Pass@8:  0.1512 (756/5000)
Improvement: +0.1250

Improvements (before fail → after success): 702

DISPLAYING 20 IMPROVED QUESTIONS

----------------------------------------------------------------------------------------------------
IMPROVED QUESTION 1/20 (Index: 0)
----------------------------------------------------------------------------------------------------

Question:
How many vertical asymptotes does the graph of $y=\frac{2}{x^2+x-6}$ have? Let's think step by step and output the final answer within \boxed{}.

Type: Algebra
Level: Level 3

Ground Truth: 2

Number 