In [2]:
import json
import os
import glob
import networkx as nx
import matplotlib.pyplot as plt

# Create viz directory if it doesn't exist
viz_dir = 'AIME-mcts-qwen-32b-modified/viz'
os.makedirs(viz_dir, exist_ok=True)

# Load all json files from the directory
json_files = glob.glob('AIME-mcts-qwen-32b-modified/jsons/*.json')


def add_answer_node(graph, node_ids, answer, data, correct_answers):
    """Register `answer` as a node of `graph` (once) and return its node id.

    Nodes are keyed by a small integer instead of a truncated copy of the
    answer text: distinct answers may share their first 50 characters, and a
    truncated key would silently merge them into a single node. Going through
    this helper for every node also guarantees that each node carries the
    ``reward`` and ``color`` attributes the drawing code reads.

    Args:
        graph: Object exposing ``add_node(node, **attrs)`` (an ``nx.DiGraph`` here).
        node_ids: Dict mapping full answer text -> node id; updated in place.
        answer: Full answer text.
        data: Parsed tree JSON; only ``data['to_explore_reward']`` is read.
        correct_answers: Container of answers to draw in light green.

    Returns:
        The integer node id for `answer`; repeated calls return the same id.
    """
    if answer not in node_ids:
        node_ids[answer] = len(node_ids)
        # Answers without recorded rewards are shown with a reward of 0.
        rewards = data['to_explore_reward'].get(answer, [0])
        avg_reward = sum(rewards) / len(rewards) if rewards else 0
        color = 'lightgreen' if answer in correct_answers else 'red'
        graph.add_node(node_ids[answer],
                       reward=f'{avg_reward:.1f}',
                       color=color)
    return node_ids[answer]


for json_file in json_files:
    with open(json_file) as f:
        data = json.load(f)

    # Create directed graph
    G = nx.DiGraph()
    node_ids = {}
    correct_answers = set(data['correct_answers'])

    # Add nodes and their rewards
    for answer in data['to_explore']:
        add_answer_node(G, node_ids, answer, data, correct_answers)

    # Add edges based on fathers/children relationships. Endpoints are routed
    # through add_answer_node so that an answer appearing only in `fathers`
    # still gets its attributes (a bare add_edge would create an attribute-less
    # node and the colour/label lookups below would raise KeyError).
    for child, parent in data['fathers'].items():
        if parent is not None:
            parent_id = add_answer_node(G, node_ids, parent, data, correct_answers)
            child_id = add_answer_node(G, node_ids, child, data, correct_answers)
            G.add_edge(parent_id, child_id)

    # Draw the graph
    plt.figure(figsize=(12, 8))

    # Use hierarchical layout for DAG
    pos = nx.nx_agraph.graphviz_layout(G, prog='dot', args='-Grankdir=TB')

    # Draw edges with arrows first (before nodes)
    nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True, arrowsize=20,
                           arrowstyle='->', min_source_margin=20, min_target_margin=20)

    # Draw nodes
    node_colors = [G.nodes[node]['color'] for node in G.nodes()]
    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=2000)

    # Add only reward labels
    labels = {node: G.nodes[node]['reward'] for node in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels, font_size=8)

    plt.title(f'Answer Graph for {os.path.basename(json_file)}')
    plt.axis('off')

    # Save figure instead of showing it. splitext swaps only the extension,
    # whereas str.replace would also rewrite '.json' inside the base name.
    stem = os.path.splitext(os.path.basename(json_file))[0]
    output_file = os.path.join(viz_dir, stem + '.png')
    plt.savefig(output_file, bbox_inches='tight')
    plt.close()


In [4]:
# Initialize list to store all correct answers data
correct_answers_data = []

# Process all json files to analyze correct answers.
# NOTE: `json_files` is the list built by the visualization cell above.
for json_file in json_files:
    with open(json_file) as f:
        data = json.load(f)

    # Load correct answers and their data
    for answer in data['correct_answers']:
        # Answers without recorded rewards are reported with a reward of 0.
        rewards = data['to_explore_reward'].get(answer, [0])
        avg_reward = sum(rewards) / len(rewards) if rewards else 0
        # Only the first analysis is reported. A missing key and an empty
        # list both fall back to the placeholder, so indexing [0] cannot
        # raise IndexError on an answer that was never analysed.
        analyses = data['reward_analysis'].get(answer) or ['No analysis available']
        analysis = analyses[0]

        correct_answers_data.append({
            'question': data['query'],
            'answer': answer,
            'reward': avg_reward,
            'analysis': analysis,
            'file': os.path.basename(json_file)
        })

# Sort all answers by reward ascending (lowest-scored correct answers first)
correct_answers_data.sort(key=lambda x: x['reward'])

# Display results for all files
print("\nCorrect Answers Analysis Across All Files:")
print("=" * 100)
for item in correct_answers_data:
    print(f"\nFile: {item['file']}")
    print(f"Reward Score: {item['reward']:.1f}")
    print("-" * 50)
    print("Question:")
    print(item['question'])
    print("-" * 50)
    print("Answer:")
    print(item['answer'])
    print("-" * 50)
    print("Analysis:")
    print(item['analysis'])
    print("=" * 100)



Correct Answers Analysis Across All Files:

File: 743f8855dbd551d86e11b6fe53231e07.json
Reward Score: -100.0
--------------------------------------------------
Question:
For how many pairs of consecutive integers in $\{1000,1001,1002^{}_{},\ldots,2000\}$ is no carrying required when the two integers are added?
--------------------------------------------------
Answer:
[reasoning process] To determine how many pairs of consecutive integers in the set \(\{1000, 1001, 1002, \ldots, 2000\}\) can be added without any carrying operation, we need to analyze the digits of these numbers. Two numbers can be added without carrying if, for each digit position (units, tens, hundreds, thousands), the sum of the digits in that position is less than 10.

First, let's consider the range of numbers:
- The numbers are from 1000 to 2000, which means the thousands digit is either 1 or 2.
- For the numbers in the form of 1XXX, we need to check the digits X, X, and X such that no two consecutive digits in t

In [8]:
# Gather every answer that is neither marked correct nor excluded, together
# with its per-iteration rewards and the full list of reward analyses.
incorrect_answers_data = []
json_files = glob.glob('AIME-mcts-qwen-32b-modified/jsons/*.json')

for json_file in json_files:
    with open(json_file) as f:
        data = json.load(f)

    # Anything in answers_list that is not correct and not excluded counts
    # as incorrect.
    correct_answers = set(data['correct_answers'])
    ground_truth = data['ground_truth']
    excluded_answers = set(data.get('exclude', []))
    skipped = correct_answers | excluded_answers
    incorrect_answers = [ans for ans in data['answers_list'] if ans not in skipped]

    for answer in incorrect_answers:
        # Missing rewards default to a single 0 so the mean is well defined.
        rewards = data['to_explore_reward'].get(answer, [0])
        if rewards:
            avg_reward = sum(rewards) / len(rewards)
        else:
            avg_reward = 0

        record = {
            'question': data['query'],
            'answer': answer,
            'ground_truth': ground_truth,
            'rewards': rewards,
            'avg_reward': avg_reward,
            # Full list of analyses (one entry is printed per iteration below).
            'analysis': data['reward_analysis'].get(answer, ['No analysis available']),
            'file': os.path.basename(json_file),
        }
        incorrect_answers_data.append(record)

# Highest average reward first; the sort is stable, so ties keep load order.
incorrect_answers_data.sort(key=lambda rec: rec['avg_reward'], reverse=True)

# Report
thin_rule = "-" * 50
thick_rule = "=" * 100
sections = (
    ("Question:", 'question'),
    ("Ground Truth:", 'ground_truth'),
    ("Answer:", 'answer'),
)

print("\nIncorrect Answers Analysis Across All Files:")
print(thick_rule)
for item in incorrect_answers_data:
    print(f"\nFile: {item['file']}")
    print("Reward Scores:")
    for i, reward in enumerate(item['rewards'], start=1):
        print(f"iteration {i}: {reward:.1f}")
    for heading, field in sections:
        print(thin_rule)
        print(heading)
        print(item[field])
    print(thin_rule)
    for i, analysis in enumerate(item['analysis'], start=1):
        print(f"iteration {i}:\n {analysis}\n")
    print(thick_rule)



Incorrect Answers Analysis Across All Files:

File: 3b0672d5e04e413bcfe1336049db4478.json
Reward Scores:
iteration 1: 90.0
--------------------------------------------------
Question:
An ellipse has foci at $(9, 20)$ and $(49, 55)$ in the $xy$ -plane and is tangent to the $x$ -axis. What is the length of its major axis?
--------------------------------------------------
Ground Truth:
85
--------------------------------------------------
Answer:
### [Reasoning Process]

1. **Finding the Center of the Ellipse:**
   - The center \((h, k)\) of the ellipse is the midpoint of the segment joining the foci \((9, 20)\) and \((49, 55)\).
   \[
   (h, k) = \left( \frac{9 + 49}{2}, \frac{20 + 55}{2} \right) = (29, 37.5)
   \]

2. **Understanding the Tangency to the x-axis:**
   - Since the ellipse is tangent to the x-axis, the distance from the center to the x-axis is equal to the semi-minor axis \(b\). The y-coordinate of the center is 37.5, so:
     \[
     b = 37.5 = \frac{75}{2}
     \]

3. *

In [5]:
# Flatten every answer of every tree into one list of records
all_answers_data = []
json_files = glob.glob('AIME-mcts-qwen-32b-modified/jsons/*.json')
#json_files = glob.glob('gsm8k-mcts-qwen-32b/jsons/*.json')

for json_file in json_files:
    with open(json_file) as f:
        data = json.load(f)

    ground_truth = data['ground_truth']
    all_answers = data['answers_list']
    correct_answers = set(data['correct_answers'])
    excluded_answers = set(data.get('exclude', []))

    for answer in all_answers:
        # Missing rewards default to a single 0 so the mean is well defined.
        rewards = data['to_explore_reward'].get(answer, [0])
        if rewards:
            avg_reward = sum(rewards) / len(rewards)
        else:
            avg_reward = 0

        all_answers_data.append({
            'question': data['query'],
            'answer': answer,
            'ground_truth': ground_truth,
            'rewards': rewards,
            'avg_reward': avg_reward,
            'is_correct': answer in correct_answers,
            'is_excluded': answer in excluded_answers,
            'file': os.path.basename(json_file)
        })

# Tally answers per question text: total / correct / incorrect.
# Every answer not flagged correct (excluded ones included) counts as incorrect.
question_stats = {}
for item in all_answers_data:
    tally = question_stats.setdefault(
        item['question'], {'total': 0, 'correct': 0, 'incorrect': 0}
    )
    tally['total'] += 1
    tally['correct' if item['is_correct'] else 'incorrect'] += 1

# Questions with at least three answers, in first-seen order
stats_3plus = [stats for stats in question_stats.values() if stats['total'] >= 3]
questions_with_3plus_nodes = len(stats_3plus)

print(f"\nNumber of trees with 3 or more nodes: {questions_with_3plus_nodes}")
print("\nBreakdown of answers for trees with 3+ nodes:")
for stats in stats_3plus:
    print(f"Question has {stats['correct']} correct and {stats['incorrect']} incorrect answers")


Number of trees with 3 or more nodes: 45

Breakdown of answers for trees with 3+ nodes:
Question has 1 correct and 3 incorrect answers
Question has 0 correct and 18 incorrect answers
Question has 0 correct and 18 incorrect answers
Question has 0 correct and 18 incorrect answers
Question has 1 correct and 9 incorrect answers
Question has 1 correct and 6 incorrect answers
Question has 1 correct and 15 incorrect answers
Question has 0 correct and 18 incorrect answers
Question has 1 correct and 9 incorrect answers
Question has 0 correct and 18 incorrect answers
Question has 1 correct and 7 incorrect answers
Question has 1 correct and 3 incorrect answers
Question has 0 correct and 18 incorrect answers
Question has 1 correct and 2 incorrect answers
Question has 0 correct and 18 incorrect answers
Question has 1 correct and 2 incorrect answers
Question has 1 correct and 3 incorrect answers
Question has 1 correct and 15 incorrect answers
Question has 1 correct and 3 incorrect answers
Question 

In [9]:
import sympy as sp

# Exact values for the semi-minor axis b and the distance c.
# b = 75/2 matches the value used in the incorrect ellipse answer printed
# above; this cell appears to re-check that answer's arithmetic.
b, c = sp.Rational(75, 2), sp.sqrt(113) * sp.Rational(5, 2)

# a^2 = b^2 + c^2, then a, then the major axis 2a
a_squared = c**2 + b**2
a = sp.sqrt(a_squared)
major_axis_length = a * 2

# Closed form and numeric value of the major axis
major_axis_length_simplified = major_axis_length.simplify()
major_axis_length_evaluated = major_axis_length_simplified.evalf()

# Same computation again, term by term, as a cross-check
b_squared, c_squared = b**2, c**2
a_squared_calc = (b_squared + c_squared).simplify()
a_calc = sp.sqrt(a_squared_calc)
major_axis_length_calc = a_calc * 2

report = (
    ("Simplified length of the major axis", major_axis_length_simplified),
    ("Evaluated length of the major axis", major_axis_length_evaluated),
    ("Step-by-step calculation of a^2", a_squared_calc),
    ("Step-by-step calculation of a", a_calc),
    ("Step-by-step calculation of the major axis length", major_axis_length_calc),
)
for label, value in report:
    print(f"{label}: {value}")


Simplified length of the major axis: 65*sqrt(2)
Evaluated length of the major axis: 91.9238815542512
Step-by-step calculation of a^2: 4225/2
Step-by-step calculation of a: 65*sqrt(2)/2
Step-by-step calculation of the major axis length: 65*sqrt(2)
