In [5]:
from dotenv import load_dotenv
from google import genai

# Load variables from ../.env BEFORE constructing the client, so that an API
# key stored only in that file is visible on a fresh kernel (the later cell
# that calls load_dotenv runs after this one and would be too late).
# load_dotenv is a no-op when the file does not exist.
load_dotenv("../.env")

# The client gets the API key from the environment variable `GEMINI_API_KEY`.
client = genai.Client()

In [6]:
# Import the task loader from our llm-python directory
import sys
sys.path.append('../llm-python')

from task_loader import TaskLoader

# Initialize the task loader and load the "all_training" subset of arc-agi-1,
# then keep only the first 100 entries for quick experiments.
# Each entry is unpacked as a (task_id, task_data) pair by the cells below.
loader = TaskLoader(data_root="../data")
task_list = loader.load_tasks_from_subset("all_training", "arc-agi-1")
tasks_100 = task_list[:100]  # slicing yields fewer than 100 if the subset is smaller

print(f"Loaded {len(tasks_100)} tasks from arc-agi-1 all_training subset")

Loaded 100 tasks from arc-agi-1 all_training subset


In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from parent directory
load_dotenv("../.env")

def format_grid_for_prompt(grid):
    """Render a 2D grid as text: cells separated by spaces, rows by newlines.

    Args:
        grid: Iterable of rows, each row an iterable of cells (converted with str()).

    Returns:
        str: One line per row; an empty grid yields an empty string.
    """
    rendered_rows = []
    for row in grid:
        rendered_rows.append(' '.join(map(str, row)))
    return '\n'.join(rendered_rows)

def create_program_generation_prompt(task_data):
    """Build the prompt asking Gemini for a `generate(params)` program.

    Only the OUTPUT grid of each training example is shown to the model; the
    model is asked for one function plus one parameter dict per grid, in a
    fixed layout that the extraction helpers in this notebook parse.

    Args:
        task_data: Mapping with a 'train' list whose items have an 'output' grid.

    Returns:
        str: The complete prompt text.
    """
    train_examples = task_data['train']
    num_examples = len(train_examples)

    # One "Example N:" section per training output grid, numbered from 1.
    examples_text = "".join(
        f"Example {number}:\n\n{format_grid_for_prompt(example['output'])}\n\n"
        for number, example in enumerate(train_examples, start=1)
    )

    return f"""I have {num_examples} grid examples that follow the same pattern. Please write a Python function called "generate(params)" that can recreate each of these grids when called with appropriate parameters.

The grids are:

{examples_text}

Requirements:
1. Write a single function called "generate(params)" that works for all examples
2. The function should return a 2D list representing the grid
3. Parameters should only use simple built-in types (dict, list, int, str, etc.)
4. The function should be general enough to work for all examples but not more general than necessary
5. Provide the exact parameters for each example

Please format your response EXACTLY as follows:

```python
def generate(params):
    # Your implementation here
    pass
```

Example 1:
```python
{{"param1": value1, "param2": value2}}
```

Example 2:
```python
{{"param1": value1, "param2": value2}}
```

Example 3:
```python
{{"param1": value1, "param2": value2}}
```

(Use as many examples as needed - one for each grid above)

IMPORTANT: 
- Put each parameter dictionary in its own ```python code block
- Use valid Python dictionary syntax with double quotes for keys
- Each parameter dict should be on a single line for easy parsing
"""

# Find the specific task "05269061" among the loaded tasks.
# A generator + next() is used instead of a bare for-loop so that, when the
# task is missing, `task_id` / `task_data` are NOT left bound to the last task
# iterated (later cells read `task_data` and would silently use the wrong task).
TARGET_TASK_ID = "05269061"
target_task = next(
    ((candidate_id, candidate_data)
     for candidate_id, candidate_data in tasks_100
     if candidate_id == TARGET_TASK_ID),
    None,
)

if target_task:
    task_id, task_data = target_task
    print(f"Found task {task_id}")
    print(f"Training examples: {len(task_data['train'])}")

    # Show the training output grids we'll be working with
    print("\nTraining output grids:")
    for i, example in enumerate(task_data['train']):
        print(f"\nExample {i+1}:")
        print(format_grid_for_prompt(example['output']))
else:
    # Make the miss explicit for downstream cells instead of leaving stale values.
    task_id, task_data = None, None
    print("Task 05269061 not found in the loaded tasks")

Found task 05269061
Training examples: 3

Training output grids:

Example 1:
2 8 3 2 8 3 2
8 3 2 8 3 2 8
3 2 8 3 2 8 3
2 8 3 2 8 3 2
8 3 2 8 3 2 8
3 2 8 3 2 8 3
2 8 3 2 8 3 2

Example 2:
2 4 1 2 4 1 2
4 1 2 4 1 2 4
1 2 4 1 2 4 1
2 4 1 2 4 1 2
4 1 2 4 1 2 4
1 2 4 1 2 4 1
2 4 1 2 4 1 2

Example 3:
4 8 3 4 8 3 4
8 3 4 8 3 4 8
3 4 8 3 4 8 3
4 8 3 4 8 3 4
8 3 4 8 3 4 8
3 4 8 3 4 8 3
4 8 3 4 8 3 4


In [8]:
# Canned model response used to exercise the extraction helpers offline.
# It follows the layout requested by the prompt: one ```python block holding
# `def generate(params)`, then one "Example N:" header per parameter dict
# (each dict in its own ```python block), with free prose before and after.
sample_response = """
Here's the solution:

```python
def generate(params):
    height = params['height']
    width = params['width']
    palette = params['palette']
    
    num_palette_items = len(palette)
    grid = [
        [palette[(i + j) % num_palette_items] for j in range(width)]
        for i in range(height)
    ]
    return grid
```

Example 1:
```python
{"height": 7, "width": 7, "palette": [2, 8, 3]}
```

Example 2:
```python
{"height": 7, "width": 7, "palette": [2, 4, 1]}
```

Example 3:
```python
{"height": 7, "width": 7, "palette": [4, 8, 3]}
```

This function creates diagonal patterns by cycling through the palette based on the sum of row and column indices.
"""

# Test extraction functions
import re
import ast

def extract_python_code(response_text):
    """Extract the source of `generate` from a Gemini response.

    Strategy:
      1. Return the first ```python fenced block that contains ``def generate(``.
      2. Otherwise fall back to un-fenced text: take the ``def generate(`` line
         plus the following lines that are blank or indented deeper than it,
         stopping at the first line that returns to the def's indentation.
         (The previous regex fallback used DOTALL and therefore swallowed all
         trailing prose up to the end of the response.)

    Args:
        response_text: Raw response text from the model (may be None or empty).

    Returns:
        str | None: Stripped source code, or None when no usable function
        (one containing a ``return``) is found.
    """
    if not response_text:
        return None

    code_blocks = re.findall(r'```python\s*\n(.*?)```', response_text, re.DOTALL)
    for block in code_blocks:
        if 'def generate(' in block:
            return block.strip()

    # Fallback: look for a def generate function without code fences.
    lines = response_text.split('\n')
    for start, line in enumerate(lines):
        stripped = line.lstrip()
        if not stripped.startswith('def generate('):
            continue

        def_indent = len(line) - len(stripped)
        body = [line]
        for following in lines[start + 1:]:
            text = following.lstrip()
            indent = len(following) - len(text)
            # A non-blank line back at (or left of) the def's indentation ends
            # the function, except for the ")" that closes a multi-line signature.
            if text and indent <= def_indent and not text.startswith(')'):
                break
            body.append(following)

        candidate = '\n'.join(body).strip()
        if 'return' in candidate:
            return candidate

    return None

def extract_parameters(response_text):
    """Extract the per-example parameter dicts from a Gemini response.

    The response is scanned line by line for "Example N:" headers. For each
    header, the content of the next ```python fenced block is parsed as a
    dict. Parsing is attempted in this order:
      1. ``ast.literal_eval`` (safe, Python literals),
      2. ``json.loads`` (safe, accepts JSON ``true`` / ``false`` / ``null``),
      3. ``eval`` (last resort, see the security note in the code).

    Args:
        response_text: Raw response text from the model (may be None or empty).

    Returns:
        list[dict]: Parsed parameter dicts in order of appearance. Blocks that
        cannot be parsed, or that do not evaluate to a dict, are skipped with
        a printed warning.
    """
    import json  # local import: only needed for the JSON fallback

    params = []
    if not response_text:
        return params

    lines = response_text.split('\n')
    i = 0

    while i < len(lines):
        # Only "Example N:" headers start a parameter section.
        if not re.match(r'Example \d+:', lines[i].strip()):
            i += 1
            continue

        # Look for the next ```python fence after the header.
        j = i + 1
        while j < len(lines) and not lines[j].strip().startswith('```python'):
            j += 1

        if j >= len(lines):
            # No fenced block follows this header.
            i += 1
            continue

        # Collect everything up to the closing fence.
        j += 1  # skip the ```python line itself
        param_lines = []
        while j < len(lines) and not lines[j].strip().startswith('```'):
            param_lines.append(lines[j])
            j += 1

        param_content = '\n'.join(param_lines).strip()
        if param_content:
            try:
                # Use ast.literal_eval for safe evaluation
                param = ast.literal_eval(param_content)
                if isinstance(param, dict):
                    params.append(param)
                else:
                    print(f"Warning: Parameter is not a dict: {param}")
            except (ValueError, SyntaxError, TypeError) as e:
                # TypeError covers e.g. unhashable dict keys inside the literal.
                print(f"Warning: Could not parse parameter: {param_content}")
                print(f"Error: {e}")

                fallback = None
                try:
                    # Safe fallback: the block may be JSON rather than Python.
                    fallback = json.loads(param_content)
                except ValueError:
                    # SECURITY NOTE(review): eval() runs model-generated text
                    # directly in the notebook kernel, outside the sandbox
                    # executor. Kept as a last resort for expressions such as
                    # `[0] * 7`; consider removing it or routing it through
                    # the sandbox.
                    try:
                        fallback = eval(param_content)
                    except Exception:
                        print(f"Fallback eval also failed for: {param_content}")

                if isinstance(fallback, dict):
                    params.append(fallback)

        # Resume scanning at the closing fence (or end of text).
        i = j

    return params

# Run both extractors against the canned sample response and report the outcome
test_code = extract_python_code(sample_response)
test_params = extract_parameters(sample_response)

code_status = 'Yes' if test_code else 'No'
print("üß™ Testing improved extraction:")
print(f"‚úÖ Code extracted: {code_status}")
print(f"‚úÖ Parameters extracted: {len(test_params)} sets")

if test_code:
    print("\nExtracted code:")
    print(30 * "-")
    print(test_code)

if test_params:
    print("\nExtracted parameters:")
    for number, param in enumerate(test_params, start=1):
        print(f"Example {number}: {param}")

print("\nüéØ Extraction test complete!")

üß™ Testing improved extraction:
‚úÖ Code extracted: Yes
‚úÖ Parameters extracted: 3 sets

Extracted code:
------------------------------
def generate(params):
    height = params['height']
    width = params['width']
    palette = params['palette']

    num_palette_items = len(palette)
    grid = [
        [palette[(i + j) % num_palette_items] for j in range(width)]
        for i in range(height)
    ]
    return grid

Extracted parameters:
Example 1: {'height': 7, 'width': 7, 'palette': [2, 8, 3]}
Example 2: {'height': 7, 'width': 7, 'palette': [2, 4, 1]}
Example 3: {'height': 7, 'width': 7, 'palette': [4, 8, 3]}

üéØ Extraction test complete!


In [9]:
# Import our python-sandbox executor for safe code execution
import sys
sys.path.append('../python-sandbox')

from executor_factory import get_best_executor

def execute_gemini_code_safely(code, test_params):
    """
    Execute Gemini-generated code in the sandbox executor and test it.

    The code is first run once with a probe that only references `generate`
    to confirm the function is defined. It is then re-executed once per
    parameter dict, calling `generate(params)` and collecting the result.

    Args:
        code: The Python code string from Gemini
        test_params: List of parameter dictionaries to test

    Returns:
        dict with keys:
            'execution_success' (bool): the probe run succeeded
            'function_defined' (bool): `generate` was found by the probe
            'test_results' (list[dict]): one entry per parameter dict with
                'example' (1-based index), 'success', 'error', 'result'
            'error' (str | None): reason the probe failed, or the message of
                an exception raised while using the executor
    """
    results = {
        'execution_success': False,
        'function_defined': False,
        'test_results': [],
        'error': None
    }

    try:
        with get_best_executor() as executor:
            # The snippets below use a top-level `return`; presumably the
            # executor wraps the submitted code in a function body -- confirm
            # against the python-sandbox executor implementation.
            probe_code = f"""
{code}

# Try to call generate with a minimal test parameter to verify it exists
# This is just to check if the function is defined, we don't care about the result
try:
    # Just check if we can reference the function
    generate
    return "success"
except NameError:
    return "function_not_found"
except Exception as e:
    return f"other_error: {{str(e)}}"
"""
            result, error = executor.execute_code(probe_code)

            if error:
                results['error'] = f"Code execution error: {str(error)}"
                return results

            if result == "function_not_found":
                results['error'] = "Function 'generate' not found or not defined"
                return results
            elif isinstance(result, str) and result.startswith("other_error"):
                # isinstance guard: a non-string result (e.g. None) used to
                # raise AttributeError here and surface as a misleading error.
                results['error'] = f"Error in code: {result}"
                return results
            elif result == "success":
                results['execution_success'] = True
                results['function_defined'] = True
            else:
                results['error'] = f"Unexpected result from function check: {result}"
                return results

            # Test the function with each parameter set
            for i, params in enumerate(test_params):
                try:
                    # repr(params) embeds the parameters as a Python literal
                    # inside the submitted snippet.
                    test_code = f"""
{code}

# Test with parameters
params = {repr(params)}
result = generate(params)
return result
"""
                    test_result, test_error = executor.execute_code(test_code)

                    if test_error:
                        results['test_results'].append({
                            'example': i + 1,
                            'success': False,
                            'error': str(test_error),
                            'result': None
                        })
                    else:
                        results['test_results'].append({
                            'example': i + 1,
                            'success': True,
                            'error': None,
                            'result': test_result
                        })

                except Exception as e:
                    # A failure on one example must not abort the remaining ones.
                    results['test_results'].append({
                        'example': i + 1,
                        'success': False,
                        'error': str(e),
                        'result': None
                    })

    except Exception as e:
        # Executor creation/teardown problems are reported, not raised.
        results['error'] = str(e)

    return results

print("Python-sandbox executor ready for safe code execution!")

Python-sandbox executor ready for safe code execution!


In [10]:
# Build the prompt for the selected task, send it to Gemini, and echo the raw reply
prompt = create_program_generation_prompt(task_data)

print("Calling Gemini API with gemini-2.5-pro...")
response = client.models.generate_content(model="gemini-2.5-pro", contents=prompt)

banner = "=" * 60
print("\n" + banner)
print("GEMINI RESPONSE:")
print(banner)
print(response.text)
print(banner)

# Extract Python code from Gemini response
import re
import ast

# NOTE: this redefines extract_python_code from the extraction-test cell above;
# the later definition shadows the earlier one, so keep the two in sync.
def extract_python_code(response_text):
    """Extract the source of `generate` from a Gemini response.

    Strategy:
      1. Return the first ```python fenced block that contains ``def generate(``.
      2. Otherwise fall back to un-fenced text: take the ``def generate(`` line
         plus the following lines that are blank or indented deeper than it,
         stopping at the first line that returns to the def's indentation.
         (The previous regex fallback used DOTALL and therefore swallowed all
         trailing prose up to the end of the response.)

    Args:
        response_text: Raw response text from the model (may be None or empty).

    Returns:
        str | None: Stripped source code, or None when no usable function
        (one containing a ``return``) is found.
    """
    if not response_text:
        return None

    code_blocks = re.findall(r'```python\s*\n(.*?)```', response_text, re.DOTALL)
    for block in code_blocks:
        if 'def generate(' in block:
            return block.strip()

    # Fallback: look for a def generate function without code fences.
    lines = response_text.split('\n')
    for start, line in enumerate(lines):
        stripped = line.lstrip()
        if not stripped.startswith('def generate('):
            continue

        def_indent = len(line) - len(stripped)
        body = [line]
        for following in lines[start + 1:]:
            text = following.lstrip()
            indent = len(following) - len(text)
            # A non-blank line back at (or left of) the def's indentation ends
            # the function, except for the ")" that closes a multi-line signature.
            if text and indent <= def_indent and not text.startswith(')'):
                break
            body.append(following)

        candidate = '\n'.join(body).strip()
        if 'return' in candidate:
            return candidate

    return None

# NOTE: this redefines extract_parameters from the extraction-test cell above;
# the later definition shadows the earlier one, so keep the two in sync.
def extract_parameters(response_text):
    """Extract the per-example parameter dicts from a Gemini response.

    The response is scanned line by line for "Example N:" headers. For each
    header, the content of the next ```python fenced block is parsed as a
    dict. Parsing is attempted in this order:
      1. ``ast.literal_eval`` (safe, Python literals),
      2. ``json.loads`` (safe, accepts JSON ``true`` / ``false`` / ``null``),
      3. ``eval`` (last resort, see the security note in the code).

    Args:
        response_text: Raw response text from the model (may be None or empty).

    Returns:
        list[dict]: Parsed parameter dicts in order of appearance. Blocks that
        cannot be parsed, or that do not evaluate to a dict, are skipped with
        a printed warning.
    """
    import json  # local import: only needed for the JSON fallback

    params = []
    if not response_text:
        return params

    lines = response_text.split('\n')
    i = 0

    while i < len(lines):
        # Only "Example N:" headers start a parameter section.
        if not re.match(r'Example \d+:', lines[i].strip()):
            i += 1
            continue

        # Look for the next ```python fence after the header.
        j = i + 1
        while j < len(lines) and not lines[j].strip().startswith('```python'):
            j += 1

        if j >= len(lines):
            # No fenced block follows this header.
            i += 1
            continue

        # Collect everything up to the closing fence.
        j += 1  # skip the ```python line itself
        param_lines = []
        while j < len(lines) and not lines[j].strip().startswith('```'):
            param_lines.append(lines[j])
            j += 1

        param_content = '\n'.join(param_lines).strip()
        if param_content:
            try:
                # Use ast.literal_eval for safe evaluation
                param = ast.literal_eval(param_content)
                if isinstance(param, dict):
                    params.append(param)
                else:
                    print(f"Warning: Parameter is not a dict: {param}")
            except (ValueError, SyntaxError, TypeError) as e:
                # TypeError covers e.g. unhashable dict keys inside the literal.
                print(f"Warning: Could not parse parameter: {param_content}")
                print(f"Error: {e}")

                fallback = None
                try:
                    # Safe fallback: the block may be JSON rather than Python.
                    fallback = json.loads(param_content)
                except ValueError:
                    # SECURITY NOTE(review): eval() runs model-generated text
                    # directly in the notebook kernel, outside the sandbox
                    # executor. Kept as a last resort for expressions such as
                    # `[0] * 7`; consider removing it or routing it through
                    # the sandbox.
                    try:
                        fallback = eval(param_content)
                    except Exception:
                        print(f"Fallback eval also failed for: {param_content}")

                if isinstance(fallback, dict):
                    params.append(fallback)

        # Resume scanning at the closing fence (or end of text).
        i = j

    return params

# Pull the generated function and the per-example parameter dicts out of the reply
gemini_code = extract_python_code(response.text)
gemini_params = extract_parameters(response.text)

code_status = '‚úÖ Found' if gemini_code else '‚ùå Not found'
print(f"\nüìù Extracted code: {code_status}")
print(f"üìù Extracted {len(gemini_params)} parameter sets")

if gemini_code:
    rule = "=" * 50
    print("\n" + rule)
    print("EXTRACTED PYTHON CODE:")
    print(rule)
    print(gemini_code)
    print(rule)

if not gemini_params:
    print("\n‚ö†Ô∏è  No parameters extracted. Check the response format.")
else:
    print("\nEXTRACTED PARAMETERS:")
    for number, params in enumerate(gemini_params, start=1):
        print(f"Example {number}: {params}")

Calling Gemini API with gemini-2.5-pro...

GEMINI RESPONSE:
```python
def generate(params):
    sequence = params["sequence"]
    size = params["size"]
    len_seq = len(sequence)

    grid = []
    for i in range(size):
        row = []
        for j in range(size):
            index = (i + j) % len_seq
            value = sequence[index]
            row.append(value)
        grid.append(row)

    return grid
```

Example 1:
```python
{"sequence": [2, 8, 3], "size": 7}
```

Example 2:
```python
{"sequence": [2, 4, 1], "size": 7}
```

Example 3:
```python
{"sequence": [4, 8, 3], "size": 7}
```

üìù Extracted code: ‚úÖ Found
üìù Extracted 3 parameter sets

EXTRACTED PYTHON CODE:
def generate(params):
    sequence = params["sequence"]
    size = params["size"]
    len_seq = len(sequence)

    grid = []
    for i in range(size):
        row = []
        for j in range(size):
            index = (i + j) % len_seq
            value = sequence[index]
            row.append(value)
        g

In [11]:
# Test the Gemini-generated code safely using our python-sandbox executor
# and compare every generated grid with the task's training output grid.
if gemini_code and gemini_params:
    print("üöÄ Testing Gemini-generated code with python-sandbox...")

    # Execute the code safely
    execution_results = execute_gemini_code_safely(gemini_code, gemini_params)

    print("\n" + "="*60)
    print("EXECUTION RESULTS")
    print("="*60)

    if not execution_results['execution_success']:
        print(f"‚ùå Execution failed: {execution_results['error']}")
    else:
        print("‚úÖ Code executed successfully!")
        print(f"‚úÖ Function 'generate' defined: {execution_results['function_defined']}")

        train_examples = task_data['train']
        test_results = execution_results['test_results']

        # Results are paired with training examples by position, so a count
        # mismatch means some grids cannot be checked at all.
        all_correct = len(test_results) == len(train_examples)
        if not all_correct:
            print(f"‚ùå Parameter count mismatch: got {len(test_results)}, expected {len(train_examples)}")

        # Test each example and compare with expected results
        correct_outputs = 0
        for i, test_result in enumerate(test_results):
            example_num = test_result['example']
            print(f"\n--- Example {example_num} ---")

            if not test_result['success']:
                print(f"‚ùå Test failed: {test_result['error']}")
                all_correct = False
                continue

            if i >= len(train_examples):
                # More parameter sets than training grids: nothing to compare
                # against (indexing here used to raise IndexError).
                print("‚ùå No expected grid for this example")
                all_correct = False
                continue

            generated_grid = test_result['result']
            expected_grid = train_examples[i]['output']

            print("Generated:")
            print(format_grid_for_prompt(generated_grid))
            print("Expected:")
            print(format_grid_for_prompt(expected_grid))

            # Compare grids
            grids_match = generated_grid == expected_grid
            print(f"Match: {'‚úÖ' if grids_match else '‚ùå'}")

            if grids_match:
                correct_outputs += 1
            else:
                all_correct = False

        print(f"\nüéØ Overall result: {'‚úÖ All examples correct!' if all_correct else '‚ùå Some examples incorrect'}")

        # Summary statistics
        successful_tests = sum(1 for test in test_results if test['success'])

        print(f"üìä Summary: {successful_tests}/{len(test_results)} tests ran successfully")
        print(f"üìä Accuracy: {correct_outputs}/{len(train_examples)} outputs correct")

else:
    if not gemini_code:
        print("‚ùå No Python code found in Gemini response")
    if not gemini_params:
        print("‚ùå No parameters found in Gemini response")
        print("üí° You may need to manually provide parameters for testing")

üöÄ Testing Gemini-generated code with python-sandbox...

EXECUTION RESULTS
‚úÖ Code executed successfully!
‚úÖ Function 'generate' defined: True

--- Example 1 ---
Generated:
2 8 3 2 8 3 2
8 3 2 8 3 2 8
3 2 8 3 2 8 3
2 8 3 2 8 3 2
8 3 2 8 3 2 8
3 2 8 3 2 8 3
2 8 3 2 8 3 2
Expected:
2 8 3 2 8 3 2
8 3 2 8 3 2 8
3 2 8 3 2 8 3
2 8 3 2 8 3 2
8 3 2 8 3 2 8
3 2 8 3 2 8 3
2 8 3 2 8 3 2
Match: ‚úÖ

--- Example 2 ---
Generated:
2 4 1 2 4 1 2
4 1 2 4 1 2 4
1 2 4 1 2 4 1
2 4 1 2 4 1 2
4 1 2 4 1 2 4
1 2 4 1 2 4 1
2 4 1 2 4 1 2
Expected:
2 4 1 2 4 1 2
4 1 2 4 1 2 4
1 2 4 1 2 4 1
2 4 1 2 4 1 2
4 1 2 4 1 2 4
1 2 4 1 2 4 1
2 4 1 2 4 1 2
Match: ‚úÖ

--- Example 3 ---
Generated:
4 8 3 4 8 3 4
8 3 4 8 3 4 8
3 4 8 3 4 8 3
4 8 3 4 8 3 4
8 3 4 8 3 4 8
3 4 8 3 4 8 3
4 8 3 4 8 3 4
Expected:
4 8 3 4 8 3 4
8 3 4 8 3 4 8
3 4 8 3 4 8 3
4 8 3 4 8 3 4
8 3 4 8 3 4 8
3 4 8 3 4 8 3
4 8 3 4 8 3 4
Match: ‚úÖ

üéØ Overall result: ‚úÖ All examples correct!
üìä Summary: 3/3 tests ran successfully
üìä Accuracy: 3/3 outp

In [None]:
import json
import os
from datetime import datetime
from tqdm import tqdm

# Directory for the JSONL logs. It lives in the parent directory so that it
# matches the default pattern of load_and_analyze_results()
# ("../logs/gemini_program_generation_*.jsonl") and the location announced by
# the final instructions cell; writing to "./logs" made the analysis miss it.
LOG_DIR = '../logs'
os.makedirs(LOG_DIR, exist_ok=True)

# Create timestamp for the log file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"{LOG_DIR}/gemini_program_generation_{timestamp}.jsonl"

# Load every task of the all_training subset
print("Loading all training tasks...")
all_tasks = loader.load_tasks_from_subset("all_training", "arc-agi-1")
print(f"Loaded {len(all_tasks)} tasks")

# Initialize counters for overall statistics
total_tasks = 0
successful_executions = 0
perfect_accuracy_tasks = 0
total_accuracy = 0.0

print(f"\nüöÄ Starting evaluation of {len(all_tasks)} tasks...")
print(f"üìù Logging results to: {log_file}")

# Process each task: prompt -> Gemini -> extract -> sandbox run -> score -> log
for task_idx, (task_id, task_data) in enumerate(tqdm(all_tasks, desc="Processing tasks")):
    # NOTE: counted before the skip below, so tasks without training examples
    # are included in the totals although nothing is logged for them.
    total_tasks += 1

    # Skip if no training examples
    if not task_data.get('train'):
        continue

    try:
        # Generate prompt for this task
        prompt = create_program_generation_prompt(task_data)

        # Call Gemini API
        response = client.models.generate_content(
            model="gemini-2.5-pro", 
            contents=prompt
        )

        # Extract code and parameters
        gemini_code = extract_python_code(response.text)
        gemini_params = extract_parameters(response.text)

        # Initialize log entry
        log_entry = {
            "task_id": task_id,
            "task_index": task_idx,
            "timestamp": datetime.now().isoformat(),
            "python_code": gemini_code,
            "parameters": gemini_params,
            "num_training_examples": len(task_data['train']),
            "extraction_success": bool(gemini_code and gemini_params),
            "execution_success": False,
            "accuracy_fraction": 0.0,
            "correct_examples": 0,
            "total_examples": len(task_data['train']),
            "error": None
        }

        # Test the code only when there is exactly one parameter set per
        # training example (results are matched to examples by position)
        if gemini_code and gemini_params and len(gemini_params) == len(task_data['train']):
            try:
                execution_results = execute_gemini_code_safely(gemini_code, gemini_params)

                if execution_results['execution_success']:
                    successful_executions += 1
                    log_entry["execution_success"] = True

                    # Calculate accuracy
                    correct_count = 0
                    for i, test_result in enumerate(execution_results['test_results']):
                        if test_result['success']:
                            generated_grid = test_result['result']
                            expected_grid = task_data['train'][i]['output']
                            if generated_grid == expected_grid:
                                correct_count += 1

                    accuracy = correct_count / len(task_data['train'])
                    log_entry["accuracy_fraction"] = accuracy
                    log_entry["correct_examples"] = correct_count
                    total_accuracy += accuracy

                    if accuracy == 1.0:
                        perfect_accuracy_tasks += 1

                else:
                    log_entry["error"] = execution_results.get('error', 'Unknown execution error')

            except Exception as e:
                log_entry["error"] = f"Testing error: {str(e)}"
        else:
            # Log extraction issues
            if not gemini_code:
                log_entry["error"] = "Failed to extract Python code"
            elif not gemini_params:
                log_entry["error"] = "Failed to extract parameters"
            elif len(gemini_params) != len(task_data['train']):
                log_entry["error"] = f"Parameter count mismatch: got {len(gemini_params)}, expected {len(task_data['train'])}"

        # Write log entry to file (appended per task so partial runs keep their results)
        with open(log_file, 'a') as f:
            f.write(json.dumps(log_entry) + '\n')

    except Exception as e:
        # Log any unexpected errors (API failures, non-serializable values, ...).
        # These rows carry fewer keys than a regular log entry.
        error_entry = {
            "task_id": task_id,
            "task_index": task_idx,
            "timestamp": datetime.now().isoformat(),
            "error": f"Unexpected error: {str(e)}",
            "execution_success": False,
            "accuracy_fraction": 0.0
        }
        with open(log_file, 'a') as f:
            f.write(json.dumps(error_entry) + '\n')

    # Print progress every 50 tasks
    if (task_idx + 1) % 50 == 0:
        current_avg_accuracy = total_accuracy / max(successful_executions, 1)
        print(f"\nüìä Progress after {task_idx + 1} tasks:")
        print(f"   ‚úÖ Successful executions: {successful_executions}/{total_tasks}")
        print(f"   üéØ Perfect accuracy tasks: {perfect_accuracy_tasks}")
        print(f"   üìà Average accuracy: {current_avg_accuracy:.3f}")

# Final statistics
rate_denominator = max(total_tasks, 1)  # avoids ZeroDivisionError when no tasks were loaded
print(f"\nüèÅ EVALUATION COMPLETE!")
print(f"üìÅ Results saved to: {log_file}")
print(f"\nüìä FINAL STATISTICS:")
print(f"   üìù Total tasks processed: {total_tasks}")
print(f"   ‚úÖ Successful executions: {successful_executions}/{total_tasks} ({successful_executions/rate_denominator*100:.1f}%)")
print(f"   üéØ Perfect accuracy tasks: {perfect_accuracy_tasks}/{total_tasks} ({perfect_accuracy_tasks/rate_denominator*100:.1f}%)")

if successful_executions > 0:
    final_avg_accuracy = total_accuracy / successful_executions
    print(f"   üìà Average accuracy (successful tasks): {final_avg_accuracy:.3f}")
else:
    print(f"   üìà Average accuracy: N/A (no successful executions)")

In [None]:
import pandas as pd
import glob

def load_and_analyze_results(log_file_pattern="../logs/gemini_program_generation_*.jsonl"):
    """Load the most recent JSONL evaluation log and print summary statistics.

    Args:
        log_file_pattern: Glob pattern for candidate log files. Among the
            matches, the file with the greatest os.path.getctime is used.

    Returns:
        pandas.DataFrame | None: One row per logged task, or None when no file
        matches the pattern. An empty log yields an empty DataFrame. Columns
        missing from the log ('extraction_success', 'execution_success',
        'accuracy_fraction', 'error') are added with neutral defaults.
    """
    # Find the most recent log file
    log_files = glob.glob(log_file_pattern)
    if not log_files:
        print("No log files found!")
        return None

    latest_log = max(log_files, key=os.path.getctime)
    print(f"üìÅ Loading results from: {latest_log}")

    # Load all entries from the JSONL file (blank lines are ignored)
    results = []
    with open(latest_log, 'r') as f:
        for line in f:
            if line.strip():
                results.append(json.loads(line))

    df = pd.DataFrame(results)
    print(f"üìä Loaded {len(df)} task results")

    if df.empty:
        # Nothing to summarize; the percentages below would divide by zero.
        print("No task results to analyze.")
        return df

    # Rows written by the evaluation loop's "unexpected error" path carry only
    # a subset of the keys, so columns may be missing or contain NaN.
    column_defaults = {
        'extraction_success': False,
        'execution_success': False,
        'accuracy_fraction': 0.0,
        'error': None,
    }
    for column, default in column_defaults.items():
        if column not in df.columns:
            df[column] = default
    for column in ('extraction_success', 'execution_success'):
        df[column] = df[column].eq(True)  # NaN / missing -> False, result is bool dtype
    df['accuracy_fraction'] = pd.to_numeric(df['accuracy_fraction'], errors='coerce').fillna(0.0)

    total = len(df)

    # Basic statistics
    print(f"\nüîç ANALYSIS:")

    # Extraction success rate
    extraction_success = df['extraction_success'].sum()
    print(f"   üìù Code/parameter extraction: {extraction_success}/{total} ({extraction_success/total*100:.1f}%)")

    # Execution success rate
    execution_success = df['execution_success'].sum()
    print(f"   ‚úÖ Successful executions: {execution_success}/{total} ({execution_success/total*100:.1f}%)")

    # Perfect accuracy tasks
    perfect_tasks = (df['accuracy_fraction'] == 1.0).sum()
    print(f"   üéØ Perfect accuracy tasks: {perfect_tasks}/{total} ({perfect_tasks/total*100:.1f}%)")

    # Average accuracy for successful executions
    successful_df = df[df['execution_success']]
    if len(successful_df) > 0:
        avg_accuracy = successful_df['accuracy_fraction'].mean()
        print(f"   üìà Average accuracy (successful): {avg_accuracy:.3f}")

        # Accuracy distribution
        print(f"\nüìä ACCURACY DISTRIBUTION (successful executions):")
        # The bins below are half-open (low, high], so accuracy exactly 0.0
        # needs its own line or those tasks would not be counted anywhere.
        zero_count = (successful_df['accuracy_fraction'] == 0.0).sum()
        print(f"   accuracy = 0.0: {zero_count} tasks")
        accuracy_bins = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
        for low, high in zip(accuracy_bins[:-1], accuracy_bins[1:]):
            count = ((successful_df['accuracy_fraction'] > low) &
                     (successful_df['accuracy_fraction'] <= high)).sum()
            print(f"   {low:.1f} < accuracy ‚â§ {high:.1f}: {count} tasks")

    # Common error types
    error_df = df[df['error'].notna()]
    if len(error_df) > 0:
        print(f"\n‚ùå COMMON ERRORS:")
        error_counts = error_df['error'].value_counts().head(5)
        for error, count in error_counts.items():
            message = str(error)
            # Only mark the message as truncated when it really was.
            shown = message[:80] + "..." if len(message) > 80 else message
            print(f"   {count:3d}x: {shown}")

    return df

# You can call this function after running the evaluation to analyze results
print("üí° Analysis function loaded. Call load_and_analyze_results() to analyze the latest results.")

In [None]:
# Make sure pandas can be imported, installing it into the kernel's interpreter if needed
try:
    import pandas as pd
except ImportError:
    print("üì¶ Installing pandas...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas"])
    import pandas as pd
    print("‚úÖ pandas installed and loaded")
else:
    print("‚úÖ pandas is available")

# Usage instructions and a description of the fields stored per log entry
divider = "=" * 60
for message in (
    "\nüöÄ READY TO RUN FULL EVALUATION!",
    divider,
    "To start the evaluation of all 400 tasks:",
    "1. Run the evaluation cell above",
    "2. Wait for completion (this may take a while)",
    "3. Run load_and_analyze_results() to see the summary",
    divider,
    "üìù Results will be saved to: ../logs/gemini_program_generation_[timestamp].jsonl",
    "üí° Each task attempt includes:",
    "   - task_id: The ARC task identifier",
    "   - python_code: Generated Python function",
    "   - parameters: List of parameter dicts for each example",
    "   - accuracy_fraction: Fraction of examples that were correct",
    "   - execution_success: Whether the code ran without errors",
    "   - error: Any error messages if things went wrong",
):
    print(message)