In [1]:
from utils.dsl import *
from utils.constants import *
import json
import re
import numpy as np
import os
from pathlib import Path


## Manual Verification: Test a Single Solver Function

Use the cells below to manually test one solver function on a specific example of a specific task.

**Configure these variables:**
- `task_id`: The ARC task identifier (e.g., '0934a4d8')
- `example_index`: Which example to test (0-based index)
- `test_or_train`: Dataset to use ('train' or 'test')
- Paste your solver function into the `solve()` function below


In [None]:
def solve(I):
    """Solver for one ARC task, written against the helpers from ``utils.dsl``.

    What the code does, in order:
      1. Picks a cell of color 2 and uses its column ``C`` as the axis.
      2. On column ``C``, between the outermost rows holding a 7 (or the
         color-2 cell itself), recolors every 1 to 2.
      3. Passes every color-7 cell that is not on column ``C`` to ``cover``.
      4. Re-paints the left group of 7s flush with column 0 and the right
         group flush with column ``W - 1``; each group is aligned to row 0
         when the color-2 cell is in the upper half (``r_red <= H // 2``),
         otherwise to row ``H - 1``.

    NOTE(review): ``shape``, ``ofcolor``, ``index``, ``fill``, ``cover``,
    ``minimum`` and ``maximum`` come from ``utils.dsl``; their exact
    semantics are assumed from how they are used here - confirm against
    that module.
    """
    # Grid extents.
    dims = shape(I)
    H, W = dims[0], dims[1]

    # One cell of color 2 defines the axis column C
    # (the solver assumes there is exactly one such cell).
    r_red, C = next(iter(ofcolor(I, 2)))

    # Rows of interest on column C: the marker row plus every row holding a 7.
    span_rows = [r_red] + [r for r in range(H) if index(I, (r, C)) == 7]
    span_top, span_bottom = min(span_rows), max(span_rows)

    # Inside that span, turn the 1-cells of column C into 2.
    recolor_cells = frozenset(
        (r, C) for r in range(span_top, span_bottom + 1) if index(I, (r, C)) == 1
    )
    with_line = fill(I, 2, recolor_cells)

    # Color-7 cells strictly left / strictly right of column C.
    sevens = ofcolor(I, 7)
    left_7s = frozenset(cell for cell in sevens if cell[1] < C)
    right_7s = frozenset(cell for cell in sevens if cell[1] > C)

    # Remove the off-axis 7s from their original positions
    # (presumably `cover` resets them to the background color - TODO confirm).
    cleared = cover(with_line, left_7s | right_7s)

    # The marker's row decides whether shapes are aligned to the top or bottom.
    is_top = r_red <= H // 2

    def _docked(cells, against_right_wall):
        """Translate `cells` to the chosen side wall and top/bottom edge."""
        if not cells:
            return frozenset()
        cols = frozenset(cell[1] for cell in cells)
        if against_right_wall:
            dc = (W - 1) - maximum(cols)
        else:
            dc = 0 - minimum(cols)
        rows = frozenset(cell[0] for cell in cells)
        top, bottom = minimum(rows), maximum(rows)
        dr = (0 - top) if is_top else ((H - 1) - bottom)
        # Keep only the translated cells that still fall inside the grid.
        return frozenset(
            (r + dr, c + dc) for r, c in cells
            if 0 <= r + dr < H and 0 <= c + dc < W
        )

    moved_left = _docked(left_7s, False)
    moved_right = _docked(right_7s, True)

    # Paint both relocated shapes with 7; the 7s on column C were never removed.
    return fill(fill(cleared, 7, moved_left), 7, moved_right)

In [10]:
# ARC task identifier; selects the `<task_id>.json` file that open_and_solve_example loads.
task_id = 'e376de54'
# 0-based index of the example inside the chosen split.
example_index = 0
# Which split of the task file to read the example from.
test_or_train = 'test' # 'train' or 'test'

def open_and_solve_example(task_id, example_index, solver, set='train',
                           data_dir='../data_v2/evaluation'):
    """Load one example of an ARC task and run ``solver`` on its input grid.

    Args:
        task_id: Task identifier; the task is read from ``<data_dir>/<task_id>.json``.
        example_index: 0-based index of the example inside the chosen split.
        solver: Callable that takes the input grid (a tuple of row tuples)
            and returns the predicted output grid.
        set: Key of the split to read, 'train' or 'test'. (The name shadows
            the builtin but is kept because callers pass it by keyword.)
        data_dir: Directory containing the task JSON files.

    Returns:
        ``(I, expected, output)`` where ``I`` is the input grid as a tuple of
        tuples, ``expected`` is the output grid exactly as stored in the JSON
        (nested lists) and ``output`` is whatever ``solver`` returned.
        On any failure a message naming the cause is printed and
        ``(None, None, None)`` is returned instead of raising.
    """
    path = Path(data_dir) / f'{task_id}.json'
    if not path.exists():
        print(f"Error processing {task_id} example {example_index}: task file not found: {path}")
        return None, None, None
    try:
        with path.open('r', encoding='utf-8') as f:
            task = json.load(f)
        example = task[set][example_index]
        # Solvers expect an immutable grid, so convert the JSON lists to tuples.
        I = tuple(tuple(row) for row in example['input'])
        expected = example['output']
        output = solver(I)
        return I, expected, output
    except Exception as exc:
        # Best-effort by design: report what went wrong and let the caller
        # detect the failure through the None results.
        print(f"Error processing {task_id} example {example_index}: {type(exc).__name__}: {exc}")
        return None, None, None

def verify_output(expected, output):
    """Print whether ``output`` equals ``expected``, showing both grids on mismatch.

    Args:
        expected: Expected grid (nested sequence), or None.
        output: Grid produced by the solver (nested sequence or NumPy array), or None.

    Prints "No output to verify" and returns when either argument is None.
    Nothing is returned in any case.
    """
    # Identity checks: `== None` on a NumPy array compares element-wise and
    # makes the `or` below raise "truth value of an array is ambiguous".
    if expected is None or output is None:
        print("No output to verify")
        return
    expected_arr = np.array(expected)
    output_arr = np.array(output)
    # array_equal is False (rather than broadcasting) when the shapes differ.
    if np.array_equal(expected_arr, output_arr):
        print("Output matches expected!")
    else:
        print("Output does NOT match expected.")
        print("Expected:")
        print(expected_arr)
        print("Got:")
        print(output_arr)

In [12]:
# Run programs and verify if output matches
# NOTE(review): binding the name `input` here shadows the Python builtin `input()`
# for the rest of the kernel session.
input, expected, output = open_and_solve_example(task_id, example_index, solve, set=test_or_train)
verify_output(expected, output)

Output matches expected!


In [5]:
def pad_grid(grid,height, width, fill):
    """Return a ``height`` x ``width`` copy of ``grid``, padded with ``fill``.

    Cells that exist in ``grid`` (judged by its row count and the length of
    its first row) are copied; every other cell receives ``fill``. If the
    target size is smaller than ``grid`` the result is cropped instead.

    Args:
        grid: Source grid; must be a tuple of rows.
        height: Number of rows in the result.
        width: Number of columns in the result.
        fill: Value written to cells that lie outside ``grid``.

    Returns:
        A tuple of row tuples.

    Raises:
        AssertionError: if ``grid`` is not a tuple, or if the result fails
            the self-checks performed before returning.
    """
    assert isinstance(grid, tuple)
    src_h = len(grid)
    src_w = len(grid[0])

    padded = tuple(
        tuple(grid[r][c] if (c < src_w and r < src_h) else fill for c in range(width))
        for r in range(height)
    )

    # Sanity-check the result against the source before handing it back.
    if src_w == width and src_h == height:
        assert padded == grid
    else:
        assert len(padded) == height
        assert len(padded[0]) == width
        for r, row in enumerate(padded):
            for c, value in enumerate(row):
                wanted = grid[r][c] if (r < src_h and c < src_w) else fill
                assert value == wanted
    return padded
def calculate_grid_similarity(predicted: Any, expected: Any) -> float:
    """
    Score how closely ``predicted`` matches ``expected``, cell by cell.

    Lists are first converted to tuples. When the two grids differ in size,
    both are padded to the larger height/width with different sentinel values
    (-1 and -2), so padded cells never count as matches.

    Returns:
        Fraction of matching cells, from 0.0 (no match) to 1.0 (exact match).
        0.0 is also returned for non-grid inputs, empty grids, and whenever
        any exception occurs during the comparison.
    """
    def _as_tuple_grid(grid):
        # Only lists are converted; anything else is passed through untouched.
        if isinstance(grid, list):
            return tuple(tuple(row) if isinstance(row, list) else row for row in grid)
        return grid

    try:
        predicted = _as_tuple_grid(predicted)
        expected = _as_tuple_grid(expected)

        if not (isinstance(predicted, tuple) and isinstance(expected, tuple)):
            return 0.0

        rows_p, rows_e = len(predicted), len(expected)
        if rows_p == 0 or rows_e == 0:
            return 0.0

        cols_p, cols_e = len(predicted[0]), len(expected[0])

        if (rows_p, cols_p) != (rows_e, cols_e):
            # Size mismatch: compare on the bounding size of both grids.
            rows, cols = max(rows_p, rows_e), max(cols_p, cols_e)
            predicted = pad_grid(grid=predicted, height=rows, width=cols, fill=-1)
            expected = pad_grid(grid=expected, height=rows, width=cols, fill=-2)
        else:
            rows, cols = rows_p, cols_p

        cell_count = rows * cols
        if cell_count <= 0:
            return 0.0

        matches = sum(
            1
            for r in range(rows)
            for c in range(cols)
            if predicted[r][c] == expected[r][c]
        )
        return matches / cell_count

    except Exception:
        return 0.0


In [11]:
# Run programs and verify if output matches
# NOTE(review): this cell repeats the earlier run-and-verify cell verbatim, and the
# name `input` shadows the Python builtin `input()` for the rest of the kernel session.
input, expected, output = open_and_solve_example(task_id, example_index, solve, set=test_or_train)
verify_output(expected, output)

Output matches expected!


## Batch Verification: Test All Solver Functions from Log Files

Automatically extracts and tests solver functions from log files on both train and test datasets.

**Configure these variables:**
- `logs_dir`: Path to directory containing log files with solver functions
- `train_indices_count` / `test_indices_count`: Ranges of example indices to test on the train and test sets (e.g., `range(0, 6)` tests indices 0-5)

The script will:
1. Extract solver functions from summary files in the log directory
2. Test each function on all specified example indices of its own task, for both the train and test sets
3. Print detailed results and summary statistics for each dataset


In [7]:
# Specify directory containing log files
logs_dir = Path('../logs/exp_name:testing_noreasoning_dsl_k4_similar_fewshot_repair')


# Find all summary files (every .txt file in logs_dir whose name contains 'summary')
summary_files = [f for f in logs_dir.glob('*.txt') if 'summary' in f.name
                #  and 'repair' not in f.name   # optional extra filter: uncomment to skip files with 'repair' in the name
                ]
print(f"Found {len(summary_files)} summary files")

# Settings
test_indices_count = range(0, 3)  # Test-set example indices 0 to 2
train_indices_count = range(0, 10)  # Train-set example indices 0 to 9 (the range end is exclusive)

def test_solver_on_dataset(summary_files, dataset_type, test_these_examples):
    """Test all solver functions on a specific dataset (train or test)."""
    # Track results per file across all indices
    file_results = {}
    
    for file_path in summary_files:
        filename = file_path.name
        task_id_local = filename.split('_')[0]
        
        file_results[filename] = {
            'executed_all': True,
            'correct_all': True,
            'results_per_index': {},
            'similarity_correct_all': True,
            'dissimilarity': False
        }
        
        try:
            # Read the file content
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Load task to check available tasks
            try:
                with open(f'../data_v2/evaluation/{task_id_local}.json') as f:
                    task = json.load(f)
            except FileNotFoundError:
                print(f"Skipping {filename}: task file not found")
                file_results[filename]['executed_all'] = False
                file_results[filename]['correct_all'] = False
                file_results[filename]['similarity_correct_all'] = False
                continue
            
            # Extract solver function code
            gen_code_idx = content.find('CODE:')
            if gen_code_idx == -1: 
                print(f"Skipping {filename}: markers not found")
                file_results[filename]['executed_all'] = False
                file_results[filename]['correct_all'] = False
                file_results[filename]['similarity_correct_all'] = False
                continue
            
            code_section = content[gen_code_idx:]
            def_match = re.search(r'\bdef\s+\w+', code_section)
            
            if not def_match:
                print(f"Skipping {filename}: no function definition found")
                file_results[filename]['executed_all'] = False
                file_results[filename]['correct_all'] = False
                file_results[filename]['similarity_correct_all'] = False
                continue
            
            # Extract function body
            def_start = def_match.start()
            return_matches = list(re.finditer(r'\n\s*return\s+.*', code_section[def_start:]))
            
            if not return_matches:
                print(f"Skipping {filename}: no return statement found")
                file_results[filename]['executed_all'] = False
                file_results[filename]['correct_all'] = False
                file_results[filename]['similarity_correct_all'] = False
                continue
            
            last_return = return_matches[-1]
            function_end = def_start + last_return.end()
            solver_code = code_section[def_start:function_end]
            
            # Create a fresh namespace with all globals for this solver
            exec_namespace = dict(globals())
            
            # Execute the solver function
            exec(solver_code, exec_namespace)
            func_name_match = re.search(r'def\s+(\w+)', solver_code)
            
            if not func_name_match:
                print(f"Skipping {filename}: could not extract function name")
                file_results[filename]['executed_all'] = False
                file_results[filename]['correct_all'] = False
                file_results[filename]['similarity_correct_all'] = False
                continue
            
            func_name = func_name_match.group(1)
            solver_func = exec_namespace[func_name]
            
            # Test across all task indices
            print(f"\nTesting {filename}:")
            correct = 0
            for test_task_index in test_these_examples:
                # Check if task_index exists
                if len(task[dataset_type]) <= test_task_index:
                    print(f"  Index {test_task_index}: SKIP (not available)")
                    file_results[filename]['results_per_index'][test_task_index] = {
                        'executed': False,
                        'correct': False,
                        'skipped': True
                    }
                    continue
                
                try:
                    # Solve task
                    I, expected, output = open_and_solve_example(task_id_local, test_task_index, solver_func, set=dataset_type)
                    auto_similarity = calculate_grid_similarity(output, expected)
                    if auto_similarity == 1.0:
                        correct += 1
                        print (f"Calculate grid similarity: 1.0 for index task {test_task_index}")
                    else:    
                        file_results[filename]['similarity_correct_all'] = False
                    # Check if output matches expected
                    diffs = np.where(np.array(expected) != np.array(output))
                    is_correct = not diffs[0].size
                    
                    file_results[filename]['results_per_index'][test_task_index] = {
                        'executed': True,
                        'correct': is_correct,
                        'skipped': False
                    }
                    
                    if is_correct:
                        print(f"  Index {test_task_index}: ✓ Correct")
                    else:
                        print(f"  Index {test_task_index}: ✗ Incorrect")
                        file_results[filename]['correct_all'] = False
                        
                except Exception as e:
                    print(f"  Index {test_task_index}: ✗ Error - {type(e).__name__}")
                    file_results[filename]['results_per_index'][test_task_index] = {
                        'executed': False,
                        'correct': False,
                        'skipped': False
                    }
                    file_results[filename]['executed_all'] = False
                    file_results[filename]['correct_all'] = False
            if file_results[filename]['correct_all']!=file_results[filename]['similarity_correct_all']:
                file_results[filename]['dissimilarity'] = True
                            
        except Exception as e:
            print(f"✗ {filename}: Fatal error - {type(e).__name__}: {str(e)[:100]}")
            file_results[filename]['executed_all'] = False
            file_results[filename]['correct_all'] = False
            file_results[filename]['similarity_correct_all'] = False
    
    return file_results

def print_summary(file_results, dataset_type, test_these_examples, total_files):
    """Print aggregate and per-index statistics for one dataset split.

    Args:
        file_results: Dict of per-file records as returned by
            test_solver_on_dataset (keys 'executed_all', 'correct_all',
            'similarity_correct_all', 'dissimilarity', 'results_per_index').
        dataset_type: Split name shown (upper-cased) in the header.
        test_these_examples: Example indices to break the results down by.
        total_files: Number reported as "Total files".
    """
    records = list(file_results.values())
    rule = '=' * 60

    def _flagged(key):
        # Number of files whose record has a truthy value for `key`.
        return sum(1 for rec in records if rec[key])

    def _per_index(idx, field):
        # Number of files that have an entry for `idx` with a truthy `field`.
        return sum(
            1 for rec in records
            if idx in rec['results_per_index'] and rec['results_per_index'][idx][field]
        )

    n_executed = _flagged('executed_all')
    n_correct = _flagged('correct_all')
    n_similar = _flagged('similarity_correct_all')

    # Per-index statistics
    per_index_stats = {
        idx: {field: _per_index(idx, field) for field in ('executed', 'correct', 'skipped')}
        for idx in test_these_examples
    }

    print(f"\n{rule}")
    print(f"SUMMARY FOR {dataset_type.upper()} SET:")
    print(rule)
    print(f"Total files: {total_files}")
    print(f"Programs that executed without error for all examples: {n_executed}")
    print(f"Programs with correct output for all examples: {n_correct}")
    print(f"Programs with similarity correct output for all examples: {n_similar}")
    print(f"Dissiimilarity count: {_flagged('dissimilarity')}")
    print(f"\nPER-EXAMPLE-INDEX BREAKDOWN:")
    for idx in test_these_examples:
        stats = per_index_stats[idx]
        print(f"  Index {idx}: {stats['executed']} executed, {stats['correct']} correct, {stats['skipped']} skipped")
    print(rule)

# Evaluate every extracted solver on the train examples.
print("\n" + "="*60, "TESTING ON TRAIN SET", "="*60, sep="\n")
train_results = test_solver_on_dataset(
    summary_files=summary_files,
    dataset_type='train',
    test_these_examples=train_indices_count,
)

# Evaluate the same solvers on the test examples.
print("\n" + "="*60, "TESTING ON TEST SET", "="*60, sep="\n")
test_results = test_solver_on_dataset(
    summary_files=summary_files,
    dataset_type='test',
    test_these_examples=test_indices_count,
)

# Report the aggregate statistics for both splits, after all per-file logs.
print_summary(
    file_results=train_results,
    dataset_type='train',
    test_these_examples=train_indices_count,
    total_files=len(summary_files),
)
print_summary(
    file_results=test_results,
    dataset_type='test',
    test_these_examples=test_indices_count,
    total_files=len(summary_files),
)


Found 222 summary files

TESTING ON TRAIN SET

Testing 332f06d7_selection_summary.txt:
Calculate grid similarity: 1.0 for index task 0
  Index 0: ✓ Correct
  Index 1: ✗ Incorrect
  Index 2: ✗ Incorrect
Calculate grid similarity: 1.0 for index task 3
  Index 3: ✓ Correct
  Index 4: SKIP (not available)
  Index 5: SKIP (not available)
  Index 6: SKIP (not available)
  Index 7: SKIP (not available)
  Index 8: SKIP (not available)
  Index 9: SKIP (not available)

Testing b10624e5_selection_summary.txt:
Error processing b10624e5 example 0
  Index 0: ✗ Error - ValueError
Error processing b10624e5 example 1
  Index 1: ✗ Error - ValueError
  Index 2: SKIP (not available)
  Index 3: SKIP (not available)
  Index 4: SKIP (not available)
  Index 5: SKIP (not available)
  Index 6: SKIP (not available)
  Index 7: SKIP (not available)
  Index 8: SKIP (not available)
  Index 9: SKIP (not available)

Testing 7b80bb43_selection_summary.txt:
  Index 0: ✗ Incorrect
  Index 1: ✗ Incorrect
  Index 2: SKIP (

# Experiment Tracking Table

| Exp # | Model | DSL | Similar | Few Shot | Repair | Train Acc | Test Acc | Train Ex 0 | Train Ex 1 | Train Ex 2 | Train Ex 3 | Train Ex 4 | Train Ex 5 | Test Ex 0 | Test Ex 1 | Test Ex 2 |
|-------|-------|-----|---------|----------|--------|-----------|----------|------------|------------|------------|------------|------------|------------|-----------|-----------|-----------|
| 1     | Grok 4 Fast | ✓ | ✓ | ✓ | ✓ | 64/222 | 19/222 | 92/192(0) | 92/188(0) | 71/142(62) | 20/40(174) | 4/12(206) | 1/1(217) | 23/178(0) | 6/69(132) | 1/4(214) |
| 1a    | Grok 4 Fast | ✓ | ✓ | ✓ | ✓ (summary only) | 32/120 | 15/120 | 45/101(0) | 48/100(0) | 37/76(32) | 11/22(93) | 3/7(110) | 1/1(116) | 18/96(0) | 3/35(73) | 0/2(115) |
| 1b    | Grok 4 Fast | ✓ | ✓ | ✓ | ✓ (repair only) | 32/102 | 4/102 | 47/91(0) | 44/88(0) | 34/66(30) | 9/18(81) | 1/5(96) | 0/0(101) | 5/82(0) | 3/34(59) | 1/2(99) |
| 2     | Grok 4 Fast | ✓ | ✓ | ✓ | ✗ | 34/120 | 18/120 | 45/99(0) | 48/101(0) | 39/78(33) | 10/23(93) | 4/7(111) | 1/1(117) | 21/95(0) | 9/34(73) | 1/2(116) |
| 3     | Grok 4 Fast | ✗ | ✓ | ✓ | ✓ | 42/221 | 14/221 | 60/88(0) | 53/85(0) | 41/68(60) | 16/23(169) | 1/5(202) | 1/1(212) | 16/75(0) | 8/21(132) | 0/2(209) | 
| 3a    | Grok 4 Fast | ✗ | ✓ | ✓ | ✓ (summary only) | 26/120 | 13/120 | 34/51(0) | 31/50(0) | 25/40(32) | 9/14(92) | 1/3(110) | 1/1(115) | 15/45(0) | 5/13(74) | 0/1(114) |
| 3b    | Grok 4 Fast | ✗ | ✓ | ✓ | ✓ (repair only) | 16/101 | 1/101 | 26/37(0) | 22/35(0) | 16/28(28) | 7/9(77) | 0/2(92) | 0/0(97) | 1/30(0) | 3/8(58) | 0/1(95) |
| 4     | Grok 4 Fast | ✗ | ✓ | ✓ | ✗ | 23/120 | 11/120 | 30/48(0) | 30/48(0) | 22/37(31) | 8/13(89) | 2/5(107) | 1/1(112) | 12/45(0) | 3/10(70) | 0/1(111) |
| 5     | Grok 4 Fast | ✗ | ✗ | ✓ | ✗ | 22/120 | 7/120 | 29/54(0) | 26/56(0) | 24/43(32) | 7/16(92) | 2/3(110) | 1/1(116) | 9/50(0) | 3/14(73) | 0/1(115) |
| 6     | Grok 4 Fast | ✓ | ✓ | ✗ | ✗ | 31/120 | 14/120 | 46/102(0) | 47/100(0) | 39/74(33) | 12/20(93) | 2/6(110) | 1/1(116) | 17/90(0) | 6/33(73) | 0/2(115) |
| 7     | Grok 4.1 Fast | ✓ | ✓ | ✓ | ✓ | 42/182 | 19/182 | 56/117(0) | 59/116(0) | 42/83(34) | 12/22(102) | 3/7(119) | 1/1(126) | 21/104(0) | 7/37(81) | 0/4(123) |
| 7a    | Grok 4.1 Fast | ✓ | ✓ | ✓ | ✓ (summary only) | 22/120 | 11/120 | 30/67(0) | 29/66(0) | 24/46(20) | 7/13(58) | 3/5(67) | 1/1(72) | 11/59(0) | 3/20(48) | 0/2(71) |
| 7b    | Grok 4.1 Fast | ✓ | ✓ | ✓ | ✓ (repair only) | 20/62 | 8/62 | 26/50(0) | 30/50(0) | 18/37(14) | 5/9(44) | 0/2(52) | 0/0(54) | 10/45(0) | 4/17(33) | 0/2(52) |

## Notes
- **DSL**: Domain-Specific Language enabled
- **Similar**: Similarity-based features enabled
- **Few Shot**: Few-shot learning enabled
- **Repair**: Repair mechanism enabled
- **Train/Test Acc**: Format is `correct/total` (e.g., 22/120 = 22 programs correct on all examples out of 120 total)
- **Ex columns**: Format is `correct/executed(skipped)` for each example index
  - Example: `92/192(0)` means 92 correct, 192 executed, 0 skipped
- **Sub-rows (1a, 1b, 3a, 3b, 7a, 7b)**: For repair experiments, breakdown of summary vs repair files
  - Main row shows all files (summary + repair)
  - "summary only" = initial solver attempts
  - "repair only" = repaired solver attempts

## Key Findings
- **Best train accuracy**: Exp 1 (DSL+Similar+FewShot+Repair) with 64/222 (28.8%)
- **Best test accuracy**: Exp 1 (DSL+Similar+FewShot+Repair) with 19/222 (8.6%)
- **DSL impact**: Comparing Exp 2 vs 4 (both Similar+FewShot, no Repair): DSL improves train (34 vs 23) and test (18 vs 11)
- **Similar impact**: Comparing Exp 4 vs 5 (both no DSL, FewShot, no Repair): Similar improves train slightly (23 vs 22) and test more clearly (11 vs 7)
- **Repair impact**: Generally adds more attempts but mixed results on final accuracy