In [4]:
import json
import ast
import tokenize
import io
from collections import Counter, defaultdict
import numpy as np
from itertools import combinations
import pandas as pd

# Load the data
with open('soar-program-samples.json', 'r') as f:
    data = json.load(f)

# Filter for task_id 31aa019c
programs_31aa = [item for item in data if item['task_id'] == '31aa019c']

print(f"Found {len(programs_31aa)} programs for task_id '31aa019c'")
print("\nModels:")
for i, prog in enumerate(programs_31aa):
    print(f"  {i}: {prog['model']}")

Found 20 programs for task_id '31aa019c'

Models:
  0: Mistral-Large-Instruct-2407
  1: Mistral-Large-Instruct-2407
  2: Qwen2.5-72B-Instruct
  3: Mistral-Large-Instruct-2407
  4: Mistral-Large-Instruct-2407
  5: Mistral-Large-Instruct-2407
  6: Mistral-Large-Instruct-2407
  7: Mistral-Large-Instruct-2407
  8: Mistral-Large-Instruct-2407
  9: Mistral-Large-Instruct-2407
  10: Mistral-Large-Instruct-2407
  11: Mistral-Large-Instruct-2407
  12: Mistral-Large-Instruct-2407
  13: Qwen2.5-Coder-32B-Instruct
  14: Qwen2.5-72B-Instruct
  15: Mistral-Large-Instruct-2407
  16: Qwen2.5-72B-Instruct
  17: Mistral-Large-Instruct-2407
  18: Qwen2.5-Coder-32B-Instruct
  19: Mistral-Large-Instruct-2407


In [5]:
def tokenize_python_code(code_string):
    """
    Tokenize Python code and return a list of meaningful tokens.
    We'll normalize some tokens to make comparison more robust.
    """
    tokens = []
    
    try:
        # Parse the code using Python's tokenize module
        tokens_iter = tokenize.tokenize(io.BytesIO(code_string.encode('utf-8')).readline)
        
        for tok in tokens_iter:
            # Skip encoding, newline, and comment tokens
            if tok.type in [tokenize.ENCODING, tokenize.NEWLINE, tokenize.NL, tokenize.COMMENT]:
                continue
            
            # Skip pure whitespace    
            if tok.type == tokenize.INDENT or tok.type == tokenize.DEDENT:
                continue
                
            token_string = tok.string.strip()
            if not token_string:
                continue
                
            # Normalize some tokens
            if tok.type == tokenize.NUMBER:
                # Keep literal integer values, abstract floats
                try:
                    if '.' in token_string:
                        tokens.append('<FLOAT>')
                    else:
                        # Keep literal integer values
                        tokens.append(token_string)
                except:
                    tokens.append('<NUMBER>')
            elif tok.type == tokenize.STRING:
                # Classify strings by type rather than content
                tokens.append('<STRING>')
            elif tok.type == tokenize.NAME:
                # Keep variable/function names as-is since they're meaningful
                tokens.append(token_string)
            else:
                # Keep operators, keywords, etc.
                tokens.append(token_string)
                
    except tokenize.TokenError as e:
        print(f"Tokenization error: {e}")
        # Fallback: simple split on whitespace and common separators
        import re
        tokens = re.findall(r'\w+|[^\w\s]', code_string)
    
    return tokens

# Test the tokenization function
test_code = programs_31aa[0]['code']
test_tokens = tokenize_python_code(test_code)
print("Sample tokenization (first 20 tokens):")
print(test_tokens[:20])

Sample tokenization (first 20 tokens):
['def', 'transform', '(', 'grid_lst', ':', 'list', '[', 'list', '[', 'int', ']', ']', ')', '->', 'list', '[', 'list', '[', 'int', ']']


In [6]:
def get_token_distribution(tokens):
    """Convert token list to normalized distribution (frequencies)"""
    counter = Counter(tokens)
    total = sum(counter.values())
    return {token: count/total for token, count in counter.items()}

def jensen_shannon_divergence(dist1, dist2):
    """
    Calculate Jensen-Shannon divergence between two probability distributions.
    This is a symmetric metric (unlike KL divergence) with range [0, 1].
    """
    # Get all unique tokens from both distributions
    all_tokens = set(dist1.keys()) | set(dist2.keys())
    
    # Convert to arrays with same ordering
    p = np.array([dist1.get(token, 0) for token in all_tokens])
    q = np.array([dist2.get(token, 0) for token in all_tokens])
    
    # Calculate M = (P + Q) / 2
    m = (p + q) / 2
    
    # Avoid log(0) by adding small epsilon
    epsilon = 1e-10
    p = np.maximum(p, epsilon)
    q = np.maximum(q, epsilon)
    m = np.maximum(m, epsilon)
    
    # Calculate JS divergence
    kl_pm = np.sum(p * np.log(p / m))
    kl_qm = np.sum(q * np.log(q / m))
    js_div = 0.5 * kl_pm + 0.5 * kl_qm
    
    return js_div

def cosine_similarity(dist1, dist2):
    """Calculate cosine similarity between two token distributions"""
    all_tokens = set(dist1.keys()) | set(dist2.keys())
    
    vec1 = np.array([dist1.get(token, 0) for token in all_tokens])
    vec2 = np.array([dist2.get(token, 0) for token in all_tokens])
    
    # Calculate cosine similarity
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    
    if norm1 == 0 or norm2 == 0:
        return 0.0
    
    return dot_product / (norm1 * norm2)

# Tokenize all programs and compute distributions
print("Tokenizing all programs...")
tokenized_programs = []
token_distributions = []

for i, prog in enumerate(programs_31aa):
    tokens = tokenize_python_code(prog['code'])
    distribution = get_token_distribution(tokens)
    
    tokenized_programs.append({
        'index': i,
        'model': prog['model'],
        'tokens': tokens,
        'distribution': distribution,
        'token_count': len(tokens),
        'unique_tokens': len(set(tokens))
    })
    
print(f"Completed tokenization of {len(tokenized_programs)} programs")
print(f"Token counts range: {min(p['token_count'] for p in tokenized_programs)} to {max(p['token_count'] for p in tokenized_programs)}")
print(f"Unique token counts range: {min(p['unique_tokens'] for p in tokenized_programs)} to {max(p['unique_tokens'] for p in tokenized_programs)}")

Tokenizing all programs...
Completed tokenization of 20 programs
Token counts range: 123 to 373
Unique token counts range: 42 to 63


In [7]:
# Calculate pairwise similarities
print("Calculating pairwise similarities...")
similarities = []

for i, j in combinations(range(len(tokenized_programs)), 2):
    prog1 = tokenized_programs[i]
    prog2 = tokenized_programs[j]
    
    # Calculate both JS divergence and cosine similarity
    js_div = jensen_shannon_divergence(prog1['distribution'], prog2['distribution'])
    cos_sim = cosine_similarity(prog1['distribution'], prog2['distribution'])
    
    # Convert JS divergence to similarity (1 - divergence, since lower divergence = higher similarity)
    js_sim = 1 - js_div
    
    similarities.append({
        'pair': (i, j),
        'model1': prog1['model'],
        'model2': prog2['model'],
        'js_similarity': js_sim,
        'cosine_similarity': cos_sim,
        'js_divergence': js_div
    })

# Sort by JS similarity (descending)
similarities.sort(key=lambda x: x['js_similarity'], reverse=True)

print(f"Calculated {len(similarities)} pairwise similarities")
print("\nTop 10 most similar pairs (by JS similarity):")
for i, sim in enumerate(similarities[:10]):
    print(f"{i+1}. Models {sim['pair'][0]} vs {sim['pair'][1]} ({sim['model1']} vs {sim['model2']})")
    print(f"   JS Similarity: {sim['js_similarity']:.4f}, Cosine Similarity: {sim['cosine_similarity']:.4f}")

Calculating pairwise similarities...
Calculated 190 pairwise similarities

Top 10 most similar pairs (by JS similarity):
1. Models 3 vs 19 (Mistral-Large-Instruct-2407 vs Mistral-Large-Instruct-2407)
   JS Similarity: 0.9326, Cosine Similarity: 0.9187
2. Models 4 vs 19 (Mistral-Large-Instruct-2407 vs Mistral-Large-Instruct-2407)
   JS Similarity: 0.9193, Cosine Similarity: 0.9034
3. Models 8 vs 13 (Mistral-Large-Instruct-2407 vs Qwen2.5-Coder-32B-Instruct)
   JS Similarity: 0.9162, Cosine Similarity: 0.8755
4. Models 6 vs 7 (Mistral-Large-Instruct-2407 vs Mistral-Large-Instruct-2407)
   JS Similarity: 0.9120, Cosine Similarity: 0.9125
5. Models 0 vs 6 (Mistral-Large-Instruct-2407 vs Mistral-Large-Instruct-2407)
   JS Similarity: 0.9099, Cosine Similarity: 0.9082
6. Models 7 vs 19 (Mistral-Large-Instruct-2407 vs Mistral-Large-Instruct-2407)
   JS Similarity: 0.9098, Cosine Similarity: 0.9139
7. Models 0 vs 10 (Mistral-Large-Instruct-2407 vs Mistral-Large-Instruct-2407)
   JS Similarity:

In [10]:
def display_programs_side_by_side(prog1_idx, prog2_idx, similarity_info):
    """Display two programs side by side for comparison"""
    prog1 = programs_31aa[prog1_idx]
    prog2 = programs_31aa[prog2_idx]
    
    print("=" * 100)
    print(f"COMPARISON: Program {prog1_idx} ({prog1['model']}) vs Program {prog2_idx} ({prog2['model']})")
    print(f"JS Similarity: {similarity_info['js_similarity']:.4f}, Cosine Similarity: {similarity_info['cosine_similarity']:.4f}")
    print("=" * 100)
    
    # Split code into lines for side-by-side display
    lines1 = prog1['code'].split('\n')
    lines2 = prog2['code'].split('\n')
    max_lines = max(len(lines1), len(lines2))
    
    # Pad shorter program with empty lines
    lines1.extend([''] * (max_lines - len(lines1)))
    lines2.extend([''] * (max_lines - len(lines2)))
    
    # Display side by side
    print(f"{'PROGRAM 1':<50} | {'PROGRAM 2'}")
    print("-" * 50 + " | " + "-" * 50)
    
    for i, (line1, line2) in enumerate(zip(lines1, lines2)):
        # Truncate long lines to fit display
        line1_display = line1[:47] + "..." if len(line1) > 50 else line1
        line2_display = line2[:47] + "..." if len(line2) > 50 else line2
        print(f"{line1_display:<50} | {line2_display}")
    
    print("\n")

# Display top 5 most similar pairs side by side
print("TOP 5 MOST SIMILAR PROGRAM PAIRS:")
print("=" * 100)

for i, sim in enumerate(similarities[-20:]):
    prog1_idx, prog2_idx = sim['pair']
    display_programs_side_by_side(prog1_idx, prog2_idx, sim)
    
    if i < 4:  # Add separator between comparisons except for the last one
        print("\n" + "~" * 100 + "\n")

TOP 5 MOST SIMILAR PROGRAM PAIRS:
COMPARISON: Program 2 (Qwen2.5-72B-Instruct) vs Program 11 (Mistral-Large-Instruct-2407)
JS Similarity: 0.7543, Cosine Similarity: 0.7205
PROGRAM 1                                          | PROGRAM 2
-------------------------------------------------- | --------------------------------------------------
def transform(grid: list[list[int]]) -> list[li... | def transform(grid_lst: list[list[int]]) -> lis...
    import numpy as np                             |     grid = [row[:] for row in grid_lst]
    grid = np.array(grid)                          |     output = [[0] * 10 for _ in range(10)]
    output = np.zeros_like(grid)                   |     colors = [2, 3, 4, 5, 6, 7, 8, 9]
    unique_numbers = np.unique(grid)               |     color_positions = {color: [] for color in c...
    for number in unique_numbers:                  |     for i, row in enumerate(grid):
        if number == 0:                            |         for j, cell in enumerate