# GSM8K GA Orchestrator
End-to-end experiment runner and live monitor (population=50, 30 generations).

In [1]:
# Bootstrap config and environment
import sys, os, pathlib
# Find the project root: the first of cwd, its parent, or its grandparent that
# contains a src/ directory. When found, it is placed at the front of sys.path
# so that the `src.*` imports below resolve.
_cwd = pathlib.Path.cwd()
candidates = [_cwd, _cwd.parent, _cwd.parent.parent]
PROJECT_ROOT = next((d for d in candidates if (d / 'src').exists()), None)
if PROJECT_ROOT is not None:
    sys.path.insert(0, str(PROJECT_ROOT))
from src.utils.config import load_config
cfg = load_config()
# Echo the effective settings so the run is self-describing.
print(f'Using provider={cfg.model_provider}, model={cfg.model_name}, temp={cfg.temperature}, max_tokens={cfg.max_tokens}')
print('Project root:', PROJECT_ROOT)
print('Paths:', cfg.paths)
_population_cfg = cfg.raw['population']
print('Population size (config):', _population_cfg['population_size'], 'Max generations:', _population_cfg['max_generations'])
print('Concurrency limit:', cfg.raw['evaluation']['concurrency_limit'])

Loaded config for provider=openai, model=gpt-4o, temp=0.0, max_tokens=200
Using provider=openai, model=gpt-4o, temp=0.0, max_tokens=200
Project root: /Users/Odyssey/Projects/genetic-prompt
Paths: {'data_root': 'data', 'gsm8k_cache': 'data/gsm8k_raw', 'checkpoints': 'data/checkpoints', 'results': 'data/results', 'embeddings': 'data/embeddings', 'logs': 'data/results/logs'}
Population size (config): 50 Max generations: 30
Concurrency limit: 5


## Data: Ensure GSM8K subsets exist

In [2]:
import os, sys, pathlib, subprocess as sp
def abspath(rel):
    """Return `rel` as a resolved, absolute path string.

    The path is anchored at PROJECT_ROOT when that is set (truthy); otherwise it
    is anchored at the current working directory.
    """
    anchor = PROJECT_ROOT or pathlib.Path.cwd()
    return str(anchor.joinpath(rel).resolve())
# Check that the three GSM8K subset files exist; if any is missing, run the project
# scripts that download the raw dataset and (re)create the subsets.
required_rel = ['data/gsm8k_primary_eval.jsonl','data/gsm8k_validation.jsonl','data/gsm8k_final_test.jsonl']
required_abs = [abspath(p) for p in required_rel]

def _report_subsets():
    """Print, for each required subset file, whether it currently exists."""
    for rel, abs_p in zip(required_rel, required_abs):
        print(f'Found {rel}:', os.path.exists(abs_p))

_report_subsets()
missing = [p for p in required_abs if not os.path.exists(p)]
if missing:
    print('Missing subsets; downloading dataset and creating subsets...')
    # PROJECT_ROOT can be None (no src/ directory found near the notebook). Build every
    # path through abspath() and use the same cwd fallback, so this branch does not
    # fail with `None / 'scripts'`.
    base_dir = PROJECT_ROOT if PROJECT_ROOT else pathlib.Path.cwd()
    raw_dir = abspath('data/gsm8k_raw')
    sp.run(
        [sys.executable, abspath('scripts/download_data.py'), '--out', raw_dir],
        check=True, cwd=str(base_dir),
    )
    sp.run(
        [sys.executable, abspath('scripts/create_subsets.py'),
         '--data', raw_dir, '--out', abspath('data'),
         '--primary','100','--validation','100','--final','200',
         '--seed_primary','42','--seed_validation','43','--seed_final','44'],
        check=True, cwd=str(base_dir),
    )
    # Re-check after generation so the cell output shows the final state.
    _report_subsets()

Found data/gsm8k_primary_eval.jsonl: True
Found data/gsm8k_validation.jsonl: True
Found data/gsm8k_final_test.jsonl: True


## Seeds: curated 50 prompts preview

In [3]:
from src.genetics.seeds import SEED_PROMPTS
# Report how many curated seed prompts there are, then preview the first five,
# each cut to at most 120 characters.
print('Curated seeds:', len(SEED_PROMPTS))
for rank, seed in enumerate(SEED_PROMPTS[:5], start=1):
    print(f'{rank}.', seed[:120])

Curated seeds: 50
1. Let's solve this step by step, writing each calculation clearly before the final answer.
2. Think carefully: restate the problem, identify knowns and unknowns, and plan the steps.
3. Break the problem into parts: what is asked, what is given, and which operations are needed.
4. Use units consistently and show intermediate results with units.
5. List relevant quantities, then compute them in order, verifying each intermediate value.


## Run full 30-generation experiment (population=50) with live progress

In [4]:
import threading, time, json, os
import matplotlib.pyplot as plt
from IPython.display import clear_output, display, HTML
from datetime import datetime, timedelta
from src.genetics.controller import evolve

# Resolve the logs directory to an absolute path and make sure it exists.
logs_dir = cfg.paths.get('logs','data/results/logs')
if not os.path.isabs(logs_dir):
    # PROJECT_ROOT is None when no src/ directory was found near the notebook; fall
    # back to the working directory instead of failing on `None / logs_dir`.
    _logs_base = PROJECT_ROOT if PROJECT_ROOT else pathlib.Path.cwd()
    logs_dir = str((_logs_base / logs_dir).resolve())
os.makedirs(logs_dir, exist_ok=True)
metrics_path = os.path.join(logs_dir, 'metrics.jsonl')
print(f'Metrics will be logged to: {metrics_path}')
# NOTE(review): an existing metrics.jsonl from a previous run is not cleared here. If
# evolve() appends rather than truncates, the monitor will also show old generations;
# confirm against the controller.

# Global variables for monitoring
start_time = time.time()  # wall-clock start, used for elapsed/remaining estimates
max_generations = cfg.raw['population']['max_generations']
evolution_error = None  # set to the error message by _run() if evolve() raises

# Enhanced evolution runner with error handling
def _run():
    global evolution_error
    try:
        print('Evolution thread started...')
        result = evolve()
        print('Evolution completed successfully!')
        return result
    except Exception as e:
        evolution_error = str(e)
        print(f'Evolution failed with error: {e}')
        import traceback
        traceback.print_exc()

# Start evolution in a background thread. Interrupting this cell stops only the monitor,
# not the worker, so if a thread from an earlier run of the cell is still alive we
# re-attach to it instead of launching a second evolve() alongside it.
_previous_thread = globals().get('t')
if isinstance(_previous_thread, threading.Thread) and _previous_thread.is_alive():
    print('Evolution thread is already running; re-attaching the monitor to it.')
else:
    t = threading.Thread(target=_run)
    t.start()
# Enhanced metrics reader with comprehensive data extraction
def read_metrics(path):
    """Parse a JSONL metrics file into parallel per-record lists.

    Args:
        path: Path of the metrics file; each non-blank line is expected to hold one
            JSON object.

    Returns:
        A 10-tuple ``(gens, bests, avgs, divs, hits, calls, accuracies, timestamps,
        best_text, latest_gen)``. The first eight items are lists with one entry per
        parsed record (missing keys fall back to 0 / 0.0, or the current time for
        ``timestamp``). ``best_text`` is the last non-empty ``best_text`` seen, or None.
        ``latest_gen`` is the highest ``generation`` seen, or -1 when there are no
        records. A missing file yields empty lists, None and -1.

    Blank lines, lines that are not valid JSON and records that are not JSON objects
    are skipped individually, so one bad line (for example a line that is only
    partially written when the file is read during a run) does not hide the valid
    records that follow it. Other errors while reading are printed, and whatever was
    parsed up to that point is returned.
    """
    gens, bests, avgs, divs, hits, calls, accuracies, timestamps = [], [], [], [], [], [], [], []
    best_text = None
    latest_gen = -1
    if not os.path.exists(path):
        return gens, bests, avgs, divs, hits, calls, accuracies, timestamps, best_text, latest_gen
    try:
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    r = json.loads(line)
                except json.JSONDecodeError:
                    # Malformed or incomplete line: skip it, keep reading.
                    continue
                if not isinstance(r, dict):
                    continue
                gen = r.get('generation', 0)
                gens.append(gen)
                bests.append(r.get('best_fitness', 0.0))
                avgs.append(r.get('avg_fitness', 0.0))
                divs.append(r.get('diversity', 0.0))
                hits.append(r.get('cache_hit_rate', 0.0))
                calls.append(r.get('api_calls', 0))
                accuracies.append(r.get('best_accuracy', 0.0))
                timestamps.append(r.get('timestamp', time.time()))
                bt = r.get('best_text')
                if bt:
                    best_text = bt
                latest_gen = max(latest_gen, gen)
    except Exception as e:
        # Deliberately best-effort: the monitor should keep refreshing even if the
        # file cannot be read on this pass.
        print(f'Error reading metrics: {e}')
    return gens, bests, avgs, divs, hits, calls, accuracies, timestamps, best_text, latest_gen

# Progress calculation helpers
def format_time(seconds):
    """Render a duration in seconds as a short human-readable string.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        ``'<s>s'`` below one minute, ``'<m>m <s>s'`` below one hour, otherwise
        ``'<h>h <m>m'``.

    The duration is rounded to whole seconds before it is split into units, so a
    unit never displays its rollover value (119.6 gives '2m 0s', not '1m 60s').
    Negative durations are shown as '0s'.
    """
    total = max(0, int(round(seconds)))
    if total < 60:
        return f'{total}s'
    minutes, secs = divmod(total, 60)
    if total < 3600:
        return f'{minutes}m {secs}s'
    hours, minutes = divmod(minutes, 60)
    return f'{hours}h {minutes}m'

def estimate_remaining_time(current_gen, total_gens, elapsed_time):
    """Estimate the time left from the average time per completed generation.

    Args:
        current_gen: Zero-based index of the latest logged generation; a negative
            value means no generation has been logged yet.
        total_gens: Total number of generations planned.
        elapsed_time: Seconds elapsed since the run started.

    Returns:
        'Calculating...' while no generation has been logged, otherwise the
        estimate formatted by format_time().
    """
    # Generation 0 is a real, completed generation (the average below divides by
    # current_gen + 1), so only the "nothing logged yet" sentinel is rejected.
    if current_gen < 0:
        return 'Calculating...'
    avg_time_per_gen = elapsed_time / (current_gen + 1)
    # Never report a negative remainder if more generations were logged than planned.
    remaining_gens = max(0, total_gens - current_gen - 1)
    return format_time(avg_time_per_gen * remaining_gens)

# Live monitor: while the evolution thread is alive, re-read the metrics file and redraw
# a text summary plus a 2x3 dashboard every `refresh_interval` seconds.
refresh_interval = 3  # seconds
last_update_time = time.time()

def _latest(series, fallback):
    """Return the last element of `series`, or `fallback` when it is empty."""
    return series[-1] if series else fallback

while t.is_alive():
    clear_output(wait=True)
    elapsed = time.time() - start_time

    # Stop monitoring once the worker has recorded a failure.
    if evolution_error:
        print(f'❌ Evolution failed: {evolution_error}')
        break

    (gens, bests, avgs, divs, hits, calls,
     accuracies, timestamps, best_text, latest_gen) = read_metrics(metrics_path)

    # latest_gen stays at -1 until the first record has been read.
    if latest_gen >= 0:
        progress_pct = (latest_gen + 1) / max_generations * 100
    else:
        progress_pct = 0
    remaining_time = estimate_remaining_time(latest_gen, max_generations, elapsed)

    # Header lines, shown on every refresh.
    print('🧬 GSM8K Genetic Algorithm - Live Evolution Monitor')
    print('=' * 60)
    print(f'📊 Progress: Generation {latest_gen + 1}/{max_generations} ({progress_pct:.1f}%)')
    print(f'⏱️  Elapsed: {format_time(elapsed)} | Remaining: {remaining_time}')

    if not gens:
        # Nothing logged yet: report what is known about the metrics file.
        print('⏳ Initializing evolution... Waiting for first generation metrics...')
        print(f'📁 Monitoring: {metrics_path}')
        if os.path.exists(metrics_path):
            file_size = os.path.getsize(metrics_path)
            print(f'📄 Metrics file exists ({file_size} bytes)')
        else:
            print('📄 Metrics file not yet created')
    else:
        # Latest values of each logged series.
        current_best = _latest(bests, 0.0)
        current_avg = _latest(avgs, 0.0)
        current_acc = _latest(accuracies, 0.0)
        current_div = _latest(divs, 0.0)
        current_hit_rate = _latest(hits, 0.0)
        total_api_calls = _latest(calls, 0)

        print(f'🎯 Best Fitness: {current_best:.4f} | Accuracy: {current_acc:.4f}')
        print(f'📈 Avg Fitness: {current_avg:.4f} | Diversity: {current_div:.4f}')
        print(f'🔄 API Calls: {total_api_calls} | Cache Hit Rate: {current_hit_rate:.2%}')

        # Change in best fitness between the last two records.
        if len(bests) >= 2:
            fitness_trend = bests[-1] - bests[-2]
            if fitness_trend > 0:
                trend_icon = '📈'
            elif fitness_trend < 0:
                trend_icon = '📉'
            else:
                trend_icon = '➡️'
            print(f'{trend_icon} Fitness Trend: {fitness_trend:+.4f}')

        fig, axs = plt.subplots(2, 3, figsize=(16, 10))
        fig.suptitle(f'Evolution Progress - Generation {latest_gen + 1}/{max_generations}', fontsize=16)

        # Panel (0,0): best and average fitness share one axes.
        fitness_ax = axs[0,0]
        fitness_ax.plot(gens, bests, 'b-', linewidth=2, label='Best')
        fitness_ax.plot(gens, avgs, 'r--', alpha=0.7, label='Average')
        fitness_ax.set_title('Fitness Evolution')
        fitness_ax.set_xlabel('Generation')
        fitness_ax.set_ylabel('Fitness')
        fitness_ax.legend()
        fitness_ax.grid(True, alpha=0.3)

        # The four single-series panels differ only in data, line format and labels.
        single_series_panels = [
            (axs[0,1], accuracies, 'g-', 'Accuracy Progression', 'Accuracy'),
            (axs[0,2], divs, 'm-', 'Population Diversity', 'Diversity'),
            (axs[1,0], hits, 'c-', 'Cache Hit Rate', 'Hit Rate'),
            (axs[1,1], calls, 'orange', 'Cumulative API Calls', 'API Calls'),
        ]
        for panel_ax, panel_series, panel_fmt, panel_title, panel_ylabel in single_series_panels:
            panel_ax.plot(gens, panel_series, panel_fmt, linewidth=2)
            panel_ax.set_title(panel_title)
            panel_ax.set_xlabel('Generation')
            panel_ax.set_ylabel(panel_ylabel)
            panel_ax.grid(True, alpha=0.3)

        # Panel (1,2): horizontal completion bar.
        progress_ax = axs[1,2]
        progress_ax.barh([0], [progress_pct], color='lightblue', alpha=0.7)
        progress_ax.set_xlim(0, 100)
        progress_ax.set_ylim(-0.5, 0.5)
        progress_ax.set_title(f'Progress: {progress_pct:.1f}%')
        progress_ax.set_xlabel('Completion %')
        progress_ax.set_yticks([])

        plt.tight_layout()
        display(fig)
        # Close the figure after displaying it so figures do not pile up across refreshes.
        plt.close(fig)

        # Preview of the best prompt, cut to 300 characters.
        if best_text:
            preview = best_text[:300]
            if len(best_text) > 300:
                preview = preview + '...'
            print('\n🏆 Current Best Prompt (preview):')
            print('-' * 50)
            print(preview)
            print('-' * 50)

    print(f'\n🔄 Last updated: {datetime.now().strftime("%H:%M:%S")} | Next refresh in {refresh_interval}s')
    time.sleep(refresh_interval)

# Final draw: wait for the worker to finish (it may still be printing its traceback when
# the monitor loop exits on an error), then render the final summary.
t.join()
clear_output(wait=True)
gens, bests, avgs, divs, hits, calls, accuracies, timestamps, best_text, latest_gen = read_metrics(metrics_path)
if evolution_error:
    # clear_output() above replaces what the cell has shown so far, including the error
    # printed by the worker, so restate the failure instead of reporting a normal finish.
    print(f'❌ Evolution failed: {evolution_error}')
    print('Generations logged before failure:', len(gens))
else:
    print('Evolution finished. Generations logged:', len(gens))
if gens:
    fig, axs = plt.subplots(2, 3, figsize=(14,8))
    final_panels = [
        (axs[0,0], bests, 'Best Fitness'),
        (axs[0,1], avgs, 'Average Fitness'),
        (axs[0,2], divs, 'Diversity'),
        (axs[1,0], hits, 'Cache Hit Rate'),
        (axs[1,1], calls, 'API Calls'),
    ]
    for final_ax, final_series, final_title in final_panels:
        final_ax.plot(gens, final_series)
        final_ax.set_title(final_title)
    axs[1,2].axis('off')  # sixth panel is unused
    plt.tight_layout()
    display(fig)
    plt.close(fig)
    if best_text:
        print('\nFinal best prompt (truncated):\n', best_text[:800])


Running evolution...
Waiting for metrics...


KeyboardInterrupt: 

## Save final results

In [5]:
# Copy best prompt text to results directory if present in metrics
import os, json, shutil
# Resolve configured paths against the project root (falling back to the working
# directory), as the run cell does for the logs directory. A bare relative path would be
# looked up under the kernel's cwd and miss the metrics file when the notebook is not
# run from the project root.
_results_base = str(PROJECT_ROOT) if PROJECT_ROOT else os.getcwd()

def _from_project_root(p):
    """Return `p` unchanged if absolute, otherwise joined onto the project root."""
    return p if os.path.isabs(p) else os.path.join(_results_base, p)

metrics_path = os.path.join(_from_project_root(cfg.paths.get('logs','data/results/logs')), 'metrics.jsonl')
best_text = None
if os.path.exists(metrics_path):
    with open(metrics_path, 'r') as f:
        for line in f:
            # Skip blank or malformed lines instead of crashing on json.loads.
            if not line.strip():
                continue
            try:
                r = json.loads(line)
            except json.JSONDecodeError:
                continue
            bt = r.get('best_text') if isinstance(r, dict) else None
            if bt:
                best_text = bt  # keep the most recent non-empty value
if best_text:
    results_dir = _from_project_root(cfg.paths.get('results','data/results'))
    os.makedirs(results_dir, exist_ok=True)  # the results directory may not exist yet
    outp = os.path.join(results_dir, 'best_prompt_final.txt')
    with open(outp, 'w', encoding='utf-8') as f:
        f.write(best_text)
    print('Saved final best prompt to', outp)
else:
    print('No best_text found in metrics.')


No best_text found in metrics.
