In [1]:
# Simple VLM Client Test - Jupyter Notebook
# Test if your models work with OpenRouter

import os
from dotenv import load_dotenv
import yaml
import requests
import json

# Load environment
load_dotenv()

def load_config(config_path='../config/config.yaml'):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path to the YAML file. The default is relative to the
            notebook's working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

def test_openrouter_model(model_name, api_key, test_prompt="Say 'hello'"):
    """
    Test if a model works on OpenRouter
    Returns: (success: bool, response: str, error: str)
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    
    data = {
        "model": model_name,
        "messages": [
            {"role": "user", "content": test_prompt}
        ],
        "max_tokens": 100
    }
    
    try:
        response = requests.post(url, headers=headers, json=data, timeout=30)
        
        if response.status_code == 200:
            result = response.json()
            content = result['choices'][0]['message']['content']
            return True, content, None
        else:
            error_msg = f"HTTP {response.status_code}: {response.text}"
            return False, None, error_msg
            
    except Exception as e:
        return False, None, str(e)

def list_available_models(api_key):
    """List the xAI/Grok model ids available on OpenRouter.

    Args:
        api_key: OpenRouter API key, sent as a Bearer token.

    Returns:
        A list of model id strings that contain "x-ai" or (case-insensitively)
        "grok". An empty list is returned when the request fails, the status
        is not 200, or the body does not have the expected shape.
    """
    url = "https://openrouter.ai/api/v1/models"

    headers = {
        "Authorization": f"Bearer {api_key}",
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            return []

        models = response.json()['data']
        return [m['id'] for m in models if 'x-ai' in m['id'] or 'grok' in m['id'].lower()]
    except Exception:
        # Best-effort lookup: the caller treats an empty list as "could not
        # fetch". Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate so the cell can still be interrupted.
        return []

# ============================================================================
# RUN TESTS
# ============================================================================

print("="*80)
print("VLM CLIENT TEST")
print("="*80)

# 1. Load config
try:
    config = load_config()
except Exception as e:
    print(f"✗ Config loading failed: {e}")
    # Re-raise so this cell (and "Run All") stops here with a traceback.
    # exit() is avoided on purpose: inside a Jupyter kernel it requests a
    # kernel shutdown instead of just ending the cell.
    raise
print("✓ Config loaded")

# 2. Get API key
api_key = os.getenv('OPENROUTER_API_KEY')
if not api_key:
    print("✗ OPENROUTER_API_KEY not found in environment")
    raise RuntimeError("OPENROUTER_API_KEY not found in environment")
# Do not echo any part of the key: cell output is stored in the notebook file.
print("✓ API key found")

# 3. Extract models from config
phase1_model = config['vlm_config']['phase1']['model']
phase2_model = config['vlm_config']['phase2']['model']

print(f"\nPhase 1 Model: {phase1_model}")
print(f"Phase 2 Model: {phase2_model}")

# 4. List available xAI/Grok models
print(f"\n{'='*80}")
print("AVAILABLE X-AI/GROK MODELS ON OPENROUTER")
print("="*80)
available = list_available_models(api_key)
if available:
    for model in sorted(available):
        print(f"  • {model}")
else:
    print("  (Could not fetch model list)")

# 5-6. Test the Phase 1 and Phase 2 models with the same procedure.
# After the loop, success/response/error hold the Phase 2 result.
failed_phases = []
for phase_number, phase_model in ((1, phase1_model), (2, phase2_model)):
    print(f"\n{'='*80}")
    print(f"TESTING PHASE {phase_number} MODEL")
    print("="*80)
    success, response, error = test_openrouter_model(
        phase_model,
        api_key,
        f"Say exactly: 'Phase {phase_number} works'"
    )

    if success:
        print(f"✓ Phase {phase_number} model works!")
        # `response or ''` guards against a successful call that carried no text.
        print(f"  Response ({len(response or '')} chars): {response}")
    else:
        print(f"✗ Phase {phase_number} model FAILED")
        print(f"  Error: {error}")
        failed_phases.append((phase_number, phase_model))

# 7. Suggest alternatives for every phase that failed.
# A failure is not necessarily a missing model (it can also be an auth, quota
# or network problem), so the message points at the reported error instead of
# asserting a cause.
if failed_phases:
    print(f"\n{'='*80}")
    print("SUGGESTED FIX")
    print("="*80)
    for phase_number, phase_model in failed_phases:
        print(f"Phase {phase_number} model '{phase_model}' failed the test (see the error above).")
    print(f"\nIf the error says the model is unknown, try one of these instead:")
    if available:
        # Prefer the ids that were just fetched over a hard-coded list.
        for model in sorted(available):
            print(f"  • {model}")
    else:
        print(f"  • x-ai/grok-4.1-fast  (recommended)")
        print(f"  • x-ai/grok-4-fast")

print(f"\n{'='*80}")
print("TEST COMPLETE")
print("="*80)


VLM CLIENT TEST
✓ Config loaded
✓ API key found (sk-or-v1-c...)

Phase 1 Model: x-ai/grok-4.1-fast
Phase 2 Model: x-ai/grok-code-fast-1

AVAILABLE X-AI/GROK MODELS ON OPENROUTER
  • x-ai/grok-3
  • x-ai/grok-3-beta
  • x-ai/grok-3-mini
  • x-ai/grok-3-mini-beta
  • x-ai/grok-4
  • x-ai/grok-4-fast
  • x-ai/grok-4.1-fast
  • x-ai/grok-code-fast-1

TESTING PHASE 1 MODEL


✓ Phase 1 model works!
  Response (13 chars): Phase 1 works

TESTING PHASE 2 MODEL
✓ Phase 2 model works!
  Response (13 chars): Phase 2 works

TEST COMPLETE


In [2]:
# Test WITH reasoning parameter (like your actual code)
def test_with_reasoning(model, api_key):
    url = "https://openrouter.ai/api/v1/chat/completions"
    
    payload_with_reasoning = {
        "model": model,
        "messages": [{"role": "user", "content": "Say 'test'"}],
        "max_tokens": 100,
        "reasoning": {"enabled": True}  # ← This is the problem
    }
    
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    
    response = requests.post(url, headers=headers, json=payload_with_reasoning)
    print(f"\nWith reasoning parameter:")
    print(f"  Status: {response.status_code}")
    print(f"  Response: {response.text[:200]}")
    return response.status_code == 200

# Test both models
# Banner for this cell's output.
rule = "=" * 80
for banner_line in (rule, "TESTING WITH REASONING PARAMETER", rule):
    print(banner_line)

# Probe each model once; both requests finish before the summary is printed.
phase1_works = test_with_reasoning("x-ai/grok-4.1-fast", api_key)
phase2_works = test_with_reasoning("x-ai/grok-code-fast-1", api_key)

# One-line verdict per model.
phase1_verdict = '✓ Works' if phase1_works else '✗ Fails'
phase2_verdict = '✓ Works' if phase2_works else '✗ Fails'
print(f"\nPhase 1 (grok-4.1-fast): {phase1_verdict}")
print(f"Phase 2 (grok-code-fast-1): {phase2_verdict}")

TESTING WITH REASONING PARAMETER

With reasoning parameter:
  Status: 200
  Response: 
         

         

         
{"id":"gen-1767176942-VZ0GCmWBeFjOed1P6meQ","provider":"xAI","model":"x-ai/grok-4.1-fast","object":"chat.completion","created":1767176942,"choices":[{"logprobs":null,"

With reasoning parameter:
  Status: 200
  Response: 
         

         

         

         

         

         

         

         
{"id":"gen-1767176944-0LNJdfYpbIVecGsliSZ4","provider":"xAI","model":"x-ai/grok-code-fast-1","object":"chat.comp

Phase 1 (grok-4.1-fast): ✓ Works
Phase 2 (grok-code-fast-1): ✓ Works


In [9]:
# Test Phase 2C Response Generation
# Load a real Phase 2B file and generate Phase 2C response

import os
import json
import sys
from pathlib import Path

# Make the project root importable so that `from src.<module> import ...` resolves.
# The root is derived from the working directory instead of being hard-coded
# to one user's home directory; like the "../logs" and "../data_v2" paths used
# in this cell, it assumes the kernel runs one level below the project root.
PROJECT_ROOT = str(Path.cwd().resolve().parent)
if PROJECT_ROOT not in sys.path:  # keeps re-runs of this cell from stacking duplicates
    sys.path.insert(0, PROJECT_ROOT)

from src.vlm_prompter import VLMPrompter
from src.vlm_client import VLMConfig, create_client
from dotenv import load_dotenv

load_dotenv()

# ============================================================================
# CONFIGURATION
# ============================================================================
# Which run, task and sample to replay.
log_dir = "../logs/grok4.1fast_grokcodefast1_reasoning_dsl_k4_80703"
task_id = "1ae2feb7"
sample_idx = 0

# Input files derived from the settings above.
phase2b_file = f"{log_dir}/{task_id}_sample{sample_idx}_phase2b_validation.txt"
task_file = f"../data_v2/evaluation/{task_id}.json"

# ============================================================================
# LOAD FILES
# ============================================================================
section_rule = "=" * 80
print(section_rule)
print("LOADING FILES")
print(section_rule)

# Phase 2B validation log, read in full as one string.
with open(phase2b_file, 'r') as phase2b_handle:
    phase2b_content = phase2b_handle.read()

phase2b_size = len(phase2b_content)
print(f"✓ Loaded Phase 2B file ({phase2b_size} chars)")
# Only the first 500 characters are shown as a preview.
phase2b_preview = phase2b_content[:500]
print(f"Preview:\n{phase2b_preview}...\n")

# Task definition (JSON with 'train' and 'test' entries).
with open(task_file, 'r') as task_handle:
    task = json.loads(task_handle.read())

print(f"✓ Loaded task {task_id}")
print(f"  Train examples: {len(task['train'])}")
print(f"  Test examples: {len(task['test'])}\n")

# ============================================================================
# EXTRACT VALIDATED PATTERN
# ============================================================================
import re

def extract_validated_pattern_from_response(response: str) -> str:
    """Return the text inside the first <validated_pattern> tag pair.

    When the response has no complete tag pair, the whole response is used
    instead. In both cases surrounding whitespace is stripped.
    """
    tagged = re.search(
        r'<validated_pattern>(.*?)</validated_pattern>',
        response,
        re.DOTALL,
    )
    # Fall back to the full response when the tags are absent.
    selected = response if tagged is None else tagged.group(1)
    return selected.strip()

# Pull the validated pattern out of the Phase 2B text loaded above.
validated_pattern = extract_validated_pattern_from_response(phase2b_content)

pattern_rule = "=" * 80
print(pattern_rule)
print("EXTRACTED VALIDATED PATTERN")
print(pattern_rule)
# Show at most the first 1000 characters.
print(validated_pattern[:1000])
print(pattern_rule + "\n")

# ============================================================================
# BUILD PHASE 2C PROMPT
# ============================================================================
prompter = VLMPrompter()

# DSL and few-shot examples are switched on; no similar programs are supplied.
phase2c_prompt = prompter.build_phase2c_prompt(
    task=task,
    validated_pattern=validated_pattern,
    similar_programs=None,
    few_shot=True,
    dsl_enabled=True,
)

print(pattern_rule)
print("PHASE 2C PROMPT BUILT")
print(pattern_rule)
print(f"Content blocks: {len(phase2c_prompt)}")

# Sum the text length over the blocks whose type is 'text'.
total_text_length = 0
for block in phase2c_prompt:
    if block['type'] == 'text':
        total_text_length += len(block['text'])
print(f"Total text length: {total_text_length}")

# Preview the first and the last block: text blocks are cut to 300 characters,
# any other block is printed as-is.
print("\nFirst block (first 300 chars):")
first_block = phase2c_prompt[0]
if first_block['type'] == 'text':
    print(first_block['text'][:300])
else:
    print(first_block)

print("\nLast block (first 300 chars):")
last_block = phase2c_prompt[-1]
if last_block['type'] == 'text':
    print(last_block['text'][:300])
else:
    print(last_block)

print(pattern_rule + "\n")

# ============================================================================
# CREATE VLM CLIENT FOR PHASE 2
# ============================================================================
# The key is read from the environment (load_dotenv() has already run above).
api_key = os.environ.get('OPENROUTER_API_KEY')

# Settings for the Phase 2 client, collected in one place before construction.
phase2_settings = dict(
    api_key=api_key,
    model="x-ai/grok-4.1-fast",
    api_base="https://openrouter.ai/api/v1",
    max_tokens=32000,
    max_retries=3,
    extra_params={'reasoning': {'enabled': True}},
    suppress_errors=False,  # keep errors visible so failures can be diagnosed
)
vlm_config_phase2 = VLMConfig(**phase2_settings)

vlm_client_phase2 = create_client("grok", config=vlm_config_phase2)

# Echo the effective request settings, read back from the config object.
request_rule = "=" * 80
print(request_rule)
print("SENDING REQUEST TO API")
print(request_rule)
for label, attribute in (
    ("Model", "model"),
    ("API Base", "api_base"),
    ("Max Tokens", "max_tokens"),
    ("Reasoning", "extra_params"),
):
    print(f"{label}: {getattr(vlm_config_phase2, attribute)}")
print(request_rule + "\n")

# ============================================================================
# SEND REQUEST
# ============================================================================
import ast
import time
import traceback

system_prompt = """You are an expert at generating code using the given DSL primitives to solve ARC puzzles. You are provided with a natural language description of the pattern to implement, as well as training and test examples and some similar programs you might find useful as reference. Generate a Python function `def solve(I):` that implements the described transformation using ONLY the provided DSL primitives. Ensure your code is syntactically correct and follows best practices."""


def extract_code_from_response(response: str):
    """Extract Python code from an LLM response.

    Order of preference:
      1. the first ```python fenced block that contains 'def solve';
      2. otherwise the first ```python fenced block;
      3. otherwise the text from 'def solve(I):' up to the next
         blank-line-separated 'def' / 'if __name__' or the end of the
         response. This also matches a response that was cut off before its
         closing fence.

    Returns:
        The stripped code, or None when nothing matched.
    """
    python_blocks = re.findall(r'```python\n(.*?)```', response, re.DOTALL)

    if python_blocks:
        for block in python_blocks:
            if 'def solve' in block:
                return block.strip()
        return python_blocks[0].strip()

    match = re.search(r'(def solve\(I\):.*?)(?=\n\ndef|\n\nif __name__|$)', response, re.DOTALL)
    if match:
        return match.group(1).strip()

    return None


def code_syntax_error(code: str):
    """Describe the first syntax error in ``code``, or return None if it parses.

    The code is only parsed, never executed. This catches output that was
    truncated mid-statement, which the extractor above would otherwise return
    without complaint.
    """
    try:
        ast.parse(code)
    except (SyntaxError, ValueError) as exc:
        line_number = getattr(exc, 'lineno', None)
        reason = getattr(exc, 'msg', None) or str(exc)
        return f"line {line_number}: {reason}"
    return None


try:
    start_time = time.time()
    response = vlm_client_phase2.query(phase2c_prompt, system_prompt)
    elapsed = time.time() - start_time

    print("="*80)
    print("API RESPONSE RECEIVED")
    print("="*80)
    print(f"Time: {elapsed:.2f}s")
    print(f"Response length: {len(response)} chars")
    print(f"\nFirst 1000 chars:")
    print(response[:1000])
    print("\n" + "="*80)

    # ============================================================================
    # EXTRACT CODE
    # ============================================================================
    code = extract_code_from_response(response)

    print("\n" + "="*80)
    print("CODE EXTRACTION")
    print("="*80)
    if code:
        # Extraction alone does not prove the code is usable: a response that
        # was cut off still yields a (broken) snippet, so parse it first.
        syntax_problem = code_syntax_error(code)
        if syntax_problem is None:
            print(f"✓ Code extracted successfully ({len(code)} chars)")
        else:
            print(f"✗ Code extracted ({len(code)} chars) but it is not valid Python ({syntax_problem})")
            print("  The response may have been cut off before the code was complete.")
        print(f"\nExtracted code:\n")
        print(code)
    else:
        print("✗ No code found in response")
        print("\nSearching for 'def solve':")
        if 'def solve' in response:
            print("  Found 'def solve' in response but extraction failed")
            idx = response.index('def solve')
            print(f"  Context around match:\n{response[max(0,idx-100):idx+300]}")
        else:
            print("  'def solve' not found in response at all")

    print("="*80)

except Exception as e:
    # Report the failure in the cell output instead of aborting the notebook.
    print("="*80)
    print("ERROR OCCURRED")
    print("="*80)
    print(f"Error type: {type(e).__name__}")
    print(f"Error message: {e}")

    print("\nFull traceback:")
    traceback.print_exc()
    print("="*80)

LOADING FILES
✓ Loaded Phase 2B file (5514 chars)
Preview:
Task ID: 1ae2feb7 (Sample 0/3)
PHASE 2B: HYPOTHESIS VALIDATION

INITIAL HYPOTHESIS:
--------------------------------------------------------------------------------
In plain English, the transformation rule is:
- Locate divider: unique full-height vertical bar of color 2 at col D.
- Copy input left (0..D-1) & divider (D) unchanged to ou...

✓ Loaded task 1ae2feb7
  Train examples: 3
  Test examples: 3

EXTRACTED VALIDATED PATTERN
**Final Pattern Description:**
- Locate divider: unique (nearly) full-height vertical bar of *uniform non-zero color C* (not fixed to 2; e.g., C=2 in training, C=3 in test 1&2, C=4 in test 3) at col D. "Full-height" means identical C in every row of the grid (test 1/2/train) or nearly so, spanning all patterned rows even if bottom padding row(s) are 0 (test 3).
- Copy input cols 0..D unchanged to output (including any 0s in divider col).
- Initialize output right cols (D+1..end) to all 0 (ignores any i

API RESPONSE RECEIVED
Time: 139.50s
Response length: 569 chars

First 1000 chars:
```python
def solve(I):
    # Find rightmost column D with any non-background cell (divider column)
    positions = asindices(I)
    W = shape(I)[1]
    col_has = [False] * W
    for pos in positions:
        col_has[pos[1]] = True
    D = max((j for j in range(W) if col_has[j]), default=-1)  # safe max

    # Grid dimensions
    H = shape(I)[0]
    Wr = W - D - 1

    # Build output row by row
    O_rows = []
    for r in range(H):
        # Extract input row
        row_in = tuple(index(I, (r, j)) for j in range(W))
        left = row_in[:D + 1]
        pattern


CODE EXTRACTION
✓ Code extracted successfully (559 chars)

Extracted code:

def solve(I):
    # Find rightmost column D with any non-background cell (divider column)
    positions = asindices(I)
    W = shape(I)[1]
    col_has = [False] * W
    for pos in positions:
        col_has[pos[1]] = True
    D = max((j for j in range(W) if col_has[j]),

In [3]:
pwd

'/home/te0245/llms_ftw/src'

In [2]:
# Display the interpreter's current module search path.
sys.path

['/home/te0245/miniconda3/lib/python313.zip',
 '/home/te0245/miniconda3/lib/python3.13',
 '/home/te0245/miniconda3/lib/python3.13/lib-dynload',
 '',
 '/home/te0245/miniconda3/lib/python3.13/site-packages']