In [108]:
# Imports
import hypothesis as hp
from hypothesis import strategies as st
from itertools import combinations
from collections import defaultdict
from typing import List, Optional, Set, Union
import pandas as pd
from pprint import pprint
from hypothesis_pick import (
    find_disagreements,
    find_stronger_weaker,
    infer_implications,
)

import platform
import sys
from datetime import datetime, timezone
from pathlib import Path

import json
import os
from typing import Any, Optional, Tuple

from dotenv import load_dotenv
from google import genai

In [109]:
# Configure PICK: PBT settings
# Selects which spec file is read: True -> freeform_spec.md, False -> labeled_spec.xml.
FREEFORM_SPEC: bool = False
# Name of the problem; used as the directory name when locating the spec file
# and as the prefix of the run log file name.
PROBLEM: str = "roman_numerals"
# Model and sampling settings forwarded to call_gemini_json.
MODEL_NAME: str = "gemini-2.5-flash"
TEMPERATURE: float = 0.2
# None means no explicit output-token cap is set on the generation config.
MAX_OUTPUT_TOKENS: Optional[int] = None
# Forwarded to call_gemini_json as timeout_s.
TIMEOUT_SEC: int = 60


In [110]:
# Define path to log file
# UTC timestamp (second resolution) captured once when this cell runs; it is
# embedded in the log file name to tell runs apart.
RUN_TIMESTAMP: str = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%SZ")
LOGS_DIR: Path = Path("logs")
# Create the directory up front; exist_ok=True makes re-running this cell safe.
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# Path of this run's log file, named after the problem and the run timestamp.
RUN_LOG_PATH: Path = LOGS_DIR / f"{PROBLEM}_{RUN_TIMESTAMP}.json"

print(f"Run timestamp (UTC): {RUN_TIMESTAMP}")
print(f"Run log path: {RUN_LOG_PATH}")

Run timestamp (UTC): 20260115_204027Z
Run log path: logs/roman_numerals_20260115_204027Z.json


In [111]:
# Read in the problem specification file
def _read_problem_spec(*, problem: str, freeform_spec: bool) -> str:
    """Load the user-provided spec text for a given problem.

    Files:
      - if freeform_spec=True:  <problem>/freeform_spec.md
      - if freeform_spec=False: <problem>/labeled_spec.xml   (already XML)
    """
    filename = "freeform_spec.md" if freeform_spec else "labeled_spec.xml"
    candidates = [
        Path("problems") / problem / filename,
        Path(problem) / filename,
        Path(".") / problem / filename
    ]
    for p in candidates:
        if p.exists():
            return p.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Couldn't find {filename!r} for problem={problem!r}; tried: {candidates}")

# Build the property generation prompt
def build_property_gen_prompt(
    *,
    problem: str,
    freeform_spec: bool,
    num_properties: int = 15,
) -> str:
    """Assemble the Gemini prompt that asks for a list of properties as STRICT JSON.

    Args:
        problem: Problem name, used to locate the spec file.
        freeform_spec: Whether to load the freeform spec (True) or the labeled
            XML spec (False); also selects the matching "expectations" text.
        num_properties: Number of distinct properties the model is asked for.

    Returns:
        The full prompt as a single newline-joined string.
    """
    spec_text = _read_problem_spec(problem=problem, freeform_spec=freeform_spec).strip()

    # Describe to the model what the chosen spec flavour is expected to contain.
    if freeform_spec:
        expectations = (
            "Freeform spec expectations (freeform_spec=true):\n"
            "- The user text should include background/problem statement.\n"
            "- It should clearly name the target function(s) and what they do.\n"
            "- It should describe inputs/outputs, constraints, and edge cases.\n"
            "- It should define any custom data types/encodings and examples if relevant."
        )
    else:
        expectations = (
            "Labeled spec expectations (freeform_spec=false):\n"
            "- <background>: high-level context, definitions, and rules.\n"
            "- <custom_data_types>: any custom types, encodings, invariants, examples.\n"
            "- <function_signature>: names, parameters, return types (as text), and meaning.\n"
            "- <function_details>: precise behavior, preconditions, postconditions, errors."
        )

    context_section = [
        "You are helping me generate property-based tests (PBT) in Python.",
        "My test framework will be Hypothesis.",
        "",
        "Problem specification:",
        spec_text,
        "",
        expectations,
        "",
    ]
    task_section = [
        "Task:",
        f"- Produce {num_properties} distinct properties of the target function(s).",
        "- Each property MUST be testable via random data generation (Hypothesis).",
        "- Phrase each property precisely using quantifiers like 'for all' / 'there exists' and include any necessary preconditions.",
        "- Include edge cases, invariants, metamorphic properties, and error/exception behavior when relevant.",
        "- Do NOT include any code.",
        "",
    ]
    output_section = [
        "Output requirements (STRICT):",
        "- Output MUST be valid JSON and NOTHING ELSE.",
        "- Output MUST be exactly ONE JSON object matching one of the schemas below.",
        "",
    ]
    success_schema_section = [
        "Schema A (success):",
        "{",
        "  \"result\": [",
        "    <property description string>,",
        "    <property description string>,",
        "    ...",
        "  ],",
        "  \"explanation\": null",
        "}",
        "",
    ]
    failure_schema_section = [
        "Schema B (insufficient/unknown):",
        "{",
        "  \"result\": null,",
        "  \"explanation\": <string describing what is missing / ambiguous and what to add>",
        "}",
        "",
    ]
    constraints_section = [
        "Additional constraints:",
        "- Each property in result is a string describing one property statement.",
        "- Do not wrap in Markdown. Do not include code blocks.",
    ]

    prompt_lines = [
        *context_section,
        *task_section,
        *output_section,
        *success_schema_section,
        *failure_schema_section,
        *constraints_section,
    ]
    return "\n".join(prompt_lines)


# Build the property-generation prompt for the configured problem and echo it
# so the exact text sent to the model can be inspected.
PROMPT: str = build_property_gen_prompt(problem=PROBLEM, freeform_spec=FREEFORM_SPEC)
print(PROMPT)


You are helping me generate property-based tests (PBT) in Python.
My test framework will be Hypothesis.

Problem specification:
<background>
This problem concerns conversion between (1) positive integers and (2) their canonical Roman numeral string representations.

Roman numerals use seven symbols with fixed values:
	- I=1, V=5, X=10, L=50, C=100, D=500, M=1000.

Numeral formation rules (canonical form):
	1) Additive principle: if a symbol appears after a symbol of equal or greater value, its value is added.
		 Example: VI = 5 + 1 = 6.
	2) Subtractive principle: if a symbol appears before a symbol of greater value, its value is subtracted.
		 Subtraction is restricted to these pairs only:
			 - I before V or X (IV=4, IX=9)
			 - X before L or C (XL=40, XC=90)
			 - C before D or M (CD=400, CM=900)
	3) Repetition limits:
			 - V, L, D must not repeat.
			 - I, X, C, M may repeat at most 3 times consecutively.
	4) Order of magnitude: symbols are arranged from highest to lowest value lef

In [112]:
# Define JSON schema for properties response
# Note: Using Google Genai Schema format, not standard JSON Schema.
# Each property is a plain description string.
# Both fields are required and nullable, matching the two response shapes the
# prompt asks for: (result=list, explanation=null) or (result=null, explanation=str).
PROPERTIES_RESPONSE_SCHEMA: dict[str, Any] = {
    "type": "OBJECT",
    "properties": {
        "result": {
            "type": "ARRAY",
            "items": {
                "type": "STRING"
            },
            "description": "Array of property description strings, or null if spec is insufficient",
            "nullable": True
        },
        "explanation": {
            "type": "STRING",
            "description": "Explanation of what is missing/ambiguous if result is null, otherwise null",
            "nullable": True
        }
    },
    "required": ["result", "explanation"]
}

In [113]:
# Build prompt to define hypothesis signature + strategy
def build_signature_and_strategies_prompt(
    *,
    problem: str,
    freeform_spec: bool,
    max_strategies: int = 12,
) -> str:
    """Assemble the Gemini prompt that asks for a test-harness signature and strategies.

    Args:
        problem: Problem name; used to locate the spec file and quoted in the prompt.
        freeform_spec: Whether to load the freeform spec (True) or the labeled
            XML spec (False); also selects the matching "expectations" text.
        max_strategies: Upper bound on strategy entries the model is asked to return.

    Returns:
        The full prompt as a single newline-joined string.
    """
    spec_text = _read_problem_spec(problem=problem, freeform_spec=freeform_spec).strip()

    # Describe to the model what the chosen spec flavour is expected to contain.
    if freeform_spec:
        expectations = (
            "Freeform spec expectations (freeform_spec=true):\n"
            "- The user text should include background/problem statement.\n"
            "- It should clearly name the target function(s) and what they do.\n"
            "- It should describe inputs/outputs, constraints, edge cases.\n"
            "- It should define any custom data types/encodings and examples if relevant."
        )
    else:
        expectations = (
            "Labeled spec expectations (freeform_spec=false):\n"
            "- <background>: high-level context, definitions, and rules.\n"
            "- <custom_data_types>: any custom types, encodings, invariants, examples.\n"
            "- <function_signature>: names, parameters, return types (as text), and meaning.\n"
            "- <function_details>: precise behavior, preconditions, postconditions, errors."
        )

    context_section = [
        "You are helping me generate property-based tests (PBT) in Python.",
        "My test framework will be Hypothesis.",
        "",
        f"Problem name: {problem}",
        "",
        "Problem specification:",
        spec_text,
        "",
        expectations,
        "",
    ]
    task_section = [
        "Task:",
        "1) Identify the target function under test, its argument list, and its return type.",
        "2) Propose a standard *property-checking* function signature suitable for Hypothesis.",
        "   - It should accept the function under test (as a Callable) and the same inputs that function expects.",
        "   - It must return a boolean (True/False) indicating whether the property holds.",
        "3) Provide Hypothesis strategies to generate valid inputs for the property-checking function.",
        "   - Provide strategies for the *inputs* (not for the Callable itself).",
        "   - Respect any constraints/invariants from the spec.",
        "   - If some constraints are hard, propose assume()-style preconditions (as strings) or filtering.",
        "",
    ]
    output_section = [
        "Output requirements (STRICT):",
        "- Output MUST be valid JSON and NOTHING ELSE.",
        "- If the spec is insufficient or the inputs are impossible/unsupported to generate with Hypothesis, return:",
        "  {\"result\": null, \"explanation\": <string>}.",
        "- Otherwise return an object of this shape:",
        "  {",
        "    \"result\": {",
        "      \"function_under_test\": {\"name\": <string>, \"args\": [<string>...], \"returns\": <string>},",
        "      \"property_function\": {\"name\": <string>, \"typing\": <string>, \"args\": [<string>...], \"returns\": \"bool\"},",
        "      \"strategies\": [",
        "        {\"arg\": <string>, \"strategy\": <string>, \"notes\": <string|null>},",
        "        ...",
        "      ],",
        "      \"assumptions\": [<string>...],",
        "      \"imports\": [<string>...]",
        "    }",
        "  }",
        f"- Provide at most {max_strategies} strategy entries.",
        "- The values in \"strategy\" should be Python expressions like 'st.integers(...)' as STRINGS (not full code).",
    ]

    prompt_lines = [*context_section, *task_section, *output_section]
    return "\n".join(prompt_lines)


# Build the signature/strategies prompt for the configured problem and echo it
# so the exact text can be inspected.
SIGNATURE_STRATEGY_PROMPT: str = build_signature_and_strategies_prompt(
    problem=PROBLEM,
    freeform_spec=FREEFORM_SPEC,
    max_strategies=12,
)

print(SIGNATURE_STRATEGY_PROMPT)

You are helping me generate property-based tests (PBT) in Python.
My test framework will be Hypothesis.

Problem name: roman_numerals

Problem specification:
<background>
This problem concerns conversion between (1) positive integers and (2) their canonical Roman numeral string representations.

Roman numerals use seven symbols with fixed values:
	- I=1, V=5, X=10, L=50, C=100, D=500, M=1000.

Numeral formation rules (canonical form):
	1) Additive principle: if a symbol appears after a symbol of equal or greater value, its value is added.
		 Example: VI = 5 + 1 = 6.
	2) Subtractive principle: if a symbol appears before a symbol of greater value, its value is subtracted.
		 Subtraction is restricted to these pairs only:
			 - I before V or X (IV=4, IX=9)
			 - X before L or C (XL=40, XC=90)
			 - C before D or M (CD=400, CM=900)
	3) Repetition limits:
			 - V, L, D must not repeat.
			 - I, X, C, M may repeat at most 3 times consecutively.
	4) Order of magnitude: symbols are arranged fr

In [114]:
def _load_gemini_api_key() -> str:
    """Return the Gemini API key from the environment, loading ``.env`` first.

    The ``.env`` file is looked up relative to the current working directory
    and loaded with ``override=False``.

    Raises:
        RuntimeError: If ``GEMINI_API_KEY`` is unset or empty after loading.
    """
    load_dotenv(dotenv_path=Path(".env"), override=False)
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key
    raise RuntimeError(
        "GEMINI_API_KEY is missing. Add it to a .env file at the repo root as GEMINI_API_KEY=..."
    )

# Call Gemini with JSON response
def call_gemini_json(
    prompt: str,
    *,
    model_name: Optional[str] = None,
    temperature: float = 0.2,
    max_output_tokens: Optional[int] = None,
    timeout_s: int = 60,
    schema: Optional[dict[str, Any]] = None,
) -> Tuple[Any, str]:
    """Call Gemini with `prompt` and return (parsed_json, raw_text).

    This relies on Gemini JSON mode (response_mime_type='application/json').
    If JSON parsing fails, the raised error includes the raw response text
    (truncated) so you can see what the model actually returned.

    Args:
        prompt: Full prompt text sent as the request contents.
        model_name: Model to call; falls back to the GEMINI_MODEL environment
            variable and then to "gemini-2.5-flash".
        temperature: Sampling temperature.
        max_output_tokens: Optional output-token cap; left unset when None.
        timeout_s: Request timeout in seconds. A falsy or non-positive value
            leaves the SDK's default timeout in place.
        schema: Optional response schema (Google Genai Schema format).

    Returns:
        A tuple of the parsed JSON value and the stripped raw response text.

    Raises:
        RuntimeError: If the API key is missing (from _load_gemini_api_key).
        ValueError: If the response text is empty or is not valid JSON.
    """
    api_key = _load_gemini_api_key()

    client_kwargs: dict[str, Any] = {"api_key": api_key}
    if timeout_s and timeout_s > 0:
        # The SDK expresses HttpOptions.timeout in milliseconds.
        client_kwargs["http_options"] = genai.types.HttpOptions(
            timeout=int(timeout_s * 1000)
        )
    client = genai.Client(**client_kwargs)

    effective_model_name = model_name or os.getenv("GEMINI_MODEL", "gemini-2.5-flash")

    generation_config = genai.types.GenerateContentConfig(
        temperature=temperature,
        response_mime_type="application/json",
    )
    if max_output_tokens is not None:
        generation_config.max_output_tokens = max_output_tokens
    if schema is not None:
        generation_config.response_schema = schema

    resp = client.models.generate_content(
        model=effective_model_name,
        contents=prompt,
        config=generation_config,
    )
    raw = (resp.text or "").strip()

    if not raw:
        # This often happens when the request is blocked, times out, or returns no text.
        # Attach whatever prompt-level feedback the response carries to aid
        # debugging, without trying to "extract" JSON from it.
        feedback = getattr(resp, "prompt_feedback", None)
        feedback_note = f" prompt_feedback={feedback!r}" if feedback is not None else ""
        raise ValueError(
            "Model returned an empty response in JSON mode. "
            f"model={effective_model_name!r}. "
            "Check for safety blocks, timeouts, or SDK/model compatibility."
            + feedback_note
        )

    try:
        return json.loads(raw), raw
    except json.JSONDecodeError as e:
        # Cap the excerpt so a huge malformed response does not flood the error.
        excerpt = raw[:4000]
        raise ValueError(
            "Model did not return valid JSON (despite JSON mode). "
            f"model={effective_model_name!r}. "
            "Raw model output (truncated):\n" + excerpt
        ) from e

def validate_property_list(properties_json: dict) -> None:
    """Validate that property list has correct structure.

    Accepted shapes:
      - a dict whose "result" is missing or None (insufficient-spec case), or
      - a dict whose "result" is a list containing only strings.

    Args:
        properties_json: The parsed JSON response from Gemini

    Raises:
        ValueError: If validation fails (payload is None, is not a JSON
            object, "result" is not a list, or an item is not a string).
    """
    if properties_json is None:
        raise ValueError("Properties JSON is None")

    # A syntactically valid response can still be a top-level array/string/number;
    # report that as a validation failure rather than an AttributeError on .get().
    if not isinstance(properties_json, dict):
        raise ValueError(
            f"Properties JSON must be an object, got {type(properties_json).__name__}"
        )

    result = properties_json.get("result")

    # If result is null, that's okay (insufficient spec case)
    if result is None:
        return

    if not isinstance(result, list):
        raise ValueError(f"result must be a list, got {type(result).__name__}")

    for i, item in enumerate(result):
        if not isinstance(item, str):
            raise ValueError(
                f"Property {i} must be a string, got {type(item).__name__}: {item}"
            )

In [115]:
# Call for properties generation
# Initialise the outputs first so the reporting code below works even when the call fails.
properties_json = None
properties_raw = ""
properties_error = None

try:
    properties_json, properties_raw = call_gemini_json(
        PROMPT,
        model_name=MODEL_NAME,
        temperature=TEMPERATURE,
        max_output_tokens=MAX_OUTPUT_TOKENS,
        timeout_s=TIMEOUT_SEC,
        schema=PROPERTIES_RESPONSE_SCHEMA,
    )
    # Validate the structure
    # Note: if validation fails, properties_json has already been assigned and
    # keeps the unvalidated value; only properties_error records the failure.
    validate_property_list(properties_json)
except Exception as e:
    # Record the failure as text instead of re-raising so the report below still prints.
    properties_error = f"{type(e).__name__}: {e}"

# Report: raw text is capped at 2000 characters, with "..." marking truncation.
print("Raw model output (truncated):")
print(properties_raw[:2000] + ("..." if len(properties_raw) > 2000 else ""))
print("\nParsed JSON:")
pprint(properties_json)
if properties_error:
    print("\nERROR:")
    print(properties_error)

Raw model output (truncated):
{
  "result": [
    "For all integers N such that 1 <= N <= 3999, it must be true that from_numerals(to_numerals(N)) == N.",
    "For all integers N such that 1 <= N <= 3999, the return value of to_numerals(N) must be of type string.",
    "For all integers N such that 1 <= N <= 3999, every character in the string returned by to_numerals(N) must be one of 'I', 'V', 'X', 'L', 'C', 'D', 'M'.",
    "For all integers N such that 1 <= N <= 3999, the string returned by to_numerals(N) must not contain the substrings 'VV', 'LL', or 'DD'.",
    "For all integers N such that 1 <= N <= 3999, the string returned by to_numerals(N) must not contain the substrings 'IIII', 'XXXX', 'CCCC', or 'MMMM'.",
    "For all integers N such that 1 <= N <= 3999, if the string S = to_numerals(N) contains a symbol s1 immediately followed by a symbol s2 where value(s1) < value(s2), then the pair s1s2 must be one of 'IV', 'IX', 'XL', 'XC', 'CD', 'CM'.",
    "For all integers N such that 

In [116]:
# ===== Generate PBT function signature and strategies from property list =====
# This will call Gemini with the spec and ALL properties to get a unified test function signature

# Define schema for PBT function signature response
# (Google Genai Schema format, as used for the other response schemas in this notebook.)
# - parameters: one {name, type_hint} entry per parameter of test_function_pbt
# - strategies: one {param_name, strategy, notes?} entry; "notes" is optional and nullable
PBT_SIGNATURE_SCHEMA: dict[str, Any] = {
    "type": "OBJECT",
    "properties": {
        "parameters": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "name": {"type": "STRING"},
                    "type_hint": {"type": "STRING"},
                },
                "required": ["name", "type_hint"]
            },
            "description": "List of parameters for the test_function_pbt function"
        },
        "strategies": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "param_name": {"type": "STRING"},
                    "strategy": {"type": "STRING"},
                    "notes": {"type": "STRING", "nullable": True},
                },
                "required": ["param_name", "strategy"]
            },
            "description": "Hypothesis strategies for each parameter"
        },
    },
    "required": ["parameters", "strategies"]
}

# Initialise the outputs first so the reporting code at the bottom of the cell
# works even when the call is skipped or fails.
pbt_signature_json = None
pbt_signature_raw = ""
pbt_signature_error = None

# Only proceed if we got properties
# (this checks for a non-empty "result"; it does not look at properties_error)
if properties_json and properties_json.get("result"):
    property_list = properties_json.get("result", [])
    problem_spec_text = _read_problem_spec(problem=PROBLEM, freeform_spec=FREEFORM_SPEC).strip()
    
    # Build prompt for PBT signature generation
    # Properties are rendered as a 1-based numbered list.
    properties_str = "\n".join([f"{i+1}. {prop}" for i, prop in enumerate(property_list)])
    
    # NOTE(review): this prompt describes the signature as returning bool, while the
    # later body-generation cell builds a "-> None" signature — confirm which is intended.
    PBT_SIGNATURE_PROMPT = '\n'.join([
        "You are designing a property-based test (PBT) function signature for Hypothesis.",
        "Output MUST be valid JSON and NOTHING ELSE and match the provided schema.",
        "",
        "Problem specification:",
        problem_spec_text,
        "",
        "Properties to test:",
        properties_str,
        "",
        "Task:",
        "1) Determine the parameters needed for a test function that can check ALL these properties.",
        "   - The function signature will be: def test_function_pbt(<parameters>) -> bool:",
        "   - Parameters should represent the INPUTS to the function under test (e.g., if testing to_numerals(n), the parameter would be 'n: int').",
        "   - Include type hints for each parameter.",
        "2) Provide Hypothesis strategies to generate each parameter.",
        "   - Strategies should be Python expressions like 'st.integers(min_value=1, max_value=3999)'.",
        "   - Respect constraints from the spec and properties.",
        "",
        "Return a JSON object with:",
        "- parameters: [{\"name\": <str>, \"type_hint\": <str>}, ...]",
        "- strategies: [{\"param_name\": <str>, \"strategy\": <str>, \"notes\": <str|null>}, ...]",
    ])
    
    try:
        pbt_signature_json, pbt_signature_raw = call_gemini_json(
            PBT_SIGNATURE_PROMPT,
            model_name=MODEL_NAME,
            temperature=TEMPERATURE,
            max_output_tokens=MAX_OUTPUT_TOKENS,
            timeout_s=TIMEOUT_SEC,
            schema=PBT_SIGNATURE_SCHEMA,
        )
    except Exception as e:
        # Record the failure as text instead of re-raising so the report below still prints.
        pbt_signature_error = f"{type(e).__name__}: {e}"

# Report: raw text is capped at 2000 characters, with "..." marking truncation.
print("Raw model output (truncated):")
print(pbt_signature_raw[:2000] + ("..." if len(pbt_signature_raw) > 2000 else ""))
print("\nParsed JSON:")
pprint(pbt_signature_json)
if pbt_signature_error:
    print("\nERROR:")
    print(pbt_signature_error)

Raw model output (truncated):
{
  "parameters": [
    {
      "name": "N",
      "type_hint": "Any"
    }
  ],
  "strategies": [
    {
      "param_name": "N",
      "strategy": "st.one_of(st.integers(min_value=1, max_value=3999), st.integers(max_value=0), st.integers(min_value=4000), st.floats(allow_nan=false, allow_infinity=false), st.text(), st.none(), st.booleans())",
      "notes": "Generates valid integers (1-3999) for core functionality tests, out-of-range integers (<1, >3999) for ValueError tests, and various non-integer types (float, string, None, boolean) for TypeError tests. This comprehensive strategy allows a single test function to cover all specified properties, including error handling."
    }
  ]
}

Parsed JSON:
{'parameters': [{'name': 'N', 'type_hint': 'Any'}],
 'strategies': [{'notes': 'Generates valid integers (1-3999) for core '
                          'functionality tests, out-of-range integers (<1, '
                          '>3999) for ValueError tests, and 

In [117]:
# ===== Generate test function body implementation =====
# Call LLM to generate 4 semantically different implementations

# Define schema for function body response
# (Google Genai Schema format.) Each implementation entry must carry all three
# fields: the body source text, a description, and a list of import statements.
FUNCTION_BODY_SCHEMA: dict[str, Any] = {
    "type": "OBJECT",
    "properties": {
        "implementations": {
            "type": "ARRAY",
            "items": {
                "type": "OBJECT",
                "properties": {
                    "body": {
                        "type": "STRING",
                        "description": "The Python code for the function body (without the def line)"
                    },
                    "description": {
                        "type": "STRING",
                        "description": "Brief description of what makes this implementation semantically different"
                    },
                    "imports": {
                        "type": "ARRAY",
                        "items": {"type": "STRING"},
                        "description": "Any additional imports needed"
                    }
                },
                "required": ["body", "description", "imports"]
            },
            "description": "Array of 4 semantically different implementations"
        }
    },
    "required": ["implementations"]
}

# Outputs of this cell. They are initialised up front so the summary at the
# bottom works even when generation is skipped or fails.
function_body_json = None
function_body_raw = ""
function_body_error = None
valid_implementations = []  # Cache for valid implementations
MAX_RETRIES = 5

# Only proceed if we have signature
if pbt_signature_json and not pbt_signature_error and properties_json:
    # `or []` guards against a null "result" (the insufficient-spec response shape).
    property_list = properties_json.get("result") or []
    problem_spec_text = _read_problem_spec(problem=PROBLEM, freeform_spec=FREEFORM_SPEC).strip()
    
    # Build the function signature string for context
    params = pbt_signature_json.get("parameters", [])
    if params:
        param_strs = [f"{p['name']}: {p['type_hint']}" for p in params]
        signature_for_prompt = f"def test_function_pbt({', '.join(param_strs)}) -> None:"
    else:
        signature_for_prompt = "def test_function_pbt() -> None:"
    
    # Properties are rendered as a 1-based numbered list.
    properties_str = "\n".join([f"{i+1}. {prop}" for i, prop in enumerate(property_list)])
    
    FUNCTION_BODY_PROMPT = '\n'.join([
        "You are implementing FOUR semantically different versions of a property-based test function.",
        "Output MUST be valid JSON and NOTHING ELSE and match the provided schema.",
        "",
        "Problem specification:",
        problem_spec_text,
        "",
        "Properties to test:",
        properties_str,
        "",
        "Function signature:",
        signature_for_prompt,
        "",
        "Task:",
        "Generate FOUR semantically different implementations of the test function body.",
        "Each implementation MUST:",
        "1) Be syntactically valid Python code (indented with 4 spaces).",
        "2) Test some subset of the properties using assertions.",
        "3) Return None (Hypothesis requirement - no return statement needed).",
        "4) Print the input values at the start.",
        "5) Be SEMANTICALLY DIFFERENT from the others in meaningful ways:",
        "   - Test different subsets of properties",
        "   - Use different testing strategies (direct vs indirect)",
        "   - Check different edge cases",
        "   - Use different assertion styles",
        "   - Make different assumptions about valid inputs",
        "",
        "The differences should be significant enough that when tested with the same",
        "Hypothesis strategy, the implementations may produce DIFFERENT results (pass/fail).",
        "",
        "Important formatting:",
        "- Return ONLY the function body code (NOT the 'def' line).",
        "- Use exactly 4 spaces for indentation.",
        "- Each implementation should be complete and executable.",
        "- Do NOT use placeholder comments like '# ... rest of implementation'.",
        "",
        "Return a JSON object with:",
        '{"implementations": [',
        '  {"body": <string>, "description": <string>, "imports": [<strings>]},',
        '  ... (4 total)',
        ']}',
    ])
    
    # Keep calling the model until 4 bodies compile or the attempt budget runs out.
    # Valid bodies accumulate across attempts, so a retry only has to fill the gap.
    retry_count = 0
    while len(valid_implementations) < 4 and retry_count < MAX_RETRIES:
        if retry_count > 0:
            print(f"\nüîÑ Retry attempt {retry_count}/{MAX_RETRIES}: Need {4 - len(valid_implementations)} more valid implementations")
        
        try:
            function_body_json, function_body_raw = call_gemini_json(
                FUNCTION_BODY_PROMPT,
                model_name=MODEL_NAME,
                temperature=TEMPERATURE,
                max_output_tokens=MAX_OUTPUT_TOKENS,
                timeout_s=TIMEOUT_SEC,
                schema=FUNCTION_BODY_SCHEMA,
            )
            
            implementations = function_body_json.get("implementations", [])
            
            # Validate each implementation by trying to compile it
            for idx, impl in enumerate(implementations):
                # Skip if we already have 4 valid ones
                if len(valid_implementations) >= 4:
                    break
                
                body = impl.get("body", "")
                description = impl.get("description", "")
                imports = impl.get("imports", [])
                
                # Try to compile the complete function
                # (compile() only checks syntax; nothing is executed here)
                complete_func = f"{signature_for_prompt}\n{body}"
                
                try:
                    compile(complete_func, '<string>', 'exec')
                    # Valid! Add to cache
                    valid_implementations.append({
                        "body": body,
                        "description": description,
                        "imports": imports,
                        "index": len(valid_implementations) + 1
                    })
                    print(f"‚úÖ Implementation {len(valid_implementations)}: Valid - {description}")
                except SyntaxError as e:
                    print(f"‚ùå Implementation candidate {idx+1} invalid: {e}")
                    print(f"   Description: {description}")
            
        except Exception as e:
            function_body_error = f"{type(e).__name__}: {e}"
            print(f"‚ùå Error calling LLM: {function_body_error}")
        finally:
            # Every attempt counts against the budget, whether it succeeded or raised.
            retry_count += 1
    
    # Check if we got all 4
    if len(valid_implementations) < 4:
        function_body_error = f"Only generated {len(valid_implementations)}/4 valid implementations after {MAX_RETRIES} attempts"
        print(f"\n‚ùå FINAL ERROR: {function_body_error}")
    else:
        print(f"\n‚úÖ Successfully generated 4 valid implementations!")
        function_body_json = {"implementations": valid_implementations}
        # A failed attempt that a later retry recovered from must not leave a stale
        # error behind: downstream cells skip assembly when function_body_error is set.
        function_body_error = None

print("\n" + "="*60)
print("Generated function body implementations:")
if valid_implementations:
    for impl in valid_implementations:
        print(f"\n--- Implementation {impl['index']} ---")
        print(f"Description: {impl['description']}")
        print(f"Imports: {impl['imports'] if impl['imports'] else '(none)'}")
        print("Body preview (first 200 chars):")
        # Parenthesised for readability; the conditional already applied to the whole sum.
        print((impl['body'][:200] + "...") if len(impl['body']) > 200 else impl['body'])
else:
    print("No valid implementations generated")
    if function_body_error:
        print(f"ERROR: {function_body_error}")

‚úÖ Implementation 1: Valid - This implementation performs a comprehensive check for valid inputs, covering roundtrip conversion, type, character validity, length, repetition limits, and specific invalid subtractive pairs. It also verifies the non-increasing order of token values, ensuring adherence to the greedy decomposition algorithm.
‚úÖ Implementation 2: Valid - This implementation focuses primarily on error handling for invalid input types and out-of-range integer values, asserting that the correct exceptions (TypeError or ValueError) are raised. It also includes direct assertions for specific, well-known Roman numeral conversions.
‚úÖ Implementation 3: Valid - This implementation emphasizes structural correctness of the Roman numeral string, performing extensive substring checks to ensure adherence to repetition limits (e.g., no 'VV', 'IIII') and the absence of invalid two-symbol sequences. It also verifies that any subtractive pairs are among the allowed canonical forms.
‚úÖ Im

In [118]:
# ===== Generate and define the test_function_pbt functions =====
# Parse the signature and use LLM-generated bodies (4 implementations)
#
# SECURITY NOTE: this cell exec()s model-generated import statements and function
# source directly into the notebook's global namespace. Model output is untrusted
# code; run this notebook only in an environment where that is acceptable.

test_functions = []  # List to store all generated functions

if pbt_signature_json and not pbt_signature_error and function_body_json and not function_body_error:
    params = pbt_signature_json.get("parameters", [])
    strategies = pbt_signature_json.get("strategies", [])
    implementations = function_body_json.get("implementations", [])

    # Every implementation shares the same parameter list, so render it once.
    # An empty list joins to "", which yields the zero-argument signature.
    param_strs = [f"{p['name']}: {p['type_hint']}" for p in params]
    
    print("="*60)
    print("Assembling test functions...")
    print("="*60)
    
    for impl in implementations:
        idx = impl['index']
        body_code = impl['body']
        imports = impl.get('imports', [])
        description = impl['description']
        
        print(f"\n--- Implementation {idx} ---")
        print(f"Description: {description}")
        
        # Build the function signature with index
        signature_str = f"def test_function_pbt_{idx}({', '.join(param_strs)}) -> None:"
        
        # Ensure body is properly indented
        # (only the first line is inspected; if it is not indented, every
        # non-blank line is shifted right by 4 spaces)
        if not body_code.startswith("    "):
            body_lines = body_code.split('\n')
            body_code = '\n'.join(['    ' + line if line.strip() else '' for line in body_lines])
        
        # The description is free text from the model. Escape backslashes and double
        # quotes so it cannot terminate the generated docstring early or introduce
        # invalid escape sequences; the docstring still reads as the original text.
        safe_description = description.replace("\\", "\\\\").replace('"', '\\"')
        
        function_code = f'''{signature_str}
    """Property-based test function generated by LLM.
    
    {safe_description}
    """
{body_code}
'''
        
        # Execute any needed imports first
        # (best effort: a failing import is reported and assembly continues)
        if imports:
            print(f"  Executing imports:")
            for imp in imports:
                print(f"    {imp}")
                try:
                    exec(imp, globals())
                except Exception as e:
                    print(f"      Warning: {e}")
        
        # Execute the function definition to make it available
        try:
            exec(function_code, globals())
            print(f"  ‚úÖ Function test_function_pbt_{idx} created successfully")
            
            # Store the function info
            test_functions.append({
                "index": idx,
                "description": description,
                "code": function_code,
                "imports": imports,
                "function_name": f"test_function_pbt_{idx}"
            })
        except Exception as e:
            print(f"  ‚ùå Error creating function: {e}")
            print(f"  Code was:\n{function_code}")
    
    print("\n" + "="*60)
    print(f"Created {len(test_functions)} test function implementations")
    print("="*60)
    
    # Print summary
    if test_functions:
        print("\nFunction signatures and strategies:")
        print(f"Parameters: {param_strs if params else 'None'}")
        print(f"\nStrategies:")
        for s in strategies:
            notes_str = f" # {s.get('notes')}" if s.get('notes') else ""
            print(f"  {s['param_name']}: {s['strategy']}{notes_str}")
        
        print("\nAvailable functions:")
        for tf in test_functions:
            print(f"  - {tf['function_name']}: {tf['description']}")
else:
    print("Skipping function generation due to errors or missing data")
    if pbt_signature_error:
        print(f"Signature error: {pbt_signature_error}")
    if function_body_error:
        print(f"Body error: {function_body_error}")
    test_functions = []

Assembling test functions...

--- Implementation 1 ---
Description: This implementation performs a comprehensive check for valid inputs, covering roundtrip conversion, type, character validity, length, repetition limits, and specific invalid subtractive pairs. It also verifies the non-increasing order of token values, ensuring adherence to the greedy decomposition algorithm.
  Executing imports:
    ROMAN_MAP_VALUES = {
    'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000,
    'IV': 4, 'IX': 9, 'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900
}
    ROMAN_SYMBOLS = set('IVXLCDM')
    ROMAN_TOKEN_VALUES = [
    (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'), (100, 'C'), (90, 'XC'),
    (50, 'L'), (40, 'XL'), (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I')
]
    def from_numerals(roman_numeral_string: str) -> int:
    if not isinstance(roman_numeral_string, str):
        raise TypeError("Input must be a string.")
    if not roman_numeral_string:
        raise ValueError("Inp

In [119]:
# ===== Test the generated functions with sample inputs =====
# Smoke test: call each generated test_function_pbt_<idx> directly with a few
# hand-picked values so obvious breakage shows up before the Hypothesis run.
# Only a signature with exactly one int-typed parameter can be exercised
# automatically; any other non-empty signature just lists the functions.

if not test_functions:
    print("No test functions defined - check for errors in previous cells")
else:
    print("Testing all generated test_function_pbt implementations with sample inputs:\n")

    if not pbt_signature_json:
        # Without a parsed signature there is no way to pick arguments; say so
        # instead of ending silently after the header.
        print("No PBT signature available - cannot choose sample inputs")
    else:
        params = pbt_signature_json.get("parameters", [])

        # `type_hint` can be absent or null in the model's JSON, so fall back to
        # '' before lower-casing. This is a substring match, so hints such as
        # "Optional[int]" also count as int-typed.
        has_int_param = any(
            'int' in (p.get('type_hint') or '').lower() for p in params
        )

        if not params:
            # No parameters: each function is called with no arguments.
            for tf in test_functions:
                func_name = tf['function_name']
                func = globals().get(func_name)
                if func:
                    try:
                        func()
                        print(f"‚úÖ {func_name}: Passed")
                    except Exception as e:
                        print(f"‚ùå {func_name}: {e}")
        elif len(params) == 1 and has_int_param:
            # For roman_numerals, we expect a single integer parameter.
            # The sample value is passed positionally, so this path is only
            # valid when the function takes exactly one argument.
            test_values = [1, 10, 100, 3999]

            for tf in test_functions:
                func_name = tf['function_name']
                func = globals().get(func_name)
                if not func:
                    continue

                print("="*60)
                print(f"Testing: {func_name}")
                print(f"Description: {tf['description']}")
                print("-" * 60)

                for test_val in test_values:
                    try:
                        func(test_val)
                        print(f"‚úÖ test_val={test_val}: Passed")
                    except AssertionError as e:
                        # The property itself was violated.
                        print(f"‚ùå test_val={test_val}: Failed - {e}")
                    except Exception as e:
                        # Anything else means the generated code is broken.
                        print(f"‚ö†Ô∏è  test_val={test_val}: Error - {e}")
                print()
        else:
            # Generic case: no safe way to synthesize arguments here.
            for tf in test_functions:
                func_name = tf['function_name']
                print(f"\n{func_name}: {tf['description']}")
                print("(Call with appropriate arguments based on the signature)")

Testing all generated test_function_pbt implementations with sample inputs:


test_function_pbt_1: This implementation performs a comprehensive check for valid inputs, covering roundtrip conversion, type, character validity, length, repetition limits, and specific invalid subtractive pairs. It also verifies the non-increasing order of token values, ensuring adherence to the greedy decomposition algorithm.
(Call with appropriate arguments based on the signature)

test_function_pbt_2: This implementation focuses primarily on error handling for invalid input types and out-of-range integer values, asserting that the correct exceptions (TypeError or ValueError) are raised. It also includes direct assertions for specific, well-known Roman numeral conversions.
(Call with appropriate arguments based on the signature)

test_function_pbt_3: This implementation emphasizes structural correctness of the Roman numeral string, performing extensive substring checks to ensure adherence to repetition li

In [None]:
# ===== Run test_function_pbt implementations with Hypothesis =====
# This uses the @given decorator to run each function with generated inputs.
#
# Flow: evaluate every strategy expression from the signature JSON into a real
# Hypothesis strategy, then wrap each generated test function in @given and run
# it. If any single strategy fails to evaluate, the whole run is skipped.

if pbt_signature_json and not pbt_signature_error and test_functions:
    import re

    from hypothesis import given, settings

    # Number of examples Hypothesis generates per implementation; also used in
    # the progress messages so they cannot drift from the actual setting.
    HYPOTHESIS_MAX_EXAMPLES = 100

    # JSON-style literals the model sometimes emits, mapped to Python spellings.
    JSON_TO_PYTHON_LITERALS = {"true": "True", "false": "False"}
    json_literal_pattern = re.compile(r"\b(?:true|false)\b")

    strategies_list = pbt_signature_json.get("strategies", [])

    if strategies_list:
        print("Running all test_function_pbt implementations with Hypothesis...\n")

        # Build the strategy arguments dynamically
        strategy_kwargs = {}
        for s in strategies_list:
            param_name = s['param_name']
            strategy_expr = s['strategy']

            # Fix JSON-style booleans (true/false) to Python booleans (True/False).
            # Whole-word matching only: a plain substring replace would also
            # rewrite any longer token that merely contains "true" or "false".
            # (Standalone words inside string literals are still rewritten.)
            strategy_expr_fixed = json_literal_pattern.sub(
                lambda m: JSON_TO_PYTHON_LITERALS[m.group(0)], strategy_expr
            )

            # Evaluate the strategy expression to get the actual strategy object.
            # NOTE(security): eval() runs model-generated text with full access to
            # the notebook namespace. Only run this on output you trust.
            try:
                strategy_obj = eval(strategy_expr_fixed)
                strategy_kwargs[param_name] = strategy_obj
                print(f"Strategy for {param_name}: {strategy_expr_fixed}")
            except Exception as e:
                print(f"Error evaluating strategy for {param_name}: {e}")
                print(f"  Original expression: {strategy_expr}")
                print(f"  Fixed expression: {strategy_expr_fixed}")
                # A partial set of strategies cannot drive @given; abandon the run.
                strategy_kwargs = None
                break

        if strategy_kwargs:
            print(f"\nRunning {HYPOTHESIS_MAX_EXAMPLES} test cases per implementation...")
            print("=" * 60)

            # Test each implementation
            for tf in test_functions:
                func_name = tf['function_name']
                func = globals().get(func_name)

                if func:
                    print(f"\n--- Testing: {func_name} ---")
                    print(f"Description: {tf['description']}")
                    print("-" * 60)

                    # Create a wrapper that applies @given decorator. The wrapper
                    # reads `func` from the enclosing scope, which is safe only
                    # because it is invoked immediately within this iteration.
                    @given(**strategy_kwargs)
                    @settings(max_examples=HYPOTHESIS_MAX_EXAMPLES, print_blob=True)
                    def run_test(**kwargs):
                        return func(**kwargs)

                    # Run the test
                    try:
                        run_test()
                        print(f"‚úÖ All {HYPOTHESIS_MAX_EXAMPLES} test cases passed for {func_name}!")
                    except Exception as e:
                        print(f"‚ùå Test failed for {func_name}: {e}")

                    print()
        else:
            print("Could not evaluate strategies")
    else:
        print("No strategies available")
else:
    # Report the most specific reason the run was skipped.
    if not test_functions:
        print("No test functions available - check for errors in previous cells")
    elif pbt_signature_error:
        print(f"Cannot run tests - signature error: {pbt_signature_error}")
    else:
        print("Cannot run with Hypothesis - missing signature or function not defined")

Running all test_function_pbt implementations with Hypothesis...

Error evaluating strategy for N: name 'false' is not defined
Could not evaluate strategies


In [121]:
# Write run log to logs/<problem>_<timestamp>.json
from __future__ import annotations

import json
from typing import Any

def _jsonable(obj: Any) -> Any:
    """Best-effort conversion to JSON-serializable types."""
    try:
        json.dumps(obj)
        return obj
    except TypeError:
        return repr(obj)

# Assemble the run record section by section: the configuration used, the
# prompts sent to the model, and a raw/parsed/error triple for each stage.
_log_settings = {
    "FREEFORM_SPEC": FREEFORM_SPEC,
    "MODEL_NAME": MODEL_NAME,
    "TEMPERATURE": TEMPERATURE,
    "MAX_OUTPUT_TOKENS": MAX_OUTPUT_TOKENS,
    "TIMEOUT_SEC": TIMEOUT_SEC,
    "python_version": sys.version,
    "platform": platform.platform(),
}

_log_prompts = {
    "properties_prompt": PROMPT,
    "signature_strategy_prompt": SIGNATURE_STRATEGY_PROMPT,
    # Looked up defensively: this name may not be defined in the notebook
    # namespace, in which case None is logged.
    "function_body_prompt": globals().get('FUNCTION_BODY_PROMPT'),
}

# Only these fields of each assembled test function are persisted.
_generated_function_fields = ("index", "function_name", "description", "code")

_log_responses = {
    "properties": {
        "raw": properties_raw,
        "parsed": _jsonable(properties_json),
        "error": properties_error,
    },
    "pbt_signature": {
        "raw": pbt_signature_raw,
        "parsed": _jsonable(pbt_signature_json),
        "error": pbt_signature_error,
    },
    "function_body": {
        "raw": function_body_raw,
        "parsed": _jsonable(function_body_json),
        "error": function_body_error,
        # An empty or missing function list is logged as [].
        "generated_functions": [
            {field: tf[field] for field in _generated_function_fields}
            for tf in (test_functions or [])
        ],
    },
}

run_log = {
    "timestamp_utc": RUN_TIMESTAMP,
    "problem": PROBLEM,
    "settings": _log_settings,
    "prompts": _log_prompts,
    "responses": _log_responses,
}

# Persist as indented JSON at the path chosen when the run started.
with RUN_LOG_PATH.open("w") as log_file:
    json.dump(run_log, log_file, indent=2)

print(f"\n‚úÖ Run log written to: {RUN_LOG_PATH}")
print(f"Generated {len(test_functions or [])} function implementations")


‚úÖ Run log written to: logs/roman_numerals_20260115_204027Z.json
Generated 4 function implementations
