In [None]:
# Imports
import hypothesis as hp
from hypothesis import strategies as st
from itertools import combinations
from collections import defaultdict
from typing import List, Optional, Set, Union
import pandas as pd
from pprint import pprint
from hypothesis_pick import (
    find_disagreements,
    find_stronger_weaker,
    infer_implications,
)

import platform
import sys
from datetime import datetime, timezone
from pathlib import Path

import json
import os
from typing import Any, Optional, Tuple

from dotenv import load_dotenv
from google import genai

In [None]:
# Configure PICK: PBT settings
# Spec flavour: True loads <problem>/freeform_spec.md, False loads <problem>/labeled_spec.xml.
FREEFORM_SPEC: bool = False
# Problem directory name; also used as the prefix of the run-log filename.
PROBLEM: str = "roman_numerals"
# Model and sampling settings forwarded to call_gemini_json.
MODEL_NAME: str = "gemini-2.5-flash"
TEMPERATURE: float = 0.2
# None means "do not set max_output_tokens on the request", hence Optional[int] rather than int.
MAX_OUTPUT_TOKENS: Optional[int] = None
# Timeout value (seconds) passed to call_gemini_json as timeout_s.
TIMEOUT_SEC: int = 60


In [None]:
# Define path to log file
# The timestamp is taken once, in UTC, and reused for both the log filename and the log contents.
RUN_TIMESTAMP: str = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%SZ")
LOGS_DIR: Path = Path("logs")
# Create logs/ up front; exist_ok=True makes re-running this cell harmless.
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# One JSON log per run: logs/<problem>_<timestamp>.json
RUN_LOG_PATH: Path = LOGS_DIR / f"{PROBLEM}_{RUN_TIMESTAMP}.json"

print(f"Run timestamp (UTC): {RUN_TIMESTAMP}")
print(f"Run log path: {RUN_LOG_PATH}")

Run timestamp (UTC): 20260115_022504Z
Run log path: logs\roman_numerals_20260115_022504Z.json


In [None]:
# Read in the problem specification file
def _read_problem_spec(*, problem: str, freeform_spec: bool) -> str:
    """Load the user-provided spec text for a given problem.

    Files:
      - if freeform_spec=True:  <problem>/freeform_spec.md
      - if freeform_spec=False: <problem>/labeled_spec.xml   (already XML)

    Search order (relative to the current working directory):
      1. problems/<problem>/<filename>
      2. <problem>/<filename>

    Args:
        problem: Name of the problem directory.
        freeform_spec: Selects which of the two spec files to load.

    Returns:
        The contents of the first matching file, decoded as UTF-8.

    Raises:
        FileNotFoundError: If no candidate path is an existing regular file.
    """
    filename = "freeform_spec.md" if freeform_spec else "labeled_spec.xml"
    # Path(".") / problem normalises to Path(problem), so there are only two
    # distinct locations to probe.
    candidates = [
        Path("problems") / problem / filename,
        Path(problem) / filename,
    ]
    for p in candidates:
        # is_file() rather than exists(): a directory that happens to carry the
        # spec's name must not shadow a real spec file later in the search order.
        if p.is_file():
            return p.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Couldn't find {filename!r} for problem={problem!r}; tried: {candidates}")

# Build the property generation prompt
def build_property_gen_prompt(
    *,
    problem: str,
    freeform_spec: bool,
    num_properties: int = 15,
) -> str:
    """Assemble the property-elicitation prompt sent to Gemini.

    The prompt contains, in order: a short preamble, the problem spec (loaded
    via ``_read_problem_spec`` and stripped), a description of what that spec
    flavour is expected to contain, the task instructions, and the strict JSON
    output contract (Schema A on success, Schema B when the spec is
    insufficient).

    Args:
        problem: Problem name used to locate the spec file.
        freeform_spec: True for the freeform spec, False for the labeled XML spec.
        num_properties: Number of distinct properties the model is asked for.

    Returns:
        The full prompt as a single newline-joined string.
    """
    spec_text = _read_problem_spec(problem=problem, freeform_spec=freeform_spec).strip()

    # Describe what the chosen spec flavour is supposed to contain.
    if freeform_spec:
        expectation_lines = (
            "Freeform spec expectations (freeform_spec=true):",
            "- The user text should include background/problem statement.",
            "- It should clearly name the target function(s) and what they do.",
            "- It should describe inputs/outputs, constraints, and edge cases.",
            "- It should define any custom data types/encodings and examples if relevant.",
        )
    else:
        expectation_lines = (
            "Labeled spec expectations (freeform_spec=false):",
            "- <background>: high-level context, definitions, and rules.",
            "- <custom_data_types>: any custom types, encodings, invariants, examples.",
            "- <function_signature>: names, parameters, return types (as text), and meaning.",
            "- <function_details>: precise behavior, preconditions, postconditions, errors.",
        )
    expectations = "\n".join(expectation_lines)

    preamble = [
        "You are helping me generate property-based tests (PBT) in Python.",
        "My test framework will be Hypothesis.",
        "",
        "Problem specification:",
        spec_text,
        "",
        expectations,
        "",
    ]
    task = [
        "Task:",
        f"- Produce {num_properties} distinct properties of the target function(s).",
        "- Each property MUST be testable via random data generation (Hypothesis).",
        "- Phrase each property precisely using quantifiers like 'for all' / 'there exists' and include any necessary preconditions.",
        "- Include edge cases, invariants, metamorphic properties, and error/exception behavior when relevant.",
        "- Do NOT include any code.",
        "",
    ]
    output_contract = [
        "Output requirements (STRICT):",
        "- Output MUST be valid JSON and NOTHING ELSE.",
        "- Output MUST be exactly ONE JSON object matching one of the schemas below.",
        "",
        "Schema A (success):",
        "{",
        "  \"result\": [<string>, <string>, ...],",
        "  \"explanation\": null",
        "}",
        "",
        "Schema B (insufficient/unknown):",
        "{",
        "  \"result\": null,",
        "  \"explanation\": <string describing what is missing / ambiguous and what to add>",
        "}",
        "",
        "Additional constraints:",
        "- Each string in result must be ONE property statement.",
        "- Do not wrap in Markdown. Do not include code blocks.",
    ]
    return "\n".join([*preamble, *task, *output_contract])


# Build the property-generation prompt once from the configured problem and echo it for inspection.
PROMPT = build_property_gen_prompt(problem=PROBLEM, freeform_spec=FREEFORM_SPEC)
print(PROMPT)

You are helping me generate property-based tests (PBT) in Python.
My test framework will be Hypothesis.

Problem specification:
<background>
This problem concerns conversion between (1) positive integers and (2) their canonical Roman numeral string representations.

Roman numerals use seven symbols with fixed values:
	- I=1, V=5, X=10, L=50, C=100, D=500, M=1000.

Numeral formation rules (canonical form):
	1) Additive principle: if a symbol appears after a symbol of equal or greater value, its value is added.
		 Example: VI = 5 + 1 = 6.
	2) Subtractive principle: if a symbol appears before a symbol of greater value, its value is subtracted.
		 Subtraction is restricted to these pairs only:
			 - I before V or X (IV=4, IX=9)
			 - X before L or C (XL=40, XC=90)
			 - C before D or M (CD=400, CM=900)
	3) Repetition limits:
			 - V, L, D must not repeat.
			 - I, X, C, M may repeat at most 3 times consecutively.
	4) Order of magnitude: symbols are arranged from highest to lowest value lef

In [None]:
# Response schema for the property-generation call.
# Written in the Google GenAI Schema dialect (upper-case type names plus a
# "nullable" flag); this is NOT standard JSON Schema.
PROPERTIES_RESPONSE_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        # Success payload: list of property statements; null when the spec is unusable.
        "result": {
            "type": "ARRAY",
            "items": {"type": "STRING"},
            "description": "Array of property statements, or null if spec is insufficient",
            "nullable": True,
        },
        # Failure payload: what is missing or ambiguous; null on success.
        "explanation": {
            "type": "STRING",
            "description": "Explanation of what is missing/ambiguous if result is null, otherwise null",
            "nullable": True,
        },
    },
    # Both keys are always required; exactly one of them is expected to be null.
    "required": ["result", "explanation"],
}

In [None]:
# Build prompt to define hypothesis signature + strategy
def build_signature_and_strategies_prompt(
    *,
    problem: str,
    freeform_spec: bool,
    max_strategies: int = 12,
) -> str:
    """Assemble the prompt asking Gemini for a test-harness signature and input strategies.

    The prompt contains, in order: a short preamble, the problem name, the
    problem spec (loaded via ``_read_problem_spec`` and stripped), a
    description of what that spec flavour is expected to contain, the
    three-step task, and the strict JSON output shape.

    Args:
        problem: Problem name; used to locate the spec file and echoed in the prompt.
        freeform_spec: True for the freeform spec, False for the labeled XML spec.
        max_strategies: Upper bound on strategy entries the model is told to return.

    Returns:
        The full prompt as a single newline-joined string.
    """
    spec_text = _read_problem_spec(problem=problem, freeform_spec=freeform_spec).strip()

    # Describe what the chosen spec flavour is supposed to contain.
    if freeform_spec:
        expectation_lines = (
            "Freeform spec expectations (freeform_spec=true):",
            "- The user text should include background/problem statement.",
            "- It should clearly name the target function(s) and what they do.",
            "- It should describe inputs/outputs, constraints, edge cases.",
            "- It should define any custom data types/encodings and examples if relevant.",
        )
    else:
        expectation_lines = (
            "Labeled spec expectations (freeform_spec=false):",
            "- <background>: high-level context, definitions, and rules.",
            "- <custom_data_types>: any custom types, encodings, invariants, examples.",
            "- <function_signature>: names, parameters, return types (as text), and meaning.",
            "- <function_details>: precise behavior, preconditions, postconditions, errors.",
        )
    expectations = "\n".join(expectation_lines)

    preamble = [
        "You are helping me generate property-based tests (PBT) in Python.",
        "My test framework will be Hypothesis.",
        "",
        f"Problem name: {problem}",
        "",
        "Problem specification:",
        spec_text,
        "",
        expectations,
        "",
    ]
    task = [
        "Task:",
        "1) Identify the target function under test, its argument list, and its return type.",
        "2) Propose a standard *property-checking* function signature suitable for Hypothesis.",
        "   - It should accept the function under test (as a Callable) and the same inputs that function expects.",
        "   - It must return a boolean (True/False) indicating whether the property holds.",
        "3) Provide Hypothesis strategies to generate valid inputs for the property-checking function.",
        "   - Provide strategies for the *inputs* (not for the Callable itself).",
        "   - Respect any constraints/invariants from the spec.",
        "   - If some constraints are hard, propose assume()-style preconditions (as strings) or filtering.",
        "",
    ]
    output_contract = [
        "Output requirements (STRICT):",
        "- Output MUST be valid JSON and NOTHING ELSE.",
        "- If the spec is insufficient or the inputs are impossible/unsupported to generate with Hypothesis, return:",
        "  {\"result\": null, \"explanation\": <string>}.",
        "- Otherwise return an object of this shape:",
        "  {",
        "    \"result\": {",
        "      \"function_under_test\": {\"name\": <string>, \"args\": [<string>...], \"returns\": <string>},",
        "      \"property_function\": {\"name\": <string>, \"typing\": <string>, \"args\": [<string>...], \"returns\": \"bool\"},",
        "      \"strategies\": [",
        "        {\"arg\": <string>, \"strategy\": <string>, \"notes\": <string|null>},",
        "        ...",
        "      ],",
        "      \"assumptions\": [<string>...],",
        "      \"imports\": [<string>...]",
        "    }",
        "  }",
        f"- Provide at most {max_strategies} strategy entries.",
        "- The values in \"strategy\" should be Python expressions like 'st.integers(...)' as STRINGS (not full code).",
    ]
    return "\n".join([*preamble, *task, *output_contract])


# Build the signature/strategy prompt once from the configured problem and echo it for inspection.
SIGNATURE_STRATEGY_PROMPT = build_signature_and_strategies_prompt(
    problem=PROBLEM,
    freeform_spec=FREEFORM_SPEC,
    max_strategies=12,
)

print(SIGNATURE_STRATEGY_PROMPT)

You are helping me generate property-based tests (PBT) in Python.
My test framework will be Hypothesis.

Problem name: roman_numerals

Problem specification:
<background>
This problem concerns conversion between (1) positive integers and (2) their canonical Roman numeral string representations.

Roman numerals use seven symbols with fixed values:
	- I=1, V=5, X=10, L=50, C=100, D=500, M=1000.

Numeral formation rules (canonical form):
	1) Additive principle: if a symbol appears after a symbol of equal or greater value, its value is added.
		 Example: VI = 5 + 1 = 6.
	2) Subtractive principle: if a symbol appears before a symbol of greater value, its value is subtracted.
		 Subtraction is restricted to these pairs only:
			 - I before V or X (IV=4, IX=9)
			 - X before L or C (XL=40, XC=90)
			 - C before D or M (CD=400, CM=900)
	3) Repetition limits:
			 - V, L, D must not repeat.
			 - I, X, C, M may repeat at most 3 times consecutively.
	4) Order of magnitude: symbols are arranged fr

In [None]:
def _load_gemini_api_key() -> str:
    """Return the Gemini API key taken from the process environment.

    ``load_dotenv`` is first pointed at ``.env`` (relative to the current
    working directory) with ``override=False``, then ``GEMINI_API_KEY`` is
    read from the environment.

    Returns:
        The non-empty value of ``GEMINI_API_KEY``.

    Raises:
        RuntimeError: If ``GEMINI_API_KEY`` is unset or empty.
    """
    load_dotenv(dotenv_path=Path(".env"), override=False)
    api_key = os.getenv("GEMINI_API_KEY")
    if api_key:
        return api_key
    raise RuntimeError(
        "GEMINI_API_KEY is missing. Add it to a .env file at the repo root as GEMINI_API_KEY=..."
    )

# Call Gemini with JSON response
def call_gemini_json(
    prompt: str,
    *,
    model_name: Optional[str] = None,
    temperature: float = 0.2,
    max_output_tokens: Optional[int] = None,
    timeout_s: int = 60,
    schema: Optional[dict[str, Any]] = None,
) -> Tuple[Any, str]:
    """Call Gemini with `prompt` and return (parsed_json, raw_text).

    This relies on Gemini JSON mode (response_mime_type='application/json').
    If JSON parsing fails, the raised error includes the raw response text
    (truncated) so you can see what the model actually returned.

    Args:
        prompt: Full prompt text sent as the request contents.
        model_name: Model to call; falls back to the GEMINI_MODEL environment
            variable, then to "gemini-2.5-flash".
        temperature: Sampling temperature for the request.
        max_output_tokens: Optional cap on generated tokens; left unset when None.
        timeout_s: Request timeout in seconds. A falsy value (0 / None) leaves
            the SDK's default timeout in place.
        schema: Optional response schema (Google GenAI Schema format).

    Returns:
        A tuple ``(parsed_json, raw_text)`` where ``raw_text`` is the stripped
        response text and ``parsed_json`` is ``json.loads(raw_text)``.

    Raises:
        RuntimeError: If the API key is missing (from ``_load_gemini_api_key``).
        ValueError: If the response text is empty or is not valid JSON.
    """
    api_key = _load_gemini_api_key()

    # Actually enforce the caller's timeout. HttpOptions.timeout is expressed in
    # milliseconds, so convert from seconds.
    http_options = (
        genai.types.HttpOptions(timeout=int(timeout_s * 1000)) if timeout_s else None
    )
    client = genai.Client(api_key=api_key, http_options=http_options)

    effective_model_name = model_name or os.getenv("GEMINI_MODEL", "gemini-2.5-flash")

    generation_config = genai.types.GenerateContentConfig(
        temperature=temperature,
        response_mime_type="application/json",
    )
    # Optional knobs are only set when provided so the SDK defaults apply otherwise.
    if max_output_tokens is not None:
        generation_config.max_output_tokens = max_output_tokens
    if schema is not None:
        generation_config.response_schema = schema

    resp = client.models.generate_content(
        model=effective_model_name,
        contents=prompt,
        config=generation_config,
    )
    raw = (resp.text or "").strip()

    if not raw:
        # This often happens when the request is blocked or returns no text.
        # Surface whatever block/finish metadata the response carries; getattr
        # keeps this tolerant of response objects that lack these fields.
        candidates = getattr(resp, "candidates", None) or []
        finish_reason = getattr(candidates[0], "finish_reason", None) if candidates else None
        prompt_feedback = getattr(resp, "prompt_feedback", None)
        raise ValueError(
            "Model returned an empty response in JSON mode. "
            f"model={effective_model_name!r}. "
            f"finish_reason={finish_reason!r}, prompt_feedback={prompt_feedback!r}. "
            "Check for safety blocks, timeouts, or SDK/model compatibility."
        )

    try:
        return json.loads(raw), raw
    except json.JSONDecodeError as e:
        # Cap the echoed output so a runaway response does not flood the error message.
        excerpt = raw[:4000]
        raise ValueError(
            "Model did not return valid JSON (despite JSON mode). "
            f"model={effective_model_name!r}. "
            "Raw model output (truncated):\n" + excerpt
        ) from e

In [None]:
# Call for properties generation
# Defaults are assigned first so the prints below work even if the call fails.
properties_json = None
properties_raw = ""
properties_error = None

try:
    properties_json, properties_raw = call_gemini_json(
        PROMPT,
        model_name=MODEL_NAME,
        temperature=TEMPERATURE,
        max_output_tokens=MAX_OUTPUT_TOKENS,
        timeout_s=TIMEOUT_SEC,
        schema=PROPERTIES_RESPONSE_SCHEMA,
    )
except Exception as e:
    # Deliberately broad: any failure is kept as "<ExceptionType>: <message>" text
    # and reported below instead of aborting the cell.
    properties_error = f"{type(e).__name__}: {e}"

# Echo at most the first 2000 characters of the raw reply; "..." marks truncation.
print("Raw model output (truncated):")
print(properties_raw[:2000] + ("..." if len(properties_raw) > 2000 else ""))
print("\nParsed JSON:")
pprint(properties_json)
if properties_error:
    print("\nERROR:")
    print(properties_error)

Raw model output (truncated):
{
  "result": [
    "For any integer N such that 1 <= N <= 3999, the result of converting N to its Roman numeral string and then converting that string back to an integer (via an assumed 'from_numerals' function) must be equal to N.",
    "For any integer N such that 1 <= N <= 3999, the function 'to_numerals(N)' must return a value of type string.",
    "For any integer N such that 1 <= N <= 3999, the string returned by 'to_numerals(N)' must not be empty.",
    "For any integer N such that 1 <= N <= 3999, every character in the string returned by 'to_numerals(N)' must be one of 'I', 'V', 'X', 'L', 'C', 'D', or 'M'.",
    "For any integer N such that N < 1, calling 'to_numerals(N)' must raise a ValueError.",
    "For any integer N such that N > 3999, calling 'to_numerals(N)' must raise a ValueError.",
    "For any input N that is not an integer type (e.g., float, string, boolean, None), calling 'to_numerals(N)' must raise a TypeError.",
    "For any integer

In [None]:
# Call for signature + strategy generation
from pprint import pprint

# Defaults are assigned first so the prints below work even if the call fails.
sig_strat_json = None
sig_strat_raw = ""
sig_strat_error = None

try:
    # No response schema is passed here; only JSON mode constrains the reply.
    sig_strat_json, sig_strat_raw = call_gemini_json(
        SIGNATURE_STRATEGY_PROMPT,
        model_name=MODEL_NAME,
        temperature=TEMPERATURE,
        max_output_tokens=MAX_OUTPUT_TOKENS,
        timeout_s=TIMEOUT_SEC,
    )
except Exception as e:
    # Deliberately broad: any failure is kept as "<ExceptionType>: <message>" text
    # and reported below instead of aborting the cell.
    sig_strat_error = f"{type(e).__name__}: {e}"

# Echo at most the first 2000 characters of the raw reply; "..." marks truncation.
print("Raw model output (truncated):")
print(sig_strat_raw[:2000] + ("..." if len(sig_strat_raw) > 2000 else ""))
print("\nParsed JSON:")
pprint(sig_strat_json)
if sig_strat_error:
    print("\nERROR:")
    print(sig_strat_error)

Raw model output (truncated):
{
  "result": {
    "function_under_test": {
      "name": "to_numerals",
      "args": ["N"],
      "returns": "str"
    },
    "property_function": {
      "name": "prop_to_numerals_round_trip",
      "typing": "def prop_to_numerals_round_trip(to_numerals_func: Callable[[int], str], N: int) -> bool:",
      "args": ["to_numerals_func", "N"],
      "returns": "bool"
    },
    "strategies": [
      {
        "arg": "N",
        "strategy": "st.integers(min_value=1, max_value=3999)",
        "notes": "Generates integers within the valid Roman numeral range [1, 3999] as specified by the problem's numeric domain."
      }
    ],
    "assumptions": [],
    "imports": [
      "from typing import Callable",
      "import hypothesis.strategies as st",
      "from your_module import from_numerals # This function is required to check the round-trip property."
    ]
  }
}

Parsed JSON:
{'result': {'assumptions': [],
            'function_under_test': {'args': ['N']

In [45]:
# Write run log to logs/<problem>_<timestamp>.json
from __future__ import annotations

import json
from typing import Any

def _jsonable(obj: Any) -> Any:
    """Best-effort conversion to JSON-serializable types.

    Args:
        obj: Any Python value.

    Returns:
        ``obj`` unchanged when ``json.dumps(obj)`` succeeds, otherwise
        ``repr(obj)`` so the value can still be written to the log.
    """
    try:
        json.dumps(obj)
    except (TypeError, ValueError):
        # TypeError: a type json cannot encode. ValueError: json.dumps refuses
        # circular references. Either way, fall back to the textual repr.
        return repr(obj)
    return obj

# Everything needed to reproduce or audit this run: settings, exact prompts, and model replies.
run_log = {
    "timestamp_utc": RUN_TIMESTAMP,
    "problem": PROBLEM,
    "settings": {
        "FREEFORM_SPEC": FREEFORM_SPEC,
        "MODEL_NAME": MODEL_NAME,
        "TEMPERATURE": TEMPERATURE,
        "MAX_OUTPUT_TOKENS": MAX_OUTPUT_TOKENS,
        "TIMEOUT_SEC": TIMEOUT_SEC,
        # Interpreter and OS details recorded alongside the settings.
        "python_version": sys.version,
        "platform": platform.platform(),
    },
    "prompts": {
        "properties_prompt": PROMPT,
        "signature_strategy_prompt": SIGNATURE_STRATEGY_PROMPT,
    },
    "responses": {
        # For each call: raw reply text, parsed JSON (passed through _jsonable), and error text if any.
        "properties": {
            "raw": properties_raw,
            "parsed": _jsonable(properties_json),
            "error": properties_error,
        },
        "signature_strategies": {
            "raw": sig_strat_raw,
            "parsed": _jsonable(sig_strat_json),
            "error": sig_strat_error,
        },
    },
}

# ensure_ascii=False keeps non-ASCII characters readable in the file; it is written as UTF-8.
RUN_LOG_PATH.write_text(json.dumps(run_log, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
print(f"Wrote run log: {RUN_LOG_PATH}")

Wrote run log: logs\roman_numerals_20260115_022504Z.json
