In [2]:
import os
import re
import pathlib
from datasets import load_dataset
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
MODEL = "gemini-2.5-flash"
# OUT_DIR = pathlib.Path("generated_tests")
# OUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
def extract_docstring(prompt_text: str) -> list:
    """
    Extract all triple-quoted docstrings from the input text.
    Returns a list of docstring contents as strings.
    """
    matches = re.findall(r'("""|\'\'\')(.*?)(\1)', prompt_text, flags=re.DOTALL)
    # Join all docstrings into a single string (if needed)
    doc_text = "\n\n".join([m[1].strip() for m in matches])

    return doc_text


In [5]:
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Please set GOOGLE_API_KEY in your environment.")
genai.configure(api_key=api_key)
model = genai.GenerativeModel(MODEL)

ds = load_dataset("openai/openai_humaneval")["test"]  # 164 items
print(f"Loaded HumanEval with {len(ds)} problems.")

Loaded HumanEval with 164 problems.


In [None]:
for idx, item in enumerate(ds):
    doc = extract_docstring(item["prompt"])
    prompt = f'"""\n{doc}\n"""\nPlease generate 10 test cases in Python\'s standard unittest format for this problem. Please ONLY generate test cases, assume the function exist.'
    if(idx == 32):
        print(prompt)
    # uncomment to run generation
    # try:
    #     resp = model.generate_content(prompt)
    #     text = (resp.text or "").strip()
    #     # If fences sneak in, strip them.
    #     text = re.sub(r"^```(?:python)?\s*", "", text)
    #     text = re.sub(r"\s*```$", "", text)

    #     out_path = OUT_DIR / f"HumanEval_{idx}.py"
    #     out_path.write_text(text, encoding="utf-8")
    # except Exception as e:
    #     print(f"[WARN] Problem {idx} failed: {e}")
        
print("Generation Complete")

"""
Evaluates polynomial with coefficients xs at point x.
    return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n

xs are coefficients of a polynomial.
    find_zero find x such that poly(x) = 0.
    find_zero returns only only zero point, even if there are many.
    Moreover, find_zero only takes list xs having even number of coefficients
    and largest non zero coefficient as it guarantees
    a solution.
    >>> round(find_zero([1, 2]), 2) # f(x) = 1 + 2x
    -0.5
    >>> round(find_zero([-6, 11, -6, 1]), 2) # (x - 1) * (x - 2) * (x - 3) = -6 + 11x - 6x^2 + x^3
    1.0
"""
Please generate 10 test cases in Python's standard unittest format for this problem. Please ONLY generate test cases, assume the function exist.
Generation Complete


In [25]:
import re
import pathlib
import textwrap
from datasets import Dataset, DatasetDict

# === CONFIG ===
OUT_DIR = pathlib.Path("canonical_solutions")
OUT_DIR.mkdir(parents=True, exist_ok=True)
INCLUDE_IMPORTS = True  # set False to drop import lines and keep only `def ...:` + body

def extract_signature_from_prompt(prompt: str, include_imports: bool = True) -> str:
    """
    From HumanEval `prompt`, remove the triple-quoted docstring and 'pass',
    then return the remaining code (imports + def line).
    """
    # remove first triple-quoted docstring block ("""...""" or '''...''')
    code_wo_doc = re.sub(r'("""|\'\'\')(.*?)(\1)', '', prompt, flags=re.DOTALL)
    # remove any bare 'pass' lines
    code_wo_doc = re.sub(r'^[ \t]*pass[ \t]*\r?\n?', '', code_wo_doc, flags=re.MULTILINE)
    # normalize whitespace
    lines = [ln.rstrip() for ln in code_wo_doc.strip().splitlines() if ln.strip()]

    if not lines:
        raise ValueError("No signature content found in prompt.")

    # Find the first def line
    def_idx = next((i for i, ln in enumerate(lines) if ln.lstrip().startswith("def ")), None)
    if def_idx is None:
        # fallback: sometimes there's a blank before def; just join everything
        joined = "\n".join(lines) + ("\n" if not lines[-1].endswith("\n") else "")
        return joined

    if include_imports:
        kept = lines[:def_idx+1]  # imports (if any) + the def line
    else:
        kept = [lines[def_idx]]   # only the def line

    sig = "\n".join(kept)
    if not sig.endswith("\n"):
        sig += "\n"
    return sig

def assemble_module(signature_code: str, body_code: str) -> str:
    """
    Indent the canonical_solution body under the def line.
    Handles cases where body is already (or not) indented.
    """
    body = textwrap.dedent(body_code.rstrip("\n")) + "\n"
    body_indented = textwrap.indent(body, "    ")
    return signature_code + body_indented

def get_split(ds_any) -> Dataset:
    """
    Accept either:
      - a DatasetDict with key 'test'
      - a Dataset that is already the split
    """
    if isinstance(ds_any, DatasetDict):
        return ds_any["test"]
    if isinstance(ds_any, Dataset):
        return ds_any
    raise TypeError("Provide a Hugging Face Dataset or DatasetDict (with 'test').")

try:
    split = get_split(ds)       # if your variable is named ds
except NameError:
    ds = load_dataset("openai/openai_humaneval")["test"]  # 164 items
    split = get_split(ds) 

print(f"Writing {len(split)} canonical solutions...")

for i, item in enumerate(split):
    task_id = item.get("task_id", f"HumanEval/{i}")
    entry_point = item.get("entry_point", f"task_{i}")
    prompt = item["prompt"]
    body = item["canonical_solution"]

    # 1) signature
    signature = extract_signature_from_prompt(prompt, include_imports=INCLUDE_IMPORTS)

    # 2) final module
    final_code = assemble_module(signature, body)

    # 3) filename like 000_entrypoint.py
    idx_str = str(i).zfill(3)
    out_path = OUT_DIR / f"{idx_str}_{entry_point}.py"
    out_path.write_text(final_code, encoding="utf-8")

print("Done")


Writing 164 canonical solutions...
Done


### -------------------------------------------------------------------------------

In [115]:
import pandas as pd
import subprocess
import sys
import os
import tempfile
import re
import glob

def extract_test_class_name(test_code):
    """extract class information from the test file"""
    class_match = re.search(r'class\s+(\w+)\(unittest\.TestCase\)', test_code)
    if class_match:
        return class_match.group(1)
    else:
        # if can not extract, try other way
        class_match = re.search(r'class\s+(\w+)\s*\(.*TestCase.*\)', test_code)
        if class_match:
            return class_match.group(1)
    return None

def run_single_test(solution_file, test_file):
    """run single test instance"""
    
    try:
        # read solution file
        with open(solution_file, 'r', encoding='utf-8') as f:
            solution_code = f.read()
        
        # read test file
        with open(test_file, 'r', encoding='utf-8') as f:
            test_code = f.read()
        
        # extract test class name
        test_class_name = extract_test_class_name(test_code)
        if not test_class_name:
            print(f"  warning: can not extract class name from {os.path.basename(test_file)}")
            return 0
        
        # creat temp file，merge solution and test
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as temp_file:
            # write solution code
            temp_file.write(solution_code + "\n\n")
            
            # write test code
            temp_file.write(test_code + "\n\n")
            
            # add code to excute test
            temp_file.write(f"""
import sys

if __name__ == '__main__':
    import unittest
    try:
        suite = unittest.TestLoader().loadTestsFromTestCase({test_class_name})
        runner = unittest.TextTestRunner(verbosity=0, stream=sys.stdout)
        result = runner.run(suite)
        
        # output statistic
        print(f"TESTS_RUN:{{result.testsRun}}")
        print(f"FAILURES:{{len(result.failures)}}")
        print(f"ERRORS:{{len(result.errors)}}")
        print(f"PASSED:{{result.testsRun - len(result.failures) - len(result.errors)}}")
    except Exception as e:
        print(f"ERROR running tests: {{e}}")
        print("TESTS_RUN:0")
        print("FAILURES:0")
        print("ERRORS:1")
        print("PASSED:0")
""")
            temp_file_path = temp_file.name
        
        # run test
        result = subprocess.run(
            [sys.executable, temp_file_path],
            capture_output=True,
            text=True,
            timeout=30  # 30s timeout
        )
        
        # parse output
        output = result.stdout + result.stderr
        
        # extract pass number from output
        passed_match = re.search(r'PASSED:(\d+)', output)
        tests_run_match = re.search(r'TESTS_RUN:(\d+)', output)
        
        if passed_match and tests_run_match:
            passed_tests = int(passed_match.group(1))
            tests_run = int(tests_run_match.group(1))
            
            return passed_tests
        else:
            # if can't parse the output
            if "OK" in output and ("FAILED" not in output or "FAILURES: 0" in output):
                return 10
            else:
                # count fail and error
                failures = len(re.findall(r'FAIL:', output)) + len(re.findall(r'ERROR:', output))
                errors = len(re.findall(r'Traceback', output))
                return max(0, 10 - failures - errors)
                
    except subprocess.TimeoutExpired:
        print(f"  Timeout: {os.path.basename(test_file)}")
        return 0
    except Exception as e:
        print(f"  Error in running test {os.path.basename(test_file)}: {e}")
        return 0
    finally:
        # remove temp file
        try:
            if 'temp_file_path' in locals():
                os.unlink(temp_file_path)
        except:
            pass

def calculate_validity_rate_all_tests():
    """Calculate Validity Rate for all files"""
    
    test_dir = "C:\\Users\\zhang\\Downloads\\Evaluating_and_Enhancing_LLM_Generated_Test_Suites\\generated_tests"
    solution_dir = "C:\\Users\\zhang\\Downloads\\Evaluating_and_Enhancing_LLM_Generated_Test_Suites\\canonical_solutions"
    
    # get file list with proper ordering
    test_files = sorted([f for f in os.listdir(test_dir) if f.startswith("HumanEval_") and f.endswith(".py")])
    solution_files = sorted([f for f in os.listdir(solution_dir) if f.endswith(".py")])
    
    # make sure the file number is correct
    if len(test_files) != 164 or len(solution_files) != 164:
        print(f"warning: expect 164 files, but find {len(test_files)} test files {len(solution_files)} solution files")
    
    print(f"Found {len(test_files)} test files {len(solution_files)} solution files")
    print("Test Start...\n")
    
    results = []
    
    for i in range(len(test_files)):
        # mapping files based on name
        test_num = f"HumanEval_{i}.py"
        solution_num = f"{i:03d}_*.py"
        
        # find corresponding solution file
        solution_pattern = os.path.join(solution_dir, f"{i:03d}_*.py")
        matching_solutions = glob.glob(solution_pattern)
        
        if test_num in test_files and matching_solutions:
            test_file = test_num
            solution_file = os.path.basename(matching_solutions[0])
        else:
            # if failed mapping by number，map by order
            test_file = test_files[i] if i < len(test_files) else None
            solution_file = solution_files[i] if i < len(solution_files) else None
        
        if not test_file or not solution_file:
            print(f"Skip index {i}: can not file file")
            continue
            
        test_path = os.path.join(test_dir, test_file)
        solution_path = os.path.join(solution_dir, solution_file)
        
        # print(f"Testing: {i+1:3d}/164: {test_file} -> {solution_file}")
        
        # Run test
        passed_tests = run_single_test(solution_path, test_path)
        validity_rate = passed_tests / 10.0
        
        results.append({
            'test_id': i,
            'test_file': test_file,
            'solution_file': solution_file,
            'passed_tests': passed_tests,
            'total_tests': 10,
            'validity_rate': validity_rate
        })
        
        # print(f"  Result: {passed_tests}/10 passed, Validity Rate: {validity_rate:.2f}")
    
    # Create DataFrame
    df = pd.DataFrame(results)
    
    # Calculate statistics
    total_passed = df['passed_tests'].sum()
    total_tests = len(df) * 10
    overall_validity = total_passed / total_tests if total_tests > 0 else 0
    
    # print(f"\n" + "="*50)
    # print(f"=== Overall Summary ===")
    # print(f"Number of test files: {len(df)}")
    # print(f"Passed tests: {total_passed}/{total_tests}")
    # print(f"Overall Validity Rate: {overall_validity:.4f} ({overall_validity*100:.2f}%)")
    print(f"Average Validity Rate: {df['validity_rate'].mean():.4f}")
    # print(f"Pass distribution:")
    # print(df['passed_tests'].value_counts().sort_index())
    # print("="*50)
    
    return df

# Run full tests
df_results = calculate_validity_rate_all_tests()

# Display result
df_results

# # show description
# print("\nDescription:")
# print(df_results['validity_rate'].describe())

# save the result to csv
# output_file = 'validity_results_complete.csv'
# df_results.to_csv(output_file, index=False)
# print(f"\nSaved to: {output_file}")


Found 164 test files 164 solution files
Test Start...

Average Validity Rate: 0.8085


Unnamed: 0,test_id,test_file,solution_file,passed_tests,total_tests,validity_rate
0,0,HumanEval_0.py,000_has_close_elements.py,10,10,1.0
1,1,HumanEval_1.py,001_separate_paren_groups.py,9,10,0.9
2,2,HumanEval_2.py,002_truncate_number.py,10,10,1.0
3,3,HumanEval_3.py,003_below_zero.py,10,10,1.0
4,4,HumanEval_4.py,004_mean_absolute_deviation.py,9,10,0.9
...,...,...,...,...,...,...
159,159,HumanEval_159.py,159_eat.py,10,10,1.0
160,160,HumanEval_160.py,160_do_algebra.py,0,10,0.0
161,161,HumanEval_161.py,161_solve.py,10,10,1.0
162,162,HumanEval_162.py,162_string_to_md5.py,0,10,0.0


In [6]:
import builtins
import importlib.util
import io
import json
import pathlib
import re
import sys
import types
import unittest
import warnings

import pandas as pd
from coverage import Coverage
from coverage.exceptions import CoverageWarning
warnings.filterwarnings("ignore", category=CoverageWarning)

CANON_DIR = pathlib.Path("canonical_solutions").resolve()
TESTS_DIR = pathlib.Path("old_generated_tests").resolve()

assert CANON_DIR.is_dir(), "canonical_solutions/ missing"
assert TESTS_DIR.is_dir(), "generated_tests/ missing"

def load_solution_as_module(sol_path: pathlib.Path) -> types.ModuleType:
    """
    Read the canonical solution and execute it into a fresh module.
    IMPORTANT: compile with filename=str(sol_path) so Coverage attributes lines to this file.
    """
    code = sol_path.read_text(encoding="utf-8")
    mod = types.ModuleType("candidate")  # name isn't important for coverage
    compiled = compile(code, filename=str(sol_path), mode="exec")
    exec(compiled, mod.__dict__)
    return mod

def load_test_module(test_path: pathlib.Path, entry_point_func) -> types.ModuleType:
    """
    Load the test file as a module, pre-injecting the target function into its globals.
    Also put the function into builtins so bare calls resolve anywhere.
    """
    # Safety net: make the function visible as a builtin
    setattr(builtins, entry_point_func.__name__, entry_point_func)

    spec = importlib.util.spec_from_file_location("test_generated", str(test_path))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Cannot load tests from {test_path}")
    test_mod = importlib.util.module_from_spec(spec)

    # Inject before executing the test file
    test_mod.__dict__[entry_point_func.__name__] = entry_point_func

    # Execute test module
    sys.modules["test_generated"] = test_mod
    spec.loader.exec_module(test_mod)
    return test_mod

def run_tests_get_suite(test_mod: types.ModuleType) -> unittest.TestSuite:
    """Create a suite from the already-loaded test module."""
    loader = unittest.defaultTestLoader
    return loader.loadTestsFromModule(test_mod)

def coverage_percent_for_file(cov: Coverage, filename: str) -> float:
    """
    Compute percent covered for a specific filename using Coverage API.
    Returns 0.0 if file wasn't measured.
    """
    try:
        data = cov.get_data()
        if not data or filename not in data.measured_files():
            return 0.0, 0, 0
        # analysis2 returns (filename, statements, excluded, missing, partial)
        _, statements, _, missing, _ = cov.analysis2(filename)
        if not statements:
            return 0.0
        covered = len(statements) - len(missing)
        return round(100.0 * covered / len(statements), 2), len(statements), len(missing)
    except CoverageWarning:
        return 0.0, 0, 0

rows = []

# Discover (idx, entry_point, solution_path)
canon = []
for f in sorted(CANON_DIR.glob("*.py")):
    m = re.match(r"^(\d{3})_(.+)\.py$", f.name)
    if m:
        canon.append((int(m.group(1)), m.group(2), f))

for idx, entry_point, sol_path in canon:
    test_path = TESTS_DIR / f"HumanEval_{idx}.py"
    if not test_path.exists():
        print(f"[SKIP] {idx:03d} no test file {test_path.name}")
        continue

    # 1) Load solution module and fetch the function
    sol_mod = load_solution_as_module(sol_path)
    if not hasattr(sol_mod, entry_point):
        print(f"[WARN] {idx:03d} entry point '{entry_point}' not found in {sol_path.name}")
        func = None
    else:
        func = getattr(sol_mod, entry_point)

    # 2) Load tests with function injected
    if func is None:
        coverage_pct = 0.0
    else:
        test_mod = load_test_module(test_path, func)
        suite = run_tests_get_suite(test_mod)

        # 3) Run tests under Coverage API, measuring ONLY the target file via include=[...]

        cov = Coverage(include=[str(sol_path)])

        cov.erase()
        cov.start()
        
        # run tests
        result = unittest.TextTestRunner(stream=io.StringIO(), verbosity=0).run(suite)
        cov.stop()
        cov.save()
        # cov.report()  # initialize coverage

        # 4) Compute percent for this exact file
        coverage_pct, statements, missing = coverage_percent_for_file(cov, str(sol_path))

    # print(f"[{idx:03d}] cov={coverage_pct:.1f}%")
    rows.append({
        "id": idx,
        "entry_point": entry_point,
        "statements": statements,
        "missing": missing,
        "coverage": coverage_pct,
    })
    # if(idx == 2):

    #     break

df_cov = pd.DataFrame(rows).sort_values("id").reset_index(drop=True)
print(f"Average Coverage Rate: {df_cov['coverage'].mean():.2f}%")
df_cov

Average Coverage Rate: 63.41%


Unnamed: 0,id,entry_point,statements,missing,coverage
0,0,has_close_elements,9,2,77.78
1,1,separate_paren_groups,16,2,87.50
2,2,truncate_number,2,1,50.00
3,3,below_zero,8,2,75.00
4,4,mean_absolute_deviation,4,2,50.00
...,...,...,...,...,...
159,159,eat,4,1,75.00
160,160,do_algebra,0,0,0.00
161,161,solve,15,1,93.33
162,162,string_to_md5,3,1,66.67


In [None]:
# df_cov.to_csv("C:\\Users\\zhang\\Downloads\\coverage.csv", index=False)

### ----------------------------------------------------------------------

In [108]:
import subprocess, pathlib, re, pandas as pd

# Adjust paths
CANON_DIR = pathlib.Path("canonical_solutions").resolve()
TESTS_DIR = pathlib.Path("generated_tests").resolve()
RUNNER = "/mnt/c/Users/zhang/Downloads/Evaluating_and_Enhancing_LLM_Generated_Test_Suites/_test_runner.py"
WSL_PY = "/home/zhang/.venvs/humaneval/bin/python"


def to_wsl_path(p: pathlib.Path) -> str:
    s = str(p)
    if s.startswith("/"):  # already WSL style
        return s
    drive = p.drive.rstrip(":").lower()
    tail = s.replace(p.drive, "").replace("\\", "/")
    return f"/mnt/{drive}{tail}"

def wsl_run(cmd: str):
    """Run a shell command in WSL and return result object with stdout/stderr."""
    result =subprocess.run(
        ["wsl", "-d", "Ubuntu", "bash", "-lc", cmd],
        capture_output=True,
        text=True,
        encoding="utf-8"
    )
    # print(result.stdout if result.stdout.strip() else "<empty>")
    return result
    

rows = []

for sol in sorted(CANON_DIR.glob("*.py")):
    
    m = re.match(r"^(\d{3})_(.+)\.py$", sol.name)
    if not m:
        continue
    idx, entry_point = int(m.group(1)), m.group(2)
    test = TESTS_DIR / f"HumanEval_{idx}.py"
    if not test.exists():
        print(f"[SKIP] {idx:03d} missing {test.name}")
        continue

    CANDIDATE_FILE = to_wsl_path(sol)
    TEST_FILE = to_wsl_path(test)
    
    # 1) clear cache
    wsl_run("rm -rf .mutmut-cache")

    # 2) mutmut run
    cmd = (
        f'CANDIDATE_FILE={CANDIDATE_FILE} '
        f'TEST_FILE={TEST_FILE} '
        f'ENTRY_POINT={entry_point} '
        f'PYTHONPATH={to_wsl_path(TESTS_DIR)} '
        f'{WSL_PY} -m mutmut run --paths-to-mutate "{CANDIDATE_FILE}" '
        f'--runner "{WSL_PY} {RUNNER}" --tests-dir .'
    )
    result = wsl_run(cmd)

    # extract last line of stdout
    lines = [ln for ln in result.stdout.splitlines() if ln.strip()]
    last_line = lines[-1] if lines else ""
    m_total = re.search(r"(\d+)/(\d+)", last_line)
    m_killed = re.search(r"🎉\s*(\d+)", last_line)
    if m_total and m_killed:
        total = int(m_total.group(2))   # second number in x/y
        killed = int(m_killed.group(1))
        mutation_score = 100.0 * killed / total if total else 0.0
    else:
        total, killed, mutation_score = 0, 0, 0.0

    # print(f"[{idx:03d}] {entry_point}: total={total}, killed={killed}, score={mutation_score:.1f}")
    rows.append({
        "id": idx,
        "entry_point": entry_point,
        "total_mutants": total,
        "killed_mutants": killed,
        "mutation_score": mutation_score,
    })
    wsl_run("rm -rf .mutmut-cache")
    # if (idx == 1):
    #     print(CANDIDATE_FILE, TEST_FILE)
    #     print(entry_point)
    #     print(to_wsl_path(TESTS_DIR))
    #     break

df_mut = pd.DataFrame(rows).sort_values("id").reset_index(drop=True)
df_mut


Unnamed: 0,id,entry_point,total_mutants,killed_mutants,mutation_score
0,0,has_close_elements,6,6,100.000000
1,1,separate_paren_groups,0,0,0.000000
2,2,truncate_number,2,2,100.000000
3,3,below_zero,8,8,100.000000
4,4,mean_absolute_deviation,0,0,0.000000
...,...,...,...,...,...
159,159,eat,5,4,80.000000
160,160,do_algebra,0,0,0.000000
161,161,solve,19,17,89.473684
162,162,string_to_md5,0,0,0.000000


In [111]:
print(f"Average Mutation Score: {df_mut['mutation_score'].mean():.2f}")

Average Mutation Score: 55.84
