In [97]:
import os
import re
import pathlib
from datasets import load_dataset
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [98]:
MODEL = "gemini-2.5-flash"
OUT_DIR = pathlib.Path("generated_tests")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [120]:
def extract_docstring(prompt_text: str) -> list:
    """
    Extract all triple-quoted docstrings from the input text.
    Returns a list of docstring contents as strings.
    """
    matches = re.findall(r'("""|\'\'\')(.*?)(\1)', prompt_text, flags=re.DOTALL)
    # Join all docstrings into a single string (if needed)
    doc_text = "\n\n".join([m[1].strip() for m in matches])

    return doc_text


In [100]:
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Please set GOOGLE_API_KEY in your environment.")
genai.configure(api_key=api_key)
model = genai.GenerativeModel(MODEL)

ds = load_dataset("openai/openai_humaneval")["test"]  # 164 items
print(f"Loaded HumanEval with {len(ds)} problems.")

Loaded HumanEval with 164 problems.


In [None]:
for idx, item in enumerate(ds):
    doc = extract_docstring(item["prompt"])
    prompt = f'"""\n{doc}\n"""\nPlease generate 10 test cases in Python\'s standard unittest format for this problem. Please ONLY generate test cases, assume the function exist.'
    if(idx == 32):
        print(prompt)
    # try:
    #     resp = model.generate_content(prompt)
    #     text = (resp.text or "").strip()
    #     # If fences sneak in, strip them.
    #     text = re.sub(r"^```(?:python)?\s*", "", text)
    #     text = re.sub(r"\s*```$", "", text)

    #     out_path = OUT_DIR / f"HumanEval_{idx}.py"
    #     out_path.write_text(text, encoding="utf-8")
    # except Exception as e:
    #     print(f"[WARN] Problem {idx} failed: {e}")
        
print("Generation Complete")

In [25]:
import re
import pathlib
import textwrap
from datasets import Dataset, DatasetDict

# === CONFIG ===
OUT_DIR = pathlib.Path("canonical_solutions")
OUT_DIR.mkdir(parents=True, exist_ok=True)
INCLUDE_IMPORTS = True  # set False to drop import lines and keep only `def ...:` + body

def extract_signature_from_prompt(prompt: str, include_imports: bool = True) -> str:
    """
    From HumanEval `prompt`, remove the triple-quoted docstring and 'pass',
    then return the remaining code (imports + def line).
    """
    # remove first triple-quoted docstring block ("""...""" or '''...''')
    code_wo_doc = re.sub(r'("""|\'\'\')(.*?)(\1)', '', prompt, flags=re.DOTALL)
    # remove any bare 'pass' lines
    code_wo_doc = re.sub(r'^[ \t]*pass[ \t]*\r?\n?', '', code_wo_doc, flags=re.MULTILINE)
    # normalize whitespace
    lines = [ln.rstrip() for ln in code_wo_doc.strip().splitlines() if ln.strip()]

    if not lines:
        raise ValueError("No signature content found in prompt.")

    # Find the first def line
    def_idx = next((i for i, ln in enumerate(lines) if ln.lstrip().startswith("def ")), None)
    if def_idx is None:
        # fallback: sometimes there's a blank before def; just join everything
        joined = "\n".join(lines) + ("\n" if not lines[-1].endswith("\n") else "")
        return joined

    if include_imports:
        kept = lines[:def_idx+1]  # imports (if any) + the def line
    else:
        kept = [lines[def_idx]]   # only the def line

    sig = "\n".join(kept)
    if not sig.endswith("\n"):
        sig += "\n"
    return sig

def assemble_module(signature_code: str, body_code: str) -> str:
    """
    Indent the canonical_solution body under the def line.
    Handles cases where body is already (or not) indented.
    """
    body = textwrap.dedent(body_code.rstrip("\n")) + "\n"
    body_indented = textwrap.indent(body, "    ")
    return signature_code + body_indented

def get_split(ds_any) -> Dataset:
    """
    Accept either:
      - a DatasetDict with key 'test'
      - a Dataset that is already the split
    """
    if isinstance(ds_any, DatasetDict):
        return ds_any["test"]
    if isinstance(ds_any, Dataset):
        return ds_any
    raise TypeError("Provide a Hugging Face Dataset or DatasetDict (with 'test').")

try:
    split = get_split(ds)       # if your variable is named ds
except NameError:
    ds = load_dataset("openai/openai_humaneval")["test"]  # 164 items
    split = get_split(ds) 

print(f"Writing {len(split)} canonical solutions...")

for i, item in enumerate(split):
    task_id = item.get("task_id", f"HumanEval/{i}")
    entry_point = item.get("entry_point", f"task_{i}")
    prompt = item["prompt"]
    body = item["canonical_solution"]

    # 1) signature
    signature = extract_signature_from_prompt(prompt, include_imports=INCLUDE_IMPORTS)

    # 2) final module
    final_code = assemble_module(signature, body)

    # 3) filename like 000_entrypoint.py
    idx_str = str(i).zfill(3)
    out_path = OUT_DIR / f"{idx_str}_{entry_point}.py"
    out_path.write_text(final_code, encoding="utf-8")

print("Done")


Writing 164 canonical solutions...
Done


In [81]:
# import os, sys, re, json, subprocess, pathlib, textwrap
# import pandas as pd

# CANON_DIR = pathlib.Path("canonical_solutions").resolve()
# TESTS_DIR = pathlib.Path("generated_tests").resolve()
# assert CANON_DIR.is_dir(), f"Missing {CANON_DIR}"
# assert TESTS_DIR.is_dir(), f"Missing {TESTS_DIR}"

# # --------------------------------------------------------------------
# # 1) Create a single tiny runner (only once)
# #    This runner:
# #      - reads env vars: CANDIDATE_FILE, TEST_FILE
# #      - loads the canonical solution as module "candidate"
# #      - runs unittest on TEST_FILE
# #      - prints a single line of JSON with {"testsRun", "failures", "errors", "passed"}
# # --------------------------------------------------------------------
# runner_path = pathlib.Path("_test_runner.py").resolve()
# if not runner_path.exists():
#     runner_code = r'''
# # _test_runner.py
# import os, sys, types, json, unittest, pathlib, builtins, importlib.util

# CANDIDATE_FILE = os.environ.get("CANDIDATE_FILE")
# TEST_FILE = os.environ.get("TEST_FILE")
# ENTRY_POINT = os.environ.get("ENTRY_POINT")

# def die(msg, code=2):
#     print(json.dumps({"error": msg}))
#     sys.exit(code)

# if not CANDIDATE_FILE or not TEST_FILE:
#     die("missing env CANDIDATE_FILE/TEST_FILE")

# # 1) Load canonical solution module as "candidate"
# sol_path = pathlib.Path(CANDIDATE_FILE)
# code = sol_path.read_text(encoding="utf-8")
# candidate_mod = types.ModuleType("candidate")
# exec(compile(code, str(sol_path), "exec"), candidate_mod.__dict__)
# sys.modules["candidate"] = candidate_mod

# # 2) Put symbol in builtins (safety net)
# if ENTRY_POINT and hasattr(candidate_mod, ENTRY_POINT):
#     setattr(builtins, ENTRY_POINT, getattr(candidate_mod, ENTRY_POINT))

# # 3) Load the specific test file as a module so we can inject into its globals
# test_path = pathlib.Path(TEST_FILE)
# spec = importlib.util.spec_from_file_location("test_generated", str(test_path))
# if spec is None or spec.loader is None:
#     die(f"cannot load {TEST_FILE}")
# test_mod = importlib.util.module_from_spec(spec)

# # Inject the entry point into the test module globals BEFORE executing it
# if ENTRY_POINT and hasattr(candidate_mod, ENTRY_POINT):
#     test_mod.__dict__[ENTRY_POINT] = getattr(candidate_mod, ENTRY_POINT)

# sys.modules["test_generated"] = test_mod
# spec.loader.exec_module(test_mod)  # executes the test file

# # 4) Build suite from that module and run
# suite = unittest.defaultTestLoader.loadTestsFromModule(test_mod)
# result = unittest.TextTestRunner(verbosity=0).run(suite)

# out = {
#     "testsRun": result.testsRun,
#     "failures": len(result.failures),
#     "errors": len(result.errors),
#     "passed": result.testsRun - len(result.failures) - len(result.errors),
# }
# print(json.dumps(out))
# # exit code 0 even on failures; coverage/mutmut decide what they need
# '''
#     runner_path.write_text(textwrap.dedent(runner_code), encoding="utf-8")
#     print(f"Created {runner_path.name}")

# def run_cmd(cmd, cwd=None, env=None):
#     p = subprocess.Popen(cmd, cwd=cwd, env=env,
#                          stdout=subprocess.PIPE, stderr=subprocess.PIPE,
#                          text=True)
#     out, err = p.communicate()
#     return p.returncode, out, err

# # Build an index from your canonical_solutions files: 000_xxx.py -> (idx, entry_point, path)
# canon_index = {}
# for f in sorted(CANON_DIR.glob("*.py")):
#     m = re.match(r"^(\d{3})_(.+)\.py$", f.name)
#     if not m:
#         continue
#     idx = int(m.group(1))
#     entry = m.group(2)
#     canon_index[idx] = (entry, f)

# rows = []

# for idx, (entry_point, sol_path) in canon_index.items():
#     test_file = TESTS_DIR / f"HumanEval_{idx}.py"
#     if not test_file.exists():
#         print(f"[SKIP] No test for {idx}: {test_file.name}")
#         continue

#     # -----------------------
#     # A) RUN UNDER COVERAGE
#     # -----------------------
#     env = os.environ.copy()
#     env["PYTHONPATH"] = str(TESTS_DIR) + os.pathsep + env.get("PYTHONPATH", "")
#     env["CANDIDATE_FILE"] = str(sol_path)
#     env["TEST_FILE"] = str(test_file)
#     env["ENTRY_POINT"] = entry_point

#     # 1) run tests with coverage
#     rc, out, err = run_cmd(
#         ["coverage", "run", "--source", str(CANON_DIR), str(runner_path)],
#         env=env
#     )
#     # parse stdout JSON from runner for pass counts
#     passed = 0
#     total = 10
#     try:
#         last_line = [ln for ln in out.splitlines() if ln.strip()][-1]
#         stats = json.loads(last_line)
#         passed = int(stats.get("passed", 0))
#         total = int(stats.get("testsRun", 10))
#     except Exception:
#         print(f"[WARN] could not parse test stats for {idx}. stdout:\n{out}\nstderr:\n{err}")

#     validity_rate = passed / total if total else 0.0

#     # 2) get coverage JSON once we have a .coverage file
#     cov_json_path = f".cov_{idx:03d}.json"
#     rc, out_cov, err_cov = run_cmd(["coverage", "json", "-o", cov_json_path], env=env)
#     coverage_pct = 0.0
#     if rc == 0 and pathlib.Path(cov_json_path).exists():
#         cov = json.loads(pathlib.Path(cov_json_path).read_text(encoding="utf-8"))
#         files = cov.get("files", {})
#         key = str(sol_path)
#         if key in files:
#             coverage_pct = float(files[key]["summary"]["percent_covered"])
#         else:
#             # fallback by basename
#             for k, v in files.items():
#                 if pathlib.Path(k).name == sol_path.name:
#                     coverage_pct = float(v["summary"]["percent_covered"])
#                     break
#         # clean up the per-idx json to keep new files to a minimum
#         try:
#             pathlib.Path(cov_json_path).unlink()
#         except Exception:
#             pass
#     else:
#         print(f"[WARN] coverage json failed for {idx}: {err_cov or out_cov}")

#     # -----------------------
#     # B) MUTATION TESTING
#     # -----------------------
#     # mutmut will spawn the test runner many times; we provide the same env so
#     # _test_runner.py knows which solution and test to use.
#     mut_env = env.copy()
#     # Important: keep PYTHONPATH so _test_runner.py is importable as a script
#     # We'll use the script path, not -m, to avoid path headaches.
#     # Limit mutations to THIS single solution file.
#     rc, out_run, err_run = run_cmd(
#         [
#             "mutmut", "run",
#             "--paths-to-mutate", str(sol_path),
#             "--runner", f"python {runner_path}",
#             "--tests-dir", ".",   # runner ignores this and uses TEST_FILE
#             "--no-progress",
#             "--use-coverage"      # speeds up by using coverage cache
#         ],
#         env=mut_env
#     )
#     rc, out_res, err_res = run_cmd(["mutmut", "results"], env=mut_env)

#     killed = survived = 0
#     for line in (out_res + "\n" + err_res).splitlines():
#         m = re.search(r"\bKilled\s*\((\d+)\)", line)
#         if m: killed = int(m.group(1))
#         m = re.search(r"\bSurvived\s*\((\d+)\)", line)
#         if m: survived = int(m.group(1))
#     total_mutants = killed + survived
#     mutation_score = (100.0 * killed / total_mutants) if total_mutants else 0.0

#     rows.append({
#         "id": idx,
#         "entry_point": entry_point,
#         "passed": passed,
#         "validity_rate": validity_rate,
#         "coverage": coverage_pct,
#         "mutation_score": mutation_score,
#     })
#     print(f"[{idx:03d}] passed={passed}/{total}  cov={coverage_pct:.1f}%  mut={mutation_score:.1f}")

#     if(idx == 10):
#         break

# # Final table
# df = pd.DataFrame(rows).sort_values("id").reset_index(drop=True)
# df


### -------------------------------------------------------------------------------

In [127]:
import pandas as pd
import subprocess
import sys
import os
import tempfile
import re
import glob

def extract_test_class_name(test_code):
    """从测试代码中提取测试类名"""
    class_match = re.search(r'class\s+(\w+)\(unittest\.TestCase\)', test_code)
    if class_match:
        return class_match.group(1)
    else:
        # 如果无法提取，尝试其他模式
        class_match = re.search(r'class\s+(\w+)\s*\(.*TestCase.*\)', test_code)
        if class_match:
            return class_match.group(1)
    return None

def run_single_test(solution_file, test_file):
    """运行单个测试并返回通过数量"""
    
    try:
        # 读取解决方案文件
        with open(solution_file, 'r', encoding='utf-8') as f:
            solution_code = f.read()
        
        # 读取测试文件
        with open(test_file, 'r', encoding='utf-8') as f:
            test_code = f.read()
        
        # 提取测试类名
        test_class_name = extract_test_class_name(test_code)
        if not test_class_name:
            print(f"  警告: 无法从 {os.path.basename(test_file)} 中提取测试类名")
            return 0
        
        # 创建临时文件，合并解决方案和测试代码
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as temp_file:
            # 写入解决方案代码
            temp_file.write(solution_code + "\n\n")
            
            # 写入测试代码
            temp_file.write(test_code + "\n\n")
            
            # 添加测试运行代码
            temp_file.write(f"""
import sys

if __name__ == '__main__':
    import unittest
    try:
        suite = unittest.TestLoader().loadTestsFromTestCase({test_class_name})
        runner = unittest.TextTestRunner(verbosity=0, stream=sys.stdout)
        result = runner.run(suite)
        
        # 输出统计信息
        print(f"TESTS_RUN:{{result.testsRun}}")
        print(f"FAILURES:{{len(result.failures)}}")
        print(f"ERRORS:{{len(result.errors)}}")
        print(f"PASSED:{{result.testsRun - len(result.failures) - len(result.errors)}}")
    except Exception as e:
        print(f"ERROR running tests: {{e}}")
        print("TESTS_RUN:0")
        print("FAILURES:0")
        print("ERRORS:1")
        print("PASSED:0")
""")
            temp_file_path = temp_file.name
        
        # 运行测试
        result = subprocess.run(
            [sys.executable, temp_file_path],
            capture_output=True,
            text=True,
            timeout=30  # 30秒超时
        )
        
        # 解析输出
        output = result.stdout + result.stderr
        
        # 从输出中提取通过的数量
        passed_match = re.search(r'PASSED:(\d+)', output)
        tests_run_match = re.search(r'TESTS_RUN:(\d+)', output)
        
        if passed_match and tests_run_match:
            passed_tests = int(passed_match.group(1))
            tests_run = int(tests_run_match.group(1))
            
            return passed_tests
        else:
            # 如果无法解析输出，使用启发式方法
            if "OK" in output and ("FAILED" not in output or "FAILURES: 0" in output):
                return 10
            else:
                # 统计失败和错误
                failures = len(re.findall(r'FAIL:', output)) + len(re.findall(r'ERROR:', output))
                errors = len(re.findall(r'Traceback', output))
                return max(0, 10 - failures - errors)
                
    except subprocess.TimeoutExpired:
        print(f"  超时: {os.path.basename(test_file)}")
        return 0
    except Exception as e:
        print(f"  错误运行测试 {os.path.basename(test_file)}: {e}")
        return 0
    finally:
        # 清理临时文件
        try:
            if 'temp_file_path' in locals():
                os.unlink(temp_file_path)
        except:
            pass

def calculate_validity_rate_all_tests():
    """计算所有164个测试的Validity Rate"""
    
    test_dir = "C:\\Users\\zhang\\Downloads\\Evaluating_and_Enhancing_LLM_Generated_Test_Suites\\generated_tests"
    solution_dir = "C:\\Users\\zhang\\Downloads\\Evaluating_and_Enhancing_LLM_Generated_Test_Suites\\canonical_solutions"
    
    # 获取文件列表并确保正确排序
    test_files = sorted([f for f in os.listdir(test_dir) if f.startswith("HumanEval_") and f.endswith(".py")])
    solution_files = sorted([f for f in os.listdir(solution_dir) if f.endswith(".py")])
    
    # 确保文件数量正确且对应
    if len(test_files) != 164 or len(solution_files) != 164:
        print(f"警告: 期望164个文件，但找到 {len(test_files)} 个测试文件和 {len(solution_files)} 个解决方案文件")
    
    print(f"找到 {len(test_files)} 个测试文件和 {len(solution_files)} 个解决方案文件")
    print("开始运行测试...\n")
    
    results = []
    
    for i in range(len(test_files)):
        # 根据命名规则匹配文件
        test_num = f"HumanEval_{i}.py"
        solution_num = f"{i:03d}_*.py"
        
        # 查找对应的解决方案文件
        solution_pattern = os.path.join(solution_dir, f"{i:03d}_*.py")
        matching_solutions = glob.glob(solution_pattern)
        
        if test_num in test_files and matching_solutions:
            test_file = test_num
            solution_file = os.path.basename(matching_solutions[0])
        else:
            # 如果按数字匹配失败，按顺序匹配
            test_file = test_files[i] if i < len(test_files) else None
            solution_file = solution_files[i] if i < len(solution_files) else None
        
        if not test_file or not solution_file:
            print(f"跳过索引 {i}: 无法找到对应的文件")
            continue
            
        test_path = os.path.join(test_dir, test_file)
        solution_path = os.path.join(solution_dir, solution_file)
        
        # print(f"测试 {i+1:3d}/164: {test_file} -> {solution_file}")
        
        # 运行测试
        passed_tests = run_single_test(solution_path, test_path)
        validity_rate = passed_tests / 10.0
        
        results.append({
            'test_id': i,
            'test_file': test_file,
            'solution_file': solution_file,
            'passed_tests': passed_tests,
            'total_tests': 10,
            'validity_rate': validity_rate
        })
        
        # print(f"  结果: {passed_tests}/10 通过, Validity Rate: {validity_rate:.2f}")
    
    # 创建DataFrame
    df = pd.DataFrame(results)
    
    # 计算总体统计
    total_passed = df['passed_tests'].sum()
    total_tests = len(df) * 10
    overall_validity = total_passed / total_tests if total_tests > 0 else 0
    
    print(f"\n" + "="*50)
    print(f"=== 总体统计 ===")
    print(f"总测试文件: {len(df)}")
    print(f"总通过测试: {total_passed}/{total_tests}")
    print(f"总体Validity Rate: {overall_validity:.4f} ({overall_validity*100:.2f}%)")
    print(f"平均Validity Rate: {df['validity_rate'].mean():.4f}")
    print(f"通过率分布:")
    print(df['passed_tests'].value_counts().sort_index())
    print("="*50)
    
    return df

# 在Jupyter中运行完整测试
print("开始运行164个测试...")
df_results = calculate_validity_rate_all_tests()

# 显示结果
print("\n前10行结果:")
# display(df_results.head(10))
df_results

# # 显示统计摘要
# print("\n统计摘要:")
# print(df_results['validity_rate'].describe())

# 保存结果
# output_file = 'validity_results_complete.csv'
# df_results.to_csv(output_file, index=False)
# print(f"\n完整结果已保存到: {output_file}")


开始运行164个测试...
找到 164 个测试文件和 164 个解决方案文件
开始运行测试...


=== 总体统计 ===
总测试文件: 164
总通过测试: 1326/1640
总体Validity Rate: 0.8085 (80.85%)
平均Validity Rate: 0.8085
通过率分布:
passed_tests
0      26
5       1
6       2
7       1
8       9
9      21
10    103
11      1
Name: count, dtype: int64

前10行结果:


Unnamed: 0,test_id,test_file,solution_file,passed_tests,total_tests,validity_rate
0,0,HumanEval_0.py,000_has_close_elements.py,10,10,1.0
1,1,HumanEval_1.py,001_separate_paren_groups.py,9,10,0.9
2,2,HumanEval_2.py,002_truncate_number.py,10,10,1.0
3,3,HumanEval_3.py,003_below_zero.py,10,10,1.0
4,4,HumanEval_4.py,004_mean_absolute_deviation.py,9,10,0.9
...,...,...,...,...,...,...
159,159,HumanEval_159.py,159_eat.py,10,10,1.0
160,160,HumanEval_160.py,160_do_algebra.py,0,10,0.0
161,161,HumanEval_161.py,161_solve.py,10,10,1.0
162,162,HumanEval_162.py,162_string_to_md5.py,0,10,0.0


In [126]:
import builtins
import importlib.util
import io
import json
import pathlib
import re
import sys
import types
import unittest
import warnings

import pandas as pd
from coverage import Coverage
from coverage.exceptions import CoverageWarning

CANON_DIR = pathlib.Path("canonical_solutions").resolve()
TESTS_DIR = pathlib.Path("generated_tests").resolve()

assert CANON_DIR.is_dir(), "canonical_solutions/ missing"
assert TESTS_DIR.is_dir(), "generated_tests/ missing"

def load_solution_as_module(sol_path: pathlib.Path) -> types.ModuleType:
    """
    Read the canonical solution and execute it into a fresh module.
    IMPORTANT: compile with filename=str(sol_path) so Coverage attributes lines to this file.
    """
    code = sol_path.read_text(encoding="utf-8")
    mod = types.ModuleType("candidate")  # name isn't important for coverage
    compiled = compile(code, filename=str(sol_path), mode="exec")
    exec(compiled, mod.__dict__)
    return mod

def load_test_module(test_path: pathlib.Path, entry_point_func) -> types.ModuleType:
    """
    Load the test file as a module, pre-injecting the target function into its globals.
    Also put the function into builtins so bare calls resolve anywhere.
    """
    # Safety net: make the function visible as a builtin
    setattr(builtins, entry_point_func.__name__, entry_point_func)

    spec = importlib.util.spec_from_file_location("test_generated", str(test_path))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Cannot load tests from {test_path}")
    test_mod = importlib.util.module_from_spec(spec)

    # Inject before executing the test file
    test_mod.__dict__[entry_point_func.__name__] = entry_point_func

    # Execute test module
    sys.modules["test_generated"] = test_mod
    spec.loader.exec_module(test_mod)
    return test_mod

def run_tests_get_suite(test_mod: types.ModuleType) -> unittest.TestSuite:
    """Create a suite from the already-loaded test module."""
    loader = unittest.defaultTestLoader
    return loader.loadTestsFromModule(test_mod)

def coverage_percent_for_file(cov: Coverage, filename: str) -> float:
    """
    Compute percent covered for a specific filename using Coverage API.
    Returns 0.0 if file wasn't measured.
    """
    try:
        data = cov.get_data()
        if not data or filename not in data.measured_files():
            return 0.0, 0, 0
        # analysis2 returns (filename, statements, excluded, missing, partial)
        _, statements, _, missing, _ = cov.analysis2(filename)
        if not statements:
            return 0.0
        covered = len(statements) - len(missing)
        return round(100.0 * covered / len(statements), 2), len(statements), len(missing)
    except CoverageWarning:
        return 0.0, 0, 0

rows = []

# Discover (idx, entry_point, solution_path)
canon = []
for f in sorted(CANON_DIR.glob("*.py")):
    m = re.match(r"^(\d{3})_(.+)\.py$", f.name)
    if m:
        canon.append((int(m.group(1)), m.group(2), f))

for idx, entry_point, sol_path in canon:
    test_path = TESTS_DIR / f"HumanEval_{idx}.py"
    if not test_path.exists():
        print(f"[SKIP] {idx:03d} no test file {test_path.name}")
        continue

    # 1) Load solution module and fetch the function
    sol_mod = load_solution_as_module(sol_path)
    if not hasattr(sol_mod, entry_point):
        print(f"[WARN] {idx:03d} entry point '{entry_point}' not found in {sol_path.name}")
        func = None
    else:
        func = getattr(sol_mod, entry_point)

    # 2) Load tests with function injected
    if func is None:
        coverage_pct = 0.0
    else:
        test_mod = load_test_module(test_path, func)
        suite = run_tests_get_suite(test_mod)

        # 3) Run tests under Coverage API, measuring ONLY the target file via include=[...]
        cov = Coverage(include=[str(sol_path)])
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=CoverageWarning)

        cov.erase()
        cov.start()
        
        # run tests
        result = unittest.TextTestRunner(stream=io.StringIO(), verbosity=0).run(suite)
        cov.stop()
        cov.save()
        # cov.report()  # initialize coverage

        # 4) Compute percent for this exact file
        coverage_pct, statements, missing = coverage_percent_for_file(cov, str(sol_path))

    # print(f"[{idx:03d}] cov={coverage_pct:.1f}%")
    rows.append({
        "id": idx,
        "entry_point": entry_point,
        "statements": statements,
        "missing": missing,
        "coverage": coverage_pct,
    })
    # if(idx == 2):

    #     break

df_cov = pd.DataFrame(rows).sort_values("id").reset_index(drop=True)
print(f"Average Coverage Rate: {df_cov['coverage'].mean():.2f}%")
df_cov


Average Coverage Rate: 63.41%


Unnamed: 0,id,entry_point,statements,missing,coverage
0,0,has_close_elements,9,2,77.78
1,1,separate_paren_groups,16,2,87.50
2,2,truncate_number,2,1,50.00
3,3,below_zero,8,2,75.00
4,4,mean_absolute_deviation,4,2,50.00
...,...,...,...,...,...
159,159,eat,4,1,75.00
160,160,do_algebra,0,0,0.00
161,161,solve,15,1,93.33
162,162,string_to_md5,3,1,66.67


In [68]:
import coverage
import os

# 最简单的测试版本
def debug_coverage():
    cov = coverage.Coverage()
    cov.start()
    
    # 直接执行一个简单的测试
    test_code = '''
mod = importlib.import_module(
    "canonical_solutions.000_has_close_elements"
)
has_close_elements = mod.has_close_elements

# 简单测试
result = has_close_elements([1.0, 2.0, 3.0], 0.5)
print(f"Test result: {result}")
'''
    
    exec(test_code)
    
    cov.stop()
    cov.save()
    cov.report()
    # cov.html_report()

# 先运行这个调试版本
print("运行调试版本...")
debug_coverage()

运行调试版本...
Test result: False
Name                                            Stmts   Miss  Cover
-------------------------------------------------------------------
canonical_solutions\000_has_close_elements.py       9      3    67%
-------------------------------------------------------------------
TOTAL                                               9      3    67%
