In [1]:
# Evaluation grid configuration: models, prompt sets, and lab runs.
# The three values are combined into the result path
# lab/{lab}/{set}/{model}-{set}.pkl when the evaluation loop runs.

# Model identifiers, grouped by name prefix.
MODELS = [
    # gpt-*
    "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "gpt-4o-mini", "gpt-4o-2024-11-20",
    # deepseek-*
    "deepseek-chat-v3-0324", "deepseek-r1-0528",
    # gemini-*
    "gemini-2.5-flash", "gemini-2.5-pro",
    # llama-*
    "llama-3.3-70b-instruct", "llama-3.1-8b-instruct",
    "llama-3.2-3b-instruct", "llama-3.2-1b-instruct",
    "llama-4-maverick", "llama-4-scout",
    # qwen3-*
    "qwen3-30b-a3b", "qwen3-32b", "qwen3-8b", "qwen3-1.7b",
    # ernie-*
    "ernie-4.5-turbo-128k", "ernie-4.5-0.3b", "ernie-4.5-21b-a3b",
]

# Names of the prompt-set variants (one sub-directory per set).
SET_NAME = ["test", "renamed", "interfered"]

# Names of the lab runs (one top-level directory per lab).
LABS = ["lab01", "lab02", "lab03"]

In [2]:
# Validate LLM answers by running them with the Python interpreter. The runs are asynchronous so that many samples can be checked concurrently.
import asyncio
import os
import re
import sys
import tempfile

def code_wrapper (row):
    """Assemble a runnable test script for one sample.

    Args:
        row: Mapping with the keys "prompt", "response", "test" and
            "entry_point" (all strings).

    Returns:
        The script source: the prompt, the model's code, the test code and
        a final ``check(<entry_point>)`` call, separated by newlines.

    If the response contains a Markdown code fence, only the text between
    the first fence line and the next fence is kept as the model's code;
    otherwise the whole response is used unchanged.
    """
    prompt = row["prompt"]
    code = row["response"]
    test_code = row["test"]
    # A fence marker plus the rest of its line (e.g. "```python").
    fence = r"```.*"

    # re.search, not re.match: the fence is often preceded by explanatory
    # prose, and re.match would only detect a fence at position 0.
    if re.search(fence, code):
        code = re.split(fence, code)[1]

    return f"{prompt}\n{code}\n{test_code}\ncheck({row['entry_point']})"

async def code_runner (executable, timeout=5.00) -> str:
    """Run a Python script in a subprocess and report how it ended.

    Args:
        executable: Source code of the script to run.
        timeout: Seconds to wait for the subprocess before killing it.

    Returns:
        "Pass" if the process exits with return code 0, "Timeout" if it
        does not finish within ``timeout`` seconds, otherwise the decoded
        contents of the process's stderr.
    """
    # delete=False: the file has to outlive this `with` block so the child
    # interpreter can open it by path. UTF-8 is Python's default source
    # encoding, so write the script with it regardless of the locale.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as tmp:
        tmp.write(executable)
        temp_path = tmp.name

    try:
        process_eval = await asyncio.create_subprocess_exec(
            # Use the interpreter running this code rather than whatever
            # "python" resolves to on PATH (which may be absent or differ).
            sys.executable or "python",
            temp_path,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )

        try:
            _, stderr_bytes = await asyncio.wait_for(
                process_eval.communicate(), timeout=timeout
            )
        except asyncio.TimeoutError:
            process_eval.kill()
            await process_eval.wait()
            return "Timeout"

        if process_eval.returncode == 0:
            return "Pass"
        return stderr_bytes.decode("utf-8", errors="ignore")
    finally:
        # Remove the script only after the child has finished (or failed to
        # start); deleting it right after the spawn races the child's open().
        os.remove(temp_path)


In [3]:
import pickle
import pandas as pd
import os
from tqdm.notebook import tqdm, trange
import asyncio

async def eval():
    """Run every stored sample for each (model, set, lab) combination.

    For each combination the pickled outputs are loaded from
    ``lab/{lab}/{set}/{model}-{set}.pkl``, each sample is turned into a
    script with ``code_wrapper`` and executed with ``code_runner``; the
    samples of one file are run concurrently.

    Returns:
        A DataFrame with one row per sample and the columns "model", "set",
        "lab", "sample" (position within the file) and "result" (the status
        string returned by ``code_runner``).

    Note:
        This function's name shadows the builtin ``eval`` in this namespace.
    """
    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat on every iteration copies all earlier rows.
    records = []
    total_files = len(MODELS) * len(SET_NAME) * len(LABS)

    # The context manager closes the progress bar even if a file fails.
    with trange(total_files, desc="Evaluation") as pbar_all:
        for model in MODELS:
            for set_name in SET_NAME:
                for lab in LABS:
                    # NOTE: pickle.load can execute arbitrary code; only load
                    # result files that were produced by a trusted run.
                    with open(f"lab/{lab}/{set_name}/{model}-{set_name}.pkl", "rb") as f:
                        outputs = pickle.load(f)
                    # 164 is presumably the benchmark's problem count -- confirm.
                    assert len(outputs) == 164, (
                        f"expected 164 samples for {model}/{set_name}/{lab}, got {len(outputs)}"
                    )

                    # NOTE(review): all samples of a file are spawned at once;
                    # under heavy load the per-sample timeout may fire early.
                    statuses = await asyncio.gather(
                        *(code_runner(code_wrapper(sample)) for sample in outputs)
                    )
                    records.extend(
                        {
                            "model": model,
                            "set": set_name,
                            "lab": lab,
                            "sample": i,
                            "result": status,
                        }
                        for i, status in enumerate(statuses)
                    )
                    pbar_all.update(1)

    return pd.DataFrame(records)

# NOTE(review): nest_asyncio is presumably applied so that asyncio code can run
# inside the notebook's already-running event loop -- confirm it is still
# required, since the cell below uses top-level `await` directly.
import nest_asyncio
nest_asyncio.apply()

# Run the full evaluation and keep the per-sample results in `df`.
df = await eval()

Evaluation:   0%|          | 0/198 [00:00<?, ?it/s]

In [4]:

# Save the per-sample evaluation results to CSV (without the index column),
# then display the frame as the cell output.
df.to_csv("lab/code_check.csv", index=False)
df

Unnamed: 0,model,set,lab,sample,result
0,gpt-4.1,test,lab01,0,Pass
1,gpt-4.1,test,lab01,1,Pass
2,gpt-4.1,test,lab01,2,Pass
3,gpt-4.1,test,lab01,3,Pass
4,gpt-4.1,test,lab01,4,Pass
...,...,...,...,...,...
32467,ernie-4.5-21b-a3b,interfered,lab03,159,"File ""/var/folders/3f/dx390skx54s51c7kcqjp8g..."
32468,ernie-4.5-21b-a3b,interfered,lab03,160,"File ""/var/folders/3f/dx390skx54s51c7kcqjp8g..."
32469,ernie-4.5-21b-a3b,interfered,lab03,161,"File ""/var/folders/3f/dx390skx54s51c7kcqjp8g..."
32470,ernie-4.5-21b-a3b,interfered,lab03,162,"File ""/var/folders/3f/dx390skx54s51c7kcqjp8g..."
