In [None]:
!pip install datasets

In [1]:
from datasets import load_dataset

# Load the "openai/openai_humaneval" dataset; printing the returned object
# lists its splits, feature names and row counts.
human_eval = load_dataset(path="openai/openai_humaneval")

print(human_eval)

DatasetDict({
    test: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
})


In [4]:
# Show the prompt of the first record in the "test" split.
first_task = human_eval["test"][0]
first_task["prompt"]


'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n'

In [7]:
import os
import pickle
import time

from openai import OpenAI

MODELS = ["deepseek/deepseek-chat-v3-0324", "deepseek/deepseek-chat"]
REQUEST_INTERVAL_SECONDS = 5  # pause between requests to avoid rate limiting

# Read the API key from the environment instead of hardcoding it in the notebook.
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENROUTER_API_KEY environment variable before running this cell."
    )

# A single client serves every model: the model is selected per request.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)


def pipe(messages, model=None):
    """Send a chat-completion request and return the raw response object.

    Args:
        messages: List of chat messages, each a dict with "role" and "content".
        model: Model identifier to query. When omitted, the module-level
            MODEL_NAME (set by the loop below) is used.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    return client.chat.completions.create(
        model=MODEL_NAME if model is None else model,
        messages=messages,
    )


for model in MODELS:
    MODEL_NAME = model

    # The model id contains "/", so the checkpoint lands in a sub-directory of
    # "lab"; create it up front so open() below cannot fail on a missing folder.
    output_path = f"lab/{MODEL_NAME}.pkl"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # NOTE: `outputs` is reset per model, so after the loop it holds only the
    # results of the last model; earlier models are available from their pickles.
    outputs = []
    total = len(human_eval["test"])
    for serial, sample in enumerate(human_eval["test"], start=1):
        begin_time = time.perf_counter()

        prompt = sample["prompt"]
        messages = [
            {"role": "user", "content": prompt},
        ]
        response = pipe(messages)
        outputs.append({
            "task_id": sample["task_id"],
            "prompt": prompt,
            'canonical_solution': sample["canonical_solution"],
            'test': sample["test"],
            'entry_point': sample["entry_point"],
            "response": response.choices[0].message.content
        })
        # Checkpoint after every sample so an interrupted run keeps its results.
        with open(output_path, "wb") as f:
            pickle.dump(outputs, f)

        end_time = time.perf_counter()
        # Print current progress; the trailing "\r" rewrites the same line.
        print("Responsed:\t{}\t{}/{}\t {:.3f}s".format(sample["task_id"], serial, total, end_time-begin_time), end="\r")
        time.sleep(REQUEST_INTERVAL_SECONDS)

    print(f"{MODEL_NAME} with HumanEval done.")


Responsed:	HumanEval/163	164/164	 20.969sesponsed:	HumanEval/102	103/164	 16.683sResponsed:	HumanEval/104	105/164	 15.145sResponsed:	HumanEval/106	107/164	 23.376sResponsed:	HumanEval/108	109/164	 210.071sResponsed:	HumanEval/110	111/164	 239.277sResponsed:	HumanEval/112	113/164	 5.819sResponsed:	HumanEval/114	115/164	 178.699sResponsed:	HumanEval/116	117/164	 113.761sResponsed:	HumanEval/118	119/164	 28.604sResponsed:	HumanEval/120	121/164	 203.079sResponsed:	HumanEval/122	123/164	 152.483sResponsed:	HumanEval/124	125/164	 5.057sResponsed:	HumanEval/126	127/164	 14.907sResponsed:	HumanEval/128	129/164	 19.566sResponsed:	HumanEval/130	131/164	 379.805sResponsed:	HumanEval/132	133/164	 2.429sResponsed:	HumanEval/134	135/164	 12.348sResponsed:	HumanEval/136	137/164	 29.812sResponsed:	HumanEval/138	139/164	 105.828sResponsed:	HumanEval/140	141/164	 104.798sResponsed:	HumanEval/142	143/164	 103.621sResponsed:	HumanEval/144	145/164	 109.896sResponsed:	HumanEval/146	147/164	 156.090sRespon

In [8]:
import os
import signal

# Send SIGINT to the process with the given PID.
# NOTE(review): the PID is hardcoded. It presumably referred to a process from
# the original session (confirm); on another machine or after a restart it may
# not exist (os.kill then raises) or may belong to an unrelated process.
pid = 64743
os.kill(pid, signal.SIGINT)

In [None]:
# Preview only the first few records: every entry carries the full prompt,
# test code and model response, so displaying the whole list floods the output.
outputs[:3]

In [None]:
import re


def extract_python_code(response: str) -> str:
    """Return the Python source embedded in a model response.

    Preference order:
      1. the first Markdown fence tagged as Python (python / python3 / py),
      2. the first fence of any other kind,
      3. the whole response when it contains no fence at all.
    An unterminated fence (truncated response) yields everything after its
    opening line.

    Args:
        response: Raw text returned by the model.

    Returns:
        The extracted source code as a string.
    """
    fence_patterns = (
        r"```(?:python3?|py)[ \t]*\r?\n(.*?)(?:```|\Z)",  # fence tagged as Python
        r"```[^\n`]*\n(.*?)(?:```|\Z)",                    # any other fence
    )
    for pattern in fence_patterns:
        match = re.search(pattern, response, flags=re.DOTALL | re.IGNORECASE)
        if match:
            return match.group(1)
    return response


sample = outputs[0]
# Strip the surrounding prose so only the code is executed.
code = extract_python_code(sample["response"])

test_code = sample["test"]

# SECURITY: this executes model-generated code inside the notebook kernel.
# Only run it on output you trust, or move the execution into a sandboxed
# subprocess. An AssertionError raised by check() means the solution failed.
executes = code + "\n" + test_code + "\n" + "check({})".format(sample["entry_point"])
exec(executes)
print("PASS!")

PASS!


In [None]:
# Hand-written candidate solution, kept as source text in `code`.
# It returns False on every path, so the `== True` assertions of the task's
# check() are expected to fail with an AssertionError.
# NOTE(review): this cell only defines `code`; it is presumably executed as a
# Python script by a later cell - confirm.
code = """
from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return False    # False to trigger AssertionError

    return False
"""
# Test harness of the first record; printed to inspect what check() asserts.
test_code = outputs[0]["test"]
print(test_code)



METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False


