In [None]:
!pip install -U datasets

In [14]:
from datasets import load_dataset

# HumanEval benchmark, loaded by its dataset id.
human_eval = load_dataset("openai/openai_humaneval")

# Variant of the benchmark read from a local CSV file; its rows are taken from
# the "train" split of the loaded result.
human_eval_renamed = load_dataset("csv", data_files="data/human_eval_renamed.csv")

# Attach the CSV rows as an extra split named "renamed", so both variants can be
# selected by key from the same DatasetDict.
human_eval["renamed"] = human_eval_renamed["train"]

print(human_eval)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
    renamed: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
})


In [18]:
import os

# Read the OpenRouter key from the environment so the secret does not have to be
# stored in the notebook; the original literal is kept as the fallback value.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "<OPENROUTER_API_KEY>")

# Which split of `human_eval` to run against: "test" | "renamed"
SET_NAME = "renamed"

# OpenRouter model identifiers ("<provider>/<model>") to query.
# Commented-out entries are disabled and can be re-enabled by uncommenting.
MODELS = [
    "openai/gpt-4.1",
    "openai/gpt-4.1-mini",
    "openai/gpt-4.1-nano",
    "openai/gpt-4o-mini",
    "openai/gpt-4o-2024-11-20",
    "deepseek/deepseek-chat-v3-0324",
    "deepseek/deepseek-r1-0528",
    "google/gemini-2.5-flash",
    "google/gemini-2.5-pro",
    "meta-llama/llama-3.3-70b-instruct",
    "meta-llama/llama-3.1-8b-instruct",
    "meta-llama/llama-3.2-3b-instruct",
    "meta-llama/llama-3.2-1b-instruct",
    "meta-llama/llama-4-maverick",
    "meta-llama/llama-4-scout",
    # "qwen/qwen3-30b-a3b",
    # "qwen/qwen3-14b",
    # "qwen/qwen3-8b",
    # "qwen/qwen3-1.7b",
    # "qwen/qwen-2.5-coder-32b-instruct",
    # "qwen/qwen2.5-coder-7b-instruct"
]

In [19]:
# Utils to check OpenRouter rate limit
import requests
import json

# Ask the key-info endpoint about the configured key and pretty-print the JSON
# reply (it includes usage and rate-limit details for the key).
response = requests.get(
    url="https://openrouter.ai/api/v1/auth/key",
    headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout=30,
)
print(json.dumps(response.json(), indent=2))

{
  "data": {
    "label": "sk-or-v1-853...596",
    "limit": null,
    "usage": 0.483612,
    "is_provisioning_key": false,
    "limit_remaining": null,
    "is_free_tier": false,
    "rate_limit": {
      "requests": 100,
      "interval": "10s"
    }
  }
}


In [None]:
from openai import OpenAI
import pickle
import os
from tqdm import tqdm

# One client serves every model; only the `model` argument changes per request.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY
)

SYSTEM_PROMPT = "You are a code generation assistant. You are given the beginning part of the code and docstring. Complete the code without repeating given part, without any introductory or concluding remarks"


def pipe(messages):
    """Send a chat request to the model currently named by MODEL_NAME.

    Args:
        messages: list of chat message dicts ({"role": ..., "content": ...}).

    Returns:
        The response object returned by `client.chat.completions.create`.
    """
    return client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages
    )


# The split is the same for every model, so look it up once.
dataset = human_eval[SET_NAME]

for model in MODELS:
    # `pipe` reads this global at call time.
    MODEL_NAME = model

    # Per-model checkpoint file holding the list of results generated so far.
    output_path = f"drive/MyDrive/lab/{SET_NAME}/{MODEL_NAME}-{SET_NAME}.pkl"

    # Model ids contain "/" (e.g. "openai/gpt-4.1"), so the path includes a
    # per-provider subdirectory; create it up front so the write below cannot
    # fail with FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    outputs = []
    if os.path.exists(output_path):
        # NOTE: pickle.load can execute arbitrary code from the file; only load
        # checkpoints that were written by this notebook.
        with open(output_path, "rb") as f:
            outputs = pickle.load(f)

    # Resume after the last saved sample instead of starting over.
    for i in tqdm(range(len(outputs), len(dataset)), desc=f"{MODEL_NAME}-{SET_NAME}"):
        sample = dataset[i]

        prompt = sample["prompt"]
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        response = pipe(messages)
        outputs.append({
            "task_id": sample["task_id"],
            "prompt": prompt,
            "canonical_solution": sample["canonical_solution"],
            "test": sample["test"],
            "entry_point": sample["entry_point"],
            "response": response.choices[0].message.content
        })

        # Save after every sample so an interrupted run loses at most one request.
        with open(output_path, "wb") as f:
            pickle.dump(outputs, f)

        # Rate limiting: no delay is applied between requests. The earlier note
        # here said 180 requests per 10s, while the key-info check output in
        # this notebook shows 100 requests per 10s -- TODO confirm the current
        # limit and add a short time.sleep(...) here if requests get throttled.


In [None]:
# Sanity check: run the first saved generation against its test harness.
first_output = outputs[0]
code = first_output["response"]

# strip other text from code: keep only the contents of the first ```python
# fence. If the reply has no such fence, use it unchanged instead of failing
# with IndexError on the split.
fence = "```python"
if fence in code:
    code = code.split(fence)[1].split("```")[0]

test_code = first_output["test"]

# Program = generated code, then the test harness, then a call to check(<entry_point>).
executes = code + "\n" + test_code + "\n" + "check({})".format(first_output["entry_point"])

# NOTE: exec runs model-generated code inside this kernel's namespace; only do
# this in a disposable / sandboxed environment.
exec(executes)
print("PASS!")

PASS!


In [None]:
# Hand-written, deliberately broken implementation of has_close_elements: every
# path returns False, so any `== True` assertion in the test harness would fail.
# It is kept as a source string (presumably so it can be run as a standalone
# Python script); this cell itself does not execute it.
code = """
from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx != idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return False    # False to trigger AssertionError

    return False
"""
# Fetch the test harness stored with the first saved output and display it.
test_code = outputs[0]["test"]
print(test_code)



METADATA = {
    'author': 'jt',
    'dataset': 'test'
}


def check(candidate):
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True
    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True
    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False
    assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True
    assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False


