In [None]:
!pip install -U datasets

In [None]:
# Colab-only module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive. Later cells use paths relative to the working
# directory ("drive/MyDrive/..."), so they presumably expect the kernel's
# working directory to be /content -- TODO confirm.
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset

import os

# Folder holding the perturbed HumanEval CSV files. The path is relative, so it
# is resolved against the kernel's current working directory.
DATA_DIR = "drive/MyDrive/HumanEval-P/data/"

# Original benchmark from the Hugging Face Hub; the extra variants are attached
# to this same DatasetDict below so every set is addressed as human_eval[<name>].
human_eval = load_dataset("openai/openai_humaneval")

# A CSV loaded without an explicit split name ends up under "train".
# os.path.join avoids the "data//file.csv" double slash that plain string
# formatting produced with the trailing "/" in DATA_DIR.
human_eval_renamed = load_dataset(
    "csv", data_files=os.path.join(DATA_DIR, "human_eval_renamed.csv")
)
human_eval["renamed"] = human_eval_renamed["train"]

human_eval_interfered = load_dataset(
    "csv", data_files=os.path.join(DATA_DIR, "human_eval_interfered.csv")
)
human_eval["interfered"] = human_eval_interfered["train"]

# Fail early if any of the sets selectable via SET_NAME is missing.
_missing_sets = {"test", "renamed", "interfered"} - set(human_eval)
assert not _missing_sets, f"missing dataset splits: {sorted(_missing_sets)}"

print(human_eval)

In [None]:
# Run configuration: API credentials, which dataset variant to evaluate, and
# which models to query.
import os

# Prefer the OPENROUTER_API_KEY environment variable so the secret does not
# have to be written into (and saved with) the notebook. The "<>" placeholder
# is kept as a fallback for the previous paste-the-key-here workflow.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") or "<>"

SET_NAME = "interfered" # "test" | "renamed" | "interfered"
# Catch typos here instead of after the first API request has been made.
assert SET_NAME in ("test", "renamed", "interfered"), f"unknown SET_NAME: {SET_NAME!r}"

# OpenRouter model identifiers to evaluate; comment a line in/out to toggle it.
MODELS = [
    "openai/gpt-4.1",
    "openai/gpt-4.1-mini",
    "openai/gpt-4.1-nano",
    "openai/gpt-4o-mini",
    "openai/gpt-4o-2024-11-20",
    "deepseek/deepseek-chat-v3-0324",
    "deepseek/deepseek-r1-0528",
    # "google/gemini-2.5-flash",
    # "google/gemini-2.5-pro",
    "meta-llama/llama-3.3-70b-instruct",
    "meta-llama/llama-3.1-8b-instruct",
    "meta-llama/llama-3.2-3b-instruct",
    "meta-llama/llama-3.2-1b-instruct",
    "meta-llama/llama-4-maverick",
    "meta-llama/llama-4-scout",
    # "qwen/qwen3-30b-a3b",
    # "qwen/qwen3-14b",
    # "qwen/qwen3-8b",
    # "qwen/qwen3-1.7b",
    # "qwen/qwen-2.5-coder-32b-instruct",
    # "qwen/qwen2.5-coder-7b-instruct"
]

In [None]:
# Utils to check OpenRouter rate limit
import requests
import json
# Query OpenRouter's key endpoint for the configured API key and pretty-print
# whatever JSON document it returns.
response = requests.get(
    url="https://openrouter.ai/api/v1/auth/key",
    headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
    # requests waits forever by default; bound the call so a stalled
    # connection cannot hang the cell.
    timeout=30,
)
print(json.dumps(response.json(), indent=2))

In [None]:
from openai import OpenAI
import pickle
import os
from tqdm.notebook import tqdm

# A single client serves every model: only the `model` argument of each request
# differs, so there is no need to rebuild the client per loop iteration.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY
)

# Instruction sent as the system message with every task prompt.
SYSTEM_PROMPT = "You are a code generation assistant. You are given the beginning part of the code and docstring. Complete the code without repeating given part, without any introductory or concluding remarks."


def pipe(messages, model_name=None):
    """Send a chat request through OpenRouter and return the raw response.

    Args:
        messages: list of ``{"role": ..., "content": ...}`` chat messages.
        model_name: OpenRouter model identifier. When omitted, the module-level
            ``MODEL_NAME`` set by the loop below is used, which keeps the
            original ``pipe(messages)`` call form working.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    if model_name is None:
        model_name = MODEL_NAME
    # The ":nitro" suffix is appended to every model id (presumably an
    # OpenRouter routing variant -- confirm against the OpenRouter docs).
    return client.chat.completions.create(
        model=f"{model_name}:nitro",
        messages=messages
    )


for model in MODELS:
    MODEL_NAME = model

    # One checkpoint file per (model, set); the provider prefix is dropped from
    # the file name.
    PATH = f"drive/MyDrive/HumanEval-P/lab02/{SET_NAME}/{MODEL_NAME.split('/')[-1]}-{SET_NAME}.pkl"
    # Create the output folder up front; otherwise the first checkpoint write
    # fails only after an API call has already been made.
    os.makedirs(os.path.dirname(PATH), exist_ok=True)

    # Resume from an earlier run if a checkpoint exists.
    # NOTE: pickle.load can execute arbitrary code -- only load checkpoints
    # written by this notebook.
    outputs = []
    if os.path.exists(PATH):
        with open(PATH, "rb") as f:
            outputs = pickle.load(f)

    dataset = human_eval[SET_NAME]

    # Start at len(outputs): results are appended in dataset order, so that many
    # samples are already done.
    for i in tqdm(range(len(outputs), len(dataset)), desc=f"{MODEL_NAME}-{SET_NAME}"):
        sample = dataset[i]

        prompt = sample["prompt"]
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ]
        response = pipe(messages, MODEL_NAME)

        # Raise a readable error instead of an opaque TypeError/IndexError when
        # the response carries no completion.
        if not response.choices:
            raise RuntimeError(
                f"{MODEL_NAME} returned no choices for {sample['task_id']}: {response!r}"
            )

        outputs.append({
            "task_id": sample["task_id"],
            "prompt": prompt,
            'canonical_solution': sample["canonical_solution"],
            'test': sample["test"],
            'entry_point': sample["entry_point"],
            "response": response.choices[0].message.content
        })

        # Checkpoint after every sample so an interrupted run can resume.
        with open(PATH, "wb") as f:
            pickle.dump(outputs, f)