In [None]:
!pip install -U datasets

In [None]:
# Mount Google Drive at /content/drive so the following cells can read the
# CSV variants and write result checkpoints under it. Colab-only: relies on
# the google.colab package.
# NOTE(review): later cells use the relative path "drive/MyDrive/...", which
# presumably resolves because the working directory is /content — confirm.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset

# Folder (relative to the working directory) holding the CSV variants.
DATA_DIR = "drive/MyDrive/HumanEval-P/data/"

# Load the three sources first: the base benchmark, then the two CSV variants.
human_eval = load_dataset("openai/openai_humaneval")
human_eval_renamed = load_dataset(
    "csv",
    data_files=f"{DATA_DIR}/human_eval_renamed.csv",
)
human_eval_interfered = load_dataset(
    "csv",
    data_files=f"{DATA_DIR}/human_eval_interfered.csv",
)

# Attach the "train" split of each CSV load to the base dataset under its own
# key, so every variant is reachable as human_eval[<set name>].
human_eval["renamed"] = human_eval_renamed["train"]
human_eval["interfered"] = human_eval_interfered["train"]

print(human_eval)

In [None]:
import os

# API key for the Gemini client. Read from the GCP_API_KEY environment
# variable so the secret does not have to be stored in the notebook; falls
# back to the "<>" placeholder when the variable is unset or empty.
GCP_API_KEY = os.environ.get("GCP_API_KEY") or "<>"

# Which split of `human_eval` to run on.
SET_NAME = "interfered" # "test" | "renamed" | "interfered"

# Models to evaluate, as "<provider>/<model>" paths; only the part after the
# last "/" is sent to the API. Uncomment an entry to include it in the run.
MODELS = [
    # "openai/gpt-4.1",
    # "openai/gpt-4.1-mini",
    # "openai/gpt-4.1-nano",
    # "openai/gpt-4o-mini",
    # "openai/gpt-4o-2024-11-20",
    # "deepseek/deepseek-chat-v3-0324",
    # "deepseek/deepseek-r1-250528",
    "google/gemini-2.5-flash",
    "google/gemini-2.5-pro",
    # "meta-llama/llama-3.3-70b-instruct",
    # "meta-llama/llama-3.1-8b-instruct",
    # "meta-llama/llama-3.2-3b-instruct",
    # "meta-llama/llama-3.2-1b-instruct",
    # "meta-llama/llama-4-maverick",
    # "meta-llama/llama-4-scout",
    # "qwen/qwen3-30b-a3b",
    # "qwen/qwen3-32b",
    # "qwen/qwen3-8b",
    # "qwen/qwen3-1.7b",
    # "baidu/ernie-4.5-turbo-128k",
    # "baidu/ernie-4.5-0.3b",
    # "baidu/ernie-4.5-21b-a3b"
]

In [None]:
from google import genai
from google.genai import types
import pickle
import os
from tqdm.notebook import tqdm

# One client serves every model: it depends only on the API key, so it is
# created once instead of once per loop iteration.
client = genai.Client(api_key=GCP_API_KEY)


def _make_pipe(model_name, thinking_budget):
    """Build a ``pipe(messages)`` callable bound to one Gemini model.

    Args:
        model_name: Model identifier passed to ``generate_content``.
        thinking_budget: Value forwarded to ``types.ThinkingConfig``.

    Returns:
        A function that takes the prompt and returns the raw response object
        from ``client.models.generate_content``.
    """

    def pipe(messages):
        """Send ``messages`` to the bound model and return the raw response."""
        return client.models.generate_content(
            model=model_name,
            contents=messages,
            config=types.GenerateContentConfig(
                system_instruction="You are a code generation assistant. You are given the beginning part of the code and docstring. Complete the code without repeating given part, without any introductory or concluding remarks",
                thinking_config=types.ThinkingConfig(thinking_budget=thinking_budget),
            ),
        )

    return pipe


for model in MODELS:
    MODEL_PATH = model
    MODEL_NAME = model.split("/")[-1]

    # 0 disables thinking (used for gemini-2.5-flash only); -1 enables
    # dynamic thinking for every other model.
    thinking_budget = 0 if MODEL_NAME == "gemini-2.5-flash" else -1

    # Wrap the Gemini API in a function that only needs the prompt as input.
    pipe = _make_pipe(MODEL_NAME, thinking_budget)

    # Per-model, per-split checkpoint file. Create its directory up front so
    # the first checkpoint write does not fail on a missing folder.
    PATH = f"drive/MyDrive/HumanEval-P/lab02/{SET_NAME}/{MODEL_NAME}-{SET_NAME}.pkl"
    os.makedirs(os.path.dirname(PATH), exist_ok=True)

    # Resume from an existing checkpoint if there is one.
    # NOTE: pickle.load can execute arbitrary code; only load checkpoint files
    # that this notebook wrote itself.
    outputs = []
    if os.path.exists(PATH):
        with open(PATH, "rb") as f:
            outputs = pickle.load(f)

    # Start at len(outputs) so samples already answered are skipped.
    for i in tqdm(range(len(outputs), len(human_eval[SET_NAME])), desc=f"{MODEL_PATH}-{SET_NAME}"):
        sample = human_eval[SET_NAME][i]

        prompt = sample["prompt"]
        messages = prompt
        response = pipe(messages)
        outputs.append({
            "task_id": sample["task_id"],
            "prompt": prompt,
            'canonical_solution': sample["canonical_solution"],
            'test': sample["test"],
            'entry_point': sample["entry_point"],
            "response": response.text
        })

        # Checkpoint after every sample so an interrupted run can resume.
        with open(PATH, "wb") as f:
            pickle.dump(outputs, f)

        # Rate limit noted by the author: 180 requests per 10s interval.
        # Throttling is currently disabled; if re-enabled (e.g. a 0.2 s sleep
        # per request), `time` must be imported first.
