# APPS Benchmarking

## Setup

### Datasets and Metrics

In [1]:
from datasets import load_dataset
from evaluate import load

# This seems to do something weirdly inefficient
# datasets = {x: load_dataset("codeparrot/apps", trust_remote_code=True, split="test", difficulties=[x]) for x in ["introductory", "interview", "competition"]}

def get_dataset(difficulty):
    """
    Load the test split of the `codeparrot/apps` dataset for one difficulty level.

    Parameters:
    - difficulty (str): Difficulty name forwarded to the loader as a one-element
      `difficulties` list (the notebook uses "introductory", "interview" and
      "competition").

    Returns:
    - Whatever `datasets.load_dataset` returns for that configuration.
    """
    # NOTE: trust_remote_code=True is passed through unchanged from the original
    # call; it is required for datasets that ship their own loading script.
    return load_dataset(
        "codeparrot/apps",
        split="test",
        difficulties=[difficulty],
        trust_remote_code=True,
    )

# Load the APPS metric once via `evaluate.load`; later cells score generations
# with `metric.compute(predictions=..., level=...)`.
metric = load('codeparrot/apps_metric')

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/lewishammond/Repositories/code/nip/playground/benchmarking/venv/lib/python3.10/site-packages/ipykernel_la

### Models

In [2]:
import json
import os

import requests

# Read the OpenRouter key from the environment so the secret never lives in the
# notebook or its version history. Export OPENROUTER_API_KEY before starting the
# kernel. The key that used to be hardcoded here should be treated as leaked and
# revoked.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

# OpenRouter model identifiers to benchmark.
MODELS = [
    "openai/gpt-4o-mini",
    "meta-llama/llama-3.1-8b-instruct",
    "deepseek/deepseek-coder",
    "mistralai/codestral-mamba",
]

def get_responses(model, messages):
    """
    Send a chat-completion request to the OpenRouter API and return the reply text.

    Parameters:
    - model (str): The name of the chat model to use.
    - messages (list): A list of dictionaries representing the chat messages. Each
      dictionary should have a "role" key ("system", "user" or "assistant") and a
      "content" key with the content of the message.

    Returns:
    - str: The content of the first choice's message in the API response.

    Raises:
    - requests.exceptions.RequestException: If there was an error sending the request.
    - requests.exceptions.HTTPError: If the API answered with a 4xx/5xx status.
    - KeyError: If the JSON body has no "choices" entry.

    """

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
        data=json.dumps({"model": model, "messages": messages}),
    )

    # Fail loudly on HTTP errors (bad key, rate limit, ...) instead of letting
    # them surface later as an opaque KeyError on 'choices'.
    response.raise_for_status()

    return response.json()['choices'][0]['message']['content']

### Utilities

In [3]:
def save_answers(answers, model, difficulty):
    """
    Write a model's generated answers to `apps/generations/<model>_<difficulty>.json`.

    Parameters:
    - answers: JSON-serialisable object holding the generations.
    - model (str): Model identifier; only the part after the last "/" is used in
      the file name.
    - difficulty (str): Difficulty label used in the file name.
    """

    model_name = model.split('/')[-1]
    file_path = f"apps/generations/{model_name}_{difficulty}.json"

    # open(..., "w") does not create parent folders, so make sure they exist;
    # otherwise a fresh checkout fails with FileNotFoundError.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, "w") as f:
        json.dump(answers, f, indent=4)

def save_results(results):
    """
    Write the collected metric results to `apps/results.json`.

    Parameters:
    - results: JSON-serialisable object holding the results.
    """

    # Create the output folder on first use; open(..., "w") would otherwise
    # raise FileNotFoundError when `apps/` is missing.
    os.makedirs("apps", exist_ok=True)

    with open("apps/results.json", "w") as f:
        json.dump(results, f, indent=4)

In [4]:
# System message sent with every problem (see `evaluate`).
# NOTE(review): the prompt asks for "a function" that need not be called, yet also
# requires reading stdin and printing to stdout - confirm this wording is intended.
SYSTEM_PROMPT = (
    "Answer the following question by writing a function in Python "
    "(you do not need to include `python` anywhere and you do not need to call the function). "
    "Do NOT include any explanation or natural language text, ONLY the code. "
    "Your code should be concise and should not include any comments. "
    "The solution should take input via stdin (i.e., as `input()`) "
    "and should output by printing to stdout.\n\n\n"
)

## Evaluations

In [5]:
def evaluate(model, data, attempts):
    """
    Query `model` `attempts` times for every problem in `data`.

    Parameters:
    - model (str): Model identifier passed to `get_responses`.
    - data: Indexable collection of problems; each item is read through its
      'problem_id' and 'question' keys.
    - attempts (int): Number of generations requested per problem.

    Returns:
    - dict: Maps each problem's 'problem_id' to the list of responses collected
      for it, in request order.
    """

    def _ask(question):
        # Each request is the shared system prompt followed by the problem text.
        chat = [{"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": question}]
        return get_responses(model, chat)

    collected = {}

    for index in range(len(data)):
        problem = data[index]
        collected[problem['problem_id']] = [
            _ask(problem['question']) for _ in range(attempts)
        ]

    return collected

def run(models, difficulties, attempts=1):
    """
    Benchmark every model on every difficulty level and persist the outcome.

    For each difficulty the dataset is loaded once with `get_dataset`; each model
    is then queried with `evaluate`, its generations are written with
    `save_answers`, and they are scored with `metric.compute`. The full result
    table is written with `save_results` at the end.

    Parameters:
    - models (iterable of str): Model identifiers to benchmark.
    - difficulties (iterable of str): Difficulty levels to evaluate.
    - attempts (int): Number of generations per problem (default 1).

    Returns:
    - dict: Nested mapping `results[model][difficulty]` holding the value
      returned by `metric.compute`.
    """

    results = {}
    for d in difficulties:
        data = get_dataset(d)
        for m in models:
            answers = evaluate(m, data, attempts)
            save_answers(answers, m, d)
            # The metric is called elsewhere in this notebook with a list of
            # per-problem candidate lists, so hand it the generations in dataset
            # order (dicts preserve insertion order) rather than the
            # problem_id-keyed dict.
            predictions = list(answers.values())
            # setdefault creates the per-model dict on first use; plain
            # `results[m][d] = ...` raised KeyError on the empty dict.
            results.setdefault(m, {})[d] = metric.compute(predictions=predictions, level=d)

    save_results(results)

    return results

### Test

In [8]:
# Smoke test: load the introductory problems and show the first question.
# `data` (the first problem) is reused by the answer cell that follows.
print("================ QUESTION ================\n")
all_data = get_dataset("introductory")
data = all_data[0]
print(data["question"])




Using the latest cached version of the module from /Users/lewishammond/.cache/huggingface/modules/datasets_modules/datasets/codeparrot--apps/04ac807715d07d6e5cc580f59cdc8213cd7dc4529d0bb819cca72c9f8e8c1aa5 (last modified on Sun Sep  8 23:53:31 2024) since it couldn't be found locally at codeparrot/apps, or remotely on the Hugging Face Hub.


An accordion is a string (yes, in the real world accordions are musical instruments, but let's forget about it for a while) which can be represented as a concatenation of: an opening bracket (ASCII code $091$), a colon (ASCII code $058$), some (possibly zero) vertical line characters (ASCII code $124$), another colon, and a closing bracket (ASCII code $093$). The length of the accordion is the number of characters in it.

For example, [::], [:||:] and [:|||:] are accordions having length $4$, $6$ and $7$. (:|:), {:||:}, [:], ]:||:[ are not accordions. 

You are given a string $s$. You want to transform it into an accordion by removing some (possibly zero) characters from it. Note that you may not insert new characters or reorder existing ones. Is it possible to obtain an accordion by removing characters from $s$, and if so, what is the maximum possible length of the result?


-----Input-----

The only line contains one string $s$ ($1 \le |s| \le 500000$). It consists of lowercase Latin

In [7]:
# Smoke test: request a single attempt at the sample problem from one model.
model = "openai/gpt-4o-mini"
print("================ ANSWER ================\n")
answers = evaluate(model, [data], 1)
# `evaluate` keys its result by problem_id, so look the answer up by the sample's
# own id instead of assuming that id is 0.
print(answers[data["problem_id"]][0])


def max_accordion_length(s):
    first_colon = s.find(':')
    last_colon = s.rfind(':')
    if first_colon == -1 or last_colon == -1 or first_colon >= last_colon:
        return -1
    open_bracket = s.rfind('[', 0, first_colon)
    close_bracket = s.find(']', last_colon)
    if open_bracket == -1 or close_bracket == -1:
        return -1
    count_vertical = s[open_bracket:close_bracket].count('|')
    return 4 + count_vertical

print(max_accordion_length(input().strip()))


In [1]:
# Stand-alone check of the APPS metric on one hand-written candidate solution.
# The cell re-imports and reloads the metric itself, so it does not depend on
# the setup cell having been run.
from evaluate import load
metric = load('codeparrot/apps_metric')
# NOTE(review): this candidate uses `return` at module level (outside any
# function), which is presumably why the recorded output reports a compile
# error - confirm before treating the 0.0 accuracy as a metric problem.
answer = "s = input().strip()\nleft_bracket = s.find('[')\nright_bracket = s.rfind(']')\nif left_bracket == -1 or right_bracket == -1 or left_bracket >= right_bracket:\n    print(-1)\n    return\ncolon1 = s.find(':', left_bracket)\ncolon2 = s.rfind(':', left_bracket, right_bracket)\nif colon1 == -1 or colon2 == -1 or colon1 >= colon2:\n    print(-1)\n    return\nvertical_lines = s[colon1 + 1:colon2].count('|')\nprint(4 + vertical_lines)"

print("================ RESULTS ================\n")

# predictions is a list with one entry per problem, each entry a list of
# candidate solutions for that problem.
results = metric.compute(predictions=[[answer]], level="introductory")
print(results)


IT was safe
Computing accuracy metrics...
number of compile errors = 1 avg = 1.0
number of runtime errors = 0 avg = 0.0
number of problems evaluated = 1
Average Accuracy : 0.0
Strict Accuracy : 0.0
{'avg_accuracy': 0.0, 'strict_accuracy': 0.0, 'pass_at_k': None}


## Results