# APPS Benchmarking

## Setup

### Datasets and Metrics

In [26]:
from datasets import load_dataset
from evaluate import load

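# NOTE: loading every difficulty eagerly, as in the commented-out line below, seems to do something weirdly inefficient; get_dataset() loads one difficulty on demand instead.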
# datasets = {x: load_dataset("codeparrot/apps", trust_remote_code=True, split="test", difficulties=[x]) for x in ["introductory", "interview", "competition"]}

def get_dataset(difficulty):
    """Load the APPS test split restricted to a single difficulty.

    Parameters:
    - difficulty (str): Difficulty name, forwarded to the loader as the only
      entry of its `difficulties` argument (e.g. "introductory").

    Returns:
    - Whatever `load_dataset` returns for the "test" split of "codeparrot/apps".
    """
    loader_options = {
        "trust_remote_code": True,
        "split": "test",
        "difficulties": [difficulty],
    }
    return load_dataset("codeparrot/apps", **loader_options)

# Scoring metric for generated solutions; used later through metric.compute(...).
metric = load("codeparrot/apps_metric")

### Models

In [27]:
import json
import os

import requests

# Read the OpenRouter credential from the environment instead of embedding it in
# the notebook; export OPENROUTER_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded on this line has been exposed
# in the notebook source and should be revoked/rotated.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")

# Model identifiers (in "provider/name" form) available for benchmarking.
MODELS = [
    "openai/gpt-4o-mini",
    "meta-llama/llama-3.1-8b-instruct",
    "deepseek/deepseek-coder",
    "mistralai/codestral-mamba",
]

def get_responses(model, messages, timeout=300):
    """
    Sends a POST request to the OpenRouter API and returns the chat model's reply text.

    Parameters:
    - model (str): The name of the chat model to use.
    - messages (list): A list of dictionaries representing the chat messages. Each dictionary should have a "role" key (e.g. "system", "user" or "assistant") and a "content" key with the content of the message.
    - timeout (float): Value passed to `requests.post` as its `timeout`, in seconds, so a stalled connection cannot block forever.

    Returns:
    - content (str): The message content of the first choice in the API response.

    Raises:
    - requests.exceptions.RequestException: If there was an error sending the request, the request timed out, or the API answered with an HTTP error status.
    - RuntimeError: If the response body contains no choices (for example an error payload).

    """

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
        # `json=` lets requests serialise the body and set the JSON content type.
        json={"model": model, "messages": messages},
        timeout=timeout,
    )
    # Surface 4xx/5xx answers (bad key, rate limit, ...) as HTTPError right here.
    response.raise_for_status()

    payload = response.json()
    choices = payload.get("choices")
    if not choices:
        # Without this guard a body lacking "choices" fails as an opaque KeyError.
        raise RuntimeError(
            f"OpenRouter returned no choices for model {model!r}: {payload.get('error', payload)}"
        )

    return choices[0]['message']['content']

### Utilities

In [28]:
def save_answers(answers, model, difficulty):
    """Write one model's generations for one difficulty to a JSON file.

    Parameters:
    - answers (dict): Generated answers to serialise; must be JSON-serialisable.
    - model (str): Model identifier; only the part after the last "/" is used in the file name.
    - difficulty (str): Difficulty label appended to the file name.

    The file is written to apps/generations/<model_name>_<difficulty>.json
    (relative to the working directory); missing directories are created.
    """

    model_name = model.split('/')[-1]
    file_path = f"apps/generations/{model_name}_{difficulty}.json"

    # Without the output directory open() raises FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, "w") as f:
        json.dump(answers, f, indent=4)

def save_results(results):
    """Write the aggregated benchmark results to apps/results.json.

    Parameters:
    - results (dict): Results to serialise; must be JSON-serialisable.

    The path is relative to the working directory; the "apps" directory is
    created when it does not exist yet.
    """

    # Without the output directory open() raises FileNotFoundError on a fresh checkout.
    os.makedirs("apps", exist_ok=True)

    with open("apps/results.json", "w") as f:
        json.dump(results, f, indent=4)

In [48]:
# Instruction sent as the system message with every question: reply with bare
# Python code only. Adjacent literals below are concatenated into one string.
SYSTEM_PROMPT = (
    "Answer the following question by providing Python code. "
    "Do NOT include any explanation or natural language text, ONLY the code. "
    "Your code should be concise and should not include any comments."
    "\n\n\n"
)

## Evaluations

In [49]:
def evaluate(model, data, attempts):
    """Collect `attempts` model responses for every problem in `data`.

    Parameters:
    - model (str): Model name forwarded to `get_responses`.
    - data: Sized, index-addressable collection of problems; each item is read
      through its "problem_id" and "question" keys.
    - attempts (int): Number of responses requested per problem.

    Returns:
    - dict: Maps each problem_id to the list of responses, in request order.
    """

    def ask(question):
        # Every request is a fresh two-message conversation: system prompt + question.
        conversation = [{"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": question}]
        return get_responses(model, conversation)

    collected = {}
    for position in range(len(data)):
        problem = data[position]
        collected[problem['problem_id']] = [
            ask(problem['question']) for _ in range(attempts)
        ]

    return collected

def run(models, difficulties, attempts=1):
    """Benchmark every model on every difficulty and persist the outcome.

    For each difficulty the dataset is loaded once; each model's answers are
    generated with `evaluate`, written with `save_answers`, and scored with
    `metric.compute`. All scores are written with `save_results` at the end.

    Parameters:
    - models (list): Model names to benchmark.
    - difficulties (list): Difficulty names passed to `get_dataset` and to the metric's `level`.
    - attempts (int): Number of responses requested per problem.

    Returns:
    - dict: results[model][difficulty] -> value returned by `metric.compute`.
    """

    results = {}
    for d in difficulties:
        data = get_dataset(d)
        for m in models:
            answers = evaluate(m, data, attempts)
            save_answers(answers, m, d)

            # The metric is called elsewhere in this notebook with a list of
            # per-problem lists, so flatten the problem_id-keyed dict; dict order
            # is insertion order, i.e. the order the problems were read from `data`.
            # NOTE(review): responses are scored verbatim here, while the test cell
            # strips a markdown code fence first - confirm whether that is needed.
            predictions = list(answers.values())

            # setdefault creates the per-model dict on first use; a plain
            # results[m][d] assignment raises KeyError on an empty `results`.
            results.setdefault(m, {})[d] = metric.compute(predictions=predictions, level=d)

    save_results(results)

    return results

### Test

In [42]:
# Smoke test: load the "introductory" split and print the first problem statement.
print("================ QUESTION ================")
all_data = get_dataset("introductory")
# `data` (one problem record) is reused by the following test cells.
data = all_data[0]
print(data["question"])

An accordion is a string (yes, in the real world accordions are musical instruments, but let's forget about it for a while) which can be represented as a concatenation of: an opening bracket (ASCII code $091$), a colon (ASCII code $058$), some (possibly zero) vertical line characters (ASCII code $124$), another colon, and a closing bracket (ASCII code $093$). The length of the accordion is the number of characters in it.

For example, [::], [:||:] and [:|||:] are accordions having length $4$, $6$ and $7$. (:|:), {:||:}, [:], ]:||:[ are not accordions. 

You are given a string $s$. You want to transform it into an accordion by removing some (possibly zero) characters from it. Note that you may not insert new characters or reorder existing ones. Is it possible to obtain an accordion by removing characters from $s$, and if so, what is the maximum possible length of the result?


-----Input-----

The only line contains one string $s$ ($1 \le |s| \le 500000$). It consists of lowercase Latin

In [50]:
model = "openai/gpt-4o-mini"
print("================ ANSWER ================")
# evaluate() takes a collection of problems, so the single record is wrapped in a list.
answers = evaluate(model, [data], 1)
# evaluate() keys its result by problem_id; look the answer up through the
# record's own id instead of a literal 0 that only works for problem 0.
print(answers[data["problem_id"]][0])

```python
s = input()
l, r = -1, -1
for i in range(len(s)):
    if s[i] == '[':
        l = i
    if s[i] == ']':
        r = i
if l == -1 or r == -1 or l > r:
    print(-1)
else:
    colons = s[l:r].count(':')
    if colons < 2:
        print(-1)
    else:
        vertical_lines = s[l:r].count('|')
        print(4 + vertical_lines)
```


In [62]:
# Strip the markdown code fence the model wraps around its answer. The previous
# fixed slice [10:-3] only worked for an exact "```python\n" prefix and a "```"
# suffix with no surrounding whitespace.
FENCE = "```"
raw_answer = answers[data["problem_id"]][0].strip()
if (len(raw_answer) >= 2 * len(FENCE)
        and raw_answer.startswith(FENCE)
        and raw_answer.endswith(FENCE)):
    fenced_body = raw_answer[len(FENCE):-len(FENCE)]
    # The first line after the opening fence is the optional language tag.
    first_line, newline, remainder = fenced_body.partition("\n")
    a = remainder if newline else first_line
else:
    # No fence present: use the answer as returned.
    a = raw_answer
print(a)


s = input()
l, r = -1, -1
for i in range(len(s)):
    if s[i] == '[':
        l = i
    if s[i] == ']':
        r = i
if l == -1 or r == -1 or l > r:
    print(-1)
else:
    colons = s[l:r].count(':')
    if colons < 2:
        print(-1)
    else:
        vertical_lines = s[l:r].count('|')
        print(4 + vertical_lines)



In [58]:
print("================ RESULTS ================")
# Score the cleaned answer `a` computed in the previous cell rather than repeating
# the slicing here. `predictions` is a list (one entry per problem) of lists
# (one entry per attempt).
results = metric.compute(predictions=[[a]], level="introductory")
print(results)



KeyboardInterrupt: 

## Results