<a href="https://colab.research.google.com/github/friederrr/proof_contamination/blob/main/code/LLM_eval/LLM_Evaluations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%%capture
# install and import libs:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import json
from pathlib import Path
from datetime import datetime
import pytz
import itertools
import uuid  # Generate a unique experiment ID



!pip install datasets
from datasets import Dataset, concatenate_datasets

In [4]:
#connect to your account (somehow only once needed) [uncomment if needed]

from google.colab import drive
drive.mount('/content/drive')

#from huggingface_hub import notebook_login
#notebook_login()

Mounted at /content/drive


# Pipeline

## Model Selection

**OLMO**:

- "allenai/OLMo-7B-0724-hf": Downlaod (F32): (27 GB), GPU-RAM (bfloat16): 14 GB (T4 sufficient)

- "allenai/OLMo-7B-0724-SFT-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-7B-0724-Instruct-hf": Download (BF16) (14 GB), GPU-RAM (14 GB)

- "allenai/OLMo-2-1124-13B-Instruct": Download (BF16) (28 GB), GPU-RAM (27 GB) -> works fine with A100!

**DEEPSEEK**:
- "deepseek-ai/deepseek-math-7b-instruct":

**LEMMA**:
- "EleutherAI/llemma_7b"

**GPT4o**:
- "gpt-4o-2024-11-20"


In [48]:
#for API Access
model_id="gpt-4o-2024-11-20"
api_flag=True

In [49]:
if api_flag:
  tokenizer=None
else:
  model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
  tokenizer = AutoTokenizer.from_pretrained(model_id)

### FUNCTIONS

In [88]:
def add_formatted_prompts(
    dataset: Dataset,
    tokenizer,
    model_name: str,
    shots_count: int = 0,
    question_key: str = "PROBLEM",
    answer_key: str = "SOLUTION",
    id_key: str = "ID"
) -> Dataset:
    """
    Add a 'formatted_prompt' column to each row in the dataset.
    Uses:
      - First `n` examples as fixed shots for all rows >= n
      - Last `n` examples as shots for rows < n
      - Each shot includes EXTRACTED_ANSWER, formatted as: "The final answer is \\boxed{...}"
      - Adds 'shot_ids': list of IDs used for shots
    """

    dataset_len = len(dataset)

    if shots_count > dataset_len:
        raise ValueError(f"shots_count ({shots_count}) exceeds dataset size ({dataset_len})")

    def format_shot(q, a, extracted):
        if extracted:
            return f"{a}\nThe final answer is \\boxed{{{extracted}}}."
        return a

    head_shots = [
        (
            dataset[i][question_key],
            format_shot(
                dataset[i][question_key],
                dataset[i][answer_key],
                dataset[i].get("EXTRACTED_ANSWER", "")
            ),
            dataset[i][id_key]
        )
        for i in range(shots_count)
    ] if shots_count > 0 else []

    tail_shots = [
        (
            dataset[dataset_len - shots_count + i][question_key],
            format_shot(
                dataset[dataset_len - shots_count + i][question_key],
                dataset[dataset_len - shots_count + i][answer_key],
                dataset[dataset_len - shots_count + i].get("EXTRACTED_ANSWER", "")
            ),
            dataset[dataset_len - shots_count + i][id_key]
        )
        for i in range(shots_count)
    ] if shots_count > 0 else []

    def formatting_map(example, idx):
        if shots_count == 0:
            selected_shots = []
            shot_ids = []
        elif idx < shots_count:
            selected_shots = [(q, a) for q, a, _id in tail_shots]
            shot_ids = [_id for _, _, _id in tail_shots]
        else:
            selected_shots = [(q, a) for q, a, _id in head_shots]
            shot_ids = [_id for _, _, _id in head_shots]

        prompt = build_prompt_for_model(
            model_name=model_name,
            shots=selected_shots,
            eval_question=example[question_key],
            tokenizer=tokenizer
        )

        return {
            "formatted_prompt": prompt,
            "shot_ids": shot_ids
        }

    return dataset.map(formatting_map, with_indices=True)


def build_prompt_for_model(
    model_name: str,
    shots: list,          # list of (question, answer)
    eval_question: str,
    tokenizer=None        # needed only for chat-template models
):
    name = model_name.lower()

    # === 1. Llemma (no chat template, plain text CoT) ===
    if "llemma" in name:
        prompt_parts = []
        for q, a in shots:
            prompt_parts.append(f"Q: {q} Let's think step by step.\nA: {a}")
        final_q = f"Q: {eval_question} Let's think step by step.\nA:"
        return "\n\n".join(prompt_parts + [final_q])

    # === 2. DeepSeek (chat format, no system prompt) ===
    elif "deepseek" in name:
        chat = []
        for q, a in shots:
            shot_q = f"{q}\nPlease reason step by step, and put your final answer within \\boxed"
            chat.append({"role": "user", "content": shot_q})
            chat.append({"role": "assistant", "content": a})
        final_q = f"{eval_question}\nPlease reason step by step, and put your final answer within \\boxed"
        chat.append({"role": "user", "content": final_q})
        if tokenizer:
            return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        return chat

    # === 3. Chat-style models (system,user,assistant): OLMO and GPT40 ===
    else:
        system_prompt = "You are a helpful assistant."
        chat = [{"role": "system", "content": system_prompt}]
        for q, a in shots:
            shot_q = f"{q}\nPlease reason step by step, and put your final answer within \\boxed"
            chat.append({"role": "user", "content": shot_q})
            chat.append({"role": "assistant", "content": a})
        final_q = f"{eval_question}\nPlease reason step by step, and put your final answer within \\boxed"
        chat.append({"role": "user", "content": final_q})
        if tokenizer:
            return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        return chat


# Function to write results to a file
def write_results_to_file(data_entries, drive_path, dataset_shortname, start_i, stop_i, model_shortname, current_date):
    log_file = f"output_{dataset_shortname}_{start_i}-{stop_i}_{model_shortname}_{current_date}.json"
    final_output = {
        "metadata": metadata,
        "data": data_entries
    }
    with open(drive_path / log_file, "w") as f:
        json.dump(final_output, f, indent=4)
    print(f"Results have been logged to {log_file}")

def generate_and_log(row, model, tokenizer, config_file):
    # Prompt generating
    formatted_prompt = row["formatted_prompt"]

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

    # Generate
    start_time = datetime.now()
    outputs = model.generate(**inputs, generation_config=config_file)
    end_time = datetime.now()

    # Get the input IDs and their length
    input_ids = inputs['input_ids']
    input_length = input_ids.shape[1]

    # Slice the output to get only the new tokens
    new_tokens = outputs[:, input_length:]
    generated_text = tokenizer.batch_decode(new_tokens)

    # Create data entry
    data_entry = {
        "ID": row['ID'],
        "PROBLEM": row['PROBLEM'],
        "CATEGORY": row['CATEGORY'],
        "LABEL": row['LABEL'],
        "PROMPT": formatted_prompt,
        "SOLUTION_KEY": row["EXTRACTED_ANSWER"],
        "SOLUTION_TEXT": row["SOLUTION"],
        "LLM_RESPONSE": generated_text[0],
        "attributes": {
            "start_time": start_time.strftime('%Y-%m-%dT%H:%M:%S'),
            "end_time": end_time.strftime('%Y-%m-%dT%H:%M:%S')
        }
    }

    # Return data_entry
    return data_entry

def generate_and_log_api(row, model_id, api_key, config_dict=None):
    """
    Generates a response from GPT-4o using OpenAI's API and logs relevant info.
    """
    client = OpenAI(api_key=api_key)
    # Prepare the chat prompt
    formatted_prompt = row["formatted_prompt"]

    # Handle generation parameters
    generation_args = {
        "model": model_id,
        "input": formatted_prompt,
    }
    if config_dict:
        generation_args.update(config_dict)

    # Generate
    start_time = datetime.now()
    response = client.responses.create(**generation_args)
    end_time = datetime.now()

    # Extract generated text
    generated_text = response.output_text

    # Create data entry
    data_entry = {
        "ID": row['ID'],
        "PROBLEM": row['PROBLEM'],
        "CATEGORY": row['CATEGORY'],
        "LABEL": row['LABEL'],
        "PROMPT": formatted_prompt,
        "SOLUTION_KEY": row["EXTRACTED_ANSWER"],
        "SOLUTION_TEXT": row["SOLUTION"],
        "LLM_RESPONSE": generated_text,
        "attributes": {
            "start_time": start_time.strftime('%Y-%m-%dT%H:%M:%S'),
            "end_time": end_time.strftime('%Y-%m-%dT%H:%M:%S')
        }
    }

    return data_entry

## Data Selection

In [51]:
from datasets import load_dataset

repo_id = "Tobstar001/MathCONTA"
split = "test"

config_name="core"
ds_conta_core = load_dataset(path=repo_id,name=config_name,split=split)

config_name="verbose"
ds_conta_verbose = load_dataset(path=repo_id,name=config_name,split=split)

In [52]:
# We need Extracted Answer!

# Convert datasets to pandas DataFrames
df_conta = ds_conta_core.to_pandas()
df_verbose = ds_conta_verbose.to_pandas()

# Merge on the ID column (adjust 'ID' if it's named differently)
merged_df = df_conta.merge(
    df_verbose[['ID', 'EXTRACTED_ANSWER']],
    on='ID',
    how='left'
)

# Convert back to Hugging Face dataset
merged_ds = Dataset.from_pandas(merged_df)

def correct_specific_row(example):
    if example["ID"] == "owm-word-3":
        example["EXTRACTED_ANSWER"] = "300"
    if example["ID"] == "clean-word-4":
        example["EXTRACTED_ANSWER"] = "672"
    if example["ID"] == "owm-word-10":
        example["EXTRACTED_ANSWER"]= "-6 / A"
    return example

merged_ds = merged_ds.map(correct_specific_row)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [53]:
#Important: filter on category to geht category specific shots

data_name="word-problems"
filtered_dataset = merged_ds.filter(lambda example: example["CATEGORY"] == data_name)
filtered_dataset_w=add_formatted_prompts(filtered_dataset, tokenizer, model_name=model_id, shots_count=1)
len(filtered_dataset_w)

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

24

In [54]:
#Important: filter on category to geht category specific shots

data_name="AMC8"
filtered_dataset = merged_ds.filter(lambda example: example["CATEGORY"] == data_name)
filtered_dataset_am=add_formatted_prompts(filtered_dataset, tokenizer, model_name=model_id, shots_count=1)
len(filtered_dataset_am)

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

36

In [55]:
#Important: filter on category to geht category specific shots

data_name="AIME"
filtered_dataset = merged_ds.filter(lambda example: example["CATEGORY"] == data_name)
filtered_dataset_ai=add_formatted_prompts(filtered_dataset, tokenizer, model_name=model_id, shots_count=1)
len(filtered_dataset_ai)

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

20

In [56]:
#Important: filter on category to geht category specific shots

data_name="Forum"
filtered_dataset = merged_ds.filter(lambda example: example["CATEGORY"] == data_name)
filtered_dataset_f=add_formatted_prompts(filtered_dataset, tokenizer, model_name=model_id, shots_count=1)
len(filtered_dataset_f)

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

20

In [57]:
#filtered_dataset_f.to_pandas()[["ID","CATEGORY","LABEL","PROBLEM", "EXTRACTED_ANSWER"]].to_excel("forum.xlsx")

## TEST Einzelner Beispiele

In [None]:
filtered_dataset[i]

{'ID': 'clean-forum-3',
 'CATEGORY': 'Forum',
 'PROBLEM': 'I am trying to evaluate the indefinite integral\\n$$\\\\int\\\\frac{\\\\sqrt{x^4+x}}{2x^3+1}\\\\mathrm dx$$\\nwhich has an elementary closed form. Can you help me?\\n\\n',
 'LABEL': 'Clean',
 'SOLUTION': '$I = \\\\int\\\\frac{\\\\sqrt{x^4+x}}{2x^3+1} \\\\, \\\\mathrm{d}x$\\n\\nusing $x\\\\to \\\\frac{1}{t}$\\n\\n$I=-\\\\int\\\\frac{\\\\sqrt{t^3+1}}{t(t^3+2)} \\\\, \\\\mathrm{d}t$\\n\\nLet $t^3=z$\\n\\n$I=-\\\\frac{1}{3}\\\\int \\\\frac{\\\\sqrt{z+1}}{z(z+2)} \\\\, \\\\mathrm{d}z$\\n\\nnow it is easily solvable by $u=\\\\sqrt{z+1}$ followed by partial fraction decomposition which finally gives us\\n\\n\\\\fbox{$I =\\\\frac{\\\\ln\\\\left(\\\\sqrt{z + 1} + 1\\\\right)-\\\\ln\\\\left(\\\\left|\\\\sqrt{z + 1} - 1\\\\right|\\\\right)   - 2 \\\\arctan\\\\left(\\\\sqrt{z + 1}\\\\right)}{6}+C$}\\n\\nwhere $z=\\\\frac{1}{x^3}$.',
 'LABEL_BINARY': 0,
 'EXTRACTED_ANSWER': '\\\\frac{\\\\ln\\\\left(\\\\sqrt{z + 1} + 1\\\\right)-\\\\ln\\\\le

In [None]:
config_file = model.generation_config
config_file.max_new_tokens = 2000
config_file.temperature = None
config_file.do_sample = False

In [None]:
for row in filtered_dataset:
  prompt=row["formatted_prompt"]

  print(row)
  break

In [None]:
#TEST
i+=1
#i=10
formatted_prompt = filtered_dataset[i]["formatted_prompt"]
print("-------Prompt--------")
print(formatted_prompt)


inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")



# Generate
start_time = datetime.now()
outputs = model.generate(**inputs, generation_config=config_file)
end_time = datetime.now()

# Get the input IDs and their length
input_ids = inputs['input_ids']
input_length = input_ids.shape[1]

# Slice the output to get only the new tokens
new_tokens = outputs[:, input_length:]
generated_text = tokenizer.batch_decode(new_tokens)
print("-------LLM Response--------")
print(generated_text[0])

print("-------Ground Truth Answer--------")
print(filtered_dataset[i]["EXTRACTED_ANSWER"])
print(filtered_dataset[i]["SOLUTION"])

### Prepare Input GENERAL:

1. **(opt) apply_chat_template()**:
Traditionell, muss ein Chat-format für das Modell verständlich sein, ansonsten würde er einfach versuchen, den Satz bestmöglich zu beenden. (Language Model after all).
Üblicherweise verwendet man daher ein Chat-Template. Chat Verläufe sind in der From anzugeben: In einer Liste von dicts. Jedes dieser dicts hat ein key "`role`" (with values `system` oder `user` oder `assist`) und ein key "`content`" (mit text). Hier ein Beispiel:
```
chat1= [
  {"role":"system|user|assist", "content":"blablabla"},
  {...},
  ...}
  ]
tokenizer.apply_chat_template(chat1,tokenize=False,add_generation_prompt=True)
```
Diese werden dann mit einem tokenize-spezifischen (und daher model spezifischen) "apply_chat_template()" umgewandelt.

2. **add instructions**: To elicit CoT (for example by adding "lets think step by step" or similar or having the model to conclude with the final answer in a certain format.
Example:
  ```
  problem = "What is 2+2?"
  gsm8k_prompt_format = """
  I have a math probelm. Please provide your answer to that problem in the following format:
  1. Start with the problem statement.
  2. Present the detailed calculations and reasoning in line with the problem.
  3. Use double angle brackets '<< >>' for intermediate calculations within the explanation.
  4. Conclude with the final answer preceded by '####'. For example: The final answer is #### 56.
  Here is the problem:
  {problem}"""
  gsm8k_prompt_format.format(problem=problem)
  ```

3. **tokenize the input**: It should be clear that before using ```model.generate()```we need to tokenize the input with a tokenizer. Make sure to use return tensor format, **inputs (so that input_ids as well as attention_mask is transmitted), and have the model as well as the tensor on the same device.

4. **generation_config**: modify the generation config file with the GenerationConfig_Class, which can be used to store all parameters. Default or special ones (dependig of printing the class or converting it in a dictionary first).



In [None]:
#old bzw general ok
empty_format= """{question}"""

zeroshot_cot_prompt_format = """
I have a math probelm. Please provide your answer to that problem in the following format:
1. Present the detailed calculations and reasoning in line with the problem. Think step by step.
2. Conclude with the final answer preceded by '####'. For example: The final answer is #### 56.
Here is the problem:
{question}"""

zeroshot_cot_prompt_format_deepseek = """ {question}\nPlease reason step by step, and put your final answer within \boxed""" #for dps it was mentioned that this is the format we should use. Furthermore no systemprompt should be used.

def generate_chat_structure_system(formatted_user_prompt):
    chat_structure = [
      {"role": "system", "content": "For this conversation, follow carefully the instruction of the user."},
      {"role": "user", "content": formatted_user_prompt}
    ]
    return chat_structure

def generate_chat_structure_wo_system(formatted_user_prompt):
    chat_structure = [
      {"role": "user", "content": formatted_user_prompt}
    ]
    return chat_structure

def generate_fewshot_cot_chat(shots, final_question, system_prompt=None):
    """
    shots: list of tuples -> [(question1, answer1), (question2, answer2), ...]
    final_question: str
    system_prompt: str or None
    """
    chat = []
    if system_prompt:
        chat.append({"role": "system", "content": system_prompt})

    for q, a in shots:
        chat.append({"role": "user", "content": q})
        chat.append({"role": "assistant", "content": a})

    chat.append({"role": "user", "content": final_question})
    return chat



In [None]:
shots = [
    ("What is 13 + 24? Let's think step by step.", "First, add 10 and 20 to get 30. Then add 3 and 4 to get 7. 30 + 7 = 37. #### 37"),
    ("If a pen costs $2 and you buy 3, how much is it? Let's think step by step.", "2 * 3 = 6. So the total is $6. #### 6"),
    ("What's the square of 9? Let's think step by step.", "9 * 9 = 81. So the answer is 81. #### 81"),
]
final_question = "What is 47 * 2?" + "Let's think step by step."


In [None]:
chat=generate_fewshot_cot_chat(shots, final_question, "You are a helpful assistant.") #"You are a helpful assistant."
chat

[{'role': 'system', 'content': 'You are a helpful assistant.'},
 {'role': 'user', 'content': "What is 13 + 24? Let's think step by step."},
 {'role': 'assistant',
  'content': 'First, add 10 and 20 to get 30. Then add 3 and 4 to get 7. 30 + 7 = 37. #### 37'},
 {'role': 'user',
  'content': "If a pen costs $2 and you buy 3, how much is it? Let's think step by step."},
 {'role': 'assistant', 'content': '2 * 3 = 6. So the total is $6. #### 6'},
 {'role': 'user',
  'content': "What's the square of 9? Let's think step by step."},
 {'role': 'assistant', 'content': '9 * 9 = 81. So the answer is 81. #### 81'},
 {'role': 'user', 'content': "What is 47 * 2?Let's think step by step."}]

In [None]:
tokenizer.apply_chat_template(chat,tokenize=False,add_generation_prompt=True)

"<|endoftext|><|user|>\nWhat is 13 + 24? Let's think step by step.\n<|assistant|>\nFirst, add 10 and 20 to get 30. Then add 3 and 4 to get 7. 30 + 7 = 37. #### 37<|endoftext|>\n<|user|>\nIf a pen costs $2 and you buy 3, how much is it? Let's think step by step.\n<|assistant|>\n2 * 3 = 6. So the total is $6. #### 6<|endoftext|>\n<|user|>\nWhat's the square of 9? Let's think step by step.\n<|assistant|>\n9 * 9 = 81. So the answer is 81. #### 81<|endoftext|>\n<|user|>\nWhat is 47 * 2?Let's think step by step.\n<|assistant|>\n"

# Experimental RUN

## Parameters

In [21]:
#DEFINTIONS of Prompt Templates - only for metadata storage!
#If you want to change, you also have to change it in the function build_prompt_for_model()

prompt_template_ds="""
chat = []
for q, a in shots:
    shot_q = f"{q}\nPlease reason step by step, and put your final answer within \\boxed"
    chat.append({"role": "user", "content": shot_q})
    chat.append({"role": "assistant", "content": a})
final_q = f"{eval_question}\nPlease reason step by step, and put your final answer within \\boxed"
chat.append({"role": "user", "content": final_q})
if tokenizer:
    return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
return chat
"""

prompt_template_ol="""
system_prompt = "You are a helpful assistant."
chat = [{"role": "system", "content": system_prompt}]
for q, a in shots:
    shot_q = f"{q}\nPlease reason step by step, and put your final answer within \\boxed"
    chat.append({"role": "user", "content": shot_q})
    chat.append({"role": "assistant", "content": a})
final_q = f"{eval_question}\nPlease reason step by step, and put your final answer within \\boxed"
chat.append({"role": "user", "content": final_q})
if tokenizer:
    return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
return chat
"""

prompt_template_le="""
prompt_parts = []
for q, a in shots:
    prompt_parts.append(f"Q: {q} Let's think step by step.\nA: {a}")
final_q = f"Q: {eval_question} Let's think step by step.\nA:"
return "\n\n".join(prompt_parts + [final_q])
"""

## Local LLM Evaluation RUN

In [21]:
#Define some parameters
seed = 42
torch.manual_seed(seed)

#if dataset is too large
start_i = 1 #(default 1)
stop_i = None# inkl.
log_batch_size = None # Number of iterations before writing to file

#basic parameters

dataset_shortname = "MathCONTA_aime"
model_shortname = "OL"
ds=filtered_dataset_ai


#generation specific
prompt_template= prompt_template_ol

#generation_config
#... please use model.gneration_config.<attribute_name>
config_file = model.generation_config
config_file.max_new_tokens = 2000
config_file.temperature = None
config_file.do_sample = False



#Automaically
exp_id = str(uuid.uuid4()) # Generate a unique experiment ID
current_date = datetime.now().date()


# Define metadata as a nested dictionary
metadata = {
    "experiment_id": exp_id,
    "timestamp": format(datetime.now(), '%Y-%m-%dT%H:%M:%SZ'),
    "description": f"Evaluation of model {model_shortname} on {dataset_shortname}",
    "additional_info": {
        "colab": True,
        "cpu_gpu": "L4",
        "data": repo_id+"_"+split+"_"+config_name,
        "data_short": dataset_shortname,
        "split": "test",
        "rows": f"{start_i}-{stop_i}",
        "log_batch_size": log_batch_size,
        "prompt_template": prompt_template,
        "model": model_id,
        "model_short": model_shortname,
        "generation_parameter": model.generation_config.to_dict(),
        "seed": seed,
        "LORA": False
    }
}

In [20]:
# Create directory if it doesn't exist
DRIVE_PATH = Path('drive/MyDrive/Masterarbeit25/eval_data/MathCONTA_v1') / model_id
exp_dir = DRIVE_PATH
exp_dir.mkdir(parents=True, exist_ok=True)
# Track the number of iterations
iteration_count = 0
# List to hold data entries
data_entries = []

# Generate text for each row and log it
for i, row in enumerate(ds, start=1):
    if i >= start_i:
        data_entries.append(generate_and_log(row, model, tokenizer, config_file))

        # Increment the iteration count
        iteration_count += 1
        #print progress bar
        if i % 1 == 0:
          print(f"{i}/{len(ds)}")

        # Write to file every batch_size iterations
        if log_batch_size is not None and iteration_count % log_batch_size == 0:
            write_results_to_file(data_entries, exp_id, dataset_shortname, start_i, i, model_shortname, current_date)
            data_entries = []  # Clear the list after writing to file
            start_i = i + 1  # Update start_i for the next batch

    if i == stop_i:
        break

# Write any remaining results to file
if data_entries:
    write_results_to_file(data_entries, exp_dir, dataset_shortname, start_i, i, model_shortname, current_date)

print(f"All results have been logged to directory {exp_dir}")

1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
20/20
Results have been logged to output_MathCONTA_forum_1-20_OL_2025-04-17.json
All results have been logged to directory drive/MyDrive/Masterarbeit25/eval_data/MathCONTA_v1/allenai/OLMo-7B-0724-Instruct-hf


## Model Access via GPT4o API

In [86]:
import openai

from google.colab import userdata
api_key=userdata.get('open_AI_key')

In [83]:
#Define some parameters
seed = 42
torch.manual_seed(seed)

#if dataset is too large
start_i = 1 #(default 1)
stop_i = None# inkl.
log_batch_size = None # Number of iterations before writing to file

#basic parameters

dataset_shortname = "MathCONTA_forum"
model_shortname = "GPT4o"
ds=filtered_dataset_f


#generation specific
prompt_template= prompt_template_ol

#generation_config
config_file = {}

config_file["max_output_tokens"] = 2000   # for OpenAI API, it's 'max_tokens', not 'max_new_tokens'
config_file["temperature"] = 0.0   # None is NOT allowed by OpenAI, use 0.0 if you want no randomness




#Automaically
exp_id = str(uuid.uuid4()) # Generate a unique experiment ID
current_date = datetime.now().date()


# Define metadata as a nested dictionary
metadata = {
    "experiment_id": exp_id,
    "timestamp": format(datetime.now(), '%Y-%m-%dT%H:%M:%SZ'),
    "description": f"Evaluation of model {model_shortname} on {dataset_shortname}",
    "additional_info": {
        "colab": True,
        "cpu_gpu": "API",
        "data": repo_id+"_"+split+"_"+config_name,
        "data_short": dataset_shortname,
        "split": "test",
        "rows": f"{start_i}-{stop_i}",
        "log_batch_size": log_batch_size,
        "prompt_template": prompt_template,
        "model": model_id,
        "model_short": model_shortname,
        "generation_parameter": config_file,
        "seed": seed,
        "LORA": False
    }
}

In [85]:
# Create directory if it doesn't exist
DRIVE_PATH = Path('drive/MyDrive/Masterarbeit25/eval_data/MathCONTA_v1') / model_id
exp_dir = DRIVE_PATH
exp_dir.mkdir(parents=True, exist_ok=True)
# Track the number of iterations
iteration_count = 0
# List to hold data entries
data_entries = []

# Generate text for each row and log it
for i, row in enumerate(ds, start=1):
    if i >= start_i:
        data_entries.append(generate_and_log_api(row, model_id, api_key,config_file))

        # Increment the iteration count
        iteration_count += 1
        #print progress bar
        if i % 1 == 0:
          print(f"{i}/{len(ds)}")

        # Write to file every batch_size iterations
        if log_batch_size is not None and iteration_count % log_batch_size == 0:
            write_results_to_file(data_entries, exp_id, dataset_shortname, start_i, i, model_shortname, current_date)
            data_entries = []  # Clear the list after writing to file
            start_i = i + 1  # Update start_i for the next batch

    if i == stop_i:
        break

# Write any remaining results to file
if data_entries:
    write_results_to_file(data_entries, exp_dir, dataset_shortname, start_i, i, model_shortname, current_date)

print(f"All results have been logged to directory {exp_dir}")

1/20
2/20
3/20
4/20
5/20
6/20
7/20
8/20
9/20
10/20
11/20
12/20
13/20
14/20
15/20
16/20
17/20
18/20
19/20
20/20
Results have been logged to output_MathCONTA_forum_1-20_GPT4o_2025-04-18.json
All results have been logged to directory drive/MyDrive/Masterarbeit25/eval_data/MathCONTA_v1/gpt-4o-2024-11-20
