In [13]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local `converter` package) are re-imported before each cell runs
# and edits to their source take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [14]:
import os.path

from datasets import load_dataset, load_from_disk
from dotenv import load_dotenv

# Read environment variables (notably VLLM_API_KEY) from the project-level .env file.
load_dotenv("../.env")
# Load GSM8k Dataset
dataset_name = "LFrancis/GSM8k-NoOp-Plus"
baseline_dataset_name = "openai/gsm8k"
# Perturbation config of the NoOp-Plus dataset evaluated in this run.
subset = "main_typo_add_random"
# The upstream GSM8k dataset does not publish the perturbation configs above;
# the unperturbed baseline lives in its "main" config.
baseline_subset = "main"
dataset = load_dataset(dataset_name, subset)["train"]

# VLLM API Configuration
BASE_URL = "http://134.76.18.30:8081/v1/chat/completions"
# Fail with an explicit message instead of an opaque
# `TypeError: can only concatenate str (not "NoneType") to str`.
_vllm_api_key = os.getenv("VLLM_API_KEY")
if not _vllm_api_key:
    raise RuntimeError("VLLM_API_KEY is not set; define it in ../.env or in the environment.")
HEADERS = {"Content-Type": "application/json", "Authorization": "Bearer " + _vllm_api_key}
MODEL_NAME = "meta-llama/Llama-3.3-70B-Instruct"

# On-disk locations of the (partially) evaluated datasets. Both names contain
# "/" and therefore resolve to nested directories relative to the working dir.
EVALUATED_MODEL_PATH = dataset_name + "_" + subset + "_evaluated_" + MODEL_NAME
BASELINE_MODEL_PATH = baseline_dataset_name + "_evaluated_" + MODEL_NAME

# Seed the on-disk copies once; later runs resume from whatever is stored there.
if not os.path.exists(EVALUATED_MODEL_PATH):
    dataset.save_to_disk(EVALUATED_MODEL_PATH)
if not os.path.exists(BASELINE_MODEL_PATH):
    baseline_dataset = load_dataset(baseline_dataset_name, baseline_subset)["test"]
    baseline_dataset.save_to_disk(BASELINE_MODEL_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/1319 [00:00<?, ? examples/s]

In [15]:
import re
from datasets import Dataset
import requests


# Helper Functions
def create_chat_messages(question, sys_msg):
    """
    Build the two-turn chat payload sent to the model.

    Args:
        question: The problem text; it is interpolated into the user turn.
        sys_msg: Instruction text used verbatim as the system turn.

    Returns:
        list[dict]: The system message followed by the user message, where the
        user content is the question, a blank line, and a "Calculations:" cue.
    """
    system_turn = {"role": "system", "content": sys_msg}
    user_turn = {"role": "user", "content": f"{question}\n\nCalculations:"}
    return [system_turn, user_turn]


def query_vllm_api(payload):
    """
    POST a JSON payload to the endpoint at BASE_URL and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.

    Returns:
        The response body parsed from JSON.

    Raises:
        Whatever `raise_for_status()` raises for an unsuccessful HTTP status,
        plus any error `requests.post` raises itself (the request uses a
        120-second timeout).
    """
    http_reply = requests.post(
        BASE_URL,
        json=payload,
        headers=HEADERS,
        timeout=120,
    )
    # Surface HTTP-level failures before attempting to decode the body.
    http_reply.raise_for_status()
    return http_reply.json()


def evaluate_question(entry):
    """
    Ask the model to solve one question and record its reasoning and answer.

    Args:
        entry: Mapping with at least a "question" key. It is modified in place.

    Returns:
        The same `entry`, with "generated_cot" set to the stripped model reply
        and "generated_answer" set to the value `extract_answer` derives from it.

    Raises:
        Exception: If the API reply is an error object (its "message" is used
            as the exception text). Errors from the request itself or from
            answer extraction propagate unchanged; in every such case `entry`
            has not been modified yet.
    """
    sys_msg = "The following are math questions. Think step by step. State your final answer at the end of your calculations."

    # Deterministic decoding (temperature 0) with room for a long chain of thought.
    cot_payload = {
        "model": MODEL_NAME,
        "messages": create_chat_messages(entry["question"], sys_msg),
        "max_tokens": 2000,
        "temperature": 0.0,
    }
    cot_response = query_vllm_api(cot_payload)

    # The server may report a failure as an error object in the response body.
    if cot_response.get("object") == "error":
        raise Exception(cot_response["message"])

    first_choice = cot_response["choices"][0]
    cot_text = first_choice["message"]["content"].strip()

    # Extract before touching `entry` so a failed extraction leaves it unchanged.
    final_answer = extract_answer(cot_text)
    entry["generated_answer"] = final_answer
    entry["generated_cot"] = cot_text
    return entry


def extract_answer(gen_answer):
    """
    Extract the last number that appears in a piece of text.

    Thousands separators are dropped first ("5,000" -> "5000") and the number
    is normalised through `float`, so "3" and "3.0" both yield "3.0".

    Args:
        gen_answer (str): Text to scan, e.g. a model reply or a reference answer.

    Returns:
        str: The last number found, formatted as `str(float(...))`, or the
        empty string when the text contains no number. The empty string is the
        same "no answer" marker the rest of this notebook uses.
    """
    # Remove commas so for example 5,000 becomes 5000
    model_resp = gen_answer.replace(",", "")
    # Collect every (optionally negative, optionally decimal) number in order.
    numbers = re.findall(r"-?\d+\.?\d*", model_resp)
    if not numbers:
        # Previously this indexed an empty list and raised a bare IndexError.
        return ""
    # Use float to ensure 3.0 and 3 are the same.
    return str(float(numbers[-1]))


def is_correct(entry):
    """
    Check whether the stored model answer matches the reference answer.

    Args:
        entry (dict): Record with a 'generated_answer' (already normalised by
            `extract_answer`, or "" when no answer was produced) and an
            'answer' holding the reference solution text.

    Returns:
        bool: False when 'generated_answer' is the empty string; otherwise
        whether the last number in 'answer' equals 'generated_answer'.
    """
    predicted = entry['generated_answer']
    # An empty prediction marks a failed / missing generation and never counts.
    return predicted != "" and extract_answer(entry['answer']) == predicted


def process_dataset(dataset: Dataset, numproc=1):
    """
    Evaluate every not-yet-answered entry of `dataset` via `Dataset.map`.

    Args:
        dataset: Dataset whose rows are passed to `evaluate_question`.
        numproc: Forwarded to `Dataset.map` as `num_proc`.

    Returns:
        The mapped dataset. Rows that already carry a non-empty
        "generated_answer" are returned untouched; rows whose evaluation raises
        get empty "generated_answer" and "generated_cot" values.
    """

    def process_entry(entry):
        has_answer = "generated_answer" in entry and entry["generated_answer"] != ""
        if not has_answer:
            try:
                entry = evaluate_question(entry)
            except Exception as exc:
                # Best effort: log the failure and leave empty markers on the
                # row, which makes it eligible for another attempt later.
                print(f"Error processing entry: {entry}, Exception: {exc}")
                entry["generated_answer"] = ""
                entry["generated_cot"] = ""
        return entry

    return dataset.map(process_entry, with_indices=False, num_proc=numproc)

In [16]:
def update_dataset(dataset, is_baseline=False):
    """
    Persist `dataset` over the stored copy of the evaluated dataset.

    The dataset is first written to a scratch directory ("temp", relative to
    the working directory) and then moved into place, rather than being saved
    directly onto the directory it replaces.

    Args:
        dataset: Object exposing `save_to_disk(path)`.
        is_baseline: When truthy the target is BASELINE_MODEL_PATH, otherwise
            EVALUATED_MODEL_PATH.
    """
    import os
    import shutil

    # Save the updated dataset to a temporary location
    temp_path = "temp"
    dataset.save_to_disk(temp_path)

    original_path = BASELINE_MODEL_PATH if is_baseline else EVALUATED_MODEL_PATH

    if os.path.exists(original_path):
        # Remove the old dataset directory so the move below replaces it
        # instead of nesting "temp" inside it.
        shutil.rmtree(original_path)
    else:
        # First save for this target: the paths are nested (they contain "/"),
        # so make sure the parent directory exists before moving into it.
        parent_dir = os.path.dirname(original_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    shutil.move(temp_path, original_path)

In [18]:
def main(is_continue=False, is_baseline=False, numproc=1):
    """
    Evaluate one dataset end to end and store the result on disk.

    Args:
        is_continue: When true (and `is_baseline` is false), resume from the
            copy stored at EVALUATED_MODEL_PATH instead of the in-memory
            `dataset`.
        is_baseline: When true, evaluate the copy stored at
            BASELINE_MODEL_PATH; takes precedence over `is_continue`.
        numproc: Number of worker processes handed to `process_dataset`.
    """
    if is_baseline:
        pending = load_from_disk(BASELINE_MODEL_PATH)
    else:
        pending = load_from_disk(EVALUATED_MODEL_PATH) if is_continue else dataset

    # Query the model for every unanswered row, then write the result back.
    evaluated = process_dataset(pending, numproc)
    update_dataset(evaluated, is_baseline)


# Resume the evaluation of the perturbed dataset stored on disk, using 75 worker processes.
main(is_continue=True, is_baseline=False, numproc=75)

Map (num_proc=75):   0%|          | 0/1319 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1319 [00:00<?, ? examples/s]

In [19]:
from converter.converter import save_value_to_json

# Score every perturbation subset that has an evaluated copy on disk.
# The empty name stands for the unperturbed "main" config and is reported
# under the label "our_baseline".
for ss in [
    "addition", "lexicon", "syntax", "scramble", "naive",
    "typo_qwerty", 'typo_doubling', 'typo_deletion', 'typo_hold_down', 'typo_add_random',
    "our_baseline", "", "typo",
]:
    label = ss or "our_baseline"
    config_part = "_main" if ss == "" else "_main_" + ss
    model_path = dataset_name + config_part + "_evaluated_" + MODEL_NAME

    if not os.path.exists(model_path):
        print("skipping", model_path)
        continue

    selected_dataset = load_from_disk(model_path)
    print(model_path)
    print(selected_dataset)
    verdicts = [is_correct(result) for result in selected_dataset]
    # Accuracy is the fraction of rows whose extracted answer matches the reference.
    score = sum(verdicts) / len(verdicts)
    save_value_to_json(label, score, MODEL_NAME)
    print(label, "accuracy", score)

LFrancis/GSM8k-NoOp-Plus_main_addition_evaluated_meta-llama/Llama-3.3-70B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
addition accuracy 0.9211523881728583
LFrancis/GSM8k-NoOp-Plus_main_lexicon_evaluated_meta-llama/Llama-3.3-70B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
lexicon accuracy 0.8218347232752085
LFrancis/GSM8k-NoOp-Plus_main_syntax_evaluated_meta-llama/Llama-3.3-70B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
syntax accuracy 0.9264594389689158
LFrancis/GSM8k-NoOp-Plus_main_scramble_evaluated_meta-llama/Llama-3.3-70B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
scramble accuracy 0.8908263836239575
LFrancis/GSM8k-NoOp-Plus_main_naive_evaluated_meta-llama/Llama-3.3-70B-Instruct
Dataset({
    feat

In [49]:
# Accuracy on the stored baseline dataset: share of rows judged correct.
baseline_dataset = load_from_disk(BASELINE_MODEL_PATH)
baseline_hits = sum(is_correct(result) for result in baseline_dataset)
score = baseline_hits / len(baseline_dataset)
save_value_to_json("baseline", score, MODEL_NAME)
print("Baseline Accuracy", score)

Baseline Accuracy 0.9575435936315391


In [50]:
# Spot-check: show the generated reasoning for the first row of the "addition" subset.
addition_path = dataset_name + "_main_addition_evaluated_" + MODEL_NAME
selected_dataset = load_from_disk(addition_path)
selected_dataset[0]["generated_cot"]

"To find out how much Janet makes every day at the farmers' market, we first need to determine how many eggs she sells. \n\n1. Janet's ducks lay 16 eggs per day.\n2. She eats 3 eggs for breakfast.\n3. She uses 4 eggs to bake muffins.\n\nTotal eggs used = 3 (for breakfast) + 4 (for muffins) = 7 eggs\n\n4. To find the number of eggs she sells, subtract the total eggs used from the total eggs laid:\n   Eggs sold = Total eggs laid - Total eggs used\n             = 16 - 7\n             = 9 eggs\n\n5. Since she sells each egg for $2, her daily earnings from the farmers' market can be calculated as follows:\n   Daily earnings = Number of eggs sold * Price per egg\n                  = 9 * $2\n                  = $18\n\nFinal answer: $18"

In [51]:
# Show the first question of the baseline dataset, for comparison with the
# generated reasoning displayed for the "addition" subset.
baseline_dataset[0]["question"]

"Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"