In [1]:
%load_ext autoreload
%autoreload 2

In [28]:
import os.path

from datasets import load_dataset, load_from_disk
from dotenv import load_dotenv

load_dotenv("../.env")
# Load GSM8k Dataset
dataset_name = "LFrancis/GSM8k-NoOp-Plus"
baseline_dataset_name = "openai/gsm8k"
# Configuration of the perturbed dataset that is evaluated in this run.
subset = "main_scramble"
# The baseline dataset has its own configuration names; the perturbed subset
# name (e.g. "main_scramble") is not one of them, so it must not be reused.
baseline_subset = "main"
dataset = load_dataset(dataset_name, subset)["train"]

# VLLM API Configuration
BASE_URL = "http://134.76.18.30:8082/v1/chat/completions"
_vllm_api_key = os.getenv("VLLM_API_KEY")
if _vllm_api_key is None:
    # Fail with an actionable message instead of "can only concatenate str (not NoneType)".
    raise RuntimeError("VLLM_API_KEY is not set; define it in ../.env or in the environment.")
HEADERS = {"Content-Type": "application/json", "Authorization": "Bearer " + _vllm_api_key}
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

# On-disk locations of the (partially) evaluated datasets; they double as resume checkpoints.
EVALUATED_MODEL_PATH = dataset_name + "_" + subset + "_evaluated_" + MODEL_NAME
BASELINE_MODEL_PATH = baseline_dataset_name + "_evaluated_" + MODEL_NAME

# Seed the checkpoints once; later runs load and overwrite them.
if not os.path.exists(EVALUATED_MODEL_PATH):
    dataset.save_to_disk(EVALUATED_MODEL_PATH)
if not os.path.exists(BASELINE_MODEL_PATH):
    baseline_dataset = load_dataset(baseline_dataset_name, baseline_subset)["test"]
    baseline_dataset.save_to_disk(BASELINE_MODEL_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/1319 [00:00<?, ? examples/s]

In [29]:
import re
from datasets import Dataset
import requests


# Helper Functions
def create_chat_messages(question, sys_msg):
    """
    Build the two-message conversation (system, then user) sent to the chat model.

    The user message is the question followed by a blank line and a
    "Calculations:" cue.
    """
    contents_by_role = (
        ("system", sys_msg),
        ("user", f"{question}\n" + "\nCalculations:"),
    )
    return [{"role": role, "content": text} for role, text in contents_by_role]


def query_vllm_api(payload):
    """
    POST ``payload`` as JSON to the VLLM endpoint and return the decoded JSON reply.

    The request uses the module-level BASE_URL and HEADERS and a 120 second
    timeout. Unsuccessful HTTP status codes are raised as exceptions.
    """
    request_options = {"json": payload, "headers": HEADERS, "timeout": 120}
    reply = requests.post(BASE_URL, **request_options)
    # Turn HTTP-level failures into exceptions before trying to decode the body.
    reply.raise_for_status()
    return reply.json()


def evaluate_question(entry):
    """
    Ask the model for a step-by-step solution to ``entry["question"]``.

    The stripped response text is stored under ``generated_cot`` and the answer
    that ``extract_answer`` derives from it under ``generated_answer``. The same
    (mutated) entry is returned.

    Raises:
        Exception: if the API reply is an error object.
    """
    sys_msg = "The following are math questions. Think step by step. State your final answer at the end of your calculations."

    # Greedy decoding (temperature 0.0) with room for a long chain of thought.
    cot_payload = {
        "model": MODEL_NAME,
        "messages": create_chat_messages(entry["question"], sys_msg),
        "max_tokens": 2000,
        "temperature": 0.0,
    }
    cot_response = query_vllm_api(cot_payload)

    # An error can be reported inside the JSON body rather than via the status code.
    if cot_response.get("object") == "error":
        raise Exception(cot_response["message"])

    first_choice = cot_response["choices"][0]
    cot_text = first_choice["message"]["content"].strip()

    entry["generated_answer"] = extract_answer(cot_text)
    entry["generated_cot"] = cot_text
    return entry


def extract_answer(gen_answer):
    """
    Extract the last number mentioned in a text as a normalised string.

    Thousands separators are removed first ("5,000" -> "5000") and the number is
    passed through ``float`` so that "3" and "3.0" both become "3.0".

    Args:
        gen_answer (str): Model response or reference solution text.

    Returns:
        str: The last number formatted as ``str(float(...))``, or "" when the
        text contains no number at all.
    """
    # Remove commas so for example 5,000 becomes 5000
    model_resp = gen_answer.replace(",", "")
    numbers = re.findall(r"-?\d+\.?\d*", model_resp)
    if not numbers:
        # "" is the "no answer" marker used elsewhere in this notebook; returning it
        # avoids an IndexError that would discard an otherwise valid response text.
        return ""
    # Use float to ensure 3.0 and 3 are the same.
    return str(float(numbers[-1]))


def is_correct(entry):
    """
    Check whether the generated answer matches the reference answer.

    Args:
        entry (dict): Must provide 'generated_answer' (an already normalised
            answer string, "" when no answer is available) and 'answer' (the
            reference solution text, from which the answer is extracted).

    Returns:
        bool: True if the answer extracted from 'answer' equals
        'generated_answer'; False otherwise, including for an empty
        'generated_answer'.
    """
    predicted = entry['generated_answer']
    # An empty prediction is never correct, so the reference is not even parsed.
    return predicted != "" and extract_answer(entry['answer']) == predicted


def process_dataset(dataset: Dataset, numproc=1):
    """
    Evaluate every entry of ``dataset`` through ``Dataset.map``.

    Entries that already have a non-empty ``generated_answer`` are passed
    through untouched, which makes the run resumable. Entries whose evaluation
    raises get empty ``generated_answer``/``generated_cot`` values so they are
    picked up again on the next run.
    """

    def process_entry(entry):
        already_answered = "generated_answer" in entry and entry["generated_answer"] != ""
        if already_answered:
            return entry
        try:
            outcome = evaluate_question(entry)
        except Exception as e:
            # Best effort: log the failure and mark the entry as unanswered.
            print(f"Error processing entry: {entry}, Exception: {e}")
            entry["generated_answer"] = ""
            entry["generated_cot"] = ""
            outcome = entry
        return outcome

    return dataset.map(process_entry, with_indices=False, num_proc=numproc)

In [30]:
def update_dataset(dataset, is_baseline=False):
    """
    Replace the on-disk copy of a dataset with ``dataset``.

    The data is first written to a temporary directory next to the target and
    only then swapped in, so the previous copy is removed only after the new
    one has been written.

    Args:
        dataset: Object providing ``save_to_disk(path)``.
        is_baseline (bool): If true, BASELINE_MODEL_PATH is replaced,
            otherwise EVALUATED_MODEL_PATH.
    """
    import os
    import shutil

    original_path = BASELINE_MODEL_PATH if is_baseline else EVALUATED_MODEL_PATH

    # Stage next to the target rather than in a shared "temp" directory in the
    # working directory, and clear leftovers from an interrupted earlier run so
    # stale files cannot end up in the new copy.
    temp_path = original_path.rstrip("/\\") + ".tmp"
    shutil.rmtree(temp_path, ignore_errors=True)
    dataset.save_to_disk(temp_path)

    # Remove the old dataset (if there is one) and replace it with the new one
    if os.path.exists(original_path):
        shutil.rmtree(original_path)
    shutil.move(temp_path, original_path)

In [33]:
def main(is_continue=False, is_baseline=False, numproc=1):
    """
    Evaluate one dataset and write the results back to disk.

    Source selection: the saved baseline dataset if ``is_baseline`` is set,
    else the saved evaluated dataset if ``is_continue`` is set, else the
    in-memory ``dataset``. ``numproc`` is forwarded to ``process_dataset``.
    """
    if is_baseline:
        source_path = BASELINE_MODEL_PATH
    elif is_continue:
        source_path = EVALUATED_MODEL_PATH
    else:
        source_path = None

    to_evaluate = dataset if source_path is None else load_from_disk(source_path)

    # Evaluate, then persist the result over the matching on-disk dataset.
    update_dataset(process_dataset(to_evaluate, numproc), is_baseline)


# Run the evaluation: resume the saved baseline dataset with 200 worker processes.
main(is_continue=True, is_baseline=True, numproc=200)

Map (num_proc=200):   0%|          | 0/1319 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1319 [00:00<?, ? examples/s]

In [34]:
from converter.converter import save_value_to_json

# Score every evaluated variant of the dataset that exists on disk.
# Loop-local names are used on purpose: the notebook-level `subset` and
# EVALUATED_MODEL_PATH are configuration read by other cells and by
# main()/update_dataset(), so this cell must not overwrite them.
for variant in ["addition", "lexicon", "syntax", "", "naive", "typo", "scramble"]:
    if variant == "":
        # The unperturbed "main" configuration is reported as "our_baseline".
        variant_path = dataset_name + "_main" + "_evaluated_" + MODEL_NAME
        result_key = "our_baseline"
    else:
        variant_path = dataset_name + "_main_" + variant + "_evaluated_" + MODEL_NAME
        result_key = variant
    if os.path.exists(variant_path):
        selected_dataset = load_from_disk(variant_path)
        print(variant_path)
        print(selected_dataset)
        score = [is_correct(result) for result in selected_dataset]
        score = sum(score) / len(score)
        save_value_to_json(result_key, score, MODEL_NAME)
        print(result_key, "accuracy", score)
    else:
        print("skipping", variant_path)

LFrancis/GSM8k-NoOp-Plus_main_addition_evaluated_meta-llama/Meta-Llama-3-8B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
addition accuracy 0.6618650492797574
LFrancis/GSM8k-NoOp-Plus_main_lexicon_evaluated_meta-llama/Meta-Llama-3-8B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
lexicon accuracy 0.6391205458680819
LFrancis/GSM8k-NoOp-Plus_main_syntax_evaluated_meta-llama/Meta-Llama-3-8B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
syntax accuracy 0.7065959059893859
LFrancis/GSM8k-NoOp-Plus_main_evaluated_meta-llama/Meta-Llama-3-8B-Instruct
Dataset({
    features: ['question', 'answer', 'generated_answer', 'generated_cot'],
    num_rows: 1319
})
our_baseline accuracy 0.7505686125852918
LFrancis/GSM8k-NoOp-Plus_main_naive_evaluated_meta-llama/Meta-Llama-3-8B-Instruct
Dataset({
   

In [35]:
# Accuracy on the saved baseline dataset: fraction of entries judged correct.
baseline_dataset = load_from_disk(BASELINE_MODEL_PATH)
baseline_hits = [is_correct(result) for result in baseline_dataset]
score = sum(baseline_hits) / len(baseline_hits)
save_value_to_json("baseline", score, MODEL_NAME)
print("Baseline Accuracy", score)

Baseline Accuracy 0.7513267626990144


In [68]:
# Inspect the generated chain of thought for the first entry of the "addition" variant.
addition_path = dataset_name + "_main_addition_evaluated_" + MODEL_NAME
selected_dataset = load_from_disk(addition_path)
selected_dataset[0]["generated_cot"]

"To find out how much Janet makes every day at the farmers' market, we need to calculate the number of eggs she has left after eating and baking, and then multiply that number by the price she sells each egg for.\n\n1. Calculate the total number of eggs laid by the ducks per day: \n   Total eggs per day = 16\n\n2. Calculate the number of eggs Janet eats for breakfast:\n   Eggs eaten for breakfast = 3\n\n3. Calculate the number of eggs Janet bakes for her friends:\n   Eggs baked for friends = 4\n\n4. Calculate the total number of eggs Janet uses:\n   Total eggs used = Eggs eaten for breakfast + Eggs baked for friends\n   Total eggs used = 3 + 4\n   Total eggs used = 7\n\n5. Calculate the number of eggs Janet has left:\n   Eggs left = Total eggs per day - Total eggs used\n   Eggs left = 16 - 7\n   Eggs left = 9\n\n6. Calculate the amount Janet makes by selling the eggs:\n   Amount made = Eggs left * Price per egg\n   Amount made = 9 * $2\n   Amount made = $18\n\nTherefore, Janet makes $1

In [69]:
# Display the question text of the first entry in the baseline dataset.
baseline_dataset[0]["question"]

"Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"