In [10]:
from datasets import concatenate_datasets


def preprocess(dataset_name, subset):
    dataset = load_dataset(dataset_name, subset)[subset]
    dataset = dataset.add_column("subset", [subset] * len(dataset))
    return dataset


def create_dataset_from_subsets(split, dataset_name):
    # split is noop subset, like lexicon, syntax etc
    datasets = [preprocess(dataset_name, subset + (("_" + split) if not split == "" else "")) for subset in
                ["murder_mysteries", "team_allocation", "object_placements"]]

    return concatenate_datasets(datasets)

In [15]:
import os.path

from datasets import load_dataset, load_from_disk
from dotenv import load_dotenv

load_dotenv("../.env")
# Load GSM8k Dataset
dataset_name = "LFrancis/MuSR-NoOp-Plus"
baseline_dataset_name = "TAUR-Lab/MuSR"
subset = ""
dataset = create_dataset_from_subsets(subset, dataset_name)

# VLLM API Configuration
BASE_URL = "http://134.76.18.30:8080/v1/chat/completions"
HEADERS = {"Content-Type": "application/json", "Authorization": "Bearer " + os.getenv("VLLM_API_KEY")}
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

EVALUATED_MODEL_PATH = dataset_name + "_" + subset + "_evaluated_" + MODEL_NAME
BASELINE_MODEL_PATH = baseline_dataset_name + "_evaluated_" + MODEL_NAME
if not os.path.exists(EVALUATED_MODEL_PATH):
    dataset.save_to_disk(EVALUATED_MODEL_PATH)
if not os.path.exists(BASELINE_MODEL_PATH):
    def p(dataset_name, subset):
        dataset = load_dataset(dataset_name)[subset]
        dataset = dataset.add_column("subset", [subset] * len(dataset))
        return dataset

    datasets = [p(baseline_dataset_name, subset) for subset in
                    ["murder_mysteries", "team_allocation", "object_placements"]]

    baseline_dataset = concatenate_datasets(datasets)
    baseline_dataset.save_to_disk(BASELINE_MODEL_PATH)

In [16]:
from datasets import Dataset
import requests


# Helper Functions
def create_chat_messages(narrative, question, sys_msg):
    """
    Create a formatted list of chat messages for the chat model.
    """
    user_prompt = f"Narrative: {narrative}\nQ: {question}\nA: Let's think step by step."
    return [
        {"role": "system", "content": sys_msg},
        {"role": "user", "content": user_prompt}
    ]


def query_vllm_api(payload):
    """
    Send a query to the VLLM API and return the response.
    """
    response = requests.post(BASE_URL, json=payload, headers=HEADERS, timeout=120)
    response.raise_for_status()  # Raise an error for HTTP issues
    return response.json()


def evaluate_question(entry):
    # Step 1: Generate reasoning (CoT) response
    sys_msg = ""
    question = entry["question"]
    narrative = entry["narrative"]
    messages = create_chat_messages(narrative, question, sys_msg)

    cot_payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 200,
        "temperature": 0.0,
    }

    cot_response = query_vllm_api(cot_payload)
    if "object" in cot_response.keys() and cot_response["object"] == "error":
        raise Exception(cot_response["message"])

    cot_text = cot_response["choices"][0]["message"]["content"].strip()  # Extract CoT reasoning

    # Step 2: Calculate logprobs for each choice
    final_prompt = f"Only output your answer, no other explanation or addition. the answer will be copy/pasted as is. Only output one of the following choices as the final Answer:\nQ: {question}\nChoices: {entry['choices']}\nFinal Answer: "
    choice_messages = [
        *messages,
        {"role": "system", "content": cot_text},
        {"role": "user", "content": final_prompt}
    ]
    final_payload = {
        "model": MODEL_NAME,
        "messages": choice_messages,
        "max_tokens": 10,
        "temperature": 0.0,
        "stop": ["\n"],
    }
    final_response = query_vllm_api(final_payload)
    gen_answer = final_response["choices"][0]["message"]["content"].strip()
    entry["generated_answer"] = gen_answer
    entry["generated_cot"] = cot_text
    return entry


def is_correct(entry):
    """
    Determines if the choice with the lowest log probability corresponds to the correct answer.

    Args:
        entry (dict): A dictionary containing the question, choices, answer index, and logprobs.

    Returns:
        bool: True if the option with the lowest logprob matches the correct answer index, False otherwise.
    """
    # Extract logprobs and the correct answer index
    gen_answer: str = entry['generated_answer']
    if gen_answer is None:
        return False
    answer: str = entry['answer_choice']

    return gen_answer.find(answer) != -1


def process_dataset(dataset: Dataset, numproc=1):
    """
    Process the dataset using Dataset.map.
    """

    def process_entry(entry):
        if "generated_answer" in entry.keys() and entry["generated_answer"] is not None:
            return entry
        try:
            return evaluate_question(entry)
        except Exception as e:
            print(f"Error processing entry: {entry}, Exception: {e}")
            entry["generated_answer"] = None
            entry["generated_cot"] = None
            return entry

    return dataset.map(process_entry, with_indices=False, num_proc=numproc)

In [17]:
def update_dataset(dataset, is_baseline=False):
    # Save the updated dataset to a temporary location
    temp_path = "temp"
    dataset.save_to_disk(temp_path)

    # Overwrite the original dataset directory
    import shutil
    original_path = EVALUATED_MODEL_PATH if is_baseline == False else BASELINE_MODEL_PATH

    # Remove the old dataset and replace it with the new one
    shutil.rmtree(original_path)  # Remove the old dataset directory
    shutil.move(temp_path, original_path)

In [20]:
def main(is_continue=False, is_baseline=False, numproc=1):
    """
    Main function to evaluate the dataset asynchronously.
    """
    if is_baseline:
        selected_dataset = load_from_disk(BASELINE_MODEL_PATH)
    elif is_continue:
        selected_dataset = load_from_disk(EVALUATED_MODEL_PATH)
    else:
        selected_dataset = dataset
    # Process the dataset asynchronously
    processed_dataset = process_dataset(selected_dataset, numproc)

    # Save the updated dataset
    update_dataset(processed_dataset, is_baseline)


# Run the script
main(True, False, 1)

Map:   0%|          | 0/756 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/756 [00:00<?, ? examples/s]

In [22]:
from converter.converter import save_value_to_json

for subset in ["addition", "lexicon", "syntax", "", "naive", "typo", "scramble"]:
    if subset == "":
        EVALUATED_MODEL_PATH = dataset_name + "__evaluated_" + MODEL_NAME
        subset = "our_baseline"
    else:
        EVALUATED_MODEL_PATH = dataset_name + "_" + subset + "_evaluated_" + MODEL_NAME
    if os.path.exists(EVALUATED_MODEL_PATH):
        selected_dataset = load_from_disk(EVALUATED_MODEL_PATH)
        print(selected_dataset)
        score = [is_correct(result) for result in selected_dataset]
        score = sum(score) / len(score)
        save_value_to_json(subset, score, MODEL_NAME)
        print(subset, "accuracy", score)
    else:
        print("skipping", EVALUATED_MODEL_PATH)

Dataset({
    features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 756
})
addition accuracy 0.3425925925925926
Dataset({
    features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 756
})
lexicon accuracy 0.3333333333333333
Dataset({
    features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 756
})
syntax accuracy 0.2962962962962963
Dataset({
    features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 756
})
our_baseline accuracy 0.3425925925925926
Dataset({
    features: ['narrative', 'question', 'choices', 'answer_index', 'answer_choice', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 756
})
naive accuracy 0.36507936507936506
D

In [19]:
selected_dataset[0]

{'narrative': 'In an aanledrine incuidng benuge jmpniug stie, Mcak\'s tihlrl-sieenkg atdurvene came to a goeusmre end by a nnhckuau; now, it\'s up to Dtticeeve Wotinsn to urneavl the ddelay stecres btweeen Mczkeinae and Ana.\n\nWtnsion took a glup of his blcak cfofee, stinarg at the neots swlpaerd aocsrs his desk. A mruder csae at a begnue jmnpiug stie was deitfniley out of the odrirany. Tdaoy\'s vitcim was a ynoug man named Mack, loud mheoutd and ccoky by all acctouns. \n\nMack was bugene jnpumig the day he was kellid. Odldy egnouh, aoccirdng to the rcdores, no one else was duoetncmed at the buenge jpnumig site taht day, making this csae eevn mroe puealicr. The first stop for the day was to vsiit one of Mcak\'s htsuaomees, a wamon naemd Ana. Tehy were seen liaevng in the smae vchleie from tiher srhead hiuosng cmloepx the mirnong of the muerdr, and it was time for Wtoinsn to dig depeer. \n\nAs he pllued itno the sehard hnsuoig dwaeivry, a nonpiedrsct car cmae itno sghit. He lreaned fro

In [20]:
baseline_dataset = load_from_disk(BASELINE_MODEL_PATH)
score = [is_correct(result) for result in baseline_dataset]
score = sum(score) / len(score)
save_value_to_json("baseline", score, MODEL_NAME)
print("Baseline Accuracy", score)

Baseline Accuracy 0.3505291005291005
