In [1]:
def get_subsets(dataset_name: str) -> list[str]:
    import requests
    headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"}
    API_URL = f"https://datasets-server.huggingface.co/splits?dataset={dataset_name}"
    data = requests.get(API_URL, headers=headers).json()
    return [subset["config"] for subset in data["splits"]]

In [2]:
choice_subsets = [
    "boolean_expressions",
    "causal_judgement",
    "date_understanding",
    "disambiguation_qa",
    "formal_fallacies",
    "geometric_shapes",
    "hyperbaton",
    "logical_deduction_five_objects",
    "logical_deduction_seven_objects",
    "logical_deduction_three_objects",
    "movie_recommendation",
    "navigate",
    "penguins_in_a_table",
    "reasoning_about_colored_objects",
    "ruin_names",
    "salient_translation_error_detection",
    "salient_translation_error_detection",
    "snarks",
    "sports_understanding",
    "temporal_sequences",
    "tracking_shuffled_objects_five_objects",
    "tracking_shuffled_objects_seven_objects",
    "tracking_shuffled_objects_three_objects",
    "web_of_lies"
]
# Subsets that do not follow the choice paradigm are ignored
other_subsets = [
    "dyck_languages",
    "multistep_arithmetic_two",
    "object_counting",
    "word_sorting"
]

In [3]:
from datasets import concatenate_datasets


def preprocess(dataset_name, subset):
    dataset = load_dataset(dataset_name, subset)["train"]
    choices = list(set(dataset['target']))
    dataset = dataset.add_column("choices", [choices] * len(dataset))
    dataset = dataset.add_column("subset", [subset] * len(dataset))
    return dataset


def create_dataset_from_subsets(split, dataset_name):
    # split is noop subset, like lexicon, syntax etc
    datasets = [preprocess(dataset_name, subset + (("_" + split) if not split == "" else "")) for subset in
                choice_subsets]

    return concatenate_datasets(datasets)

In [4]:
import os.path

from datasets import load_dataset, load_from_disk
from dotenv import load_dotenv

load_dotenv("../.env")
# Load GSM8k Dataset
dataset_name = "LFrancis/BBH-NoOp-Plus"
baseline_dataset_name = "maveriq/bigbenchhard"
subset = "scramble"
dataset = create_dataset_from_subsets(subset, dataset_name)

# VLLM API Configuration
BASE_URL = "http://134.76.18.30:8081/v1/chat/completions"
HEADERS = {"Content-Type": "application/json", "Authorization": "Bearer " + os.getenv("VLLM_API_KEY")}
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

EVALUATED_MODEL_PATH = dataset_name + "_" + subset + "_evaluated_" + MODEL_NAME
BASELINE_MODEL_PATH = baseline_dataset_name + "_evaluated_" + MODEL_NAME
if not os.path.exists(EVALUATED_MODEL_PATH):
    dataset.save_to_disk(EVALUATED_MODEL_PATH)
if not os.path.exists(BASELINE_MODEL_PATH):
    baseline_dataset = create_dataset_from_subsets("", baseline_dataset_name)
    baseline_dataset.save_to_disk(BASELINE_MODEL_PATH)

In [None]:
for subset in get_subsets(baseline_dataset_name):
    if subset not in choice_subsets and subset not in other_subsets:
        print(subset)  # Check if all subsets are accounted for

In [5]:
from datasets import Dataset
import requests


# Helper Functions
def create_chat_messages(question, sys_msg):
    """
    Create a formatted list of chat messages for the chat model.
    """
    user_prompt = f"Q: ${question}\nA: Let's think step by step."
    return [
        {"role": "system", "content": sys_msg},
        {"role": "user", "content": user_prompt}
    ]


def query_vllm_api(payload):
    """
    Send a query to the VLLM API and return the response.
    """
    response = requests.post(BASE_URL, json=payload, headers=HEADERS, timeout=120)
    response.raise_for_status()  # Raise an error for HTTP issues
    return response.json()


def evaluate_question(entry):
    # Step 1: Generate reasoning (CoT) response
    sys_msg = ""
    question = entry["input"]
    messages = create_chat_messages(question, sys_msg)

    cot_payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 200,
        "temperature": 0.0,
    }

    cot_response = query_vllm_api(cot_payload)
    if "object" in cot_response.keys() and cot_response["object"] == "error":
        raise Exception(cot_response["message"])

    cot_text = cot_response["choices"][0]["message"]["content"].strip()  # Extract CoT reasoning

    # Step 2: Calculate logprobs for each choice
    final_prompt = f"Only output your answer, no other explanation or addition. the answer will be copy/pasted as is. Only output one of the following choices as the final Answer:\nChoices: {entry['choices']}\nFinal Answer: "
    choice_messages = [
        *messages,
        {"role": "system", "content": cot_text},
        {"role": "user", "content": final_prompt}
    ]
    final_payload = {
        "model": MODEL_NAME,
        "messages": choice_messages,
        "max_tokens": 10,
        "temperature": 0.0,
        "stop": ["\n"],
    }
    final_response = query_vllm_api(final_payload)
    gen_answer = final_response["choices"][0]["message"]["content"].strip()
    entry["generated_answer"] = gen_answer
    entry["generated_cot"] = cot_text
    return entry


def is_correct(entry):
    """
    Determines if the choice with the lowest log probability corresponds to the correct answer.

    Args:
        entry (dict): A dictionary containing the question, choices, answer index, and logprobs.

    Returns:
        bool: True if the option with the lowest logprob matches the correct answer index, False otherwise.
    """
    # Extract logprobs and the correct answer index
    gen_answer = entry['generated_answer']
    if gen_answer is None:
        return False
    answer = entry['target']

    return answer == gen_answer


def process_dataset(dataset: Dataset, numproc=1):
    """
    Process the dataset using Dataset.map.
    """

    def process_entry(entry):
        if "generated_answer" in entry.keys() and entry["generated_answer"] is not None:
            return entry
        try:
            return evaluate_question(entry)
        except Exception as e:
            print(f"Error processing entry: {entry}, Exception: {e}")
            entry["generated_answer"] = None
            entry["generated_cot"] = None
            return entry

    return dataset.map(process_entry, with_indices=False, num_proc=numproc)

In [6]:
def update_dataset(dataset, is_baseline=False):
    # Save the updated dataset to a temporary location
    temp_path = "temp"
    dataset.save_to_disk(temp_path)

    # Overwrite the original dataset directory
    import shutil
    original_path = EVALUATED_MODEL_PATH if is_baseline == False else BASELINE_MODEL_PATH

    # Remove the old dataset and replace it with the new one
    shutil.rmtree(original_path)  # Remove the old dataset directory
    shutil.move(temp_path, original_path)

In [None]:
def main(is_continue=False, is_baseline=False, numproc=1):
    """
    Main function to evaluate the dataset asynchronously.
    """
    if is_baseline:
        selected_dataset = load_from_disk(BASELINE_MODEL_PATH)
    elif is_continue:
        selected_dataset = load_from_disk(EVALUATED_MODEL_PATH)
    else:
        selected_dataset = dataset
    # Process the dataset asynchronously
    processed_dataset = process_dataset(selected_dataset, numproc)

    # Save the updated dataset
    update_dataset(processed_dataset, is_baseline)


# Run the script
main(True, False, 100)

In [7]:
from converter.converter import save_value_to_json

for subset in ["addition", "lexicon", "syntax", "", "naive", "typo", "scramble"]:
    if subset == "":
        EVALUATED_MODEL_PATH = dataset_name + "_evaluated_" + MODEL_NAME
        subset = "our_baseline"
    else:
        EVALUATED_MODEL_PATH = dataset_name + "_" + subset + "_evaluated_" + MODEL_NAME
    if os.path.exists(EVALUATED_MODEL_PATH):
        selected_dataset = load_from_disk(EVALUATED_MODEL_PATH)
        print(selected_dataset)
        score = [is_correct(result) for result in selected_dataset]
        score = sum(score) / len(score)
        save_value_to_json(subset, score, MODEL_NAME)
        print(subset, "accuracy", score)
    else:
        print("skipping", EVALUATED_MODEL_PATH)

skipping LFrancis/BBH-NoOp-Plus_addition_evaluated_meta-llama/Llama-3.1-8B-Instruct
Dataset({
    features: ['input', 'target', 'choices', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 5761
})
lexicon accuracy 0.5551119597292137
Dataset({
    features: ['input', 'target', 'choices', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 5761
})
syntax accuracy 0.5653532372851935
skipping LFrancis/BBH-NoOp-Plus_evaluated_meta-llama/Llama-3.1-8B-Instruct
Dataset({
    features: ['input', 'target', 'choices', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 5761
})
naive accuracy 0.5926054504426315
Dataset({
    features: ['input', 'target', 'choices', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 5761
})
typo accuracy 0.49210206561360875
Dataset({
    features: ['input', 'target', 'choices', 'subset', 'generated_answer', 'generated_cot'],
    num_rows: 5761
})
scramble accuracy 0.4106925880923451


In [8]:
baseline_dataset = load_from_disk(BASELINE_MODEL_PATH)
score = [is_correct(result) for result in baseline_dataset]
score = sum(score) / len(score)
save_value_to_json("baseline", score, MODEL_NAME)
print("Baseline Accuracy", score)

Baseline Accuracy 0.616906787016143
