In [1]:
import collections
import json
import os
import re
import string
import sys

import numpy as np
import pandas as pd

In [32]:
def remove_articles(text):
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    return re.sub(regex, " ", text)

def white_space_fix(text):
    return " ".join(text.split())

def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)

def lower(text):
    return text.lower()

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))


def compute_f1_precision_recall(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return [int(gold_toks == pred_toks), int(gold_toks == pred_toks), int(gold_toks == pred_toks)]
    if num_same == 0:
        return [0, 0, 0]
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return [f1, precision, recall]

In [33]:

df = pd.read_csv("/scratch/ssd004/scratch/snajafi/checkpoints/llama3-squadv2.0_normal_icl/llama3.squad2.dev.results.csv")
data = df.to_dict(orient='records')
predictions = {}
for row in data:
    a_pred = row["potential_answer"]
    a_pred = a_pred.removeprefix("assistant\n\n")
    a_pred = a_pred.removeprefix("The shortest continuous text span from the passage that serves as an answer to the given question is:\n\n")
    a_pred = a_pred.removeprefix("The shortest continuous text span from the passage that serves as an answer to the question is:\n\n")
    a_pred = a_pred.removeprefix("The shortest continuous text span that serves as an answer to the given question is:\n\n")
    a_pred = a_pred.removeprefix("Based on the passage, the correct answer is")
    a_pred = a_pred.removeprefix("The correct answer is")
    a_pred = a_pred.removeprefix("According to the passage,")
    a_pred = a_pred.removeprefix(":")
    a_pred = a_pred.removeprefix("Here is the answer:")
    a_pred = a_pred.removeprefix("the correct answer is")
    a_pred = white_space_fix(a_pred)
    try:
        a_pred = a_pred.split("Final Answer_11: ")[1]
    except:
        try:
            a_pred = a_pred.split("Answer: ")[1]
        except:
            print(a_pred)
    if "<no_answer>" in a_pred:
        a_pred = "<no_answer>"
    
    predictions[row["row_id"]] = a_pred

France
"in the 10th and 11th centuries".
Denmark, Iceland, and Norway
Rollo
"...the distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century..." Therefore, the correct answer is: 10th century
"The Normans" This answer is justified by the passage, which states that the Normans were the people who gave their name to Normandy in the 10th and 11th centuries.
Rollo.
William the Conqueror.
Richard I of Normandy.
medieval Europe. The passage states that the Norman dynasty had a major political, cultural, and military impact on medieval Europe, which is a specific time period in history.
"They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language." This text span does not answer the question. The correct answer is not found in the passage. Therefore, the final answer is: <no_answer>
The passage does not explicitly state who assimi

In [47]:
from datasets import load_dataset

def compute_dev_results(predictions_dict):
    # Read gold-data
    dataset = load_dataset("rajpurkar/squad_v2", split="validation")
    exact_scores = {}
    f1_scores = {}
    precision_scores = {}
    recall_scores = {}
    for row in dataset:
        gold_answers = [a for a in row["answers"]["text"] if normalize_answer(a)]
        if not gold_answers:
            gold_answers = [""]
        a_pred = predictions_dict[row["id"]]
        if a_pred == "<no_answer>":
            a_pred = ""
        # Take max over all gold answers
        exact_scores[row["id"]] = max(compute_exact(a, a_pred) for a in gold_answers)
        f1_scores[row["id"]] = max(compute_f1_precision_recall(a, a_pred)[0] for a in gold_answers)
        precision_scores[row["id"]] = max(compute_f1_precision_recall(a, a_pred)[1] for a in gold_answers)
        recall_scores[row["id"]] = max(compute_f1_precision_recall(a, a_pred)[2] for a in gold_answers)

    total = len(exact_scores)
    scores = collections.OrderedDict(
                [
                    ("exact", 100.0 * sum(exact_scores.values()) / total),
                    ("f1", 100.0 * sum(f1_scores.values()) / total),
                    ("precision", 100.0 * sum(precision_scores.values()) / total),
                    ("recall", 100.0 * sum(recall_scores.values()) / total),
                    ("total", total),
                ]
        )
    return scores

In [48]:
compute_dev_results(predictions)

OrderedDict([('exact', 38.305398804009094),
             ('f1', 47.51433054699122),
             ('precision', 44.91873615966334),
             ('recall', 64.33414717639224),
             ('total', 11873)])

In [49]:
# with explanations
df = pd.read_csv("/scratch/ssd004/scratch/snajafi/checkpoints/llama3-squadv2.0_explanation_icl/llama3.squad2.dev.results.csv")
data = df.to_dict(orient='records')
explanation_predictions = {}
for row in data:
    a_pred = row["potential_answer"]
    try:
        a_pred = a_pred.removeprefix("assistant\n\n").split("Final Answer_11:")[1]
    except:
        try:
            a_pred = a_pred.removeprefix("assistant\n\n").split("Final Answer:")[1]
        except:
            try:
                a_pred = a_pred.removeprefix("assistant\n\n").split("Reasoning:")[1]
            except:
                print(a_pred)

    if "<no_answer>" in a_pred:
        a_pred = "<no_answer>"

    explanation_predictions[row["row_id"]] = a_pred

assistant

I apologize, but there is no passage 11 or question 11 provided. It seems that the passage and question numbers are mismatched. If you could provide the correct passage and question, I'd be happy to help you with the answer.
assistant

Based on the passage, we can identify the three factors that directly affect how powerful a Turing machine may or may not be:

1. **Time**: The passage mentions that when time is bounded, some Turing machines may be more powerful than others.
2. **Space**: The passage also mentions that when space is bounded, some Turing machines may be more powerful than others.
3. **Resources**: The passage states that when resources (such as time or space) are bounded, some Turing machines may be more powerful than others.

These three factors are directly mentioned in the passage as affecting the power of Turing machines.
assistant

I apologize, but there is no Passage_11. The passage numbers only go up to Passage_10. Therefore, it is not possible to answe

In [50]:
compute_dev_results(explanation_predictions)

OrderedDict([('exact', 61.2060978691148),
             ('f1', 67.58288726227273),
             ('precision', 66.93251948389496),
             ('recall', 70.59189028363731),
             ('total', 11873)])