In [2]:
import collections
import json
import os
import re
import string
import sys

import numpy as np
import pandas as pd

In [3]:
def normalize_answer(s):
    """Return a canonical form of ``s`` used when comparing answers.

    The text is lower-cased, every character found in ``string.punctuation``
    is dropped, the standalone words "a", "an" and "the" are each replaced by
    a space, and finally whitespace runs are collapsed to single spaces (which
    also trims both ends).

    Args:
        s: The answer string to normalize.

    Returns:
        The normalized string; it may be empty.
    """
    punctuation = frozenset(string.punctuation)
    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)

    lowered = s.lower()
    # Punctuation is stripped before articles so that e.g. "the," is seen as
    # the bare word "the" by the article pattern.
    unpunctuated = "".join(filter(lambda ch: ch not in punctuation, lowered))
    without_articles = article_pattern.sub(" ", unpunctuated)
    return " ".join(without_articles.split())


def get_tokens(s):
    """Split ``s`` into normalized, whitespace-delimited tokens.

    Falsy input (``None`` or ``""``) yields an empty list without being
    passed to ``normalize_answer``.
    """
    return normalize_answer(s).split() if s else []


def compute_exact(a_gold, a_pred):
    """Return 1 when both answers are identical after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0


def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both strings are tokenized with ``get_tokens``; the overlap is the
    multiset intersection of the two token lists.

    Returns:
        An ``int`` (0 or 1) when either side has no tokens -- 1 only if both
        are empty -- and the ``int`` 0 when no token is shared; otherwise the
        harmonic mean of precision and recall as a ``float``.
    """
    reference = get_tokens(a_gold)
    candidate = get_tokens(a_pred)

    # A side without tokens means "no answer": the score is all-or-nothing.
    if not reference or not candidate:
        return int(reference == candidate)

    shared = collections.Counter(reference) & collections.Counter(candidate)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(candidate)
    recall = 1.0 * overlap / len(reference)
    return (2 * precision * recall) / (precision + recall)

In [23]:
def get_raw_scores_gemini(preds):
    """Score Gemini predictions against the gold answers, question by question.

    Args:
        preds: Iterable of records, each providing ``row["answers"]["text"]``
            (the gold answer strings), ``row["gemini-1.0-pro-latest_answer"]``
            and ``row["id"]``.

    Returns:
        ``(exact_scores, f1_scores)``: two dicts keyed by question id holding
        the best exact-match and best F1 value over all gold answers. Records
        whose prediction is ``"<API_failed>"`` do not appear in either dict.
    """
    answer_column = "gemini-1.0-pro-latest_answer"
    exact_by_id, f1_by_id = {}, {}

    for record in preds:
        # Gold answers that normalize to nothing are dropped; a question left
        # with none is represented by a single empty gold answer.
        references = [
            text for text in record["answers"]["text"] if normalize_answer(text)
        ] or [""]

        prediction = record[answer_column]
        if prediction == "<API_failed>":
            continue
        if prediction == "<no_answer>":
            prediction = ""

        # Keep the best score over all gold answers.
        best_exact = max(compute_exact(ref, prediction) for ref in references)
        best_f1 = max(compute_f1(ref, prediction) for ref in references)
        exact_by_id[record["id"]] = best_exact
        f1_by_id[record["id"]] = best_f1

    return exact_by_id, f1_by_id

In [30]:
def _extract_llama3_answer(raw):
    """Return the answer portion of one raw Llama 3 completion string."""
    for marker in ("Answer_11: ", "Answer: "):
        if marker in raw:
            # Text between the first and the second occurrence of the marker.
            return raw.split(marker)[1]

    # No marker at all: fall back to the completion itself, minus the
    # boilerplate preambles the model is known to emit.
    text = raw.removeprefix("assistant\n\n")
    text = text.removeprefix(
        "The shortest continuous text span from the passage that serves as an answer to the given question is:\n\n"
    )
    text = text.removeprefix(
        "The shortest continuous text span that serves as an answer to the given question is:\n\n"
    )
    return text


def get_raw_scores_llama3(preds):
    """Score Llama 3 predictions against the gold answers, question by question.

    The answer is taken from the text following ``"Answer_11: "``. When that
    marker is missing the function falls back to ``"Answer: "`` and then to the
    completion itself instead of raising ``IndexError``.

    Args:
        preds: Iterable of records, each providing ``row["answers"]["text"]``
            (the gold answer strings), ``row["potential_answer"]`` (the raw
            model completion) and ``row["id"]``.

    Returns:
        ``(exact_scores, f1_scores)``: two dicts keyed by question id holding
        the best exact-match and best F1 value over all gold answers. Records
        whose ``potential_answer`` is not a string are left out of both dicts.
    """
    exact_scores = {}
    f1_scores = {}
    for row in preds:
        gold_answers = [a for a in row["answers"]["text"] if normalize_answer(a)]
        if not gold_answers:
            gold_answers = [""]

        raw_pred = row["potential_answer"]
        if not isinstance(raw_pred, str):
            # NOTE(review): presumably a missing value (e.g. NaN from an empty
            # CSV cell); it is skipped rather than scored -- confirm.
            continue

        a_pred = _extract_llama3_answer(raw_pred)
        if "<no_answer>" in a_pred:
            a_pred = ""

        # Take max over all gold answers
        exact_scores[row["id"]] = max(compute_exact(a, a_pred) for a in gold_answers)
        f1_scores[row["id"]] = max(compute_f1(a, a_pred) for a in gold_answers)
    return exact_scores, f1_scores

In [40]:
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Aggregate per-question scores into overall percentages.

    Args:
        exact_scores: Dict mapping question id to its exact-match score.
        f1_scores: Dict mapping question id to its F1 score.
        qid_list: Optional iterable of question ids to restrict the aggregate
            to. Ids without an entry in ``exact_scores`` are ignored. When it
            is ``None``, empty, or shares no id with ``exact_scores``, every
            scored question is used.

    Returns:
        An ``OrderedDict`` with keys ``"exact"`` and ``"f1"`` (mean scores
        scaled to 0-100) and ``"total"`` (number of questions aggregated).
        With no scored question at all, both means are ``0.0`` and ``"total"``
        is ``0``.
    """
    wanted = set(qid_list) if qid_list is not None else set()
    # Walk exact_scores (insertion-ordered) rather than a set so the float
    # summation order -- and therefore the result -- is reproducible.
    selected = [qid for qid in exact_scores if qid in wanted]

    if selected:
        total = len(selected)
        exact_sum = sum(exact_scores[qid] for qid in selected)
        f1_sum = sum(f1_scores[qid] for qid in selected)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())

    if total == 0:
        # Nothing was scored: report zeros instead of dividing by zero.
        return collections.OrderedDict(
            [("exact", 0.0), ("f1", 0.0), ("total", 0)]
        )

    return collections.OrderedDict(
        [
            ("exact", 100.0 * exact_sum / total),
            ("f1", 100.0 * f1_sum / total),
            ("total", total),
        ]
    )

In [24]:
# Load the Gemini prediction records (the file name indicates the SQuAD 2.0
# validation split). The encoding is given explicitly so decoding does not
# depend on the platform's locale default.
with open("../qa_prediction_files/gemini-1.0-pro-latest_squad2.0_validation.json", "r", encoding="utf-8") as fin:
    data = json.load(fin)

In [25]:
# Per-question exact-match and F1 for the Gemini predictions; records marked
# "<API_failed>" receive no score.
exact_scores, f1_scores = get_raw_scores_gemini(data)

In [26]:
# Aggregate the per-question scores into overall exact-match / F1 percentages.
# NOTE(review): qid_list=None is passed here -- confirm that the make_eval_dict
# definition currently in scope accepts None.
make_eval_dict(exact_scores, f1_scores, qid_list=None)

OrderedDict([('exact', 76.77725118483413),
             ('f1', 80.81593955483237),
             ('total', 11605)])

In [10]:
# Collect the id of every record whose Gemini answer is the "<API_failed>"
# marker, then report how many there are and which ones.
failed_ids = [
    record["id"]
    for record in data
    if record["gemini-1.0-pro-latest_answer"] == "<API_failed>"
]

print(len(failed_ids))
print(failed_ids)

268
['56de15104396321400ee25b7', '5ad3ee2d604f3c001a3ff7e1', '56de1563cffd8e1900b4b5c3', '5ad3f028604f3c001a3ff825', '5ad3fb01604f3c001a3ffb36', '56de3dbacffd8e1900b4b6d2', '5ad3fb6e604f3c001a3ffb5f', '5ad567055b96ef001a10adeb', '5ad04de377cf76001a686fa6', '570d2c20fed7b91900d45ca7', '57109275b654c5140001f9a1', '571114cfb654c5140001fb0a', '571114cfb654c5140001fb0c', '57111ab8a58dae1900cd6c40', '571144d1a58dae1900cd6d6e', '5ad3d689604f3c001a3ff30d', '5ad3ed37604f3c001a3ff7a4', '5ad415fd604f3c001a40032b', '571c3e8cdd7acb1400e4c0a7', '571a4d1a4faf5e1900b8a95a', '571c4132dd7acb1400e4c0b0', '571cebc05efbb31900334e48', '571cebc05efbb31900334e49', '571cebc05efbb31900334e4a', '571cebc05efbb31900334e4c', '5ad2678ad7d075001a42922c', '5ad2678ad7d075001a42922f', '571ce9bddd7acb1400e4c1a1', '5ad2685dd7d075001a429279', '571c7d55dd7acb1400e4c0c4', '571c8198dd7acb1400e4c0cf', '5ad24180d7d075001a428970', '5ad24ce8d7d075001a428c0f', '571c9348dd7acb1400e4c118', '571cc5c45efbb31900334ddb', '5ad258b4d7d075

In [16]:
# Load the Llama 3 prediction file and convert it to a list of per-row dicts.
# NOTE(review): the file is named .tsv but is parsed with sep="," -- confirm
# the delimiter.
# NOTE(review): despite its name, `data_frame` holds the to_dict("records")
# result (a list of dicts), not a DataFrame.
data_frame = pd.read_csv("../qa_prediction_files/llama3_8b_instruction_10_shot.predicted.tsv", sep=",").to_dict("records")

In [28]:
# Reduce each raw Llama 3 completion to its answer text and print every
# completion in which no answer marker can be found.
# NOTE(review): `preds` is initialised here but nothing is appended to it, so
# the cleaned answers are discarded -- confirm what should be collected.
preds = []
for row in data_frame:
    a_pred = row["potential_answer"]
    a_pred = a_pred.removeprefix("assistant\n\n")
    a_pred = a_pred.removeprefix("The shortest continuous text span from the passage that serves as an answer to the given question is:\n\n")
    a_pred = a_pred.removeprefix("The shortest continuous text span that serves as an answer to the given question is:\n\n")
    # Membership tests instead of try/except: only a missing marker is
    # handled, while genuine errors and keyboard interrupts still surface.
    for marker in ("Answer_11: ", "Answer: "):
        if marker in a_pred:
            a_pred = a_pred.split(marker)[1]
            break
    else:
        # Neither marker is present: show the completion for manual inspection.
        print(a_pred)
    if "<no_answer>" in a_pred:
        a_pred = ""

"The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East."

This answer is correct because it directly addresses the question, which asks about the type of major impact the Norman dynasty had on medieval Europe.
The Normans were famed for their Christian piety.
The Normans assimilated the Gallo-Romance language of the Frankish land they settled.
According to the passage, the French promised to protect Rollo and his men from "further Viking incursions".
According to the passage, Robert Guiscard, a Norman adventurer, ultimately drove the Byzantines out of southern Italy, and later conquered the Balkan peninsula as a foothold for western feudal lords and the Catholic Church.
According to the passage, the following places fell to the Normans:

* Dyrrachium
* Valona
* Butrint
* Ioannina
* Some minor cities in southwestern Macedonia and Thessaly
* Thessalonica (appeared at the gates, but did not capture it)

Note that the passage does 

In [4]:
# Load the two chain-of-thought result files. The encoding is given explicitly
# so decoding does not depend on the platform's locale default.
with open("../gemini-1.0-pro-latest.squadv2.0.validation.cot.json", "r", encoding="utf-8") as fin:
    data_first_part = json.load(fin)

with open("../gemini-1.0-pro-latest.squadv2.0.validation.cot.part2.json", "r", encoding="utf-8") as fin:
    data_second_part = json.load(fin)

In [12]:
def get_raw_scores_gemini_cot(preds_first_part, preds_second_part):
    """Score Gemini chain-of-thought predictions held in two result lists.

    The answer is the text following ``"Final Answer: "`` in the response.
    A response without that marker is scored as a whole instead of raising
    ``IndexError``.

    Args:
        preds_first_part: Iterable of records, each providing
            ``row["answers"]["text"]`` (the gold answer strings),
            ``row["gemini-1.0-pro-latest_answer_cot"]`` and ``row["id"]``.
        preds_second_part: Further records with the same fields.

    Returns:
        ``(exact_scores, f1_scores)``: two dicts keyed by question id holding
        the best exact-match and best F1 value over all gold answers. Records
        whose response is ``"<API_failed>"`` do not appear in either dict. If
        an id occurs in both parts, the score from the second part is kept.
    """
    answer_column = "gemini-1.0-pro-latest_answer_cot"
    final_marker = "Final Answer: "

    exact_scores = {}
    f1_scores = {}
    # Both parts are handled by the same loop, first part first.
    for part in (preds_first_part, preds_second_part):
        for row in part:
            gold_answers = [a for a in row["answers"]["text"] if normalize_answer(a)]
            if not gold_answers:
                gold_answers = [""]

            a_pred = row[answer_column]
            if a_pred == "<API_failed>":
                continue

            if final_marker in a_pred:
                # Text between the first and the second occurrence of the marker.
                a_pred = a_pred.split(final_marker)[1]
            # Surrounding whitespace must not hide the no-answer sentinel,
            # otherwise it would be scored as the literal token "noanswer".
            if a_pred.strip() == "<no_answer>":
                a_pred = ""

            # Take max over all gold answers
            exact_scores[row["id"]] = max(compute_exact(a, a_pred) for a in gold_answers)
            f1_scores[row["id"]] = max(compute_f1(a, a_pred) for a in gold_answers)

    return exact_scores, f1_scores

In [41]:
# Score the chain-of-thought predictions from both result files; records marked
# "<API_failed>" receive no score.
exact_scores, f1_scores = get_raw_scores_gemini_cot(data_first_part, data_second_part)

In [14]:
# Overall exact-match / F1 percentages for the chain-of-thought run.
# NOTE(review): qid_list is left at its default of None -- confirm that the
# make_eval_dict definition currently in scope accepts None.
make_eval_dict(exact_scores, f1_scores)

OrderedDict([('exact', 66.84468179795283),
             ('f1', 71.69851113772972),
             ('total', 2247)])

In [42]:
# Ids of the questions currently held in exact_scores, in insertion order.
q_ids = list(exact_scores)

In [43]:
# Load the Gemini prediction records used for the comparison below. The
# encoding is given explicitly so decoding does not depend on the platform's
# locale default.
with open("../qa_prediction_files/gemini-1.0-pro-latest_squad2.0_validation.json", "r", encoding="utf-8") as fin:
    fewshot_data = json.load(fin)

In [44]:
# Rebinds exact_scores / f1_scores to the scores of fewshot_data. Any value
# derived from the previous contents of these names (such as q_ids) has to be
# captured before this line runs.
exact_scores, f1_scores = get_raw_scores_gemini(fewshot_data)

In [45]:
# Aggregate only over the ids in q_ids that also have a score here, so the
# result covers the same questions as the run q_ids was taken from.
make_eval_dict(exact_scores, f1_scores, qid_list=q_ids)

OrderedDict([('exact', 75.84803256445048),
             ('f1', 79.1624556631292),
             ('total', 2211)])