In [1]:
import pandas as pd
from datasets import load_dataset

In [2]:
import regex, unicodedata
class SimpleTokenizer(object):
    """Regex-based tokenizer producing a flat list of token strings.

    A token is either a run matched by ``ALPHA_NUM`` or, failing that, a
    single character matched by ``NON_WS``.
    """

    # One or more characters from the Unicode L, N or M property classes.
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    # Any single character outside the Unicode Z and C property classes.
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """Compile the combined token pattern once; takes no options."""
        pattern = '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS)
        flags = regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        self._regexp = regex.compile(pattern, flags=flags)

    def tokenize(self, text, uncased=False):
        """Split ``text`` into tokens.

        Args:
            text: String to tokenize.
            uncased: When truthy, each token is lower-cased.

        Returns:
            List of token strings in order of appearance.
        """
        tokens = []
        for match in self._regexp.finditer(text):
            token = match.group()
            tokens.append(token.lower() if uncased else token)
        return tokens

def has_answer(answers: list[str], text, tokenizer) -> bool:
    """Check whether ``text`` contains any of ``answers`` as a token sequence.

    The text and every answer are NFD-normalized and tokenized with
    ``tokenizer.tokenize(..., uncased=True)``. An answer matches when its
    tokens occur contiguously, in order, inside the text's tokens.

    Args:
        answers: Candidate answer strings.
        text: String to search in.
        tokenizer: Object exposing ``tokenize(text, uncased=...)`` that
            returns a list of tokens.

    Returns:
        True if at least one answer with at least one token occurs in
        ``text``; False otherwise (including when ``answers`` is empty).
    """
    text_tokens = tokenizer.tokenize(_normalize(text), uncased=True)

    for answer in answers:
        answer_tokens = tokenizer.tokenize(_normalize(answer), uncased=True)
        if not answer_tokens:
            # An answer without tokens (e.g. "" or only whitespace) would be
            # equal to the empty slice text_tokens[i:i] and therefore "match"
            # every text; such answers carry no evidence, so skip them.
            continue
        width = len(answer_tokens)
        for start in range(len(text_tokens) - width + 1):
            if answer_tokens == text_tokens[start:start + width]:
                return True
    return False

def _normalize(text):
    """Return ``text`` in Unicode NFD form."""
    return unicodedata.normalize('NFD', text)

In [3]:
# Directory containing the CSV inputs; defined once so a run on another
# machine only needs this single value changed.
DATA_DIR = "/data/seongilpark/research/in-context-robust-ralm/data"

# Both frames are expected to expose a "pred" column and to share row order:
# later cells read "pred" from each and attach the same per-row labels to both
# by position.
df = pd.read_csv(f"{DATA_DIR}/unans.csv")
zero = pd.read_csv(f"{DATA_DIR}/zero.csv")

In [4]:
def hasanswer(ctxs) -> bool:
    """Return True when at least one context has a truthy ``"hasanswer"`` value."""
    flags = []
    for ctx in ctxs:
        flags.append(ctx["hasanswer"])
    return any(flags)

def answerable(ctxs, required_unanswerable: int = 5) -> str:
    """Label a question from the annotations of its contexts.

    Each context is first labelled individually:

    * ``"answerable"``   - ``hasanswer`` is truthy and ``nli`` is
      ``"entailment"`` or ``"contradiction"``;
    * ``"unanswerable"`` - ``hasanswer`` is falsy and ``nli`` is not
      ``"entailment"``;
    * ``"uncertain"``    - anything else.

    Args:
        ctxs: Iterable of mappings with ``"hasanswer"`` and ``"nli"`` keys.
        required_unanswerable: Exact number of contexts that must be labelled
            ``"unanswerable"`` for the question to be ``"unanswerable"``.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        ``"answerable"`` if any context is answerable; otherwise
        ``"unanswerable"`` if exactly ``required_unanswerable`` contexts are
        unanswerable; otherwise ``"uncertain"``.
    """
    labels = []
    for ctx in ctxs:
        # Named has_ans so the module-level hasanswer() is not shadowed.
        has_ans, entail = ctx["hasanswer"], ctx["nli"]
        if has_ans and entail in ("entailment", "contradiction"):
            labels.append("answerable")
        elif not has_ans and entail != "entailment":
            labels.append("unanswerable")
        else:
            labels.append("uncertain")

    if "answerable" in labels:
        return "answerable"
    if labels.count("unanswerable") == required_unanswerable:
        return "unanswerable"
    return "uncertain"

In [5]:
# Load the train split, keep only the first five contexts of each example,
# label every example with answerable(), and keep the "answerable" ones.
# NOTE(review): the value_counts output recorded further down shows all three
# labels, which a dataset filtered to "answerable" cannot produce - confirm
# whether this filter was active for the recorded run.
dataset = (
    load_dataset("Atipico1/incontext_nq")["train"]
    .map(lambda example: {"ctxs": example["ctxs"][:5]})
    .map(lambda example: {"answerable": answerable(example["ctxs"])})
    .filter(lambda example: example["answerable"] == "answerable")
)

In [20]:
# Convert the dataset to a DataFrame and (re)compute the per-question label.
dataset = pd.DataFrame(dataset)
dataset["answerable"] = dataset["ctxs"].apply(answerable)
print(dataset.answerable.value_counts())

# Labels and gold answers are attached to the prediction frames strictly by
# row position. Assigning a Series instead would align on the index and could
# silently insert NaN where the indexes differ, so both columns are passed as
# plain sequences and the lengths are checked up front.
assert len(dataset) == len(df) == len(zero), (
    "dataset, df and zero must have the same number of rows to be aligned by position"
)
for frame in (df, zero):
    frame["answerable"] = dataset["answerable"].tolist()
    frame["answers"] = dataset["answers"].to_numpy()

answerable
uncertain       632
answerable      255
unanswerable    113
Name: count, dtype: int64


In [7]:
# Flag every prediction whose text contains "unanswerable", ignoring case.
for frame in (df, zero):
    frame["is_unans"] = frame["pred"].map(lambda pred: "unanswerable" in pred.lower())

In [8]:
# Copy zero's flag into df by row position (not by index) so the two flags
# can be compared row by row.
df["is_unans_zero"] = zero["is_unans"].to_list()

In [9]:
# Number of rows flagged "unanswerable" in both df and zero.
int((df["is_unans"] & df["is_unans_zero"]).sum())

125

In [10]:
# Number of rows flagged "unanswerable" in df but not in zero.
int((df["is_unans"] & ~df["is_unans_zero"]).sum())

69

In [11]:
# Among rows labelled "unanswerable", the fraction whose prediction was
# flagged as unanswerable; printed for df first, then zero.
for data in (df, zero):
    labelled_unans = data["answerable"] == "unanswerable"
    total = int(labelled_unans.sum())
    sub = int((labelled_unans & data["is_unans"]).sum())
    print(sub / total)

0.4247787610619469
0.37168141592920356


In [21]:
# A prediction counts as correct when it contains any gold answer as a token
# sequence (see has_answer).
tokenizer = SimpleTokenizer()
for frame in (df, zero):
    frame["acc"] = [
        has_answer(gold, pred, tokenizer)
        for gold, pred in zip(frame["answers"], frame["pred"])
    ]

# Accuracy restricted to rows labelled "answerable"; df first, then zero.
for data in (df, zero):
    labelled_ans = data["answerable"] == "answerable"
    total = int(labelled_ans.sum())
    sub = int((labelled_ans & data["acc"]).sum())
    print(sub / total)

0.42745098039215684
0.4196078431372549


In [24]:
# The tokenizer and the "acc" column are rebuilt here the same way as in the
# preceding accuracy cell, so this cell can be run on its own.
tokenizer = SimpleTokenizer()
for frame in (df, zero):
    frame["acc"] = [
        has_answer(gold, pred, tokenizer)
        for gold, pred in zip(frame["answers"], frame["pred"])
    ]

# Accuracy over every row NOT labelled "unanswerable" (i.e. "answerable" and
# "uncertain" together); df first, then zero.
for data in (df, zero):
    not_unans = data["answerable"] != "unanswerable"
    total = int(not_unans.sum())
    sub = int((not_unans & data["acc"]).sum())
    print(sub / total)

0.29312288613303267
0.28410372040586246
