In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

device = "cuda:0"

model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)



In [2]:
label2id = {0: "entailment", 1: "neutral", 2: "contradiction"}

In [9]:
premise = ["Angela Merkel ist eine Politikerin in Deutschland und Vorsitzende der CDU"]*2
hypothesis = ["Emmanuel Macron is the President of France"]*2
model.to(device)
input = tokenizer(premise, hypothesis, padding=True, return_tensors="pt").to(device)
output = model(input["input_ids"])  # device = "cuda:0" or "cpu"
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "neutral", "contradiction"]
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)


{'entailment': 82.4, 'neutral': 17.3, 'contradiction': 0.3}


In [5]:
model.to(device)
from typing import List
def func(q: str, ctxs: List[dict]) -> List[dict]:
    new_output = []
    ctx_texts = [c["text"] for c in ctxs]
    input = tokenizer([q]*len(ctx_texts), ctx_texts, padding=True, max_length=256, return_tensors="pt").to(device)
    output = model(input["input_ids"])
    for out, ctx in zip(output["logits"], ctxs):
        ctx["nli"] = label2id[out.argmax().item()]
        new_output.append(ctx)
    return new_output

In [1]:
from datasets import load_dataset

nq = load_dataset("Atipico1/incontext_nq_v2", split="train")
tqa = load_dataset("Atipico1/incontext_tqa_v2", split="train")
nq = nq.map(lambda x: {"ctxs" : x["ctxs"][:5]})
tqa = tqa.map(lambda x: {"ctxs" : x["ctxs"][:5]})

Downloading readme:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/31.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/11313 [00:00<?, ? examples/s]

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

Map:   0%|          | 0/11313 [00:00<?, ? examples/s]

In [7]:
nq = nq.map(lambda x: {"ctxs" : func(x["gpt_answer_sentence"], x["ctxs"])})

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.

KeyboardInterrupt



In [33]:
tqa = tqa.map(lambda x: {"ctxs" : func(x["gpt_answer_sentence"], x["ctxs"])})

Map:   0%|          | 0/11313 [00:00<?, ? examples/s]



In [2]:
def hasanswer(ctxs) -> bool:
    return any([c["hasanswer"] for c in ctxs])

def answerable(ctxs) -> bool:
    res = []
    for ctx in ctxs:
        hasanswer, entail = ctx["hasanswer"], ctx["nli"]
        if hasanswer and (entail in ["entailment", "contradiction"]):
            res.append("answerable")
        elif (not hasanswer) and (entail != "entailment"):
            res.append("unanswerable")
        else:
            res.append("uncertain")
    if res.count("answerable") >= 1:
        return "answerable"
    elif res.count("unanswerable") == 5:
        return "unanswerable"
    else:
        return "uncertain"

In [3]:
nq = nq.map(lambda x: {"hasanswer": hasanswer(x["ctxs"])})
nq = nq.map(lambda x: {"answerable": answerable(x["ctxs"])})

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

In [4]:
nq.to_pandas().answerable.value_counts()

answerable
uncertain       1694
answerable      1433
unanswerable     483
Name: count, dtype: int64

In [7]:
1433/3610

0.3969529085872576

In [47]:
df = pd.DataFrame(nq)

In [48]:
df["answerable"].value_counts()

answerable
uncertain       2238
answerable       889
unanswerable     483
Name: count, dtype: int64

In [52]:
tqa.push_to_hub("Atipico1/incontext_tqa")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
!huggingface-cli login