# Baseline models

# 2. ViLT

### Setup

In [3]:
!pip install transformers accelerate bert-score scikit-learn timeout-decorator pandas pillow torch



In [14]:
# Cell 2: Imports and Setup
import os
import glob
import pandas as pd
import torch
from PIL import Image
from timeout_decorator import timeout, TimeoutError
from sklearn.metrics import accuracy_score, f1_score
from bert_score import score as bertscore_score
from transformers import ViltProcessor, ViltForQuestionAnswering
from tqdm import tqdm

# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on", device)

Running on cuda


### Load and Concatenate Curated Dataset (S1–S6)

In [5]:
base_path = "/kaggle/input/vr-dataset/dataset_curated"
subfolders = [f"S{i}" for i in range(1, 7)]
all_dfs = []

for folder in subfolders:
    csv_path = os.path.join(base_path, folder, f"{folder}_qa_data.csv")
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path, header=None, names=["image_path", "question", "answer"])
        df["image_path"] = df["image_path"].apply(
            lambda p: os.path.join(base_path, os.path.normpath(p).split("dataset_curated/")[-1])
        )
        df = df[df["image_path"].apply(os.path.exists)].reset_index(drop=True)
        all_dfs.append(df)
        print(f"Loaded {len(df)} examples from {csv_path}")
    else:
        print(f"CSV not found: {csv_path}")

df_all = pd.concat(all_dfs).reset_index(drop=True)
print(f"\nTotal QA pairs loaded: {len(df_all)}")

Loaded 14366 examples from /kaggle/input/vr-dataset/dataset_curated/S1/S1_qa_data.csv
Loaded 14358 examples from /kaggle/input/vr-dataset/dataset_curated/S2/S2_qa_data.csv
Loaded 14367 examples from /kaggle/input/vr-dataset/dataset_curated/S3/S3_qa_data.csv
Loaded 14366 examples from /kaggle/input/vr-dataset/dataset_curated/S4/S4_qa_data.csv
Loaded 14387 examples from /kaggle/input/vr-dataset/dataset_curated/S5/S5_qa_data.csv
Loaded 14376 examples from /kaggle/input/vr-dataset/dataset_curated/S6/S6_qa_data.csv

Total QA pairs loaded: 86220


### Initialize ViLT Processor & Model

In [6]:

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

model.to(device)
model.eval()


ViltForQuestionAnswering(
  (vilt): ViltModel(
    (embeddings): ViltEmbeddings(
      (text_embeddings): TextEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(40, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (patch_embeddings): ViltPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
      )
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViltEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViltLayer(
          (attention): ViltAttention(
            (attention): ViltSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=76

### Define Inference Function with Timeout

In [7]:
@timeout(60)  # max 60s per sample
def predict_answer(image_path: str, question: str) -> str:
    try:
        # 1. Load and preprocess
        image = Image.open(image_path).convert("RGB")
        inputs = processor(image, question, return_tensors="pt")
        # **NEW**: move inputs to GPU if model is on GPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # 2. Forward pass
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits

        # 3. Decode
        pred_id = logits.argmax(-1).item()
        answer = model.config.id2label[pred_id]
        return answer.split()[0].strip().lower()

    except Exception as e:
        # Print the first few errors to see what’s wrong
        if predict_answer.err_count < 5:
            print(f"[Error] {type(e).__name__}: {e}")
        predict_answer.err_count += 1
        return ""
# initialize counter
predict_answer.err_count = 0

### Run Inference & Collect Predictions

In [15]:
# Main evaluation loop with 1,000-sample checkpoints
# Prepare for resuming
# Paths for checkpointing
pred_path    = "/kaggle/working/vilt_vqa_predictions.csv"
metrics_path = "/kaggle/working/vilt_vqa_metrics_summary.csv"


start_idx = 0
if os.path.exists(pred_path):
    existing = pd.read_csv(pred_path)
    start_idx = len(existing)
    print("Resuming from index", start_idx)

# Buffers
buffer_results = []
buffer_y_true  = []
buffer_y_pred  = []

for idx, row in tqdm(df_all.iterrows(), total=len(df_all), desc="Evaluating"):
    if idx < start_idx:
        continue

    img_path = row["image_path"]
    question = row["question"]
    true_ans = str(row["answer"]).strip().lower()

    pred = predict_answer(img_path, question)
    pred = pred or ""  # ensure string
    pred = pred.lower()

    buffer_results.append({
        "img_path": img_path,
        "question": question,
        "true_answer": true_ans,
        "predicted_answer": pred
    })
    buffer_y_true.append(true_ans)
    buffer_y_pred.append(pred)

    # checkpoint every 1,000 or at end
    if (idx + 1) % 1000 == 0 or (idx + 1) == len(df_all):
        # 1) save predictions chunk
        chunk_df = pd.DataFrame(buffer_results)
        mode    = "w" if not os.path.exists(pred_path) else "a"
        header  = not os.path.exists(pred_path)
        chunk_df.to_csv(pred_path, mode=mode, index=False, header=header)

        # 2) compute interim metrics
        acc = accuracy_score(buffer_y_true, buffer_y_pred)
        f1m = f1_score(buffer_y_true, buffer_y_pred, average="macro", zero_division=0)
        # BERTScore
        P, R, F1b = bertscore_score(buffer_y_pred, buffer_y_true, lang="en")
        bert = F1b.mean().item()

        metrics_df = pd.DataFrame([{
            "step": idx+1,
            "accuracy": acc,
            "f1_macro": f1m,
            "bert_score": bert
        }])
        m_mode   = "w" if not os.path.exists(metrics_path) else "a"
        m_header = not os.path.exists(metrics_path)
        metrics_df.to_csv(metrics_path, mode=m_mode, index=False, header=m_header)

        print(f"Checkpoint @ {idx+1}: acc={acc:.4f}, f1={f1m:.4f}, bert={bert:.4f}")

        # clear buffers
        buffer_results.clear()
        buffer_y_true.clear()
        buffer_y_pred.clear()

print("Evaluation complete.")

Evaluating:   1%|          | 997/86220 [00:25<37:14, 38.15it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   1%|          | 1005/86220 [00:28<3:54:19,  6.06it/s]

Checkpoint @ 1000: acc=0.2950, f1=0.0808, bert=0.9749


Evaluating:   2%|▏         | 1996/86220 [00:56<43:01, 32.63it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   2%|▏         | 2004/86220 [00:57<2:17:55, 10.18it/s]

Checkpoint @ 2000: acc=0.3020, f1=0.0825, bert=0.9767


Evaluating:   3%|▎         | 2998/86220 [01:27<42:20, 32.76it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   3%|▎         | 3006/86220 [01:28<2:18:45,  9.99it/s]

Checkpoint @ 3000: acc=0.2890, f1=0.0622, bert=0.9706


Evaluating:   5%|▍         | 3999/86220 [01:58<32:32, 42.12it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   5%|▍         | 4004/86220 [01:59<2:37:36,  8.69it/s]

Checkpoint @ 4000: acc=0.2920, f1=0.0772, bert=0.9737


Evaluating:   6%|▌         | 4999/86220 [02:28<37:36, 35.99it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   6%|▌         | 5007/86220 [02:30<2:10:49, 10.35it/s]

Checkpoint @ 5000: acc=0.2770, f1=0.0663, bert=0.9705


Evaluating:   7%|▋         | 5998/86220 [02:57<37:26, 35.71it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   7%|▋         | 6005/86220 [02:59<2:52:38,  7.74it/s]

Checkpoint @ 6000: acc=0.2860, f1=0.0643, bert=0.9732


Evaluating:   8%|▊         | 6997/86220 [03:26<36:09, 36.51it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   8%|▊         | 7005/86220 [03:27<2:08:39, 10.26it/s]

Checkpoint @ 7000: acc=0.2970, f1=0.0745, bert=0.9735


Evaluating:   9%|▉         | 7999/86220 [03:54<37:43, 34.55it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:   9%|▉         | 8003/86220 [03:56<2:44:12,  7.94it/s]

Checkpoint @ 8000: acc=0.3040, f1=0.0713, bert=0.9720


Evaluating:  10%|█         | 8999/86220 [04:24<39:57, 32.21it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  10%|█         | 9003/86220 [04:25<2:54:48,  7.36it/s]

Checkpoint @ 9000: acc=0.2760, f1=0.0910, bert=0.9753


Evaluating:  12%|█▏        | 9997/86220 [04:52<32:29, 39.09it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  12%|█▏        | 10005/86220 [04:53<1:53:51, 11.16it/s]

Checkpoint @ 10000: acc=0.3130, f1=0.0856, bert=0.9748


Evaluating:  13%|█▎        | 10997/86220 [05:19<32:11, 38.94it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  13%|█▎        | 11006/86220 [05:21<1:48:12, 11.58it/s]

Checkpoint @ 11000: acc=0.2880, f1=0.0850, bert=0.9727


Evaluating:  14%|█▍        | 11997/86220 [05:48<32:27, 38.11it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  14%|█▍        | 12005/86220 [05:49<1:57:32, 10.52it/s]

Checkpoint @ 12000: acc=0.2920, f1=0.0811, bert=0.9726


Evaluating:  15%|█▌        | 12997/86220 [06:15<32:14, 37.86it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  15%|█▌        | 13005/86220 [06:17<1:59:47, 10.19it/s]

Checkpoint @ 13000: acc=0.2710, f1=0.0700, bert=0.9709


Evaluating:  16%|█▌        | 13996/86220 [06:44<32:31, 37.01it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  16%|█▌        | 14004/86220 [06:45<1:54:56, 10.47it/s]

Checkpoint @ 14000: acc=0.3070, f1=0.0882, bert=0.9721


Evaluating:  17%|█▋        | 14999/86220 [07:13<31:08, 38.12it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  17%|█▋        | 15007/86220 [07:15<1:54:46, 10.34it/s]

Checkpoint @ 15000: acc=0.2860, f1=0.0757, bert=0.9719


Evaluating:  19%|█▊        | 15998/86220 [07:42<33:22, 35.06it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  19%|█▊        | 16006/86220 [07:44<1:54:00, 10.26it/s]

Checkpoint @ 16000: acc=0.3120, f1=0.0689, bert=0.9711


Evaluating:  20%|█▉        | 16998/86220 [08:11<31:17, 36.88it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  20%|█▉        | 17006/86220 [08:13<2:04:05,  9.30it/s]

Checkpoint @ 17000: acc=0.2850, f1=0.0706, bert=0.9710


Evaluating:  21%|██        | 17996/86220 [08:40<32:56, 34.52it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  21%|██        | 18004/86220 [08:42<1:48:16, 10.50it/s]

Checkpoint @ 18000: acc=0.2890, f1=0.0721, bert=0.9745


Evaluating:  22%|██▏       | 18998/86220 [09:11<31:04, 36.05it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  22%|██▏       | 19006/86220 [09:12<1:48:47, 10.30it/s]

Checkpoint @ 19000: acc=0.2830, f1=0.0733, bert=0.9716


Evaluating:  23%|██▎       | 19998/86220 [09:40<32:15, 34.22it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  23%|██▎       | 20007/86220 [09:41<1:42:56, 10.72it/s]

Checkpoint @ 20000: acc=0.3080, f1=0.0782, bert=0.9719


Evaluating:  24%|██▍       | 20997/86220 [10:09<31:28, 34.55it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  24%|██▍       | 21005/86220 [10:10<1:46:25, 10.21it/s]

Checkpoint @ 21000: acc=0.2950, f1=0.0812, bert=0.9759


Evaluating:  26%|██▌       | 21996/86220 [10:38<31:33, 33.92it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  26%|██▌       | 22004/86220 [10:40<1:45:01, 10.19it/s]

Checkpoint @ 22000: acc=0.2900, f1=0.0685, bert=0.9708


Evaluating:  27%|██▋       | 22999/86220 [11:08<28:45, 36.63it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  27%|██▋       | 23007/86220 [11:09<1:45:20, 10.00it/s]

Checkpoint @ 23000: acc=0.2750, f1=0.0667, bert=0.9759


Evaluating:  28%|██▊       | 23999/86220 [11:38<29:58, 34.59it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  28%|██▊       | 24003/86220 [11:39<2:14:18,  7.72it/s]

Checkpoint @ 24000: acc=0.3050, f1=0.0691, bert=0.9744


Evaluating:  29%|██▉       | 24997/86220 [12:08<28:44, 35.50it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  29%|██▉       | 25005/86220 [12:10<1:41:15, 10.08it/s]

Checkpoint @ 25000: acc=0.2660, f1=0.0806, bert=0.9732


Evaluating:  30%|███       | 25999/86220 [12:38<27:41, 36.23it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  30%|███       | 26007/86220 [12:40<1:36:33, 10.39it/s]

Checkpoint @ 26000: acc=0.2790, f1=0.0760, bert=0.9730


Evaluating:  31%|███▏      | 26998/86220 [13:08<28:05, 35.13it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  31%|███▏      | 27006/86220 [13:09<1:36:15, 10.25it/s]

Checkpoint @ 27000: acc=0.3020, f1=0.0737, bert=0.9727


Evaluating:  32%|███▏      | 27996/86220 [13:37<28:19, 34.26it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  32%|███▏      | 28005/86220 [13:39<1:32:40, 10.47it/s]

Checkpoint @ 28000: acc=0.2810, f1=0.0570, bert=0.9735


Evaluating:  34%|███▎      | 28999/86220 [14:07<27:41, 34.43it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  34%|███▎      | 29006/86220 [14:09<1:42:06,  9.34it/s]

Checkpoint @ 29000: acc=0.2840, f1=0.0495, bert=0.9716


Evaluating:  35%|███▍      | 29997/86220 [14:37<25:38, 36.55it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  35%|███▍      | 30005/86220 [14:38<1:31:11, 10.27it/s]

Checkpoint @ 30000: acc=0.2730, f1=0.0682, bert=0.9727


Evaluating:  36%|███▌      | 30999/86220 [15:07<25:50, 35.62it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  36%|███▌      | 31003/86220 [15:09<2:00:39,  7.63it/s]

Checkpoint @ 31000: acc=0.2990, f1=0.0727, bert=0.9724


Evaluating:  37%|███▋      | 31996/86220 [15:37<24:20, 37.13it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  37%|███▋      | 32004/86220 [15:38<1:26:10, 10.49it/s]

Checkpoint @ 32000: acc=0.2960, f1=0.0725, bert=0.9748


Evaluating:  38%|███▊      | 32999/86220 [16:06<23:58, 36.99it/s]  Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating:  38%|███▊      | 33007/86220 [16:08<1:32:57,  9.54it/s]

Checkpoint @ 33000: acc=0.2860, f1=0.0631, bert=0.9728


Evaluating:  39%|███▉      | 33764/86220 [16:29<25:37, 34.13it/s]  


KeyboardInterrupt: 

### Compute Evaluation Metrics

In [16]:
y_true = df_all["answer"].astype(str).str.lower().tolist()
y_pred = df_all["predicted_answer"].astype(str).str.lower().tolist()

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="macro")

# BERTScore: rescaled against baseline
P, R, F1_bert = bertscore_score(y_pred, y_true, lang="en", rescale_with_baseline=True)
bert_score = F1_bert.mean().item()

print(f"Accuracy:  {accuracy:.4f}")
print(f"F1 Macro:  {f1:.4f}")
print(f"BERTScore: {bert_score:.4f}")


KeyError: 'predicted_answer'