# Imports

In [None]:
"""Haystack RAG"""

!pip install "farm-haystack[inmemory]" sentence-transformers transformers accelerate torch --quiet

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.7/48.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m764.4/764.4 kB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m136.4/136.4 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.6/10.6 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.0/70.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m12.5 MB/s[0m eta [

In [None]:
"""BLIP Image Captioner"""

# Simple English captions with BLIP (Can later translate or just feed as-is)
from transformers import BlipProcessor, BlipForConditionalGeneration

# !pip install timm --quiet

In [None]:
from typing import List, Dict, Any, Union, Callable
from tqdm.auto import tqdm
from pathlib import Path
from PIL import Image
import os, re, json, unicodedata
import numpy as np
import pandas as pd
import torch

# Drive

In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/project/Questions/
%ls

/content/drive/MyDrive


'/content/drive/MyDrive'

# Device

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# This value is later passed to HaystackRAG, which moves its models to it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configs and Filenames

In [None]:
# DATA_PATH
# Output directory for predictions/reports.
# NOTE(review): this is a plain str, while the commented-out save code in the
# test cells builds paths with `SAVE_DIR / ...`, which needs a pathlib.Path.
SAVE_DIR  = "/content/drive/MyDrive/project/Baseline/"
# Selects the scoring function in the test cells: exact_match or relaxed_match.
MATCH_MODE = "exact" # "exact" or "relaxed"
# NOTE(review): not referenced in the cells shown here; HaystackRAG hard-codes "fa_IR".
DEFAULT_LANG_CODE_MBART = "fa_IR"

# Dataset filenames, relative to the current working directory.
# NOTE(review): both end in ".json" but are loaded with pd.read_excel — confirm the format.
uni_modal_test_data_path = "mcq_questions_90.json"
multi_modal_test_data_path = "mcq_with_image_40.json"

# Haystack RAG baseline without retrieval (Generators = HF mT5 and mBART-50, Image captions = BLIP)

In [None]:
class HaystackRAG:
    """Baseline answerer for Persian multiple-choice questions (no retrieval).

    ``__init__`` loads three Hugging Face models and moves them to ``device``:
    BLIP (image captioning), mT5-base and mBART-50 many-to-one (text
    generation). Every ``*_generate`` method formats the question and its four
    options into a Persian prompt, runs one generator and returns the decoded
    text as-is; turning that text into an option number is the caller's job.
    """

    # Language code given to the mBART tokenizer (source language and forced BOS token).
    _MBART_LANG = "fa_IR"
    # Prompts longer than this many tokens are truncated before generation.
    _MAX_PROMPT_TOKENS = 1024

    # Shared tail of both prompts: the four options plus the answer-format instruction.
    _OPTIONS_BLOCK = (
        "گزینه ها:\n"
        "1) {ctx[0]}\n"
        "2) {ctx[1]}\n"
        "3) {ctx[2]}\n"
        "4) {ctx[3]}\n"
        "لطفا فقط شماره ی گزینه ی صحیح (1،2،3 و 4) را بیان کن و هیچ توضیح اضافی نده\n"
        "خروجی نمونه: 2"
    )
    # Prompt for text-only questions.
    _TEXT_PROMPT = (
        "شما باید به سوال چند گزینه ای که برایتان ارسال می شود پاسخ بدهید، لطفا با دقت سوال زیر را بخوانید و از بین گزینه های ارائه شده گزینه ی صحیح را انتخاب کنید\n\n "
        "سوال: {q}\n"
    ) + _OPTIONS_BLOCK
    # Prompt for questions about an image; the image is represented by its caption.
    _IMAGE_PROMPT = (
        "شما باید به سوال چند گزینه ای که برایتان ارسال می شود پاسخ بدهید، همراه با سوال توضیحات مربوط به یک تصویر نیز ارسال خواهد شد که سوال در رابطه با آن تصویر است. لطفا با دقت سوال زیر را بخوانید و از بین گزینه های ارائه شده گزینه ی صحیح را انتخاب کنید\n\n "
        "سوال: {q}\n"
        "توصیف تصویر: {captions}\n"
    ) + _OPTIONS_BLOCK

    def __init__(self, device = "cpu", docs = None):
        """Load the captioner and both generators onto ``device``.

        Args:
            device: torch device string the models and inputs are moved to.
            docs: accepted for interface compatibility; not used by this class.
        """
        self.device = device

        # --------- BLIP English Captioner ----------
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.blip = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(self.device)

        # --------- Multilingual generators (mT5 and mBART-50) ----------
        self.tok_mt5 = AutoTokenizer.from_pretrained("google/mt5-base")
        self.gen_mt5 = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base").to(self.device)

        self.tok_mbart = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
        self.gen_mbart = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(self.device)

    # ------------------------------------------------------------------
    # Private helpers shared by the public generate methods
    # ------------------------------------------------------------------

    def _build_prompt(self, question, answers, caption=None):
        """Return the Persian prompt; the image variant is used when ``caption`` is given."""
        if caption is None:
            return self._TEXT_PROMPT.format(q=question, ctx=answers)
        return self._IMAGE_PROMPT.format(q=question, ctx=answers, captions=caption)

    def _run_mt5(self, prompt, max_new_tokens):
        """Tokenize ``prompt``, generate with mT5 and return the decoded text."""
        inputs = self.tok_mt5(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self._MAX_PROMPT_TOKENS,
        ).to(self.device)
        out_ids = self.gen_mt5.generate(**inputs, max_new_tokens=max_new_tokens)
        return self.tok_mt5.decode(out_ids[0], skip_special_tokens=True)

    def _run_mbart(self, prompt, max_new_tokens):
        """Tokenize ``prompt`` as Persian, generate with mBART and return the decoded text."""
        # NOTE(review): the recorded notebook output shows this checkpoint
        # returning an English translation of the whole prompt rather than an
        # option number — confirm the many-to-one checkpoint is the intended one.
        self.tok_mbart.src_lang = self._MBART_LANG   # Persian input
        inputs = self.tok_mbart(
            prompt,
            return_tensors="pt",
            truncation=True,  # same input cap as the mT5 path
            max_length=self._MAX_PROMPT_TOKENS,
        ).to(self.device)
        out_ids = self.gen_mbart.generate(
            **inputs,
            # Previously the caller's max_new_tokens was accepted but silently ignored.
            max_new_tokens=max_new_tokens,
            forced_bos_token_id=self.tok_mbart.lang_code_to_id[self._MBART_LANG],
        )
        return self.tok_mbart.decode(out_ids[0], skip_special_tokens=True)

    # ------------------------------------------------------------------
    # Answer UniModal questions without Retrieval
    # ------------------------------------------------------------------

    def mt5_generate(self, question, answers, max_new_tokens=80):
        """Answer a text-only question with mT5.

        Args:
            question: question text.
            answers: indexable with at least four option texts (indices 0-3).
            max_new_tokens: generation length limit.

        Returns:
            The decoded model output (raw text).
        """
        return self._run_mt5(self._build_prompt(question, answers), max_new_tokens)

    def mbart_generate(self, question, answers, max_new_tokens=80):
        """Answer a text-only question with mBART-50; same contract as ``mt5_generate``."""
        return self._run_mbart(self._build_prompt(question, answers), max_new_tokens)

    # ------------------------------------------------------------------
    # Answer MultiModal questions without Retrieval
    # ------------------------------------------------------------------

    def caption_image(self, image_path, max_new_tokens=80):
        """Return the BLIP caption for the image stored at ``image_path``.

        Raises whatever ``PIL.Image.open`` raises when the file is missing or
        unreadable.
        """
        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy, so it stays usable afterwards.
        with Image.open(image_path) as raw_image:
            img = raw_image.convert("RGB")
        inputs = self.processor(images=img, return_tensors="pt").to(self.device)
        ids = self.blip.generate(**inputs, max_new_tokens=max_new_tokens)
        return self.processor.decode(ids[0], skip_special_tokens=True)

    def mt5_multimodal_generate(self, question, image_path, answers, max_new_tokens=64):
        """Answer a question about an image with mT5.

        The image is captioned with BLIP and the caption is inserted into the
        prompt. ``max_new_tokens`` limits the answer generation only; the
        caption uses ``caption_image``'s default.

        Returns:
            The decoded model output (raw text).
        """
        caption = self.caption_image(image_path)
        return self._run_mt5(self._build_prompt(question, answers, caption), max_new_tokens)

    def mbart_multimodal_generate(self, question, image_path, answers, max_new_tokens=64):
        """Answer a question about an image with mBART-50; same contract as ``mt5_multimodal_generate``."""
        caption = self.caption_image(image_path)
        return self._run_mbart(self._build_prompt(question, answers, caption), max_new_tokens)


# **Test**

In [None]:
# ---------- Normalization for evaluation ----------
# Arabic letter variants and their replacements (Persian yeh/kaf, heh, plain alef).
ARABIC_TO_PERSIAN = {"\u064a":"\u06cc", "\u0643":"\u06a9", "\u0629":"\u0647", "\u0649":"\u06cc", "\u0623":"\u0627", "\u0625":"\u0627"}
# Extended Arabic-Indic and Arabic-Indic digits mapped to ASCII digits.
FA2EN = str.maketrans("۰۱۲۳۴۵۶۷۸۹", "0123456789")
AR2EN = str.maketrans("٠١٢٣٤٥٦٧٨٩", "0123456789")
# Everything that is not a word character, whitespace, or inside U+0600-U+06FF.
# Note that the range keeps Arabic-block punctuation such as U+060C and U+061F.
PUNCT = re.compile(r"[^\w\s\u0600-\u06FF]") # Delete some non alphanumeric punctuations
_WHITESPACE_RUN = re.compile(r"\s+")
# The three mappings have disjoint keys, so one combined table gives the same
# result as applying them one after another.
_EVAL_CHAR_TABLE = {**str.maketrans(ARABIC_TO_PERSIAN), **FA2EN, **AR2EN}

# ---------- Utility Functions ----------
def normalize_eval_text(s: str) -> str:
    """Canonicalize text before comparing predictions with gold answers.

    Steps, in order: ``None`` becomes ``""``; the value is converted with
    ``str`` and NFC-normalized; Arabic letter variants and non-ASCII digits
    are mapped; characters matched by ``PUNCT`` become spaces; whitespace
    runs collapse to one space; the result is stripped and lower-cased.
    """
    if s is None:
        return ""
    text = unicodedata.normalize("NFC", str(s))
    text = text.translate(_EVAL_CHAR_TABLE)
    text = PUNCT.sub(" ", text)
    collapsed = _WHITESPACE_RUN.sub(" ", text)
    return collapsed.strip().lower()

# Separators between alternative answers: "|", ";", a doubled or a single Persian comma,
# each with optional surrounding whitespace.
_ANSWER_SEPARATORS = re.compile(r"\s*\|\s*|\s*;\s*|\s*،،\s*|\s*،\s*")


def to_list_answers(ans: Union[str, List[str]]) -> List[str]:
    """Turn a gold-answer cell into a list of acceptable answers.

    ``None`` yields an empty list and a list is returned unchanged. Any other
    value is converted with ``str`` and split on the separators above; empty
    pieces are dropped. If nothing remains, the whole string is returned as
    the single answer.
    """
    if ans is None:
        return []
    if isinstance(ans, list):
        return ans
    text = str(ans)
    pieces = list(filter(None, _ANSWER_SEPARATORS.split(text)))
    return pieces or [text]

def exact_match(pred: str, golds: List[str]) -> bool:
    """Return True when the normalized prediction equals any normalized gold answer."""
    target = normalize_eval_text(pred)
    normalized_golds = [normalize_eval_text(gold) for gold in golds]
    return target in normalized_golds

def relaxed_match(pred: str, golds: List[str]) -> bool:
    """Return True when prediction and a gold answer are equal or contain one another.

    Both sides are normalized with ``normalize_eval_text`` first. Equal strings
    always match, so this accepts everything ``exact_match`` accepts.
    Containment is only checked between non-empty strings: the empty string is
    a substring of every string, so without that guard an empty prediction (or
    an empty gold answer) would count as a match against anything.
    """
    p = normalize_eval_text(pred)
    for gold in golds:
        g = normalize_eval_text(gold)
        if p == g:
            return True
        if p and g and (p in g or g in p):
            return True
    return False

In [None]:
# Build the answerer once and reuse it in every test cell below:
# the constructor loads the BLIP, mT5 and mBART models onto `device`.
RAG = HaystackRAG(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/268 [00:00<?, ?B/s]

# UniModal test without Retrieval

In [None]:
"""MT5 Model"""

model_name = "MT5_unimodal"
# NOTE(review): the configured path ends in ".json" but is read with
# pd.read_excel — confirm the file format.
data = pd.read_excel(uni_modal_test_data_path)
rows = []
invalid_answers = []

# --------- Test (Persian and English are plausible) ----------
for idx, (_label, row) in enumerate(data.iterrows()):
    question = row["سوال"]
    answers_lst = [row["گزینه 1"], row["گزینه 2"], row["گزینه 3"], row["گزینه 4"]]
    correct_answer = row["پاسخ صحیح"]
    category = row["موضوع"]
    related_doc = row["سند مرتبط"]
    raw_output = RAG.mt5_generate(question = question, answers = answers_lst, max_new_tokens=150)

    # The model returns free text. Accept it only when it names exactly one
    # option number (1-4) after mapping Persian/Arabic digits to ASCII.
    # The previous check compared a str against a list of ints and was never true.
    ascii_output = str(raw_output).translate(FA2EN).translate(AR2EN)
    option_digits = set(re.findall(r"(?<!\d)[1-4](?!\d)", ascii_output))
    if len(option_digits) != 1:
        invalid_answers.append(raw_output)
        continue

    pred_idx = int(option_digits.pop())
    pred = row["گزینه " + str(pred_idx)]
    # NOTE(review): this compares the chosen option's text with the gold
    # column — confirm that column stores the answer text, not its number.
    gold = to_list_answers(correct_answer)
    ok = exact_match(pred, gold) if MATCH_MODE == "exact" else relaxed_match(pred, gold)

    rows.append({
        "id": idx,
        "category": category,
        "question": question,
        # Stored under its own key; it used to share the "correct" key with
        # the score and was silently overwritten.
        "gold": correct_answer,
        "prediction": pred,
        "correct": int(ok)
    })


out = pd.DataFrame(rows)
# Micro accuracy (overall), computed over the rows with a parseable answer only
micro_acc = float(out["correct"].mean()) if len(out) else float("nan")

print(invalid_answers)
display(out)

# # Per-category accuracy
# by_cat = out.groupby(["category"])["correct"].agg(["mean", "count"]).reset_index()
# by_cat = by_cat.rename(columns={"mean": "accuracy", "count": "n"})

# # Macro accuracy (mean over categories)
# macro_acc = float(by_cat["accuracy"].mean()) if len(by_cat) else float("nan")

# errors = out[out["correct"] == 0].copy()

# # Save
# out.to_csv(SAVE_DIR / f"predictions_{model_name}.csv", index=False)
# errors.to_csv(SAVE_DIR / f"errors_{model_name}.csv", index=False)
# report = {
#     "model": model_name,
#     "overall": {
#         "micro_accuracy": micro_acc,
#         "macro_accuracy": macro_acc,
#         "n_samples": int(len(out))
#     },
#     "by_category": by_cat.to_dict(orient="records"),
#     "n_errors": int(len(errors))
# }
# with open(SAVE_DIR / f"report_{model_name}.json", "w", encoding="utf-8") as f:
#     json.dump(report, f, ensure_ascii=False, indent=2)

# print(f"\n=== {model_name} report ===")
# print(json.dumps(report, ensure_ascii=False, indent=2))


['<extra_id_0> برای ارسال نمونه', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> برای ارسال سوال: 1', '<extra_id_0> برای پاسخ به سوال', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> برای پاسخ به سوال', '<extra_id_0> برای ارسال نمونه:', '<extra_id_0> برای ارسال سوال:', '<extra_id_0> برای پاسخ به سوال', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> مینا احدی بیشتر در', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال سوال', '<extra_id_0> زهرا ساعی', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> زهرا مصطفوی', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال نمونه', '<extra_id_0> برای ارسال سوال', '<extra_id_0> برای ارسال 

In [None]:
"""MBART Model"""

model_name = "MBART_unimodal"
# NOTE(review): the configured path ends in ".json" but is read with
# pd.read_excel — confirm the file format.
data = pd.read_excel(uni_modal_test_data_path)
# RAG = HaystackRAG(device)
rows = []
invalid_answers = []

# --------- Test (Persian and English are plausible) ----------
for idx, (_label, row) in enumerate(data.iterrows()):
    question = row["سوال"]
    answers_lst = [row["گزینه 1"], row["گزینه 2"], row["گزینه 3"], row["گزینه 4"]]
    correct_answer = row["پاسخ صحیح"]
    category = row["موضوع"]
    related_doc = row["سند مرتبط"]
    raw_output = RAG.mbart_generate(question = question, answers = answers_lst, max_new_tokens=150)

    # The model returns free text. Accept it only when it names exactly one
    # option number (1-4) after mapping Persian/Arabic digits to ASCII.
    # The previous check compared a str against a list of ints and was never true.
    ascii_output = str(raw_output).translate(FA2EN).translate(AR2EN)
    option_digits = set(re.findall(r"(?<!\d)[1-4](?!\d)", ascii_output))
    if len(option_digits) != 1:
        invalid_answers.append(raw_output)
        continue

    pred_idx = int(option_digits.pop())
    pred = row["گزینه " + str(pred_idx)]
    # NOTE(review): this compares the chosen option's text with the gold
    # column — confirm that column stores the answer text, not its number.
    gold = to_list_answers(correct_answer)
    ok = exact_match(pred, gold) if MATCH_MODE == "exact" else relaxed_match(pred, gold)

    rows.append({
        "id": idx,
        "category": category,
        "question": question,
        # Stored under its own key; it used to share the "correct" key with
        # the score and was silently overwritten.
        "gold": correct_answer,
        "prediction": pred,
        "correct": int(ok)
    })


out = pd.DataFrame(rows)
# Micro accuracy (overall), computed over the rows with a parseable answer only
micro_acc = float(out["correct"].mean()) if len(out) else float("nan")

print(invalid_answers)
display(out)

# # Per-category accuracy
# by_cat = out.groupby("category")["correct"].agg(["mean", "count"]).reset_index()
# by_cat = by_cat.rename(columns={"mean": "accuracy", "count": "n"})

# # Macro accuracy (mean over categories)
# macro_acc = float(by_cat["accuracy"].mean()) if len(by_cat) else float("nan")

# errors = out[out["correct"] == 0].copy()

# # Save
# out.to_csv(SAVE_DIR / f"predictions_{model_name}.csv", index=False)
# errors.to_csv(SAVE_DIR / f"errors_{model_name}.csv", index=False)
# report = {
#     "model": model_name,
#     "overall": {
#         "micro_accuracy": micro_acc,
#         "macro_accuracy": macro_acc,
#         "n_samples": int(len(out))
#     },
#     "by_category": by_cat.to_dict(orient="records"),
#     "n_errors": int(len(errors))
# }
# with open(SAVE_DIR / f"report_{model_name}.json", "w", encoding="utf-8") as f:
#     json.dump(report, f, ensure_ascii=False, indent=2)

# print(f"\n=== {model_name} report ===")
# print(json.dumps(report, ensure_ascii=False, indent=2))


['You have to answer the question of how many choices are being sent to you, please read carefully the following question and choose the right one from the available choices: Question: Where was Yahya al-Ishaq born in? Choices: 1) Tehran 2) No. 3) Scene 4) Isfahan Please just give the number of the right choice (1, 2, 3 and 4) and no further explanation Output Sample: 2', 'You have to answer the question of how many options are being sent to you, please read carefully the question below and choose the correct option from the options provided. Question: What period did Hajj Babakhan of Pahlavi operate in? Options: 1) the Kamakura period 2) the Provisional movement 3) the first wave period 4) the Islamic revolution Please just give the number of the correct option (1, 2, 3 and 4) and no additional explanation is given Output Sample: 2', "You have to answer the question of how many options are being sent to you, and please read carefully the following question and choose the right one out

# MultiModal test without Retrieval

In [None]:
"""MT5 Model"""

model_name = "MT5_multimodal"
# NOTE(review): the configured path ends in ".json" but is read with
# pd.read_excel — confirm the file format.
multimodal_data = pd.read_excel(multi_modal_test_data_path)
# RAG2 = HaystackRAG(device)
rows = []
invalid_answers = []

# --------- Test (Persian and English are plausible) ----------
for idx, (_label, row) in enumerate(multimodal_data.iterrows()):
    question = row["سوال"]
    character_name = row["سند مرتبط"] # related_doc
    file_num = row["فایل تصویر"]
    # NOTE(review): images_parent_directory is not defined in the cells shown
    # here; it must exist before this cell runs.
    image_path = images_parent_directory + file_num[:-4] + "_" + character_name + ".jpg"
    answers_lst = [row["گزینه 1"], row["گزینه 2"], row["گزینه 3"], row["گزینه 4"]]
    # Gold answer and category come from the multimodal sheet; they used to be
    # read from `data`, the unimodal frame loaded by an earlier cell.
    correct_answer = row["پاسخ صحیح"]
    category = row["موضوع"]
    raw_output = RAG.mt5_multimodal_generate(question = question, image_path = image_path, answers = answers_lst, max_new_tokens=150)

    # The model returns free text. Accept it only when it names exactly one
    # option number (1-4) after mapping Persian/Arabic digits to ASCII.
    # The previous check compared a str against a list of ints and was never true.
    ascii_output = str(raw_output).translate(FA2EN).translate(AR2EN)
    option_digits = set(re.findall(r"(?<!\d)[1-4](?!\d)", ascii_output))
    if len(option_digits) != 1:
        invalid_answers.append(raw_output)
        continue

    pred_idx = int(option_digits.pop())
    pred = row["گزینه " + str(pred_idx)]
    # NOTE(review): this compares the chosen option's text with the gold
    # column — confirm that column stores the answer text, not its number.
    gold = to_list_answers(correct_answer)
    ok = exact_match(pred, gold) if MATCH_MODE == "exact" else relaxed_match(pred, gold)
    rows.append({
        "id": idx,
        "category": category,
        "question": question,
        # Stored under its own key; it used to share the "correct" key with
        # the score and was silently overwritten.
        "gold": correct_answer,
        "prediction": pred,
        "correct": int(ok)
    })

out = pd.DataFrame(rows)
# Micro accuracy (overall), computed over the rows with a parseable answer only
micro_acc = float(out["correct"].mean()) if len(out) else float("nan")

print(invalid_answers)
display(out)

# # Per-category accuracy
# by_cat = out.groupby("category")["correct"].agg(["mean", "count"]).reset_index()
# by_cat = by_cat.rename(columns={"mean": "accuracy", "count": "n"})

# # Macro accuracy (mean over categories)
# macro_acc = float(by_cat["accuracy"].mean()) if len(by_cat) else float("nan")

# errors = out[out["correct"] == 0].copy()

# # Save
# out.to_csv(SAVE_DIR / f"predictions_{model_name}.csv", index=False)
# errors.to_csv(SAVE_DIR / f"errors_{model_name}.csv", index=False)
# report = {
#     "model": model_name,
#     "overall": {
#         "micro_accuracy": micro_acc,
#         "macro_accuracy": macro_acc,
#         "n_samples": int(len(out))
#     },
#     "by_category": by_cat.to_dict(orient="records"),
#     "n_errors": int(len(errors))
# }
# with open(SAVE_DIR / f"report_{model_name}.json", "w", encoding="utf-8") as f:
#     json.dump(report, f, ensure_ascii=False, indent=2)

# print(f"\n=== {model_name} report ===")
# print(json.dumps(report, ensure_ascii=False, indent=2))


FileNotFoundError: [Errno 2] No such file or directory: './HW3_NLP/images/2_0_علی آقامحمدی.jpg'

In [None]:
"""MBART Model"""

model_name = "MBART_multimodal"
# NOTE(review): the configured path ends in ".json" but is read with
# pd.read_excel — confirm the file format.
multimodal_data = pd.read_excel(multi_modal_test_data_path)
# RAG2 = HaystackRAG(device)
rows = []
invalid_answers =[]

# --------- Test (Persian and English are plausible) ----------
for idx, (_label, row) in enumerate(multimodal_data.iterrows()):
    question = row["سوال"]
    character_name = row["سند مرتبط"] # related_doc
    file_num = row["فایل تصویر"]
    # NOTE(review): images_parent_directory is not defined in the cells shown
    # here; it must exist before this cell runs.
    image_path = images_parent_directory + file_num[:-4] + "_" + character_name + ".jpg"
    answers_lst = [row["گزینه 1"], row["گزینه 2"], row["گزینه 3"], row["گزینه 4"]]
    # Gold answer and category come from the multimodal sheet; they used to be
    # read from `data`, the unimodal frame loaded by an earlier cell.
    correct_answer = row["پاسخ صحیح"]
    category = row["موضوع"]
    raw_output = RAG.mbart_multimodal_generate(question = question, image_path = image_path, answers = answers_lst, max_new_tokens=150)

    # The model returns free text. Accept it only when it names exactly one
    # option number (1-4) after mapping Persian/Arabic digits to ASCII.
    # The previous check compared a str against a list of ints and was never true.
    ascii_output = str(raw_output).translate(FA2EN).translate(AR2EN)
    option_digits = set(re.findall(r"(?<!\d)[1-4](?!\d)", ascii_output))
    if len(option_digits) != 1:
        invalid_answers.append(raw_output)
        continue

    pred_idx = int(option_digits.pop())
    pred = row["گزینه " + str(pred_idx)]
    # NOTE(review): this compares the chosen option's text with the gold
    # column — confirm that column stores the answer text, not its number.
    gold = to_list_answers(correct_answer)
    ok = exact_match(pred, gold) if MATCH_MODE == "exact" else relaxed_match(pred, gold)
    rows.append({
        "id": idx,
        "category": category,
        "question": question,
        # Stored under its own key; it used to share the "correct" key with
        # the score and was silently overwritten.
        "gold": correct_answer,
        "prediction": pred,
        "correct": int(ok)
    })

out = pd.DataFrame(rows)
# Micro accuracy (overall), computed over the rows with a parseable answer only
micro_acc = float(out["correct"].mean()) if len(out) else float("nan")

print(invalid_answers)
display(out)

# # Per-category accuracy
# by_cat = out.groupby("category")["correct"].agg(["mean", "count"]).reset_index()
# by_cat = by_cat.rename(columns={"mean": "accuracy", "count": "n"})

# # Macro accuracy (mean over categories)
# macro_acc = float(by_cat["accuracy"].mean()) if len(by_cat) else float("nan")

# errors = out[out["correct"] == 0].copy()

# # Save
# out.to_csv(SAVE_DIR / f"predictions_{model_name}.csv", index=False)
# errors.to_csv(SAVE_DIR / f"errors_{model_name}.csv", index=False)
# report = {
#     "model": model_name,
#     "overall": {
#         "micro_accuracy": micro_acc,
#         "macro_accuracy": macro_acc,
#         "n_samples": int(len(out))
#     },
#     "by_category": by_cat.to_dict(orient="records"),
#     "n_errors": int(len(errors))
# }
# with open(SAVE_DIR / f"report_{model_name}.json", "w", encoding="utf-8") as f:
#     json.dump(report, f, ensure_ascii=False, indent=2)

# print(f"\n=== {model_name} report ===")
# print(json.dumps(report, ensure_ascii=False, indent=2))