# Imports

In [None]:
# !pip install langchain langchain-community chromadb sentence-transformers transformers accelerate torch --quiet

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [None]:
from typing import List, Dict, Any, Union, Callable
from tqdm.auto import tqdm
from pathlib import Path
from PIL import Image
import os, re, json, unicodedata
import numpy as np
import pandas as pd
import torch

# Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IPython magics: switch the working directory to the questions folder on Drive,
# then list its contents.
%cd /content/drive/MyDrive/project/Questions/
%ls

# Device

In [None]:
# Pick the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Configs and Filenames

In [None]:
# --- Run settings ---
MATCH_MODE: str = "exact"  # one of "exact" or "relaxed"
# NOTE(review): presumably the mBART language code used as the default; confirm at the call site.
DEFAULT_LANG_CODE_MBART: str = "fa_IR"

# --- Locations ---
SAVE_DIR: str = "/content/drive/MyDrive/project/Baseline/"

# Test-set filenames are relative paths (resolved against the current working directory).
uni_modal_test_data_path: str = "mcq_questions_90.json"
multi_modal_test_data_path: str = "mcq_with_image_40.json"

In [None]:

# --------- 1) Your multilingual corpus ----------
# Tiny mixed Persian/English demo corpus; each passage carries its language in metadata.
docs = [
    Document(page_content="امیرکبیر بنیان‌گذار دارالفنون بود و در دوره ناصرالدین‌شاه صدراعظم شد.", metadata={"lang":"fa"}),
    Document(page_content="Nima Yooshij is considered the father of modern Persian poetry.", metadata={"lang":"en"}),
    Document(page_content="غلامرضا تختی قهرمان کشتی آزاد ایران و پهلوان نامی بود.", metadata={"lang":"fa"}),
    Document(page_content="Hafez is a celebrated Persian poet from Shiraz.", metadata={"lang":"en"}),
]

# --------- 2) Multilingual embeddings + Vector store (Chroma) ----------
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
# Deterministic ids keep this cell idempotent: without them every run gets fresh
# random ids, so re-running the cell can append the same passages to the "bios"
# collection again and fill the top-k results with duplicates.
vectordb = Chroma.from_documents(
    docs,
    emb,
    ids=[f"bio-{i}" for i in range(len(docs))],
    collection_name="bios",
)

# Simple retriever: return the 3 nearest passages for each query.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

# --------- 3) Multilingual generator (mT5) ----------
# Recomputed here so this cell does not depend on an earlier cell having run.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): google/mt5-base is published as a pre-trained-only checkpoint (no
# supervised fine-tuning), so free-form prompted answers are likely to be poor;
# consider a fine-tuned / instruction-tuned seq2seq checkpoint.
tok = AutoTokenizer.from_pretrained("google/mt5-base")
gen = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base").to(device)

def rag_answer_lc(query, max_new_tokens=64):
    """Answer ``query`` with a minimal retrieve-then-generate pipeline.

    Passages are fetched from the module-level ``retriever``, joined into a
    prompt, and a single sequence is generated with the module-level ``gen``
    model and decoded with ``tok``.

    Args:
        query: Question text; used both for retrieval and inside the prompt.
        max_new_tokens: Generation length cap forwarded to ``gen.generate``.

    Returns:
        A ``(answer, hits)`` tuple: the decoded text with special tokens
        stripped, and the retrieved documents that formed the context.
    """
    # Retrieve context. `invoke` is the current retriever entry point;
    # `get_relevant_documents` is deprecated and only kept as a fallback for
    # retriever objects that predate the Runnable interface.
    if hasattr(retriever, "invoke"):
        hits = retriever.invoke(query)
    else:
        hits = retriever.get_relevant_documents(query)
    ctx = "\n\n".join(d.page_content for d in hits)

    # Minimal prompt
    prompt = (
        "Question: {q}\n"
        "Relevant passages:\n{ctx}\n\n"
        "Answer briefly in the question's language:\n"
    ).format(q=query, ctx=ctx)

    # Send the inputs to wherever the model actually lives, so a rebound global
    # `device` can never cause a device mismatch between inputs and weights.
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=1024).to(gen.device)
    out_ids = gen.generate(**inputs, max_new_tokens=max_new_tokens)
    ans = tok.decode(out_ids[0], skip_special_tokens=True)
    return ans, hits

# --------- 4) Test ----------
# One Persian and one English question; `ans` / `hits` end up holding the
# results of the last query, as before.
for demo_query in ("محل تولد حافظ کجاست؟", "Who founded Dar ul-Funun?"):
    ans, hits = rag_answer_lc(demo_query)
    print("Answer:", ans)

In [None]:
# Alternative vector store: FAISS in place of Chroma.
# Requires: pip install faiss-cpu
from langchain_community.vectorstores import FAISS

# Rebinding the globals means `rag_answer_lc`, which looks up `retriever` at call
# time, uses the FAISS-backed retriever from here on.
vectordb = FAISS.from_documents(documents=docs, embedding=emb)
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))