In [3]:
import json
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from lxml import etree

In [5]:
# === Load model once ===
MODEL_NAME = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Device: {device}")
model.to(device)

Device: mps


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, eleme

In [1]:
WIKI_PATH = "cswiki-latest-pages-articles.xml"
PASSAGE_FILE = "npfl140/data/wiki_passages_with_embeddings.jsonl"

In [7]:
# === Query embedding ===
def encode_query(query: str) -> np.ndarray:
    with torch.no_grad():
        inputs = tokenizer("query: " + query, return_tensors="pt", truncation=True, max_length=512).to(device)
        output = model(**inputs)
        mask = inputs["attention_mask"].unsqueeze(-1).bool()
        masked = output.last_hidden_state.masked_fill(~mask, 0.0)
        pooled = masked.sum(1) / mask.sum(1)
        return F.normalize(pooled, p=2, dim=1)[0].cpu().numpy()

# === Load .jsonl passages ===
def load_passages(jsonl_path: str):
    passages = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            obj["embedding"] = np.array(obj["embedding"], dtype=np.float32)
            passages.append(obj)
    return passages

# === Find best match ===
def find_best_passage(query_vec, passages, top_k=1):
    matrix = np.stack([p["embedding"] for p in passages])
    scores = np.dot(matrix, query_vec)
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return passages[top_indices[0]], scores[top_indices[0]]

# === XML lookup ===
def get_page_by_title(xml_path, search_title):
    context = etree.iterparse(xml_path, events=('end',), tag='{*}page')
    for _, elem in context:
        title = elem.findtext('{*}title')
        if title == search_title:
            text_elem = elem.find('.//{*}revision/{*}text')
            text = text_elem.text if text_elem is not None else ""
            return title, text.strip()
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    return None, None

# === MAIN FUNCTION ===
def answer_query(query: str,
                 jsonl_path: str = PASSAGE_FILE,
                 xml_path: str = WIKI_PATH) -> dict:
    passages = load_passages(jsonl_path)
    qvec = encode_query(query)
    match, score = find_best_passage(qvec, passages)

    page_title = match["title"].replace("_", " ")
    full_title, full_text = get_page_by_title(xml_path, page_title)

    return {
        "query": query,
        "matched_title": full_title,
        "matched_passage": match["passage"],
        "score": float(score),
        "full_page_text": full_text or "(Page not found)"
    }

In [11]:
result = answer_query("Po jaké rostlině je pojmenovaný měsíční časopis určený pro školáky a předškoláky?",
                      xml_path=WIKI_PATH, jsonl_path=PASSAGE_FILE)

print("🔍 Query:", result["query"])
print("✅ Matched Title:", result["matched_title"])
print("⭐ Score:", result["score"])
print("\n📌 Top Passage:\n", result["matched_passage"])
print("\n📖 Full Article Snippet:\n", result["full_page_text"][:1000])


🔍 Query: Po jaké rostlině je pojmenovaný měsíční časopis určený pro školáky a předškoláky?
✅ Matched Title: Boleslav I.
⭐ Score: 23.140743255615234

📌 Top Passage:
 " |&nbsp; | | style="width:10%;" |&nbsp; | style="width:10%;" |&nbsp; |- style="text-align:left;" | style="width:10%;" |&nbsp; | style="width:10%;border-left:1px solid black;border-top:1px solid black;" |&nbsp; | style="width:10%;border-top:1px solid black;" |&nbsp; | style="width:10%;border-left:1px solid black;border-top:1px solid black;" |&nbsp; | style="width:10%;border-top:1px solid black;" |&nbsp; | style="width:10%;border-left:1px solid black;border-top:1px solid black;" |&nbsp; | style="width:10%;border-top:1px solid black;" |&nbsp; | style="width:10%;border-left:1px solid black;" |&nbsp; 

📖 Full Article Snippet:
 {{Infobox - panovník
 | jméno = Boleslav I.
 | titul = Český kníže
 | vláda = zhruba [[935]]–[[967]]/[[972]]
 | datum korunovace = 
 | tituly = 
 | celé jméno = Boleslav I. Ukrutný
 | předchůdce = [[Svatý