Hybrid search with Qdrant

In [1]:
!docker run -d -p 6333:6333 -p 6334:6334 \
   -v "./qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant

f1f6eaa4905fc5b6b2d84f6df920a9f57e0381e2c8787f54cdf7229d9e3d4fb7


docker: Error response from daemon: failed to set up container networking: driver failed programming external connectivity on endpoint funny_germain (d015b13f5eda531498042a2abfdb02b02230ccc9245fc6c8333791d090173e52): Bind for 0.0.0.0:6333 failed: port is already allocated

Run 'docker run --help' for more information


In [2]:
!python -m pip install -q "qdrant-client[fastembed]>=1.14.2"

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.http import models as models
from fastembed import TextEmbedding, SparseTextEmbedding
from tqdm import tqdm
import os
import json



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
client = QdrantClient("http://localhost:6333")
client.get_collections()

CollectionsResponse(collections=[])

Ingestion

load FIQA Corpus

In [5]:
def load_corpus(path):
    corpus = {}
    ext = os.path.splitext(path.lower())[1]
    with open(path, "r", encoding="utf-8") as f:
        if ext == ".jsonl":
            raw = [json.loads(line) for line in f if line.strip()]
        elif ext == ".json":
            raw = json.load(f)
        else:
            raise ValueError(f"Unsupported file extension: {ext}")

    if isinstance(raw, dict):   # already keyed by doc_id
        for k, v in raw.items():
            corpus[str(k)] = {
                "title": v.get("title", ""),
                "text": v.get("text", ""),
                "metadata": v.get("metadata", {}) or {},
            }
    else:  # list of objects with _id
        for obj in raw:
            did = str(obj["_id"])
            corpus[did] = {
                "title": obj.get("title", ""),
                "text": obj.get("text", ""),
                "metadata": obj.get("metadata", {}) or {},
            }
    return corpus

Load Fiqa Queries

In [6]:
def load_queries(path):
    queries = {}
    ext = os.path.splitext(path.lower())[1]
    with open(path, "r", encoding="utf-8") as f:
        if ext == ".jsonl":
            raw = [json.loads(line) for line in f if line.strip()]
        elif ext == ".json":
            raw = json.load(f)
        else:
            raise ValueError(f"Unsupported file extension: {ext}")

    if isinstance(raw, dict):
        for k, v in raw.items():
            if isinstance(v, str):
                queries[str(k)] = v
            else:
                queries[str(k)] = v.get("text", "")
    else:
        for obj in raw:
            qid = str(obj["_id"])
            queries[qid] = obj.get("text", "")
    return queries

In [7]:
corpus = load_corpus("data/corpus.json")
queries = load_queries("data/queries.json")

Load Qrels

In [8]:
def load_qrels_tsv(path):
    df = pd.read_csv(path, sep="\t", dtype=str, header=0)
    cols = {c.lower().strip(): c for c in df.columns}

    def pick(*cands):
        for c in cands:
            if c in cols:
                return cols[c]
        raise ValueError(f"Missing columns in {path}")

    qcol = pick("query-id", "qid", "query_id")
    dcol = pick("corpus-id", "docid", "doc-id", "doc_id")
    scol = pick("score", "rel", "relevance")

    df[scol] = df[scol].astype(int)

    qrels = {}
    for _, row in df.iterrows():
        qid = str(row[qcol])
        did = str(row[dcol])
        score = int(row[scol])
        qrels.setdefault(qid, {})[did] = score
    return qrels

In [9]:
import pandas as pd

In [10]:
qrels_train = load_qrels_tsv("fiqa/qrels/train.tsv")
qrels_dev = load_qrels_tsv("fiqa/qrels/dev.tsv")
qrels_test = load_qrels_tsv("fiqa/qrels/test.tsv")

Merge all qrels in one File

In [11]:
def merge_qrels(*splits):
    merged = {}
    for qrels in splits:
        for qid, docs in qrels.items():
            dst = merged.setdefault(qid, {})
            for did, sc in docs.items():
                dst[did] = max(sc, dst.get(did, 0))
    return merged

qrels_all = merge_qrels(qrels_train, qrels_dev, qrels_test)


In [12]:
import json
import random

with open("data/corpus.json", "r", encoding="utf-8") as f:
    docs = json.load(f)  

In [13]:
len(docs)

57638

In [14]:
def load_qrels_tsv(path):
    df = pd.read_csv(path, sep="\t", dtype=str, header=0)
    cols = {c.lower().strip(): c for c in df.columns}

    def pick(*cands):
        for c in cands:
            if c in cols:
                return cols[c]
        raise ValueError(f"Missing columns in {path}")

    qcol = pick("query-id", "qid", "query_id")
    dcol = pick("corpus-id", "docid", "doc-id", "doc_id")
    scol = pick("score", "rel", "relevance")

    df[scol] = df[scol].astype(int)

    qrels = {}
    for _, row in df.iterrows():
        qid = str(row[qcol])
        did = str(row[dcol])
        score = int(row[scol])
        qrels.setdefault(qid, {})[did] = score
    return qrels

In [15]:
qrels_df = pd.read_csv("data/qrels_all.csv")

Embed Corpus Once

In [16]:
from fastembed import TextEmbedding
import math
import os

model = os.getenv("EMBED_MODEL", "BAAI/bge-small-en-v1.5")
embedder = TextEmbedding(model_name=model)

def bge_doc(text: str) -> str:
    return f"Represent this sentence for retrieval: {text}"

def l2(v):
    n = math.sqrt(sum(x*x for x in v)) or 1.0
    return [float(x/n) for x in (v.tolist() if hasattr(v, "tolist") else list(v))]

BATCH_EMBED = 4096
dense_vecs = []

for i in range(0, len(docs), BATCH_EMBED):
    batch_docs = docs[i:i+BATCH_EMBED]
    batch_texts = [bge_doc(d.get("text", "")) for d in batch_docs]

    for vec in embedder.embed(batch_texts):
        dense_vecs.append(l2(vec))

DENSE_DIM = len(dense_vecs[0])  


Create Collection

In [17]:
from qdrant_client import models
import uuid


client.delete_collection(collection_name="fiqa-hybrid")


assert len(dense_vecs) > 0
DENSE_DIM = len(dense_vecs[0])  

client.recreate_collection(
    collection_name="fiqa-hybrid",
    vectors_config={
        "dense": models.VectorParams(size=DENSE_DIM, distance=models.Distance.COSINE),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            index=models.SparseIndexParams(on_disk=True),
            modifier=models.Modifier.IDF,
        )
    },
)



  client.recreate_collection(


True

Upsert in Batch to qdrant

In [18]:
from qdrant_client import models
import uuid as _uuid
import math

def l2(v):
    if hasattr(v, "tolist"): v = v.tolist()
    n = math.sqrt(sum(x*x for x in v)) or 1.0
    return [float(x/n) for x in v]

def make_point_id(corpus_id_raw):
    s = str(corpus_id_raw).strip()
    if not s:
        return None
    
    if s.isdigit():
        return int(s)
    
    try:
        return str(_uuid.UUID(s))
    except Exception:

        return str(_uuid.uuid5(_uuid.NAMESPACE_URL, s))

BATCH_UPSERT = 512
points = []

for d, vec in zip(docs, dense_vecs):
    text = (d.get("text") or "").strip()
    if not text:
        continue

    corpus_id_raw = d.get("_id") 
    point_id = make_point_id(corpus_id_raw)
    if point_id is None:
        continue  

    dense_list = l2(vec)

    points.append(
        models.PointStruct(
            id=point_id,                       
            vector={
                "dense": dense_list,
                "bm25": models.Document(       
                    text=text,
                    model="qdrant/bm25",
                ),
            },
            payload={
                "text": text,
                "id": str(corpus_id_raw),      
            },
        )
    )

    if len(points) >= BATCH_UPSERT:
        client.upsert(collection_name="fiqa-hybrid", points=points, wait=True)
        points = []

if points:
    client.upsert(collection_name="fiqa-hybrid", points=points, wait=True)



Sparse Search with BM25

In [19]:
def search_sparse(query: str, k: int = 10) -> list[models.ScoredPoint]:
    search_results = client.query_points(
        collection_name="fiqa-hybrid",
        query=models.Document(
            text=query,
            model="qdrant/bm25",
        ),
        using="bm25",
        limit=k,
        with_payload=True,
    )

    return search_results.points

Example

In [20]:
result = search_sparse("How can I register a UK business without providing a business address?", k=5)
print(result[0].payload["text"])

You don't have to provide your personal home address per se. You can provide a legal address where Companies house can send across paper correspondence to. Companies house legally requires an address because directors are liable to their shareholders(even if you are the only shareholder) and to stop them from disappearing just like that with shareholder's money. Moreover your birth date will also be visible on websites which provide comapnies information. You can ask these websites to stop sharing your personal information. Every company must have a registered office within the UK which is the official legal address of the company. It must be a physical address (i.e. not a PO Box without a physical location) as Companies House will use this address to send correspondence to. To incorporate a private limited company you need at least one director, who has to be over 16 years of age. You may also have a secretary, but this is optional. The information you will need to supply for each off

Dense Search with Bge

In [21]:
from qdrant_client import models
import math

# match indexing: use the DOC prompt for queries
def embed_query(text: str):
    s = f"Represent this sentence for retrieval: {text}"
    v = list(embedder.embed([s]))[0]
    if isinstance(v, dict) and "embedding" in v:
        v = v["embedding"]
    v = list(v)
    n = math.sqrt(sum(x*x for x in v)) or 1.0
    return [x / n for x in v]

def search_dense(text: str, k: int = 10, exact: bool = True):
    qvec = embed_query(text)
    return client.query_points(
        collection_name="fiqa-hybrid",
        query=qvec,
        using="dense",
        with_payload=True,
        limit=k,
        search_params=models.SearchParams(exact=exact) 
    ).points


Example

In [22]:
result = search_dense("what is compound interest",k=5)
print(result[0].payload["text"])

Compound interest means that the interest in each time period is calculated taking into account previously earned interest and not only the initial sum. Thus, if you had $1000 and invested it so that you'd earn 5% each year, than if you would withdraw the earnings each year you in 30 years you would earn 0.05*30*1000 = $1500, so summarily you'd have $2500, or 150% profit. However, if you left all the money to earn interest - including the interest money - then at the end of 30 years you'd have $4321 - or 330% profit.  This is why compound interest is so important - the interest on the earned interest makes money grow significantly faster. On the other hand, the same happens if you owe money - the interest on the money owed is added to the initial sum and so the whole sum owed grows quicker.  Compound interest is also important when calculating interest by time periods. For example, if you are told the loan accumulates 1% interest monthly, you may think it's 12% yearly. However, it is n

Hybrid(Reciprocal Rank Fusion of  both Lists)

In [23]:
def rrf_search(query: str, limit: int = 10, dense_k: int = 200, sparse_k: int = 400):
    qvec = embed_query(query)
    res = client.query_points(
        collection_name="fiqa-hybrid",
        prefetch=[
            models.Prefetch(query=qvec, using="dense", limit=dense_k),
            models.Prefetch(query=models.Document(text=query, model="qdrant/bm25"),
                            using="bm25", limit=sparse_k),
        ],
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True,
        limit=limit,  # final cutoff after fusion
    )
    return res.points

Example

In [24]:
result = rrf_search("what is compound interest",limit=5)
print(result[0].payload["text"])

Compound interest means that the interest in each time period is calculated taking into account previously earned interest and not only the initial sum. Thus, if you had $1000 and invested it so that you'd earn 5% each year, than if you would withdraw the earnings each year you in 30 years you would earn 0.05*30*1000 = $1500, so summarily you'd have $2500, or 150% profit. However, if you left all the money to earn interest - including the interest money - then at the end of 30 years you'd have $4321 - or 330% profit.  This is why compound interest is so important - the interest on the earned interest makes money grow significantly faster. On the other hand, the same happens if you owe money - the interest on the money owed is added to the initial sum and so the whole sum owed grows quicker.  Compound interest is also important when calculating interest by time periods. For example, if you are told the loan accumulates 1% interest monthly, you may think it's 12% yearly. However, it is n

Retrieval Evaluation

In [25]:
q = pd.DataFrame(list(queries.items()), columns=["id", "query"])

In [26]:
queries = q.loc[:,["id","query"]].to_dict(orient="records")

Retrieval Evaluation Functions

In [27]:
def hit_rate(relevance_total):
    if not relevance_total:
        return 0.0
    cnt = 0
    for line in relevance_total:
        # hit if there's at least one relevant item anywhere
        found = False
        for rel in line:
            if rel is True:
                found = True
                break
        if found:
            cnt = cnt + 1
    return cnt / len(relevance_total)


def mrr(relevance_total):
    # Mean Reciprocal Rank using ONLY the first relevant item per query
    if not relevance_total:
        return 0.0
    total_score = 0.0
    for line in relevance_total:
        rr = 0.0
        for rank in range(len(line)):
            if line[rank] is True:
                rr = 1.0 / (rank + 1)
                break  # only the first relevant
        total_score = total_score + rr
    return total_score / len(relevance_total)


def precision_at_k(relevance_total, k=None):
    if not relevance_total:
        return 0.0
    total = 0.0
    for line in relevance_total:
        if not line:
            continue
        kk = len(line) if k is None else min(k, len(line))
        hits = 0
        for i in range(kk):
            if line[i] is True:
                hits = hits + 1
        # precision = hits / retrieved (kk)
        total = total + (hits / kk if kk > 0 else 0.0)
    return total / len(relevance_total)


def recall_at_k(relevance_total, k=None):
    if not relevance_total:
        return 0.0
    total = 0.0
    for line in relevance_total:
        # total relevant in the list
        total_rel = 0
        for r in line:
            if r is True:
                total_rel = total_rel + 1
        if total_rel == 0:
            # by convention, if there are no relevant items, recall contributes 0
            continue
        kk = len(line) if k is None else min(k, len(line))
        hits_at_k = 0
        for i in range(kk):
            if line[i] is True:
                hits_at_k = hits_at_k + 1
        total = total + (hits_at_k / total_rel)
    return total / len(relevance_total)


def average_precision_at_k(relevance_total, k=None):
    # Computes AP per query and returns the mean (MAP@k when aggregated)
    if not relevance_total:
        return 0.0
    total_map = 0.0
    for line in relevance_total:
        # count total relevant (denominator)
        total_rel = 0
        for r in line:
            if r is True:
                total_rel = total_rel + 1
        if total_rel == 0:
            # if no relevant items exist, AP is 0 for that query
            continue
        kk = len(line) if k is None else min(k, len(line))
        ap_sum = 0.0
        hits = 0
        for i in range(kk):
            if line[i] is True:
                hits = hits + 1
                # precision at this rank (i+1)
                prec = hits / (i + 1)
                ap_sum = ap_sum + prec
        total_map = total_map + (ap_sum / total_rel if total_rel > 0 else 0.0)
    return total_map / len(relevance_total)


def map_at_k(relevance_total, k=None):
    # Alias around average_precision_at_k for clarity
    return average_precision_at_k(relevance_total, k=k)


def dcg_at_k(relevance_total, k=None):
    # Average DCG across queries (binary relevance)
    if not relevance_total:
        return 0.0
    import math
    total = 0.0
    for line in relevance_total:
        kk = len(line) if k is None else min(k, len(line))
        dcg = 0.0
        for i in range(kk):
            rel = 1.0 if line[i] is True else 0.0
            denom = math.log2(i + 2)  # log2(rank + 1)
            dcg = dcg + (rel / denom)
        total = total + dcg
    return total / len(relevance_total)


def ndcg_at_k(relevance_total, k=None):
    # Average NDCG across queries (binary relevance)
    if not relevance_total:
        return 0.0
    import math
    total = 0.0
    for line in relevance_total:
        kk = len(line) if k is None else min(k, len(line))
        # DCG
        dcg = 0.0
        for i in range(kk):
            rel = 1.0 if line[i] is True else 0.0
            dcg = dcg + (rel / math.log2(i + 2))
        # IDCG: ideal ordering puts all True first
        # count how many relevant in the top k (at most kk)
        total_rel = 0
        for r in line:
            if r is True:
                total_rel = total_rel + 1
        ideal_hits = min(total_rel, kk)
        idcg = 0.0
        for i in range(ideal_hits):
            idcg = idcg + (1.0 / math.log2(i + 2))
        ndcg = 0.0 if idcg == 0.0 else (dcg / idcg)
        total = total + ndcg
    return total / len(relevance_total)


def f1_at_k(relevance_total, k=None):
    # Harmonic mean of precision@k and recall@k, averaged over queries
    if not relevance_total:
        return 0.0
    total_f1 = 0.0
    for line in relevance_total:
        # precision@k
        kk = len(line) if k is None else min(k, len(line))
        hits_at_k = 0
        for i in range(kk):
            if line[i] is True:
                hits_at_k = hits_at_k + 1
        prec = (hits_at_k / kk) if kk > 0 else 0.0
        # recall@k
        total_rel = 0
        for r in line:
            if r is True:
                total_rel = total_rel + 1
        rec = (hits_at_k / total_rel) if total_rel > 0 else 0.0
        # F1
        if prec + rec == 0.0:
            f1 = 0.0
        else:
            f1 = 2 * prec * rec / (prec + rec)
        total_f1 = total_f1 + f1
    return total_f1 / len(relevance_total)


In [28]:
from tqdm.auto import tqdm

# Normalize a value to a string 
def _to_str(x):
    return None if x is None else str(x)

# Extract a stable document ID from a result item (prefers payload["id"], with safe fallbacks).
def _payload_id(p):
    payload = getattr(p, "payload", None)
    if payload is None and isinstance(p, dict):
        payload = p.get("payload")
    if isinstance(payload, dict) and "id" in payload:
        return _to_str(payload["id"])
    # fallbacks (usually not needed)
    if hasattr(p, "id"): return _to_str(getattr(p, "id"))
    if isinstance(p, dict) and "id" in p: return _to_str(p["id"])
    return None

# Return the set of relevant (positive) doc IDs for a given query ID from qrels.
def _gold_ids_for(qid, qrels_dev):
    rels = qrels_dev.get(str(qid), {})
    return { _to_str(docid) for docid, rel in rels.items() if rel > 0 }

# Evaluate retrieval at cutoff K using all modes (dense/sparse/rrf) and compute standard IR metrics.
def evaluate_fiqa(queries, qrels_dev, K, mode="dense"):
    """
    mode: "dense" | "sparse" | "rrf"
    Uses the same K for retrieval and metrics.
    """
    if mode == "dense":
        search = lambda text: search_dense(text, k=K)
    elif mode == "sparse":
        search = lambda text: search_sparse(text, k=K)
    elif mode == "rrf":
        search = lambda text: rrf_search(text, limit=K)
    else:
        raise ValueError("mode must be 'dense', 'sparse', or 'rrf'")

    relevance_total = []
    total_q = 0
    missing_gold = 0

    for q in tqdm(queries, desc=f"Evaluating ({mode}@{K})"):
        qid = q["id"]
        gold = _gold_ids_for(qid, qrels_dev)
        if not gold:
            missing_gold += 1
        total_q += 1

        results = search(q["query"])
        line = []
        for p in results:
            pid = _payload_id(p)
            line.append(pid in gold)
        relevance_total.append(line)

    return {
        "k": K,
        "mode": mode,
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
        "precision@k": precision_at_k(relevance_total, k=K),
        "recall@k": recall_at_k(relevance_total, k=K),
        "map@k": map_at_k(relevance_total, k=K),
        "ndcg@k": ndcg_at_k(relevance_total, k=K),
        "f1@k": f1_at_k(relevance_total, k=K),
        "queries": total_q,
        "queries_without_gold": missing_gold,
    }


K = 10 



In [29]:
# keep only queries that exist in qrels_dev 
queries_with_gold = [
    q for q in queries
    if str(q["id"]) in qrels_dev and any(v > 0 for v in qrels_dev[str(q["id"])].values())
]

K = 10
dense_scores  = evaluate_fiqa(queries_with_gold, qrels_dev, K, mode="dense")
sparse_scores = evaluate_fiqa(queries_with_gold, qrels_dev, K, mode="sparse")
rrf_scores    = evaluate_fiqa(queries_with_gold, qrels_dev, K, mode="rrf")

print("DENSE :", dense_scores)
print("SPARSE:", sparse_scores)
print("RRF   :", rrf_scores)


Evaluating (dense@10): 100%|██████████| 500/500 [00:34<00:00, 14.70it/s]
Evaluating (sparse@10): 100%|██████████| 500/500 [00:14<00:00, 34.61it/s]
Evaluating (rrf@10): 100%|██████████| 500/500 [00:35<00:00, 14.14it/s]

DENSE : {'k': 10, 'mode': 'dense', 'hit_rate': 0.646, 'mrr': 0.45880634920634905, 'precision@k': 0.1018000000000004, 'recall@k': 0.646, 'map@k': 0.4292107709750567, 'ndcg@k': 0.49136209908104633, 'f1@k': 0.16872104039751132, 'queries': 500, 'queries_without_gold': 0}
SPARSE: {'k': 10, 'mode': 'sparse', 'hit_rate': 0.482, 'mrr': 0.31720873015873025, 'precision@k': 0.06400000000000015, 'recall@k': 0.482, 'map@k': 0.30303934240362806, 'ndcg@k': 0.3502665001756677, 'f1@k': 0.10975428493075545, 'queries': 500, 'queries_without_gold': 0}
RRF   : {'k': 10, 'mode': 'rrf', 'hit_rate': 0.64, 'mrr': 0.4539626984126985, 'precision@k': 0.09680000000000043, 'recall@k': 0.64, 'map@k': 0.42828373015872995, 'ndcg@k': 0.488298725483918, 'f1@k': 0.1620530025530028, 'queries': 500, 'queries_without_gold': 0}





In [30]:
results = pd.DataFrame({"dense_scores": dense_scores,
                        "sparse_scores": sparse_scores,
                        "rrf_scores": rrf_scores})

In [31]:
results

Unnamed: 0,dense_scores,sparse_scores,rrf_scores
k,10,10,10
mode,dense,sparse,rrf
hit_rate,0.646,0.482,0.64
mrr,0.458806,0.317209,0.453963
precision@k,0.1018,0.064,0.0968
recall@k,0.646,0.482,0.64
map@k,0.429211,0.303039,0.428284
ndcg@k,0.491362,0.350267,0.488299
f1@k,0.168721,0.109754,0.162053
queries,500,500,500


RAG FLOW

Model

In [32]:
import os, torch, json, re
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DTYPE = torch.float32
MAX_NEW_TOKENS = 200
REPETITION_PENALTY = 1.10  # slight nudge to reduce loops

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
try:
    torch.set_num_threads(min(8, os.cpu_count() or 4))
    torch.set_num_interop_threads(2)
except Exception:
    pass

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=DTYPE,
    low_cpu_mem_usage=True,
    attn_implementation="eager",
)
model.to("cpu").eval()
PAD_ID = tok.eos_token_id if tok.pad_token_id is None else tok.pad_token_id

In [33]:
def _encode_chat(messages):
    """
    Use the model's chat template if available; otherwise fall back to a simple prompt.
    """
    try:
        return tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    except Exception:
        sys_txt = "\n\n".join(m["content"] for m in messages if m["role"] == "system")
        usr_txt = "\n\n".join(m["content"] for m in messages if m["role"] == "user")
        prompt = ((sys_txt + "\n\n") if sys_txt else "") + usr_txt
        return tok(prompt, return_tensors="pt").input_ids

Prompts

In [34]:

entry_template = "{n}. [{doc_id}] {title}\n{text}"
prompt_template = (
    "You are a helpful FIQA assistant. Answer clearly, cite facts from the context when useful, "
    "and ignore irrelevant or noisy snippets.\n\n"
    "Context:\n{context}\n\n"
    "Question: {question}\n"
    "Answer:"
)

def search(query):
    points = rrf_search(query, limit=10, dense_k=200, sparse_k=400)
    docs = []
    for p in (points or []):
        payload = getattr(p, "payload", {}) or {}
        # normalize payload keys expected by entry_template
        text = (payload.get("text") or payload.get("content") or "").strip()
        title = (payload.get("title") or "").strip()
        if not title:
            # fall back to first sentence or simple default
            first = re.split(r"(?<=[.!?])\s+", text)[0] if text else ""
            title = first or "Untitled"
        doc_id = str(payload.get("doc_id") or payload.get("id") or getattr(p, "id", None) or "")
        docs.append({"n": None, "title": title, "text": text, "doc_id": doc_id})
    return docs


In [35]:
import re, unicodedata

#  tiny sanitizer to avoid weird Unicode that can trigger loops 
def _clean(t: str) -> str:
    if not t: return ""
    t = unicodedata.normalize("NFKC", str(t))
    t = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F]", " ", t)  
    t = re.sub(r"[ \t]+", " ", t).strip()
    return t

# truncate a string to max_tokens with your tokenizer
def _truncate_to_tokens(text: str, max_tokens: int) -> str:
    if max_tokens <= 0: return ""
    ids = tok.encode(text, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return text
    return tok.decode(ids[:max_tokens], skip_special_tokens=True)

# prompt builder with token budget 
def build_prompt(query, search_results):
    # model context and budget
    ctx_max = int(getattr(model.config, "max_position_embeddings", 2048))
    # leave room for generation + some margin
    margin = 64
    budget = max(256, ctx_max - int(MAX_NEW_TOKENS) - margin)


    chunks = []
    for i, raw in enumerate(search_results or [], 1):
        p = raw if isinstance(raw, dict) else (getattr(raw, "payload", {}) or {})
        title = _clean(p.get("title") or "")
        text  = _clean(p.get("text") or p.get("content") or "")
        if not title:
            first = re.split(r"(?<=[.!?])\s+", text)[0] if text else ""
            title = first or "Untitled"
        doc_id = str(p.get("doc_id") or p.get("id") or getattr(raw, "id", "") or i)

        entry = entry_template.format(n=i, title=title, text=text, doc_id=doc_id)
        trial = ("\n\n---\n\n".join(chunks + [entry])).strip()
        prompt = prompt_template.format(question=_clean(query), context=trial).strip()

        # measure tokenized length 
        messages = [
            {"role": "system", "content": "You are a helpful FIQA assistant. Be concise and factual."},
            {"role": "user", "content": prompt},
        ]
        ids = _encode_chat(messages)
        if ids.shape[-1] <= budget:
            chunks.append(entry)
            continue

        # try truncating this last entry’s text to fit; if still over, stop
        # estimate how many tokens we can spend on this entry by binary truncation
        lo, hi = 0, len(tok.encode(entry, add_special_tokens=False))
        best = ""
        while lo <= hi:
            mid = (lo + hi) // 2
            trial_entry = _truncate_to_tokens(entry, mid)
            trial = ("\n\n---\n\n".join(chunks + [trial_entry])).strip()
            prompt = prompt_template.format(question=_clean(query), context=trial).strip()
            ids = _encode_chat([
                {"role": "system", "content": "You are a helpful FIQA assistant. Be concise and factual."},
                {"role": "user", "content": prompt},
            ])
            if ids.shape[-1] <= budget:
                best = trial_entry
                lo = mid + 1
            else:
                hi = mid - 1
        if best:
            chunks.append(best)
        break  

    context = ("\n\n---\n\n".join(chunks)).strip()
    return prompt_template.format(question=_clean(query), context=context).strip()

# LLM call
def llm(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful FIQA assistant. Be concise and factual."},
        {"role": "user", "content": prompt},
    ]
    input_ids = _encode_chat(messages)
    prompt_len = input_ids.shape[-1]
    with torch.inference_mode():
        out = model.generate(
            input_ids.to("cpu"),
            max_new_tokens=int(MAX_NEW_TOKENS),
            do_sample=False,
            no_repeat_ngram_size=3,
            repetition_penalty=float(REPETITION_PENALTY),
            pad_token_id=PAD_ID,
            eos_token_id=tok.eos_token_id,
            use_cache=True,
        )
    return tok.decode(out[0, prompt_len:], skip_special_tokens=True).strip()

# search adapter
def search(query):
    points = rrf_search(query, limit=10, dense_k=200, sparse_k=400)
    docs = []
    for p in (points or []):
        payload = getattr(p, "payload", {}) or {}
        if "text" not in payload and "content" in payload:
            payload["text"] = payload["content"]
        if not payload.get("title"):
            txt = (payload.get("text") or "").strip()
            first = re.split(r"(?<=[.!?])\s+", txt)[0] if txt else ""
            payload["title"] = first or "Untitled"
        docs.append(payload)
    return docs

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)


In [36]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [37]:
question = "Stocks and bonds have yields, but what is a yield?"
answer = rag(question)
print(answer)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In financial terms, a yield is the percentage of interest earned on a fixed-income investment, such as a bond or stock, divided by its face value. When calculating a yield, the interest earned is calculated over a specific period, such a year or a quarter, and the face value is the original investment amount. The higher the yield, or the lower the interest paid, the more attractive the investments are to potential investors. Bonds have a fixed interest rate, which means that the investors receive a predetermined amount of interest each year, regardless of market conditions. On the other hand, stock prices fluctuate based on various factors, including economic growth, corporate earnings, and market sentiment. While stocks may have higher yields than bonds, they are subject to market volatility and risk, and their returns may vary widely depending on the performance of individual companies and industries. Investors should carefully evaluate the risks and


RAG Evaluation

In [None]:
#  Evaluator system prompt 

EVAL_SYSTEM_PROMPT = ( 'You are an expert evaluator. Output ONLY a single JSON object with keys "Relevance" and "Explanation"')


ALLOWED = {"IRRELEVANT", "RELEVANT"}

prompt2_template = (
    "Evaluate the quality of the generated answer to the user question using ONLY the criteria below.\n\n"
    "Binary quality rubric (decide one):\n"
    "• RELEVANT: Directly answers the question, factually correct, specific, and clearly grounded in the provided context. "
    "Borderline/partial answers count as RELEVANT only if they materially address the question AND cite/support from the context.\n"
    "• IRRELEVANT: Off-topic, incorrect, generic, or contradicts/ignores the provided context; lacks clear support from the context; "
    "or is a refusal/unactionable meta-response.\n\n"
    "Question: {question}\n"
    "Generated Answer (verbatim):\n<<<ANSWER>>>\n{answer}\n<<<END>>>\n\n"
    "Return ONLY a parsable JSON object (no code fences, no extra text) with EXACTLY these keys:\n"
    "{{\n"
    '  "Relevance": "IRRELEVANT" | "RELEVANT",\n'
    '  "Explanation": "[Brief reason citing what made it RELEVANT or IRRELEVANT, referencing the context]"\n'
    "}}"
)



In [39]:
prompt = prompt2_template.format(question=question, answer=answer)

In [40]:
print(prompt)

Evaluate the quality of the generated answer to the user question using ONLY the criteria below.

Binary quality rubric (decide one):
• RELEVANT: Directly answers the question, factually correct, specific, and clearly grounded in the provided context. Borderline/partial answers count as RELEVANT only if they materially address the question AND cite/support from the context.
• IRRELEVANT: Off-topic, incorrect, generic, or contradicts/ignores the provided context; lacks clear support from the context; or is a refusal/unactionable meta-response.

Question: Stocks and bonds have yields, but what is a yield?
Generated Answer (verbatim):
<<<ANSWER>>>
In financial terms, a yield is the percentage of interest earned on a fixed-income investment, such as a bond or stock, divided by its face value. When calculating a yield, the interest earned is calculated over a specific period, such a year or a quarter, and the face value is the original investment amount. The higher the yield, or the lower t

In [41]:
qrels_ids = {str(k) for k in qrels_test.keys()}
queries_test = [q for q in queries if str(q.get("id")) in qrels_ids]

In [42]:
len(queries_test)

648

In [53]:
df_test_query = pd.DataFrame(queries_test)
test_queries = df_test_query.to_dict(orient='records')
sample_test_df = df_test_query[:100]
sample_test = sample_test_df.to_dict(orient="records")
sample_test[:100]

[{'id': '4641', 'query': 'Where should I park my rainy-day / emergency fund?'},
 {'id': '5503',
  'query': 'Tax considerations for selling a property below appraised value to family?'},
 {'id': '7803',
  'query': 'Can the Delta be used to calculate the option premium given a certain target?'},
 {'id': '7017', 'query': 'Basic Algorithmic Trading Strategy'},
 {'id': '10152',
  'query': 'What does a high operating margin but a small but positive ROE imply about a company?'},
 {'id': '3451',
  'query': 'Should you keep your stocks if you are too late to sell?'},
 {'id': '4804',
  'query': 'How do financial services aimed at women differ from conventional services?'},
 {'id': '7911',
  'query': "What is the difference between a 'trader' and a 'stockbroker'?"},
 {'id': '10809', 'query': 'Definitions of leverage and of leverage factor'},
 {'id': '6715',
  'query': 'What does it mean if “IPOs - normally are sold with an `underwriting discount` (a built in commission)”'},
 {'id': '2388',
  'que

In [54]:
import json, re

ALLOWED = {"IRRELEVANT", "RELEVANT"}

def extract_evaluation(text: str) -> dict:
    for t in (
        text,
        (re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", text, re.I) or [None, ""])[1],
        text[text.find("{"): text.rfind("}")+1] if "{" in text and "}" in text else "",
    ):
        try:
            obj = json.loads(t)
            break
        except Exception:
            obj = None
    # normalize
    if isinstance(obj, dict):
        k = {str(k).lower().strip(): v for k, v in obj.items()}
        rel = k.get("relevance") or k.get("label") or k.get("result") or ""
        expl = k.get("explanation") or k.get("reason") or k.get("feedback")
    else:
        rel = ""
        expl = None
    # label cleanup → binary
    if isinstance(rel, str):
        rel = rel.upper().replace("-", "_").replace(" ", "_")
        if rel in {"NON_RELEVANT", "NOT_RELEVANT"}: rel = "IRRELEVANT"
        if rel in {"GOOD"}: rel = "RELEVANT"
        if rel.startswith("PART") or rel in {"AVERAGE", "BAD", "NOT_GOOD", "OFF_TOPIC"}: rel = "IRRELEVANT"
    if rel not in ALLOWED:
        # fallback from free text
        m = re.search(r"\b(RELEVANT|IRRELEVANT)\b", text, re.I)
        rel = m.group(1).upper() if m else "IRRELEVANT"
    # explanation fallback
    if not isinstance(expl, str) or not expl.strip():
        m = re.search(r"Explanation\s*:\s*(.+)", text, re.I | re.S)
        expl = (m.group(1) if m else re.sub(r"```[\s\S]*?```", "", text)).strip()[:300] or "No explanation provided."
    return {"Relevance": rel, "Explanation": expl}


In [55]:
evaluations = []
for record in tqdm(sample_test):
    question = record["query"]
    answer = rag(question)

    prompt = prompt2_template.format(question=question, answer=answer)
    evaluation_text = llm(prompt)

    try:
        evaluation = extract_evaluation(evaluation_text)
    except Exception as e:
        evaluation = {
            "Relevance": "IRRELEVANT",
            "Explanation": f"Parsing failed: {e}. Raw: {evaluation_text[:500]}"
        }

    evaluations.append((record, answer, evaluation))

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [7:39:22<00:00, 275.63s/it] 


In [56]:
import pandas as pd

df_eval = pd.DataFrame(evaluations, columns=["record", "answer", "evaluation"])
df_eval["id"] = df_eval["record"].apply(lambda d: d["id"])
df_eval["question"] = df_eval["record"].apply(lambda d: d["query"])
df_eval["relevance"] = df_eval["evaluation"].apply(lambda d: d["Relevance"])
df_eval["explanation"] = df_eval["evaluation"].apply(lambda d: d["Explanation"])
del df_eval['record']
del df_eval['evaluation']


In [57]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT      0.97
IRRELEVANT    0.03
Name: proportion, dtype: float64

In [58]:
df_eval

Unnamed: 0,answer,id,question,relevance,explanation
0,The FIQAs assistant recommends considering kee...,4641,Where should I park my rainy-day / emergency f...,RELEVANT,The explanation
1,"Yes, you can appeal your county' s appraisemen...",5503,Tax considerations for selling a property belo...,RELEVANT,To evaluate the quality score of the binary re...
2,"Yes, the Deltas of options can be used as inpu...",7803,Can the Delta be used to calculate the option ...,RELEVANT,The binary quality rubrics I have listed above...
3,To summarize the key points from the given tex...,7017,Basic Algorithmic Trading Strategy,RELEVANT,The binary quality rubrics used in evaluating ...
4,A high operating profit margin but low return ...,10152,What does a high operating margin but a small ...,RELEVANT,The binary quality rubrics used in this evalua...
...,...,...,...,...,...
95,To summarize the key points from the given tex...,34,401k Transfer After Business Closure,RELEVANT,The binary quality rubrics used by the author ...
96,To deal with someone who compulsionally spends...,4011,How can I deal with a spouse who compulsively ...,RELEVANT,The binary quality rubrics used in evaluating ...
97,"To answer the question, the context suggests t...",4700,Better to get loan from finance company or ban...,RELEVANT,The binary quality rubrics used by the author ...
98,To summarize the key points mentioned in the g...,6395,Option settlement for calendar spreads,RELEVANT,The binary quality rubrics used in this evalua...


In [59]:
df_eval["question"][4]

'What does a high operating margin but a small but positive ROE imply about a company?'

In [60]:
df_eval["answer"][4]

'A high operating profit margin but low return on equity indicates that the business is making a profit but not generating enough revenue to cover its costs. This can lead to a situation where the company is unable to pay dividends or repay debt, potentially leading to financial difficulties. On the other hand, a high return on equipment (ROEQ) but low operating profit (OPM) suggests that the cost of capital is too high for the company to generate enough rewards from its assets. This could indicate that the firm is struggling to maintain its profitability despite investments in new equipment or processes. A high ROEQ but low OPM implies that the asset-based nature of the business means that it is not generating sufficient revenue from its fixed assets to cover the costs associated with owning and managing those assets. Overall, a combination of factors such as high operating costs, low profitability, and high asset values can contribute to a negative ROE and a high OPM'

In [61]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT      0.97
IRRELEVANT    0.03
Name: proportion, dtype: float64

In [63]:
df_eval.to_csv("data/df_eval.csv")