## Why do we need the evaluation? 
to test whether our system ranks the right abstract near the top or not.

### What is the measures we will use ?

#### Retrieval accuracy
Do the queries return the correct document?

#### Ranking quality
Where in the result list does the correct document appear?

#### Semantic search vs lexical search
Does your vector-based search outperform keyword search?

#### Hybrid performance
Does combining lexical + semantic improve ranking?

In [5]:
import pandas as pd
import numpy as np

In [6]:
import os
import sys
from pathlib import Path

import pandas as pd

# --- Project root / import setup ---

def find_project_root(start: Path) -> Path:
    """Walk up from `start` until a folder containing `src` is found."""
    current = start.resolve()
    for p in [current, *current.parents]:
        if (p / "src").exists():
            return p
    return current


PROJECT_ROOT = find_project_root(Path.cwd())
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from global_config import global_config
from src.opensearch.open_search_client import OpenSearchClient
from src.services.article_search_service import ArticleSearchService

# --- Configuration ---
CSV_PATH = PROJECT_ROOT / "src" / "evaluation" / "evaluation_queries.csv"
INDEX_NAME = global_config.index_name
TOP_K = 10

# --- Helper: load evaluation CSV ---

def load_evaluation_queries(csv_path: Path) -> pd.DataFrame:
    """Load evaluation CSV and keep only rows that contain a non-empty query.

    Expected columns: ['chunk_id', 'query', ...].
    """
    df = pd.read_csv(csv_path)

    # Normalise column names just in case
    df.columns = [c.strip() for c in df.columns]

    if "query" not in df.columns:
        raise ValueError("CSV must contain a 'query' column")
    if "chunk_id" not in df.columns:
        raise ValueError("CSV must contain a 'chunk_id' column")

    mask = df["query"].astype(str).str.strip().ne("")
    eval_df = df.loc[mask].copy()

    # Ensure chunk_id is string so it matches OpenSearch _id
    eval_df["chunk_id"] = eval_df["chunk_id"].astype(str)
    return eval_df


eval_set = load_evaluation_queries(CSV_PATH)
len(eval_set)

15

In [7]:
from typing import List, Dict, Tuple

# --- OpenSearch query helper ---


def build_semantic_query(user_query: str, k: int) -> Dict:
    """Build the OpenSearch query body to match your index mapping.

    Uses lexical search over title and abstract fields in both languages.
    """
    return {
        "size": k,
        "query": {
            "multi_match": {
                "query": user_query,
                "fields": [
                    "title.en^3",
                    "title.ar^3",
                    "abstract.en^2",
                    "abstract.ar^2",
                    "author",
                ],
            }
        },
    }


def run_search(
    service: ArticleSearchService,
    query_text: str,
    k: int = TOP_K,
) -> List[str]:
    """Run search and return a list of document IDs (as strings)."""
    body = build_semantic_query(query_text, k)
    resp = service.search_articles(body)
    hits = resp.get("hits", {}).get("hits", [])
    return [str(h.get("_id")) for h in hits]


# Create OpenSearch-backed search service instance
client = OpenSearchClient()
search_service = ArticleSearchService(index=INDEX_NAME, client=client)

# Quick smoke test on the first query row (optional)
first_row = eval_set.iloc[0]
first_query = first_row["query"]
first_expected_id = first_row["chunk_id"]
first_results = run_search(search_service, first_query, k=TOP_K)
first_query, first_expected_id, first_results[:10]

('أثر استخدام استراتيجية التدريس التبادلي في تحصيل الرياضيات والاتجاه نحوها لدى طالبات الصف الخامس في نابلس',
 '0',
 ['8abfdcc5-1931-4a98-a21e-41b60d1ccd3f_1',
  '8abfdcc5-1931-4a98-a21e-41b60d1ccd3f_0',
  'bb3bd957-fb08-44de-9671-670725ad8b60_0',
  'bb3bd957-fb08-44de-9671-670725ad8b60_1',
  'b7489514-a95f-4cb7-ba0b-9e489b813ac5_1',
  'b7489514-a95f-4cb7-ba0b-9e489b813ac5_0',
  'dfba577b-1f52-4aaf-a60d-a0ef61d69de7_1',
  'a5d1c01c-f2e1-4ffc-9fc4-6ca75ba2529f_0',
  '225b2bbf-8ee4-43e6-a9e8-29ec732695bb_0',
  'dfba577b-1f52-4aaf-a60d-a0ef61d69de7_0'])

In [8]:
from math import log2

# --- Helpers to derive expected OpenSearch IDs using abstract text ---

def _normalize_text(value) -> str:
    if value is None:
        return ""
    return str(value).strip()


def resolve_expected_doc_id(
    row,
    service: ArticleSearchService,
    cache: Dict[int, str | None],
) -> str | None:
    """Infer the expected OpenSearch _id for this evaluation row.

    We don't have IDs in the CSV, so we:
    1) Use the full abstract (en/ar) from the CSV as a query
       against the indexed abstract fields.
    2) Take the top hit and extract its bitstream_uuid from _id
       (pattern: '{bitstream_uuid}_{chunk_id}').
    3) Combine that bitstream_uuid with the CSV chunk_id to
       form the expected _id for this row.
    Results are cached per-row index to avoid repeated lookups.
    """
    idx = row.name
    if idx in cache:
        return cache[idx]

    abstract_en = _normalize_text(row.get("abstract_en") if "abstract_en" in row else None)
    abstract_ar = _normalize_text(row.get("abstract_ar") if "abstract_ar" in row else None)
    text = abstract_en or abstract_ar
    if not text:
        cache[idx] = None
        return None

    body = {
        "size": 1,
        "query": {
            "multi_match": {
                "query": text,
                "fields": ["abstract.en^2", "abstract.ar^2"],
            }
        },
    }

    resp = service.search_articles(body)
    hits = resp.get("hits", {}).get("hits", [])
    if not hits:
        cache[idx] = None
        return None

    top = hits[0]
    raw_id = str(top.get("_id", ""))

    bit_uuid = None
    # Prefer explicit field if present
    source = top.get("_source") or {}
    if "bitstream_uuid" in source:
        bit_uuid = str(source["bitstream_uuid"])
    # Fallback: infer from _id pattern '{uuid}_{chunk}'
    if not bit_uuid and "_" in raw_id:
        bit_uuid = raw_id.rsplit("_", 1)[0]

    if not bit_uuid:
        cache[idx] = None
        return None

    chunk_idx = str(row["chunk_id"])
    expected_id = f"{bit_uuid}_{chunk_idx}"
    cache[idx] = expected_id
    return expected_id


# --- Ranking metrics ---


def recall_at_k(relevant: List[str], ranked_list: List[str], k: int) -> float:
    """Compute recall@k for a single query."""
    if not relevant:
        return 0.0
    retrieved_at_k = ranked_list[:k]
    hits = sum(1 for r in relevant if r in retrieved_at_k)
    return hits / len(relevant)


def reciprocal_rank(relevant: List[str], ranked_list: List[str]) -> float:
    """Compute reciprocal rank for a single query."""
    for i, doc_id in enumerate(ranked_list, start=1):
        if doc_id in relevant:
            return 1.0 / i
    return 0.0


def ndcg_at_k(relevant: List[str], ranked_list: List[str], k: int) -> float:
    """Compute nDCG@k for a single query.

    Here each relevant document has gain 1, non-relevant 0.
    """
    rel_set = set(relevant)
    dcg = 0.0
    for i, doc_id in enumerate(ranked_list[:k], start=1):
        if doc_id in rel_set:
            dcg += 1.0 / log2(i + 1)

    ideal_gains = [1.0] * min(len(rel_set), k)
    if not ideal_gains:
        return 0.0
    idcg = sum(g / log2(i + 1) for i, g in enumerate(ideal_gains, start=1))
    return dcg / idcg if idcg > 0 else 0.0


def evaluate_search(
    df: pd.DataFrame,
    service: ArticleSearchService,
    k: int = TOP_K,
) -> Tuple[pd.DataFrame, Dict[str, float]]:
    """Run evaluation over all queries in df.

    df is expected to contain columns:
      - 'chunk_id'
      - 'query'
      - 'abstract_ar'
      - 'abstract_en'
    Returns a per-query metrics DataFrame and an aggregate metrics dict.
    """
    per_query_rows = []
    expected_cache: Dict[int, str | None] = {}

    for idx, row in df.iterrows():
        qtext = row["query"]
        expected_id = resolve_expected_doc_id(row, service, expected_cache)
        relevant_ids = [expected_id] if expected_id else []

        ranked_ids = run_search(service, qtext, k=k)

        r_at_k = recall_at_k(relevant_ids, ranked_ids, k)
        rr = reciprocal_rank(relevant_ids, ranked_ids)
        ndcg = ndcg_at_k(relevant_ids, ranked_ids, k)

        hit_rank = None
        if expected_id:
            hit_rank = next(
                (i + 1 for i, did in enumerate(ranked_ids) if did == expected_id),
                None,
            )

        per_query_rows.append(
            {
                "chunk_id": row["chunk_id"],
                "query": qtext,
                "expected_id": expected_id,
                "hit_rank": hit_rank,
                "recall@%d" % k: r_at_k,
                "MRR": rr,
                "nDCG@%d" % k: ndcg,
            }
        )

    per_query_df = pd.DataFrame(per_query_rows)

    # Drop rows where we couldn't resolve an expected_id or where hit_rank is NaN
    per_query_df = per_query_df.dropna(subset=["expected_id", "hit_rank"])

    metrics = {
        "num_queries": len(per_query_df),
        "recall@%d" % k: per_query_df["recall@%d" % k].mean() if not per_query_df.empty else 0.0,
        "MRR": per_query_df["MRR"].mean() if not per_query_df.empty else 0.0,
        "nDCG@%d" % k: per_query_df["nDCG@%d" % k].mean() if not per_query_df.empty else 0.0,
    }

    return per_query_df, metrics


def print_summary(per_query_df: pd.DataFrame, metrics: Dict[str, float], k: int = TOP_K) -> None:
    """Pretty-print per-query and aggregate evaluation results."""
    display_cols = [
        "chunk_id",
        "expected_id",
        "hit_rank",
        f"recall@{k}",
        "MRR",
        f"nDCG@{k}",
    ]

    print("Per-query results:")
    display(per_query_df[display_cols])

    print("\nAggregate metrics:")
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}" if isinstance(value, (int, float)) else f"{name}: {value}")


per_query_results, agg_metrics = evaluate_search(eval_set, search_service, k=TOP_K)
print_summary(per_query_results, agg_metrics, k=TOP_K)

Per-query results:


Unnamed: 0,chunk_id,expected_id,hit_rank,recall@10,MRR,nDCG@10
2,0,3abe1e26-2454-4ef3-87c5-1eeda7724d5b_0,3.0,1.0,0.333333,0.5
6,0,3abe1e26-2454-4ef3-87c5-1eeda7724d5b_0,1.0,1.0,1.0,1.0
7,0,62d2a2b1-533e-404b-900e-755542538751_0,3.0,1.0,0.333333,0.5
8,0,222b26aa-4b0f-49f4-ac76-3dd027db7b28_0,4.0,1.0,0.25,0.430677
9,0,3abe1e26-2454-4ef3-87c5-1eeda7724d5b_0,8.0,1.0,0.125,0.315465
11,0,6d2b92d1-dfa7-4d4a-9b4a-8b4e5eb00bd6_0,1.0,1.0,1.0,1.0
13,0,3abe1e26-2454-4ef3-87c5-1eeda7724d5b_0,10.0,1.0,0.1,0.289065



Aggregate metrics:
num_queries: 7.0000
recall@10: 1.0000
MRR: 0.4488
nDCG@10: 0.5765
