In [1]:
from evaluation.eval_utils import *
from evaluation.eval_utils import *
import json
from retrieval import *


Loading passages from D:/NLP_Project/NLP_project/data/chunked_text_all_together_cleaned.json


In [5]:
import os
import json
import numpy as np
import faiss
import matplotlib.pyplot as plt

# For text-based retrieval:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

# Import your evaluation framework.
from evaluation.eval_utils import MetricCollection, RecallAtK, PrecisionAtK, MRR, nDCG

#########################################
# Retrieval Method Functions
#########################################

def tfidf_retrieval(query, config):
    """
    TF-IDF retrieval: rank corpus chunks against a single query string.

    Parameters:
      - query (str): The query text; it is vectorized with config["vectorizer"].
      - config (dict): Retrieval configuration with:
            * "vectorizer": a fitted vectorizer exposing ``transform`` (required).
            * "doc_matrix": sparse document-term matrix, one row per chunk (required).
            * "chunk_ids": chunk identifiers, positionally aligned with the rows
              of "doc_matrix" (required).
            * "top_k": number of results to return (default: 5).
        A "passages" entry may be present but is not read by this function.

    Returns:
      A list of at most top_k dicts ``{"chunk_id": ..., "score": float}``,
      ordered from highest to lowest score.

    Raises:
      KeyError: if a required config key is missing.
    """
    vectorizer = config["vectorizer"]
    doc_matrix = config["doc_matrix"]
    chunk_ids = config["chunk_ids"]
    top_k = config.get("top_k", 5)

    query_vec = vectorizer.transform([query])
    # Row-wise dot product of every document with the query.
    # NOTE(review): this equals cosine similarity only when both sides are
    # L2-normalised (presumably true for the vectorizer used here) - confirm.
    scores = (doc_matrix @ query_vec.T).toarray().flatten()
    # argsort is ascending, so reverse it before taking the best top_k rows.
    best_rows = np.argsort(scores)[::-1][:top_k]

    return [{"chunk_id": chunk_ids[row], "score": float(scores[row])}
            for row in best_rows]

def bm25_retrieval(query, config):
    """
    BM25 retrieval: rank corpus chunks against a single query string.

    Parameters:
      - query (str): The query text. It is lower-cased and tokenized with
        ``word_tokenize`` before scoring.
      - config (dict): Retrieval configuration with:
            * "bm25": an object exposing ``get_scores(tokens)`` that returns one
              score per chunk (required).
            * "chunk_ids": chunk identifiers, positionally aligned with the
              scores returned by "bm25" (required).
            * "top_k": number of results to return (default: 5).
        A "passages" entry may be present but is not read by this function.

    Returns:
      A list of at most top_k dicts ``{"chunk_id": ..., "score": float}``,
      ordered from highest to lowest score.

    Raises:
      KeyError: if a required config key is missing.
    """
    bm25 = config["bm25"]
    chunk_ids = config["chunk_ids"]
    top_k = config.get("top_k", 5)

    query_tokens = word_tokenize(query.lower())
    scores = bm25.get_scores(query_tokens)
    # argsort is ascending, so reverse it before taking the best top_k positions.
    best_positions = np.argsort(scores)[::-1][:top_k]

    return [{"chunk_id": chunk_ids[pos], "score": float(scores[pos])}
            for pos in best_positions]



def dense_retrieval_subqueries(queries, config):
    """
    Dense retrieval for one or several sub-queries with precomputed embeddings.

    Each sub-query's embedding is looked up through ``query_embed_search`` and
    then searched against the corpus index.

    Parameters:
      - queries (str or list of str): A single sub-query or a list of them.
      - config (dict): Retrieval configuration with:
            * "all_subqueries": list of sub-query strings, aligned with
              "subquery_index".
            * "subquery_index": index holding the sub-query embeddings.
            * "faiss_index": corpus index exposing ``search(embeddings, k)``.
            * "chunk_ids" / "passages": lists aligned with the rows of
              "faiss_index".
            * "top_k": neighbours to fetch per sub-query (default: 5).

    Returns:
      A list of dicts with "sub_query", "chunk_id", "passage" and "score".
      Results are concatenated sub-query by sub-query (not merged or re-sorted
      across sub-queries), so the same chunk can appear more than once.
      "score" is the raw value returned by the index search.
      NOTE(review): whether a larger score means a better match depends on the
      metric the index was built with - confirm, since `hybrid` sorts descending.

    Raises:
      ValueError: if a sub-query is not present in config["all_subqueries"].
    """
    if isinstance(queries, str):
        queries = [queries]

    top_k = config.get("top_k", 5)
    chunk_ids = config["chunk_ids"]
    passages = config["passages"]

    results = []
    for query in queries:
        query_emb = query_embed_search(query, config["all_subqueries"], config["subquery_index"])
        query_emb = query_emb.reshape(1, -1)  # Reshape to (1, d)
        distances, indices = config["faiss_index"].search(query_emb, top_k)
        for score, row in zip(distances[0], indices[0]):
            # FAISS pads the result with label -1 when fewer than top_k
            # neighbours exist; indexing with -1 would silently return the
            # last chunk, so those slots are dropped.
            if row < 0:
                continue
            results.append({
                "sub_query": query,
                "chunk_id": chunk_ids[row],
                "passage": passages[row],
                "score": float(score),
            })
    return results

def query_embed_search(query, all_queries_list, index):
    """
    Given a query, finds its embedding from the precomputed subqueries index.

    Parameters:
      - query: The sub-query to look up.
      - all_queries_list (list): All sub-queries; the position of `query` in
        this list is used as the row number passed to the index.
      - index: Object exposing ``reconstruct(position)`` that returns the stored
        embedding for that row.

    Returns:
      Whatever ``index.reconstruct(position)`` returns for the query's position.

    Raises:
      ValueError: if `query` is not in `all_queries_list`.
    """
    # Only the list lookup is guarded: an error raised by the index itself must
    # surface unchanged instead of being reported as a missing query.
    try:
        position = all_queries_list.index(query)
    except ValueError:
        raise ValueError(f"Query '{query}' not found in the list of all queries.") from None
    return index.reconstruct(position)
    
def hybrid(query, config):
    """
    Hybrid retrieval: one method proposes candidates, the other one ranks them.

    Both retrievers are always run. config["order"] decides their roles:
      - "sparse_dense" (default): sparse results are the candidate set and the
        dense results are the ones that get filtered, sorted and returned.
      - "dense_sparse": dense results are the candidate set and the sparse
        results are the ones that get filtered, sorted and returned.

    config["mode"] decides how the two result lists are combined:
      - "intersection" (default): keep the re-ranker's results whose chunk_id
        is in the candidate set; if nothing overlaps, fall back to all of the
        re-ranker's results. Duplicate chunk_ids are kept as returned.
      - "union": keep one re-ranker result per chunk_id (the last one wins)
        and add every candidate the re-ranker did not return with a
        placeholder score of 0.0.

    Parameters:
      - query (str, list or tuple): For a list/tuple, query[0] is always sent to
        the sparse retriever and query[1] to the dense retriever, regardless of
        "order". Any other value is sent unchanged to both retrievers.
      - config (dict): Must contain:
          * "sparse_func": sparse retrieval function (e.g., tfidf_retrieval or bm25_retrieval)
          * "sparse_config": configuration for the sparse method; it is copied
            and its "top_k" is replaced by "intermediate_k".
          * "dense_func": dense retrieval function (e.g., dense_retrieval_subqueries)
          * "dense_config": configuration for the dense method (used as is).
        and may contain:
          * "order": "sparse_dense" (default) or "dense_sparse"
          * "intermediate_k": sparse candidate set size (default: 30)
          * "final_k": final result count (default: 5)
          * "mode": "intersection" (default) or "union"

    Returns:
      At most final_k result dicts (each with at least "chunk_id" and "score"),
      sorted by the re-ranker's score, highest first.

    Raises:
      ValueError: for an unknown "order" or "mode", or a list/tuple query with
        fewer than two elements.
    """
    order = config.get("order", "sparse_dense")
    intermediate_k = config.get("intermediate_k", 30)
    final_k = config.get("final_k", 5)
    mode = config.get("mode", "intersection")

    # Validate before doing any retrieval work.
    if order not in ("sparse_dense", "dense_sparse"):
        raise ValueError("Invalid order specified.")
    if mode not in ("intersection", "union"):
        raise ValueError("Invalid mode. Choose 'intersection' or 'union'.")

    # Decide which query part each retriever receives.
    if isinstance(query, (list, tuple)):
        if len(query) < 2:
            raise ValueError("Query should be a list of two queries: [sparse_query, dense_query].")
        query_sparse, query_dense = query[0], query[1]
    else:
        query_sparse = query_dense = query

    # Copy so the caller's sparse config keeps its own top_k.
    sparse_config = config["sparse_config"].copy()
    sparse_config["top_k"] = intermediate_k
    sparse_results = config["sparse_func"](query_sparse, sparse_config)
    dense_results = config["dense_func"](query_dense, config["dense_config"])

    if order == "sparse_dense":
        candidates, reranked = sparse_results, dense_results
    else:
        candidates, reranked = dense_results, sparse_results

    if mode == "intersection":
        candidate_ids = {r["chunk_id"] for r in candidates}
        pool = [r for r in reranked if r["chunk_id"] in candidate_ids]
        if not pool:
            # If no overlap, fall back to all of the re-ranker's results.
            pool = reranked
    else:
        reranked_by_id = {r["chunk_id"]: r for r in reranked}
        pool = list(reranked_by_id.values())
        # dict.fromkeys de-duplicates while keeping candidate order, so the
        # output does not depend on set iteration order.
        unscored_ids = dict.fromkeys(
            r["chunk_id"] for r in candidates if r["chunk_id"] not in reranked_by_id
        )
        pool.extend({"chunk_id": cid, "score": 0.0} for cid in unscored_ids)

    # sorted() is stable, so equal scores keep the re-ranker's original order.
    ranked = sorted(pool, key=lambda r: r["score"], reverse=True)
    return ranked[:final_k]

#########################################
# Dataset Configuration
#########################################

# Global corpus (for all retrieval methods that use it).
def load_corpus(corpus_file):
    """
    Load corpus passages and their chunk ids from a JSON file.

    The file must contain a JSON list of objects. Only entries that carry both
    a "passage" and a "chunk_id" are kept, so the two returned lists always have
    the same length and stay positionally aligned (passages[i] belongs to
    chunk_ids[i]).

    Parameters:
      - corpus_file (str): Path to the UTF-8 encoded JSON corpus file.

    Returns:
      A tuple (passages, chunk_ids) of two equally long lists.
    """
    with open(corpus_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Filtering the two keys independently would shift one list against the
    # other as soon as a single entry misses either key.
    complete_entries = [entry for entry in data
                        if "passage" in entry and "chunk_id" in entry]
    skipped = len(data) - len(complete_entries)
    if skipped:
        print(f"Skipped {skipped} corpus entries missing 'passage' or 'chunk_id'")

    passages = [entry["passage"] for entry in complete_entries]
    chunk_ids = [entry["chunk_id"] for entry in complete_entries]
    print(f"Loaded {len(passages)} corpus passages")
    return passages, chunk_ids

# Paths for the corpus and QA sets.
# All of them are relative, so they resolve against the kernel's current
# working directory.
CORPUS_FILE = "./data/chunked_text_all_together_cleaned.json"
QA_PATH = "./data/QA_set"
QA_EMBEDDED_PATH = "./data/QA_set_embedded"

# Load the corpus and build common indexes.
# `passages` and `chunk_ids` are shared by every retrieval config built later.
passages, chunk_ids = load_corpus(CORPUS_FILE)

# Build TF-IDF index for the corpus.
# English stop words are dropped; `doc_matrix` has one row per passage.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
doc_matrix = tfidf_vectorizer.fit_transform(passages)

# Build BM25 index for the corpus.
# Passages are lower-cased and tokenized the same way bm25_retrieval treats
# its query.
tokenized_passages = [word_tokenize(p.lower()) for p in passages]
bm25 = BM25Okapi(tokenized_passages)

# Load global dense corpus index (FAISS).
# NOTE(review): dense_retrieval_subqueries maps the row numbers returned by this
# index onto `chunk_ids` / `passages` by position, so the index presumably holds
# one vector per passage in the same order as the corpus file - confirm.
corpus_dense_index = faiss.read_index("./hp_all_bge.index")


Loaded 9251 corpus passages


In [7]:

# A helper function to load subqueries from a ground truth file.
# It is defined before `datasets` because the list below calls it while it is
# being built; defining it afterwards only works with leftover kernel state.
def retrieve_all_subqueries(file_path):
    """
    Collect every sub-question of a ground truth QA file into one flat list.

    Parameters:
      - file_path (str): Path to a UTF-8 JSON file holding a list of QA items,
        each with a "sub_questions" list.

    Returns:
      A list with the sub-questions of all items, in file order.
      NOTE(review): query_embed_search uses positions in this list as row
      numbers of the matching sub-query index, so the order presumably has to
      match the order used when the embeddings were built - confirm.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        qa_data = json.load(f)
    subqueries = []
    for item in qa_data:
        subqueries.extend(item["sub_questions"])
    return subqueries


# Names of the QA sets to evaluate. Each name maps to
#   <QA_PATH>/<name>_labeled.json                (ground truth)
#   <QA_EMBEDDED_PATH>/bge_<name>_labeled.index  (sub-query embeddings)
DATASET_NAMES = [
    "easy_single",
    "medium_single",
    "medium_multi",
    "hard_single",
    "hard_multi",
]


def _build_dataset(name):
    """Build the configuration entry of one QA set from its short name."""
    gt_path = os.path.join(QA_PATH, f"{name}_labeled.json")
    return {
        "name": name,
        "gt_path": gt_path,
        "dense": {
            "subqueries": retrieve_all_subqueries(gt_path),
            "subquery_index": faiss.read_index(
                os.path.join(QA_EMBEDDED_PATH, f"bge_{name}_labeled.index")
            ),
        },
    }


# Define dataset configurations.
# For each dataset, specify:
#  - "name": a short name,
#  - "gt_path": path to the ground truth QA file,
#  - For dense retrieval: "subqueries" (list) and "subquery_index" (loaded FAISS index).
datasets = [_build_dataset(name) for name in DATASET_NAMES]

#########################################
# Retrieval Methods Configuration
#########################################

# Registry of retrieval strategies: method name -> callable taking (query, config).
# Insertion order matters because the evaluation loop iterates over this dict.
retrieval_methods = {
    "tfidf": tfidf_retrieval,
    "bm25": bm25_retrieval,
    "dense": dense_retrieval_subqueries,
}
# Every hybrid variant goes through the same entry point; what distinguishes
# them is the per-dataset config registered under the same name.
retrieval_methods.update(
    dict.fromkeys(("tfidf_dense", "bm25_dense", "dense_tfidf", "dense_bm25"), hybrid)
)

def _hybrid_config(sparse_func, sparse_config, dense_config, order):
    """
    Build the config dict of one hybrid variant.

    `sparse_config` and `dense_config` are stored by reference (not copied).
    `order` is "sparse_dense" or "dense_sparse". All variants share the same
    candidate size, result count, combination mode and query type.
    """
    return {
        "sparse_func": sparse_func,
        "sparse_config": sparse_config,
        "dense_func": dense_retrieval_subqueries,
        "dense_config": dense_config,
        "intermediate_k": 1000,   # Size of the sparse candidate set.
        "final_k": 20,            # Number of results returned after re-ranking.
        "mode": "intersection",   # or "union"
        "query_type": "combine",  # hybrid receives [question, sub_questions]
        "order": order,
    }


# For each dataset, define a configuration for each retrieval method.
# The configs live in a dictionary keyed by retrieval method name.
for ds in datasets:
    method_configs = ds.setdefault("retrieval_config", {})

    # Sparse baselines work on the full question.
    tfidf_config = {
        "passages": passages,
        "chunk_ids": chunk_ids,
        "vectorizer": tfidf_vectorizer,
        "doc_matrix": doc_matrix,
        "top_k": 20,
        "query_type": "question"  # or "sub_questions" based on your data structure
    }
    bm25_config = {
        "passages": passages,
        "chunk_ids": chunk_ids,
        "bm25": bm25,
        "top_k": 20,
        "query_type": "question"  # or "sub_questions" based on your data structure
    }
    # Dense retrieval works on the precomputed sub-question embeddings.
    dense_config = {
        "passages": passages,
        "chunk_ids": chunk_ids,
        "all_subqueries": ds["dense"]["subqueries"],
        "subquery_index": ds["dense"]["subquery_index"],
        "faiss_index": corpus_dense_index,
        "top_k": 20,
        "query_type": "sub_questions"  # or "question" based on your data structure
    }
    method_configs["tfidf"] = tfidf_config
    method_configs["bm25"] = bm25_config
    method_configs["dense"] = dense_config

    # Hybrid variants: "<candidates>_<re-ranker>".
    method_configs["tfidf_dense"] = _hybrid_config(
        tfidf_retrieval, tfidf_config, dense_config, "sparse_dense")
    method_configs["bm25_dense"] = _hybrid_config(
        bm25_retrieval, bm25_config, dense_config, "sparse_dense")
    method_configs["dense_tfidf"] = _hybrid_config(
        tfidf_retrieval, tfidf_config, dense_config, "dense_sparse")
    method_configs["dense_bm25"] = _hybrid_config(
        bm25_retrieval, bm25_config, dense_config, "dense_sparse")



#########################################
# Test Process: Evaluate Each Retrieval Method on Each Dataset
#########################################

def _select_query(example, query_type):
    """
    Pick the query form a retrieval method expects from one QA example.

    "sub_questions" -> the example's sub-question list, "question" -> the
    question string, anything else -> [question, sub_questions] (used by the
    hybrid methods).
    """
    if query_type == "sub_questions":
        return example["sub_questions"]
    if query_type == "question":
        return example["question"]
    return [example["question"], example["sub_questions"]]


def _evaluate_method(method_func, config, ground_truth_data):
    """
    Run one retrieval method over all examples and compute retrieval metrics.

    Returns whatever MetricCollection.compute(metric_type="retrieval") yields.
    """
    query_type = config.get("query_type")
    predictions = []
    references = []
    for example in ground_truth_data:
        ret_results = method_func(_select_query(example, query_type), config)
        # Collect predicted chunk IDs (convert to string).
        predictions.append([str(r["chunk_id"]) for r in ret_results])
        # Ground truth: each reference in "list of reference" has a "ref_id".
        references.append([str(ref["ref_id"]) for ref in example["list of reference"]])

    eval_collection = MetricCollection({
        "recall@5": RecallAtK(k=5),
        "recall@10": RecallAtK(k=10),
        "precision@5": PrecisionAtK(k=5),
        "precision@10": PrecisionAtK(k=10),
        "mrr": MRR(),
        "ndcg@5": nDCG(k=5),
        "ndcg@10": nDCG(k=10)
    })
    eval_collection.update(predictions, references, metric_type="retrieval")
    return eval_collection.compute(metric_type="retrieval")


# Start from previously saved results when there are any; on a first run the
# file does not exist yet, so begin with an empty list instead of failing.
save_path = os.path.join(QA_PATH, "retrieval_results.json")
if os.path.exists(save_path):
    with open(save_path, "r", encoding="utf-8") as f:
        results = json.load(f)
else:
    results = []

for ds in datasets:
    # Load ground truth data.
    with open(ds["gt_path"], "r", encoding="utf-8") as f:
        ground_truth_data = json.load(f)
    print(f"Dataset '{ds['name']}': loaded {len(ground_truth_data)} ground truth examples.")

    for method_name, method_func in retrieval_methods.items():
        # Skip methods that have no configuration for this dataset.
        if method_name not in ds["retrieval_config"]:
            continue
        overall_metrics = _evaluate_method(
            method_func, ds["retrieval_config"][method_name], ground_truth_data)

        # Replace an older entry for the same (dataset, method) pair so that
        # re-running this cell does not pile up duplicate rows.
        results = [
            entry for entry in results
            if not (entry.get("data_set") == ds["name"]
                    and entry.get("retrieval") == method_name)
        ]
        results.append({
            "data_set": ds["name"],
            "retrieval": method_name,
            "performance": overall_metrics
        })
        print(f"Evaluated {method_name} on {ds['name']}: {overall_metrics}")

#########################################
# Now 'results' contains evaluation metrics for each dataset and each retrieval method.
#########################################

# Optionally, you can now plot the results. For example, a grouped bar chart comparing methods on each dataset.
# (You can adapt your previous plotting code using the results list.)
print("Final Results:")
print(results)


Dataset 'easy_single': loaded 253 ground truth examples.
Evaluated tfidf on easy_single: {'recall@5': 0.08389261744966443, 'recall@10': 0.12416107382550336, 'precision@5': 0.004940711462450593, 'precision@10': 0.007312252964426878, 'mrr': 0.0647274655644802, 'ndcg@5': 0.05913122106512458, 'ndcg@10': 0.07361285003547607}
Evaluated bm25 on easy_single: {'recall@5': 0.08389261744966443, 'recall@10': 0.15436241610738255, 'precision@5': 0.004940711462450593, 'precision@10': 0.00909090909090909, 'mrr': 0.0636334769291915, 'ndcg@5': 0.05909196015880151, 'ndcg@10': 0.08244786017469655}
Evaluated dense on easy_single: {'recall@5': 0.15100671140939598, 'recall@10': 0.21140939597315436, 'precision@5': 0.008893280632411068, 'precision@10': 0.012450592885375493, 'mrr': 0.11776788145074586, 'ndcg@5': 0.11317977067850814, 'ndcg@10': 0.13352461988268502}
Evaluated tfidf_dense on easy_single: {'recall@5': 0.020134228187919462, 'recall@10': 0.1040268456375839, 'precision@5': 0.0019348597226701064, 'prec

In [8]:
# Persist the evaluation results as pretty-printed JSON next to the QA sets.
save_path = os.path.join(QA_PATH, "retrieval_results.json")
# The target folder is created on demand; an existing folder is left alone.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w") as f:
    # Serialize to text, then write it out in one go.
    f.write(json.dumps(results, indent=2))
# Tell the reader where the file ended up.
print(f"Results saved to {save_path}")

Results saved to ./data/QA_set\retrieval_results.json
