In [49]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

if project_root not in sys.path:
    sys.path.insert(0, project_root)

from evaluation.eval_utils import *
from evaluation.eval_utils import *
import json
#from retrieval import *
import numpy as np
import faiss
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

In [27]:
# import the generation model here
from generation.cohere_generation import *

In [4]:
#########################################
# Retrieval Method Functions
#########################################

def tfidf_retrieval(query, config):
    """
    TF-IDF retrieval.
    config must include:
      - "vectorizer": a fitted TfidfVectorizer,
      - "doc_matrix": document-term matrix,
      - "passages": list of passages,
      - "chunk_ids": list of passage identifiers,
      - "top_k": number of top results.
    """
    vectorizer = config["vectorizer"]
    doc_matrix = config["doc_matrix"]
    passages = config["passages"]
    chunk_ids = config["chunk_ids"]
    top_k = config.get("top_k", 5)
    
    query_vec = vectorizer.transform([query])
    cosine_similarities = (doc_matrix @ query_vec.T).toarray().flatten()
    sorted_indices = np.argsort(cosine_similarities)[::-1][:top_k]
    
    results = [{"chunk_id": chunk_ids[i], "score": float(cosine_similarities[i])} 
               for i in sorted_indices]
    return results

def bm25_retrieval(query, config):
    """
    BM25 retrieval.
    config must include:
      - "bm25": a BM25Okapi object,
      - "passages": list of passages,
      - "chunk_ids": list of passage identifiers,
      - "top_k": number of top results.
    """
    bm25 = config["bm25"]
    passages = config["passages"]
    chunk_ids = config["chunk_ids"]
    top_k = config.get("top_k", 5)
    
    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)
    sorted_indices = np.argsort(scores)[::-1][:top_k]
    
    results = [{"chunk_id": chunk_ids[i], "score": float(scores[i])} 
               for i in sorted_indices]
    return results



def dense_retrieval_subqueries(queries, config):

    if isinstance(queries, str):
        queries = [queries]

    results = []
    top_k = config.get("top_k", 5)
    for query in queries:
        query_emb = query_embed_search(query, config["all_subqueries"], config["subquery_index"])
        query_emb = query_emb.reshape(1, -1)  # Reshape to (1, d)
        distances, indices = config["faiss_index"].search(query_emb, top_k)
        results.extend([
            {
                "sub_query": query,
                "chunk_id": config["chunk_ids"][i],
                "passage": config["passages"][i],
                "score": float(distances[0][j])
            } for j, i in enumerate(indices[0])
        ])
    return results

def query_embed_search(query, all_queries_list, index):
    """
    Given a query, finds its embedding from the precomputed subqueries index.
    """
    try:
        position = all_queries_list.index(query)
        return index.reconstruct(position)
    except ValueError:
        raise ValueError(f"Query '{query}' not found in the list of all queries.")
    

def hybrid(query, config):
    """
    Hybrid retrieval: first retrieve candidates with one method, then re-rank with the other.
    
    The order is determined by config["order"]:
      - "sparse_dense" (default): Use query[0] (or query if not a list) for sparse retrieval,
                                   then query[1] (or query) for dense retrieval.
      - "dense_sparse": Use query[0] for dense retrieval,
                        then query[1] for sparse retrieval.
    
    The candidate sets are combined (using the "mode": "intersection" or "union")
    and then sorted by the score of the re-ranking model.
    
    Parameters:
      - query (str or list): If a list of at least two elements, the two parts will be used
                             in the order specified by config["order"]. Otherwise, the same query is used.
      - config (dict): Must contain:
          * "order": either "sparse_dense" (default) or "dense_sparse"
          * "sparse_func": sparse retrieval function (e.g., tfidf_retrieval or bm25_retrieval)
          * "sparse_config": configuration for the sparse method.
          * "dense_func": dense retrieval function (e.g., dense_retrieval_subqueries)
          * "dense_config": configuration for the dense method.
          * "intermediate_k": candidate set size (default: 30)
          * "final_k": final result count (default: 5)
          * "mode": "intersection" (default) or "union"
    
    Returns:
      A list of final results (dictionaries) from the re-ranking step.
    """
    order = config.get("order", "sparse_dense")
    intermediate_k = config.get("intermediate_k", 30)
    final_k = config.get("final_k", 5)
    mode = config.get("mode", "intersection")
    
    # Decide which query part to use for each retrieval step.
    if isinstance(query, list) and len(query) >= 2:
        query_sparse = query[0]
        query_dense = query[1]
    else:
        print("Query should be a list of two queries: [sparse_query, dense_query].")


    if order == "sparse_dense":
        # Sparse retrieval first.
        sparse_config = config["sparse_config"].copy()
        sparse_config["top_k"] = intermediate_k
        sparse_results = config["sparse_func"](query_sparse, sparse_config)
        
        # Dense retrieval.
        dense_results = config["dense_func"](query_dense, config["dense_config"])
        
        # Combine: here we choose to filter dense results by the candidate set from sparse.
        candidate_ids = set(r["chunk_id"] for r in sparse_results)
        filtered_dense = [r for r in dense_results if r["chunk_id"] in candidate_ids]
        if not filtered_dense:
            filtered_dense = dense_results
        # Sort by dense score.
        sorted_results = sorted(filtered_dense, key=lambda x: x["score"], reverse=True)
    
    elif order == "dense_sparse":
        # Dense retrieval first.
        dense_results = config["dense_func"](query_dense, config["dense_config"])
        # Sparse retrieval.
        sparse_config = config["sparse_config"].copy()
        sparse_config["top_k"] = intermediate_k
        sparse_results = config["sparse_func"](query_sparse, sparse_config)
        # Combine: filter sparse results by dense candidate set.
        candidate_ids = set(r["chunk_id"] for r in dense_results)
        filtered_sparse = [r for r in sparse_results if r["chunk_id"] in candidate_ids]
        if not filtered_sparse:
            filtered_sparse = sparse_results
        # Sort by sparse score.
        sorted_results = sorted(filtered_sparse, key=lambda x: x["score"], reverse=True)
    else:
        raise ValueError("Invalid order specified.")
    
    return sorted_results[:final_k]

#########################################
# Dataset Configuration
#########################################

# Global corpus (for all retrieval methods that use it).
def load_corpus(corpus_file):
    with open(corpus_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    passages = [entry["passage"] for entry in data if "passage" in entry]
    chunk_ids = [entry["chunk_id"] for entry in data if "chunk_id" in entry]
    print(f"Loaded {len(passages)} corpus passages")
    return passages, chunk_ids

# A helper function to load subqueries from a ground truth file.
def retrieve_all_subqueries(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        qa_data = json.load(f)
    subqueries = []
    for item in qa_data:
        subqueries.extend(item["sub_questions"])
    return subqueries

# Paths for the corpus and QA sets.
CORPUS_FILE = "../data/chunked_text_all_together_cleaned.json"
QA_PATH = "../data/QA_set"
QA_EMBEDDED_PATH = "../data/QA_set_embedded"
PM_PATH = "../performance"

# CORPUS_FILE = os.path.join(project_root, "data", "chunked_text_all_together_cleaned.json")
# QA_PATH = os.path.join(project_root, "data", "QA_set")
# QA_EMBEDDED_PATH = os.path.join(project_root, "data", "QA_set_embedded")
# Load the corpus and build common indexes.

passages, chunk_ids = load_corpus(CORPUS_FILE)

# Build TF-IDF index for the corpus.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
doc_matrix = tfidf_vectorizer.fit_transform(passages)

# Build BM25 index for the corpus.
tokenized_passages = [word_tokenize(p.lower()) for p in passages]
bm25 = BM25Okapi(tokenized_passages)

# Load global dense corpus index (FAISS).
corpus_dense_index = faiss.read_index("../hp_all_bge.index")

Loaded 9251 corpus passages


In [82]:
datasets = [
    {
        "name": "easy_single",
        "gt_path": os.path.join(QA_PATH, "easy_single_labeled.json"),
        "dense": {
            "subqueries": retrieve_all_subqueries(os.path.join(QA_PATH, "easy_single_labeled.json")),
            "subquery_index": faiss.read_index(os.path.join(QA_EMBEDDED_PATH, "bge_easy_single_labeled.index"))
        }
    },
    {
        "name": "medium_single",
        "gt_path": os.path.join(QA_PATH, "medium_single_labeled.json"),
        "dense": {
            "subqueries": retrieve_all_subqueries(os.path.join(QA_PATH, "medium_single_labeled.json")),
            "subquery_index": faiss.read_index(os.path.join(QA_EMBEDDED_PATH, "bge_medium_single_labeled.index"))
        }
    },
    {
        "name": "medium_multi",
        "gt_path": os.path.join(QA_PATH, "medium_multi_labeled.json"),
        "dense": {
            "subqueries": retrieve_all_subqueries(os.path.join(QA_PATH, "medium_multi_labeled.json")),
            "subquery_index": faiss.read_index(os.path.join(QA_EMBEDDED_PATH, "bge_medium_multi_labeled.index"))
        }
    },
    {
        "name": "hard_single",
        "gt_path": os.path.join(QA_PATH, "hard_single_labeled.json"),
        "dense": {
            "subqueries": retrieve_all_subqueries(os.path.join(QA_PATH, "hard_single_labeled.json")),
            "subquery_index": faiss.read_index(os.path.join(QA_EMBEDDED_PATH, "bge_hard_single_labeled.index"))
        }
    },
    {
        "name": "hard_multi",
        "gt_path": os.path.join(QA_PATH, "hard_multi_labeled.json"),
        "dense": {
            "subqueries": retrieve_all_subqueries(os.path.join(QA_PATH, "hard_multi_labeled.json")),
            "subquery_index": faiss.read_index(os.path.join(QA_EMBEDDED_PATH, "bge_hard_multi_labeled.index"))
        }
    }
]


# Create a dictionary mapping retrieval method names to their functions.
retrieval_methods = {
    "tfidf": tfidf_retrieval,
    "bm25": bm25_retrieval,
    "dense": dense_retrieval_subqueries,
    "tfidf_dense": hybrid,
    "bm25_dense": hybrid,
    "dense_tfidf": hybrid,
    "dense_bm25": hybrid,
    
}

# For each dataset, define a configuration for each retrieval method.
# Here we create a dictionary keyed by retrieval method for each dataset.
for ds in datasets:
    ds.setdefault("retrieval_config", {})
    #TF-IDF configuration.
    ds["retrieval_config"]["tfidf"] = {
        "passages": passages,
        "chunk_ids": chunk_ids,
        "vectorizer": tfidf_vectorizer,
        "doc_matrix": doc_matrix,
        "top_k": 20,
        "query_type": "question"  # or "sub_questions" based on your data structure
    }
    # BM25 configuration.
    ds["retrieval_config"]["bm25"] = {
        "passages": passages,
        "chunk_ids": chunk_ids,
        "bm25": bm25,
        "top_k": 20,
        "query_type": "question"  # or "sub_questions" based on your data structure
    }
    #Dense retrieval configuration.
    ds["retrieval_config"]["dense"] = {
        "passages": passages,
        "chunk_ids": chunk_ids,
        "all_subqueries": ds["dense"]["subqueries"],
        "subquery_index": ds["dense"]["subquery_index"],
        "faiss_index": corpus_dense_index,
        "top_k": 20,
        "query_type": "sub_questions"  # or "question" based on your data structure
    }
        # --- Add Hybrid Configurations ---
    # Hybrid TF-IDF + Dense configuration.
    ds["retrieval_config"]["tfidf_dense"] = {
        "sparse_func": tfidf_retrieval,
        "sparse_config": ds["retrieval_config"]["tfidf"],
        "dense_func": dense_retrieval_subqueries,
        "dense_config": ds["retrieval_config"]["dense"],
        "intermediate_k": 1000,   # You can choose this value independently.
        "final_k": 20,           # And choose the final number of results.
        "mode": "intersection" , # or "union"
        "query_type": "combine",  # or "sub_questions" based on your data structure
        "order": "sparse_dense"  # or "dense_sparse"

    }
    # Hybrid BM25 + Dense configuration.
    ds["retrieval_config"]["bm25_dense"] = {
        "sparse_func": bm25_retrieval,
        "sparse_config": ds["retrieval_config"]["bm25"],
        "dense_func": dense_retrieval_subqueries,
        "dense_config": ds["retrieval_config"]["dense"],
        "intermediate_k": 1000,
        "final_k": 20,
        "mode": "intersection" , # or "union"
        "query_type": "combine",  # or "sub_questions" based on your data structure
        "order": "sparse_dense"  # or "dense_sparse"
    }
    ds["retrieval_config"]["dense_tfidf"] = {
        "sparse_func": tfidf_retrieval,
        "sparse_config": ds["retrieval_config"]["tfidf"],
        "dense_func": dense_retrieval_subqueries,
        "dense_config": ds["retrieval_config"]["dense"],
        "intermediate_k": 1000,   # You can choose this value independently.
        "final_k": 20,           # And choose the final number of results.
        "mode": "intersection" , # or "union"
        "query_type": "combine",  # or "sub_questions" based on your data structure
        "order": "dense_sparse"  # or "dense_sparse"

    }
    # Hybrid BM25 + Dense configuration.
    ds["retrieval_config"]["dense_bm25"] = {
        "sparse_func": bm25_retrieval,
        "sparse_config": ds["retrieval_config"]["bm25"],
        "dense_func": dense_retrieval_subqueries,
        "dense_config": ds["retrieval_config"]["dense"],
        "intermediate_k": 1000,
        "final_k": 20,
        "mode": "intersection" , # or "union"
        "query_type": "combine",  # or "sub_questions" based on your data structure
        "order": "dense_sparse"  # or "dense_sparse"
    }


In [87]:


# Assume the following retrieval and generation functions and classes are defined and imported:
# - tfidf_retrieval, bm25_retrieval, dense_retrieval_subqueries, hybrid, query_embed_search.
# - retrieval_methods dictionary mapping method names to functions.
# - CohereGenerator class.
# - retrieve_all_subqueries, load_corpus, etc.

# For generation, we instantiate our generator.
generator = CohereGenerator(model="command-r")

# # Example: use the "tfidf_dense" hybrid method for generation.
# selected_method = "tfidf_dense"
# # retrieval_methods is assumed to be defined, for example:
# # retrieval_methods = {
# #    "tfidf": tfidf_retrieval,
# #    "bm25": bm25_retrieval,
# #    "dense": dense_retrieval_subqueries,
# #    "tfidf_dense": hybrid,
# #    "bm25_dense": hybrid,
# #    "dense_tfidf": hybrid,
# #    "dense_bm25": hybrid
# # }
# selected_dataset = "easy_single"  # Example dataset name.

# # Load the dataset configuration based on the selected method.

# test_results = []  # Store results for each dataset.
# # For each dataset (datasets list is defined as in your configuration):
# ds = next((d for d in datasets if d["name"] == selected_dataset), None)
# # Load ground truth generation data.
# with open(ds["gt_path"], "r", encoding="utf-8") as f:
#     ground_truth_data = json.load(f)
# print(f"Dataset '{ds['name']}': loaded {len(ground_truth_data)} ground truth examples.")

# # Choose the retrieval configuration for generation.
# # Here we use the hybrid method "tfidf_dense" (which uses combined query).
# retrieval_config = ds["retrieval_config"][selected_method]
# retrieval_func = retrieval_methods[selected_method]

# predictions = []  # Store generated answers.
# references = []   # Store list of reference answers.

select_datasets = ["easy_single", "medium_single"]
select_retrieval_methods = ["tfidf_dense"]
# Filter datasets based on the selected names.
datasets = [ds for ds in datasets if ds["name"] in select_datasets]
# Filter retrieval methods based on the selected names.
retrieval_methods = {k: v for k, v in retrieval_methods.items() if k in select_retrieval_methods}

results = []  # Store results for each dataset.
for ds in datasets:
    # Load ground truth data.
    with open(ds["gt_path"], "r", encoding="utf-8") as f:
        ground_truth_data = json.load(f)
    print(f"Dataset '{ds['name']}': loaded {len(ground_truth_data)} ground truth examples.")
    
    # For each retrieval method defined in this dataset's config:
    for method_name, method_func in retrieval_methods.items():
        # Check if a configuration exists for this method.
        if method_name in ds["retrieval_config"]:
            config = ds["retrieval_config"][method_name]
            predictions = []
            references = []
            for example in ground_truth_data:
                query_type = config.get("query_type")
                origin_question = example["question"]
                subquestions = example["sub_questions"]
                query = [origin_question, subquestions]
                # if query_type == "sub_questions":
                #     question = query[1]
                # elif query_type == "question":
                #     question = query[0]
                # else:
                #     question = query 
                retrieval_results = method_func(query, config)
                # Call the generation function.
                # generation_answer returns previous Q&A, and a final answer.
                previous_qa, final_answer = generator.generation_answer(
                    origin_question, subquestions, retrieval_results, top_k=5, max_tokens=60
                )
                
                predictions.append(final_answer)
                # Assume the ground truth contains a list of reference answers under "list of reference".
                references.append(example["list of reference"])
            
            passages = [[r["passage"] for r in ref] for ref in references]

            gen_metrics = MetricCollection({
        "bleu": BLEU(),
        "rouge": ROUGE(),
        "bertscore": BERTScore(model_name_or_path="bert-base-uncased", batch_size=16, device="cuda")
    })

            gen_metrics.update(predictions, passages, metric_type="generation")
            gen_results = gen_metrics.compute(metric_type="generation")

            results.append({
                "dataset": ds["name"],
                "retrieval_method": method_name,
                "generation_model": "cohere",
                "metrics": gen_results
            })
            print(f"\nGeneration evaluation metrics for dataset {ds['name']} using {method_name}:")
            print(json.dumps(gen_results, indent=2))

            # 
            


# for example in ground_truth_data:
#     origin_question = example["question"]
#     subquestions = example["sub_questions"]
#     # For hybrid retrieval, we combine the main question and the sub_questions.
#     # Our hybrid function expects a list: [query_for_sparse, query_for_dense].
#     query = [origin_question, subquestions]
#     # Get retrieval results using the chosen retrieval method.
#     retrieval_results = retrieval_func(query, retrieval_config)
    
#     # Call the generation function.
#     # generation_answer returns previous Q&A, and a final answer.
#     previous_qa, final_answer = generator.generation_answer(
#         origin_question, subquestions, retrieval_results, top_k=5, max_tokens=60
#     )
    
#     predictions.append(final_answer)
#     # Assume the ground truth contains a list of reference answers under "list of reference".
#     references.append(example["list of reference"])
#     passages = [[r["passage"] for r in ref] for ref in references]
#     # save the references for later evaluation.
#     # test_results.append({
#     #     "question": origin_question,
#     #     "sub_questions": subquestions,
#     #     "retrieval_results": retrieval_results,
#     #     "previous_qa": previous_qa,
#     #     "final_answer": final_answer,
#     #     "references": example["list of reference"],
#     #     "datasets": ds["name"]
#     # })
#     # Save the predictions to a file.

#     # Evaluate generation performance using generation metrics.
#     gen_metrics = MetricCollection({
#         "bleu": BLEU(),
#         "rouge": ROUGE(),
#         "bertscore": BERTScore(model_name_or_path="bert-base-uncased", batch_size=16, device="cuda")
#     })

#     gen_metrics.update(predictions, passages, metric_type="generation")
#     gen_results = gen_metrics.compute(metric_type="generation")

    
#     # print(f"\nGeneration evaluation metrics for dataset {ds['name']}:")
#     # print(json.dumps(gen_results, indent=2))

#     # Save the test results to a file.


Dataset 'easy_single': loaded 253 ground truth examples.


KeyboardInterrupt: 

In [None]:
# LOAD THE TEST RESULTS FOR EVALUATION
test_results_path = os.path.join(PM_PATH, f"test_results_{selected_method}.json")
rouge = ROUGE()

# Load the test results for evaluation.
with open(test_results_path, "r", encoding="utf-8") as f:
    test_results = json.load(f)

predictions = [result["final_answer"] for result in test_results]
# Process references to ensure they are lists of strings
references = [result["references"] for result in test_results]
passages = [[r["passage"] for r in ref] for ref in references]

passages = []
passage = []
for ref in references:
    for r in ref:
        passage.append(r["passage"])
    passages.append(passage)        


print(len(passages))
print(len(predictions))

# # Evaluate the test results using the evaluation functions.
# #for prediction, reference in zip(predictions, references):
# print(predictions)
# # print type of predictions and references
# print(type(predictions))
# print(type(references))
# for prediction, reference in zip(predictions, references):
#     # Ensure that `reference` is a list of strings as required by ROUGE
#     processed_reference = [ref["passage"] if isinstance(ref, dict) else ref for ref in reference]
#     rouge.update(prediction, processed_reference)
#     total_score = rouge.compute()
#     print(f"ROUGE Score: {total_score}")
# #Update ROUGE with processed references

rouge.update(predictions, passages)
total_score = rouge.compute()
print(f"ROUGE Score: {total_score}")


253
253
ROUGE Score: 0.03572619228406683


In [48]:
# save the test results to a file.
test_results_path = os.path.join(PM_PATH, f"test_results_{selected_method}.json")
with open(test_results_path, "w", encoding="utf-8") as f:
    json.dump(test_results, f, indent=2)
print(f"Test results saved to {test_results_path}")


Test results saved to ../performance\test_results_tfidf_dense.json


In [37]:
from pathlib import Path
project_root = Path.cwd().parent  # 当前 notebook 假设在 performance/ 下
dotenv_path = project_root / "key.env"
print(f"dotenv_path: {dotenv_path}")
load_dotenv(dotenv_path)

# 使用环境变量
api_key = os.getenv("COHERE_API_KEY")
print(api_key)

dotenv_path: d:\NLP_Project\NLP_project\key.env
IX1TJYpIXliOPw5R8q8YaUvKcGvgTXZZRd32ThsV


In [70]:
def process_references(refs):
    """
    Process a list of reference lists so that each reference is a string.
    If a reference is a dictionary and contains an "answer" key, extract it.
    Otherwise, assume it is already a string.
    """
    processed = []
    for ref_list in refs:
        processed_ref_list = []
        for ref in ref_list:
            if isinstance(ref, dict):
                # Adjust the key as needed (e.g., "answer", "text", etc.)
                processed_ref_list.append(ref.get("answer", ""))
            else:
                processed_ref_list.append(ref)
        processed.append(processed_ref_list)
    return processed