# Retrieval System Evaluation

Implemented based on src/evaluation.py

In [None]:
# 1. Environment Setup
# Install third-party packages quietly. The leading "!" is an IPython shell
# escape, so this line only works when run inside a notebook.
!pip install -q scikit-learn matplotlib
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import os
# Switch the working directory to the project root so that relative paths
# (e.g. "data/processed/combined.json") and the `src` package resolve.
PROJECT_PATH = "/content/drive/MyDrive/CS6120_project"
os.chdir(PROJECT_PATH)

# Timing utility
# `timer()` returns a wall-clock reading in seconds; differences between two
# readings are used as elapsed time.
from timeit import default_timer as timer

In [None]:
# 2. Load test data
import json
from src.evaluation import Evaluator

# Load the "test" split. The encoding is stated explicitly so the JSON file is
# decoded as UTF-8 regardless of the platform's default locale.
with open("data/processed/combined.json", encoding="utf-8") as f:
    test_data = json.load(f)["test"]

# Built in the same order, so index i in both lists refers to the same test item.
queries = [item["query"] for item in test_data]
ground_truth = [item["relevant_docs"] for item in test_data]

# Initialize evaluator
evaluator = Evaluator(queries, ground_truth)

In [None]:
# 3. Evaluate retrieval methods
methods = ["BM25", "Vector", "Hybrid"]
results = {}

# Map each method name to the evaluator call that produces its scores.
# NOTE: these are simulated retrieval results; they should eventually call the
# actual retrieval modules.
retrievers = {
    "BM25": evaluator.simulate_bm25,
    "Vector": evaluator.simulate_vector,
    "Hybrid": evaluator.simulate_hybrid,
}

num_queries = len(queries)

for method in methods:
    # Time only the retrieval call so that metric computation does not
    # inflate the reported latency.
    start = timer()
    scores = retrievers[method]()
    elapsed = timer() - start

    # Calculate metrics
    metrics = evaluator.evaluate(scores)

    # Store the per-query average so the value matches the
    # "Average query time" label printed below.
    # NOTE(review): assumes each simulate_* call processes every query the
    # evaluator was constructed with -- confirm against src/evaluation.py.
    metrics["time"] = elapsed / num_queries if num_queries else 0.0
    results[method] = metrics

    print(f"{method} Evaluation Results:")
    print(f"- MRR: {metrics['mrr']:.4f}")
    print(f"- Recall@10: {metrics['recall@10']:.4f}")
    print(f"- Precision@5: {metrics['precision@5']:.4f}")
    print(f"- Average query time: {metrics['time']:.4f}s")
    print("="*50)

In [None]:
# 4. Results visualization
import matplotlib.pyplot as plt
import numpy as np

# One bar group per retrieval method, with three side-by-side bars per group.
x = np.arange(len(methods))
width = 0.25

# Two panels: metric scores on the left, timing on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: each entry is (horizontal offset, key in `results`, legend label).
metric_series = (
    (-width, "mrr", 'MRR'),
    (0, "recall@10", 'Recall@10'),
    (width, "precision@5", 'Precision@5'),
)
for offset, key, label in metric_series:
    heights = [results[m][key] for m in methods]
    ax1.bar(x + offset, heights, width, label=label)

ax1.set_xticks(x)
ax1.set_xticklabels(methods)
ax1.set_ylabel('Score')
ax1.set_title('Retrieval Performance Comparison')
ax1.legend()

# Right panel: the stored "time" value for each method.
timings = [results[m]["time"] for m in methods]
ax2.bar(methods, timings, color='orange')
ax2.set_ylabel('Time (seconds)')
ax2.set_title('Query Latency Comparison')

plt.tight_layout()
plt.show()

In [None]:
# 5. Ablation study (weight parameter analysis)
# Sweep the fixed weight over 11 evenly spaced values from 0 to 1 inclusive.
weight_range = np.linspace(0, 1, 11)

# For every weight, run the simulated fixed-weight hybrid retrieval and keep
# only the resulting MRR.
mrr_scores = [
    evaluator.evaluate(evaluator.simulate_hybrid(fixed_weight=w))["mrr"]
    for w in weight_range
]

# Plot weight-MRR curve
ablation_fig, ablation_ax = plt.subplots(figsize=(10, 6))
ablation_ax.plot(weight_range, mrr_scores, marker='o')
# Dashed reference line at the midpoint of the weight range.
ablation_ax.axvline(x=0.5, color='r', linestyle='--', label='Equal Weight')
ablation_ax.set_xlabel('BM25 Weight (1-Vector Weight)')
ablation_ax.set_ylabel('MRR')
ablation_ax.set_title('Ablation Study: Weight Parameter Analysis')
ablation_ax.legend()
ablation_ax.grid()
plt.show()