# Retrieval Evaluation for Semantic Search

In this notebook, we evaluate the effectiveness of our semantic search engine using a small, manually curated test set.

### Goals:
1. Define (query, expected clause type) test cases.
2. Use our semantic search engine to retrieve the top-K most relevant clauses.
3. Evaluate the search quality using:
   - **Precision@5**: Proportion of top-5 results that match the expected clause type.
   - **Mean Reciprocal Rank (MRR)**: The average over all queries of 1/rank of the first correct result (0 when none of the top-K results is correct).
4. Optionally, display returned clause types to debug relevance.

This evaluation simulates how well the system performs in real-world legal retrieval scenarios.


In [None]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer

# -----------------------------
# Load Models & Metadata
# -----------------------------

# Define base paths
# NOTE(review): assumes the notebook is run from a direct sub-folder of the
# project root, so that "<cwd>/../results" exists — confirm.
BASE_DIR = Path.cwd().parent
RESULTS_DIR = BASE_DIR / "results"
NN_MODEL_PATH = RESULTS_DIR / "clause_nn_model.pkl"
META_PATH = RESULTS_DIR / "clause_metadata.pkl"

# SECURITY NOTE: pickle.load can execute arbitrary code stored in the file.
# Only load artefacts that were produced by this project's own pipeline.

# Load nearest neighbors model
with open(NN_MODEL_PATH, "rb") as f:
    nn = pickle.load(f)

# Load clause metadata
with open(META_PATH, "rb") as f:
    metadata = pickle.load(f)

# Clean up metadata
clause_texts = metadata["clause_text"]
# Remove embedded double quotes BEFORE trimming: stripping first would leave
# behind any whitespace that sat inside the quotes (e.g. '" audit rights "').
labels = [lbl.replace('"', '').strip() for lbl in metadata["cleaned_type"]]

# Load embedding model (GPU when available, CPU otherwise)
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "sentence-transformers/msmarco-MiniLM-L6-cos-v5"
model = SentenceTransformer(MODEL_NAME, device=device)

# -----------------------------
# Semantic Search Function
# -----------------------------

def semantic_search_df(query: str, top_k: int = 5, verbose: bool = False) -> pd.DataFrame:
    """
    Look up the clauses closest to a free-text query.

    The query is embedded with the notebook-level ``model`` and matched against
    the notebook-level ``nn`` index; each hit is described using ``labels`` and
    ``clause_texts``.

    Args:
        query (str): Natural language legal query.
        top_k (int): Number of neighbours to request from the index.
        verbose (bool): When True, prints the query and the returned clause types.

    Returns:
        pd.DataFrame: One row per hit, in the order returned by the index,
        indexed by the clause position ("index") with the columns
        "score", "type" and "clause".
    """
    query_vec = model.encode([query], convert_to_numpy=True)
    dist_matrix, idx_matrix = nn.kneighbors(query_vec, n_neighbors=top_k)

    # A single query was submitted, so only the first row of each matrix is used.
    # score = 1 - distance (a cosine similarity if the index was fitted with
    # the cosine metric — TODO confirm against the notebook that built it).
    hit_scores = 1 - dist_matrix[0]
    hit_ids = idx_matrix[0]

    records = [
        {
            "score": float(hit_score),
            "type": labels[hit_id],
            "clause": clause_texts[hit_id],
            "index": hit_id,
        }
        for hit_score, hit_id in zip(hit_scores, hit_ids)
    ]
    results = pd.DataFrame(records).set_index("index")

    if verbose:
        print(f"\n Query: {query}")
        print("Top types returned:", results["type"].tolist())

    return results


## Evaluation Metrics

We define two standard information retrieval metrics to assess performance:

- **Precision@K**: What fraction of the top-K retrieved clauses match the expected clause type?
- **Mean Reciprocal Rank (MRR)**: At what rank does the first correct result appear? Each query scores 1/rank (0 if no correct result is returned), so a higher MRR is better.

These are computed for each query in the test set, and then averaged.


In [None]:
# Expanded + rephrased test set.
# Each entry is a (query text, expected clause type) pair; the expected type is
# compared against the labels of the retrieved clauses during evaluation.
# NOTE(review): in the recorded run, expected types such as "Confidentiality"
# and "Indemnification" do not appear among the returned labels at all —
# confirm that every expected type exists in metadata["cleaned_type"].
test_set = [
    ("Dispute resolution clause", "Dispute Resolution"),
    ("Confidentiality obligations", "Confidentiality"),
    ("Indemnification clause", "Indemnification"),
    ("Termination of agreement", "Termination"),
    ("Governing law clause", "Governing Law"),
    ("Limitation of liability", "Limitation of Liability"),
    ("Assignment clause", "Assignment"),
    ("Force majeure clause", "Force Majeure"),
    ("Audit rights clause", "Audit Rights"),
    ("Non-compete agreement clause", "Non-Compete"),
    ("IP ownership clause", "IP Ownership"),
    ("Change of control provision", "Change of Control"),
    ("Exclusivity agreement", "Exclusivity"),
    ("Payment terms clause", "Payment Terms"),
    ("Publicity restriction", "Publicity"),
    ("Warranties clause", "Warranties"),
    ("Third party beneficiary clause", "Third Party Beneficiary"),
    ("Post-termination obligations", "Post-Termination Services")
]


# Normalize function
from difflib import SequenceMatcher

# Fuzzy comparison of two clause-type strings
def is_fuzzy_match(a: str, b: str, threshold: float = 0.7) -> bool:
    """
    Decide whether two strings are similar enough to count as the same type.

    Both strings are normalized (double quotes removed, surrounding whitespace
    trimmed, lower-cased) and compared with ``difflib.SequenceMatcher``.

    Args:
        a (str): First string (e.g. a retrieved clause type).
        b (str): Second string (e.g. the expected clause type).
        threshold (float): Minimum similarity ratio, in [0, 1], to accept.

    Returns:
        bool: True when the similarity ratio is at least ``threshold``.
    """
    def _normalize(text: str) -> str:
        # Quotes are removed BEFORE trimming so that whitespace which sat
        # inside the quotes (e.g. '" ip "') does not survive and lower the ratio.
        return text.replace('"', '').strip().lower()

    return SequenceMatcher(None, _normalize(a), _normalize(b)).ratio() >= threshold

# Precision@K: share of the first k retrieved types that fuzzily match the target
def precision_type_at_k(df, target_type, k=5):
    """
    Count fuzzy type matches among the first ``k`` rows of ``df['type']`` and
    divide by ``k`` (the divisor is ``k`` even when fewer rows are available).
    """
    hits = 0
    for retrieved_type in df['type'].iloc[:k]:
        if is_fuzzy_match(retrieved_type, target_type):
            hits += 1
    return hits / k

# Reciprocal rank of the first retrieved type that fuzzily matches the target
def mrr_type(df, target_type):
    """
    Return 1/rank (ranks start at 1) of the first row of ``df['type']`` that
    fuzzily matches ``target_type``, or 0.0 when no row matches.
    """
    matching_ranks = (
        rank
        for rank, retrieved_type in enumerate(df['type'], start=1)
        if is_fuzzy_match(retrieved_type, target_type)
    )
    first_rank = next(matching_ranks, None)
    return 0.0 if first_rank is None else 1.0 / first_rank


# Run evaluation: score every test query and keep the per-query details
type_eval_results = []
for query, expected_type in test_set:
    df_res = semantic_search_df(query, top_k=5)
    returned_types = df_res['type'].tolist()

    # Debug output: what was asked for vs. what came back
    print(f"\nQuery: {query}")
    print(f"Expected Type: {expected_type}")
    print("Returned Types:", returned_types)

    p5 = precision_type_at_k(df_res, expected_type)
    mrr = mrr_type(df_res, expected_type)

    # Flag queries for which nothing relevant was retrieved
    if p5 == 0.0:
        print("No correct type found in top 5.")

    # Keep the metrics (rounded to 3 decimals) together with the raw hits
    record = {
        'query': query,
        'precision@5': round(p5, 3),
        'MRR': round(mrr, 3),
        'returned_types': returned_types,
        'returned_clauses': df_res['clause'].tolist(),
    }
    type_eval_results.append(record)


# Results table, lowest precision first
df_type_eval = pd.DataFrame(type_eval_results)
df_type_eval_sorted = df_type_eval.sort_values(by="precision@5")
display(df_type_eval_sorted)

# Summary: averages over all queries
mean_p5 = df_type_eval['precision@5'].mean()
mean_mrr = df_type_eval['MRR'].mean()
print(f"\nAverage Precision@5: {mean_p5:.3f}")
print(f"Average MRR: {mean_mrr:.3f}")



Query: Dispute resolution clause
Expected Type: Dispute Resolution
Returned Types: ['covenant not to sue', 'governing law', 'governing law', 'governing law', 'governing law']
No correct type found in top 5.

Query: Confidentiality obligations
Expected Type: Confidentiality
Returned Types: ['audit rights', 'document name', 'cap on liability', 'uncapped liability', 'cap on liability']
No correct type found in top 5.

Query: Indemnification clause
Expected Type: Indemnification
Returned Types: ['third party beneficiary', 'cap on liability', 'cap on liability', 'uncapped liability', 'cap on liability']
No correct type found in top 5.

Query: Termination of agreement
Expected Type: Termination
Returned Types: ['post-termination services', 'post-termination services', 'change of control', 'change of control', 'document name']
No correct type found in top 5.

Query: Governing law clause
Expected Type: Governing Law
Returned Types: ['post-termination services', 'governing law', 'governing law

Unnamed: 0,query,precision@5,MRR,returned_types,returned_clauses
0,Dispute resolution clause,0.0,0.0,"[covenant not to sue, governing law, governing...",[the parties desire to resolve disputes arisin...
1,Confidentiality obligations,0.0,0.0,"[audit rights, document name, cap on liability...",[have shall impose confidentiality obligations...
2,Indemnification clause,0.0,0.0,"[third party beneficiary, cap on liability, ca...",[there are no third party beneficiaries under ...
3,Termination of agreement,0.0,0.0,"[post-termination services, post-termination s...","[upon termination of this agreement,, upon the..."
5,Limitation of liability,0.0,0.0,"[cap on liability, uncapped liability, cap on ...","[limitation of liability of the sponsor., prov..."
7,Force majeure clause,0.0,0.0,"[expiration date, cap on liability, cap on lia...",[this agreement shall come into force on the e...
10,IP ownership clause,0.0,0.0,"[ip ownership assignment, parties, parties, pa...",[sole ownership provided for in the second sen...
9,Non-compete agreement clause,0.0,0.0,"[document name, document name, document name, ...","[non-competition agreement amendment no. 1, sa..."
15,Warranties clause,0.0,0.0,"[warranty duration, covenant not to sue, warra...",[without prejudice to any other rights accruin...
13,Payment terms clause,0.0,0.0,"[effective date, parties, audit rights, minimu...","[this clause 6 and clauses 1, 5, 7 through 9 (..."



Average Precision@5: 0.322
Average MRR: 0.361


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Best cosine similarity between the query and any of its retrieved clauses
def max_cosine_similarity(query: str, df_results: pd.DataFrame) -> float:
    """
    Re-embed the query and the retrieved clause texts with the notebook-level
    ``model`` and return the largest query-to-clause cosine similarity.

    Args:
        query (str): The query text.
        df_results (pd.DataFrame): Search results with a "clause" column.

    Returns:
        float: Maximum cosine similarity over the rows of ``df_results``.
    """
    query_vec = model.encode([query], convert_to_numpy=True)[0]
    retrieved_embs = model.encode(df_results["clause"].tolist(), convert_to_numpy=True)
    # One query row against all clause rows -> keep the single row of scores
    similarity_row = cosine_similarity([query_vec], retrieved_embs)[0]
    return float(np.max(similarity_row))

# Run similarity eval: one fresh top-5 search per test query
df_type_eval["max_cosine_sim"] = [
    max_cosine_similarity(query_text, semantic_search_df(query_text, top_k=5))
    for query_text in df_type_eval["query"]
]

# Show updated table, most similar first
display(df_type_eval.sort_values(by="max_cosine_sim", ascending=False))

# Summary stats
mean_max_sim = df_type_eval["max_cosine_sim"].mean()
print(f"\nAverage Max Cosine Similarity: {mean_max_sim:.3f}")


Unnamed: 0,query,precision@5,MRR,returned_types,returned_clauses,max_cosine_sim
3,Termination of agreement,0.0,0.0,"[post-termination services, post-termination s...","[upon termination of this agreement,, upon the...",0.896646
5,Limitation of liability,0.0,0.0,"[cap on liability, uncapped liability, cap on ...","[limitation of liability of the sponsor., prov...",0.789427
9,Non-compete agreement clause,0.0,0.0,"[document name, document name, document name, ...","[non-competition agreement amendment no. 1, sa...",0.788203
1,Confidentiality obligations,0.0,0.0,"[audit rights, document name, cap on liability...",[have shall impose confidentiality obligations...,0.699714
12,Exclusivity agreement,0.4,1.0,"[exclusivity, expiration date, exclusivity, co...","[exclusivity date means october 1, 2008, the d...",0.687088
15,Warranties clause,0.0,0.0,"[warranty duration, covenant not to sue, warra...",[without prejudice to any other rights accruin...,0.640307
11,Change of control provision,1.0,1.0,"[change of control, change of control, change ...","[(b) an operator change of control;, for purpo...",0.637836
7,Force majeure clause,0.0,0.0,"[expiration date, cap on liability, cap on lia...",[this agreement shall come into force on the e...,0.635691
16,Third party beneficiary clause,1.0,1.0,"[third party beneficiary, third party benefici...",[other system franchisees shall be deemed thir...,0.630416
17,Post-termination obligations,0.8,1.0,"[post-termination services, post-termination s...","[upon termination of this agreement,, terminat...",0.62308



Average Max Cosine Similarity: 0.633


## Cluster-Based Evaluation (Optional)

We evaluate whether the top-5 retrieved clauses for a query belong to the **same semantic cluster**, based on prior unsupervised clustering (K-Means).

This provides an **unsupervised signal** of retrieval consistency — even if clause types don't match exactly, results that fall in the same cluster are at least semantically consistent with one another, which suggests (but does not prove) that the query is well-answered.

### Cluster Evaluation Steps:
1. Load saved `clause_clusters.pkl` (output of 03_clause_clustering).
2. For each query’s top-5 retrieved clauses:
   - Retrieve their **K-Means cluster assignments**
   - Check if they mostly fall in the same cluster
3. Compute a **dominant cluster ratio** (e.g., if 4/5 belong to cluster 7 → score = 0.8)

> This metric rewards **semantic consistency** and can flag ambiguous or scattered retrieval results.


In [None]:
import pickle
from collections import Counter

# Cluster assignments saved by the clustering step: {index: cluster_id}
# (pickle can execute code on load — only open files produced by this project)
cluster_path = RESULTS_DIR / "clause_clusters.pkl"
with cluster_path.open("rb") as cluster_file:
    cluster_map = pickle.load(cluster_file)

# Compute dominant cluster ratio for top-5
def dominant_cluster_ratio(df_results: pd.DataFrame) -> float:
    """
    Fraction of the retrieved clauses that share the most common cluster.

    Cluster ids are looked up in the notebook-level ``cluster_map`` using the
    index values of ``df_results``.

    Clauses without an entry in ``cluster_map`` still count in the denominator
    but are never grouped into a cluster of their own. Pooling them under a
    placeholder id would let "unknown" become the dominant cluster and report
    a perfect score for results that have no cluster information at all.

    Args:
        df_results (pd.DataFrame): Search results indexed by clause index.

    Returns:
        float: Size of the largest known cluster divided by the number of
        retrieved clauses; 0.0 when there are no results or no known clusters.
    """
    retrieved_ids = list(df_results.index)
    if not retrieved_ids:
        return 0.0

    # Unique sentinel: distinguishes "no entry" from any real cluster id.
    missing = object()
    looked_up = (cluster_map.get(idx, missing) for idx in retrieved_ids)
    known_clusters = [cluster_id for cluster_id in looked_up if cluster_id is not missing]
    if not known_clusters:
        return 0.0

    dominant_cluster_size = Counter(known_clusters).most_common(1)[0][1]
    return dominant_cluster_size / len(retrieved_ids)

# Evaluate for all queries: one fresh top-5 search per test query
df_type_eval["cluster_ratio@5"] = [
    dominant_cluster_ratio(semantic_search_df(query_text, top_k=5))
    for query_text in df_type_eval["query"]
]

# Show table, most cohesive result sets first
display(df_type_eval.sort_values(by="cluster_ratio@5", ascending=False))

# Summary
mean_cohesion = df_type_eval["cluster_ratio@5"].mean()
print(f"\nAverage Cluster Cohesion (Top-5): {mean_cohesion:.3f}")


Unnamed: 0,query,precision@5,MRR,returned_types,returned_clauses,max_cosine_sim,cluster_ratio@5
3,Termination of agreement,0.0,0.0,"[post-termination services, post-termination s...","[upon termination of this agreement,, upon the...",0.896646,1.0
2,Indemnification clause,0.0,0.0,"[third party beneficiary, cap on liability, ca...",[there are no third party beneficiaries under ...,0.592794,1.0
8,Audit rights clause,1.0,1.0,"[audit rights, audit rights, audit rights, aud...",[it is understood that the foregoing audit rig...,0.613163,1.0
11,Change of control provision,1.0,1.0,"[change of control, change of control, change ...","[(b) an operator change of control;, for purpo...",0.637836,1.0
17,Post-termination obligations,0.8,1.0,"[post-termination services, post-termination s...","[upon termination of this agreement,, terminat...",0.62308,0.8
0,Dispute resolution clause,0.0,0.0,"[covenant not to sue, governing law, governing...",[the parties desire to resolve disputes arisin...,0.55119,0.8
5,Limitation of liability,0.0,0.0,"[cap on liability, uncapped liability, cap on ...","[limitation of liability of the sponsor., prov...",0.789427,0.8
6,Assignment clause,0.8,1.0,"[anti-assignment, anti-assignment, anti-assign...","[assignment of member status, under this claus...",0.553461,0.8
9,Non-compete agreement clause,0.0,0.0,"[document name, document name, document name, ...","[non-competition agreement amendment no. 1, sa...",0.788203,0.8
4,Governing law clause,0.8,0.5,"[post-termination services, governing law, gov...","[by applicable laws and regulations., the laws...",0.536156,0.8



Average Cluster Cohesion (Top-5): 0.733


## Ablation Study: Effect of Dimensionality Reduction

To understand the effect of embedding dimensionality on retrieval performance, we compare:

- **Original embeddings** (384D from `msmarco-MiniLM-L6-cos-v5`)
- **PCA-reduced embeddings** (50D using `pca_model.pkl`)

We'll evaluate both with Precision@5 and MRR using the same test queries.

### Purpose:
- Does PCA reduce retrieval quality significantly?
- Are compressed embeddings sufficient for search applications?


In [9]:
# Locations of the PCA artefacts
PCA_EMB_PATH = RESULTS_DIR / "clause_embeddings_pca50.npy"
PCA_MODEL_PATH = RESULTS_DIR / "pca_model.pkl"

# Load the pickled PCA model, then the reduced clause embeddings
# (pickle can execute code on load — only open files produced by this project)
with PCA_MODEL_PATH.open("rb") as pca_file:
    pca_model = pickle.load(pca_file)
embeddings_pca = np.load(PCA_EMB_PATH)

# Cosine kNN index over the reduced embeddings (fit() returns the estimator)
from sklearn.neighbors import NearestNeighbors
nn_pca = NearestNeighbors(n_neighbors=5, metric="cosine").fit(embeddings_pca)

# Same kind of index over the original embeddings, so both settings are
# searched with identically configured models
embeddings_orig = np.load(RESULTS_DIR / "clause_embeddings.npy")
nn_orig = NearestNeighbors(n_neighbors=5, metric="cosine").fit(embeddings_orig)

# Evaluation function with optional PCA transformation
def evaluate_search(nn_model, embeddings_matrix, model, test_set, apply_pca=False, pca_model=None):
    """
    Score a nearest-neighbour index on the test queries.

    For every (query, expected_type) pair the query is embedded, optionally
    projected with ``pca_model``, and its 5 nearest neighbours are looked up.
    The neighbours' types (taken from the notebook-level ``labels``) are then
    scored with ``precision_type_at_k`` and ``mrr_type``.

    Args:
        nn_model: Fitted index exposing ``kneighbors(X, n_neighbors=...)``.
        embeddings_matrix: Not used by the computation; kept so existing
            calls keep working.
        model: Embedding model exposing ``encode(list_of_str, convert_to_numpy=True)``.
        test_set: Iterable of (query, expected_type) pairs.
        apply_pca (bool): Project each query embedding with ``pca_model`` first.
        pca_model: Object exposing ``transform``; required when ``apply_pca`` is True.

    Returns:
        pd.DataFrame: One row per query with the columns "query",
        "precision@5" and "MRR" (present even when ``test_set`` is empty).

    Raises:
        ValueError: If ``apply_pca`` is True but no ``pca_model`` is given.
            Skipping the projection silently would report scores for the
            wrong embedding space.
    """
    # Compare against None explicitly: relying on the object's truthiness
    # would skip the projection for any model that happens to evaluate falsy.
    if apply_pca and pca_model is None:
        raise ValueError("apply_pca=True requires a fitted pca_model")

    results = []
    for query, expected_type in test_set:
        q_emb = model.encode([query], convert_to_numpy=True)
        if apply_pca:
            q_emb = pca_model.transform(q_emb)

        # Distances are not needed here; only the neighbour positions are scored.
        _, indices = nn_model.kneighbors(q_emb, n_neighbors=5)

        # Build the one-column frame once and reuse it for both metrics.
        retrieved_df = pd.DataFrame({"type": [labels[i] for i in indices[0]]})
        results.append({
            "query": query,
            "precision@5": precision_type_at_k(retrieved_df, expected_type),
            "MRR": mrr_type(retrieved_df, expected_type)
        })
    return pd.DataFrame(results, columns=["query", "precision@5", "MRR"])

# Run both evaluations
print("Evaluating with ORIGINAL embeddings...\n")
df_orig = evaluate_search(nn_orig, embeddings_orig, model, test_set)

print("\nEvaluating with PCA-reduced embeddings...\n")
df_pca = evaluate_search(
    nn_pca,
    embeddings_pca,
    model,
    test_set,
    apply_pca=True,
    pca_model=pca_model,
)

# Side-by-side table: relabel the original columns by position, then attach
# the PCA metrics (aligned on the shared row index)
df_ablation = df_orig.set_axis(["query", "prec@5_orig", "MRR_orig"], axis=1).assign(
    **{"prec@5_pca": df_pca["precision@5"], "MRR_pca": df_pca["MRR"]}
)
display(df_ablation)

# Summary: per-setting averages over all queries
mean_p5_orig = df_ablation['prec@5_orig'].mean()
mean_mrr_orig = df_ablation['MRR_orig'].mean()
mean_p5_pca = df_ablation['prec@5_pca'].mean()
mean_mrr_pca = df_ablation['MRR_pca'].mean()

print("\n--- Summary ---")
print("Original Embeddings:")
print(f"Avg Precision@5: {mean_p5_orig:.3f}")
print(f"Avg MRR: {mean_mrr_orig:.3f}")

print("\nPCA-Reduced Embeddings:")
print(f"Avg Precision@5: {mean_p5_pca:.3f}")
print(f"Avg MRR: {mean_mrr_pca:.3f}")


Evaluating with ORIGINAL embeddings...


Evaluating with PCA-reduced embeddings...



Unnamed: 0,query,prec@5_orig,MRR_orig,prec@5_pca,MRR_pca
0,Dispute resolution clause,0.0,0.0,0.0,0.0
1,Confidentiality obligations,0.0,0.0,0.0,0.0
2,Indemnification clause,0.0,0.0,0.0,0.0
3,Termination of agreement,0.0,0.0,0.0,0.0
4,Governing law clause,0.8,0.5,1.0,1.0
5,Limitation of liability,0.0,0.0,0.0,0.0
6,Assignment clause,0.8,1.0,0.6,1.0
7,Force majeure clause,0.0,0.0,0.0,0.0
8,Audit rights clause,1.0,1.0,1.0,1.0
9,Non-compete agreement clause,0.0,0.0,0.0,0.0



--- Summary ---
Original Embeddings:
Avg Precision@5: 0.322
Avg MRR: 0.361

PCA-Reduced Embeddings:
Avg Precision@5: 0.278
Avg MRR: 0.389
