In [1]:
# Bootstrap: load the embeddings saved by notebook #2 and redefine its helper function so that notebook #3 can run from scratch

from pathlib import Path
import numpy as np
import json
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Locate the project root: when the kernel was started inside the
# "Notebook" folder, the root is its parent; otherwise use the cwd itself.
cwd = Path.cwd()
if cwd.name == "Notebook":
    PROJECT_ROOT = cwd.parent
else:
    PROJECT_ROOT = cwd
INDEX_DIR = PROJECT_ROOT.joinpath("index")

# Read the saved embedding matrix and the chunk metadata from the index folder
embeddings = np.load(INDEX_DIR.joinpath("embeddings.npy"))
with INDEX_DIR.joinpath("meta.json").open("r", encoding="utf-8") as f:
    chunk_records = json.load(f)

# Sentence-embedding model used to encode queries
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# re-use semantic_search helper function from Notebook #2
def _hit_record(rank, idx, score, score_type):
    """Build one result dict for the chunk at position `idx` of `chunk_records`."""
    rec = chunk_records[idx]
    return {
        "rank": rank,
        "doc_id": rec["doc_id"],
        "chunk_id": rec["chunk_id"],
        "title": rec["title"],
        "score": float(score),
        "score_type": score_type,
        "text": rec["text"],
        "global_idx": int(idx),
    }


def semantic_search(query, k=5, pool_size=50, use_rerank=True):
    """
    Search the loaded chunks for `query` and return the top `k` hits.

    The query is encoded with the module-level `model`, compared against
    `embeddings` by cosine similarity, and the best max(k, pool_size)
    candidates are kept. With `use_rerank=True` those candidates are rescored
    by a cross-encoder and the `k` highest-scoring ones are returned;
    otherwise the `k` best by cosine similarity are returned.

    Parameters:
        query: query text to search for.
        k: number of hits to return.
        pool_size: number of cosine candidates handed to the reranker
            (at least `k` candidates are always considered).
        use_rerank: rescore the candidate pool with the cross-encoder.

    Returns:
        A list of dicts with keys "rank" (1-based), "doc_id", "chunk_id",
        "title", "score", "score_type" ("cosine" or "ce"), "text" and
        "global_idx" (index into `chunk_records`).

    NOTE(review): assumes row i of `embeddings` corresponds to
    `chunk_records[i]` -- confirm against the notebook that built the index.
    """
    q_vec = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    sims = cosine_similarity(q_vec, embeddings)[0]
    topN = np.argsort(-sims)[:max(k, pool_size)]

    if not use_rerank:
        return [
            _hit_record(rank, i, sims[i], "cosine")
            for rank, i in enumerate(topN[:k], start=1)
        ]

    # Load the cross-encoder once and keep it on the function object, so that
    # repeated searches do not re-instantiate the model on every call.
    ce = getattr(semantic_search, "_cross_encoder", None)
    if ce is None:
        ce = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        semantic_search._cross_encoder = ce

    # rerank the candidate pool with the cross-encoder, highest score first
    pairs = [[query, chunk_records[i]["text"]] for i in topN]
    ce_scores = ce.predict(pairs)
    ranked = sorted(zip(topN, ce_scores), key=lambda x: -x[1])[:k]

    return [
        _hit_record(rank, i, ce_score, "ce")
        for rank, (i, ce_score) in enumerate(ranked, start=1)
    ]

In [2]:
# Fixed example query used by the search cells below

query = "semiconductor patent claims"
print(f"🔎 Query: {query}")

🔎 Query: semiconductor patent claims


In [4]:
# Run the reranked semantic search, then pretty-print every hit
hits = semantic_search(query, k=9, pool_size=50, use_rerank=True)

for h in hits:
    header = f"{h['rank']}. [{h['score_type']}={h['score']:.3f}] doc={h['doc_id']} chunk={h['chunk_id']}"
    title_preview = h['title'][:120]
    # flatten newlines so the snippet stays on one output line
    snippet = h['text'][:250].replace('\n', ' ')
    print(header)
    print("   Title :", title_preview)
    print("   Snip  :", snippet, "...\n")

1. [ce=3.142] doc=2024004106_2025-08-27_32f45f1a-66d2-4e87-82dc-67fa1196f3a5 chunk=0
   Title : 2024004106 2025 08 27 32f45f1a 66d2 4e87 82dc 67fa1196f3a5
   Snip  : U NITED STATES PATENT AND TRADEMARK O FFICE UNITED STATES DEPARTMENT OF COMMERCE United States Patent and Trademark Office Address: COMMISSIONER FOR PATENTS P.O. Box 1450 Alexandria, Virginia 22313-1450 www.uspto.gov APPLICATION NO. FILING DATE FIRST ...

2. [ce=1.771] doc=2024004096_2025-08-27_ca8b7daf-07f0-4741-98b4-f504184e3399 chunk=0
   Title : 2024004096 2025 08 27 ca8b7daf 07f0 4741 98b4 f504184e3399
   Snip  : U NITED STATES PATENT AND TRADEMARK O FFICE UNITED STATES DEPARTMENT OF COMMERCE United States Patent and Trademark Office Address: COMMISSIONER FOR PATENTS P.O. Box 1450 Alexandria, Virginia 22313-1450 www.uspto.gov APPLICATION NO. FILING DATE FIRST ...

3. [ce=-0.665] doc=2024002880_2025-08-26_6583dc03-0b34-46bd-a958-93345eb5e15c chunk=0
   Title : 2024002880 2025 08 26 6583dc03 0b34 46bd a958 93345eb5e15c


In [7]:
# Helper function to show a hit with surrounding context

def debug_show_reranked_hit(reranked_list, which: int = 0, window: int = 1):
    """
    Inspect a reranked result at position `which` and print its context.

    Prints a header (document id, title, chunk id, score) for the selected
    result, followed by the first 1000 characters of that chunk and of up to
    `window` neighbouring chunks of the same document on each side (ordered
    by chunk id). The selected chunk is marked with an arrow.

    Parameters:
        reranked_list: list of (global_idx, score) pairs, where global_idx
            indexes into the module-level `chunk_records`.
        which: position within `reranked_list` of the result to show.
        window: number of neighbouring chunks to print before and after.

    Returns:
        None. Problems (empty list, out-of-range index) are reported with a
        printed warning rather than an exception.
    """
    if not reranked_list:
        print("⚠️ Empty results list.")
        return
    if which < 0 or which >= len(reranked_list):
        print(f"⚠️ which={which} out of range 0..{len(reranked_list)-1}")
        return

    global_idx = int(reranked_list[which][0])
    score = float(reranked_list[which][1])

    if global_idx < 0 or global_idx >= len(chunk_records):
        print(f"⚠️ global_idx {global_idx} out of range (0..{len(chunk_records)-1})")
        return

    rec = chunk_records[global_idx]
    doc_id = rec.get("doc_id", "<?>")
    chunk_id = rec.get("chunk_id", -1)
    title = rec.get("title", "")
    text = rec.get("text", "")

    print(f"\n📄 Document: {doc_id}")
    print(f"🧾 Title   : {title}")
    print(f"🎯 Chunk ID: {chunk_id}")
    print(f"⭐ Score    : {score:.3f}")
    print("-" * 80)

    # Collect all chunks of this document. The same defaults as above are used
    # so that a record with a missing key cannot raise KeyError during the scan.
    doc_chunks = [
        (i, r)
        for i, r in enumerate(chunk_records)
        if r.get("doc_id", "<?>") == doc_id
    ]
    doc_chunks.sort(key=lambda x: x[1].get("chunk_id", -1))

    # locate the selected chunk's position within its document
    pos = next((p for p, (gi, _) in enumerate(doc_chunks) if gi == global_idx), None)
    if pos is None:
        print("⚠️ Could not locate this chunk within the doc.")
        return

    start = max(0, pos - window)
    end = min(len(doc_chunks), pos + window + 1)

    for j in range(start, end):
        gi, r = doc_chunks[j]
        marker = "👉" if gi == global_idx else "  "
        print(f"{marker} [chunk {r.get('chunk_id', -1)}]")
        print(r.get('text', '')[:1000])  # print first 1000 chars
        print("-" * 80)

In [8]:
# Show the top hit together with its neighbouring chunks

idx_score_pairs = [(hit['global_idx'], hit['score']) for hit in hits]
debug_show_reranked_hit(idx_score_pairs, which=0, window=1)


📄 Document: 2024004106_2025-08-27_32f45f1a-66d2-4e87-82dc-67fa1196f3a5
🧾 Title   : 2024004106 2025 08 27 32f45f1a 66d2 4e87 82dc 67fa1196f3a5
🎯 Chunk ID: 0
⭐ Score    : 3.142
--------------------------------------------------------------------------------
👉 [chunk 0]
U NITED STATES PATENT AND TRADEMARK O FFICE UNITED STATES DEPARTMENT OF COMMERCE United States Patent and Trademark Office Address: COMMISSIONER FOR PATENTS P.O. Box 1450 Alexandria, Virginia 22313-1450 www.uspto.gov APPLICATION NO. FILING DATE FIRST NAMED INVENTOR ATTORNEY DOCKET NO. CONFIRMATION NO. 16/743,462 01/15/2020 Masashi SAKAI 005700-ME0612 1074 78198 7590 08/28/2025 Studebaker Brackett PLLC 8255 Greensboro Drive Suite 300 Tysons, VA 22102 EXAMINER KIM, JAY C ART UNIT PAPER NUMBER 2815 NOTIFICATION DATE DELIVERY MODE 08/28/2025 ELECTRONIC Please find below and/or attached an Office communication concerning this application or proceeding. The time period for reply, if any, is set in the attached communication. No

In [9]:
# Peek at the structure of the first search result dictionary

if not hits:  # nothing to inspect
    print("⚠️ No results found in 'hits'.")
else:
    h = hits[0]   # first result
    print("🔎 Here's what one search result (a dictionary) looks like:\n")
    for key, value in h.items():
        # truncate long string fields so each entry stays readable
        is_long_text = isinstance(value, str) and len(value) > 200
        display_val = (value[:200] + "...") if is_long_text else value
        print(f"{key:>10} : {display_val}")

🔎 Here's what one search result (a dictionary) looks like:

      rank : 1
    doc_id : 2024004106_2025-08-27_32f45f1a-66d2-4e87-82dc-67fa1196f3a5
  chunk_id : 0
     title : 2024004106 2025 08 27 32f45f1a 66d2 4e87 82dc 67fa1196f3a5
     score : 3.142171859741211
score_type : ce
      text : U NITED STATES PATENT AND TRADEMARK O FFICE UNITED STATES DEPARTMENT OF COMMERCE United States Patent and Trademark Office Address: COMMISSIONER FOR PATENTS P.O. Box 1450 Alexandria, Virginia 22313-14...
global_idx : 123
