In [1]:
# ======= CONFIG =======
# Just change this to switch franchises
DOMAIN = "money-heist"   # e.g. "money-heist", "naruto", "dragonball"

# Name of FAISS index file.
# Option 1: domain-specific index (recommended)
INDEX_FILENAME = f"faiss_flat_{DOMAIN}.index"
# If you instead use a global index like "faiss_flat.index", set:
# INDEX_FILENAME = "faiss_flat.index"

# ======================

from pathlib import Path
import json
import textwrap
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option("display.max_colwidth", 200)

import faiss  # make sure faiss is installed: pip install faiss-gpu or faiss-cpu

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    SentenceTransformer = None
    print("⚠️ sentence-transformers not installed. Install it if you want to encode free-form queries.")

In [2]:
REPO_ROOT = Path("..").resolve()

EMB_DIR = REPO_ROOT / "data" / "embeddings"
PROCESSED_DIR = REPO_ROOT / "data" / "processed"
INDEX_DIR = REPO_ROOT / "data" / "indexes"

EMB_PATH = EMB_DIR / f"spans_{DOMAIN}.npy"
ID_PATH = EMB_DIR / f"spans_{DOMAIN}.index_ids.npy"
MODEL_INFO_PATH = EMB_DIR / f"model_info_{DOMAIN}.json"
SPANS_PATH = PROCESSED_DIR / f"spans_{DOMAIN}.csv"
INDEX_PATH = INDEX_DIR / INDEX_FILENAME

print("Using DOMAIN:", DOMAIN)
print("Embeddings:", EMB_PATH, "→", EMB_PATH.exists())
print("Index IDs:", ID_PATH, "→", ID_PATH.exists())
print("Spans CSV:", SPANS_PATH, "→", SPANS_PATH.exists())
print("Model info:", MODEL_INFO_PATH, "→", MODEL_INFO_PATH.exists())
print("FAISS index:", INDEX_PATH, "→", INDEX_PATH.exists())

Using DOMAIN: money-heist
Embeddings: /data/sundeep/Fandom_SI/data/embeddings/spans_money-heist.npy → True
Index IDs: /data/sundeep/Fandom_SI/data/embeddings/spans_money-heist.index_ids.npy → True
Spans CSV: /data/sundeep/Fandom_SI/data/processed/spans_money-heist.csv → True
Model info: /data/sundeep/Fandom_SI/data/embeddings/model_info_money-heist.json → True
FAISS index: /data/sundeep/Fandom_SI/data/indexes/faiss_flat_money-heist.index → True


In [3]:
# Load embeddings + ids
embeddings = np.load(EMB_PATH)
index_ids = np.load(ID_PATH, allow_pickle=True)

print("Embeddings shape:", embeddings.shape)
print("Index IDs shape:", index_ids.shape)

# Normalize types: treat span_ids as strings everywhere
index_ids = np.array(index_ids, dtype=str)

# Load spans CSV
df_spans = pd.read_csv(SPANS_PATH)
print("Spans columns:", df_spans.columns.tolist())
print("Num spans:", len(df_spans))

# Ensure span_id + text columns
assert "span_id" in df_spans.columns, "Expected 'span_id' column in spans CSV."
assert "text" in df_spans.columns, "Expected 'text' column in spans CSV."

df_spans["span_id"] = df_spans["span_id"].astype(str)

# Map span_id -> row index
spanid_to_row = {sid: i for i, sid in enumerate(df_spans["span_id"].tolist())}

# Check coverage
missing = [sid for sid in index_ids if sid not in spanid_to_row]
print("IDs in index_ids but not in spans.csv:", len(missing))
print("Example missing:", missing[:10])

Embeddings shape: (3189, 384)
Index IDs shape: (3189,)
Spans columns: ['span_id', 'article_id', 'page_name', 'title', 'section', 'span_index', 'start_char', 'end_char', 'len_chars', 'num_sents', 'text', 'url', 'source_path']
Num spans: 3189
IDs in index_ids but not in spans.csv: 0
Example missing: []


In [4]:
model_name = None

if MODEL_INFO_PATH.exists():
    with MODEL_INFO_PATH.open("r", encoding="utf-8") as f:
        model_info = json.load(f)
    print("model_info:", model_info)
    model_name = model_info.get("model_name", None)

if model_name is None:
    # Fallback: set manually if not in model_info file
    model_name = "sentence-transformers/all-MiniLM-L6-v2"

print("Encoder model name:", model_name)

if SentenceTransformer is None:
    print("⚠️ sentence-transformers not available, can't encode new queries.")
    encoder = None
else:
    print("Loading SentenceTransformer…")
    encoder = SentenceTransformer(model_name)
    print("Encoder loaded.")

model_info: {'domain': 'money-heist', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2', 'embedding_dim': 384, 'num_spans': 3189, 'normalize_embeddings': True, 'created_at': '2025-11-16T10:40:43.860142Z', 'spans_file': 'data/processed/spans_money-heist.csv', 'embeddings_file': 'data/embeddings/spans_money-heist.npy', 'index_ids_file': 'data/embeddings/spans_money-heist.index_ids.npy'}
Encoder model name: sentence-transformers/all-MiniLM-L6-v2
Loading SentenceTransformer…
Encoder loaded.


In [5]:
index = faiss.read_index(str(INDEX_PATH))
print("FAISS index type:", type(index))
print("Index dimension (d):", index.d)

# Sanity: ensure FAISS and embeddings agree on dimension
d_emb = embeddings.shape[1]
assert index.d == d_emb, f"Dim mismatch: FAISS d={index.d}, embeddings dim={d_emb}"

# Ensure float32 for FAISS
if embeddings.dtype != np.float32:
    print("Casting embeddings to float32 for FAISS compatibility.")
    embeddings = embeddings.astype("float32")

FAISS index type: <class 'faiss.swigfaiss.IndexFlatIP'>
Index dimension (d): 384


In [6]:
index = faiss.read_index(str(INDEX_PATH))
print("FAISS index type:", type(index))
print("Index dimension (d):", index.d)

# Sanity: ensure FAISS and embeddings agree on dimension
d_emb = embeddings.shape[1]
assert index.d == d_emb, f"Dim mismatch: FAISS d={index.d}, embeddings dim={d_emb}"

# Ensure float32 for FAISS
if embeddings.dtype != np.float32:
    print("Casting embeddings to float32 for FAISS compatibility.")
    embeddings = embeddings.astype("float32")

FAISS index type: <class 'faiss.swigfaiss.IndexFlatIP'>
Index dimension (d): 384


In [7]:
def get_span_text(span_id: str):
    idx = spanid_to_row.get(span_id, None)
    if idx is None:
        return None
    return str(df_spans.iloc[idx]["text"])


def encode_query_text(text: str) -> np.ndarray:
    """
    Encode a free-form query string using the same encoder as spans.
    Returns shape (1, D) float32.
    """
    assert encoder is not None, "Encoder not loaded (sentence-transformers missing)."
    vec = encoder.encode([text], convert_to_numpy=True)
    if vec.dtype != np.float32:
        vec = vec.astype("float32")
    return vec


def search_with_vector(query_vec: np.ndarray, top_k: int = 10):
    """
    Search FAISS index with an already-computed query embedding.
    query_vec: (1, D) float32
    Returns: list of dicts with rank, score, span_id, text.
    """
    # Ensure correct shape & dtype
    if query_vec.ndim == 1:
        query_vec = query_vec[None, :]
    if query_vec.dtype != np.float32:
        query_vec = query_vec.astype("float32")

    D, I = index.search(query_vec, top_k)  # D: distances, I: indices
    D = D[0]
    I = I[0]

    results = []
    for rank, (dist, idx) in enumerate(zip(D, I)):
        if idx < 0:
            continue
        span_id = index_ids[idx]
        text = get_span_text(span_id)
        results.append(
            {
                "rank": rank,
                "array_idx": int(idx),
                "span_id": span_id,
                "score": float(dist),
                "text": text,
            }
        )
    return results

In [8]:
def pretty_print_results(results, max_chars=260):
    for r in results:
        print("\n" + "-"*80)
        print(f"Rank {r['rank']} | span_id={r['span_id']} | score={r['score']:.4f}")
        print(textwrap.fill((r["text"] or "").replace("\n", " "), width=100))
    print("\n" + "="*80 + "\n")


# Example interactive query cell
if encoder is None:
    print("⚠️ Cannot run free-form query: encoder not available.")
else:
    query = "brief backstory of Tokyo and the Professor"  # change this or use input()
    print("QUERY:", query)
    q_vec = encode_query_text(query)
    results = search_with_vector(q_vec, top_k=10)
    pretty_print_results(results)

QUERY: brief backstory of Tokyo and the Professor

--------------------------------------------------------------------------------
Rank 0 | span_id=money-heist_span_0002937 | score=0.6840
Comparing to the Spanish Tokyo . They have similarities: Both trying to escape a crime they commit.
The Professor shows up to save her. Both became most wanted in the area.

--------------------------------------------------------------------------------
Rank 1 | span_id=money-heist_span_0002924 | score=0.6726
The Professor was the first person that Tokyo meets in the crew, and was the one who saved her from
spending her life in jail. Throughout the series, Tokyo has shown to greatly admire and respect The
Professor, even going so far as to call him her 'guardian angel'.

--------------------------------------------------------------------------------
Rank 2 | span_id=money-heist_span_0002863 | score=0.6726
The Professor was the first person that Tokyo meets in the crew, and was the one who saved her

In [9]:
if encoder is not None:
    query = input("Enter a query: ")
    q_vec = encode_query_text(query)
    results = search_with_vector(q_vec, top_k=10)
    pretty_print_results(results)


--------------------------------------------------------------------------------
Rank 0 | span_id=money-heist_span_0000519 | score=0.2914
↑ Episode 11 (Part 1)  ↑ Episode 13 (Part 1)

--------------------------------------------------------------------------------
Rank 1 | span_id=money-heist_span_0000435 | score=0.2909
↑ Episode 7 (Part 2)  ↑ Episode 3 (Part 1)

--------------------------------------------------------------------------------
Rank 2 | span_id=money-heist_span_0000404 | score=0.2909
↑ Episode 7 (Part 2)  ↑ Episode 3 (Part 1)

--------------------------------------------------------------------------------
Rank 3 | span_id=money-heist_span_0000199 | score=0.2814
↑ Episode 4 (Part 1)  ↑ Episode 4 (Part 1)

--------------------------------------------------------------------------------
Rank 4 | span_id=money-heist_span_0000760 | score=0.2800
↑ Episode 2 (Part 1)  ↑ Episode 3 (Part 1)

--------------------------------------------------------------------------------
Rank 5

In [10]:
# Pick a random span_id from this DOMAIN
random_span_id = random.choice(index_ids.tolist())
print("Random span_id:", random_span_id)

# Get its embedding from the embeddings matrix
# index_ids[idx] == span_id, so we find that idx
array_idx = np.where(index_ids == random_span_id)[0][0]
query_vec = embeddings[array_idx : array_idx + 1]  # (1, D)

print("\n[QUERY SPAN TEXT]")
print(textwrap.fill(get_span_text(random_span_id) or "(no text)", width=100))

results = search_with_vector(query_vec, top_k=10)
pretty_print_results(results)

Random span_id: money-heist_span_0002430

[QUERY SPAN TEXT]
Mónica Gaztambide , accompanied by 6 other people, all wearing Dali Masks and red jumpsuits come out
the entrance of the mint. Mónica takes off her mask and speaks into a megaphone, making a statement
on behalf of the robbers. She asks the police not to shoot and reveals that there are 67 hostages in
perfect health and well cared for. She also explains that the hostages are dressed exactly the same
of the robbers, so any attempts of assault could cost lives.

--------------------------------------------------------------------------------
Rank 0 | span_id=money-heist_span_0002430 | score=1.0000
Mónica Gaztambide , accompanied by 6 other people, all wearing Dali Masks and red jumpsuits come out
the entrance of the mint. Mónica takes off her mask and speaks into a megaphone, making a statement
on behalf of the robbers. She asks the police not to shoot and reveals that there are 67 hostages in
perfect health and well cared for. S

In [11]:
# Pick a random span_id from this DOMAIN
random_span_id = random.choice(index_ids.tolist())
print("Random span_id:", random_span_id)

# Get its embedding from the embeddings matrix
# index_ids[idx] == span_id, so we find that idx
array_idx = np.where(index_ids == random_span_id)[0][0]
query_vec = embeddings[array_idx : array_idx + 1]  # (1, D)

print("\n[QUERY SPAN TEXT]")
print(textwrap.fill(get_span_text(random_span_id) or "(no text)", width=100))

results = search_with_vector(query_vec, top_k=10)
pretty_print_results(results)

Random span_id: money-heist_span_0001111

[QUERY SPAN TEXT]
Juan Fernández as Alfonso Prieto  Anna Gras as Mercedes Colmenar  Fernando Soto as Ángel Rubio
Mario de la Rosa as Suárez

--------------------------------------------------------------------------------
Rank 0 | span_id=money-heist_span_0001237 | score=1.0000
Juan Fernández as Alfonso Prieto  Anna Gras as Mercedes Colmenar  Fernando Soto as Ángel Rubio
Mario de la Rosa as Suárez

--------------------------------------------------------------------------------
Rank 1 | span_id=money-heist_span_0001192 | score=1.0000
Juan Fernández as Alfonso Prieto  Anna Gras as Mercedes Colmenar  Fernando Soto as Ángel Rubio
Mario de la Rosa as Suárez

--------------------------------------------------------------------------------
Rank 2 | span_id=money-heist_span_0001149 | score=1.0000
Juan Fernández as Alfonso Prieto  Anna Gras as Mercedes Colmenar  Fernando Soto as Ángel Rubio
Mario de la Rosa as Suárez

----------------------------------