In [2]:
# in this code we will try to load the document into a seperate vector database
# then we will apply ranking over the retrieved documents from all the vector databases
# the retriever will be a multi vector database retriever with ranking capabilities


In [8]:
# read the documents from the data/data_information folder
# create a vector database for each document
# then create a multi vector database retriever with ranking capabilities
# then create a langchain agent with the retriever
import os
from pathlib import Path

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS

from langchain_core.documents import Document

# For ranking
from typing import List, Tuple

In [9]:
# In[2]: Load documents from folder

DATA_DIR = Path("../data/data_information")
assert DATA_DIR.exists(), f"{DATA_DIR} does not exist"

def load_single_file(path: Path) -> List[Document]:
    """
    Load a single text-like file into LangChain Documents.
    You can extend for PDFs, etc.
    """
    loader = TextLoader(str(path), encoding="utf-8")
    docs = loader.load()
    # Attach metadata to remember which file they came from
    for d in docs:
        d.metadata["source_file"] = path.name
    return docs

all_docs_by_file = {}  # filename -> [Document, ...]
for file_path in DATA_DIR.iterdir():
    if file_path.is_file():
        docs = load_single_file(file_path)
        all_docs_by_file[file_path.name] = docs

all_docs_by_file.keys()


dict_keys(['order_items.txt', 'support_tickets.txt', 'states.txt', 'products.txt', 'addresses.txt', 'orders.txt', 'countries.txt', 'users.txt', 'payments.txt', 'user_profiles.txt'])

In [10]:
# In[3]: Split documents into chunks per file

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    length_function=len,
)

chunked_docs_by_file = {}  # filename -> [Document chunks]

for filename, docs in all_docs_by_file.items():
    chunks = text_splitter.split_documents(docs)
    # Ensure metadata keeps track of original file
    for c in chunks:
        c.metadata["source_file"] = filename
    chunked_docs_by_file[filename] = chunks

{fn: len(chs) for fn, chs in chunked_docs_by_file.items()}


{'order_items.txt': 1,
 'support_tickets.txt': 1,
 'states.txt': 1,
 'products.txt': 1,
 'addresses.txt': 1,
 'orders.txt': 1,
 'countries.txt': 1,
 'users.txt': 1,
 'payments.txt': 1,
 'user_profiles.txt': 1}

In [12]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstores_by_file = {}  # filename -> FAISS vectorstore

for filename, chunks in chunked_docs_by_file.items():
    if not chunks:
        continue
    vs = FAISS.from_documents(chunks, embeddings)
    vectorstores_by_file[filename] = vs

vectorstores_by_file


{'order_items.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12ccc2ba0>,
 'support_tickets.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12cd38e10>,
 'states.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12cd38f50>,
 'products.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12cd43820>,
 'addresses.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12cd43e10>,
 'orders.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12fba7ad0>,
 'countries.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12046b9b0>,
 'users.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12046bdf0>,
 'payments.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12fbc1450>,
 'user_profiles.txt': <langchain_community.vectorstores.faiss.FAISS at 0x12fc12750>}

In [14]:
# In[5]: Multi-vector retriever with pure embedding ranking

def retrieve_from_all(
    query: str,
    k_per_store: int = 4,
) -> List[Tuple[Document, float, str]]:
    """
    Query each vectorstore and return (doc, distance, source_file).
    distance: smaller = better match.
    """
    results = []
    for filename, vs in vectorstores_by_file.items():
        # For FAISS: similarity_search_with_score returns (doc, distance)
        docs_and_scores = vs.similarity_search_with_score(query, k=k_per_store)
        for doc, distance in docs_and_scores:
            results.append((doc, distance, filename))
    return results


def rank_results_by_distance(
    results: List[Tuple[Document, float, str]],
    top_n: int = 5,
) -> List[Tuple[Document, float, str]]:
    """
    Convert distances to relevance scores and sort.
    Relevance = 1 / (1 + distance). Larger = more relevant.
    """
    scored = []
    for doc, distance, filename in results:
        relevance = 1.0 / (1.0 + distance)
        scored.append((doc, relevance, filename))

    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:top_n]


def multi_vector_search(
    query: str,
    k_per_store: int = 4,
    top_n: int = 5,
) -> List[Tuple[Document, float, str]]:
    """
    Full pipeline:
    - retrieve from every vector DB
    - rank globally by embedding relevance
    """
    raw_results = retrieve_from_all(query, k_per_store=k_per_store)
    ranked = rank_results_by_distance(raw_results, top_n=top_n)
    return ranked


In [15]:
# In[6]: Pretty-print ranked results

def print_ranked_results(query: str, top_n: int = 5):
    print("=" * 80)
    print(f"QUERY: {query}")
    print("=" * 80)

    results = multi_vector_search(query, top_n=top_n)

    for idx, (doc, relevance, filename) in enumerate(results, start=1):
        snippet = doc.page_content.replace("\n", " ")
        if len(snippet) > 300:
            snippet = snippet[:300] + "..."
        print(f"\nRank #{idx}")
        print(f"  Source file: {filename}")
        print(f"  Relevance score: {relevance:.4f}")
        print(f"  Metadata: {doc.metadata}")
        print(f"  Snippet: {snippet}")
    print("\n")


In [None]:
# In[10]: Define test cases (gold labels)

from typing import Set, Dict, Any, List

"""
Each test case:
- query: user query string
- relevant_sources: set of source_file names considered relevant
  (you decide this manually based on your knowledge of the files)
"""

test_cases: List[Dict[str, Any]] = [
    {
        "name": "User demographics",
        "query": "What data describes the demographics of a user?",
        "relevant_sources": {"users.txt", "user_profiles.txt", "countries.txt", "states.txt"},
    },
    {
        "name": "Order lifecycle",
        "query": "Explain the lifecycle of an order and its statuses.",
        "relevant_sources": {"orders.txt"},
    },
    {
        "name": "Orders and payments",
        "query": "How are orders linked to payments and payment status?",
        "relevant_sources": {"orders.txt", "payments.txt"},
    },
    {
        "name": "Products and pricing",
        "query": "What fields describe a product, its price and availability?",
        "relevant_sources": {"products.txt"},
    },
    {
        "name": "Order line items",
        "query": "Where can I find details for items inside an order?",
        "relevant_sources": {"order_items.txt", "orders.txt", "products.txt"},
    },
    {
        "name": "Addresses and geography",
        "query": "How is a user's address and location represented in the data model?",
        "relevant_sources": {"addresses.txt", "countries.txt", "states.txt", "users.txt"},
    },
    {
        "name": "Customer support",
        "query": "Which tables track customer support issues and their status?",
        "relevant_sources": {"support_tickets.txt", "users.txt", "orders.txt"},
    },
    {
        "name": "Revenue and monetary values",
        "query": "Which tables contain monetary amounts used for revenue analysis?",
        "relevant_sources": {"orders.txt", "payments.txt", "order_items.txt", "products.txt"},
    },
    {
        "name": "Customer lifetime value",
        "query": "What tables are needed to compute customer lifetime value?",
        "relevant_sources": {"orders.txt", "payments.txt", "users.txt", "user_profiles.txt"},
    },
    {
        "name": "Shopping workflow",
        "query": "Describe the workflow from browsing products to paying for an order.",
        "relevant_sources": {"products.txt", "orders.txt", "order_items.txt", "payments.txt"},
    },
]


# In[11]: Helper to get ranked sources from your retriever

def get_ranked_sources_for_query(query: str, top_n: int = 10) -> List[str]:
    """
    Uses your multi_vector_search() which returns:
    List[(Document, score, filename)]
    We map to a ranked list of unique source filenames.
    """
    results = multi_vector_search(query, top_n=top_n)
    
    ranked_sources: List[str] = []
    seen: Set[str] = set()
    for doc, score, filename in results:
        sf = doc.metadata.get("source_file", filename)
        if sf not in seen:
            ranked_sources.append(sf)
            seen.add(sf)
    return ranked_sources


# For BaseRetriever:
#docs = mv_retriever.get_relevant_documents(query)
#ranked_sources = [d.metadata["source_file"] for d in docs]


In [23]:
# In[12]: Metric functions

import math

def precision_at_k(pred: List[str], rel: Set[str], k: int) -> float:
    if k == 0:
        return 0.0
    pred_k = pred[:k]
    if len(pred_k) == 0:
        return 0.0
    num_rel = sum(1 for p in pred_k if p in rel)
    return num_rel / len(pred_k)


def recall_at_k(pred: List[str], rel: Set[str], k: int) -> float:
    if not rel:
        return 0.0
    pred_k = pred[:k]
    num_rel = sum(1 for p in pred_k if p in rel)
    return num_rel / len(rel)


def average_precision(pred: List[str], rel: Set[str]) -> float:
    """
    AP = average of precision@k at all ranks k where the item is relevant.
    """
    if not rel:
        return 0.0
    ap_sum = 0.0
    num_hits = 0
    for i, p in enumerate(pred, start=1):
        if p in rel:
            num_hits += 1
            ap_sum += num_hits / i
    if num_hits == 0:
        return 0.0
    return ap_sum / len(rel)


def reciprocal_rank(pred: List[str], rel: Set[str]) -> float:
    """
    RR = 1 / rank of first relevant document, 0 if none.
    """
    for i, p in enumerate(pred, start=1):
        if p in rel:
            return 1.0 / i
    return 0.0


def dcg_at_k(pred: List[str], rel: Set[str], k: int) -> float:
    """
    DCG with binary relevance: rel_i âˆˆ {0,1}
    """
    dcg = 0.0
    for i, p in enumerate(pred[:k], start=1):
        rel_i = 1.0 if p in rel else 0.0
        if rel_i > 0:
            dcg += rel_i / math.log2(i + 1)
    return dcg


def ndcg_at_k(pred: List[str], rel: Set[str], k: int) -> float:
    """
    nDCG@K = DCG@K / IDCG@K
    With binary relevance, ideal ranking is all relevant docs at the top.
    """
    if not rel:
        return 0.0
    dcg = dcg_at_k(pred, rel, k)
    
    # Ideal DCG: all relevant docs sorted first (but capped at k)
    ideal_list = [1.0] * min(len(rel), k)
    idcg = 0.0
    for i, rel_i in enumerate(ideal_list, start=1):
        idcg += rel_i / math.log2(i + 1)
    if idcg == 0.0:
        return 0.0
    return dcg / idcg


In [24]:
# In[13]: Evaluate retriever on test cases

def evaluate_retriever(
    test_cases: List[Dict[str, Any]],
    top_n: int = 10,
    k_metrics: int = 5,  # for P@k, R@k, nDCG@k
):
    results = []
    
    for tc in test_cases:
        name = tc["name"]
        query = tc["query"]
        relevant_sources: Set[str] = set(tc["relevant_sources"])
        
        pred_sources = get_ranked_sources_for_query(query, top_n=top_n)
        
        p_at_k = precision_at_k(pred_sources, relevant_sources, k_metrics)
        r_at_k = recall_at_k(pred_sources, relevant_sources, k_metrics)
        ap = average_precision(pred_sources, relevant_sources)
        rr = reciprocal_rank(pred_sources, relevant_sources)
        ndcg = ndcg_at_k(pred_sources, relevant_sources, k_metrics)
        
        results.append({
            "name": name,
            "query": query,
            "P@{}".format(k_metrics): p_at_k,
            "R@{}".format(k_metrics): r_at_k,
            "AP": ap,
            "RR": rr,
            "nDCG@{}".format(k_metrics): ndcg,
            "pred_sources": pred_sources,
            "relevant_sources": relevant_sources,
        })
    
    # Macro averages
    avg_p = sum(r["P@{}".format(k_metrics)] for r in results) / len(results)
    avg_r = sum(r["R@{}".format(k_metrics)] for r in results) / len(results)
    avg_ap = sum(r["AP"] for r in results) / len(results)
    avg_rr = sum(r["RR"] for r in results) / len(results)
    avg_ndcg = sum(r["nDCG@{}".format(k_metrics)] for r in results) / len(results)
    
    summary = {
        "P@{}".format(k_metrics): avg_p,
        "R@{}".format(k_metrics): avg_r,
        "MAP": avg_ap,
        "MRR": avg_rr,
        "nDCG@{}".format(k_metrics): avg_ndcg,
    }
    
    return results, summary


In [25]:
# In[14]: Run evaluation

results, summary = evaluate_retriever(test_cases, top_n=10, k_metrics=5)

print("=== Summary Metrics ===")
for k, v in summary.items():
    print(f"{k}: {v:.3f}")

print("\n=== Per-Query Results ===")
for r in results:
    print(f"\nTest case: {r['name']}")
    print(f"Query: {r['query']}")
    print(f"P@5: {r['P@5']:.3f}, R@5: {r['R@5']:.3f}, AP: {r['AP']:.3f}, RR: {r['RR']:.3f}, nDCG@5: {r['nDCG@5']:.3f}")
    print(f"Relevant: {sorted(r['relevant_sources'])}")
    print(f"Predicted: {r['pred_sources']}")


=== Summary Metrics ===
P@5: 0.560
R@5: 0.942
MAP: 0.934
MRR: 1.000
nDCG@5: 0.944

=== Per-Query Results ===

Test case: User demographics
Query: What data describes the demographics of a user?
P@5: 0.800, R@5: 1.000, AP: 0.950, RR: 1.000, nDCG@5: 0.983
Relevant: ['countries.txt', 'states.txt', 'user_profiles.txt', 'users.txt']
Predicted: ['user_profiles.txt', 'users.txt', 'countries.txt', 'addresses.txt', 'states.txt', 'support_tickets.txt', 'products.txt', 'orders.txt', 'order_items.txt', 'payments.txt']

Test case: Order lifecycle
Query: Explain the lifecycle of an order and its statuses.
P@5: 0.200, R@5: 1.000, AP: 1.000, RR: 1.000, nDCG@5: 1.000
Relevant: ['orders.txt']
Predicted: ['orders.txt', 'products.txt', 'order_items.txt', 'payments.txt', 'support_tickets.txt', 'addresses.txt', 'states.txt', 'user_profiles.txt', 'countries.txt', 'users.txt']

Test case: Orders and payments
Query: How are orders linked to payments and payment status?
P@5: 0.400, R@5: 1.000, AP: 1.000, RR: 1.