# RAG Pipeline - Analysis & Testing

## loading the documents

In [None]:
import os
import sys
import json

sys.path.insert(0, os.path.abspath('..'))

from langchain_community.document_loaders import PyPDFLoader

# Directory that holds the source PDFs (resolved relative to the current
# working directory).
DATA_DIR = os.path.join('..', 'data')

# os.listdir() returns entries in arbitrary order, so sort the names to keep
# the load order (and therefore the order of all_docs) reproducible between
# runs. The extension check is case-insensitive so that files named
# "something.PDF" are not silently skipped.
pdf_files = sorted(
    name for name in os.listdir(DATA_DIR) if name.lower().endswith('.pdf')
)

print(f"found {len(pdf_files)} PDFs: {pdf_files}")

# Accumulates everything returned by the loader across all PDFs.
all_docs = []
for pdf in pdf_files:
    loader = PyPDFLoader(os.path.join(DATA_DIR, pdf))
    docs = loader.load()
    all_docs.extend(docs)
    # Whitespace-separated token count over every loaded document of this
    # PDF; the report below labels each loaded document as a page.
    total_words = sum(len(d.page_content.split()) for d in docs)
    print(f"{pdf}: {len(docs)} pages, {total_words} words")

## chunk analysis

In [None]:
import statistics

# Location of the chunk dump that is analysed below (resolved relative to
# the current working directory).
CHUNKS_PATH = os.path.join('..', 'chunks', 'chunks.json')

with open(CHUNKS_PATH, 'r', encoding='utf-8') as f:
    chunks = json.load(f)

print(f"total chunks: {len(chunks)}")

# Each chunk entry is read through its 'word_count' key.
word_counts = [c['word_count'] for c in chunks]

# min(), max() and the average all fail on an empty list, so the summary
# statistics are only reported when at least one chunk was loaded.
if word_counts:
    print(f"min words: {min(word_counts)}")
    print(f"max words: {max(word_counts)}")
    print(f"avg words: {sum(word_counts) / len(word_counts):.1f}")
    # statistics.median averages the two middle values when the count is
    # even; indexing the sorted list at len // 2 would report the upper
    # middle value instead of the median.
    print(f"median: {statistics.median(word_counts)}")
else:
    print("no chunks loaded; skipping word count statistics")

# distribution: half-open ranges, e.g. '50-100' counts 50 <= wc < 100
buckets = {'<50': 0, '50-100': 0, '100-150': 0, '150-200': 0, '200-300': 0, '>300': 0}
for wc in word_counts:
    if wc < 50:
        buckets['<50'] += 1
    elif wc < 100:
        buckets['50-100'] += 1
    elif wc < 150:
        buckets['100-150'] += 1
    elif wc < 200:
        buckets['150-200'] += 1
    elif wc < 300:
        buckets['200-300'] += 1
    else:
        # Note: a count of exactly 300 also lands here.
        buckets['>300'] += 1

print("\nword count distribution:")
for bucket, count in buckets.items():
    print(f"  {bucket}: {count}")

In [None]:
# look at a few example chunks: the first, the middle and the last one.
# Building the positions as a set drops repeats when there are fewer than
# three chunks (otherwise the same chunk is printed more than once), and the
# emptiness guard avoids an IndexError when no chunks were loaded. For three
# or more chunks the sorted order is first, middle, last.
sample_indices = sorted({0, len(chunks) // 2, len(chunks) - 1}) if chunks else []
for i in sample_indices:
    c = chunks[i]
    print(f"\nchunk {c['chunk_id']} (page {c['metadata'].get('page', '?')}, {c['word_count']} words):")
    # Only the first 250 characters are shown as a preview.
    print(c['content'][:250] + '...')
    print()

## embeddings and vector store

In [None]:
from src.embedder import get_embedding_model
from src.vector_store import get_vector_store, get_chunk_count
from src.config import EMBEDDING_MODEL, CHROMA_COLLECTION, VECTORDB_DIR

# Show which embedding model name the project configuration provides.
print(f"model: {EMBEDDING_MODEL}")

# Embed a throwaway query so the size and leading values of the returned
# vector can be inspected.
embed_model = get_embedding_model()
sample = embed_model.embed_query("test query")
print(f"embedding dim: {len(sample)}")
leading_values = sample[:5]
print(f"first 5 values: {leading_values}")

# Report the configured collection name and the value get_chunk_count()
# returns, which is printed as the number of indexed chunks.
print()
print(f"chromadb collection: {CHROMA_COLLECTION}")
indexed_total = get_chunk_count()
print(f"indexed chunks: {indexed_total}")

## testing retrieval

In [None]:
from src.retriever import retrieve_documents

# Sample questions used to eyeball retrieval quality.
queries = [
    "What are eBay's seller fees?",
    "What happens if my account is suspended?",
    "How does eBay handle intellectual property disputes?",
    "What is the arbitration agreement?",
    "Can I sell vehicles on eBay?",
]

# For every question, print the top three retrieved entries with their
# score, page (when present in the metadata) and a 100-character preview.
for q in queries:
    print()
    print(f"query: {q}")
    results = retrieve_documents(q, top_k=3)
    for rank, hit in enumerate(results, start=1):
        page = hit['metadata'].get('page', '?')
        score = hit['score']
        preview = hit['content'][:100]
        print(f"  [{rank}] score={score:.4f}, page {page}: {preview}...")

## full pipeline test (non-streaming)

In [None]:
from src.rag_pipeline import format_context
from src.generator import generate_response

# End-to-end check: retrieve chunks, build the context string, then ask the
# generator for a complete (non-streaming) answer.
test_q = "What are the fees charged to sellers on eBay?"

docs = retrieve_documents(test_q, top_k=5)
if len(docs) > 0:
    # Score of the first and of the last returned chunk.
    print(f"got {len(docs)} chunks, scores: {docs[0]['score']:.4f} to {docs[-1]['score']:.4f}")
else:
    # Indexing docs[0] / docs[-1] on an empty result would raise IndexError
    # before the rest of the pipeline could be exercised.
    print("got 0 chunks")

context = format_context(docs)
print(f"context: {len(context)} chars\n")

response = generate_response(test_q, context)
print("response:")
print(response)

In [None]:
# Out-of-domain check: the question has nothing to do with the indexed
# document, so the pipeline is expected to fail gracefully.
bad_q = "What is the weather in New York today?"
docs = retrieve_documents(bad_q, top_k=3)
print(f"query: {bad_q}")
retrieved_scores = [hit['score'] for hit in docs]
print(f"scores: {retrieved_scores}")

# Run the same context-building and generation steps on the off-topic query.
context = format_context(docs)
response = generate_response(bad_q, context)
print()
print(f"response: {response}")