# Knowledge Base Ingestion

This notebook ingests research PDFs from the `research_papers/` folder, extracts text, chunks it, creates embeddings, and builds a FAISS index for retrieval.


In [None]:
# Install required packages (run once)
!pip install -q pdfplumber sentence-transformers faiss-cpu groq streamlit


In [None]:
# Import modules
import os
import sys
from pathlib import Path

# Make the current working directory importable so that the `src` package
# resolves (this assumes the notebook is run from the directory that contains
# `src/`). The membership check keeps sys.path free of duplicate entries when
# this cell is executed more than once.
project_root = str(Path.cwd())
if project_root not in sys.path:
    sys.path.append(project_root)

# Must come after the sys.path adjustment above.
from src.kb.ingest import ingest_knowledge_base

# Make sure the PDF input folder and the output folder both exist;
# exist_ok=True makes this a no-op when they are already present.
for directory in ("research_papers", "data"):
    os.makedirs(directory, exist_ok=True)

print("Setup complete!")


## Step 1: Upload PDFs to research_papers folder

**Important:** Upload your research PDFs to the `research_papers/` folder. The ingestion will process all PDF files in this folder.

You can upload files using:
- Colab: Files → Upload to session storage → Select `research_papers/` folder
- Or use the file browser on the left sidebar


In [None]:
# List PDFs in research_papers folder
from pathlib import Path

pdf_folder = Path("research_papers")
# glob() yields entries in a filesystem-dependent order; sorting makes the
# listing identical from run to run.
pdf_files = sorted(pdf_folder.glob("*.pdf"))

print(f"Found {len(pdf_files)} PDF files:")
for pdf in pdf_files:
    print(f"  - {pdf.name}")

# An empty list means there is nothing to ingest yet, so warn the user.
if not pdf_files:
    print("\n⚠️  No PDFs found! Please upload PDFs to the research_papers/ folder.")


## Step 2: Run Ingestion

This will:
1. Extract text from all PDFs
2. Chunk text into ~400-word chunks with 80-word overlap
3. Generate embeddings using sentence-transformers (all-MiniLM-L6-v2)
4. Build FAISS index
5. Save index and metadata to `data/` folder


In [None]:
# Run ingestion
# Every argument is gathered in one mapping so the whole configuration can be
# read (and tweaked) in a single place before the call is made.
ingestion_options = {
    "pdf_folder": "./research_papers",
    "index_path": "./data/faiss_index.bin",
    "metadata_path": "./data/metadata.jsonl",
    "model_name": "all-MiniLM-L6-v2",
    "index_type": "flat",  # Use "hnsw" for larger datasets (>10k chunks)
}
ingest_knowledge_base(**ingestion_options)


## Step 3: Verify Ingestion

Check that the index and metadata files were created successfully.


In [None]:
# Verify files exist
import json
import os

index_path = "./data/faiss_index.bin"
metadata_path = "./data/metadata.jsonl"


def summarize_metadata(path):
    """Count the non-blank lines of a JSONL file and parse the first one.

    The file is read in a single pass inside a ``with`` block, so the handle is
    always closed, and blank lines are ignored both for the count and when
    picking the sample record.

    Args:
        path: Path to a JSONL file (one JSON document per non-blank line).

    Returns:
        A ``(count, first_record)`` tuple. ``first_record`` is the parsed first
        non-blank line, or ``None`` when the file has no non-blank lines.

    Raises:
        json.JSONDecodeError: If the first non-blank line is not valid JSON.
    """
    count = 0
    first_record = None
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            if count == 0:
                # Only the first non-blank line is parsed; the rest are just counted.
                first_record = json.loads(line)
            count += 1
    return count, first_record


print(f"FAISS index exists: {os.path.exists(index_path)}")
print(f"Metadata file exists: {os.path.exists(metadata_path)}")

if os.path.exists(metadata_path):
    # Count chunks and grab a sample in one read of the file
    chunk_count, sample_chunk = summarize_metadata(metadata_path)
    print(f"\nTotal chunks created: {chunk_count}")

    # Show sample chunk (skipped when the metadata file has no records)
    if sample_chunk is not None:
        print("\nSample chunk:")
        print(f"  Paper: {sample_chunk.get('paper_title', 'N/A')}")
        print(f"  Chunk index: {sample_chunk.get('chunk_index', 'N/A')}")
        print(f"  Text preview: {sample_chunk.get('text', '')[:200]}...")


## Step 4: Test Retrieval

Test that retrieval works correctly.


In [None]:
# Smoke-test retrieval with a sample query
from src.kb.retriever import retrieve_chunks

query = "psychometric predictors of loan repayment"
results = retrieve_chunks(query, k=5)

print(f"Retrieved {len(results)} chunks for query: '{query}'\n")

# Print each hit with its 1-based rank, score, source paper and a 200-character preview.
for rank, hit in enumerate(results, start=1):
    print(f"Chunk {rank} (score: {hit['score']:.4f}):")
    print(f"  Paper: {hit['paper_title']}")
    print(f"  Text: {hit['text'][:200]}...")
    print()
