# Law PDF Ingestion Pipeline

This notebook handles the ingestion of Sri Lankan Law PDFs into the Pinecone Vector Database.

**Steps:**
1.  Setup Environment
2.  Load PDFs from `../data/law_pdfs`
3.  Split Text (Recursive character splitting)
4.  Create the Pinecone index if it does not exist, then Embed & Upsert to Pinecone

In [None]:
from dotenv import load_dotenv
import os

# Load variables from the .env file one level above the working directory.
load_dotenv("../.env")

# Validate the required keys with an explicit raise instead of `assert`:
# assertions are stripped when Python runs with -O (e.g. if this notebook is
# exported to a script), which would silently skip the check. Collecting the
# names first also reports every missing key at once, not just the first.
REQUIRED_ENV_VARS = ("GOOGLE_API_KEY", "PINECONE_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.environ.get(name)]
if missing_env_vars:
    raise RuntimeError(
        f"{', '.join(missing_env_vars)} not found; "
        "set them in ../.env or in the process environment"
    )
print("Environment loaded.")

In [None]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

# Folder that holds the source law PDFs, relative to the working directory.
DATA_PATH = "../data/law_pdfs"

# exist_ok=True makes this a no-op when the folder is already there, so the
# loader below is always pointed at an existing directory.
os.makedirs(name=DATA_PATH, exist_ok=True)

print(f"Loading PDFs from {DATA_PATH}...")

# Build the directory loader, then read everything it finds into `docs`.
loader = PyPDFDirectoryLoader(DATA_PATH)
docs = loader.load()

# The message labels each loaded document as a page.
print(f"Loaded {len(docs)} pages.")

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters tuned for legal text: larger chunks might keep more of a
# section's context together. add_start_index=True asks the splitter to record
# each chunk's start offset in its metadata.
splitter_settings = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split every loaded document into overlapping chunks for embedding.
splits = text_splitter.split_documents(docs)
print(f"Created {len(splits)} chunks.")

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
import time

# Embedding model used for every chunk; the index dimension below must match
# the length of the vectors this model returns.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
index_name = os.environ.get("PINECONE_INDEX_NAME", "law-index")

# NOTE(review): 768 is assumed to be the vector size of models/embedding-001 —
# confirm against the model documentation before creating a new index.
EMBEDDING_DIMENSION = 768
# Upper bound on how long to wait for a freshly created index to become ready.
INDEX_READY_TIMEOUT_S = 300
INDEX_READY_POLL_S = 1

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Check if index exists, create if not
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    print(f"Creating index: {index_name}...")
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Poll until the index reports ready, but give up after a deadline so a
    # stuck or failed provisioning cannot hang the kernel indefinitely.
    # time.monotonic() is used so wall-clock adjustments cannot affect the wait.
    ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() >= ready_deadline:
            raise TimeoutError(
                f"Index {index_name} was not ready after {INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(INDEX_READY_POLL_S)
    print("Index created.")
else:
    print(f"Index {index_name} already exists.")

In [None]:
import hashlib


def _stable_chunk_id(chunk):
    """Return a deterministic vector ID for one chunk.

    The ID is the SHA-256 hex digest of the chunk's origin metadata plus its
    text, so the same chunk always maps to the same ID across runs.

    Args:
        chunk: A document chunk exposing `.metadata` (dict) and `.page_content` (str).

    Returns:
        A 64-character hexadecimal string.
    """
    # presumably "source"/"page" come from the PDF loader and "start_index"
    # from the splitter — missing keys fall back to "" so hashing never fails.
    origin = "|".join(
        str(chunk.metadata.get(field, "")) for field in ("source", "page", "start_index")
    )
    return hashlib.sha256(f"{origin}|{chunk.page_content}".encode("utf-8")).hexdigest()


# Without explicit IDs every run stores the same chunks again under new IDs,
# duplicating vectors in the index. Deterministic IDs make a re-run overwrite
# the existing vectors instead.
chunk_ids = [_stable_chunk_id(chunk) for chunk in splits]

print(f"Upserting {len(splits)} chunks to Pinecone...")
vector_store = PineconeVectorStore.from_documents(
    documents=splits,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids,
)
print("Ingestion Complete!")