In [6]:
!pip install --upgrade faiss-cpu langchain-community sentence-transformers
!pip install --upgrade langchain
!pip install --upgrade -qU langchain-text-splitters
!pip install pdfplumber 
!pip install spacy
!pip install lxml
!pip install nltk
!pip install dotenv
!pip install --upgrade langchain-huggingface



In [8]:
import zipfile
import os
from langchain_community.document_loaders import UnstructuredXMLLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

In [9]:
# --- 1. Process DITA archives ---
# Unpack every archive first, then walk the extraction folder exactly once.
# Walking inside the per-archive loop would revisit the files of every archive
# extracted so far and load those documents again on each iteration.
for zip_file in ["./lib/FARDITA.zip", "./lib/DFARSDITA.zip"]: # Replace with your filenames
    with zipfile.ZipFile(zip_file, 'r') as zf:
        zf.extractall("dita_content")

dita_docs = []
for root, dirs, files in os.walk("dita_content"):
    dirs.sort()  # in-place sort makes the traversal order reproducible across runs
    for file in sorted(files):
        if file.endswith((".dita", ".xml")):
            file_path = os.path.join(root, file)
            # Use UnstructuredXMLLoader for parsing DITA XML
            loader = UnstructuredXMLLoader(file_path)
            dita_docs.extend(loader.load())

# --- 2. Process PDF file ---

pdf_file = "./lib/USCODE-2024-title10.4022.pdf" # Replace with your filename
loader = PyPDFLoader(pdf_file)
pdf_docs = loader.load()

# Combine all documents
all_docs = dita_docs + pdf_docs

# Show counts plus a short preview instead of dumping a hundred full documents.
print(f"Loaded {len(dita_docs)} DITA documents and {len(pdf_docs)} PDF documents")
print(all_docs[:3])

[Document(metadata={'source': 'dita_content\\dita\\1.000.dita'}, page_content='1.000\n\nsubparts\xa0 1.2\n\n1.3\n\n1.4\n\nThis part sets forth basic policies and general information about the Federal Acquisition Regulations System including purpose, authority, applicability, issuance, arrangement, numbering, dissemination, implementation, supplementation, maintenance, administration, and deviation. '), Document(metadata={'source': 'dita_content\\dita\\1.101.dita'}, page_content='1.101\n\n1.301\n\nThe Federal Acquisition Regulations System is established for the codification and publication of uniform policies and procedures for acquisition by all executive agencies. The Federal Acquisition Regulations System consists of the Federal Acquisition Regulation (FAR), which is the primary document, and agency acquisition regulations that implement or supplement the FAR. The FAR System does not include internal agency guidance of the type described in '), Document(metadata={'source': 'dita_con

In [None]:
# Break every loaded document into overlapping text chunks.
# Tune chunk_size / chunk_overlap to trade retrieval granularity against context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)
chunked_docs = text_splitter.split_documents(all_docs)

# Local sentence-embedding model: "all-MiniLM-L6-v2" is lightweight and
# effective (384 dimensions).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed the chunks and build the FAISS vector store from them.
db = FAISS.from_documents(documents=chunked_docs, embedding=embeddings)

In [None]:
# Persist the FAISS index to disk.
# `FAISS.from_documents` already embedded and stored every chunk when `db` was
# created, so nothing has to be added again before saving. (A Chroma-style
# `db.add(documents=..., metadatas=..., ids=...)` call does not exist on the
# LangChain FAISS store, and re-adding the same chunks would only duplicate
# every vector.)
db.save_local("faiss_index")
print("FAISS index saved successfully.")
print(f"Successfully processed {len(chunked_docs)} chunks and stored in FAISS at faiss_index")

In [None]:
import shutil

# Package the persisted vector store for sharing.
# The vectors produced by this notebook live in the FAISS index folder written
# by `db.save_local("faiss_index")`, so that folder is what gets archived.
# (No Chroma client is created here: `chromadb` is neither imported nor
# populated by this notebook, so its folder would contain an empty collection.)
index_dir = "faiss_index"
if not os.path.isdir(index_dir):
    raise FileNotFoundError(
        f"'{index_dir}' not found - run the cell that calls db.save_local first."
    )

# GitHub rejects individual files larger than 100MB; zipping the index folder
# keeps the upload to a single, smaller file.
shutil.make_archive("faiss_index_export", 'zip', index_dir)
print("Database saved and compressed to faiss_index_export.zip")
