In [None]:
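# Uncomment to install the notebook's dependencies into the kernel's environment
# (%pip targets the running kernel, unlike !pip):
# %pip install sentence-transformers langchain pypdf faiss-cpu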

In [None]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS


In [None]:
# Folders scanned for source PDFs (non-recursive). The paths are relative, so
# they resolve against the kernel's current working directory.
PDF_FOLDERS = ['data/law', 'data/security']


def _list_pdfs(folder):
    """Return the paths of the PDF files directly inside ``folder``.

    Args:
        folder: Directory to scan; sub-directories are not descended into.

    Returns:
        A list of ``os.path.join(folder, name)`` paths, sorted by file name.

    Raises:
        FileNotFoundError: If ``folder`` does not exist (raised by ``os.listdir``).
    """
    return [
        os.path.join(folder, fname)
        # os.listdir yields entries in arbitrary order; sorting keeps the
        # document order (and everything built from it) stable across runs.
        for fname in sorted(os.listdir(folder))
        # Case-insensitive match so files named e.g. 'REPORT.PDF' are not skipped.
        if fname.lower().endswith('.pdf')
    ]


pdf_paths = [path for folder in PDF_FOLDERS for path in _list_pdfs(folder)]
len(pdf_paths)

In [None]:
# Read every PDF and flatten the per-file results into a single list,
# preserving the order of pdf_paths.
docs = [
    loaded_doc
    for pdf_path in pdf_paths
    for loaded_doc in PyPDFLoader(pdf_path).load()
]
len(docs)

In [None]:
# Chunking parameters (presumably measured in characters — confirm against the
# splitter's length function).
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Break the loaded documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs_split = text_splitter.split_documents(docs)
len(docs_split)

In [None]:
# Embed the chunks with a sentence-transformers model and index them in FAISS.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)
vector_store = FAISS.from_documents(docs_split, embedding_model)

# Write the index to disk, creating the target directory first if needed.
# NOTE(review): the PDFs are read from 'data/...' but the index is written
# under '../data/...' — confirm both are meant to resolve to the same data
# directory for the working directory this notebook runs in.
vector_store_dir = '../data/vector_store'
os.makedirs(vector_store_dir, exist_ok=True)
vector_store.save_local(vector_store_dir)