In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import os

In [2]:
pdf_path = "data/example.pdf"

loader = PyPDFLoader(pdf_path)
documents = loader.load()

print("Total pages loaded:", len(documents))
documents[0]

Total pages loaded: 26


Document(metadata={'producer': 'PDPreStamp v3.3', 'creator': 'Adobe InDesign 16.4 (Windows)', 'creationdate': '2022-09-29T17:54:00+02:00', 'author': 'ISO', 'license': 'Information Handling Services, 2022', 'moddate': '2022-10-26T19:22:40+08:00', 'title': 'ISO/IEC 27001:2022', 'trapped': '/False', 'source': 'data/example.pdf', 'total_pages': 26, 'page': 0, 'page_label': '1'}, page_content="Information security, cybersecurity \nand privacy protection — Information \nsecurity management systems — \nRequirements\nSécurité de l'information, cybersécurité et protection de la vie \nprivée — Systèmes de management de la sécurité de l'information — \nExigences\nINTERNATIONAL \nSTANDARD\nISO/IEC \n27001\nThird edition  \n2022-10\nReference number \nISO/IEC 27001:2022(E)\n© ISO/IEC 2022\n--``,,,,,``````,,,,,`,`,`,`,,`,-`-`,,`,,`,`,,`---")

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

chunks = text_splitter.split_documents(documents)

print("Total chunks created:", len(chunks))
chunks[0]

Total chunks created: 152


Document(metadata={'producer': 'PDPreStamp v3.3', 'creator': 'Adobe InDesign 16.4 (Windows)', 'creationdate': '2022-09-29T17:54:00+02:00', 'author': 'ISO', 'license': 'Information Handling Services, 2022', 'moddate': '2022-10-26T19:22:40+08:00', 'title': 'ISO/IEC 27001:2022', 'trapped': '/False', 'source': 'data/example.pdf', 'total_pages': 26, 'page': 0, 'page_label': '1'}, page_content="Information security, cybersecurity \nand privacy protection — Information \nsecurity management systems — \nRequirements\nSécurité de l'information, cybersécurité et protection de la vie \nprivée — Systèmes de management de la sécurité de l'information — \nExigences\nINTERNATIONAL \nSTANDARD\nISO/IEC \n27001\nThird edition  \n2022-10\nReference number \nISO/IEC 27001:2022(E)\n© ISO/IEC 2022\n--``,,,,,``````,,,,,`,`,`,`,,`,-`-`,,`,,`,`,,`---")

In [4]:
for idx, chunk in enumerate(chunks):
    chunk.metadata["chunk_id"] = idx

chunks[10].metadata

{'producer': 'PDPreStamp v3.3',
 'creator': 'Adobe InDesign 16.4 (Windows)',
 'creationdate': '2022-09-29T17:54:00+02:00',
 'author': 'ISO',
 'license': 'Information Handling Services, 2022',
 'moddate': '2022-10-26T19:22:40+08:00',
 'title': 'ISO/IEC 27001:2022',
 'trapped': '/False',
 'source': 'data/example.pdf',
 'total_pages': 26,
 'page': 2,
 'page_label': '3',
 'chunk_id': 10}

In [5]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [7]:
persist_directory = "chroma_db"

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=persist_directory
)

print("Vector database created and persisted.")

Vector database created and persisted.


In [8]:
query = "What are the responsibilities of top management?"
k = 3

results = vector_db.similarity_search_with_score(query, k=k)

In [9]:
for doc, score in results:
    print("Chunk ID:", doc.metadata["chunk_id"])
    print("Source:", doc.metadata["source"])
    print("Page:", doc.metadata["page"])
    print("Similarity Score:", round(score, 3))
    print("Chunk Text:\n", doc.page_content)
    print("-" * 80)


Chunk ID: 57
Source: data/example.pdf
Page: 8
Similarity Score: 0.603
Chunk Text:
 security are assigned and communicated within the organization.
Top management shall assign the responsibility and authority for:
a) ensuring that the information security management system conforms to the requirements of this 
document; 
b) reporting on the performance of the information security management system to top management.
NOTE Top management can also assign responsibilities and authorities for reporting performance of the
--------------------------------------------------------------------------------
Chunk ID: 57
Source: data/example.pdf
Page: 8
Similarity Score: 0.603
Chunk Text:
 security are assigned and communicated within the organization.
Top management shall assign the responsibility and authority for:
a) ensuring that the information security management system conforms to the requirements of this 
document; 
b) reporting on the performance of the information security management syste

In [10]:
queries = [
    "What are the responsibilities of top management?",
    "What is the purpose of an information security management system?",
    "What are information security risk assessment requirements?"
]

for q in queries:
    print("\nQUERY:", q)
    results = vector_db.similarity_search_with_score(q, k=3)

    for doc, score in results:
        print({
            "chunk_text": doc.page_content[:150] + "...",
            "score": round(score, 3),
            "source": doc.metadata["source"],
            "page": doc.metadata["page"],
            "chunk_id": doc.metadata["chunk_id"]
        })


QUERY: What are the responsibilities of top management?
{'chunk_text': 'security are assigned and communicated within the organization.\nTop management shall assign the responsibility and authority for:\na) ensuring that the...', 'score': 0.603, 'source': 'data/example.pdf', 'page': 8, 'chunk_id': 57}
{'chunk_text': 'security are assigned and communicated within the organization.\nTop management shall assign the responsibility and authority for:\na) ensuring that the...', 'score': 0.603, 'source': 'data/example.pdf', 'page': 8, 'chunk_id': 57}
{'chunk_text': 'allocated according to the organization needs.\n5.3 Segregation of duties Control\nConflicting duties and conflicting areas of responsibility shall be s...', 'score': 1.033, 'source': 'data/example.pdf', 'page': 16, 'chunk_id': 101}

QUERY: What is the purpose of an information security management system?
{'chunk_text': 'risks are adequately managed.\nIt is important that the information security management system is part of and 