In [1]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [2]:
header_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header1"),
                         ("##", "Header2")],
    strip_headers=False
)

In [3]:
with open("data/extracted/IAC/cleaned_markdown/IAC Manual v9.md", "r", encoding="utf-8") as f:
    content = f.read()

In [4]:
splitted_markdown = header_splitter.split_text(content)

In [5]:
for doc in splitted_markdown:
    doc.metadata["Title"] = "IAC Manual v9"
    doc.metadata["added_date"] = "2024/04/30"

In [18]:
splitted_markdown[0].metadata["Header2"]

'1.1 About this Manual'

In [7]:
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="bge-m3")

In [8]:
from langchain_chroma import Chroma
vector_store = Chroma(
    collection_name="manuals_collection",
    embedding_function=embeddings,
    persist_directory="data/chroma_langchain_db",)

In [9]:
from uuid import uuid4
from langchain_core.documents import Document

uuids = [str(uuid4()) for _ in range(len(splitted_markdown))]

vector_store.add_documents(documents=splitted_markdown, ids=uuids)

['3fe1d137-0b67-463d-ab29-b7982f06b973',
 'ba46bd37-1085-4096-a92c-06257c5d040d',
 'f33890a0-1bfd-416f-a567-264eb18d256b',
 'c11f4438-aea7-4802-bbde-3a3ac1b08b60',
 '88dcd921-a008-4ffa-94e9-b01b47d1999c',
 '1b30e344-8aac-42ba-8359-5ee65eb1020d',
 'c5344f9c-5a1d-42e1-a205-d60e7b9f462a',
 '7db27460-a9f5-430a-b3a7-c98aae7b6bb2',
 '6521433a-e500-485c-b710-9d403ce7bffd',
 '6299228f-0a8f-4fc0-a498-cb253717c35a',
 '7cbc7023-cd87-41f5-a3cb-d57437dbba16',
 '348d0459-9ac6-44fe-a7c8-a2908d0bf791',
 '1022c432-91bb-4529-ad3f-53c5eb25b807',
 '6d1b9872-da57-4c0a-bb52-35892d060c21',
 'e178ba0c-acdc-43f2-8673-c2637c75d457',
 '05b99a94-d976-44de-ae41-2673d14bcb6d',
 'fb2552f9-9a7e-4230-a6dd-d5660af98eb3',
 'fafd40bc-58a3-4207-8672-b5d5c25c40f8',
 '2d25190f-6c4f-4e0b-b2ae-3e9940d19fc4',
 'ab4c2df0-f866-4f3f-9439-f2cd6fb81f3b',
 '62ac3d96-3882-40d4-968c-86c7b737dfd1',
 '7f471d89-733c-4295-bc9e-870c5618db8a',
 'c0c1926d-ead8-483c-a3ce-9e3fab4ce006',
 '350d5e6b-cd01-41ee-a6b4-6646ca3760d8',
 'd84e01ad-15b5-

In [10]:
results = vector_store.similarity_search(
    "What are the rules for using telephones?",
    k=1,
)
for res in results:
    print(f"id: {res.id}")

id: ca265a15-da84-476e-bee8-c1b3b68de66d


In [39]:
import re

def extract_sections_with_multiple_images(md_text):
    # 提取标题（如 1.2.3）
    section_pattern = re.compile(r'^(\d+(?:\.\d+){2,})\s+.*$', re.MULTILINE)
    # 提取图片路径
    image_pattern = re.compile(r'!\[\]\((.*?)\)')

    # 提取所有 section 标题及位置
    sections = []
    for match in section_pattern.finditer(md_text):
        sections.append({
            "number": match.group(1),
            "start": match.start()
        })

    # 添加一个虚拟结束位置（文档末尾），方便计算区段范围
    sections.append({
        "number": "END",
        "start": len(md_text)
    })

    results = []

    for i in range(len(sections) - 1):
        section_start = sections[i]["start"]
        section_end = sections[i + 1]["start"]
        section_number = sections[i]["number"]

        section_text = md_text[section_start:section_end]
        # 找出该段中所有图片
        for match in image_pattern.finditer(section_text):
            img_pos = match.start()
            img_path = match.group(1)
            # 截取该图片之前的内容（相对当前段）
            content_up_to_image = section_text[:img_pos].strip()
            results.append({
                "section_number": section_number,
                "image_path": img_path,
                "content": content_up_to_image
            })

    return results

In [40]:
result = [doc for doc in splitted_markdown if doc.metadata.get("Header2") == "10.2 Fallback procedure for ACC"]
print(result)

[Document(metadata={'Header1': 'Part 10 – Decentralization Plan', 'Header2': '10.2 Fallback procedure for ACC', 'Title': 'IAC Manual v9', 'added_date': '2024/04/30'}, page_content='## 10.2 Fallback procedure for ACC  \n10.2.1 When the fallback procedure is activated, the ACC functions of stand allocation, apron control would move to the fallback centre of Backup Apron Control Centre (Backup ACC) at AOC, 2/F of Midfield Operation and Maintenance Building II (MOMB II). Bus Control would move to Bus Control Office. FIDS would move to location that is subject to MMI’s availability. For the function of services and standard monitoring, it will be performed at AOC.  \n10.2.2 Airfield Department is responsible for the management, system provision and operation procedure of the fallback centre. Latest update of the equipment list and redundancy level shall be referred to Airfield Department. The equipment list at the fallback centre are as follow (the locations of systems at the fallback centr

In [None]:
md_content = result[0].page_content

results = extract_sections_with_multiple_images(md_content)

for item in results:
    print(f"Section {item['section_number']}:")
    print(item['content'])
    print(f"→ Image: {item['image_path']}")
    print('-' * 60)

Section 10.2.2:
10.2.2 Airfield Department is responsible for the management, system provision and operation procedure of the fallback centre. Latest update of the equipment list and redundancy level shall be referred to Airfield Department. The equipment list at the fallback centre are as follow (the locations of systems at the fallback centre is shown in Illustration 1-2):
→ Image: ../images/5b44925af0e0d7ac75f2ef7343d20b52e4e38a639a33653649891f208b67c9de.jpg
------------------------------------------------------------
Section 10.2.2:
10.2.2 Airfield Department is responsible for the management, system provision and operation procedure of the fallback centre. Latest update of the equipment list and redundancy level shall be referred to Airfield Department. The equipment list at the fallback centre are as follow (the locations of systems at the fallback centre is shown in Illustration 1-2):  
![](../images/5b44925af0e0d7ac75f2ef7343d20b52e4e38a639a33653649891f208b67c9de.jpg)
→ Image: 

In [44]:
from langchain_ollama import OllamaLLM
import base64
import os


# 加载图片并转为 base64，保持原始格式
def encode_image_base64(image_path: str) -> str:
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"图片不存在: {image_path}")
    
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


llm = OllamaLLM(model="llama3.2-vision:11b")


# llm_with_image_context = llm.bind(images=[image_base64])
# response = llm_with_image_context.invoke(f"summarize the image detailly. Images:{image_base64}")
# print(response)


In [51]:
md_content = result[0].page_content

results = extract_sections_with_multiple_images(md_content)

for item in results:
    image_path = os.path.join("data/extracted/IAC/images", os.path.basename(item['image_path']))
    image_base64 = encode_image_base64(image_path)
    context = item['content']

    messages=[
        {
            'role': 'user',
            'content': f'Describe the image in detail. Here are some related infos: {context}',
            'images': [image_base64]
        }
    ]

    print(messages)

    response = llm.invoke(messages)
    print("="*60)
    print(context)
    print(response)

[{'role': 'user', 'content': 'Describe the image in detail. Here are some related infos: 10.2.2 Airfield Department is responsible for the management, system provision and operation procedure of the fallback centre. Latest update of the equipment list and redundancy level shall be referred to Airfield Department. The equipment list at the fallback centre are as follow (the locations of systems at the fallback centre is shown in Illustration 1-2):', 'images': ['/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoKCgoKBggLDAsKDAkKCgr/2wBDAQICAgICAgUDAwUKBwYHCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgr/wgARCARBBGsDAREAAhEBAxEB/8QAHQABAQADAQEBAQEAAAAAAAAAAAcGCAUEAwECCf/EABQBAQAAAAAAAAAAAAAAAAAAAAD/2gAMAwEAAhADEAAAAd/gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARM9wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMFM6AAAAAAAAAAAAAAAAAAAAAAABhpsgR4sIAAAAAAAAAAAAB8z/L0q5ugcE6p1jwGv5aDrHzPEfEzEwwycxgyE1e

In [47]:
import os
path = "../images/5b44925af0e0d7ac75f2ef7343d20b52e4e38a639a33653649891f208b67c9de.jpg"
path2 = os.path.basename(path)
print(path2)

5b44925af0e0d7ac75f2ef7343d20b52e4e38a639a33653649891f208b67c9de.jpg
