In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [None]:
# Split on first- and second-level Markdown headings; the heading text is
# stored in chunk metadata under "Header1" / "Header2" and, because
# strip_headers is False, is also kept in the chunk body.
header_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Header1"),
        ("##", "Header2"),
    ],
    strip_headers=False,
)

In [None]:
# Load the whole cleaned manual into memory as UTF-8 text.
with open(
    "data/extracted/IAC/cleaned_markdown/IAC Manual v9.md",
    mode="r",
    encoding="utf-8",
) as manual_file:
    content = manual_file.read()

In [None]:
# One chunk per heading-delimited section of the manual.
splitted_markdown = header_splitter.split_text(text=content)

In [None]:
# Stamp every chunk with the source document title and the ingestion date.
for doc in splitted_markdown:
    doc.metadata.update({"Title": "IAC Manual v9", "added_date": "2024/04/30"})

In [None]:
# Peek at the second-level heading of the first chunk. A chunk that precedes
# any "##" heading has no "Header2" key, so use .get() (as the later lookup by
# heading does) instead of failing with KeyError.
splitted_markdown[0].metadata.get("Header2")

In [None]:
from langchain_ollama import OllamaEmbeddings
# Embedding model used for both indexing and querying the vector store.
embeddings = OllamaEmbeddings(
    model="bge-m3",
)

In [None]:
from langchain_chroma import Chroma
# Chroma collection persisted on disk under data/chroma_langchain_db.
vector_store = Chroma(
    collection_name="manuals_collection",
    embedding_function=embeddings,
    persist_directory="data/chroma_langchain_db",
)

In [None]:
from uuid import NAMESPACE_URL, uuid4, uuid5
from langchain_core.documents import Document

# Derive each id deterministically from the chunk's title, position and text.
# Random uuid4 ids made this cell non-idempotent: every re-run appended a
# second copy of all chunks to the persisted collection. With stable ids a
# re-run over unchanged content writes to the same records instead.
# The position is part of the key so two chunks with identical text still get
# distinct ids within one batch.
uuids = [
    str(uuid5(NAMESPACE_URL, f"{doc.metadata.get('Title', '')}/{position}/{doc.page_content}"))
    for position, doc in enumerate(splitted_markdown)
]

vector_store.add_documents(documents=splitted_markdown, ids=uuids)

In [None]:
# Smoke-test retrieval: fetch the single closest chunk and show its id.
results = vector_store.similarity_search("What are the rules for using telephones?", k=1)

for res in results:
    print(f"id: {res.id}")

In [None]:
import re

def extract_sections_with_multiple_images(md_text):
    """Pair every image in a Markdown text with the numbered section it sits in.

    A section starts at a line that begins with a dotted number of at least
    three components (e.g. ``1.2.3 Title``) and runs up to the next such line
    or the end of the text. Text before the first numbered heading is not
    scanned, so images located there are not reported.

    Args:
        md_text: Markdown source to scan.

    Returns:
        A list with one dict per image, in document order, with the keys:
          - ``section_number``: the dotted number of the enclosing section.
          - ``image_path``: the target inside the image's parentheses.
          - ``content``: the section text from its heading up to (not
            including) this image, stripped of surrounding whitespace.
            Earlier images of the same section are part of this text.
    """
    # Section heading: a line starting with a number such as "1.2.3".
    section_pattern = re.compile(r'^(\d+(?:\.\d+){2,})\s+.*$', re.MULTILINE)
    # Markdown image. The alt text is optional, so both "![](p)" and
    # "![alt](p)" are recognised (previously only the empty-alt form was).
    image_pattern = re.compile(r'!\[[^\]]*\]\((.*?)\)')

    # (number, start offset) of every section heading, in document order.
    headings = [(m.group(1), m.start()) for m in section_pattern.finditer(md_text)]
    # Each section ends where the next one starts; the last ends at EOF.
    section_ends = [start for _, start in headings[1:]] + [len(md_text)]

    results = []
    for (section_number, section_start), section_end in zip(headings, section_ends):
        section_text = md_text[section_start:section_end]
        for match in image_pattern.finditer(section_text):
            results.append({
                "section_number": section_number,
                "image_path": match.group(1),
                # Everything in this section that precedes the image.
                "content": section_text[:match.start()].strip(),
            })

    return results

In [None]:
# Keep only the chunk(s) whose second-level heading is the ACC fallback section.
result = list(
    filter(
        lambda doc: doc.metadata.get("Header2") == "10.2 Fallback procedure for ACC",
        splitted_markdown,
    )
)
print(result)

In [None]:
md_content = result[0].page_content

# List every image of the selected chunk with the text that precedes it.
results = extract_sections_with_multiple_images(md_content)

for item in results:
    print(
        f"Section {item['section_number']}:",
        item['content'],
        f"→ Image: {item['image_path']}",
        '-' * 60,
        sep="\n",
    )

In [None]:
from langchain_ollama import OllamaLLM
import base64
import os


# Load an image file and return it base64-encoded; the bytes are not re-encoded
# or converted, so the original image format is preserved.
def encode_image_base64(image_path: str) -> str:
    """Return the contents of ``image_path`` as a base64 text string.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The file's raw bytes, base64-encoded and decoded to ``str``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    if os.path.exists(image_path):
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        return str(base64.b64encode(raw_bytes), 'utf-8')

    raise FileNotFoundError(f"图片不存在: {image_path}")


# Model used to describe the manual's images, accessed through the Ollama
# completion wrapper.
llm = OllamaLLM(
    model="llama3.2-vision:11b",
)


# Example of attaching an image to a call (image_base64 is the output of
# encode_image_base64):
# llm_with_image_context = llm.bind(images=[image_base64])
# response = llm_with_image_context.invoke("Summarize the image in detail.")
# print(response)


In [None]:
md_content = result[0].page_content

results = extract_sections_with_multiple_images(md_content)

for item in results:
    # The Markdown references images by a relative path; only the file name is
    # reused and resolved against the local extracted-images directory.
    image_path = os.path.join("data/extracted/IAC/images", os.path.basename(item['image_path']))
    image_base64 = encode_image_base64(image_path)
    context = item['content']

    prompt = f'Describe the image in detail. Here are some related infos: {context}'

    # OllamaLLM is a text-completion model: a chat-style list of dicts is
    # flattened into a plain prompt string, so an 'images' entry inside the
    # message never reaches the model. The image has to be supplied as a call
    # argument instead, which bind() attaches to the invocation.
    # (The former debug print of the message was dropped because it dumped the
    # entire base64 payload into the cell output.)
    response = llm.bind(images=[image_base64]).invoke(prompt)
    print("="*60)
    print(context)
    print(response)

In [None]:
import os
# Check how a relative image reference from the Markdown reduces to a bare
# file name.
path = "../images/5b44925af0e0d7ac75f2ef7343d20b52e4e38a639a33653649891f208b67c9de.jpg"
_, path2 = os.path.split(path)
print(path2)