# Some functional components

## Turn a Markdown file (text and images) into a vector, using Base64 to encode the images

In [1]:
# Absolute path to a Japanese-language markdown page (confusion-matrix
# terminology entry); presumably the sample input for the cells below.
# NOTE(review): none of the cells visible here read md_path -- confirm where it is used.
md_path = "/home/Preda/user/Sony_InternProj/content/terminology/confusion_matrix/_index.jp.md"

In [None]:
import re
import base64
import os

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as Base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The Base64 encoding of the file contents, as a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def process_markdown(markdown_text, base_dir=None):
    """Inline local images as Base64 data URIs and strip site shortcodes.

    Three rewrites are applied, in this order:

    1. ``![](../img/<file>)`` image tags become
       ``![Image](data:<mime>;base64,<payload>)``.
    2. ``{{% a_in "<url>" "<label>" %}}`` shortcodes are reduced to ``<label>``.
    3. Everything from ``{{% div_relitem contents-bottom %}}`` through the next
       ``{{% /div_relitem %}}`` (inclusive, across lines) is deleted.

    Args:
        markdown_text: Markdown source to transform.
        base_dir: Optional directory that the ``../img/...`` paths are relative
            to (typically the folder containing the markdown file). When
            omitted, paths are opened relative to the current working
            directory, which is the previous behaviour.

    Returns:
        The transformed markdown text.

    Raises:
        OSError: If a referenced image file cannot be opened (propagated from
            ``encode_image``).
    """
    import mimetypes

    def replace_image_path(match):
        image_path = match.group(1)
        if base_dir is not None:
            image_path = os.path.join(base_dir, image_path)
        # The pattern accepts any file under ../img/, so derive the media type
        # from the extension rather than always claiming PNG; fall back to PNG
        # when the extension is unknown or not an image type.
        mime_type, _ = mimetypes.guess_type(image_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"
        base64_image = encode_image(image_path)
        return f"![Image](data:{mime_type};base64,{base64_image})"

    # Replace image tags of the form ![](../img/t_slide29.png).
    markdown_text = re.sub(r'!\[\]\((\.\./img/[^)]+)\)', replace_image_path, markdown_text)

    # Shortcodes such as
    #   {{% a_in "../../tutorial/crm_predict_unsubscribe/" "「退会予測による退会の削減」" %}}
    # keep only the second quoted string (the link label).
    markdown_text = re.sub(r'\{\{%\s*a_in\s+"[^"]+"\s*"([^"]+)"\s*%\}\}', r'\1', markdown_text)

    # Drop the block delimited by {{% div_relitem contents-bottom %}} and
    # {{% /div_relitem %}}; DOTALL lets the non-greedy match span newlines.
    markdown_text = re.sub(
        r'\{\{%\s*div_relitem\s*contents-bottom\s*%\}\}.*?\{\{%\s*/div_relitem\s*%\}\}',
        '',
        markdown_text,
        flags=re.DOTALL,
    )

    return markdown_text

if __name__ == "__main__":
    # Read the markdown source; the input file name is hard-coded to "input.md".
    with open("input.md", "r", encoding="utf-8") as f:
        markdown_content = f.read()

    # Inline the images and clean up the shortcodes.
    processed_content = process_markdown(markdown_content)

    # Write the processed markdown to "output.md".
    with open("output.md", "w", encoding="utf-8") as f:
        f.write(processed_content)

    print("Markdown文件处理完毕，结果已保存到output.md")


In [None]:
import base64
import os
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.callbacks import get_openai_callback
from langfuse.callback import CallbackHandler
import markdown
import re

# Read a markdown file and separate its image targets from its text.
def parse_markdown_with_images(markdown_path):
    """Split a markdown file into its text and the targets of its image tags.

    Args:
        markdown_path: Path of the markdown file; it is read as UTF-8.

    Returns:
        A ``(text, image_paths)`` tuple. ``text`` is the file content with
        every ``![alt](target)`` tag removed and surrounding whitespace
        stripped; ``image_paths`` lists the ``target`` strings in document
        order, exactly as written in the file.
    """
    image_tag = re.compile(r'!\[.*?\]\((.*?)\)')

    with open(markdown_path, "r", encoding="utf-8") as source:
        raw = source.read()

    targets = image_tag.findall(raw)
    # Remove the image tags so that only the prose remains.
    prose = image_tag.sub('', raw).strip()
    return prose, targets

# Encode the file at a given path as Base64 text.
def encode_image(image_path):
    """Return the Base64 encoding (as ``str``) of the file at ``image_path``."""
    with open(image_path, "rb") as stream:
        payload = base64.b64encode(stream.read())
    return str(payload, "utf-8")

# Build the chat-model input that combines the text with the images.
def create_prompt_with_image_and_text(text_content, image_paths):
    """Build a chat prompt holding the markdown text plus its images.

    The human message carries one ``{"type": "text"}`` part followed by one
    ``{"type": "image_url"}`` part per image, each image embedded as a Base64
    data URL. A fixed system message precedes it.

    Args:
        text_content: Text placed in the human message.
        image_paths: Iterable of image file paths; each one is read and
            encoded through ``encode_image``.

    Returns:
        A ``ChatPromptTemplate`` built from the system and human messages.

    Raises:
        OSError: If an image file cannot be opened (propagated from
            ``encode_image``).
    """
    import mimetypes

    def to_image_part(image_path):
        # Declare the media type that matches the file extension instead of
        # labelling every image as PNG; fall back to PNG when the extension is
        # unknown or does not map to an image type.
        mime_type, _ = mimetypes.guess_type(image_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"
        payload = encode_image(image_path)
        return {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{payload}"},
        }

    human_parts = [{"type": "text", "text": text_content}]
    human_parts.extend(to_image_part(path) for path in image_paths)

    system_prompt = SystemMessage(content="Analyze the provided content and images.")
    human_prompt = HumanMessage(content=human_parts)
    return ChatPromptTemplate.from_messages([system_prompt, human_prompt])

# Run the markdown file (text + images) through the chat model.
def generate_embedding_from_markdown(markdown_path):
    """Send a markdown file's text and images to the chat model.

    NOTE(review): despite the name, this returns the string produced by the
    ``prompt | llm | StrOutputParser()`` chain, not an embedding vector.

    Args:
        markdown_path: Path of the markdown file to process.

    Returns:
        The chain's string output; it is also printed before returning.
    """
    body_text, picture_paths = parse_markdown_with_images(markdown_path)

    # get_gpt4o_chat_llm is not defined in this cell; it is expected to be
    # provided elsewhere and to return the chat model.
    pipeline = (
        create_prompt_with_image_and_text(body_text, picture_paths)
        | get_gpt4o_chat_llm()
        | StrOutputParser()
    )

    with get_openai_callback() as _cb:
        run_config = {"callbacks": [CallbackHandler(user_id=get_user_id())]}
        # The prompt is built from fixed messages, so an empty input is passed.
        result = pipeline.invoke({}, run_config)
        print(result)  # Show the model output.
        return result

if __name__ == "__main__":
    # Looks like a placeholder path -- point it at a real markdown file before running.
    markdown_file_path = "./your_markdown_file.md"
    # NOTE(review): despite the variable name, the value returned here is the
    # string output of the chat chain (prompt | llm | StrOutputParser), not an
    # embedding vector.
    embedding = generate_embedding_from_markdown(markdown_file_path)
    print("Generated embedding:", embedding)
