In [1]:
import os
import pypdf
import chromadb
from chromadb.config import Settings
from ckip_transformers.nlp import CkipWordSegmenter
from sentence_transformers import SentenceTransformer

# 初始化 ChromaDB
client = chromadb.PersistentClient(path="./chroma_db", settings=Settings(allow_reset=True))
collection = client.get_or_create_collection(name="pdf_documents")

# 初始化 CKIP 分詞器與向量模型
ws_driver = CkipWordSegmenter(device=0)
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# 📄 讀取 PDF 內容
def extract_text(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

# ✂️ 使用 CKIP 分詞
def ckip_tokenize(text):
    ws = ws_driver([text])[0]
    return " ".join(ws)

# 🔖 將文本分成 Chunk
def split_into_chunks(text, chunk_size=300):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# ➕ 新增 PDF 到 ChromaDB
def add_pdf_to_chroma(file_path):
    file_name = os.path.basename(file_path)
    text = extract_text(file_path)
    
    # 1️⃣ CKIP 分詞
    tokenized_text = ckip_tokenize(text)
    
    # 2️⃣ Chunk 分段
    chunks = split_into_chunks(tokenized_text, chunk_size=500)
    
    for idx, chunk in enumerate(chunks):
        embedding = embedding_model.encode(chunk)
        collection.add(
            documents=[chunk],
            metadatas=[{"file_name": file_name, "chunk_id": idx}],
            ids=[f"{file_name}_{idx}"]
        )
    print(f"✅ 已新增檔案: {file_name}")

# 🔍 進行檢索
def search_pdf(query, top_n=3):
    query_tokens = ckip_tokenize(query)
    query_embedding = embedding_model.encode(query_tokens)
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_n
    )
    
    # for doc, score, meta in zip(results['documents'][0], results['distances'][0], results['metadatas'][0]):
    #     print(f"檔案名稱: {meta['file_name']} | Chunk ID: {meta['chunk_id']} | 相似度: {1 - score:.4f}")
    #     print(f"內容片段: {doc}\n{'-'*50}")
    with open('a.txt', 'w', encoding='utf-8') as fp:
        for doc, score, meta in zip(results['documents'][0], results['distances'][0], results['metadatas'][0]):
            fp.write(f"檔案名稱: {meta['file_name']} | Chunk ID: {meta['chunk_id']} | 相似度: {1 - score:.4f}\n")
            fp.write(f"內容片段: {doc}\n{'-'*50}\n")

# 🗑️ 刪除 PDF 檔案
def remove_pdf_from_chroma(file_name):
    # 刪除與檔案名稱相關的所有 Chunk
    ids_to_delete = [doc['id'] for doc in collection.get(include=["metadatas"])['metadatas'] if doc["file_name"] == file_name]
    collection.delete(ids=ids_to_delete)
    client.persist()
    print(f"🗑️ 已刪除檔案: {file_name}")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_directory = "..\\product infomation"
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]

for pdf_file in pdf_files:
    add_pdf_to_chroma(os.path.join(pdf_directory, pdf_file))

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 919.40it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  5.09it/s]
Insert of existing embedding ID: DE-291-1 DE-293 工作桌.pdf_0
Add of existing embedding ID: DE-291-1 DE-293 工作桌.pdf_0
Insert of existing embedding ID: DE-291-1 DE-293 工作桌.pdf_1
Add of existing embedding ID: DE-291-1 DE-293 工作桌.pdf_1


✅ 已新增檔案: DE-291-1 DE-293 工作桌.pdf


Tokenization: 100%|██████████| 1/1 [00:00<00:00, 664.50it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  7.41it/s]
Insert of existing embedding ID: SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf_0
Add of existing embedding ID: SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf_0
Insert of existing embedding ID: SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf_1
Add of existing embedding ID: SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf_1


✅ 已新增檔案: SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf


Tokenization: 100%|██████████| 1/1 [00:00<00:00, 397.34it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 19.56it/s]
Insert of existing embedding ID: W202 人體工學椅.pdf_0
Add of existing embedding ID: W202 人體工學椅.pdf_0
Insert of existing embedding ID: W202 人體工學椅.pdf_1
Add of existing embedding ID: W202 人體工學椅.pdf_1


✅ 已新增檔案: W202 人體工學椅.pdf


Tokenization: 100%|██████████| 1/1 [00:00<00:00, 998.88it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 29.64it/s]
Insert of existing embedding ID: [折疊收納]懶人折疊桌.pdf_0
Add of existing embedding ID: [折疊收納]懶人折疊桌.pdf_0


✅ 已新增檔案: [折疊收納]懶人折疊桌.pdf


Tokenization: 100%|██████████| 1/1 [00:00<00:00, 95.09it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
Insert of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_0
Add of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_0
Insert of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_1
Add of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_1
Insert of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_2
Add of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_2
Insert of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_3
Add of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_3
Insert of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_4
Add of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_4
Insert of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_5
Add of existing embedding ID: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf_5
Insert of existing embedding ID: 世界上最

✅ 已新增檔案: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf


Tokenization: 100%|██████████| 1/1 [00:00<00:00, 333.20it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  8.09it/s]
Insert of existing embedding ID: 羅技 Logitech H340 USB耳機麥克風.pdf_0
Add of existing embedding ID: 羅技 Logitech H340 USB耳機麥克風.pdf_0
Insert of existing embedding ID: 羅技 Logitech H340 USB耳機麥克風.pdf_1
Add of existing embedding ID: 羅技 Logitech H340 USB耳機麥克風.pdf_1


✅ 已新增檔案: 羅技 Logitech H340 USB耳機麥克風.pdf


In [3]:
# search_pdf("推理小說")

In [6]:
search_pdf("耳機麥克風", top_n=5)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 989.22it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 21.26it/s]
