In [1]:
import os
import pypdf
from ckip_transformers.nlp import CkipWordSegmenter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ws_driver = CkipWordSegmenter(device=0)

In [3]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() or ""
    return text

In [4]:
# 進行 CKIP 斷詞
def ckip_tokenize(text):
    ws = ws_driver([text])[0]
    return " ".join(ws)

In [5]:
pdf_directory = "..\\product infomation"
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]

documents = []
file_names = []

for pdf_file in pdf_files:
    path = os.path.join(pdf_directory, pdf_file)
    text = extract_text_from_pdf(path)
    tokenized_text = ckip_tokenize(text)
    documents.append(tokenized_text)
    file_names.append(pdf_file)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 999.36it/s]
Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Inference: 100%|██████████| 1/1 [00:00<00:00,  4.47it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 994.85it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.98it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 965.76it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 20.54it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 31.04it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 216.45it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 893.17it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  8.21it/s]


In [6]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

In [7]:
# 搜尋並檢索
def search_documents(query, top_n=5):
    # 查詢語句也使用 CKIP 斷詞
    query_tokens = ckip_tokenize(query)
    query_vec = vectorizer.transform([query_tokens])
    
    # 計算餘弦相似度
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # 排序並取得前 N 筆
    top_indices = cosine_sim.argsort()[-top_n:][::-1]
    results = []
    for idx in top_indices:
        results.append({
            "file_name": file_names[idx],
            "similarity": cosine_sim[idx]
        })
    return results

In [8]:
query = "耳麥"
results = search_documents(query)

Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 66.20it/s]


In [None]:
print("\n檢索結果:")
for res in results:
    print(f"檔名: {res['file_name']}, 相似度: {res['similarity']:.4f}")


檢索結果:
檔名: SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf, 相似度: 0.0673
檔名: 羅技 Logitech H340 USB耳機麥克風.pdf, 相似度: 0.0000
檔名: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf, 相似度: 0.0000
檔名: [折疊收納]懶人折疊桌.pdf, 相似度: 0.0000
檔名: W202 人體工學椅.pdf, 相似度: 0.0000


In [11]:
query = "辦公桌"
results = search_documents(query)

print("\n檢索結果:")
for res in results:
    print(f"檔名: {res['file_name']}, 相似度: {res['similarity']:.4f}")

Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 21.94it/s]


檢索結果:
檔名: DE-291-1 DE-293 工作桌.pdf, 相似度: 0.0864
檔名: 羅技 Logitech H340 USB耳機麥克風.pdf, 相似度: 0.0000
檔名: 世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf, 相似度: 0.0000
檔名: [折疊收納]懶人折疊桌.pdf, 相似度: 0.0000
檔名: W202 人體工學椅.pdf, 相似度: 0.0000



