In [1]:
import numpy as np
from myTFIDFModel import MyTFIDFModel
from myChromaDB import MyChromaDB
from myProcesser import Processer

def hybridSearch(quary: str, db: MyChromaDB, tfidf: MyTFIDFModel, chroma_weight: float = 0.4, tfidf_weight: float = 0.6, top_n: int = 10):
    dbResults = db.search(quary, top_k=top_n)
    tfidfResults = tfidf.search(quary, top_k=top_n)

    # dbScores = {
    #     i['fileName']: i['score']
    #     for i in dbResults
    # }
    dbScores = dict()
    for i in dbResults:
        while i['score'] > 1:
            i['score'] /= 10
        dbScores[i['fileName']] = i['score']
    tfidfScores = {
        i['file_name']: i['cosine_similarity']
        for i in tfidfResults
    }

    combinedScores = {}
    for file in set(dbScores.keys()).union(tfidfScores.keys()):
        dbScore = dbScores.get(file, 0)
        tfidfScore = tfidfScores.get(file, 0)
        combinedScores[file] = dbScore * chroma_weight + tfidfScore * tfidf_weight

    sorted_result = sorted(combinedScores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    print('dbScores:')
    for i in dbScores.keys():
        print(f'{i}: {dbScores[i]}')
    print()
    print('tfidfScores:')
    for i in tfidfScores.keys():
        print(f'{i}: {tfidfScores[i]}')
    print()

    print('combinedScores:')
    for i in sorted_result:
        print(f'{i[0]}, {i[1]}')
    # return sorted_result


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
db = MyChromaDB()
tfidf = MyTFIDFModel()
p = Processer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
folder = '..\\product infomation'

import os
files = [f for f in os.listdir(folder) if f.endswith('.pdf')]

for pdf_file in files:
    text = p.extractTextFromPdf(os.path.join(folder, pdf_file))
    tokenizedText = p.ckip_tokenize(text)
    db.addPDF(os.path.join(folder, pdf_file), tokenizedText)
    tfidf.addPDF(os.path.join(folder, pdf_file), tokenizedText)

FileNotFoundError: [Errno 2] No such file or directory: '商品名稱DE-291-1 DE-293 工作桌 店家名稱家昀國際暢貨中心\n商品售價1,299 店家電話04-25651960\n店家地址台中市大雅區龍善二街22號\n商品描述\n商品資料表\n商品型號：I-R-DE291-1胡桃色/I-R-DE293胡桃搭黑色\n商品尺寸：W120×D60×H121.5CM\n商品顏色：胡桃木色/胡桃搭黑色\n商品材質：厚1.5cm塑合板貼美耐皿+2*2cm烤漆鐵管\n組裝方式：需要自行組裝\n商品特色：\n加大桌面120公分寬60公分深.比市面120x48cm縮小版,作業更順手 .\n書桌搭配書櫃的貼心設計   \n簡約俐落設計，時尚具質感  \n可當書桌、辦公桌、電腦桌 \n美耐皿板，防水、防刮、耐磨，好整理,非市售貼紙/貼PVC不耐刮,易吸水膨脹,發霉.\n配送說明：商品皆配送至同平面1樓\n組裝方式：需DIY組裝，請按照說明書上定期檢查及維護。'

In [None]:
qaury = '推理小說'

hybridSearch(qaury, db, tfidf)

AttributeError: 'super' object has no attribute 'ckip_tokenize'

In [None]:
qaury = '辦公桌'

hybridSearch(qaury, db, tfidf)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 663.76it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 86.84it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 79.44it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 58.64it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 1001.51it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 76.81it/s]

dbScores:
DE-291-1 DE-293 工作桌.pdf: -12.76992137261561
W202 人體工學椅.pdf: -21.26057318224408
羅技 Logitech H340 USB耳機麥克風.pdf: -21.392244867154474
[折疊收納]懶人折疊桌.pdf: -22.24353385018338
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf: -23.69264679550995
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf: -24.02208945994217

tfidfScores:

combinedScores:
DE-291-1 DE-293 工作桌.pdf, -5.1079685490462445
W202 人體工學椅.pdf, -8.504229272897632
羅技 Logitech H340 USB耳機麥克風.pdf, -8.55689794686179
[折疊收納]懶人折疊桌.pdf, -8.897413540073352
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf, -9.477058718203981
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf, -9.608835783976868





In [None]:
qaury = '麥克風'

hybridSearch(qaury, db, tfidf)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 64.47it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 62.37it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 70.12it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 78.68it/s]

dbScores:
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf: -10.976673072350891
羅技 Logitech H340 USB耳機麥克風.pdf: -11.5916784801745
DE-291-1 DE-293 工作桌.pdf: -12.203403076875647
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf: -12.222576283714577
[折疊收納]懶人折疊桌.pdf: -14.045641763395933
W202 人體工學椅.pdf: -15.46428643704579

tfidfScores:

combinedScores:
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf, -4.390669228940356
羅技 Logitech H340 USB耳機麥克風.pdf, -4.6366713920697995
DE-291-1 DE-293 工作桌.pdf, -4.881361230750259
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf, -4.889030513485832
[折疊收納]懶人折疊桌.pdf, -5.618256705358373
W202 人體工學椅.pdf, -6.185714574818316





In [None]:
qaury = '耳機'

hybridSearch(qaury, db, tfidf)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 1001.51it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 55.43it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 83.27it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 79.82it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 62.08it/s]

dbScores:
羅技 Logitech H340 USB耳機麥克風.pdf: -9.511294550029747
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf: -11.040900747954042
DE-291-1 DE-293 工作桌.pdf: -20.46354480934308
[折疊收納]懶人折疊桌.pdf: -21.06903613347773
W202 人體工學椅.pdf: -21.32928564074964
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf: -23.789294481988247

tfidfScores:

combinedScores:
羅技 Logitech H340 USB耳機麥克風.pdf, -3.804517820011899
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf, -4.416360299181617
DE-291-1 DE-293 工作桌.pdf, -8.185417923737232
[折疊收納]懶人折疊桌.pdf, -8.427614453391092
W202 人體工學椅.pdf, -8.531714256299855
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf, -9.515717792795298





In [None]:
qaury = '最近肩頸痠痛，想買一張舒適的椅子'

hybridSearch(qaury, db, tfidf)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 1017.79it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 68.34it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 73.65it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 86.82it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 53.77it/s]

dbScores:
W202 人體工學椅.pdf: -14.415148405822402
羅技 Logitech H340 USB耳機麥克風.pdf: -16.05920319567249
DE-291-1 DE-293 工作桌.pdf: -17.852934251779928
[折疊收納]懶人折疊桌.pdf: -18.216598281248316
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf: -18.54358414769552
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf: -19.367739873899385

tfidfScores:

combinedScores:
W202 人體工學椅.pdf, -5.766059362328961
羅技 Logitech H340 USB耳機麥克風.pdf, -6.423681278268997
DE-291-1 DE-293 工作桌.pdf, -7.141173700711971
[折疊收納]懶人折疊桌.pdf, -7.286639312499327
SADES DIABLO 暗黑鬥狼RGB REALTEK 電競耳麥 7.1 (USB) SA-916.pdf, -7.417433659078208
世界上最透明的故事（日本出版界話題作，只有紙本書可以體驗的感動）.pdf, -7.747095949559754



