### 0.　準備

In [1]:
import pandas as pd

product_us = pd.read_csv('../data/product_us.csv')

In [2]:
invert_index = {}

for index, row in product_us.iterrows():
    texts = row['product_title'].split()

    for text in texts:
        # 単語がまだインデックスに存在しない場合、空のセットを作成します
        if text not in invert_index:
            invert_index[text] = {}

        # 単語が特定の商品の出現回数を記録する辞書がまだ存在しない場合、新しい辞書を作成します
        if row['product_id'] not in invert_index[text]:
            invert_index[text][row['product_id']] = 0

        # 出現回数をインクリメントします
        invert_index[text][row['product_id']] += 1


In [3]:
import json

with open("../data/invert_index/invert_index_wordcount.json", "w") as f:
    json.dump(invert_index, f)

In [4]:
import json
with open("../data/invert_index/invert_index_wordcount.json", "r") as f:
    load_invert_index = json.load(f)

In [5]:
invert_index = {}

for key in load_invert_index.keys():
    invert_index[key] = {x[0] : x[1] for x in sorted(load_invert_index[key].items())}

### 1.　優先度つきキュー

参考URL
https://prd-xxx.hateblo.jp/entry/2019/06/24/235844

In [6]:
import heapq
class Heapq:
    def __init__(self, arr, desc=False): # 大きい順 : desc=True, 小さい順 : desc=False
        if desc:
            arr=[-a for a in arr]
        self.sign = -1 if desc else 1
        self.hq = arr
        heapq.heapify(self.hq)

    def pop(self):
        return heapq.heappop(self.hq) * self.sign
    
    def push(self, a):
        heapq.heappush(self.hq, a*self.sign)

    def top(self):
        return self.hq[0] * self.sign

In [7]:
size10 = Heapq([3, 5, 1, 8, 9, 2, 4, 6, 7, 0], False)

In [8]:
for i in range(10):
    print(size10.pop())

0
1
2
3
4
5
6
7
8
9


### 2.　イテレータ

In [9]:
class Wordcount:
    def __init__(self, postinglist1, postinglist2):
        self.postinglist1 = postinglist1
        self.postinglist2 = postinglist2

        self.id1 = self.postinglist1.keys() #　ポスティングリストのキーの一覧リスト
        self.id2 = self.postinglist2.keys()

        self.or_id = iter(sorted(set(self.id1) | set(self.id2))) # 少なくとも片方に含まれる製品のproduct_id

    def __iter__(self):
        return self
    
    def __next__(self):
        try:
            product_id = next(self.or_id)
            word_count1 = self.postinglist1.get(product_id, 0)
            word_count2 = self.postinglist2.get(product_id, 0)
            return product_id, word_count1, word_count2
        except StopIteration:
            raise StopIteration()

    

In [10]:
iterator = Wordcount(invert_index["science"], invert_index["information"])

for product_id, count1, count2 in iterator:
    print("Product ID:", product_id, "count1", count1, "count2", count2)

Product ID: 0692901159 count1 1 count2 0
Product ID: 0982616309 count1 1 count2 0
Product ID: 0996832203 count1 1 count2 0
Product ID: 1544217560 count1 0 count2 1
Product ID: 1563924269 count1 0 count2 1
Product ID: 1563928124 count1 0 count2 1
Product ID: 1620920107 count1 0 count2 1
Product ID: 1620920743 count1 0 count2 1
Product ID: 1620923505 count1 0 count2 1
Product ID: 1620923890 count1 0 count2 1
Product ID: 1703094875 count1 0 count2 1
Product ID: 8934974427 count1 1 count2 0
Product ID: B004YQCGLW count1 1 count2 0
Product ID: B01HF3BJPY count1 1 count2 0
Product ID: B01N4PEA3C count1 1 count2 0
Product ID: B06XQJ4Q9G count1 1 count2 0
Product ID: B072QBG11J count1 1 count2 0
Product ID: B07818FJNP count1 1 count2 0
Product ID: B07DTXM8L4 count1 1 count2 0
Product ID: B07KV2P1HV count1 0 count2 1
Product ID: B07WD1L8FS count1 1 count2 0
Product ID: B08777HHVZ count1 1 count2 0
Product ID: B088QN1MWZ count1 0 count2 1
Product ID: B0897WT9N9 count1 0 count2 1
Product ID: B08R

### 3. TF

In [11]:
class WordcountSum:
    def __init__(self, postinglist1, postinglist2):
        self.postinglist1 = postinglist1
        self.postinglist2 = postinglist2

        self.id1 = self.postinglist1.keys() #　ポスティングリストのキーの一覧リスト
        self.id2 = self.postinglist2.keys()

        self.or_id = iter(sorted(set(self.id1) | set(self.id2))) # 少なくとも片方に含まれる製品のproduct_id

    def __iter__(self):
        return self
    
    def __next__(self):
        try:
            product_id = next(self.or_id)
            word_count1 = self.postinglist1.get(product_id, 0)
            word_count2 = self.postinglist2.get(product_id, 0)
            word_count = word_count1 + word_count2 # ここが合計値となっている
            return product_id, word_count
        
        except StopIteration:
            raise StopIteration()

    

In [12]:
iterator = WordcountSum(invert_index["HDMI"], invert_index["Cable"])

hq = [] 
for product_id, tf_sum in iterator:
    heapq.heappush(hq, (-tf_sum, product_id)) # heapq.heappush(ヒープ, (優先度(キー), 実際のデータ))　タプル形式で解く

# while hq:
#     tf_sum, product_id = heapq.heappop(hq)
#     print("Product ID:", product_id, "| Product Title:", product_us[product_us['product_id'] == product_id]['product_title'], "| TF Sum:", -tf_sum)

for i in range(20):
    tf_sum, product_id = heapq.heappop(hq)
    print("Product ID:", product_id)
    print("Product Title:", product_us[product_us['product_id'] == product_id]['product_title'])
    print("TF Sum:", -tf_sum)
    print("---------")

Product ID: B004EFLOFM
Product Title: 132339    Coax Cable - Coaxial Cable Connector - 6ft Ant...
Name: product_title, dtype: object
TF Sum: 7
---------
Product ID: B008APMTEW
Product Title: 132260    Coax Cable Connector - Coaxial Cable Connector...
Name: product_title, dtype: object
TF Sum: 7
---------
Product ID: B07RKN7BVS
Product Title: 941181    HDMI Fiber Cable Slim,100FT Light Speed HDMI 2...
Name: product_title, dtype: object
TF Sum: 7
---------
Product ID: B088BLPQSD
Product Title: 1054798    Duttek Mini HDMI to HDMI Cable, HDMI to Mini H...
Name: product_title, dtype: object
TF Sum: 7
---------
Product ID: B08F9V3XBJ
Product Title: 1054827    HDMI to VGA HDMI Adapter, Onten HDMI Splitter ...
Name: product_title, dtype: object
TF Sum: 7
---------
Product ID: B0160BHPG4
Product Title: 132835    4K HDMI Cable -KAYO High Speed HDMI 2.0b Cable...
Name: product_title, dtype: object
TF Sum: 6
---------
Product ID: B06XT2JS1G
Product Title: 492316    Cerrxian 0.5m High Speed HDMI 2.

### 4. TFIDF

pip install rank_bm25

In [13]:
from rank_bm25 import BM25Okapi

def caluculate_idf(product_us):
    titles = []
    for index, row in product_us.iterrows():
        texts = row['product_title'].split()
        titles.append(texts)
    bm25 = BM25Okapi(titles)
    return bm25

bm25 = caluculate_idf(product_us)

In [14]:
import heapq
wordcount = Wordcount(invert_index["HDMI"], invert_index["Cable"])

hq = [] 
for product_id, hdmi_tf, cable_tf in wordcount:

    tfidf = hdmi_tf * bm25.idf["HDMI"] + cable_tf * bm25.idf["Cable"]

    heapq.heappush(hq, (-tfidf, product_id)) # heapq.heappush(ヒープ, (優先度(キー), 実際のデータ))　タプル形式で解く

for i in range(20):
    tf_idf, product_id = heapq.heappop(hq)
    print("Product ID:", product_id)
    print("Product Title:", product_us[product_us['product_id'] == product_id]['product_title'])
    print("TF Sum:", -tf_idf)
    print("---------")

Product ID: B08F9V3XBJ
Product Title: 1054827    HDMI to VGA HDMI Adapter, Onten HDMI Splitter ...
Name: product_title, dtype: object
TF Sum: 42.76059428009977
---------
Product ID: B088BLPQSD
Product Title: 1054798    Duttek Mini HDMI to HDMI Cable, HDMI to Mini H...
Name: product_title, dtype: object
TF Sum: 41.31667759623826
---------
Product ID: B07RKN7BVS
Product Title: 941181    HDMI Fiber Cable Slim,100FT Light Speed HDMI 2...
Name: product_title, dtype: object
TF Sum: 38.428844228515246
---------
Product ID: B07FQH4Z1C
Product Title: 837176    4K HDMI Cable,Capshi 15FT HDMI Cord High Speed...
Name: product_title, dtype: object
TF Sum: 35.208021270509725
---------
Product ID: B07S91FT8R
Product Title: 1131112    4K HDMI Cable,Capshi 20FT HDMI Cord High Speed...
Name: product_title, dtype: object
TF Sum: 35.208021270509725
---------
Product ID: B06XT2JS1G
Product Title: 492316    Cerrxian 0.5m High Speed HDMI 2.0 HDMI Left An...
Name: product_title, dtype: object
TF Sum: 33.76410

### 5. フィールド長

In [15]:
field = {}

for index, row in product_us.iterrows():
    texts = row['product_title'].split()
    field[row['product_id']] = len(texts)
    


In [16]:
# フィールド長の平均

sum = 0
for key, value in field.items():
    sum += field[key]

print(f'フィール長の平均:{sum / len(field)}')

フィール長の平均:16.63483773545179


### 6. BM25

In [17]:
import heapq
query = "Cable HDMI"
bm25_score = bm25.get_scores(query)

hq = []
for score, product_id in zip(bm25_score, product_us['product_id']):
    heapq.heappush(hq, (-score, product_id))

In [18]:
for i in range(20):
    score, product_id = heapq.heappop(hq)
    print("Product ID:", product_id)
    print("Product Title:", product_us[product_us['product_id'] == product_id]['product_title'])
    print("BM25:", -score)
    print("---------")

Product ID: B01E7KBXWC
Product Title: 983981    C h a n e l no.5 EDP Spray for women 3.4 OZ/10...
Name: product_title, dtype: object
BM25: 30.323506523674666
---------
Product ID: B07C5GFF4C
Product Title: 72490    G r e a t e s t H i t s
Name: product_title, dtype: object
BM25: 27.262040577740066
---------
Product ID: B07WSF4KJ9
Product Title: 1073398    B e a u t y . B e h i n d . M a d n e s s
Name: product_title, dtype: object
BM25: 24.17180805278996
---------
Product ID: B08F2LZH1G
Product Title: 921400    M e g a h i t s
Name: product_title, dtype: object
BM25: 23.365999905955675
---------
Product ID: B01N6WS03S
Product Title: 41015    StreetCalledDesire & M o r e
Name: product_title, dtype: object
BM25: 18.904161811366524
---------
Product ID: B082LVV8FY
Product Title: 589379    Vine Vera | Resveratrol l Vitamin C Moisturizer
Name: product_title, dtype: object
BM25: 18.721562732252508
---------
Product ID: B00LVMMAY0
Product Title: 248285    Baby Trend Hybrid Booster 3 in 1 Car 