### 0. 準備

In [3]:
import tensorflow_hub as hub

embed = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2")
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

print(embeddings)



2024-04-27 12:49:00.302911: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-27 12:49:04.273295: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


tf.Tensor(
[[-0.03133017 -0.06338634 -0.01607501 ... -0.03242778 -0.0457574
   0.05370456]
 [ 0.0508086  -0.01652434  0.01573779 ...  0.00976657  0.03170121
   0.01788118]], shape=(2, 512), dtype=float32)


### 1. ベクトル化

In [4]:
import pandas as pd
import numpy as np
us_con = pd.read_csv('../data/us_con.csv')
us_con = us_con.sort_values(by=['query_id', 'product_id'], ascending=False)

In [3]:
query = us_con['query'].tolist()[:1000]
query_embed = embed(query)

### 2. ドット積

In [4]:
title = us_con['product_title'].tolist()[:1000]
title_embed = embed(title)

In [34]:
def cal_dot(title, query):
    return np.dot(title, query)

In [60]:
# クエリ番号 0
result = []
for titled in title_embed:
    result.append(cal_dot(np.array(query_embed[0]), np.array(titled)))
    

sorted_result = sorted(((value, index) for index, value in enumerate(result)), reverse=True)[:10]

In [61]:
for _, i in sorted_result:
    print(_,title[i])

0.48800233 Zemfira
0.45124966 谷胱甘肽生产技术
0.40224433 Z-Sides
0.39796084 Шахтау
0.39408344 功能肽的加工技术与活性评价/现代食品深加工技术丛书
0.36847132 C型钠肽对牛卵母细胞体外成熟和发育能力的影响研究
0.35382083 Кроссовки
0.35382083 Кроссовки
0.35382083 Кроссовки
0.35382083 Кроссовки


### 3. ANNライブラリ

In [43]:
import faiss

index = faiss.IndexFlatIP(title_embed.shape[1])
print(index.is_trained)

True


In [44]:
# インデックスにデータを追加
index.add(np.array(title_embed).astype('float32'))
print(index.ntotal)

1000


In [45]:
# 近傍探索の実行
D, I = index.search(np.array(query_embed).astype('float32'), 1)

# 確認
print(D) # クエリベクトルとその近傍点の距離

[[0.48800233]
 [0.48800233]
 [0.48800233]
 [0.48800233]
 [0.48800233]
 [0.48800233]
 [0.48800233]
 [0.48800233]
 [0.48800233]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.54635334]
 [0.55567753]
 [0.55567753]
 [0.55567753]
 [0.55567753]
 [0.55567753]
 [0.55567753]
 [0.55567753]
 [0.55567753]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.5171559 ]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.48468792]
 [0.52825296]
 [0.52825296]
 [0.52825296]
 [0.52825296]
 [0.52825296]
 [0.52

In [60]:
print(I) #クエリに対する最近傍点のインデックス

[[ 279]
 [ 279]
 [ 279]
 [ 279]
 [ 279]
 [ 279]
 [ 279]
 [ 279]
 [ 279]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 248]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 250]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 268]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [ 264]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  91]
 [  87]
 [  87]
 [  87]
 [  87]
 [  87]
 [  87]
 [  87]
 [  87]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]
 [ 265]


### 4.　代表点

In [46]:
sampling = np.random.choice(1000, size=int(np.sqrt(1000)), replace=False) # replaceで重複削除

In [47]:
K_100  = np.array(title_embed)

represent_point = K_100[sampling]
represent_id = np.arange(int(np.sqrt(1000)))

In [48]:
index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(represent_point)

In [50]:
print("サンプル:", sampling)
print("代表点ベクトルの形状:", represent_point.shape)
print("代表点の通し番号:", represent_id)

サンプル: [774 246 939 812 663 627 389 503 778  26 222 455 646 282 461 505 402 288
 770 603 810 544  14 679  11 633 201 534 918 622  60]
代表点ベクトルの形状: (31, 512)
代表点の通し番号: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]


In [51]:
# [774 246 939 812 663 627 389 503 778  26 222 455 646 282 461 505 402 288 770 603 810 544  14 679  11 633 201 534 918 622  60] //// サンプリングしたインデックス

# [ 0   1   2   3   4   5   6   7   8   9  10  11   12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30] //// 通し番号

### 5. インデックス時の引き当て

In [66]:
number = []
for i in I:
    number.append(i[0])

### 6. クエリ時の引き当て


In [147]:
sampling = np.random.choice(1000, size=int(np.sqrt(1000)), replace=False) # サンプリング

K_1000  = np.array(title_embed)
share_vec = K_1000[sampling] #　サンプリングした個数だけ、titleベクトルを受け取る
represent_id = np.arange(int(np.sqrt(1000)))

index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(share_vec)


In [148]:
D, I = index.search(np.array(query_embed).astype('float32'), k=10)

In [152]:
#クエリ番号 0
for i in I[0]:
    print(title[i]) 

Osmo - Little Genius Starter Kit for Fire Tablet + Early Math Adventure-6 Educational Games-Ages 3-5-Counting, Shapes & Phonics-Christmas Toys-STEM Toy(Osmo Fire Tablet Base Included-Amazon Exclusive)
MIZUNO RB 566V GOLF BALL, White
Titleist Pro V1 Golf Balls, White, High Play Numbers (5-8), One Dozen
2021 Callaway Supersoft Golf Balls , White
ZhenSanHuan Chinese Hand Hammered Iron Woks and Stir Fry Pans, Non-stick, No Coating, Less Oil, 章丘铁锅，Carbon Steel Pow (Seasoned 32CM)
Volvik Vimax Soft Green Golf Balls, Dozen
LivTee 5 pcs Auto Trim Removal Tool Kit, Interior Door Panel Clip Removal Set for Vehicle Dash Radio Audio Installer (Blue)
2020 Callaway Chrome Soft Golf Balls (Triple Track White)
Homeries Pre-Seasoned Cast Iron Wok with 2 Handled and Wooden Lid (14 Inches) Nonstick Iron Deep Frying Pan with Flat Base for Stir-Fry, Grilling, Frying, Steaming - For Authentic Asian, Chinese Food
Callaway Assorted Models Recycled B/C Grade Golf Balls in Onion Mesh Bag (72-Piece), White


### 7. 近似の精度の評価


In [154]:
# クエリ番号77
for i in I[77]:
    print(title[i]) 

MIZUNO RB 566V GOLF BALL, White
Osmo - Little Genius Starter Kit for Fire Tablet + Early Math Adventure-6 Educational Games-Ages 3-5-Counting, Shapes & Phonics-Christmas Toys-STEM Toy(Osmo Fire Tablet Base Included-Amazon Exclusive)
Titleist Pro V1 Golf Balls, White, High Play Numbers (5-8), One Dozen
2021 Callaway Supersoft Golf Balls , White
2020 Callaway Chrome Soft X Golf Balls (Triple Track White)
Homeries Pre-Seasoned Cast Iron Wok with 2 Handled and Wooden Lid (14 Inches) Nonstick Iron Deep Frying Pan with Flat Base for Stir-Fry, Grilling, Frying, Steaming - For Authentic Asian, Chinese Food
Hieha Car Stereo Compatible with Apple Carplay and Android Auto, 7 Inch Double Din Car Stereo with Bluetooth, Touch Screen Car Radios MP5 Player with A/V Input, Backup Camera, Mirror Link, SWC
Titleist AVX Golf Balls, White, (One Dozen)
Callaway Assorted Models Recycled B/C Grade Golf Balls in Onion Mesh Bag (72-Piece), White
24" Cooking Discada with Open Fire Cooking Stand with Removable Le

In [159]:
# 2. の計算
# クエリ番号 77
result = []
for titled in title_embed:
    result.append(cal_dot(np.array(query_embed[77]), np.array(titled)))
    

sorted_result = sorted(((value, index) for index, value in enumerate(result)), reverse=True)[:10]

index2 = []
for _, i in sorted_result:
    index2.append(i)

In [162]:
def proportion(index2, index6):
    sum = 0
    for i, j in zip(index2, index6):
        if i == j:
            sum += 1
    
    return sum / len(index2)

index6 = I[77]
proportion(index2, index6)

0.0

1000件の中から近似最近傍探索していて、サンプル数は33しかない。
サンプルされたものの中でしか計算していないため、重なることが少なくなっていると考察。

2の方は、全体から一から計算しているが、6はサンプルしたものの中の探索。

### 8. パラメータチューニング

In [169]:
'''
用意する代表点の数を増やす
'''
import time

# サンプル数 : 300
sampling = np.random.choice(1000, size=300, replace=False) # サンプリング
K_1000  = np.array(title_embed)
share_vec = K_1000[sampling] #　サンプリングした個数だけ、titleベクトルを受け取る

index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(share_vec)

start = time.perf_counter()
D, I = index.search(np.array(query_embed).astype('float32'), k=10)
end = time.perf_counter()

print(f'実行時間(s) : {end-start}')

実行時間(s) : 0.014246804639697075


In [170]:
# サンプル数 : 500
sampling = np.random.choice(1000, size=500, replace=False) # サンプリング
K_1000  = np.array(title_embed)
share_vec = K_1000[sampling] #　サンプリングした個数だけ、titleベクトルを受け取る

index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(share_vec)

start = time.perf_counter()
D, I = index.search(np.array(query_embed).astype('float32'), k=10)
end = time.perf_counter()

print(f'実行時間(s) : {end-start}')

実行時間(s) : 0.020443967543542385


In [171]:
'''
製品に対して引き当てる代表点の数 → 計算するクエリ数を変化させる
'''
# クエリ数 : 1000
sampling = np.random.choice(1000, size=int(np.sqrt(1000)), replace=False) # サンプリング
K_1000  = np.array(title_embed)
share_vec = K_1000[sampling] #　サンプリングした個数だけ、titleベクトルを受け取る

index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(share_vec)

start = time.perf_counter()
D, I = index.search(np.array(query_embed).astype('float32'), k=10)
end = time.perf_counter()

print(f'実行時間(s) : {end-start}')

実行時間(s) : 0.008220500312745571


In [172]:
# クエリ数 : 500
sampling = np.random.choice(1000, size=int(np.sqrt(1000)), replace=False) # サンプリング
K_1000  = np.array(title_embed)
share_vec = K_1000[sampling] #　サンプリングした個数だけ、titleベクトルを受け取る

index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(share_vec)

start = time.perf_counter()
D, I = index.search(np.array(query_embed[:500]).astype('float32'), k=10)
end = time.perf_counter()

print(f'実行時間(s) : {end-start}')

実行時間(s) : 0.00547370221465826


In [178]:
'''
クエリに対して引き当てる代表点の数 → 計算する製品数を変化させる

サンプリングする母数が変化する
'''
# 製品数 : 1000
sampling = np.random.choice(1000, size=int(np.sqrt(1000)), replace=False) # サンプリング
K_1000  = np.array(title_embed)
share_vec = K_1000[sampling] #　サンプリングした個数だけ、titleベクトルを受け取る

index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(share_vec)

start = time.perf_counter()
D, I = index.search(np.array(query_embed).astype('float32'), k=10)
end = time.perf_counter()

print(f'実行時間(s) : {end-start}')

実行時間(s) : 0.006494143977761269


In [179]:
# 製品数 : 500
sampling = np.random.choice(500, size=int(np.sqrt(500)), replace=False) # サンプリング
K_1000  = np.array(title_embed[:500])
share_vec = K_1000[sampling] #　サンプリングした個数だけ、titleベクトルを受け取る

index = faiss.IndexFlatIP(title_embed.shape[1])
index.add(share_vec)

start = time.perf_counter()
D, I = index.search(np.array(query_embed).astype('float32'), k=10)
end = time.perf_counter()

print(f'実行時間(s) : {end-start}')

実行時間(s) : 0.004050822928547859


### 9. クラスタリング

In [5]:
# 製品のラベルでクラスタリングする
esci_label = us_con['esci_label'].tolist()[:1000]

In [18]:
gain = {'E': 4, 'S': 3, 'C': 2, 'I': 1}
int_label = []
for i in esci_label:
    int_label.append(gain[i])

In [19]:
# ラベルごとにデータを分割する

groups = us_con.groupby('esci_label')

classtaling = [group for _, group in groups]

In [8]:
query_embed_label = []
for df in classtaling:
    query  = df['query'].tolist()[:1000]
    query_embed_label.append(embed(query))

title_embed_label = []
for df in classtaling:   # [0]:C, [1]:E, [2]:S, [3]:I 
    title = df['product_title'].tolist()[:1000]
    title_embed_label.append(embed(title))

In [11]:
mean = [] # 各クラスの平均ベクトル

for x in title_embed_label:
    mean.append(np.mean(np.array(x), axis=0))

In [32]:
import faiss

present_id = []
for i in range(4):
    index = faiss.IndexFlatIP(title_embed_label[0].shape[1])
    index.add(title_embed_label[i])
    
    D, I = index.search(np.array([mean[i]]).astype('float32'), k=int(np.sqrt(1000)))
    present_id.append(I[0])


In [52]:
import time

def ann(title_embed, query_embed, id, present_id):
    share_vec = title_embed[present_id] #　サンプリングした個数だけ、titleベクトルを受け取る

    index = faiss.IndexFlatIP(title_embed.shape[1])
    index.add(share_vec)

    start = time.perf_counter()
    D, I = index.search(np.array(query_embed).astype('float32'), k=10)
    end = time.perf_counter()
    
    print(f'実行時間(s) : {end - start}')
    return I[id]

In [75]:
# クエリ番号0
index9 = []
for title_embed, query_embed, sampling in zip(title_embed_label, query_embed_label, present_id):
    indexs = ann(np.array(title_embed), np.array(query_embed), 0, np.array(sampling))
    for i in indexs:
        print(title[i])
        if i == 1:
            index9.append(indexs)
    print("--------")


実行時間(s) : 0.011859756894409657
Volvik Vimax Soft Green Golf Balls, Dozen
2020 Callaway Chrome Soft Golf Balls (Triple Track White)
Titleist Tour Speed Golf Balls, White, (One Dozen)
Nilight - ZH003 20Inch 126W Spot Flood Combo Led Light Bar 4PCS 4Inch 18W Spot LED Pods Fog Lights for Jeep Wrangler Boat Truck Tractor Trailer Off-Road, 2 Years Warranty
Cangshan W Series 6 Piece German Steel Knife Block Set, Walnut
24" Cooking Discada with Open Fire Cooking Stand with Removable Legs
Osmo - Little Genius Starter Kit for Fire Tablet + Early Math Adventure-6 Educational Games-Ages 3-5-Counting, Shapes & Phonics-Christmas Toys-STEM Toy(Osmo Fire Tablet Base Included-Amazon Exclusive)
2021 Callaway Supersoft Golf Balls , White
Cut Golf 2-Piece Ionomer Red Golf Balls
Cast Iron Cleanser by Culina - Cleans and Protects Cast Iron Cookware, Kosher Certified 8oz with scrapers
--------
実行時間(s) : 0.01902733836323023
Lodge 3.6 Quart Enamel Cast Iron Casserole Dish with Lid (Carribbean Blue)
Cangshan W 

In [76]:
def cal_dot(title, query):
    return np.dot(title, query)

title = us_con['product_title'].tolist()[:1000]
title_embed = embed(title)

query = us_con['query'].tolist()[:1000]
query_embed = embed(query)

result = []
for titled in title_embed:
    result.append(cal_dot(np.array(query_embed[0]), np.array(titled)))

sorted_result = sorted(((value, index) for index, value in enumerate(result)), reverse=True)[:10]

index2 = []
for _, i in sorted_result:
    index2.append(i)
        

In [77]:
def proportion(index2, index8):
    sum = 0
    for i, j in zip(index2, index8):
        if i == j:
            sum += 1
    
    return sum / len(index2)

index8 = index9[0].tolist()
proportion(index2, index8)

0.0