In [16]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import os

In [17]:
# Load data
# tb1_path = os.path.join("small_files", "amazon.csv")
# tb2_path = os.path.join("small_files", "best_buy.csv")

tb1_path = os.path.join("files", "amazon.csv")
tb2_path = os.path.join("files", "best_buy.csv")

table1 = pd.read_csv(tb1_path)
table2 = pd.read_csv(tb2_path)

# Initialize embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


table1['Brand'] = table1['Brand'].fillna('')
table1['Name'] = table1['Name'].fillna('')
table1['Features'] = table1['Features'].fillna('')
table1['combined'] = table1['Brand'] + " " + table1['Name'] + " " + table1['Features']

table2['Brand'] = table2['Brand'].fillna('')
table2['Name'] = table2['Name'].fillna('')
table2['Features'] = table2['Features'].fillna('')
table2['combined'] = table2['Brand'] + " " + table2['Name'] + " " + table2['Features']



embeddings1 = model.encode(table1['combined'].tolist(), show_progress_bar=True)
embeddings1 = np.ascontiguousarray(embeddings1, dtype=np.float32)

embeddings2 = model.encode(table2['combined'].tolist(), show_progress_bar=True)
embeddings2 = np.ascontiguousarray(embeddings2, dtype=np.float32)



Batches:   0%|          | 0/134 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

In [18]:
embeddings1

array([[-0.06108384,  0.01972555, -0.02767031, ..., -0.02493372,
        -0.07697364,  0.02860137],
       [-0.08205646,  0.07693948,  0.00117907, ...,  0.04899307,
        -0.01678976,  0.08762425],
       [-0.03565126,  0.00840147, -0.01902267, ..., -0.05775996,
        -0.06447525,  0.05948677],
       ...,
       [-0.02066881,  0.00996584, -0.02643626, ..., -0.04005638,
        -0.07878473,  0.06013756],
       [-0.11395273,  0.08526459,  0.00513546, ..., -0.0614782 ,
        -0.06082411,  0.14654565],
       [-0.06918456,  0.0909764 , -0.0031052 , ..., -0.0399138 ,
        -0.03848388,  0.07772986]], shape=(4259, 384), dtype=float32)

In [19]:
print(embeddings1.shape)
print(embeddings2.shape)

print("embeddings1 NaNs:", np.isnan(embeddings1).sum(), "Infs:", np.isinf(embeddings1).sum())
print("embeddings2 NaNs:", np.isnan(embeddings2).sum(), "Infs:", np.isinf(embeddings2).sum())

print("Empty rows in table1:", (table1['combined'].str.strip() == '').sum())
print("Empty rows in table2:", (table2['combined'].str.strip() == '').sum())

(4259, 384)
(5001, 384)
embeddings1 NaNs: 0 Infs: 0
embeddings2 NaNs: 0 Infs: 0
Empty rows in table1: 0
Empty rows in table2: 0


In [7]:
import faiss


In [8]:
print(embeddings1.dtype, embeddings2.dtype)


float32 float32


In [6]:
# Create FAISS index
dimension = embeddings1.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings1)

In [7]:
index

<faiss.swigfaiss.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x14da66490> >

In [None]:
k = 3  # number of nearest neighbors to retrieve
D, I = index.search(embeddings2, k)

In [20]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search

# tb1_path = os.path.join("small_files", "amazon.csv")
# tb2_path = os.path.join("small_files", "best_buy.csv")

tb1_path = os.path.join("files", "amazon.csv")
tb2_path = os.path.join("files", "best_buy.csv")

table1 = pd.read_csv(tb1_path)
table2 = pd.read_csv(tb2_path)
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

table1['Brand'] = table1['Brand'].fillna('')
table1['Name'] = table1['Name'].fillna('')
table1['Features'] = table1['Features'].fillna('')
table1['combined'] = table1['Brand'] + " " + table1['Name'] + " " + table1['Features']

table2['Brand'] = table2['Brand'].fillna('')
table2['Name'] = table2['Name'].fillna('')
table2['Features'] = table2['Features'].fillna('')
table2['combined'] = table2['Brand'] + " " + table2['Name'] + " " + table2['Features']

embeddings1 = model.encode(table1['combined'].tolist(), show_progress_bar=True)
embeddings2 = model.encode(table2['combined'].tolist(), show_progress_bar=True)

# Perform semantic search
results = semantic_search(embeddings2, embeddings1, top_k=3)

# Display results
for idx, result in enumerate(results):
    print(f"Query {idx + 1}:")
    for res in result:
        print(f" - ID: {table1.iloc[res['corpus_id']]['ID']}, Score: {res['score']}")

Batches:   0%|          | 0/134 [00:00<?, ?it/s]

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

Query 1:
 - ID: 248, Score: 0.8688457012176514
 - ID: 604, Score: 0.8508794903755188
 - ID: 344, Score: 0.8443682193756104
Query 2:
 - ID: 799, Score: 0.8742305636405945
 - ID: 3308, Score: 0.8722079396247864
 - ID: 2724, Score: 0.8716567754745483
Query 3:
 - ID: 4008, Score: 0.8564521074295044
 - ID: 186, Score: 0.8507880568504333
 - ID: 435, Score: 0.8465046286582947
Query 4:
 - ID: 879, Score: 0.8688091039657593
 - ID: 3163, Score: 0.8653074502944946
 - ID: 124, Score: 0.8649476170539856
Query 5:
 - ID: 3061, Score: 0.8864787817001343
 - ID: 125, Score: 0.8720830678939819
 - ID: 2747, Score: 0.8611739277839661
Query 6:
 - ID: 2447, Score: 0.8372248411178589
 - ID: 913, Score: 0.8342478275299072
 - ID: 925, Score: 0.8290587663650513
Query 7:
 - ID: 2724, Score: 0.8743645548820496
 - ID: 799, Score: 0.8640984892845154
 - ID: 270, Score: 0.8638225793838501
Query 8:
 - ID: 424, Score: 0.8829950094223022
 - ID: 88, Score: 0.8422229290008545
 - ID: 342, Score: 0.8327561616897583
Query 9:


In [21]:
# Perform semantic search
results = semantic_search(embeddings2, embeddings1, top_k=15)

# Display results
for idx, result in enumerate(results):
    print(f"Query {idx + 1}:")
    for res in result:
        print(f" - ID: {table1.iloc[res['corpus_id']]['ID']}, Score: {res['score']}")

Query 1:
 - ID: 248, Score: 0.8688457012176514
 - ID: 604, Score: 0.8508794903755188
 - ID: 344, Score: 0.8443682193756104
 - ID: 2459, Score: 0.8392264246940613
 - ID: 549, Score: 0.8392264246940613
 - ID: 618, Score: 0.8336161971092224
 - ID: 1, Score: 0.8298090100288391
 - ID: 307, Score: 0.8296117782592773
 - ID: 435, Score: 0.8210715055465698
 - ID: 960, Score: 0.814909815788269
 - ID: 4008, Score: 0.8131088018417358
 - ID: 186, Score: 0.8089366555213928
 - ID: 523, Score: 0.8084124326705933
 - ID: 864, Score: 0.8077008724212646
 - ID: 1313, Score: 0.8026927709579468
Query 2:
 - ID: 799, Score: 0.8742305636405945
 - ID: 3308, Score: 0.8722079396247864
 - ID: 2724, Score: 0.8716567754745483
 - ID: 925, Score: 0.863991379737854
 - ID: 17, Score: 0.8628737926483154
 - ID: 711, Score: 0.8480241298675537
 - ID: 170, Score: 0.846126914024353
 - ID: 566, Score: 0.8459141254425049
 - ID: 466, Score: 0.845513641834259
 - ID: 207, Score: 0.8447521328926086
 - ID: 4002, Score: 0.844387769699

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

