In [7]:
import random
import torch
import pickle
import numpy as np
import pandas as pd

from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

# Preparing step

On this step we are creating 'test data base', it contains all unique names from dataset

In [3]:
raw_data = pd.read_csv('train_normalised.csv', sep='\t',  index_col='pair_id')
name_1 = raw_data['name_1'].to_list()
name_2 = raw_data['name_2'].to_list()
all_names = np.unique(name_1 + name_2)

print("Number of unique names in DB: {}".format(len(all_names)))

# Pipe

After creating a DB we need to store it useful way, so later computations be fast. So we decided to compute embiddings for every name and save it

In [13]:
model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder='model')

embeddings = model.encode(all_names)
DB = [all_names, embeddings]
with open('DB.pkl', 'wb') as f:
    pickle.dump(DB, f)

Now we can open embeddings and compare new name with DB

In [14]:
with open('DB.pkl', 'rb') as f:
    DB = pickle.load(f)

all_names = DB[0]
embeddings = DB[1]

In [15]:
duplicates = []
test_name = 'bridgestone'
#add normalization here
test_emb = model.encode(test_name)
for idx, emb in tqdm(enumerate(embeddings)):
    if util.cos_sim(test_emb, emb) > 0.9:
        duplicates.append(all_names[idx])
        
print(duplicates)

17307it [00:02, 6797.46it/s]

['bridgestone americ tire', 'bridgestone do br il industria e menrcio', 'bridgestone do br il industria e mercio', 'bridgestone do br il industria mercio', 'bridgestone firestone argentina ic', 'bridgestone firestone do br il', 'bridgestone firestone mexi d v', 'bridgestone firestone venezolana', 'bridgestone firestone venezolana c', 'bridgestone golf', 'bridgestone hosepower', 'bridgestone india', 'bridgestone india automotive', 'bridgestone india automotive products', 'bridgestone international group', 'bridgestone mexi d v.', 'bridgestone ncr', 'bridgestone neumaticos', 'bridgestone neumaticos monterrey d v', 'bridgestone research and velopment', 'bridgestone sta ric ociedad anoni', 'bridgestone sta rica', 'bridgestone stargard', 'bridgestone tire', 'bridgestone （huizhou）synthetic rubber', 'bridgestone（wuxi）tire.', 'pt bridgestone tire indonesia', 'shenyang bridgestone', 'thai bridgestone']





Lets pretend we've found a new name and want to add it to DB. The process will look smth like that

In [None]:
new_name = "ololoil"

#compute name embedding and add it to DB
new_emb = model.encode(test_name)
all_names.append(new_name)
embeddings.append(new_emb)

#save changed DB
DB = [all_names, embeddings]
with open('DB.pkl', 'wb') as f:
    pickle.dump(DB, f)