In [None]:
import random
import pickle
import numpy as np

import torch
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

from normalization import Normalizer

# Preparing step

In this step we create a 'test database' that contains all unique names from the dataset.

In [None]:
# Read the tab-separated training pairs, using the pair id column as the index.
raw_data = pd.read_csv(
    'data/train_normalised.csv',
    sep='\t',
    index_col='pair_id',
)

# Pull both name columns out as plain Python lists.
name_1, name_2 = (raw_data[column].to_list() for column in ('name_1', 'name_2'))

# Pool the two lists and keep every distinct name once (np.unique returns a sorted array).
all_names = np.unique(name_1 + name_2)

print("Number of unique names in DB: {}".format(len(all_names)))

# Pipe

After creating the DB we need to store it in a convenient form so that later computations are fast. We therefore compute an embedding for every name and save the names together with their embeddings.

In [None]:
# Sentence encoder used for every embedding in this notebook.
model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder='model')

# Embed every unique name; later cells pair embeddings[i] with all_names[i].
embeddings = model.encode(all_names)

# Persist the names and their embeddings together as a two-element list.
DB = [all_names, embeddings]
with open('DB.pkl', 'wb') as db_file:
    pickle.dump(DB, db_file)

Now we can load the stored embeddings and compare a new name against the DB.

In [None]:
# Reload the [names, embeddings] pair written by the previous step.
# NOTE: pickle can execute arbitrary code on load; only open a DB.pkl you created yourself.
with open('DB.pkl', 'rb') as db_file:
    DB = pickle.load(db_file)

# Element 0 holds the names, element 1 the matching embeddings.
all_names, embeddings = DB[0], DB[1]

To find similar names we compute the cosine similarity between the new name's embedding and every embedding in the DB.
If the similarity is above the threshold, we assume the name may be a duplicate.

In [None]:
# Create the normalizer in this cell so it works on a fresh kernel
# (Restart & Run All); it was previously only instantiated in a later cell.
normalizer = Normalizer()

# Minimum cosine similarity for a DB name to be reported as a possible duplicate.
SIMILARITY_THRESHOLD = 0.9

test_name = 'bridgestone'

# The DB was built from the pre-normalised CSV, so the query is normalised too.
# NOTE(review): presumably the CSV was produced by this same Normalizer - confirm.
norm_name = normalizer.normalize(test_name)
test_emb = model.encode(norm_name)

# Score the query against the whole DB in one batched call instead of one
# util.cos_sim call per row; row 0 of the result holds one score per DB entry.
scores = util.cos_sim(test_emb, embeddings)[0]
hit_indices = np.flatnonzero(scores.cpu().numpy() > SIMILARITY_THRESHOLD)

# Names whose similarity to the query exceeds the threshold, in DB order.
duplicates = [all_names[idx] for idx in hit_indices]

print(duplicates)

Let's pretend we've found a new name and want to add it to the DB. The process will look something like this:

In [None]:
normalizer = Normalizer()
new_name = "ololoil co."

# Normalise first so the stored entry is comparable with the rest of the DB,
# which was built from the pre-normalised CSV.
norm_name = normalizer.normalize(new_name)

# Compute the embedding of the NEW (normalised) name.
# Previously the unrelated `test_name` from the search cell was embedded here.
new_emb = model.encode(norm_name)

# all_names comes from np.unique and embeddings from model.encode, so both are
# expected to be NumPy arrays, which have no .append() method. Build extended
# copies instead; new_emb becomes one extra row of the embedding matrix.
# The normalised name is stored so that the text matches its embedding.
# NOTE: re-running this cell appends the same name again.
all_names = np.append(all_names, norm_name)
embeddings = np.vstack([embeddings, new_emb])

# Save the changed DB.
DB = [all_names, embeddings]
with open('DB.pkl', 'wb') as f:
    pickle.dump(DB, f)