# Experiment 2
# Sentence embeddings and Qdrant Search

## Unprocessed dataest. Languages: EN

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import models
from tqdm import tqdm

### Load datasets

In [39]:
df_embs = pd.read_parquet('../data/df_embs_2.parquet')

In [40]:
df = pd.read_parquet('../data/all_lang_train.parquet')

In [41]:
df

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,Powermax Rubber Factory,Co. One,0
4,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


In [42]:
df = df.merge(df_embs, how='left', left_on='name_1', right_on='name')
df = df.drop(columns=['name'])
df = df.rename({'emb': 'emb_1'}, axis=1)

In [43]:
df = df.merge(df_embs, how='left', left_on='name_2', right_on='name')
df = df.drop(columns=['name'])
df = df.rename({'emb': 'emb_2'}, axis=1)

In [44]:
df

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,iko,"[-0.05934986099600792, 0.06893619149923325, 0....",enormous trade,"[-0.05795746296644211, -0.028908999636769295, ..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,apcotex,"[-0.027187814936041832, 0.0036133553367108107,...",technocraft,"[-0.045615799725055695, 0.012592996470630169, ..."
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0,rishichem distributors,"[-0.029004117473959923, 0.02204391174018383, -...",dsa,"[-0.1476079523563385, 0.003658777102828026, -0..."
3,Powermax Rubber Factory,Co. One,0,powermax rubber factory,"[-0.02773008868098259, 0.06848593056201935, -0...",one,"[-0.053144607692956924, -0.04593740776181221, ..."
4,Tress A/S,Longyou Industries Park Zhejiang,0,tress,"[-0.07435113191604614, 0.0670902281999588, -0....",longyou park zhejiang,"[0.0423990860581398, 0.04695170372724533, 0.08..."
...,...,...,...,...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0,bit mat products,"[-0.032756511121988297, 0.010041077621281147, ...",the goodyear tire and rubber company,"[-0.055517975240945816, 0.023900089785456657, ..."
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0,bnd trading,"[-0.05821019038558006, -0.015586151741445065, ...",zhong shan yue liang economy& trade,"[-0.028075631707906723, 0.03612184897065163, 0..."
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0,xeikon of dongguan city,"[-0.07554799318313599, 0.06064029037952423, -0...",yi cheng trading of dongguan city,"[-0.05994296446442604, 0.019235583022236824, 0..."
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0,shanghai kechuan trading,"[-0.050915852189064026, -0.010859746485948563,...",shanghai m&g stationery,"[-0.08534333854913712, 0.024522045627236366, 0..."


In [45]:
np.stack(df.emb_1).shape

(497572, 384)

### Add vectors to qdrant

In [None]:
# !docker pull qdrant/qdrant

In [None]:
# !docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [63]:
# df_embs = pd.read_parquet('../data/df_embs_preproc_1.parquet')

In [46]:
qdrant_client = QdrantClient(host='0.0.0.0', port=6333)

In [2]:
def create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs):
    qdrant_client.recreate_collection(collection_name=col_name,
                                      vectors_config=models.VectorParams(size=vec_shape, distance=models.Distance.COSINE),
                                      on_disk_payload=True)
    qdrant_client.upload_collection(
        collection_name=col_name,
        vectors=vectors,
        payload=payload,
        ids=ids,
        batch_size=bs,
        parallel=6
    )
    col = qdrant_client.get_collection(col_name)
    return col

In [48]:
vectors = np.stack(df_embs.emb)
payload = df_embs.rename({'name': 'original_name'}, axis=1)[['original_name']].to_dict(orient='records')
ids = df_embs.index.values.tolist()
col_name = 'companies'
vec_shape = vectors.shape[1]
bs = 1024

In [49]:
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [50]:
col.points_count

18022

### Similarity search with cosine distance

In [51]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [3]:
from experiments.preprocess import preproc, stopwords

In [4]:
def inference(company_name_original, col_name, model, limit=30, df_embs=None, preproc_text=False, debug=False):
    if preproc_text:
        company_name = preproc(company_name_original, stopwords)
    else:
        company_name = company_name_original

    if debug:
        print(f'company name:{company_name}')

    if df_embs is None:
        qvector = model.encode(company_name)
    else:
        qvector = np.stack(df_embs[df_embs['name'] == company_name].emb)[0]

    search_result = qdrant_client.search(
        collection_name=col_name,
        query_vector=qvector,
        query_filter=models.Filter(
            must_not=[
                models.FieldCondition(
                    key="original_name",
                    match=models.MatchValue(value=company_name_original)
                ),
            ]
        ),
        limit=limit,
        offset=0
    )
    return search_result#[r for r in search_result if r.payload['name'] != company_name]

In [54]:
inference('Dow Chemical (Shanghai) Co., Ltd.', col_name, model, preproc_text=False)

[ScoredPoint(id=9176, version=10, score=0.8766744, payload={'original_name': 'Dow Chemical Pacific Ltd.'}, vector=None),
 ScoredPoint(id=5474, version=5, score=0.8710414, payload={'original_name': 'Dow Chemical International Private Ltd.'}, vector=None),
 ScoredPoint(id=8733, version=10, score=0.8694346, payload={'original_name': 'The Dow Chemical Company'}, vector=None),
 ScoredPoint(id=608, version=4, score=0.83456326, payload={'original_name': 'Dow Chemical Thailand Ltd.'}, vector=None),
 ScoredPoint(id=2892, version=1, score=0.8277112, payload={'original_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=12778, version=15, score=0.81730974, payload={'original_name': 'Yuanbai Chemicals (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=6102, version=5, score=0.8106376, payload={'original_name': 'Shanghai Yikang Chemicals & Industries Co., Ltd.'}, vector=None),
 ScoredPoint(id=566, version=4, score=0.8073822, payload={'original_name': 'Shanghai Origin Chem International Trading

In [55]:
df[((df.name_1 == 'Dow Chemical (Shanghai) Co., Ltd.') | (df.name_2 == 'Dow Chemical (Shanghai) Co., Ltd.')) & df.is_duplicate == 1]

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
11987,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific Ltd.,1,dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ...",dow chemical,"[-0.03055720031261444, 0.0038767859805375338, ..."
84089,Dow Chemical International Private Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.03750493377447128, 0.0004331097879912704, ...",dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ..."
145164,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical International Private Ltd.,1,dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ...",dow chemical,"[-0.03750493377447128, 0.0004331097879912704, ..."
223104,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical,1,dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ...",dow chemical,"[-0.06868678331375122, -0.01470722071826458, -..."
223944,Dow Chemical Pacific Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.03055720031261444, 0.0038767859805375338, ...",dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ..."
286950,Dow Chemical Pacific,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.03130755200982094, -0.012462282553315163, ...",dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ..."
381110,Dow Chemical Thailand Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.04509812220931053, 0.03672919049859047, -0...",dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ..."
388593,Dow Chemical,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.06868678331375122, -0.01470722071826458, -...",dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ..."
422738,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific,1,dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ...",dow chemical,"[-0.03130755200982094, -0.012462282553315163, ..."
493048,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Thailand Ltd.,1,dow chemical,"[-0.03864812105894089, 0.0032505786512047052, ...",dow chemical,"[-0.04509812220931053, 0.03672919049859047, -0..."


### Calculate metrics

In [56]:
# Select test dataset
df_isdup = df[df.is_duplicate == 1]
names = np.unique(np.concatenate([df_isdup.name_1, df_isdup.name_2]))

In [57]:
names

array([' Alfagomma', ' CONTITECH TRANSPORTBANDSYSTEME GMBH',
       ' SO.F.TER. SPA', ..., 'ФИЛИАЛ КОМПАНИИ ЭКСОН НЕФТЕГАЗ ЛИМИТЕД',
       'ФИЛИАЛ КОМПАНИИ"ЭКСОН НЕФТЕГАЗ ЛИМИТЕД"', 'ХИМИНВЕСТ ГРУПП, ООО'],
      dtype=object)

In [5]:
def precision_score_at_k(k, names, unique_groups, model, df_embs, col_name, debug, preproc_text=False):
    scores = []
    for item in tqdm(names):
        q_name = item
        res = inference(q_name, col_name, model, limit=k, df_embs=df_embs, preproc_text=preproc_text, debug=debug)
        found_name = res[0].payload['original_name']
        if debug:
            print(res)
            print(f'q_name {q_name}')
            print(f'found_name {found_name}')

        # todo p@ k > 1
        # ap_k = 1/k * ()
        # for i in enumerate(range(11)):

        for ugroup in unique_groups:
            if q_name in ugroup:
                if debug:
                    print(ugroup)
                if found_name in ugroup:
                    scores.append((q_name, found_name))
                    if debug:
                        print('GOOD')
                break

        if debug:
            break

    return scores #len(scores) / len(names)

In [6]:
# DATASET GROUPINGS
def get_unique_groups(names, df_isdup):
    company_groups = []
    for company_name in tqdm(names):
        tmpdf_names_1 = set(df_isdup[df_isdup.name_2 == company_name].name_1.tolist())
        tmpdf_names_2 = set(df_isdup[df_isdup.name_1 == company_name].name_2.tolist())
        tmpdf_names_1.update(tmpdf_names_2)
        tmpdf_names_1.update([company_name])
        for comp_name in tmpdf_names_1.copy():
            tmpdf_names_1_ = set(df_isdup[df_isdup.name_2 == comp_name].name_1.tolist())
            tmpdf_names_2_ = set(df_isdup[df_isdup.name_1 == comp_name].name_2.tolist())
            tmpdf_names_1_.update(tmpdf_names_2_)
            tmpdf_names_1.update(tmpdf_names_1_)

        tmpdf_names_1 = sorted(tmpdf_names_1)
        company_groups.append(tmpdf_names_1)

    unique_groups = []
    for group in company_groups:
        if group not in unique_groups:
            unique_groups.append(group)

    return unique_groups

In [60]:
unique_groups = get_unique_groups(names, df[df.is_duplicate == 1])

100%|██████████| 1394/1394 [00:11<00:00, 124.62it/s]


In [61]:
df_embs[df_embs.name == names[0]]

Unnamed: 0,name,name_preproc,emb
16508,Alfagomma,alfagomma,"[-0.062376949936151505, 0.05818292871117592, -..."


In [62]:
map_score = precision_score_at_k(1,
                                 names,#[19:],
                                 unique_groups,
                                 model,
                                 df_embs,#.rename({'name':'tmp', 'name_preproc': 'name'}, axis=1),
                                 col_name,
                                 debug=False, preproc_text=False
                                 )
# map_score

100%|██████████| 1394/1394 [01:45<00:00, 13.18it/s]


In [63]:
print(f'Precision@1={len(map_score) / len(names)}')

Precision@1=0.5746054519368723


# Qdrant + Preprocess

In [64]:
df_emb = pd.read_parquet('../data/df_embs_preproc_1.parquet')

In [65]:
df = pd.read_parquet('../data/all_lang_train.parquet')

In [66]:
df

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,Powermax Rubber Factory,Co. One,0
4,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


In [67]:
df = df.merge(df_emb, how='left', left_on='name_1', right_on='name')
df = df.drop(columns=['name'])
df = df.rename({'emb': 'emb_1'}, axis=1)

In [68]:
df = df.merge(df_emb, how='left', left_on='name_2', right_on='name')
df = df.drop(columns=['name'])
df = df.rename({'emb': 'emb_2'}, axis=1)

In [69]:
df

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,iko,"[0.008997186087071896, -0.01706981100142002, -...",enormous trade,"[0.02667689323425293, -0.046701788902282715, -..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,apcotex,"[0.005421018227934837, 0.017310479655861855, -...",technocraft,"[-0.055486664175987244, 0.04864133894443512, -..."
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0,rishichem distributors,"[-0.010888855904340744, -0.04686060547828674, ...",dsa,"[0.0220376867800951, 0.02404574118554592, -0.0..."
3,Powermax Rubber Factory,Co. One,0,powermax rubber factory,"[-0.017552083358168602, -0.0015446854522451758...",one,"[0.04439183324575424, -0.08115479350090027, -0..."
4,Tress A/S,Longyou Industries Park Zhejiang,0,tress,"[0.044657666236162186, -0.05069291591644287, -...",longyou park zhejiang,"[0.037980083376169205, 0.13952958583831787, -0..."
...,...,...,...,...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0,bit mat products,"[0.01473778486251831, -0.036650754511356354, 0...",the goodyear tire and rubber company,"[-0.02824639342725277, -0.03880760073661804, 0..."
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0,bnd trading,"[0.04614870250225067, -0.05307731777429581, -0...",zhong shan yue liang economy& trade,"[0.014954282902181149, 0.009358176961541176, -..."
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0,xeikon of dongguan city,"[-0.03605031222105026, 0.06106617674231529, -0...",yi cheng trading of dongguan city,"[0.007113339379429817, 0.023789547383785248, -..."
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0,shanghai kechuan trading,"[0.025122083723545074, 0.03761020302772522, -0...",shanghai m&g stationery,"[0.023444771766662598, 0.08165428787469864, -0..."


In [70]:
np.stack(df.emb_1).shape

(497572, 512)

In [71]:
df_names = pd.read_parquet('../data/df_names_preproc.parquet')

In [72]:
df_names = df_names.merge(df_emb[['name', 'emb']], how='left', left_on='Names', right_on='name')
df_names = df_names.drop(columns=['Names', 'languages_langdetect'])
df_names = df_names.rename({'name': 'original_name', 'name_preproc':'preprocessed_name'}, axis=1)

In [73]:
df_names

Unnamed: 0,preprocessed_name,original_name,emb
0,iko,Iko Industries Ltd.,"[0.008997186087071896, -0.01706981100142002, -..."
1,apcotex,Apcotex Industries Ltd.,"[0.005421018227934837, 0.017310479655861855, -..."
2,rishichem distributors,"Rishichem Distributors Pvt., Ltd.","[-0.010888855904340744, -0.04686060547828674, ..."
3,powermax rubber factory,Powermax Rubber Factory,"[-0.017552083358168602, -0.0015446854522451758..."
4,tress,Tress A/S,"[0.044657666236162186, -0.05069291591644287, -..."
...,...,...,...
18017,plastic packaging,Plastic Packaging (Pty) Ltd.,"[-0.10477159172296524, 0.09692458808422089, -0..."
18018,hengshui mechanical & electrical building,'Hengshui Mechanical & Electrical Building Co....,"[0.024205442517995834, 0.06279519945383072, -0..."
18019,jiangsu baoli investment,"Jiangsu Baoli International Investment Co., Lt...","[0.007433122955262661, -0.0124862901866436, 0...."
18020,lanxess ag,Lanxess AG,"[0.00656414870172739, -0.008185080252587795, -..."


### Add vectors to qdrant

In [None]:
# !docker pull qdrant/qdrant

In [None]:
# !docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [74]:
qdrant_client = QdrantClient(host='0.0.0.0', port=6333)

In [75]:
vectors = np.stack(df_emb.emb)
payload = df_names[['original_name', 'preprocessed_name']].to_dict(orient='records')
ids = df_emb.index.values.tolist()
col_name = 'companies'
vec_shape = vectors.shape[1]
bs = 1024
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [76]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [46]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [77]:
inference('Dow Chemical International Private Ltd.', col_name, model, preproc_text=True)

[ScoredPoint(id=6647, version=8, score=1.0, payload={'original_name': 'Dow Chemical (Shanghai) Co., Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=608, version=0, score=1.0, payload={'original_name': 'Dow Chemical Thailand Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=2892, version=2, score=1.0, payload={'original_name': 'Dow Chemical', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=3748, version=3, score=1.0, payload={'original_name': 'Dow Chemical Pacific', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=9176, version=6, score=1.0, payload={'original_name': 'Dow Chemical Pacific Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=469, version=0, score=0.93736213, payload={'original_name': 'Vohra Chemical Co.', 'preprocessed_name': 'vohra chemical'}, vector=None),
 ScoredPoint(id=2417, version=2, score=0.92631656, payload={'original_name': 'Intl Chemical Industries'

In [78]:
# Select test dataset
df_isdup = df[df.is_duplicate == 1]
names = np.unique(np.concatenate([df_isdup.name_1, df_isdup.name_2]))

In [49]:
df_isdup[df_isdup.name_2 == 'A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
18065,A.P.I.,A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.,1,api,"[-0.1123679056763649, 0.022189512848854065, -0...",api applicazioni plastiche i,"[-0.12707926332950592, 0.05647152289748192, -0..."
277795,API,A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.,1,api,"[-0.1123679056763649, 0.022189512848854065, -0...",api applicazioni plastiche i,"[-0.12707926332950592, 0.05647152289748192, -0..."
431643,Trinseo API,A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.,1,trinseo api,"[-0.09881430119276047, 0.01207094918936491, -0...",api applicazioni plastiche i,"[-0.12707926332950592, 0.05647152289748192, -0..."


In [None]:
df_isdup[df_isdup.name_1 == 'A.P.I. Applicazioni Plastiche Industriali SPA']

In [114]:
df_isdup[df_isdup.name_1 == 'API']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
29979,API,A.P.I. Applicazioni Plastiche Industriali SPA,1,api,"[0.019749535247683525, -0.014046860858798027, ...",api applicazioni plastiche industriali,"[-0.03701728209853172, -0.023362185806035995, ..."
277795,API,A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.,1,api,"[0.019749535247683525, -0.014046860858798027, ...",api applicazioni plastiche industriali,"[-0.03701728209853172, -0.023362185806035995, ..."
329198,API,Trinseo API,1,api,"[0.019749535247683525, -0.014046860858798027, ...",trinseo api,"[0.012135565280914307, -0.020962947979569435, ..."
390564,API,A.P.I.,1,api,"[0.019749535247683525, -0.014046860858798027, ...",api,"[0.019749535247683525, -0.014046860858798027, ..."


In [79]:
unique_groups = get_unique_groups(names, df_isdup)

100%|██████████| 1394/1394 [00:10<00:00, 127.06it/s]


In [52]:
unique_groups

[[' Alfagomma', 'ALFAGOMMA INDUSTRIAL SPA'],
 [' CONTITECH TRANSPORTBANDSYSTEME GMBH', 'ContiTech Thermopol Inc.'],
 [' SO.F.TER. SPA', 'Softer Us Inc.'],
 [' TOTAL OIL INDIA PRIVATE LIMITED,  TOTAL',
  'Total Oil India Limited',
  'Total Oil India Private Limited'],
 ['*** ПОЛИМАРКЕТ, ООО', 'ООО "Полимаркет"', 'ООО Полимаркет'],
 ['A. WESTENSEE & PARTNER ROHSTOFF GMBH', 'awp-rohstoffe'],
 ['A.P.I.',
  'A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.',
  'A.P.I. Applicazioni Plastiche Industriali SPA',
  'API',
  'Trinseo API'],
 ['ABENA INTERNATIONAL A/S', 'ООО"АБЕНА"'],
 ['ABRO INDUSTRIES, INC.', 'ООО "АБРО ИНДАСТРИС"'],
 ['ADI (SALAMBO)', 'ADI COMMERCE', 'ADI commerce ltd'],
 ['ADRIATICA BITUMI', 'ADRIATICA BITUMI S.P.A.', 'Adriatica Bitumi Spa'],
 ['AGILENT TECHNOLOGIES MFG GMBH & SHIPPING DEPARTMENT',
  'ООО "АДЖИЛЕНТ ТЕКНОЛОДЖИЗ"'],
 ['AGIP',
  'AGIP  ( ENI GROUP)',
  'Azienda Generale Italiana Petroli',
  'agip spa'],
 ['ALBEMARLE GREEN CREST ',
  'Albemarle Corporation',
  'Al

In [161]:
df[df.name_2 == 'SO.F.TER.']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
1328,Lohmann GmbH & Co. KG,SO.F.TER.,0,lohmann & kg,"[-0.00794435478746891, 0.024964027106761932, 0...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
4332,ANKARA INSAAT TICARET ve SANAYI LIMITED SIRKETI,SO.F.TER.,0,ankara insaat ticaret ve sanayi limited sirketi,"[0.024160169064998627, -0.051298387348651886, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
12029,POLYMERTEAM,SO.F.TER.,0,polymerteam,"[-0.007658825255930424, 0.023627055808901787, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
13804,Joss Holding B.V.,SO.F.TER.,0,joss holding bv,"[0.0007745528127998114, 0.012520086951553822, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
22179,MCASPHALT,SO.F.TER.,0,mcasphalt,"[0.049031779170036316, -0.008015838451683521, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
...,...,...,...,...,...,...,...
483258,PAVIMENTAL,SO.F.TER.,0,pavimental,"[-0.004661908373236656, -0.006784559227526188,...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
485337,PROTRADE,SO.F.TER.,0,protrade,"[0.000575021025724709, -0.023494573310017586, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
490856,PORR,SO.F.TER.,0,porr,"[0.011393015272915363, -0.007374722044914961, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
492066,Rompetrol Rafinare S.A.,SO.F.TER.,0,rompetrol rafinare,"[0.005357269197702408, -0.008388257585465908, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."


In [163]:
df[df.name_2 == ' SO.F.TER. SPA']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
86652,Softer Us Inc.,SO.F.TER. SPA,1,softer us,"[-0.09364961832761765, -0.05149330943822861, 0...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."


In [56]:
names[names == 'Fenner Dunlop (Toledo), Llc']

array(['Fenner Dunlop (Toledo), Llc'], dtype=object)

In [197]:
df[df.name_2 == ' SO.F.TER. SPA']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
86652,Softer Us Inc.,SO.F.TER. SPA,1,softer us,"[-0.09364961832761765, -0.05149330943822861, 0...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."


In [186]:
df[(df.name_2 == 'Softer Us Inc.') & (df.is_duplicate != 1)]

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2


In [83]:
preproc('ALIBESA', stopwords)

'alibesa'

In [25]:
names_test = [n[0] for n in unique_groups]

In [None]:
unique_groups = get_unique_groups(names, df_isdup)

In [81]:
map_score = precision_score_at_k(1,
                                 names,#[19:],
                                 unique_groups,
                                 model,
                                 df_emb.rename({'name':'tmp', 'name_preproc': 'name'}, axis=1),
                                 col_name,
                                 debug=False, preproc_text=True
                                 )

100%|██████████| 1394/1394 [01:47<00:00, 12.98it/s]


In [54]:
len(map_score) / len(names)

0.6183644189383071

In [82]:
print(f'Precision@1={len(map_score) / len(names)}')

Precision@1=0.4569583931133429


# Experiment 3
# Quaterion

In [7]:
df_emb = pd.read_parquet('../data/df_embs_preproc_2.parquet')

In [8]:
df = pd.read_parquet('../data/all_lang_train.parquet')

In [9]:
df = df.merge(df_emb, how='left', left_on='name_1', right_on='name')
df = df.drop(columns=['name'])
df = df.rename({'emb': 'emb_1'}, axis=1)

In [10]:
df = df.merge(df_emb, how='left', left_on='name_2', right_on='name')
df = df.drop(columns=['name'])
df = df.rename({'emb': 'emb_2'}, axis=1)

In [11]:
df

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,iko,"[-0.09832967817783356, 0.07520564645528793, 0....",enormous trade,"[-0.05738343298435211, 0.058508045971393585, -..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,apcotex,"[-0.040096644312143326, 0.01942046545445919, -...",technocraft,"[-0.03547694534063339, 0.033784572035074234, 0..."
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0,rishichem distributors,"[-0.015067839995026588, -0.02670854702591896, ...",dsa,"[-0.1476079523563385, 0.003658777102828026, -0..."
3,Powermax Rubber Factory,Co. One,0,powermax rubber factory,"[-0.027730092406272888, 0.06848588585853577, -...",one,"[-0.03424311801791191, -0.01212331559509039, -..."
4,Tress A/S,Longyou Industries Park Zhejiang,0,tress,"[-0.02364397421479225, 0.08266414701938629, -0...",longyou park zhejiang,"[0.04394970461726189, 0.06074526533484459, 0.0..."
...,...,...,...,...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0,bit mat products,"[-0.030496124178171158, 0.008710866793990135, ...",the goodyear tire and rubber company,"[-0.055517930537462234, 0.02390013262629509, 0..."
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0,bnd trading,"[-0.09089754521846771, -0.051476895809173584, ...",zhong shan yue liang economy& trade,"[-0.02577291801571846, 0.009628668427467346, 0..."
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0,xeikon of dongguan city,"[-0.02690267749130726, 0.08781823515892029, 0....",yi cheng trading of dongguan city,"[-0.034574463963508606, 0.01867447979748249, 0..."
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0,shanghai kechuan trading,"[-0.026555752381682396, -0.06021178886294365, ...",shanghai m&g stationery,"[-0.06080954894423485, 0.023097140714526176, 0..."


In [15]:
df_names = pd.read_parquet('../data/df_names_preproc.parquet')

In [16]:
df_names = df_names.merge(df_emb[['name', 'emb']], how='left', left_on='Names', right_on='name')
df_names = df_names.drop(columns=['Names', 'languages_langdetect'])
df_names = df_names.rename({'name': 'original_name', 'name_preproc':'preprocessed_name'}, axis=1)

In [17]:
df_names

Unnamed: 0,preprocessed_name,original_name,emb
0,iko,Iko Industries Ltd.,"[-0.09832967817783356, 0.07520564645528793, 0...."
1,apcotex,Apcotex Industries Ltd.,"[-0.040096644312143326, 0.01942046545445919, -..."
2,rishichem distributors,"Rishichem Distributors Pvt., Ltd.","[-0.015067839995026588, -0.02670854702591896, ..."
3,powermax rubber factory,Powermax Rubber Factory,"[-0.027730092406272888, 0.06848588585853577, -..."
4,tress,Tress A/S,"[-0.02364397421479225, 0.08266414701938629, -0..."
...,...,...,...
18017,plastic packaging,Plastic Packaging (Pty) Ltd.,"[-0.1276782602071762, 0.08994223177433014, -0...."
18018,hengshui mechanical & electrical building,'Hengshui Mechanical & Electrical Building Co....,"[-0.04444390907883644, 0.07687955349683762, 0...."
18019,jiangsu baoli investment,"Jiangsu Baoli International Investment Co., Lt...","[-0.004805956035852432, 0.02011996880173683, -..."
18020,lanxess ag,Lanxess AG,"[-0.06939245760440826, -0.013326993212103844, ..."


In [18]:
qdrant_client = QdrantClient(host='0.0.0.0', port=6333)

In [19]:
vectors = np.stack(df_emb.emb)
payload = df_names[['original_name', 'preprocessed_name']].to_dict(orient='records')
ids = df_emb.index.values.tolist()
col_name = 'companies'
vec_shape = vectors.shape[1]
bs = 1024
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [20]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [21]:
inference('Dow Chemical International Private Ltd.', col_name, model, preproc_text=True)

[ScoredPoint(id=2892, version=0, score=1.0000001, payload={'original_name': 'Dow Chemical', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=608, version=1, score=1.0000001, payload={'original_name': 'Dow Chemical Thailand Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=9176, version=8, score=1.0000001, payload={'original_name': 'Dow Chemical Pacific Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=3748, version=5, score=1.0000001, payload={'original_name': 'Dow Chemical Pacific', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=6647, version=6, score=1.0000001, payload={'original_name': 'Dow Chemical (Shanghai) Co., Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=8733, version=8, score=0.877924, payload={'original_name': 'The Dow Chemical Company', 'preprocessed_name': 'the dow chemical company'}, vector=None),
 ScoredPoint(id=7901, version=7, score=0.67946684, payload

In [22]:
# Select test dataset
df_isdup = df[df.is_duplicate == 1]
names = np.unique(np.concatenate([df_isdup.name_1, df_isdup.name_2]))

In [23]:
unique_groups = get_unique_groups(names, df[df.is_duplicate == 1])

100%|██████████| 1394/1394 [00:13<00:00, 100.87it/s]


In [53]:
import json
import uuid

with open('../data/unique_groups.jsonl', 'w', encoding='utf-8') as fw:
    for ung in unique_groups:
        group_uid = str(uuid.uuid4())
        for el in ung:
            dc = {'original_name': el, 'preprocessed_name': preproc(el, stopwords), 'group': group_uid}
            json.dump(dc, fw, ensure_ascii=False)
            fw.write('\n')

In [41]:
import argparse
import os
import random
import json
from typing import Any, Dict, List, Union
import pytorch_lightning as pl
import torch
from quaterion.eval.attached_metric import AttachedMetric
from quaterion.eval.pair import RetrievalPrecision, RetrievalReciprocalRank
from quaterion_models.encoders import Encoder
from quaterion_models.heads import EncoderHead, GatedHead
from quaterion_models.types import CollateFnType
from torch.utils.data import Dataset
from quaterion import Quaterion, TrainableModel
from quaterion.dataset.similarity_data_loader import (
    GroupSimilarityDataLoader,
    SimilarityGroupSample,
)
from quaterion.loss import SimilarityLoss, SoftmaxLoss
from sentence_transformers import SentenceTransformer
random.seed(42)

In [62]:
class CompaniesDataset(Dataset):
    def __init__(self, path: str, max_samples: int = 50000):
        super().__init__()
        with open(path, "r", encoding="utf8") as f:
            lines = f.readlines()[:max_samples]
            random.shuffle(lines)
            self.data = [json.loads(line) for line in lines]

        _company_groups = set(sorted([item["group"] for item in self.data]))
        self._label2idx = {label: idx for idx, label in enumerate(_company_groups)}

    def __getitem__(self, index: int) -> SimilarityGroupSample:
        item = self.data[index]
        return SimilarityGroupSample(obj=item, group=self._label2idx[item["group"]])

    def __len__(self) -> int:
        return len(self.data)

    def get_num_groups(self) -> int:
        return len(self._label2idx)

In [63]:
class CompanyEncoder(Encoder):
    def __init__(self, pretrained_name: str):
        super().__init__()
        self.encoder = SentenceTransformer(pretrained_name)
        self._pretrained_name = pretrained_name

    @property
    def trainable(self) -> bool:
        return False

    @property
    def embedding_size(self) -> int:
        return self.encoder.get_sentence_embedding_dimension()

    def get_collate_fn(self) -> CollateFnType:
        return self.extract_texts

    def extract_texts(self, batch: List[Union[str, Dict[str, Any]]]):
        if isinstance(batch[0], str):
            return batch
        elif isinstance(batch[0], Dict):
            return [item["preprocessed_name"] for item in batch]
        else:
            raise TypeError("Expecting list of strings or dicts as inputs")

    def forward(self, inputs):
        return self.encoder.encode(
            inputs, convert_to_numpy=False, convert_to_tensor=True
        )

    def save(self, output_path: str):
        self.encoder.save(os.path.join(output_path, self._pretrained_name))

    @classmethod
    def load(cls, input_path: str) -> "Encoder":
        return CompanyEncoder(input_path)


In [64]:
class CompanyMatchingModel(TrainableModel):
    def __init__(
        self,
        pretrained_name: str = "all-MiniLM-L6-v2",
        num_groups: int = 20,
        lr: float = 3e-5,
    ):
        self._pretrained_name = pretrained_name
        self._num_groups = num_groups
        self._lr = lr
        super().__init__()

    def configure_metrics(self):
        # attach batch-wise metrics which will be automatically computed and logged during training
        return [
            AttachedMetric(
                "RetrievalPrecision",
                RetrievalPrecision(k=1),
                prog_bar=True,
                on_epoch=True,
            ),
            AttachedMetric(
                "RetrievalReciprocalRank",
                RetrievalReciprocalRank(),
                prog_bar=True,
                on_epoch=True
            ),
        ]

    def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]:
        return CompanyEncoder(self._pretrained_name)

    def configure_head(self, input_embedding_size) -> EncoderHead:
        return GatedHead(input_embedding_size)

    def configure_loss(self) -> SimilarityLoss:
        return SoftmaxLoss(self.model.head.output_size, self._num_groups)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            [
                {"params": self.model.parameters(), "lr": self._lr},
                {"params": self.loss.parameters(), "lr": self._lr * 10.0},
            ]
        )

        return optimizer

In [69]:
pretrain = 'all-MiniLM-L6-v2'

In [70]:
cd = CompaniesDataset(path='../data/unique_groups.jsonl')

In [71]:
enc = CompanyEncoder(pretrained_name=pretrain)

In [72]:
ftmodel = CompanyMatchingModel(pretrained_name=pretrain, num_groups=cd.get_num_groups())

In [74]:
ftmodel.eval()

CompanyMatchingModel(
  (_model): SimilarityModel(
    (default): CompanyEncoder(
      (encoder): SentenceTransformer(
        (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
        (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
        (2): Normalize()
      )
    )
    (head): GatedHead(
      (dropout): Identity()
    )
  )
  (_loss): SoftmaxLoss()
)

In [75]:
train_dataloader = GroupSimilarityDataLoader(cd, batch_size=64, shuffle=True)

In [76]:
trainer = pl.Trainer(accelerator="auto", devices=1, num_nodes=1, max_epochs=30)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [77]:
Quaterion.fit(
    trainable_model=model,
    trainer=trainer,
    train_dataloader=train_dataloader,
)

AttributeError: 'SentenceTransformer' object has no attribute 'loss'

In [None]:
model.save_servable("companies")