# Experiment 2
# Sentence embeddings and Qdrant Search

## Unprocessed dataset

In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import models
from tqdm import tqdm

In [76]:
# Candidate SentenceTransformer checkpoints, keyed by a small integer id.
# The id chosen in `num_emb` selects both the model that is loaded and the
# matching precomputed-embeddings parquet file.
emb_variations = {
    1 : "distiluse-base-multilingual-cased-v2",
    2 : "all-MiniLM-L6-v2",
    3 : "all-MiniLM-L12-v2",
    4 : "paraphrase-MiniLM-L6-v2",
    5 : "paraphrase-MiniLM-L12-v2",
    6 : "all-mpnet-base-v2",
    7 : "LaBSE",
    8 : "paraphrase-multilingual-MiniLM-L12-v2",
}

### Load datasets
Load and merge datasets

In [77]:
# Id of the embedding model under evaluation: a key of `emb_variations`
# that is also interpolated into the embeddings parquet file names.
num_emb = 5

In [78]:
df_embs = pd.read_parquet(f'../data/df_embs_{num_emb}.parquet')

In [79]:
df = pd.read_parquet('../data/all_lang_train.parquet')

In [80]:
df

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,Powermax Rubber Factory,Co. One,0
4,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


In [81]:
# Attach the embedding of `name_1` to every pair; the join key `name`
# is dropped once it has served its purpose.
df = (
    df.merge(df_embs, how='left', left_on='name_1', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_1'})
)

In [82]:
# Attach the embedding of `name_2` to every pair; the join key `name`
# is dropped once it has served its purpose.
df = (
    df.merge(df_embs, how='left', left_on='name_2', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_2'})
)

In [83]:
df

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,iko,"[-0.3477937877178192, -0.09515048563480377, -0...",enormous trade,"[0.0022828795481473207, -0.2031659185886383, -..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,apcotex,"[-0.27617186307907104, -0.02770525962114334, -...",technocraft,"[0.14635850489139557, -0.21192078292369843, 0...."
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0,rishichem distributors,"[-0.09656728059053421, -0.23950479924678802, -...",dsa,"[-0.23276162147521973, -0.5193449854850769, -0..."
3,Powermax Rubber Factory,Co. One,0,powermax rubber factory,"[-0.405891090631485, 0.4249858856201172, 0.068...",one,"[-0.25359031558036804, -0.19084671139717102, -..."
4,Tress A/S,Longyou Industries Park Zhejiang,0,tress,"[-0.49151352047920227, -0.0839085653424263, 0....",longyou park zhejiang,"[0.37256649136543274, 0.5339913964271545, 0.22..."
...,...,...,...,...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0,bit mat products,"[-0.24340814352035522, -0.19688552618026733, 0...",the goodyear tire and rubber company,"[-0.10538866370916367, 0.29580366611480713, -0..."
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0,bnd trading,"[-0.36903491616249084, 0.12761077284812927, -0...",zhong shan yue liang economy& trade,"[-0.1297224462032318, -0.0495435856282711, 0.1..."
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0,xeikon of dongguan city,"[-0.19072137773036957, -0.008813923224806786, ...",yi cheng trading of dongguan city,"[-0.18082399666309357, -0.13061846792697906, -..."
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0,shanghai kechuan trading,"[-0.14333350956439972, -0.20379912853240967, -...",shanghai m&g stationery,"[-0.03641090542078018, 0.1324702948331833, 0.0..."


In [85]:
np.stack(df.emb_1[:3]).shape

(3, 384)

### Add vectors to qdrant
Here we create qdrant collection and add embeddings to it

In [None]:
# !docker pull qdrant/qdrant

In [None]:
# !docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [63]:
# df_embs = pd.read_parquet('../data/df_embs_preproc_1.parquet')

In [86]:
# Connect to the locally running Qdrant server on its published port.
# '0.0.0.0' is a bind-all *listen* address rather than a destination address:
# it happens to reach loopback on Linux but fails on other platforms,
# so connect to 'localhost' instead.
qdrant_client = QdrantClient(host='localhost', port=6333)

In [87]:
def create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs, parallel=6):
    """(Re)create a Qdrant collection and bulk-upload points into it.

    Uses the module-level ``qdrant_client``. The collection is created with
    cosine distance and payload stored on disk.

    Args:
        vectors: Embedding vectors, one per point.
        payload: Payload dicts, expected to be in the same order as ``vectors``.
        ids: Point ids, expected to be in the same order as ``vectors``.
        col_name: Name of the collection to (re)create.
        vec_shape: Vector dimensionality, passed as ``VectorParams.size``.
        bs: Upload batch size.
        parallel: Value forwarded to ``upload_collection(parallel=...)``.
            Defaults to 6, the value that used to be hard-coded.

    Returns:
        The collection description returned by ``get_collection``.
    """
    qdrant_client.recreate_collection(
        collection_name=col_name,
        vectors_config=models.VectorParams(size=vec_shape, distance=models.Distance.COSINE),
        on_disk_payload=True,
    )
    qdrant_client.upload_collection(
        collection_name=col_name,
        vectors=vectors,
        payload=payload,
        ids=ids,
        batch_size=bs,
        parallel=parallel,
    )
    return qdrant_client.get_collection(col_name)

In [88]:
# Stack the per-row embedding arrays into one 2-D matrix (one row per name).
vectors = np.stack(df_embs.emb)
# One payload dict per row, holding only the raw name under 'original_name'.
payload = df_embs.rename({'name': 'original_name'}, axis=1)[['original_name']].to_dict(orient='records')
# Reuse the DataFrame index as the Qdrant point ids.
ids = df_embs.index.values.tolist()
col_name = 'companies'
# Embedding dimensionality, read off the stacked matrix.
vec_shape = vectors.shape[1]
# Upload batch size.
bs = 1024

In [89]:
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [90]:
col.points_count

18022

### Similarity search with cosine distance
Here we make a query to qdrant collection with embedding vector obtained from SentenceTransformer
Cosine distance is used

In [91]:
model = SentenceTransformer(emb_variations[num_emb])

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [92]:
from experiments.preprocess import preproc, stopwords

In [93]:
def inference(company_name_original, col_name, model, limit=30, df_embs=None, preproc_text=False, debug=False):
    """Return the nearest neighbours of a company name from a Qdrant collection.

    Uses the module-level ``qdrant_client``.

    Args:
        company_name_original: Raw company name to query for.
        col_name: Name of the Qdrant collection to search.
        model: Object with an ``encode(text)`` method; used when no cached
            embedding is available for the query.
        limit: Maximum number of hits to return.
        df_embs: Optional DataFrame with ``name`` and ``emb`` columns acting as
            an embedding cache. The first row whose ``name`` equals the
            (optionally preprocessed) query provides the query vector.
        preproc_text: If true, the query is normalised with
            ``preproc(company_name_original, stopwords)`` before the lookup.
        debug: If true, print the name that is actually looked up / encoded.

    Returns:
        The search result returned by ``qdrant_client.search``. Points whose
        ``original_name`` payload equals ``company_name_original`` are excluded.
    """
    if preproc_text:
        company_name = preproc(company_name_original, stopwords)
    else:
        company_name = company_name_original

    if debug:
        print(f'company name:{company_name}')

    qvector = None
    if df_embs is not None:
        cached = df_embs.loc[df_embs['name'] == company_name, 'emb']
        # A name missing from the cache used to crash inside np.stack;
        # fall through to encoding it with the model instead.
        if len(cached) > 0:
            qvector = cached.iloc[0]
    if qvector is None:
        qvector = model.encode(company_name)
    qvector = np.asarray(qvector)

    # Unwrap a single-row batch of shape (1, dim). Checking ndim keeps a genuine
    # 1-element 1-D vector from being turned into a scalar.
    if qvector.ndim == 2 and qvector.shape[0] == 1:
        qvector = qvector[0]

    search_result = qdrant_client.search(
        collection_name=col_name,
        query_vector=qvector,
        # Never return the query itself as its own nearest neighbour.
        query_filter=models.Filter(
            must_not=[
                models.FieldCondition(
                    key="original_name",
                    match=models.MatchValue(value=company_name_original)
                ),
            ]
        ),
        limit=limit,
        offset=0
    )
    return search_result

Results are ordered by score, from highest (closest to the query vector in embedding space) to lowest (farthest from the query vector in embedding space).
The payload of each hit, such as the original company name, is shown as well.

In [94]:
inference('Dow Chemical (Shanghai) Co., Ltd.', col_name, model, preproc_text=False)

[ScoredPoint(id=5474, version=1, score=0.86313254, payload={'original_name': 'Dow Chemical International Private Ltd.'}, vector=None),
 ScoredPoint(id=9176, version=8, score=0.84244597, payload={'original_name': 'Dow Chemical Pacific Ltd.'}, vector=None),
 ScoredPoint(id=608, version=0, score=0.79936284, payload={'original_name': 'Dow Chemical Thailand Ltd.'}, vector=None),
 ScoredPoint(id=2892, version=2, score=0.79532886, payload={'original_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=8733, version=8, score=0.79446477, payload={'original_name': 'The Dow Chemical Company'}, vector=None),
 ScoredPoint(id=12778, version=13, score=0.78160894, payload={'original_name': 'Yuanbai Chemicals (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=8365, version=8, score=0.7809888, payload={'original_name': 'Eastman (Shanghai) Chemical Commercial Co., Ltd.'}, vector=None),
 ScoredPoint(id=3748, version=4, score=0.77455723, payload={'original_name': 'Dow Chemical Pacific'}, vector=None),


In [112]:
# `&` binds tighter than `==`, so the comparison must be parenthesised;
# otherwise the mask is evaluated as `(name_match & is_duplicate) == 1`.
df[((df.name_1 == 'Dow Chemical (Shanghai) Co., Ltd.') | (df.name_2 == 'Dow Chemical (Shanghai) Co., Ltd.')) & (df.is_duplicate == 1)]

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
11987,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific Ltd.,1,dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0....",dow chemical,"[-0.05168405920267105, -0.21086931228637695, -..."
84089,Dow Chemical International Private Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.11826933175325394, -0.14124689996242523, -...",dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0...."
145164,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical International Private Ltd.,1,dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0....",dow chemical,"[-0.11826933175325394, -0.14124689996242523, -..."
223104,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical,1,dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0....",dow chemical,"[-0.49079570174217224, 0.14337339997291565, -0..."
223944,Dow Chemical Pacific Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.05168405920267105, -0.21086931228637695, -...",dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0...."
286950,Dow Chemical Pacific,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.2712400257587433, -0.2303997278213501, -0....",dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0...."
381110,Dow Chemical Thailand Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.2474443018436432, 0.19269879162311554, -0....",dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0...."
388593,Dow Chemical,"Dow Chemical (Shanghai) Co., Ltd.",1,dow chemical,"[-0.49079570174217224, 0.14337339997291565, -0...",dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0...."
422738,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific,1,dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0....",dow chemical,"[-0.2712400257587433, -0.2303997278213501, -0...."
493048,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Thailand Ltd.,1,dow chemical,"[-0.3575917184352875, -0.0433979295194149, -0....",dow chemical,"[-0.2474443018436432, 0.19269879162311554, -0...."


### Calculate metrics

In [95]:
# Select test dataset
df_isdup = df[df.is_duplicate == 1]
names = np.unique(np.concatenate([df_isdup.name_1, df_isdup.name_2]))

In [96]:
names

array([' Alfagomma', ' CONTITECH TRANSPORTBANDSYSTEME GMBH',
       ' SO.F.TER. SPA', ..., 'ФИЛИАЛ КОМПАНИИ ЭКСОН НЕФТЕГАЗ ЛИМИТЕД',
       'ФИЛИАЛ КОМПАНИИ"ЭКСОН НЕФТЕГАЗ ЛИМИТЕД"', 'ХИМИНВЕСТ ГРУПП, ООО'],
      dtype=object)

Our main evaluation metric is Precision@1 because this is a retrieval task

In [97]:
def precision_score_at_k(k, names, unique_groups, model, df_embs, col_name, debug, preproc_text=False):
    """Collect the queries whose top search hit belongs to the same company group.

    Only the first hit is evaluated; ``k`` is merely forwarded to ``inference``
    as the search limit (precision at k > 1 is not implemented).

    Args:
        k: Search limit passed to ``inference``.
        names: Iterable of query company names.
        unique_groups: Iterable of groups (collections of names). A query is
            judged against the first group that contains it.
        model: Embedding model forwarded to ``inference``.
        df_embs: Embedding cache forwarded to ``inference``.
        col_name: Qdrant collection name.
        debug: If true, print diagnostics and stop after the first query.
        preproc_text: Forwarded to ``inference``.

    Returns:
        List of ``(query_name, found_name)`` tuples for the successful queries.
        Precision@1 is ``len(result) / len(names)``.
    """
    # Map every name to the first group containing it, which is what a linear
    # scan with `break` would pick, but built once instead of per query.
    group_of = {}
    for ugroup in unique_groups:
        for member in ugroup:
            group_of.setdefault(member, ugroup)

    scores = []
    for q_name in tqdm(names):
        res = inference(q_name, col_name, model, limit=k, df_embs=df_embs, preproc_text=preproc_text, debug=debug)
        # An empty search result is a miss rather than an IndexError.
        found_name = res[0].payload['original_name'] if res else None
        if debug:
            print(res)
            print(f'q_name {q_name}')
            print(f'found_name {found_name}')

        ugroup = group_of.get(q_name)
        if ugroup is not None:
            if debug:
                print(ugroup)
            if found_name is not None and found_name in ugroup:
                scores.append((q_name, found_name))
                if debug:
                    print('GOOD')

        if debug:
            break

    return scores

Function that groups together the names that belong to the same company.
Each group is a list of the different company names that represent one and the same company.

In [98]:
# DATASET GROUPINGS
def get_unique_groups(names, df_isdup):
    """Group company names into sets of aliases of the same company.

    Two names belong to the same group when they are linked, directly or
    through any chain of intermediate names, by rows of ``df_isdup``. Following
    the full chain (rather than a fixed number of hops) guarantees that every
    name ends up in exactly one group.

    Args:
        names: Iterable of company names to group.
        df_isdup: DataFrame with ``name_1`` and ``name_2`` columns in which
            every row links two names of the same company.

    Returns:
        List of groups, each a sorted list of names, in order of first
        appearance in ``names``. A name without any link forms a
        single-element group.
    """
    # Undirected adjacency map built in one pass over the duplicate pairs.
    neighbours = {}
    for left, right in zip(df_isdup.name_1.tolist(), df_isdup.name_2.tolist()):
        neighbours.setdefault(left, set()).add(right)
        neighbours.setdefault(right, set()).add(left)

    unique_groups = []
    assigned = set()
    for company_name in tqdm(names):
        if company_name in assigned:
            continue
        # Collect everything reachable from this name.
        component = {company_name}
        frontier = [company_name]
        while frontier:
            current = frontier.pop()
            for other in neighbours.get(current, ()):
                if other not in component:
                    component.add(other)
                    frontier.append(other)
        assigned.update(component)
        unique_groups.append(sorted(component))

    return unique_groups

In [99]:
unique_groups = get_unique_groups(names, df[df.is_duplicate == 1])

100%|██████████| 1394/1394 [00:23<00:00, 59.98it/s] 


In [100]:
df_embs[df_embs.name == names[0]]

Unnamed: 0,name,name_preproc,emb
16508,Alfagomma,alfagomma,"[-0.13865633308887482, -0.2346138060092926, -0..."


In [101]:
# Evaluate top-1 retrieval for every name that occurs in a duplicate pair.
# `map_score` receives the (query, hit) pairs that fell into the same group.
map_score = precision_score_at_k(1,
                                 names,
                                 unique_groups,
                                 model,
                                 df_embs,  # query vectors are looked up by raw name
                                 col_name,
                                 debug=False, preproc_text=False
                                 )

100%|██████████| 1394/1394 [01:39<00:00, 14.03it/s]


In [102]:
print(f'Precision@1={len(map_score) / len(names)}')

Precision@1=0.5839311334289814


# Qdrant + Preprocess
All same as above but with preprocessed company names.
Embeddings are calculated from preprocessed names.

In [103]:
df_emb = pd.read_parquet(f'../data/df_embs_preproc_{num_emb}.parquet')

In [104]:
df = pd.read_parquet('../data/all_lang_train.parquet')

In [105]:
df

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,Powermax Rubber Factory,Co. One,0
4,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


In [106]:
# Attach the embedding of `name_1` (computed from the preprocessed name)
# to every pair; the join key `name` is dropped afterwards.
df = (
    df.merge(df_emb, how='left', left_on='name_1', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_1'})
)

In [107]:
# Attach the embedding of `name_2` (computed from the preprocessed name)
# to every pair; the join key `name` is dropped afterwards.
df = (
    df.merge(df_emb, how='left', left_on='name_2', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_2'})
)

In [108]:
df

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,iko,"[-0.19439682364463806, 0.25839725136756897, -0...",enormous trade,"[-0.010587654076516628, -0.43423140048980713, ..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,apcotex,"[-0.3500242829322815, 0.05114082247018814, -0....",technocraft,"[0.03335000202059746, 0.18506525456905365, 0.0..."
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0,rishichem distributors,"[-0.11705498397350311, -0.367933452129364, -0....",dsa,"[-0.23276162147521973, -0.5193449854850769, -0..."
3,Powermax Rubber Factory,Co. One,0,powermax rubber factory,"[-0.4058910608291626, 0.42498621344566345, 0.0...",one,"[-0.1291559636592865, 0.176286518573761, -0.25..."
4,Tress A/S,Longyou Industries Park Zhejiang,0,tress,"[-0.3934316337108612, -0.05785433202981949, 0....",longyou park zhejiang,"[0.5160265564918518, 0.605634331703186, 0.1857..."
...,...,...,...,...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0,bit mat products,"[-0.1815347820520401, -0.16408121585845947, 0....",the goodyear tire and rubber company,"[-0.10538887977600098, 0.2958035469055176, -0...."
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0,bnd trading,"[-0.380824476480484, 0.19009342789649963, -0.1...",zhong shan yue liang economy& trade,"[-0.13692092895507812, 0.039321813732385635, 0..."
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0,xeikon of dongguan city,"[-0.004541510250419378, 0.15645372867584229, 0...",yi cheng trading of dongguan city,"[-0.08541423082351685, -0.10143561661243439, -..."
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0,shanghai kechuan trading,"[-0.10921867936849594, -0.3104268014431, -0.23...",shanghai m&g stationery,"[0.00526794046163559, 0.21739764511585236, 0.1..."


In [109]:
np.stack(df.emb_1[:2]).shape

(2, 384)

In [110]:
df_names = pd.read_parquet('../data/df_names_preproc.parquet')

In [111]:
df_names = df_names.merge(df_emb[['name', 'emb']], how='left', left_on='Names', right_on='name')
df_names = df_names.drop(columns=['Names', 'languages_langdetect'])
df_names = df_names.rename({'name': 'original_name', 'name_preproc':'preprocessed_name'}, axis=1)

In [112]:
df_names

Unnamed: 0,preprocessed_name,original_name,emb
0,iko,Iko Industries Ltd.,"[-0.19439682364463806, 0.25839725136756897, -0..."
1,apcotex,Apcotex Industries Ltd.,"[-0.3500242829322815, 0.05114082247018814, -0...."
2,rishichem distributors,"Rishichem Distributors Pvt., Ltd.","[-0.11705498397350311, -0.367933452129364, -0...."
3,powermax rubber factory,Powermax Rubber Factory,"[-0.4058910608291626, 0.42498621344566345, 0.0..."
4,tress,Tress A/S,"[-0.3934316337108612, -0.05785433202981949, 0...."
...,...,...,...
18017,plastic packaging,Plastic Packaging (Pty) Ltd.,"[-0.25402212142944336, -0.17969439923763275, 0..."
18018,hengshui mechanical & electrical building,'Hengshui Mechanical & Electrical Building Co....,"[0.19610990583896637, 0.4086482524871826, -0.1..."
18019,jiangsu baoli investment,"Jiangsu Baoli International Investment Co., Lt...","[0.07093257457017899, 0.21796780824661255, -0...."
18020,lanxess ag,Lanxess AG,"[-0.13746507465839386, -0.15307718515396118, 0..."


### Add vectors to qdrant

In [None]:
# !docker pull qdrant/qdrant

In [None]:
# !docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [113]:
# Connect to the locally running Qdrant server on its published port.
# '0.0.0.0' is a bind-all *listen* address rather than a destination address:
# it happens to reach loopback on Linux but fails on other platforms,
# so connect to 'localhost' instead.
qdrant_client = QdrantClient(host='localhost', port=6333)

In [114]:
# Build vectors, payload and ids from the SAME frame so that row i of each one
# describes the same company. Taking the payload from a separately merged frame
# silently relies on both frames having identical row order and row count.
# NOTE(review): assumes df_emb.name_preproc holds the same preprocessed names
# as df_names.preprocessed_name - confirm.
vectors = np.stack(df_emb.emb)
payload = (
    df_emb.rename(columns={'name': 'original_name', 'name_preproc': 'preprocessed_name'})
    [['original_name', 'preprocessed_name']]
    .to_dict(orient='records')
)
ids = df_emb.index.values.tolist()
col_name = 'companies'
vec_shape = vectors.shape[1]
bs = 1024
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [115]:
model = SentenceTransformer(emb_variations[num_emb])

In [116]:
inference('Dow Chemical International Private Ltd.', col_name, model, preproc_text=True)

[ScoredPoint(id=2892, version=0, score=0.99999994, payload={'original_name': 'Dow Chemical', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=608, version=1, score=0.99999994, payload={'original_name': 'Dow Chemical Thailand Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=6647, version=10, score=0.99999994, payload={'original_name': 'Dow Chemical (Shanghai) Co., Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=9176, version=11, score=0.99999994, payload={'original_name': 'Dow Chemical Pacific Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=3748, version=4, score=0.99999994, payload={'original_name': 'Dow Chemical Pacific', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=8733, version=11, score=0.8916837, payload={'original_name': 'The Dow Chemical Company', 'preprocessed_name': 'the dow chemical company'}, vector=None),
 ScoredPoint(id=11225, version=6, score=0.70929, 

In [117]:
# Select test dataset
df_isdup = df[df.is_duplicate == 1]
names = np.unique(np.concatenate([df_isdup.name_1, df_isdup.name_2]))

In [118]:
df_isdup[df_isdup.name_2 == 'A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
18065,A.P.I.,A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.,1,api,"[-0.027601545676589012, -0.3346496522426605, -...",api applicazioni plastiche i,"[-0.26109832525253296, -0.33632346987724304, -..."
277795,API,A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.,1,api,"[-0.027601545676589012, -0.3346496522426605, -...",api applicazioni plastiche i,"[-0.26109832525253296, -0.33632346987724304, -..."
431643,Trinseo API,A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.,1,trinseo api,"[-0.6698942184448242, -0.1688176840543747, 0.0...",api applicazioni plastiche i,"[-0.26109832525253296, -0.33632346987724304, -..."


In [None]:
df_isdup[df_isdup.name_1 == 'A.P.I. Applicazioni Plastiche Industriali SPA']

In [None]:
df_isdup[df_isdup.name_1 == 'API']

In [119]:
unique_groups = get_unique_groups(names, df_isdup)

100%|██████████| 1394/1394 [00:21<00:00, 65.23it/s] 


In [120]:
unique_groups

[[' Alfagomma', 'ALFAGOMMA INDUSTRIAL SPA'],
 [' CONTITECH TRANSPORTBANDSYSTEME GMBH', 'ContiTech Thermopol Inc.'],
 [' SO.F.TER. SPA', 'Softer Us Inc.'],
 [' TOTAL OIL INDIA PRIVATE LIMITED,  TOTAL',
  'Total Oil India Limited',
  'Total Oil India Private Limited'],
 ['*** ПОЛИМАРКЕТ, ООО', 'ООО "Полимаркет"', 'ООО Полимаркет'],
 ['A. WESTENSEE & PARTNER ROHSTOFF GMBH', 'awp-rohstoffe'],
 ['A.P.I.',
  'A.P.I. APPLICAZIONI PLASTICHE INDUSTRIALI S.P.A.',
  'A.P.I. Applicazioni Plastiche Industriali SPA',
  'API',
  'Trinseo API'],
 ['ABENA INTERNATIONAL A/S', 'ООО"АБЕНА"'],
 ['ABRO INDUSTRIES, INC.', 'ООО "АБРО ИНДАСТРИС"'],
 ['ADI (SALAMBO)', 'ADI COMMERCE', 'ADI commerce ltd'],
 ['ADRIATICA BITUMI', 'ADRIATICA BITUMI S.P.A.', 'Adriatica Bitumi Spa'],
 ['AGILENT TECHNOLOGIES MFG GMBH & SHIPPING DEPARTMENT',
  'ООО "АДЖИЛЕНТ ТЕКНОЛОДЖИЗ"'],
 ['AGIP',
  'AGIP  ( ENI GROUP)',
  'Azienda Generale Italiana Petroli',
  'agip spa'],
 ['ALBEMARLE GREEN CREST ',
  'Albemarle Corporation',
  'Al

In [161]:
df[df.name_2 == 'SO.F.TER.']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
1328,Lohmann GmbH & Co. KG,SO.F.TER.,0,lohmann & kg,"[-0.00794435478746891, 0.024964027106761932, 0...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
4332,ANKARA INSAAT TICARET ve SANAYI LIMITED SIRKETI,SO.F.TER.,0,ankara insaat ticaret ve sanayi limited sirketi,"[0.024160169064998627, -0.051298387348651886, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
12029,POLYMERTEAM,SO.F.TER.,0,polymerteam,"[-0.007658825255930424, 0.023627055808901787, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
13804,Joss Holding B.V.,SO.F.TER.,0,joss holding bv,"[0.0007745528127998114, 0.012520086951553822, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
22179,MCASPHALT,SO.F.TER.,0,mcasphalt,"[0.049031779170036316, -0.008015838451683521, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
...,...,...,...,...,...,...,...
483258,PAVIMENTAL,SO.F.TER.,0,pavimental,"[-0.004661908373236656, -0.006784559227526188,...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
485337,PROTRADE,SO.F.TER.,0,protrade,"[0.000575021025724709, -0.023494573310017586, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
490856,PORR,SO.F.TER.,0,porr,"[0.011393015272915363, -0.007374722044914961, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."
492066,Rompetrol Rafinare S.A.,SO.F.TER.,0,rompetrol rafinare,"[0.005357269197702408, -0.008388257585465908, ...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."


In [163]:
df[df.name_2 == ' SO.F.TER. SPA']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
86652,Softer Us Inc.,SO.F.TER. SPA,1,softer us,"[-0.09364961832761765, -0.05149330943822861, 0...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."


In [56]:
names[names == 'Fenner Dunlop (Toledo), Llc']

array(['Fenner Dunlop (Toledo), Llc'], dtype=object)

In [197]:
df[df.name_2 == ' SO.F.TER. SPA']

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
86652,Softer Us Inc.,SO.F.TER. SPA,1,softer us,"[-0.09364961832761765, -0.05149330943822861, 0...",softer,"[-0.06651424616575241, -0.02606678567826748, 0..."


In [186]:
df[(df.name_2 == 'Softer Us Inc.') & (df.is_duplicate != 1)]

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2


In [83]:
preproc('ALIBESA', stopwords)

'alibesa'

In [25]:
names_test = [n[0] for n in unique_groups]

In [None]:
unique_groups = get_unique_groups(names, df_isdup)

In [121]:
# Evaluate top-1 retrieval with preprocessed queries.
# With preproc_text=True the query is normalised before the cache lookup, and the
# lookup matches on the `name` column; hence the rename below exposes the
# preprocessed names under `name` (the raw names are parked under `tmp`).
map_score = precision_score_at_k(1,
                                 names,
                                 unique_groups,
                                 model,
                                 df_emb.rename({'name':'tmp', 'name_preproc': 'name'}, axis=1),
                                 col_name,
                                 debug=False, preproc_text=True
                                 )

100%|██████████| 1394/1394 [01:37<00:00, 14.27it/s]


In [122]:
print(f'Precision@1={len(map_score) / len(names)}')

Precision@1=0.6162123385939742


# Experiment 3
# Quaterion
Here we fine-tune pretrained sentence-embedding models with Quaterion

In [123]:
df_emb = pd.read_parquet(f'../data/df_embs_preproc_{num_emb}.parquet')

In [124]:
df = pd.read_parquet('../data/all_lang_train.parquet')

In [125]:
# Attach the embedding of `name_1` to every pair; the join key `name`
# is dropped afterwards.
df = (
    df.merge(df_emb, how='left', left_on='name_1', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_1'})
)

In [126]:
# Attach the embedding of `name_2` to every pair; the join key `name`
# is dropped afterwards.
df = (
    df.merge(df_emb, how='left', left_on='name_2', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_2'})
)

In [127]:
df

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,iko,"[-0.19439682364463806, 0.25839725136756897, -0...",enormous trade,"[-0.010587654076516628, -0.43423140048980713, ..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,apcotex,"[-0.3500242829322815, 0.05114082247018814, -0....",technocraft,"[0.03335000202059746, 0.18506525456905365, 0.0..."
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0,rishichem distributors,"[-0.11705498397350311, -0.367933452129364, -0....",dsa,"[-0.23276162147521973, -0.5193449854850769, -0..."
3,Powermax Rubber Factory,Co. One,0,powermax rubber factory,"[-0.4058910608291626, 0.42498621344566345, 0.0...",one,"[-0.1291559636592865, 0.176286518573761, -0.25..."
4,Tress A/S,Longyou Industries Park Zhejiang,0,tress,"[-0.3934316337108612, -0.05785433202981949, 0....",longyou park zhejiang,"[0.5160265564918518, 0.605634331703186, 0.1857..."
...,...,...,...,...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0,bit mat products,"[-0.1815347820520401, -0.16408121585845947, 0....",the goodyear tire and rubber company,"[-0.10538887977600098, 0.2958035469055176, -0...."
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0,bnd trading,"[-0.380824476480484, 0.19009342789649963, -0.1...",zhong shan yue liang economy& trade,"[-0.13692092895507812, 0.039321813732385635, 0..."
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0,xeikon of dongguan city,"[-0.004541510250419378, 0.15645372867584229, 0...",yi cheng trading of dongguan city,"[-0.08541423082351685, -0.10143561661243439, -..."
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0,shanghai kechuan trading,"[-0.10921867936849594, -0.3104268014431, -0.23...",shanghai m&g stationery,"[0.00526794046163559, 0.21739764511585236, 0.1..."


In [128]:
df_names = pd.read_parquet('../data/df_names_preproc.parquet')

In [129]:
df_names = df_names.merge(df_emb[['name', 'emb']], how='left', left_on='Names', right_on='name')
df_names = df_names.drop(columns=['Names', 'languages_langdetect'])
df_names = df_names.rename({'name': 'original_name', 'name_preproc':'preprocessed_name'}, axis=1)

In [130]:
df_names

Unnamed: 0,preprocessed_name,original_name,emb
0,iko,Iko Industries Ltd.,"[-0.19439682364463806, 0.25839725136756897, -0..."
1,apcotex,Apcotex Industries Ltd.,"[-0.3500242829322815, 0.05114082247018814, -0...."
2,rishichem distributors,"Rishichem Distributors Pvt., Ltd.","[-0.11705498397350311, -0.367933452129364, -0...."
3,powermax rubber factory,Powermax Rubber Factory,"[-0.4058910608291626, 0.42498621344566345, 0.0..."
4,tress,Tress A/S,"[-0.3934316337108612, -0.05785433202981949, 0...."
...,...,...,...
18017,plastic packaging,Plastic Packaging (Pty) Ltd.,"[-0.25402212142944336, -0.17969439923763275, 0..."
18018,hengshui mechanical & electrical building,'Hengshui Mechanical & Electrical Building Co....,"[0.19610990583896637, 0.4086482524871826, -0.1..."
18019,jiangsu baoli investment,"Jiangsu Baoli International Investment Co., Lt...","[0.07093257457017899, 0.21796780824661255, -0...."
18020,lanxess ag,Lanxess AG,"[-0.13746507465839386, -0.15307718515396118, 0..."


In [131]:
# Connect to the locally running Qdrant server on its published port.
# '0.0.0.0' is a bind-all *listen* address rather than a destination address:
# it happens to reach loopback on Linux but fails on other platforms,
# so connect to 'localhost' instead.
qdrant_client = QdrantClient(host='localhost', port=6333)

In [132]:
# Build vectors, payload and ids from the SAME frame so that row i of each one
# describes the same company. Taking the payload from a separately merged frame
# silently relies on both frames having identical row order and row count.
# NOTE(review): assumes df_emb.name_preproc holds the same preprocessed names
# as df_names.preprocessed_name - confirm.
vectors = np.stack(df_emb.emb)
payload = (
    df_emb.rename(columns={'name': 'original_name', 'name_preproc': 'preprocessed_name'})
    [['original_name', 'preprocessed_name']]
    .to_dict(orient='records')
)
ids = df_emb.index.values.tolist()
col_name = 'companies'
vec_shape = vectors.shape[1]
bs = 1024
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [133]:
model = SentenceTransformer(emb_variations[num_emb])

In [134]:
inference('Dow Chemical International Private Ltd.', col_name, model, preproc_text=True)

[ScoredPoint(id=2892, version=4, score=0.99999994, payload={'original_name': 'Dow Chemical', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=9176, version=7, score=0.99999994, payload={'original_name': 'Dow Chemical Pacific Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=608, version=2, score=0.99999994, payload={'original_name': 'Dow Chemical Thailand Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=3748, version=0, score=0.99999994, payload={'original_name': 'Dow Chemical Pacific', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=6647, version=9, score=0.99999994, payload={'original_name': 'Dow Chemical (Shanghai) Co., Ltd.', 'preprocessed_name': 'dow chemical'}, vector=None),
 ScoredPoint(id=8733, version=7, score=0.8916837, payload={'original_name': 'The Dow Chemical Company', 'preprocessed_name': 'the dow chemical company'}, vector=None),
 ScoredPoint(id=11225, version=11, score=0.70929, pa

In [135]:
# Select test dataset
df_isdup = df[df.is_duplicate == 1]
names = np.unique(np.concatenate([df_isdup.name_1, df_isdup.name_2]))

In [136]:
unique_groups = get_unique_groups(names, df[df.is_duplicate == 1])

100%|██████████| 1394/1394 [00:18<00:00, 73.48it/s] 


In [137]:
import json
import uuid

# Dump one JSON object per company name; all names of a group share a freshly
# generated UUID stored under 'group'.
with open('../data/unique_groups.jsonl', 'w', encoding='utf-8') as out_file:
    for members in unique_groups:
        group_uid = str(uuid.uuid4())
        for original in members:
            record = {
                'original_name': original,
                'preprocessed_name': preproc(original, stopwords),
                'group': group_uid,
            }
            out_file.write(json.dumps(record, ensure_ascii=False) + '\n')

In [138]:
import os
import random
import json
from typing import Any, Dict, List, Union
import pytorch_lightning as pl
import torch
from quaterion.eval.attached_metric import AttachedMetric
from quaterion.eval.pair import RetrievalPrecision, RetrievalReciprocalRank
from quaterion_models.encoders import Encoder
from quaterion_models.heads import EncoderHead, GatedHead
from quaterion_models.types import CollateFnType
from torch.utils.data import Dataset
from quaterion import Quaterion, TrainableModel
from quaterion.dataset.similarity_data_loader import (
    GroupSimilarityDataLoader,
    SimilarityGroupSample,
)
from quaterion.loss import SimilarityLoss, SoftmaxLoss
from sentence_transformers import SentenceTransformer
random.seed(42)

Dataset class for the company names. Each `__getitem__` call must return either a `SimilarityGroupSample` or a `SimilarityPairSample`; we use `SimilarityGroupSample` because the companies are organised into groups.

In [139]:
class CompaniesDataset(Dataset):
    """Company-name records read from a JSON Lines file, labelled by group.

    Each non-blank line of the file must be a JSON object containing a ``"group"``
    key. The group values are mapped to consecutive integer labels, which are what
    ``__getitem__`` attaches to every sample.

    Args:
        path: Location of the ``.jsonl`` file.
        max_samples: Upper bound on the number of records kept; the first
            ``max_samples`` non-blank lines of the file are used.
    """

    def __init__(self, path: str, max_samples: int = 50000):
        super().__init__()
        with open(path, "r", encoding="utf8") as f:
            # Blank lines (e.g. a trailing newline) are not valid JSON, so drop them
            # before truncating instead of letting json.loads fail on them.
            lines = [line for line in f if line.strip()][:max_samples]
        # Shuffles with the module-level `random` generator (seeded in the imports cell).
        random.shuffle(lines)
        self.data = [json.loads(line) for line in lines]

        # Deduplicate first, then sort. The previous `set(sorted(...))` discarded the
        # ordering again, so the label -> index mapping followed string-hash order and
        # could differ between interpreter sessions.
        _company_groups = sorted({item["group"] for item in self.data})
        self._label2idx = {label: idx for idx, label in enumerate(_company_groups)}

    def __getitem__(self, index: int) -> "SimilarityGroupSample":
        """Return the record at ``index`` wrapped with its integer group label."""
        item = self.data[index]
        return SimilarityGroupSample(obj=item, group=self._label2idx[item["group"]])

    def __len__(self) -> int:
        """Number of loaded records."""
        return len(self.data)

    def get_num_groups(self) -> int:
        """Number of distinct group labels among the loaded records."""
        return len(self._label2idx)

Quaterion Encoder class with SentenceTransformer embedding model

In [140]:
class CompanyEncoder(Encoder):
    """Frozen Quaterion encoder that wraps a SentenceTransformer checkpoint.

    Args:
        pretrained_name: Model name or local directory accepted by
            ``SentenceTransformer(...)``.
    """

    def __init__(self, pretrained_name: str):
        super().__init__()
        self.encoder = SentenceTransformer(pretrained_name)
        self._pretrained_name = pretrained_name

    @property
    def trainable(self) -> bool:
        """The wrapped transformer is reported as not trainable."""
        return False

    @property
    def embedding_size(self) -> int:
        """Dimensionality reported by the wrapped SentenceTransformer."""
        return self.encoder.get_sentence_embedding_dimension()

    def get_collate_fn(self) -> CollateFnType:
        """Return the function that turns a raw batch into a list of texts."""
        return self.extract_texts

    def extract_texts(self, batch: List[Union[str, Dict[str, Any]]]):
        """Convert a batch of strings or record dicts into a list of strings.

        Dict items contribute their ``"preprocessed_name"`` value; string items are
        passed through unchanged. The type of the first item decides the branch.

        Raises:
            TypeError: If the first item is neither a string nor a dict.
        """
        if isinstance(batch[0], str):
            return batch
        elif isinstance(batch[0], dict):
            return [item["preprocessed_name"] for item in batch]
        else:
            raise TypeError("Expecting list of strings or dicts as inputs")

    def forward(self, inputs):
        """Embed ``inputs`` with ``SentenceTransformer.encode`` and return a tensor."""
        return self.encoder.encode(
            inputs, convert_to_numpy=False, convert_to_tensor=True
        )

    def save(self, output_path: str):
        """Write the SentenceTransformer checkpoint directly into ``output_path``.

        Earlier versions saved into ``<output_path>/<pretrained_name>`` while ``load``
        read from ``input_path`` itself, so a saved encoder could not be loaded back.
        Saving to ``output_path`` keeps ``save`` and ``load`` symmetric.
        """
        self.encoder.save(output_path)

    @classmethod
    def load(cls, input_path: str) -> "Encoder":
        """Re-create the encoder from a directory previously passed to ``save``.

        Directories written with the older nested layout are still accepted.
        """
        return cls(cls._locate_checkpoint(input_path))

    @staticmethod
    def _locate_checkpoint(input_path: str) -> str:
        """Return the directory under ``input_path`` that holds the checkpoint.

        ``modules.json`` is used as the marker of a saved SentenceTransformer. If
        ``input_path`` is not a directory, already contains the marker, or no marker
        is found anywhere below it, ``input_path`` is returned unchanged.
        """
        marker = "modules.json"
        if not os.path.isdir(input_path) or os.path.isfile(os.path.join(input_path, marker)):
            return input_path
        # Legacy layout: the checkpoint lives in a sub-directory named after the model.
        for root, _dirs, files in os.walk(input_path):
            if marker in files:
                return root
        return input_path


Main Quaterion similarity-learning model.
We use `GatedHead` and `SoftmaxLoss`. Both choices are subject to tuning and may change in future work.

In [141]:
class CompanyMatchingModel(TrainableModel):
    """Quaterion trainable model: a CompanyEncoder, a GatedHead and a SoftmaxLoss.

    Args:
        pretrained_name: Checkpoint name handed to ``CompanyEncoder``.
        num_groups: Number of groups passed to ``SoftmaxLoss``.
        lr: Learning rate for the model parameters; the loss parameters use ten
            times this value.
    """

    def __init__(
        self,
        pretrained_name: str = emb_variations[num_emb],
        num_groups: int = 20,
        lr: float = 3e-5,
    ):
        # The settings are stored before calling the parent constructor, presumably
        # because TrainableModel.__init__ invokes the configure_* hooks that read
        # them — confirm against the Quaterion implementation.
        self._lr = lr
        self._num_groups = num_groups
        self._pretrained_name = pretrained_name
        super().__init__()

    # Batch-wise retrieval metrics, currently disabled:
    # def configure_metrics(self):
    #     # attach batch-wise metrics which will be automatically computed and logged during training
    #     return [
    #         AttachedMetric(
    #             "RetrievalPrecision",
    #             RetrievalPrecision(k=1),
    #             prog_bar=True,
    #             on_epoch=True,
    #         ),
    #         AttachedMetric(
    #             "RetrievalReciprocalRank",
    #             RetrievalReciprocalRank(),
    #             prog_bar=True,
    #             on_epoch=True
    #         ),
    #     ]

    def configure_encoders(self) -> Union[Encoder, Dict[str, Encoder]]:
        """Build the single encoder from the configured checkpoint name."""
        return CompanyEncoder(self._pretrained_name)

    def configure_head(self, input_embedding_size) -> EncoderHead:
        """Build a GatedHead sized to the encoder output."""
        return GatedHead(input_embedding_size)

    def configure_loss(self) -> SimilarityLoss:
        """Build a SoftmaxLoss over the head output size and the number of groups."""
        return SoftmaxLoss(self.model.head.output_size, self._num_groups)

    def configure_optimizers(self):
        """Return an Adam optimizer with separate parameter groups for model and loss."""
        model_group = {"params": self.model.parameters(), "lr": self._lr}
        loss_group = {"params": self.loss.parameters(), "lr": self._lr * 10.0}
        return torch.optim.Adam([model_group, loss_group])

In [142]:
# Checkpoint name used for fine-tuning: the `emb_variations` entry selected by `num_emb`.
pretrain = emb_variations[num_emb]

In [143]:
# Load the grouped company names written to unique_groups.jsonl (default max_samples applies).
cd = CompaniesDataset(path='../data/unique_groups.jsonl')

In [144]:
# Stand-alone encoder instance built from the same checkpoint.
# NOTE(review): `enc` is not referenced by the cells visible below, and
# CompanyMatchingModel.configure_encoders builds its own CompanyEncoder — confirm
# whether this cell is still needed.
enc = CompanyEncoder(pretrained_name=pretrain)

In [145]:
# Trainable model whose loss is sized to the number of groups found in the dataset.
ftmodel = CompanyMatchingModel(pretrained_name=pretrain, num_groups=cd.get_num_groups())

In [88]:
#ftmodel.eval()

CompanyMatchingModel(
  (_model): SimilarityModel(
    (default): CompanyEncoder(
      (encoder): SentenceTransformer(
        (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
        (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
        (2): Normalize()
      )
    )
    (head): GatedHead(
      (dropout): Identity()
    )
  )
  (_loss): SoftmaxLoss()
)

In [146]:
# Quaterion group-similarity loader over the company dataset: shuffled, 64 samples per batch.
train_dataloader = GroupSimilarityDataLoader(cd, batch_size=64, shuffle=True)

In [147]:
# PyTorch Lightning trainer: automatic accelerator choice, one device on one node, 30 epochs at most.
trainer = pl.Trainer(accelerator="auto", devices=1, num_nodes=1, max_epochs=30)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Run fine-tuning with the trainer and training loader defined above.
# No validation dataloader is passed.
Quaterion.fit(
    trainable_model=ftmodel,
    trainer=trainer,
    train_dataloader=train_dataloader,
)

  rank_zero_deprecation(
  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type            | Params
-------------------------------------------
0 | _model | SimilarityModel | 33.4 M
1 | _loss  | SoftmaxLoss     | 189 K 
-------------------------------------------
189 K     Trainable params
33.4 M    Non-trainable params
33.5 M    Total params
134.199   Total estimated model params size (MB)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Save Quaterion model for serving

In [None]:
# Export the model to ../models/companies_<num_emb>; the evaluation cells load it from this path.
ftmodel.save_servable(f"../models/companies_{num_emb}")

## Eval similarity model

In [None]:
from quaterion_models import SimilarityModel

# Value passed as `batch_size` to `model.encode` inside `eval_model`.
BATCH_SIZE = 32

In [None]:
def eval_model(dataloader, model):
    """Encode every item yielded by ``dataloader`` and stack the results.

    Args:
        dataloader: Iterable yielding either batches (sequences of inputs) or single
            strings, e.g. a pandas Series of names.
        model: Object exposing ``encode(inputs, batch_size=..., to_numpy=...)``,
            such as a loaded ``SimilarityModel``.

    Returns:
        np.ndarray: The per-item arrays returned by ``encode``, concatenated along
        the first axis.

    Raises:
        ValueError: If ``dataloader`` yields nothing (raised by ``np.concatenate``).
    """
    embeddings = []

    with torch.no_grad():
        for el in tqdm(dataloader):
            # A bare string is itself iterable; as far as I can tell
            # SimilarityModel.encode treats any iterable as a batch, so a string
            # would be embedded character by character. Wrap it as a one-element
            # batch so that each name yields exactly one row.
            if isinstance(el, str):
                el = [el]
            embeddings.append(
                model.encode(el, batch_size=BATCH_SIZE, to_numpy=True)
            )

    return np.concatenate(embeddings)


def serve_tuned_embeddings(dataloader, model_path):
    """Load the saved similarity model and embed everything in ``dataloader``.

    Args:
        dataloader: Iterable forwarded unchanged to ``eval_model``.
        model_path: Directory given to ``SimilarityModel.load``.

    Returns:
        Whatever ``eval_model`` returns for the loaded model.
    """
    tuned_model = SimilarityModel.load(model_path)
    tuned_model.eval()

    # Use the GPU when one is available, otherwise stay on the CPU.
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    tuned_model.to(target_device)

    return eval_model(dataloader, tuned_model)

Save serving embedding to numpy object

In [None]:
# Embed the preprocessed names with the fine-tuned model. The Series itself is passed
# as the `dataloader`, so `eval_model` iterates over it one element at a time.
embeddings = serve_tuned_embeddings(df_names.preprocessed_name, f'../models/companies_{num_emb}')

In [None]:
# np.save does not create missing directories, so make sure the target folder exists
# before writing the fine-tuned embeddings.
tuned_dir = '../data/tuned/'
os.makedirs(tuned_dir, exist_ok=True)
np.save(os.path.join(tuned_dir, f"tuned_{num_emb}.npy"), embeddings, allow_pickle=False)

In [None]:
# Read the embeddings back from the file written by the previous cell.
embeddings = np.load(f'../data/tuned/tuned_{num_emb}.npy')

In [None]:
embeddings.shape

### Load fine-tuned embeddings to qdrant

In [None]:
# Connect to the Qdrant instance on port 6333.
# NOTE(review): '0.0.0.0' is normally a bind address; connecting to it is not
# portable across operating systems — consider 'localhost' / '127.0.0.1'.
qdrant_client = QdrantClient(host='0.0.0.0', port=6333)

In [None]:
# Upload the fine-tuned embeddings, with their name payloads, to the 'companies-ft' collection.
vectors = embeddings
payload = df_names[['original_name', 'preprocessed_name']].to_dict(orient='records')
# NOTE(review): `ids` is not reassigned in this cell (the line below is commented
# out), so the value left over from an earlier cell is reused — confirm it still
# lines up row-for-row with `vectors` and `payload`.
# ids = df_emb.index.values.tolist()
col_name = 'companies-ft'
vec_shape = vectors.shape[1]
bs = 1024
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [None]:
# Load the fine-tuned servable model for interactive queries and switch it to eval mode.
model = SimilarityModel.load(f'../models/companies_{num_emb}')
model.eval()
# Fall back to the CPU when no GPU is present instead of hard-coding 'cuda'
# (same rule as in serve_tuned_embeddings).
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Same example query as for the baseline, now using the fine-tuned `model` and the
# collection currently named by `col_name`.
inference('Dow Chemical International Private Ltd.', col_name, model, preproc_text=True)

In [None]:
# Score retrieval at k=1 over all duplicate-pair names with the fine-tuned model.
# The frame passed in is `df_emb` with `name_preproc` renamed to `name` and the
# original `name` column moved to `tmp`.
# NOTE(review): `precision_score_at_k` is defined earlier in the notebook; the
# trailing `#[19:]` looks like a leftover debugging slice.
map_score = precision_score_at_k(1,
                                 names,#[19:],
                                 unique_groups,
                                 model,
                                 df_emb.rename({'name':'tmp', 'name_preproc': 'name'}, axis=1),
                                 col_name,
                                 debug=False, preproc_text=True
                                 )

In [None]:
# Report len(map_score) / len(names) as Precision@1.
# NOTE(review): this assumes `precision_score_at_k` returns only the successful
# look-ups — confirm against its definition.
print(f'Precision@1={len(map_score) / len(names)}')