# Experiment 2
# Sentence embeddings and Qdrant Search

## Unprocessed dataset. Languages: EN

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import models
from tqdm import tqdm

### Load datasets

In [71]:
df_emb = pd.read_parquet('../data/df_embs_en.parquet')  # Embeddings from all-MiniLM-L6-v2 pretrain. Vector length is 384.

In [72]:
df = pd.read_parquet('../data/train.parquet')  # Only EN

In [73]:
df

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,Powermax Rubber Factory,Co. One,0
3,National Bank Of,Action International,0
4,R.I.Intl,"Rass Mfg. India Pvt., Ltd.",0
...,...,...,...
149729,Eoc Polymers India Private Ltd.,Imp. Express India Private Ltd.,0
149730,Societe Des Transports,Ho Sports Co. Inc.,0
149731,"Computime Electric (Shenzhen) Co., Ltd.",A S International,0
149732,"Shanghai Haizhige Furniture Co., Ltd.",Sharang Corporation,0


In [74]:
# Attach the stored embedding of `name_1` to every pair as column `emb_1`.
df = (
    df.merge(df_emb, how='left', left_on='name_1', right_on='name')
      .drop(columns=['name'])          # the join key duplicates `name_1`
      .rename(columns={'emb': 'emb_1'})
)

In [75]:
# Attach the stored embedding of `name_2` to every pair as column `emb_2`.
df = (
    df.merge(df_emb, how='left', left_on='name_2', right_on='name')
      .drop(columns=['name'])          # the join key duplicates `name_2`
      .rename(columns={'emb': 'emb_2'})
)

In [76]:
df

Unnamed: 0,name_1,name_2,is_duplicate,emb_1,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,"[-0.05934988334774971, 0.06893617659807205, 0....","[-0.05795738846063614, -0.02890898659825325, -..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,"[-0.027187837287783623, 0.0036133911926299334,...","[-0.045615799725055695, 0.012592996470630169, ..."
2,Powermax Rubber Factory,Co. One,0,"[-0.027730092406272888, 0.06848588585853577, -...","[-0.053144559264183044, -0.04593741148710251, ..."
3,National Bank Of,Action International,0,"[-0.018415303900837898, -0.006000360939651728,...","[-0.05674505606293678, -0.04642457142472267, -..."
4,R.I.Intl,"Rass Mfg. India Pvt., Ltd.",0,"[-0.07493618130683899, -0.0017950021428987384,...","[-0.07074024528265, 0.03370615467429161, -0.10..."
...,...,...,...,...,...
149729,Eoc Polymers India Private Ltd.,Imp. Express India Private Ltd.,0,"[-0.06773572415113449, -0.01392741035670042, -...","[-0.06258784979581833, 0.018974745646119118, 0..."
149730,Societe Des Transports,Ho Sports Co. Inc.,0,"[-0.03525467589497566, -0.03506492078304291, -...","[-0.06926314532756805, 0.06603555381298065, -0..."
149731,"Computime Electric (Shenzhen) Co., Ltd.",A S International,0,"[-0.05353596806526184, 0.057930838316679, 0.00...","[-0.03811235725879669, 0.02773694135248661, -0..."
149732,"Shanghai Haizhige Furniture Co., Ltd.",Sharang Corporation,0,"[-0.014642884023487568, 0.0201723612844944, 0....","[-0.009399804286658764, -0.002773549407720566,..."


In [77]:
np.stack(df.emb_1).shape

(149734, 384)

### Add vectors to qdrant

In [None]:
!docker pull qdrant/qdrant

In [None]:
!docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [9]:
df_embs = pd.read_parquet('../data/df_embs_en.parquet')

In [10]:
# Connect to the Qdrant container that the `docker run` cell publishes on port 6333.
# 'localhost' is used instead of '0.0.0.0': the latter is a bind-all listen address
# and is not a portable address to connect to.
qdrant_client = QdrantClient(host='localhost', port=6333)

In [68]:
def create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs):
    """Create the collection `col_name` and fill it with the given points.

    The collection is set up through `qdrant_client.recreate_collection` with
    cosine distance, vectors of size `vec_shape` and `on_disk_payload=True`.
    The points are then sent with `upload_collection` in batches of `bs`
    (with `parallel=6`).

    Parameters
    ----------
    vectors : vectors to store, passed to `upload_collection` as-is.
    payload : payload entries, passed to `upload_collection` as-is.
    ids : point ids, passed to `upload_collection` as-is.
    col_name : name of the collection.
    vec_shape : size of a single vector.
    bs : upload batch size.

    Returns
    -------
    Whatever `qdrant_client.get_collection(col_name)` returns after the upload.
    """
    vector_params = models.VectorParams(size=vec_shape, distance=models.Distance.COSINE)
    qdrant_client.recreate_collection(
        collection_name=col_name,
        vectors_config=vector_params,
        on_disk_payload=True,
    )

    upload_kwargs = {
        'collection_name': col_name,
        'vectors': vectors,
        'payload': payload,
        'ids': ids,
        'batch_size': bs,
        'parallel': 6,
    }
    qdrant_client.upload_collection(**upload_kwargs)

    return qdrant_client.get_collection(col_name)

In [69]:
# Inputs for the EN collection: one point per row of `df_embs`,
# with the row index as point id and the company name as payload.
col_name = 'companies-EN'
bs = 1024  # upload batch size

vectors = np.stack(df_embs['emb'])
vec_shape = vectors.shape[1]
payload = df_embs[['name']].to_dict(orient='records')
ids = df_embs.index.values.tolist()

In [70]:
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

### Similarity search with cosine distance

In [20]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [114]:
from experiments.preprocess import preproc, stopwords


def inference(company_name, model, limit=30, df_embs=None, preproc_text=False):
    """Search the Qdrant collection `col_name` for names similar to `company_name`.

    Parameters
    ----------
    company_name : str
        Name to search for.
    model :
        Object with an `encode(text)` method; used to embed the query only
        when `df_embs` is None.
    limit : int
        Maximum number of hits requested from Qdrant.
    df_embs : DataFrame or None
        Optional frame with `name` and `emb` columns. When given, the stored
        embedding of `company_name` is used instead of calling `model.encode`.
    preproc_text : bool
        When truthy, `company_name` is first passed through
        `preproc(company_name, stopwords)`. The default (False) leaves the
        name untouched.

    Returns
    -------
    The result of `qdrant_client.search`; points whose `name` payload field
    equals the (possibly preprocessed) query name are filtered out.

    Raises
    ------
    ValueError
        If `df_embs` is given but has no row for `company_name`.
    """
    # Truthiness check: the previous `is not None` test was always true for the
    # default `False`, so every query was preprocessed regardless of the flag.
    if preproc_text:
        company_name = preproc(company_name, stopwords)

    if df_embs is None:
        qvector = model.encode(company_name)
    else:
        stored = df_embs.loc[df_embs['name'] == company_name, 'emb']
        if len(stored) == 0:
            raise ValueError(f"no stored embedding for name {company_name!r}")
        # Several rows may share the name; the first one is used.
        qvector = np.asarray(stored.iloc[0])

    # Exclude the query itself so the top hit is a different company name.
    exclude_self = models.Filter(
        must_not=[
            models.FieldCondition(
                key="name",
                match=models.MatchValue(value=company_name)
            ),
        ]
    )
    search_result = qdrant_client.search(
        collection_name=col_name,
        query_vector=qvector,
        query_filter=exclude_self,
        limit=limit,
        offset=0
    )
    return search_result

In [54]:
inference('Dow Chemical (Shanghai) Co., Ltd.', model)

[-3.86481360e-02  3.25051323e-03 -4.05685641e-02  4.68646176e-02
  3.04586273e-02  1.18689779e-02  6.79872036e-02  3.58943567e-02
  3.76277417e-02 -8.06272924e-02  4.35762927e-02  1.09155392e-02
 -7.25863576e-02  1.50767795e-03 -6.77131712e-02 -4.66056690e-02
 -7.04153790e-04  2.95533035e-02  2.44878349e-03 -7.07464665e-02
 -9.96298343e-02 -9.28939581e-02  4.60370556e-02  3.87204438e-02
 -8.49876627e-02  7.70448446e-02 -2.08164603e-02  7.05066398e-02
  1.28157269e-02 -5.57344779e-02  3.96616757e-03  6.31402209e-02
  2.90050898e-02  2.62158103e-02  9.24626887e-02  2.24703662e-02
 -7.77371451e-02 -5.77038117e-02  2.63690539e-02  1.86589046e-03
  3.84277217e-02  9.97254811e-03  3.39368396e-02 -8.80111158e-02
  8.04794878e-02 -5.65073546e-03  2.58084526e-03 -1.59460101e-02
 -2.80876784e-03  1.83662064e-02 -4.16462794e-02  1.23528764e-02
 -1.65041853e-02 -1.19401729e-02  5.21688834e-02  5.41791692e-02
  2.27732453e-02 -5.71125336e-02 -2.85425633e-02  1.30495965e-03
  8.48037750e-03  2.27029

[ScoredPoint(id=2407, version=3, score=0.8766744, payload={'name': 'Dow Chemical Pacific Ltd.'}, vector=None),
 ScoredPoint(id=2405, version=3, score=0.8710414, payload={'name': 'Dow Chemical International Private Ltd.'}, vector=None),
 ScoredPoint(id=8799, version=7, score=0.8694345, payload={'name': 'The Dow Chemical Company'}, vector=None),
 ScoredPoint(id=2408, version=3, score=0.83456326, payload={'name': 'Dow Chemical Thailand Ltd.'}, vector=None),
 ScoredPoint(id=2403, version=3, score=0.8277112, payload={'name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=9927, version=6, score=0.81730974, payload={'name': 'Yuanbai Chemicals (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=7757, version=9, score=0.81063753, payload={'name': 'Shanghai Yikang Chemicals & Industries Co., Ltd.'}, vector=None),
 ScoredPoint(id=7693, version=9, score=0.80738235, payload={'name': 'Shanghai Origin Chem International Trading Co., Ltd.'}, vector=None),
 ScoredPoint(id=7681, version=9, score=0.806

In [29]:
# Labelled duplicate pairs that involve the query company (in either column).
# The label test is parenthesised because `&` binds tighter than `==`; without the
# parentheses the mask is `(... & df.is_duplicate) == 1`, which only works for 0/1 labels.
df[((df.name_1 == 'Dow Chemical (Shanghai) Co., Ltd.') | (df.name_2 == 'Dow Chemical (Shanghai) Co., Ltd.')) & (df.is_duplicate == 1)]

Unnamed: 0,name_1,name_2,is_duplicate,emb_1,emb_2
3621,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.053025662899017334, 0.003969652112573385, ..."
25254,Dow Chemical International Private Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.054589733481407166, -0.02028239332139492, ...","[-0.028365090489387512, -0.02730587124824524, ..."
43550,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical International Private Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.054589733481407166, -0.02028239332139492, ..."
67051,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical,1,"[-0.028365090489387512, -0.02730587124824524, ...","[0.01545174140483141, 0.002613763092085719, -0..."
67332,Dow Chemical Pacific Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.053025662899017334, 0.003969652112573385, ...","[-0.028365090489387512, -0.02730587124824524, ..."
86276,Dow Chemical Pacific,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.024302493780851364, 0.016313781961798668, ...","[-0.028365090489387512, -0.02730587124824524, ..."
114460,Dow Chemical Thailand Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.05091489478945732, 0.00030382093973457813,...","[-0.028365090489387512, -0.02730587124824524, ..."
116796,Dow Chemical,"Dow Chemical (Shanghai) Co., Ltd.",1,"[0.01545174140483141, 0.002613763092085719, -0...","[-0.028365090489387512, -0.02730587124824524, ..."
127101,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.024302493780851364, 0.016313781961798668, ..."
148353,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Thailand Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.05091489478945732, 0.00030382093973457813,..."


### Calculate metrics

In [30]:
# Read points back from the collection, asking for as many as the collection
# reports (`col.points_count`), including both vectors and payloads.
# NOTE(review): `search_scroll` is not referenced by any later cell visible here —
# confirm whether this cell is still needed.
search_scroll = qdrant_client.scroll(
    collection_name=col_name,
    limit=col.points_count,
    offset=0,
    with_vectors=True,
    with_payload=True,
)

In [31]:
# Test set: every distinct name that takes part in at least one labelled duplicate pair
df_isdup = df.loc[df['is_duplicate'] == 1]
names = np.unique(np.concatenate((df_isdup['name_1'], df_isdup['name_2'])))

In [32]:
names

array(['ALBEMARLE GREEN CREST ', 'APS Paving Stone Inc',
       'APS Paving&Stone Inc', 'AUTOMOTIVE PERFORMANCE MATERIAL (APM)',
       'Achem Technology (Dongguan)', 'Achem Technology Corp.',
       'Achem Technology Dongguan',
       'Adams Resources Exploration Corporation, 112 Oil And Gas Assets',
       'Adams Resources Exploration Corporation, Oil And Gas Assets In Crocket And Irion Counties In Texas',
       'Adams Resources Exploration Corporation, Oil And Gas Assets Outside The Permian Basin',
       'Adhesives Research, Inc.', 'Afton Chemical De Mexicosa De Cv',
       'Afton Chemical Hyderabad Pvt., Ltd.',
       'Afton Chemical India Private Ltd.',
       'Afton Chemical Industria De Aditivos Ltda',
       'Albemarle Corporation', 'Albemarle Corporation (NYSE:ALB)',
       'Ashland Inc.', 'Ashland Specialty Ingredients Gp',
       'Automotive Performance Material',
       'Automotive Performance Materials', 'Avery Denisson (BE)',
       'Avery Dennison Kunshan Co., Ltd.',
 

In [61]:
def precision_score_at_k(k):
    """Mean precision@k of the similarity search over the labelled duplicate names.

    For every query in the global `names`, the top-`k` neighbours are fetched
    with `inference` (using the stored embeddings in the global `df_embs`).
    A neighbour counts as a hit when the global `df` contains a row with
    `is_duplicate == 1` linking it to the query, in either column order.

    Parameters
    ----------
    k : int
        Number of neighbours to request and score per query (must be >= 1).

    Returns
    -------
    float
        `total_hits / (k * len(names))`. For `k == 1` this is the share of
        queries whose nearest neighbour is a labelled duplicate. Returns 0.0
        when `names` is empty.

    Raises
    ------
    ValueError
        If `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    if len(names) == 0:
        return 0.0

    # Build the set of labelled duplicate pairs once instead of scanning `df`
    # for every query; frozenset makes the pair order irrelevant.
    dup_rows = df[df.is_duplicate == 1]
    duplicate_pairs = {frozenset(pair) for pair in zip(dup_rows.name_1, dup_rows.name_2)}

    total_hits = 0
    for q_name in tqdm(names):
        res = inference(q_name, model, limit=k, df_embs=df_embs)
        # An empty result contributes no hits (previously res[0] raised IndexError).
        total_hits += sum(
            frozenset((q_name, hit.payload['name'])) in duplicate_pairs
            for hit in res[:k]
        )

    return total_hits / (k * len(names))

In [62]:
map_score = precision_score_at_k(1)
map_score

100%|██████████| 369/369 [00:39<00:00,  9.33it/s]


0.5853658536585366

P@1 ≈ 0.59
This is low, so the names need preprocessing before they are embedded.

## Qdrant + preproc + all langs

In [81]:
df_emb = pd.read_parquet('../data/df_embs_preproc.parquet')  # Embeddings from distiluse-base-multilingual-cased-v2 pretrain. Vector length is 512.

In [82]:
df = pd.read_parquet('../data/all_lang_train.parquet')  # All languages (per the file name), not only EN

In [83]:
df

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0
3,Powermax Rubber Factory,Co. One,0
4,Tress A/S,Longyou Industries Park Zhejiang,0
...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0


In [84]:
# Attach the stored embedding of `name_1` to every pair as column `emb_1`.
df = (
    df.merge(df_emb, how='left', left_on='name_1', right_on='name')
      .drop(columns=['name'])          # the join key duplicates `name_1`
      .rename(columns={'emb': 'emb_1'})
)

In [85]:
# Attach the stored embedding of `name_2` to every pair as column `emb_2`.
df = (
    df.merge(df_emb, how='left', left_on='name_2', right_on='name')
      .drop(columns=['name'])          # the join key duplicates `name_2`
      .rename(columns={'emb': 'emb_2'})
)

In [86]:
df

Unnamed: 0,name_1,name_2,is_duplicate,name_preproc_x,emb_1,name_preproc_y,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,Iko Industries,"[0.017819033935666084, -0.014882056973874569, ...",Enormous Industrial Trade,"[0.026229966431856155, -0.07940708845853806, -..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,Apcotex Industries,"[0.013420317322015762, -0.023446308448910713, ...",Technocraft Industries,"[-0.045590683817863464, -0.005337950307875872,..."
2,"Rishichem Distributors Pvt., Ltd.",Dsa,0,Rishichem Distributors,"[-0.01495872437953949, -0.03767695277929306, -...",Dsa,"[0.03230986371636391, 0.015820275992155075, -0..."
3,Powermax Rubber Factory,Co. One,0,Powermax Rubber Factory,"[-0.02500852569937706, 0.012965007685124874, -...",One,"[0.04849866405129433, -0.08649260550737381, -0..."
4,Tress A/S,Longyou Industries Park Zhejiang,0,Tress A/S,"[0.025902049615979195, -0.002849553246051073, ...",Longyou Industries Park Zhejiang,"[0.03905995562672615, 0.11018368601799011, -0...."
...,...,...,...,...,...,...,...
497567,BIT-MAT PRODUCTS,The Goodyear Tire and Rubber Company,0,BIT-MAT PRODUCTS,"[0.005421825684607029, -0.017194371670484543, ...",The Goodyear Tire and Rubber Company,"[-0.040851492434740067, -0.025978924706578255,..."
497568,"Bnd Trading Co., Ltd.",Zhong Shan Yue Liang Economy& Trade Imp. & Exp...,0,Bnd Trading,"[0.027749311178922653, -0.04488192871212959, -...",Zhong Shan Yue Liang Economy& Trade .,"[0.005387855693697929, 0.0054251644760370255, ..."
497569,"Xeikon Industrial Co., Ltd. Of Dongguan City","Yi Cheng Trading Co., Ltd. Of Dongguan City",0,Xeikon Industrial Of Dongguan City,"[-0.024616584181785583, 0.006875607650727034, ...",Yi Cheng Trading Of Dongguan City,"[0.015609915368258953, 0.044810373336076736, 0..."
497570,"Shanghai Kechuan Trading Co., Ltd.",Shanghai M&G Stationery Inc.,0,Shanghai Kechuan Trading,"[0.021050529554486275, 0.03309483453631401, -0...",Shanghai M&G Stationery,"[0.043589092791080475, 0.06722193956375122, -0..."


In [87]:
np.stack(df.emb_1).shape

(497572, 512)

In [101]:
df_names = pd.read_parquet('../data/df_names_preproc.parquet')

In [102]:
# Pair every name with its embedding and keep only the columns used downstream.
df_names = (
    df_names.merge(df_emb[['name', 'emb']], how='left', left_on='Names', right_on='name')
            .drop(columns=['Names', 'languages_langdetect'])
            .rename(columns={'name': 'original_name', 'name_preproc': 'preprocessed_name'})
)

In [103]:
df_names

Unnamed: 0,preprocessed_name,original_name,emb
0,Iko Industries,Iko Industries Ltd.,"[0.017819033935666084, -0.014882056973874569, ..."
1,Apcotex Industries,Apcotex Industries Ltd.,"[0.013420317322015762, -0.023446308448910713, ..."
2,Rishichem Distributors,"Rishichem Distributors Pvt., Ltd.","[-0.01495872437953949, -0.03767695277929306, -..."
3,Powermax Rubber Factory,Powermax Rubber Factory,"[-0.02500852569937706, 0.012965007685124874, -..."
4,Tress A/S,Tress A/S,"[0.025902049615979195, -0.002849553246051073, ..."
...,...,...,...
18017,Plastic Packaging,Plastic Packaging (Pty) Ltd.,"[-0.10133685171604156, 0.09970243275165558, -0..."
18018,Hengshui Mechanical & Electrical Building,'Hengshui Mechanical & Electrical Building Co....,"[0.028690272942185402, 0.07035619765520096, -0..."
18019,Jiangsu Baoli Investment,"Jiangsu Baoli International Investment Co., Lt...","[0.0056513226591050625, 0.03347638249397278, 0..."
18020,Lanxess AG,Lanxess AG,"[-0.03159470111131668, 0.009131425991654396, -..."


### Add vectors to qdrant

In [None]:
!docker pull qdrant/qdrant

In [None]:
!docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [90]:
# Connect to the Qdrant container that the `docker run` cell publishes on port 6333.
# 'localhost' is used instead of '0.0.0.0': the latter is a bind-all listen address
# and is not a portable address to connect to.
qdrant_client = QdrantClient(host='localhost', port=6333)

In [104]:
# Build vectors, payload and ids from the same frame: `df_names` already carries the
# `emb` column from the merge with `df_emb`, so every vector is stored together with
# its own original/preprocessed name. Taking vectors from `df_emb` and payload from
# `df_names` only lines up if both frames happen to share row order and length.
vectors = np.stack(df_names.emb)
payload = df_names[['original_name', 'preprocessed_name']].to_dict(orient='records')
ids = df_names.index.values.tolist()
col_name = 'companies-all-preprocessed'
vec_shape = vectors.shape[1]
bs = 1024  # upload batch size
col = create_collection_and_upload(vectors, payload, ids, col_name, vec_shape, bs)

In [105]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [115]:
inference('Dow Chemical International Private Ltd.', model, preproc_text=True)

[ScoredPoint(id=5474, version=5, score=1.0, payload={'original_name': 'Dow Chemical International Private Ltd.', 'preprocessed_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=3748, version=3, score=1.0, payload={'original_name': 'Dow Chemical Pacific', 'preprocessed_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=608, version=1, score=1.0, payload={'original_name': 'Dow Chemical Thailand Ltd.', 'preprocessed_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=9176, version=9, score=1.0, payload={'original_name': 'Dow Chemical Pacific Ltd.', 'preprocessed_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=2892, version=4, score=1.0, payload={'original_name': 'Dow Chemical', 'preprocessed_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=6647, version=10, score=1.0, payload={'original_name': 'Dow Chemical (Shanghai) Co., Ltd.', 'preprocessed_name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=1306, version=2, score=0.90410197, payload={'original_name': 'Dro Chemica

In [None]:
# Test set: every distinct name that takes part in at least one labelled duplicate pair
df_isdup = df.loc[df['is_duplicate'] == 1]
names = np.unique(np.concatenate((df_isdup['name_1'], df_isdup['name_2'])))

In [None]:
# NOTE(review): this cell has no recorded execution. `precision_score_at_k` reads
# `payload['name']` and looks query vectors up in the global `df_embs`, while the
# 'companies-all-preprocessed' payload only has 'original_name' / 'preprocessed_name'
# and `df_embs` was loaded from the EN embeddings file — confirm before trusting the score.
map_score = precision_score_at_k(1)
map_score