# Experiment 2
# Sentence embeddings and Qdrant Search

## Unprocessed dataset. Languages: EN

In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import models
from tqdm import tqdm

### Load datasets

In [9]:
# Precomputed sentence embeddings; the merge cells below read the `name` and `emb` columns of this table.
# Original note: embeddings from the pretrained all-MiniLM-L6-v2 model, vector length 512.
# NOTE(review): all-MiniLM-L6-v2 is documented as producing 384-dimensional vectors, while the
# stacked shape printed further down is (149734, 512) — confirm which model produced this file.
df_emb = pd.read_parquet('../data/df_embs.parquet')

In [15]:
# Labelled name pairs (columns `name_1`, `name_2`, `is_duplicate`); English-only subset.
df = pd.read_parquet('../data/train.parquet')

In [16]:
df

Unnamed: 0,name_1,name_2,is_duplicate
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0
2,Powermax Rubber Factory,Co. One,0
3,National Bank Of,Action International,0
4,R.I.Intl,"Rass Mfg. India Pvt., Ltd.",0
...,...,...,...
149729,Eoc Polymers India Private Ltd.,Imp. Express India Private Ltd.,0
149730,Societe Des Transports,Ho Sports Co. Inc.,0
149731,"Computime Electric (Shenzhen) Co., Ltd.",A S International,0
149732,"Shanghai Haizhige Furniture Co., Ltd.",Sharang Corporation,0


In [17]:
# Attach the embedding of `name_1` to every pair. The join key column `name`
# coming from the embedding table is redundant afterwards and is dropped.
df = (
    df.merge(df_emb, how='left', left_on='name_1', right_on='name')
      .drop(columns=['name'])
      .rename(columns={'emb': 'emb_1'})
)

In [18]:
# Attach the embedding of `name_2` to every pair. The join key column `name`
# coming from the embedding table is redundant afterwards and is dropped.
df = (
    df.merge(df_emb, how='left', left_on='name_2', right_on='name')
      .drop(columns=['name'])
      .rename(columns={'emb': 'emb_2'})
)

In [20]:
df

Unnamed: 0,name_1,name_2,is_duplicate,emb_1,emb_2
0,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,"[-0.0183494221419096, -0.00905199721455574, 0....","[-0.018555620685219765, -0.05678172409534454, ..."
1,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,"[-0.019763024523854256, -0.024556715041399002,...","[-0.06599077582359314, 0.04381170868873596, -0..."
2,Powermax Rubber Factory,Co. One,0,"[-0.02500852569937706, 0.012965007685124874, -...","[0.0013258784310892224, -0.08776025474071503, ..."
3,National Bank Of,Action International,0,"[-0.024452747777104378, 0.020691601559519768, ...","[-0.023129969835281372, 0.004134805407375097, ..."
4,R.I.Intl,"Rass Mfg. India Pvt., Ltd.",0,"[-0.013006784953176975, 0.03725733980536461, -...","[-0.022258969023823738, 0.07864850014448166, 0..."
...,...,...,...,...,...
149729,Eoc Polymers India Private Ltd.,Imp. Express India Private Ltd.,0,"[-0.05212598294019699, 0.12076789140701294, 0....","[-0.08675865083932877, 0.06544617563486099, -0..."
149730,Societe Des Transports,Ho Sports Co. Inc.,0,"[-0.05330152064561844, -0.07089784741401672, 0...","[-0.017611868679523468, 0.0801854059100151, 0...."
149731,"Computime Electric (Shenzhen) Co., Ltd.",A S International,0,"[-0.07178869843482971, 0.0024943070020526648, ...","[-0.033666517585515976, 0.030529430136084557, ..."
149732,"Shanghai Haizhige Furniture Co., Ltd.",Sharang Corporation,0,"[-0.052497878670692444, 0.04104209691286087, -...","[-0.03629408776760101, 0.002414196962490678, 0..."


In [21]:
np.stack(df.emb_1).shape

(149734, 512)

### Add vectors to qdrant

In [None]:
!docker pull qdrant/qdrant

In [None]:
!docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [51]:
# Embedding table that gets indexed into Qdrant; the following cells read its `name` and `emb`
# columns and use its index as point ids.
# presumably the English-only subset (file name suffix `_en`) — TODO confirm against the data-prep step.
df_embs = pd.read_parquet('../data/df_embs_en.parquet')

In [52]:
# Connect to the Qdrant container started above (its port 6333 is published on the host).
# 'localhost' is used instead of '0.0.0.0': the latter is a wildcard *bind* address and is
# not accepted as a connect target on every platform (Windows rejects it).
qdrant_client = QdrantClient(host='localhost', port=6333)

In [53]:
# Split the embedding table into the three parallel sequences handed to the upload call:
# point ids (the frame index), payload records ({'name': ...}) and the stacked vector matrix.
ids = df_embs.index.values.tolist()
payload = df_embs.loc[:, ['name']].to_dict('records')
vectors = np.stack(df_embs['emb'].to_numpy())

In [54]:
# Settings shared by the collection-creation, upload and search cells.
bs = 1024                         # batch size passed to the upload call
vec_shape = np.shape(vectors)[1]  # second axis of the stacked embedding matrix
col_name = 'companies-EN'         # Qdrant collection name

In [55]:
# Create the collection with cosine distance and payloads stored on disk.
# NOTE(review): per the qdrant-client docs `recreate_collection` removes an existing collection
# of the same name first, so re-running this cell discards uploaded points — confirm intended.
vector_params = models.VectorParams(size=vec_shape, distance=models.Distance.COSINE)
qdrant_client.recreate_collection(
    collection_name=col_name,
    vectors_config=vector_params,
    on_disk_payload=True,
)

In [56]:
# Bulk-upload ids, vectors and payload records into the collection,
# in batches of `bs` points using 6 parallel workers.
qdrant_client.upload_collection(
    collection_name=col_name,
    ids=ids,
    vectors=vectors,
    payload=payload,
    parallel=6,
    batch_size=bs,
)

In [57]:
# Fetch collection metadata; `col.points_count` is inspected next and reused as the scroll limit later.
col = qdrant_client.get_collection(collection_name=col_name)

In [58]:
col.points_count

10106

### Similarity search with cosine distance

In [59]:
# Sentence encoder used by `inference` to embed a query name at search time.
# NOTE(review): the collection's vector size is taken from the stored embeddings (`vec_shape`);
# confirm that this model's output dimensionality matches it.
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

  return torch._C._cuda_getDeviceCount() > 0


In [106]:
def inference(company_name, model, limit=30, df_embs=None):
    """Search the Qdrant collection for the nearest neighbours of a company name.

    Points whose payload `name` equals `company_name` are excluded by the
    search filter, so the query never returns itself.

    Args:
        company_name: Name to search for.
        model: Encoder exposing `encode(text)`; used when no stored embedding
            is available for `company_name`.
        limit: Maximum number of hits to return.
        df_embs: Optional DataFrame with `name` and `emb` columns. When given
            and it contains `company_name`, the stored embedding is used as
            the query vector instead of re-encoding the name.

    Returns:
        The result of `qdrant_client.search` for the module-level collection
        `col_name`.
    """
    qvector = None
    if df_embs is not None:
        stored = df_embs.loc[df_embs['name'] == company_name, 'emb']
        if len(stored) > 0:
            # A single 1-D query vector is required; take the first match
            # rather than stacking all matches into a 2-D matrix.
            qvector = np.asarray(stored.iloc[0]).ravel()
    if qvector is None:
        # No lookup table, or the name is not in it: encode on the fly.
        qvector = model.encode(company_name)

    # Exclude the query name itself from the candidates.
    exclude_self = models.Filter(
        must_not=[
            models.FieldCondition(
                key="name",
                match=models.MatchValue(value=company_name)
            ),
        ]
    )
    search_result = qdrant_client.search(
        collection_name=col_name,
        query_vector=qvector,
        query_filter=exclude_self,
        limit=limit,
        offset=0
    )
    return search_result

In [77]:
inference('Dow Chemical (Shanghai) Co., Ltd.', model)

[ScoredPoint(id=2407, version=4, score=0.8766745, payload={'name': 'Dow Chemical Pacific Ltd.'}, vector=None),
 ScoredPoint(id=2405, version=4, score=0.8710415, payload={'name': 'Dow Chemical International Private Ltd.'}, vector=None),
 ScoredPoint(id=8799, version=6, score=0.86943454, payload={'name': 'The Dow Chemical Company'}, vector=None),
 ScoredPoint(id=2408, version=4, score=0.8345633, payload={'name': 'Dow Chemical Thailand Ltd.'}, vector=None),
 ScoredPoint(id=2403, version=4, score=0.8277112, payload={'name': 'Dow Chemical'}, vector=None),
 ScoredPoint(id=9927, version=8, score=0.81730974, payload={'name': 'Yuanbai Chemicals (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=7757, version=7, score=0.81063765, payload={'name': 'Shanghai Yikang Chemicals & Industries Co., Ltd.'}, vector=None),
 ScoredPoint(id=7693, version=7, score=0.80738246, payload={'name': 'Shanghai Origin Chem International Trading Co., Ltd.'}, vector=None),
 ScoredPoint(id=7681, version=7, score=0.806

In [63]:
# Ground-truth duplicate pairs that involve the query company on either side of the pair.
# The label test is parenthesised explicitly: `&` binds tighter than `==`, so
# `mask & df.is_duplicate == 1` is evaluated as `(mask & df.is_duplicate) == 1`.
df[((df.name_1 == 'Dow Chemical (Shanghai) Co., Ltd.') | (df.name_2 == 'Dow Chemical (Shanghai) Co., Ltd.')) & (df.is_duplicate == 1)]

Unnamed: 0,name_1,name_2,is_duplicate,emb_1,emb_2
3621,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.053025662899017334, 0.003969652112573385, ..."
25254,Dow Chemical International Private Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.054589733481407166, -0.02028239332139492, ...","[-0.028365090489387512, -0.02730587124824524, ..."
43550,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical International Private Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.054589733481407166, -0.02028239332139492, ..."
67051,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical,1,"[-0.028365090489387512, -0.02730587124824524, ...","[0.01545174140483141, 0.002613763092085719, -0..."
67332,Dow Chemical Pacific Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.053025662899017334, 0.003969652112573385, ...","[-0.028365090489387512, -0.02730587124824524, ..."
86276,Dow Chemical Pacific,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.024302493780851364, 0.016313781961798668, ...","[-0.028365090489387512, -0.02730587124824524, ..."
114460,Dow Chemical Thailand Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.05091489478945732, 0.00030382093973457813,...","[-0.028365090489387512, -0.02730587124824524, ..."
116796,Dow Chemical,"Dow Chemical (Shanghai) Co., Ltd.",1,"[0.01545174140483141, 0.002613763092085719, -0...","[-0.028365090489387512, -0.02730587124824524, ..."
127101,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.024302493780851364, 0.016313781961798668, ..."
148353,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Thailand Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.05091489478945732, 0.00030382093973457813,..."


### Calculate metrics

In [64]:
# Read the whole collection in a single scroll request (limit = total point count),
# asking for stored vectors as well as payloads.
search_scroll = qdrant_client.scroll(
    collection_name=col_name,
    with_payload=True,
    with_vectors=True,
    offset=0,
    limit=col.points_count,
)

In [111]:
# Select test dataset
df_isdup = df[df.is_duplicate == 1]
names = np.unique(np.concatenate([df_isdup.name_1, df_isdup.name_2]))

In [112]:
names

array(['ALBEMARLE GREEN CREST ', 'APS Paving Stone Inc',
       'APS Paving&Stone Inc', 'AUTOMOTIVE PERFORMANCE MATERIAL (APM)',
       'Achem Technology (Dongguan)', 'Achem Technology Corp.',
       'Achem Technology Dongguan',
       'Adams Resources Exploration Corporation, 112 Oil And Gas Assets',
       'Adams Resources Exploration Corporation, Oil And Gas Assets In Crocket And Irion Counties In Texas',
       'Adams Resources Exploration Corporation, Oil And Gas Assets Outside The Permian Basin',
       'Adhesives Research, Inc.', 'Afton Chemical De Mexicosa De Cv',
       'Afton Chemical Hyderabad Pvt., Ltd.',
       'Afton Chemical India Private Ltd.',
       'Afton Chemical Industria De Aditivos Ltda',
       'Albemarle Corporation', 'Albemarle Corporation (NYSE:ALB)',
       'Ashland Inc.', 'Ashland Specialty Ingredients Gp',
       'Automotive Performance Material',
       'Automotive Performance Materials', 'Avery Denisson (BE)',
       'Avery Dennison Kunshan Co., Ltd.',
 

In [118]:
def precision_score_at_k(k):
    """Mean precision@k of the similarity search over all query names.

    For every name in the module-level `names`, the top-`k` hits returned by
    `inference` are compared with the labelled pairs in `df`: a hit counts as
    relevant when `df` contains a row with `is_duplicate == 1` linking the
    query and the hit, in either column order. The per-query precision is
    `relevant_hits / k`; the function returns its mean over all queries.
    For `k == 1` this is the fraction of queries whose best hit is a labelled
    duplicate.

    Args:
        k: Number of top hits to evaluate per query (positive integer).

    Returns:
        Mean precision@k as a float; 0.0 when `names` is empty.

    Raises:
        ValueError: If `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if len(names) == 0:
        return 0.0

    # Built once instead of filtering the whole frame for every query:
    # unordered pairs so that (a, b) and (b, a) are treated alike.
    dup_rows = df[df.is_duplicate == 1]
    dup_pairs = {frozenset(pair) for pair in zip(dup_rows.name_1, dup_rows.name_2)}

    total = 0.0
    for q_name in tqdm(names):
        res = inference(q_name, model, limit=k, df_embs=df_embs)
        # A query with no hits simply contributes zero precision.
        relevant = sum(
            1 for hit in res[:k]
            if frozenset((q_name, hit.payload['name'])) in dup_pairs
        )
        total += relevant / k

    return total / len(names)

100%|██████████| 369/369 [01:00<00:00,  6.12it/s]


In [None]:
# Evaluate the search with k=1 over all selected query names.
# NOTE(review): the variable is called `map_score`, but the value comes from
# `precision_score_at_k(1)`, i.e. it is a precision@1 figure — consider renaming.
map_score = precision_score_at_k(1)
map_score