# Experiment 2
# Sentence embeddings and Qdrant Search

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from qdrant_client.http.models import models

In [5]:
# Load the per-name embedding table from the project's data directory.
df_emb = pd.read_parquet(path='../../data/df_embs.parquet')

In [4]:
# Load the labelled training pairs from the project's data directory.
df = pd.read_parquet(path='../../data/train.parquet')

In [6]:
# Attach the embedding of `name_1` to every pair: left-join on the name,
# drop the redundant join key and expose the vector as `emb_1`.
df = (
    df.merge(df_emb, how='left', left_on='name_1', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_1'})
)

In [7]:
# Attach the embedding of `name_2` to every pair: left-join on the name,
# drop the redundant join key and expose the vector as `emb_2`.
df = (
    df.merge(df_emb, how='left', left_on='name_2', right_on='name')
    .drop(columns=['name'])
    .rename(columns={'emb': 'emb_2'})
)

In [17]:
# Display the embedding table (the saved output shows a `name` and an `emb` column).
df_emb

Unnamed: 0,name,emb
0,Iko Industries Ltd.,"[-0.0183494221419096, -0.00905199721455574, 0...."
1,Apcotex Industries Ltd.,"[-0.019763024523854256, -0.024556715041399002,..."
2,"Rishichem Distributors Pvt., Ltd.","[-0.04853949323296547, -0.0022187146823853254,..."
3,Powermax Rubber Factory,"[-0.02500852569937706, 0.012965007685124874, -..."
4,Tress A/S,"[0.025902049615979195, -0.002849553246051073, ..."
...,...,...
18017,Plastic Packaging (Pty) Ltd.,"[-0.11937715858221054, 0.0766262337565422, 0.0..."
18018,'Hengshui Mechanical & Electrical Building Co....,"[0.0029773120768368244, 0.04723874479532242, 0..."
18019,"Jiangsu Baoli International Investment Co., Lt...","[-0.07105936110019684, 0.025270389392971992, 0..."
18020,Lanxess AG,"[-0.03159470111131668, 0.009131425991654396, -..."


In [16]:
# Sanity check: stack the per-row `emb_1` vectors into one array and show its shape.
np.stack(df['emb_1']).shape

(149734, 512)

# Add vectors to Qdrant

In [None]:
# Shell escape: fetch the qdrant/qdrant image. No tag is given, so Docker's default tag is pulled.
!docker pull qdrant/qdrant

In [None]:
# Shell escape: start a detached container named "qdrant" on the bridge network,
# publishing container port 6333 on host port 6333.
# NOTE(review): re-running this cell while a container named "qdrant" already exists will likely fail — confirm.
!docker run --name qdrant -d -p 6333:6333 --net=bridge qdrant/qdrant

In [106]:
# Load the embedding table that also carries the preprocessed company names.
df_embs_preproc = pd.read_parquet(path='../../data/df_embs_preproc.parquet')

In [107]:
# Display the preprocessed table (the saved output shows `name`, `name_preproc` and `emb` columns).
df_embs_preproc

Unnamed: 0,name,name_preproc,emb
0,Iko Industries Ltd.,Iko Industries,"[0.017819033935666084, -0.014882056973874569, ..."
1,Apcotex Industries Ltd.,Apcotex Industries,"[0.013420317322015762, -0.023446308448910713, ..."
2,"Rishichem Distributors Pvt., Ltd.",Rishichem Distributors,"[-0.01495872437953949, -0.03767695277929306, -..."
3,Powermax Rubber Factory,Powermax Rubber Factory,"[-0.02500852569937706, 0.012965007685124874, -..."
4,Tress A/S,Tress A/S,"[0.025902049615979195, -0.002849553246051073, ..."
...,...,...,...
18017,Plastic Packaging (Pty) Ltd.,Plastic Packaging,"[-0.10133685171604156, 0.09970243275165558, -0..."
18018,'Hengshui Mechanical & Electrical Building Co....,Hengshui Mechanical & Electrical Building,"[0.028690272942185402, 0.07035619765520096, -0..."
18019,"Jiangsu Baoli International Investment Co., Lt...",Jiangsu Baoli Investment,"[0.0056513226591050625, 0.03347638249397278, 0..."
18020,Lanxess AG,Lanxess AG,"[-0.03159470111131668, 0.009131425991654396, -..."


In [108]:
# Client for the Qdrant server published on port 6333 of this machine.
# The loopback address is used rather than '0.0.0.0': 0.0.0.0 is a wildcard
# address meant for *binding* a listener, and connecting to it is not
# portable across operating systems.
qdrant_client = QdrantClient(host='127.0.0.1', port=6333)

In [109]:
# The vectors and ids come from `df_emb`, while the payload comes from
# `df_embs_preproc`. The three sequences are handed to the upload call side by
# side, so row i of both frames must describe the same company. Fail loudly if
# the frames are not row-aligned instead of silently attaching payloads to the
# wrong vectors.
if df_emb['name'].tolist() != df_embs_preproc['name'].tolist():
    raise ValueError(
        'df_emb and df_embs_preproc are not row-aligned on `name`; '
        'payloads would be attached to the wrong vectors'
    )

# One row per company: a 2-D array built from the per-row embedding vectors.
vectors = np.stack(df_emb.emb)
# One dict per company holding the raw and the preprocessed name.
payload = df_embs_preproc[['name', 'name_preproc']].to_dict(orient='records')
# Point ids are the row labels of `df_emb`, converted to a plain Python list.
ids = df_emb.index.values.tolist()

In [110]:
# Collection settings: target collection name and upload batch size.
col_name, bs = 'company', 1024
# Width of a single embedding, taken from the second axis of the stacked array.
vec_shape = vectors.shape[1]

In [111]:
# Vectors are `vec_shape` wide and compared with cosine distance.
vector_params = models.VectorParams(
    size=vec_shape,
    distance=models.Distance.COSINE,
)

# (Re)create the target collection with the payload-on-disk option enabled.
qdrant_client.recreate_collection(
    collection_name=col_name,
    vectors_config=vector_params,
    on_disk_payload=True,
)

In [None]:
# Push every (id, vector, payload) triple into the collection,
# `bs` points per batch, using 6 parallel workers.
qdrant_client.upload_collection(
    collection_name=col_name,
    # Data: three parallel sequences, one entry per company.
    ids=ids,
    vectors=vectors,
    payload=payload,
    # Transfer settings.
    batch_size=bs,
    parallel=6,
)

In [1]:
# Fetch the collection's description so the upload can be inspected.
col = qdrant_client.get_collection(collection_name=col_name)

NameError: name 'qdrant_client' is not defined

In [None]:
# Show the `points_count` attribute of the collection description fetched into `col`.
col.points_count

#### Try similarity search

In [57]:
# Load the 'distiluse-base-multilingual-cased-v2' sentence-embedding model
# (the saved output shows it being downloaded on first use).
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [89]:
# Example query: a company name that is also present in the indexed data.
company_name = 'Dow Chemical (Shanghai) Co., Ltd.'
# Encode the raw (unpreprocessed) name with the model to get the query vector.
qvector = model.encode(company_name)

In [90]:
# Nearest-neighbour lookup for the query vector: first 30 hits, no payload filter.
search_result = qdrant_client.search(
    collection_name=col_name,
    query_vector=qvector,
    # No filtering and no paging: start at the top of the ranking.
    query_filter=None,
    offset=0,
    limit=30,
)

In [91]:
# Display the scored hits returned by the search (id, score and payload per hit).
search_result

[ScoredPoint(id=6647, version=6, score=1.0, payload={'name': 'Dow Chemical (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=12778, version=14, score=0.9300273, payload={'name': 'Yuanbai Chemicals (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=35, version=4, score=0.91014034, payload={'name': 'Lord Fine Chemical (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=4192, version=2, score=0.89989686, payload={'name': 'Quadro Chemicals (Shanghai) Co., Ltd.'}, vector=None),
 ScoredPoint(id=1384, version=0, score=0.8955901, payload={'name': 'Shanghai Yechuan Chemicals Co., Ltd.'}, vector=None),
 ScoredPoint(id=5456, version=1, score=0.8779373, payload={'name': 'Shanghai Junxin Chemical Co., Ltd.'}, vector=None),
 ScoredPoint(id=9193, version=7, score=0.8764175, payload={'name': 'Shanghai Manyi Chemical Co., Ltd.'}, vector=None),
 ScoredPoint(id=8855, version=7, score=0.87584376, payload={'name': 'Shanghai Uniwell Chemical Co., Ltd.'}, vector=None),
 ScoredPoint(id=10132, version

In [96]:
# Labelled duplicate pairs in which the query company appears on either side.
# Every comparison is parenthesised on purpose: `&` binds tighter than `==`,
# so `mask & df.is_duplicate == 1` is evaluated as `(mask & df.is_duplicate) == 1`,
# which only gives the intended rows when the label is strictly 0/1.
involves_company = (df.name_1 == company_name) | (df.name_2 == company_name)
df[involves_company & (df.is_duplicate == 1)]

Unnamed: 0,name_1,name_2,is_duplicate,emb_1,emb_2
3621,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.053025662899017334, 0.003969652112573385, ..."
25254,Dow Chemical International Private Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.054589733481407166, -0.02028239332139492, ...","[-0.028365090489387512, -0.02730587124824524, ..."
43550,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical International Private Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.054589733481407166, -0.02028239332139492, ..."
67051,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical,1,"[-0.028365090489387512, -0.02730587124824524, ...","[0.01545174140483141, 0.002613763092085719, -0..."
67332,Dow Chemical Pacific Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.053025662899017334, 0.003969652112573385, ...","[-0.028365090489387512, -0.02730587124824524, ..."
86276,Dow Chemical Pacific,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.024302493780851364, 0.016313781961798668, ...","[-0.028365090489387512, -0.02730587124824524, ..."
114460,Dow Chemical Thailand Ltd.,"Dow Chemical (Shanghai) Co., Ltd.",1,"[-0.05091489478945732, 0.00030382093973457813,...","[-0.028365090489387512, -0.02730587124824524, ..."
116796,Dow Chemical,"Dow Chemical (Shanghai) Co., Ltd.",1,"[0.01545174140483141, 0.002613763092085719, -0...","[-0.028365090489387512, -0.02730587124824524, ..."
127101,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Pacific,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.024302493780851364, 0.016313781961798668, ..."
148353,"Dow Chemical (Shanghai) Co., Ltd.",Dow Chemical Thailand Ltd.,1,"[-0.028365090489387512, -0.02730587124824524, ...","[-0.05091489478945732, 0.00030382093973457813,..."


In [None]:
# Tokens treated as stopwords in company names.
# NOTE(review): the final entry is an empty string, kept exactly as in the
# original list — confirm it is intentional.
stopwords = [
    'Co.',
    'Ltd.',
    'Ltd',
    'Private',
    'International',
    'Pacific',
    'Pvt.',
    'Corp.',
    'Inc.',
    'Sdn Bhd',
    '',
]

# Train Quaterion