In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 8.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 66.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
# Fetch the `dev` branch of the project repository; the dataset CSV is read from its data/ folder further down.
!git clone -b dev https://github.com/SashaMogilevskii/duplicate_names.git

Cloning into 'duplicate_names'...
remote: Enumerating objects: 171, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 171 (delta 12), reused 25 (delta 8), pack-reused 129[K
Receiving objects: 100% (171/171), 37.96 MiB | 17.39 MiB/s, done.
Resolving deltas: 100% (60/60), done.


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score, f1_score

In [None]:
# The first CSV column is used as the index (shown as `pair_id` in the table output below);
# presumably equivalent to index_col='pair_id' - verify against the file header.
data = pd.read_csv(
    'duplicate_names/data/data_v2_19_oct.csv',
    index_col=0,
)

In [None]:
# Show only the pairs labelled as duplicates.
data.loc[data['is_duplicate'] == 1]

Unnamed: 0_level_0,name_1,name_2,is_duplicate,name_1_upd,name_2_upd
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
162,JX Nippon Oil & Gas Exploration (Brasil) Ltda,JX Nippon Oil & Gas Exploration Technical Serv...,1,JX Nippon Oil Gas Exploration Brasil,JX Nippon Oil Gas Exploration Technical Services
604,Pirelli Neumaticos S.A.I.C.,"Pirelli Tyre Co., Ltd.",1,Pirelli Neumaticos S A I C,Pirelli Tyre
836,Brenntag Australia (Pty) Ltd.,Brenntag Group,1,Brenntag Australia Pty,Brenntag Group
1329,"PAUL BAUDER GMBH & CO KG, BOCHUM PLANT",Paul Bauder ag,1,PAUL BAUDER GMBH CO KG BOCHUM PLANT,Paul Bauder ag
1563,TOTAL CESKA REPUBLIKA s.r.o.,TOTAL FRANCE (ARNAY LE DUC),1,TOTAL CESKA REPUBLIKA s r o,TOTAL FRANCE ARNAY LE DUC
...,...,...,...,...,...
496575,"Bridgestone （Huizhou）Synthetic Rubber Co., Ltd.","Bridgestone India Pvt., Ltd.",1,Bridgestone Huizhou Synthetic,Bridgestone
496761,Arlanxeo International Group,Arlanxeo Corp.,1,Arlanxeo Group,Arlanxeo Corp
497016,Brenntag Peru S.A.C.,Brenntag Chile Comercial E Industrial Ltda,1,Brenntag Peru S A C,Brenntag Chile Comercial E
497084,Dow Chemical International Private Ltd.,Dow Chemical Pacific,1,Dow Chemical,Dow Chemical Pacific


In [None]:
# Tokenizer and encoder are loaded from the same checkpoint identifier.
MODEL_NAME = 'prajjwal1/bert-tiny'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
model = model.to(device)

In [None]:
# Smoke test: push one arbitrary sentence through the tokenizer and the model.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt').to(device)
# Inference only, so the forward pass runs without autograd tracking.
with torch.no_grad():
  output = model(**encoded_input)

In [None]:
# Dimensions of the pooled embedding produced for the single test sentence.
output.pooler_output.size()

torch.Size([1, 128])

In [None]:
def cos_sim(text1, text2):
  """Return the cosine similarity between the pooled BERT embeddings of two texts.

  Each text is tokenized with the notebook-level ``tokenizer``, moved to
  ``device`` and passed through ``model``; the two ``pooler_output`` vectors
  are then compared with ``cosine_similarity``.

  Args:
    text1: First string to embed.
    text2: Second string to embed.

  Returns:
    The similarity as a scalar, i.e. element ``[0][0]`` of the 1x1 matrix
    returned by ``cosine_similarity``.
  """
  def _embed(text):
    # Truncate to the encoder's position limit so an over-long input cannot
    # overflow the position embeddings.
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    ).to(device)
    return model(**encoded).pooler_output.cpu().numpy()

  # Inference only: without no_grad every call would build an autograd graph
  # that is never used, which is wasted memory and time when applied per row.
  with torch.no_grad():
    return cosine_similarity(_embed(text1), _embed(text2))[0][0]

In [None]:
# Sanity check on two company names that only share generic words.
cos_sim(
    text1='Iko Industries Ltd.',
    text2='Enormous Industrial Trade Pvt., Ltd.',
)

0.95116353

In [None]:
# Pair 604 from the duplicates table above; a stray tab that had been pasted
# inside the first literal is removed.
cos_sim('Pirelli Neumaticos S.A.I.C.', 'Pirelli Tyre Co., Ltd.')

0.93543

In [None]:
# Fixed seed so the evaluation subset, and every metric computed from it, is reproducible.
# The size is capped at the row count because sampling without replacement
# raises ValueError when n exceeds the number of rows.
data_sample = data.sample(min(30000, len(data)), random_state=42)

In [None]:
# Row-wise similarity computed on the original `name_1` / `name_2` columns.
predict_source = data_sample.apply(
    lambda pair: cos_sim(pair['name_1'], pair['name_2']),
    axis=1,
)

In [None]:
# Threshold-free ranking quality of the similarities from the original names.
roc_auc_score(y_true=data_sample.is_duplicate, y_score=predict_source)

0.5874301390477752

In [None]:
# Sweep the decision threshold over the original-name similarities and print F1 at each step.
for thr in np.arange(0.6, 1, 0.03):
  labels_at_thr = (predict_source > thr).astype(int)
  f1_at_thr = f1_score(data_sample.is_duplicate, labels_at_thr)
  print(f'thr - {thr}, f1 - {f1_at_thr}')

thr - 0.6, f1 - 0.01581106112728235
thr - 0.63, f1 - 0.01574751050385417
thr - 0.66, f1 - 0.015753764686414033
thr - 0.6900000000000001, f1 - 0.015766810202053658
thr - 0.7200000000000001, f1 - 0.01580450229098878
thr - 0.7500000000000001, f1 - 0.015822151011415982
thr - 0.7800000000000001, f1 - 0.01588797630267941
thr - 0.8100000000000002, f1 - 0.015686814131893084
thr - 0.8400000000000002, f1 - 0.016043742580668367
thr - 0.8700000000000002, f1 - 0.016118736035748482
thr - 0.9000000000000002, f1 - 0.01692623566528119
thr - 0.9300000000000003, f1 - 0.020615209804398994
thr - 0.9600000000000003, f1 - 0.030538051381483276
thr - 0.9900000000000003, f1 - 0.07624633431085044


In [None]:
# Row-wise similarity computed on the `name_1_upd` / `name_2_upd` columns.
predict_normal = data_sample.apply(
    lambda pair: cos_sim(pair['name_1_upd'], pair['name_2_upd']),
    axis=1,
)

In [None]:
# Threshold-free ranking quality of the similarities from the `*_upd` names.
roc_auc_score(y_true=data_sample.is_duplicate, y_score=predict_normal)

0.7427119173544214

In [None]:
# Sweep the decision threshold over the `*_upd`-name similarities and print F1 at each step.
for thr in np.arange(0.6, 1, 0.03):
  labels_at_thr = (predict_normal > thr).astype(int)
  f1_at_thr = f1_score(data_sample.is_duplicate, labels_at_thr)
  print(f'thr - {thr}, f1 - {f1_at_thr}')

thr - 0.6, f1 - 0.015812630255714714
thr - 0.63, f1 - 0.015813676514374566
thr - 0.66, f1 - 0.015750115809675072
thr - 0.6900000000000001, f1 - 0.015753243314800103
thr - 0.7200000000000001, f1 - 0.01569172708312643
thr - 0.7500000000000001, f1 - 0.015707847295864268
thr - 0.7800000000000001, f1 - 0.015753265312905048
thr - 0.8100000000000002, f1 - 0.015801807833947103
thr - 0.8400000000000002, f1 - 0.01596929508927042
thr - 0.8700000000000002, f1 - 0.01679690352734974
thr - 0.9000000000000002, f1 - 0.018799586064160056
thr - 0.9300000000000003, f1 - 0.025860354087925204
thr - 0.9600000000000003, f1 - 0.04669421487603305
thr - 0.9900000000000003, f1 - 0.11803278688524592
