In [1]:
# Install sentence-transformers into the running kernel's environment; it provides
# the SentenceTransformer class that this notebook imports further down.
%pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Upgrade ipywidgets in the running kernel's environment.
# NOTE(review): presumably needed so progress-bar widgets render in the notebook — confirm.
%pip install --upgrade ipywidgets

Defaulting to user installation because normal site-packages is not writeable
Collecting ipywidgets
  Using cached ipywidgets-8.0.2-py3-none-any.whl (134 kB)
Collecting widgetsnbextension~=4.0
  Using cached widgetsnbextension-4.0.3-py3-none-any.whl (2.0 MB)
Collecting jupyterlab-widgets~=3.0
  Using cached jupyterlab_widgets-3.0.3-py3-none-any.whl (384 kB)
Collecting debugpy<2.0,>=1.0.0
  Downloading debugpy-1.6.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: widgetsnbextension, jupyterlab-widgets, debugpy, ipywidgets
Successfully installed debugpy-1.6.3 ipywidgets-8.0.2 jupyterlab-widgets-3.0.3 widgetsnbextension-4.0.3
Note: you may need to restart the kernel to use updated packages.


In [None]:
import warnings

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()
# Silence every warning category for the remainder of the session.
warnings.simplefilter("ignore")

In [None]:
# Load the multilingual DistilUSE sentence encoder.
# No device is forced: when `device` is omitted SentenceTransformer checks
# whether a GPU can be used and otherwise runs on CPU, so the notebook no
# longer fails on machines without CUDA.
model_distiluse = SentenceTransformer(
    "sentence-transformers/distiluse-base-multilingual-cased-v2"
)

In [None]:
# Read only the four columns used below from the ';'-separated file.
# dtype=object keeps every value exactly as it appears in the file.
dataset_train_test = pd.read_csv(
    filepath_or_buffer="data/dataset_train_test.csv",
    usecols=["pair_id", "name_1", "name_2", "is_duplicate"],
    dtype=object,
    sep=";",
)
dataset_train_test.head(n=10)

In [None]:
# Embed each name column with one batched `encode` call instead of one model
# call per row. `encode` accepts a list of sentences and returns a 2-D numpy
# array with one row per sentence, so the output width comes from the model
# rather than from a hard-coded constant.
embeddings_name_1 = model_distiluse.encode(
    dataset_train_test["name_1"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
).astype("float32", copy=False)  # keep the float32 dtype used downstream
embeddings_name_2 = model_distiluse.encode(
    dataset_train_test["name_2"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
).astype("float32", copy=False)

In [None]:
# Lay the two embeddings of every pair side by side: [name_1 | name_2].
embeddings_1 = np.concatenate([embeddings_name_1, embeddings_name_2], axis=1)

In [None]:
def cos_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: first vector (array-like).
        b: second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        quotient would be 0/0 (NaN), so ``0.0`` is returned instead.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction; report "no similarity" rather than NaN.
        return 0.0
    return dot_product / norm_product


def simularity(embeddings):
    """Cosine similarity between the left and right halves of every row.

    Each row is expected to hold two equal-width embeddings laid side by side
    (``[first | second]``). The split point is derived from the row width, so
    the function works for any embedding size instead of one fixed dimension.

    Args:
        embeddings: 2-D array-like of shape ``(n_pairs, 2 * dim)``.

    Returns:
        ``np.ndarray`` of shape ``(n_pairs,)`` and dtype float64 holding one
        similarity per row. Rows where either half has zero norm get ``0.0``
        instead of NaN.

    Raises:
        ValueError: if the row width is odd and cannot be split into halves.
    """
    embeddings = np.asarray(embeddings)
    n_pairs, width = embeddings.shape
    if width % 2:
        raise ValueError(
            f"expected an even number of columns, got {width}"
        )
    half = width // 2
    left, right = embeddings[:, :half], embeddings[:, half:]

    # Row-wise dot products and norm products, computed for all pairs at once.
    dot_products = np.einsum("ij,ij->i", left, right)
    norm_products = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)

    distances = np.zeros(n_pairs)
    valid = norm_products > 0  # skip 0/0 for all-zero halves
    distances[valid] = dot_products[valid] / norm_products[valid]
    return distances


# One cosine score per name pair, stored as an extra feature column.
# (Despite the name, the value is what `simularity` returns.)
cos_distance = simularity(embeddings=embeddings_1)
dataset_train_test["cos_distance"] = cos_distance

In [None]:
# Wrap the stacked embeddings in a frame with columns emb_0, emb_1, ...
cols = [f"emb_{idx}" for idx in range(embeddings_1.shape[1])]
embeddings_1_pd = pd.DataFrame(embeddings_1, columns=cols)
embeddings_1_pd.head(n=10)

In [None]:
# Fix the column order of the metadata, then append the embedding columns.
dataset_train_test = dataset_train_test.loc[
    :, ["pair_id", "name_1", "name_2", "cos_distance", "is_duplicate"]
]
dataset_train_test = pd.concat(
    [dataset_train_test, embeddings_1_pd], axis="columns"
)

dataset_train_test.shape

In [None]:
# 70/30 shuffled split with a fixed seed. Features are everything except the
# identifiers, the raw names and the target column.
train, test, y_train, y_test = train_test_split(
    dataset_train_test.drop(columns=["pair_id", "name_1", "name_2", "is_duplicate"]),
    dataset_train_test.loc[:, "is_duplicate"],
    random_state=42,
    shuffle=True,
    test_size=0.3,
)

In [None]:
from sklearn.linear_model import LogisticRegression


# Fit a default logistic regression and report its metrics on the training split.
# NOTE(review): warnings are globally ignored in this notebook, so a solver
# convergence warning would not be shown here — confirm the fit converges.
logreg = LogisticRegression().fit(train, y_train)
y_train_pred = logreg.predict(train)

print(classification_report(y_true=y_train, y_pred=y_train_pred))

In [None]:
# Metrics on the held-out 30% split.
y_test_pred = logreg.predict(test)

print(classification_report(y_true=y_test, y_pred=y_test_pred))

In [None]:
from sentence_transformers import SentenceTransformer

# Load the multilingual MiniLM paraphrase encoder.
# No device is forced: when `device` is omitted SentenceTransformer checks
# whether a GPU can be used and otherwise runs on CPU, so the notebook no
# longer fails on machines without CUDA.
model_miniLM = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

# Re-read the raw pairs so this experiment starts from a frame without the
# feature columns added earlier in the notebook.
dataset_train_test = pd.read_csv(
    filepath_or_buffer="data/dataset_train_test.csv",
    usecols=["pair_id", "name_1", "name_2", "is_duplicate"],
    dtype=object,
    sep=";",
)
dataset_train_test.head(n=10)

In [None]:
# Embed each name column with one batched `encode` call instead of one model
# call per row. `encode` accepts a list of sentences and returns a 2-D numpy
# array with one row per sentence, so the output width comes from the model
# rather than from a hard-coded constant.
embeddings_name_1 = model_miniLM.encode(
    dataset_train_test["name_1"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
).astype("float32", copy=False)  # keep the float32 dtype used downstream
embeddings_name_2 = model_miniLM.encode(
    dataset_train_test["name_2"].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
).astype("float32", copy=False)

In [None]:
# Lay the two embeddings of every pair side by side: [name_1 | name_2].
embeddings_1 = np.concatenate([embeddings_name_1, embeddings_name_2], axis=1)


def cos_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: first vector (array-like).
        b: second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm the
        quotient would be 0/0 (NaN), so ``0.0`` is returned instead.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction; report "no similarity" rather than NaN.
        return 0.0
    return dot_product / norm_product


def simularity(embeddings):
    """Cosine similarity between the left and right halves of every row.

    Each row is expected to hold two equal-width embeddings laid side by side
    (``[first | second]``). The split point is derived from the row width, so
    the function works for any embedding size instead of one fixed dimension.

    Args:
        embeddings: 2-D array-like of shape ``(n_pairs, 2 * dim)``.

    Returns:
        ``np.ndarray`` of shape ``(n_pairs,)`` and dtype float64 holding one
        similarity per row. Rows where either half has zero norm get ``0.0``
        instead of NaN.

    Raises:
        ValueError: if the row width is odd and cannot be split into halves.
    """
    embeddings = np.asarray(embeddings)
    n_pairs, width = embeddings.shape
    if width % 2:
        raise ValueError(
            f"expected an even number of columns, got {width}"
        )
    half = width // 2
    left, right = embeddings[:, :half], embeddings[:, half:]

    # Row-wise dot products and norm products, computed for all pairs at once.
    dot_products = np.einsum("ij,ij->i", left, right)
    norm_products = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)

    distances = np.zeros(n_pairs)
    valid = norm_products > 0  # skip 0/0 for all-zero halves
    distances[valid] = dot_products[valid] / norm_products[valid]
    return distances


# One cosine score per name pair, stored as an extra feature column.
cos_distance = simularity(embeddings=embeddings_1)
dataset_train_test["cos_distance"] = cos_distance

# Wrap the stacked embeddings in a frame with columns emb_0, emb_1, ...
cols = [f"emb_{idx}" for idx in range(embeddings_1.shape[1])]
embeddings_1_pd = pd.DataFrame(embeddings_1, columns=cols)

# Fix the column order of the metadata, then append the embedding columns.
dataset_train_test = dataset_train_test.loc[
    :, ["pair_id", "name_1", "name_2", "cos_distance", "is_duplicate"]
]
dataset_train_test = pd.concat(
    [dataset_train_test, embeddings_1_pd], axis="columns"
)

# 70/30 shuffled split with a fixed seed. Features are everything except the
# identifiers, the raw names and the target column.
train, test, y_train, y_test = train_test_split(
    dataset_train_test.drop(columns=["pair_id", "name_1", "name_2", "is_duplicate"]),
    dataset_train_test.loc[:, "is_duplicate"],
    random_state=42,
    shuffle=True,
    test_size=0.3,
)

In [None]:
from sklearn.linear_model import LogisticRegression


# Fit a default logistic regression and report its metrics on the training split.
# NOTE(review): warnings are globally ignored in this notebook, so a solver
# convergence warning would not be shown here — confirm the fit converges.
logreg = LogisticRegression().fit(train, y_train)
y_train_pred = logreg.predict(train)

print(classification_report(y_true=y_train, y_pred=y_train_pred))

In [10]:
# Metrics on the held-out 30% split.
y_test_pred = logreg.predict(test)

print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1087
           1       0.91      0.93      0.92      1073

    accuracy                           0.92      2160
   macro avg       0.92      0.92      0.92      2160
weighted avg       0.92      0.92      0.92      2160



In [11]:
%pip freeze>>requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.
