In [None]:
# Install langdetect into the environment of the running kernel.
# NOTE(review): no version is pinned and the other third-party imports below are assumed to be installed already.
%pip install langdetect

In [None]:
import string
import sys
import warnings

import fasttext
import numpy as np
import pandas as pd
from langdetect import detect
from nltk.stem import SnowballStemmer
from parapply import parapply
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Hook tqdm into pandas and silence every warning for the rest of the session.
tqdm.pandas()
warnings.filterwarnings("ignore")

# Load the raw pairs; dtype=object keeps every column as plain Python objects.
train_csv_path = sys.path[0] + "/train.csv"
data = pd.read_csv(train_csv_path, sep=",", dtype=object)
data.shape

In [None]:
def _strip_punctuation(names):
    """Return `names` cast to str with every ASCII punctuation character removed."""
    # A single translate pass is equivalent to replacing each character of
    # string.punctuation with "" one after another.
    table = str.maketrans("", "", string.punctuation)
    return names.apply(lambda x: str(x).translate(table))


def _detect_language(text):
    """Return langdetect's language code for `text`, or "unknown" if detection fails."""
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # langdetect raises when it cannot extract features from the text
        # (e.g. an empty or digits-only name); such rows fall back to the
        # non-Russian branch instead of aborting the whole run.
        return "unknown"


def _stem_names(data, name_col, lang_col, n_jobs):
    """Fill `<name_col>_stemmed`: Russian stemmer for rows tagged "ru", English otherwise."""
    target_col = name_col + "_stemmed"
    # Create the column up front as object dtype so string results fit and the
    # column exists even when one of the language groups is empty.
    data[target_col] = pd.Series(index=data.index, dtype=object)

    is_ru = data[lang_col] == "ru"
    groups = (
        (is_ru, SnowballStemmer("russian")),
        (~is_ru, SnowballStemmer("english")),
    )
    for mask, stemmer in groups:
        if not mask.any():
            continue  # nothing to stem for this language group
        # The whole name is handed to stem() as a single token, as before.
        data.loc[mask, target_col] = parapply(
            data.loc[mask, name_col], lambda x: stemmer.stem(x), n_jobs=n_jobs
        )


def preprocessing(data, n_jobs=1):
    """Clean, language-tag and stem the `name_1` / `name_2` columns.

    Steps, applied to both name columns:
      1. cast to str and remove ASCII punctuation (overwrites the column);
      2. detect the language into `lang_1` / `lang_2`
         ("unknown" when langdetect cannot decide);
      3. stem into `name_1_stemmed` / `name_2_stemmed`, using the Russian
         Snowball stemmer for rows tagged "ru" and the English one otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `name_1` and `name_2`. Modified in place.
    n_jobs : int, default 1
        Number of parallel jobs passed to every `parapply` call.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with the four new columns added.
    """
    column_pairs = (("name_1", "lang_1"), ("name_2", "lang_2"))

    for name_col, _ in column_pairs:
        data[name_col] = _strip_punctuation(data[name_col])

    for name_col, lang_col in column_pairs:
        data[lang_col] = parapply(
            data[name_col], lambda x: _detect_language(x), n_jobs=n_jobs
        )

    for name_col, lang_col in column_pairs:
        _stem_names(data, name_col, lang_col, n_jobs)

    return data


def cos_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero norm
        (the ratio is undefined there and would otherwise come out as NaN).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

In [None]:
# Clean, language-tag and stem both name columns (n_jobs=4).
# `preprocessing` returns the frame it was given, so `dataset` and `data` refer to the same object.
dataset = preprocessing(data, 4)

In [None]:
dataset.to_csv(
    sys.path[0] + "data/CompanyNamesSimularity_dataset.csv", sep=";", index=False
)

In [None]:
dataset = pd.read_csv(
    sys.path[0] + "data/CompanyNamesSimularity_dataset.csv", sep=";", dtype=object
)

In [None]:
# Settings common to the three skip-gram models; only dim / epoch / lr vary.
train_strings_path = sys.path[0] + "/train_strings.csv"
shared_params = dict(
    input=train_strings_path,
    model="skipgram",
    minn=2,
    maxn=5,
    thread=10,
)

model_1 = fasttext.train_unsupervised(dim=300, epoch=5, lr=0.1, **shared_params)
model_2 = fasttext.train_unsupervised(dim=150, epoch=7, lr=0.05, **shared_params)
model_3 = fasttext.train_unsupervised(dim=100, epoch=10, lr=0.01, **shared_params)

In [None]:
# Write each trained model to <sys.path[0]>/model_<n>.bin.
for model_number, trained_model in enumerate((model_1, model_2, model_3), start=1):
    trained_model.save_model(f"{sys.path[0]}/model_{model_number}.bin")

In [None]:
# Load the first model back from the file written by the save cell
# (presumably so the cells below can run without retraining).
model_1 = fasttext.load_model(sys.path[0] + "/model_1.bin")

In [None]:
# The CSV was read with dtype=object, so convert the label column to integers.
dataset["is_duplicate"] = dataset["is_duplicate"].astype(int)

In [None]:
# Balanced subset: 3600 randomly drawn pairs per class. The fixed seed makes
# the subset (and every metric computed from it) reproducible across runs.
dataset_0 = dataset[dataset["is_duplicate"] == 0].sample(n=3600, random_state=42)
dataset_1 = dataset[dataset["is_duplicate"] == 1].sample(n=3600, random_state=42)

# pd.concat is used because DataFrame.append no longer exists in pandas >= 2.0.
dataset_train_test = pd.concat([dataset_0, dataset_1]).reset_index(drop=True)

print(dataset_train_test.shape)

In [None]:
# One 300-wide float32 row per pair and per name, filled with model_1 sentence vectors.
n_pairs = dataset_train_test.shape[0]
embeddings_name_1 = np.zeros((n_pairs, 300), dtype="float32")
embeddings_name_2 = np.zeros((n_pairs, 300), dtype="float32")

for row, text in enumerate(dataset_train_test["name_1_stemmed"]):
    embeddings_name_1[row] = model_1.get_sentence_vector(text)
for row, text in enumerate(dataset_train_test["name_2_stemmed"]):
    embeddings_name_2[row] = model_1.get_sentence_vector(text)

In [None]:
# Put the two name embeddings side by side: columns 0-299 are name_1, 300-599 are name_2.
embeddings_1 = np.concatenate([embeddings_name_1, embeddings_name_2], axis=1)
embeddings_1.shape

In [None]:
def simularity(embeddings):
    """Return the per-row cosine similarity between the two halves of `embeddings`.

    Each row is expected to hold two equally sized vectors laid side by side
    (as produced by stacking two embedding matrices horizontally). The split
    point is half of the row width, so any embedding size works.

    Parameters
    ----------
    embeddings : numpy.ndarray
        2-D array of shape (n_rows, 2 * dim).

    Returns
    -------
    numpy.ndarray
        1-D float array of length n_rows with `cos_sim(left_half, right_half)`.

    Raises
    ------
    ValueError
        If the row width is odd and therefore cannot be split into two halves.
    """
    width = embeddings.shape[1]
    if width % 2:
        raise ValueError(
            "embeddings must have an even number of columns, got %d" % width
        )
    half = width // 2

    distances = np.zeros(embeddings.shape[0])
    for i, row in enumerate(embeddings):
        distances[i] = cos_sim(row[:half], row[half:])

    return distances


# Cosine similarity between the name_1 and name_2 halves of every stacked row.
cos_distance = simularity(embeddings_1)

# Despite the column name, the stored value is a similarity, not a distance.
dataset_train_test["cos_distance"] = cos_distance

dataset_train_test.head(10)

In [None]:
# Wrap the stacked embeddings in a frame with columns emb_0 ... emb_<width-1>.
n_features = embeddings_1.shape[1]
cols = [f"emb_{k}" for k in range(n_features)]
embeddings_1_pd = pd.DataFrame(embeddings_1, columns=cols)
embeddings_1_pd.head(10)

In [None]:
# Inspect which columns are available before selecting the ones to keep.
dataset_train_test.columns

In [None]:
# Keep identifiers, the similarity feature and the label, then attach the
# embedding columns next to them (both frames are joined on their index).
kept_columns = ["pair_id", "name_1", "name_2", "cos_distance", "is_duplicate"]
dataset_train_test = pd.concat(
    [dataset_train_test[kept_columns], embeddings_1_pd], axis=1
)

dataset_train_test.shape

In [None]:
# Preview of the feature matrix only; the result is displayed, not stored.
dataset_train_test.drop(["pair_id", "name_1", "name_2", "is_duplicate"], axis=1)

In [None]:
# Preview of the target column.
dataset_train_test["is_duplicate"]

In [None]:
# Save the assembled table (identifiers, similarity, label, embeddings) as a ';'-separated CSV.
# NOTE(review): this path is relative to the working directory, while other cells build paths from sys.path[0] — confirm this is intended.
dataset_train_test.to_csv("data/dataset_train_test.csv", sep=";", index=False)

In [None]:
# Features are everything except identifiers, raw names and the label.
non_feature_columns = ["pair_id", "name_1", "name_2", "is_duplicate"]
features = dataset_train_test.drop(columns=non_feature_columns)
target = dataset_train_test["is_duplicate"]

# 70 / 30 shuffled split with a fixed seed.
train, test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, shuffle=True, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings; fit() returns the estimator itself.
logreg = LogisticRegression().fit(train, y_train)

# Metrics on the rows the model was fitted on.
y_train_pred = logreg.predict(train)
print(classification_report(y_train, y_train_pred))

In [None]:
# Metrics on the held-out 30% of the pairs.
y_test_pred = logreg.predict(test)

print(classification_report(y_test, y_test_pred))

In [None]:
# Experiment 2: same pipeline as for model_1, but with model_2 embeddings.
model_2 = fasttext.load_model(sys.path[0] + "/model_2.bin")
# Size the buffers from the model itself (model_2 is trained with dim=150).
dim_2 = model_2.get_dimension()

# Same balanced subset as before; pd.concat is used because DataFrame.append
# no longer exists in pandas >= 2.0.
dataset_train_test = pd.concat([dataset_0, dataset_1]).reset_index(drop=True)

embeddings_name_1 = np.zeros((dataset_train_test.shape[0], dim_2), dtype="float32")
embeddings_name_2 = np.zeros((dataset_train_test.shape[0], dim_2), dtype="float32")

# The sentence vectors must come from model_2, otherwise this cell would only
# repeat the model_1 experiment.
for i, sentence in enumerate(dataset_train_test["name_1_stemmed"]):
    embeddings_name_1[i, :] = model_2.get_sentence_vector(sentence)
for i, sentence in enumerate(dataset_train_test["name_2_stemmed"]):
    embeddings_name_2[i, :] = model_2.get_sentence_vector(sentence)

embeddings_2 = np.hstack((embeddings_name_1, embeddings_name_2))

# Per-pair cosine similarity, computed directly on the two embedding matrices
# so it does not depend on a fixed embedding width.
cos_distance = np.array(
    [
        cos_sim(vector_1, vector_2)
        for vector_1, vector_2 in zip(embeddings_name_1, embeddings_name_2)
    ],
    dtype="float64",
)

dataset_train_test["cos_distance"] = cos_distance

cols = ["emb_" + str(i) for i in range(embeddings_2.shape[1])]
embeddings_2_pd = pd.DataFrame(data=embeddings_2, columns=cols)

dataset_train_test = dataset_train_test[
    ["pair_id", "name_1", "name_2", "cos_distance", "is_duplicate"]
]
dataset_train_test = pd.concat((dataset_train_test, embeddings_2_pd), axis=1)

train, test, y_train, y_test = train_test_split(
    dataset_train_test.drop(["pair_id", "name_1", "name_2", "is_duplicate"], axis=1),
    dataset_train_test["is_duplicate"],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)


logreg = LogisticRegression()
logreg.fit(train, y_train)

# Metrics on the training rows, then on the held-out rows.
y_train_pred = logreg.predict(train)

print(classification_report(y_train, y_train_pred))

y_test_pred = logreg.predict(test)

print(classification_report(y_test, y_test_pred))

In [None]:
# Experiment 3: same pipeline as for model_1, but with model_3 embeddings.
model_3 = fasttext.load_model(sys.path[0] + "/model_3.bin")
# Size the buffers from the model itself (model_3 is trained with dim=100).
dim_3 = model_3.get_dimension()

# Same balanced subset as before; pd.concat is used because DataFrame.append
# no longer exists in pandas >= 2.0.
dataset_train_test = pd.concat([dataset_0, dataset_1]).reset_index(drop=True)

embeddings_name_1 = np.zeros((dataset_train_test.shape[0], dim_3), dtype="float32")
embeddings_name_3 = np.zeros((dataset_train_test.shape[0], dim_3), dtype="float32")

# The sentence vectors must come from model_3, otherwise this cell would only
# repeat the model_1 experiment.
for i, sentence in enumerate(dataset_train_test["name_1_stemmed"]):
    embeddings_name_1[i, :] = model_3.get_sentence_vector(sentence)
for i, sentence in enumerate(dataset_train_test["name_2_stemmed"]):
    embeddings_name_3[i, :] = model_3.get_sentence_vector(sentence)

embeddings_3 = np.hstack((embeddings_name_1, embeddings_name_3))

# Per-pair cosine similarity, computed directly on the two embedding matrices
# so it does not depend on a fixed embedding width.
cos_distance = np.array(
    [
        cos_sim(vector_1, vector_2)
        for vector_1, vector_2 in zip(embeddings_name_1, embeddings_name_3)
    ],
    dtype="float64",
)

dataset_train_test["cos_distance"] = cos_distance

cols = ["emb_" + str(i) for i in range(embeddings_3.shape[1])]
embeddings_3_pd = pd.DataFrame(data=embeddings_3, columns=cols)

dataset_train_test = dataset_train_test[
    ["pair_id", "name_1", "name_2", "cos_distance", "is_duplicate"]
]
dataset_train_test = pd.concat((dataset_train_test, embeddings_3_pd), axis=1)

train, test, y_train, y_test = train_test_split(
    dataset_train_test.drop(["pair_id", "name_1", "name_2", "is_duplicate"], axis=1),
    dataset_train_test["is_duplicate"],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)


logreg = LogisticRegression()
logreg.fit(train, y_train)

# Metrics on the training rows, then on the held-out rows.
y_train_pred = logreg.predict(train)

print(classification_report(y_train, y_train_pred))

y_test_pred = logreg.predict(test)

print(classification_report(y_test, y_test_pred))