In [4]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import pickle
import pymorphy2
import Levenshtein
import gensim.downloader
import json
from os import path

In [37]:
# конфигурация
with open('config.json', 'r') as file:
    config = json.load(file)

inference_path = config["bins_directory"]
pca_n_components = config["pca_n_components"]

In [17]:
cat_model = CatBoostClassifier()
cat_model.load_model(path.join(inference_path, 'catboost_model.bin'))

morph = pymorphy2.MorphAnalyzer()

word2vec_rus = gensim.models.KeyedVectors.load(path.join(inference_path, "word2vec_rus.model"))

In [22]:
def reduce_dimension(X):
    with open(path.join(inference_path, 'pca_model.pkl'), 'rb') as file:
        pca = pickle.load(file)
    X_transformed = pca.transform(X)
    return X_transformed

In [16]:
# get a part of speech needed to make an embedding of word
def get_part_of_speech(word):
    
    parsed_word = morph.parse(word)[0]
    pos = parsed_word.tag.POS

    if pos == "ADJF":
        return "ADJ"
    
    return pos 

In [18]:
# making embedding by pretrained word2vec model 
def get_embedding(word):

    w2v_word = f"{word}_{get_part_of_speech(word)}"

    try:
        emb = word2vec_rus[w2v_word]

    except KeyError:
        return None
    
    return emb

In [49]:
# function for testing new words
def test(word1, word2):

    # getting embs and lev dist 
    emb1, emb2 = get_embedding(word1), get_embedding(word2)
    pca_emb1 = reduce_dimension(emb1.reshape(1, -1))
    pca_emb2 = reduce_dimension(emb2.reshape(1, -1))
    lev_dist = Levenshtein.distance(word1, word2)

    # creating dataframe
    row = pca_emb1.tolist()[0] + pca_emb2.tolist()[0] + [lev_dist] + [lev_dist**2] 
    print(row)
    X_new = pd.DataFrame([row], columns=[f"emb_{int(i > pca_n_components - 1) + 1}_{i % pca_n_components}" for i in range(pca_n_components*2)] + ["lev_dist", "lev_dist_2"])
    
    X_new["lev_dist_%"] = lev_dist / X_new.apply(lambda row: max(len(word1), len(word2)), axis=1)
    
    prediction = cat_model.predict_proba(X_new)
    res = np.argmax(prediction[0])
    proba = max(prediction[0])

    # result
    print(f"Слова {word1} и {word2} {'не '*(not res)}являются паронимами с вероятностью {proba}")

In [52]:
# не паронимы
word1 = "приветливый"
word2 = "страна" 

# паронимы
word1 = "целый"
word2 = "цельный" 

# тестирование
test(word1, word2)

[0.2617155010774862, -0.2579223616028451, 0.0021111314533581657, 0.2517039075997864, -0.07435253373697175, 0.034992947033226196, 0.005026388669455227, -0.30020079829751445, 0.026327639274767986, -0.05272361675305906, 0.11037401566418316, -0.051037696946646255, 0.14503985416912926, -0.061720566889175876, -0.12413863783476567, -0.15910530531001804, 0.0707032444906683, 0.0815723261557095, 0.027174671723108335, 0.1163755219662081, 0.0034671994117857763, -0.0440980951277343, -0.023349320367278603, -0.07531375698344417, -0.062093878461368435, 0.05837473767946606, 0.005803923205723043, -0.012445041057151367, 0.04567067855779507, 0.017053470749244553, 0.02048814758545208, 0.01756466370453212, 0.04411988567549472, -0.02067333698669901, 0.025259517491871317, -0.0827712655651441, 0.004695425701704929, 0.014293480344216958, 0.014898388376503662, -0.005789015240754911, 0.050830242997634376, 0.07939590137787297, 0.03339441306786387, 0.033831152245353484, -0.005684394147052442, -0.01010952504079439, 

